Add an option to prevent DB::Open() from querying sizes of all sst files (#6353)
Summary: When paranoid_checks is on, DBImpl::CheckConsistency() iterates over all sst files and calls Env::GetFileSize() for each of them. As far as I could understand, this is pretty arbitrary and doesn't affect correctness - if filesystem doesn't corrupt fsynced files, the file sizes will always match; if it does, it may as well corrupt contents as well as sizes, and rocksdb doesn't check contents on open. If there are thousands of sst files, getting all their sizes takes a while. If, on top of that, Env is overridden to use some remote storage instead of local filesystem, it can be *really* slow and overload the remote storage service. This PR adds an option to not do GetFileSize(); instead it does GetChildren() for parent directory to check that all the expected sst files are at least present, but doesn't check their sizes. We can't just disable paranoid_checks instead because paranoid_checks do a few other important things: make the DB read-only on write errors, print error messages on read errors, etc. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6353 Test Plan: ran the added sanity check unit test. Will try it out in a LogDevice test cluster where the GetFileSize() calls are causing a lot of trouble. Differential Revision: D19656425 Pulled By: al13n321 fbshipit-source-id: c2c421b367633033760d1f56747bad206d1fbf82
This commit is contained in:
parent
7330ec0ff1
commit
637e64b9ac
@ -7,6 +7,7 @@
|
|||||||
* Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev().
|
* Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev().
|
||||||
* Fix a bug that prevents opening a DB after two consecutive crash with TransactionDB, where the first crash recovers from a corrupted WAL with kPointInTimeRecovery but the second cannot.
|
* Fix a bug that prevents opening a DB after two consecutive crash with TransactionDB, where the first crash recovers from a corrupted WAL with kPointInTimeRecovery but the second cannot.
|
||||||
* Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space.
|
* Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space.
|
||||||
|
* Add DBOptions::skip_checking_sst_file_sizes_on_db_open. It disables potentially expensive checking of all sst file sizes in DB::Open().
|
||||||
|
|
||||||
### Public API Change
|
### Public API Change
|
||||||
* The BlobDB garbage collector now emits the statistics `BLOB_DB_GC_NUM_FILES` (number of blob files obsoleted during GC), `BLOB_DB_GC_NUM_NEW_FILES` (number of new blob files generated during GC), `BLOB_DB_GC_FAILURES` (number of failed GC passes), `BLOB_DB_GC_NUM_KEYS_RELOCATED` (number of blobs relocated during GC), and `BLOB_DB_GC_BYTES_RELOCATED` (total size of blobs relocated during GC). On the other hand, the following statistics, which are not relevant for the new GC implementation, are now deprecated: `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, and `BLOB_DB_GC_MICROS`.
|
* The BlobDB garbage collector now emits the statistics `BLOB_DB_GC_NUM_FILES` (number of blob files obsoleted during GC), `BLOB_DB_GC_NUM_NEW_FILES` (number of new blob files generated during GC), `BLOB_DB_GC_FAILURES` (number of failed GC passes), `BLOB_DB_GC_NUM_KEYS_RELOCATED` (number of blobs relocated during GC), and `BLOB_DB_GC_BYTES_RELOCATED` (total size of blobs relocated during GC). On the other hand, the following statistics, which are not relevant for the new GC implementation, are now deprecated: `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, and `BLOB_DB_GC_MICROS`.
|
||||||
|
5
db/c.cc
5
db/c.cc
@ -2320,6 +2320,11 @@ void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
|
|||||||
opt->rep.skip_stats_update_on_db_open = val;
|
opt->rep.skip_stats_update_on_db_open = val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
|
||||||
|
rocksdb_options_t* opt, unsigned char val) {
|
||||||
|
opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
|
||||||
|
}
|
||||||
|
|
||||||
void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
|
void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
|
||||||
opt->rep.num_levels = n;
|
opt->rep.num_levels = n;
|
||||||
}
|
}
|
||||||
|
@ -3318,6 +3318,44 @@ Status DBImpl::CheckConsistency() {
|
|||||||
TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
|
TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
|
||||||
|
|
||||||
std::string corruption_messages;
|
std::string corruption_messages;
|
||||||
|
|
||||||
|
if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
|
||||||
|
// Instead of calling GetFileSize() for each expected file, call
|
||||||
|
// GetChildren() for the DB directory and check that all expected files
|
||||||
|
// are listed, without checking their sizes.
|
||||||
|
// Since sst files might be in different directories, do it for each
|
||||||
|
// directory separately.
|
||||||
|
std::map<std::string, std::vector<std::string>> files_by_directory;
|
||||||
|
for (const auto& md : metadata) {
|
||||||
|
// md.name has a leading "/". Remove it.
|
||||||
|
std::string fname = md.name;
|
||||||
|
if (!fname.empty() && fname[0] == '/') {
|
||||||
|
fname = fname.substr(1);
|
||||||
|
}
|
||||||
|
files_by_directory[md.db_path].push_back(fname);
|
||||||
|
}
|
||||||
|
for (const auto& dir_files : files_by_directory) {
|
||||||
|
std::string directory = dir_files.first;
|
||||||
|
std::vector<std::string> existing_files;
|
||||||
|
Status s = env_->GetChildren(directory, &existing_files);
|
||||||
|
if (!s.ok()) {
|
||||||
|
corruption_messages +=
|
||||||
|
"Can't list files in " + directory + ": " + s.ToString() + "\n";
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
std::sort(existing_files.begin(), existing_files.end());
|
||||||
|
|
||||||
|
for (const std::string& fname : dir_files.second) {
|
||||||
|
if (!std::binary_search(existing_files.begin(), existing_files.end(),
|
||||||
|
fname) &&
|
||||||
|
!std::binary_search(existing_files.begin(), existing_files.end(),
|
||||||
|
Rocks2LevelTableFileName(fname))) {
|
||||||
|
corruption_messages +=
|
||||||
|
"Missing sst file " + fname + " in " + directory + "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
for (const auto& md : metadata) {
|
for (const auto& md : metadata) {
|
||||||
// md.name has a leading "/".
|
// md.name has a leading "/".
|
||||||
std::string file_path = md.db_path + md.name;
|
std::string file_path = md.db_path + md.name;
|
||||||
@ -3339,6 +3377,8 @@ Status DBImpl::CheckConsistency() {
|
|||||||
ToString(fsize) + "\n";
|
ToString(fsize) + "\n";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (corruption_messages.size() == 0) {
|
if (corruption_messages.size() == 0) {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
} else {
|
} else {
|
||||||
|
@ -172,6 +172,13 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
|
|||||||
result.sst_file_manager = sst_file_manager;
|
result.sst_file_manager = sst_file_manager;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if (!result.paranoid_checks) {
|
||||||
|
result.skip_checking_sst_file_sizes_on_db_open = true;
|
||||||
|
ROCKS_LOG_INFO(result.info_log,
|
||||||
|
"file size check will be skipped during open.");
|
||||||
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -468,6 +468,11 @@ Status DBImplSecondary::CheckConsistency() {
|
|||||||
// approach and just proceed.
|
// approach and just proceed.
|
||||||
TEST_SYNC_POINT_CALLBACK(
|
TEST_SYNC_POINT_CALLBACK(
|
||||||
"DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
|
"DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
|
||||||
|
|
||||||
|
if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<LiveFileMetaData> metadata;
|
std::vector<LiveFileMetaData> metadata;
|
||||||
versions_->GetLiveFilesMetaData(&metadata);
|
versions_->GetLiveFilesMetaData(&metadata);
|
||||||
|
|
||||||
|
@ -124,6 +124,20 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
|
|||||||
Destroy(options);
|
Destroy(options);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check that we don't crash when opening DB with
|
||||||
|
// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
|
||||||
|
TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
|
||||||
|
ASSERT_OK(Put("pika", "choo"));
|
||||||
|
ASSERT_OK(Flush());
|
||||||
|
|
||||||
|
// Just open the DB with the option set to true and check that we don't crash.
|
||||||
|
Options options;
|
||||||
|
options.skip_checking_sst_file_sizes_on_db_open = true;
|
||||||
|
Reopen(options);
|
||||||
|
|
||||||
|
ASSERT_EQ("choo", Get("pika"));
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef ROCKSDB_LITE
|
#ifndef ROCKSDB_LITE
|
||||||
TEST_F(DBSSTTest, DontDeleteMovedFile) {
|
TEST_F(DBSSTTest, DontDeleteMovedFile) {
|
||||||
// This test triggers move compaction and verifies that the file is not
|
// This test triggers move compaction and verifies that the file is not
|
||||||
|
@ -854,6 +854,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics(
|
|||||||
extern ROCKSDB_LIBRARY_API void
|
extern ROCKSDB_LIBRARY_API void
|
||||||
rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
|
rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
|
||||||
unsigned char val);
|
unsigned char val);
|
||||||
|
extern ROCKSDB_LIBRARY_API void
|
||||||
|
rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
|
||||||
|
rocksdb_options_t* opt, unsigned char val);
|
||||||
|
|
||||||
/* returns a pointer to a malloc()-ed, null terminated string */
|
/* returns a pointer to a malloc()-ed, null terminated string */
|
||||||
extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(
|
extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(
|
||||||
|
@ -985,6 +985,16 @@ struct DBOptions {
|
|||||||
// Default: false
|
// Default: false
|
||||||
bool skip_stats_update_on_db_open = false;
|
bool skip_stats_update_on_db_open = false;
|
||||||
|
|
||||||
|
// If true, then DB::Open() will not fetch and check sizes of all sst files.
|
||||||
|
// This may significantly speed up startup if there are many sst files,
|
||||||
|
// especially when using non-default Env with expensive GetFileSize().
|
||||||
|
// We'll still check that all required sst files exist.
|
||||||
|
// If paranoid_checks is false, this option is ignored, and sst files are
|
||||||
|
// not checked at all.
|
||||||
|
//
|
||||||
|
// Default: false
|
||||||
|
bool skip_checking_sst_file_sizes_on_db_open = false;
|
||||||
|
|
||||||
// Recovery mode to control the consistency while replaying WAL
|
// Recovery mode to control the consistency while replaying WAL
|
||||||
// Default: kPointInTimeRecovery
|
// Default: kPointInTimeRecovery
|
||||||
WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
|
WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
|
||||||
|
@ -1738,6 +1738,30 @@ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(
|
|||||||
return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
|
return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_Options
|
||||||
|
* Method: setSkipCheckingSstFileSizesOnDbOpen
|
||||||
|
* Signature: (JZ)V
|
||||||
|
*/
|
||||||
|
void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen(
|
||||||
|
JNIEnv*, jobject, jlong jhandle,
|
||||||
|
jboolean jskip_checking_sst_file_sizes_on_db_open) {
|
||||||
|
auto* opt = reinterpret_cast<rocksdb::Options*>(jhandle);
|
||||||
|
opt->skip_checking_sst_file_sizes_on_db_open =
|
||||||
|
static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_Options
|
||||||
|
* Method: skipCheckingSstFileSizesOnDbOpen
|
||||||
|
* Signature: (J)Z
|
||||||
|
*/
|
||||||
|
jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen(
|
||||||
|
JNIEnv*, jobject, jlong jhandle) {
|
||||||
|
auto* opt = reinterpret_cast<rocksdb::Options*>(jhandle);
|
||||||
|
return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Class: org_rocksdb_Options
|
* Class: org_rocksdb_Options
|
||||||
* Method: setWalRecoveryMode
|
* Method: setWalRecoveryMode
|
||||||
@ -6010,6 +6034,30 @@ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(
|
|||||||
return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
|
return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_DBOptions
|
||||||
|
* Method: setSkipCheckingSstFileSizesOnDbOpen
|
||||||
|
* Signature: (JZ)V
|
||||||
|
*/
|
||||||
|
void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen(
|
||||||
|
JNIEnv*, jobject, jlong jhandle,
|
||||||
|
jboolean jskip_checking_sst_file_sizes_on_db_open) {
|
||||||
|
auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle);
|
||||||
|
opt->skip_checking_sst_file_sizes_on_db_open =
|
||||||
|
static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_DBOptions
|
||||||
|
* Method: skipCheckingSstFileSizesOnDbOpen
|
||||||
|
* Signature: (J)Z
|
||||||
|
*/
|
||||||
|
jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen(
|
||||||
|
JNIEnv*, jobject, jlong jhandle) {
|
||||||
|
auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle);
|
||||||
|
return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Class: org_rocksdb_DBOptions
|
* Class: org_rocksdb_DBOptions
|
||||||
* Method: setWalRecoveryMode
|
* Method: setWalRecoveryMode
|
||||||
|
@ -74,6 +74,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
|
|||||||
write_thread_max_yield_usec(options.write_thread_max_yield_usec),
|
write_thread_max_yield_usec(options.write_thread_max_yield_usec),
|
||||||
write_thread_slow_yield_usec(options.write_thread_slow_yield_usec),
|
write_thread_slow_yield_usec(options.write_thread_slow_yield_usec),
|
||||||
skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
|
skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
|
||||||
|
skip_checking_sst_file_sizes_on_db_open(
|
||||||
|
options.skip_checking_sst_file_sizes_on_db_open),
|
||||||
wal_recovery_mode(options.wal_recovery_mode),
|
wal_recovery_mode(options.wal_recovery_mode),
|
||||||
allow_2pc(options.allow_2pc),
|
allow_2pc(options.allow_2pc),
|
||||||
row_cache(options.row_cache),
|
row_cache(options.row_cache),
|
||||||
|
@ -68,6 +68,7 @@ struct ImmutableDBOptions {
|
|||||||
uint64_t write_thread_max_yield_usec;
|
uint64_t write_thread_max_yield_usec;
|
||||||
uint64_t write_thread_slow_yield_usec;
|
uint64_t write_thread_slow_yield_usec;
|
||||||
bool skip_stats_update_on_db_open;
|
bool skip_stats_update_on_db_open;
|
||||||
|
bool skip_checking_sst_file_sizes_on_db_open;
|
||||||
WALRecoveryMode wal_recovery_mode;
|
WALRecoveryMode wal_recovery_mode;
|
||||||
bool allow_2pc;
|
bool allow_2pc;
|
||||||
std::shared_ptr<Cache> row_cache;
|
std::shared_ptr<Cache> row_cache;
|
||||||
|
@ -119,6 +119,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
|
|||||||
immutable_db_options.write_thread_slow_yield_usec;
|
immutable_db_options.write_thread_slow_yield_usec;
|
||||||
options.skip_stats_update_on_db_open =
|
options.skip_stats_update_on_db_open =
|
||||||
immutable_db_options.skip_stats_update_on_db_open;
|
immutable_db_options.skip_stats_update_on_db_open;
|
||||||
|
options.skip_checking_sst_file_sizes_on_db_open =
|
||||||
|
immutable_db_options.skip_checking_sst_file_sizes_on_db_open;
|
||||||
options.wal_recovery_mode = immutable_db_options.wal_recovery_mode;
|
options.wal_recovery_mode = immutable_db_options.wal_recovery_mode;
|
||||||
options.allow_2pc = immutable_db_options.allow_2pc;
|
options.allow_2pc = immutable_db_options.allow_2pc;
|
||||||
options.row_cache = immutable_db_options.row_cache;
|
options.row_cache = immutable_db_options.row_cache;
|
||||||
@ -1473,6 +1475,9 @@ std::unordered_map<std::string, OptionTypeInfo>
|
|||||||
{"skip_stats_update_on_db_open",
|
{"skip_stats_update_on_db_open",
|
||||||
{offsetof(struct DBOptions, skip_stats_update_on_db_open),
|
{offsetof(struct DBOptions, skip_stats_update_on_db_open),
|
||||||
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
||||||
|
{"skip_checking_sst_file_sizes_on_db_open",
|
||||||
|
{offsetof(struct DBOptions, skip_checking_sst_file_sizes_on_db_open),
|
||||||
|
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
||||||
{"new_table_reader_for_compaction_inputs",
|
{"new_table_reader_for_compaction_inputs",
|
||||||
{offsetof(struct DBOptions, new_table_reader_for_compaction_inputs),
|
{offsetof(struct DBOptions, new_table_reader_for_compaction_inputs),
|
||||||
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
||||||
|
@ -248,6 +248,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
|
|||||||
"new_table_reader_for_compaction_inputs=false;"
|
"new_table_reader_for_compaction_inputs=false;"
|
||||||
"keep_log_file_num=4890;"
|
"keep_log_file_num=4890;"
|
||||||
"skip_stats_update_on_db_open=false;"
|
"skip_stats_update_on_db_open=false;"
|
||||||
|
"skip_checking_sst_file_sizes_on_db_open=false;"
|
||||||
"max_manifest_file_size=4295009941;"
|
"max_manifest_file_size=4295009941;"
|
||||||
"db_log_dir=path/to/db_log_dir;"
|
"db_log_dir=path/to/db_log_dir;"
|
||||||
"skip_log_error_on_recovery=true;"
|
"skip_log_error_on_recovery=true;"
|
||||||
|
@ -265,6 +265,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
|
|||||||
db_opt->paranoid_checks = rnd->Uniform(2);
|
db_opt->paranoid_checks = rnd->Uniform(2);
|
||||||
db_opt->skip_log_error_on_recovery = rnd->Uniform(2);
|
db_opt->skip_log_error_on_recovery = rnd->Uniform(2);
|
||||||
db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
|
db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
|
||||||
|
db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2);
|
||||||
db_opt->use_adaptive_mutex = rnd->Uniform(2);
|
db_opt->use_adaptive_mutex = rnd->Uniform(2);
|
||||||
db_opt->use_fsync = rnd->Uniform(2);
|
db_opt->use_fsync = rnd->Uniform(2);
|
||||||
db_opt->recycle_log_file_num = rnd->Uniform(2);
|
db_opt->recycle_log_file_num = rnd->Uniform(2);
|
||||||
|
Loading…
Reference in New Issue
Block a user