Add an option to prevent DB::Open() from querying sizes of all sst files (#6353)
Summary: When paranoid_checks is on, DBImpl::CheckConsistency() iterates over all sst files and calls Env::GetFileSize() for each of them. As far as I could understand, this is pretty arbitrary and doesn't affect correctness - if filesystem doesn't corrupt fsynced files, the file sizes will always match; if it does, it may as well corrupt contents as well as sizes, and rocksdb doesn't check contents on open. If there are thousands of sst files, getting all their sizes takes a while. If, on top of that, Env is overridden to use some remote storage instead of local filesystem, it can be *really* slow and overload the remote storage service. This PR adds an option to not do GetFileSize(); instead it does GetChildren() for parent directory to check that all the expected sst files are at least present, but doesn't check their sizes. We can't just disable paranoid_checks instead because paranoid_checks do a few other important things: make the DB read-only on write errors, print error messages on read errors, etc. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6353 Test Plan: ran the added sanity check unit test. Will try it out in a LogDevice test cluster where the GetFileSize() calls are causing a lot of trouble. Differential Revision: D19656425 Pulled By: al13n321 fbshipit-source-id: c2c421b367633033760d1f56747bad206d1fbf82
This commit is contained in:
parent
7330ec0ff1
commit
637e64b9ac
@ -7,6 +7,7 @@
|
||||
* Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev().
|
||||
* Fix a bug that prevents opening a DB after two consecutive crash with TransactionDB, where the first crash recovers from a corrupted WAL with kPointInTimeRecovery but the second cannot.
|
||||
* Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space.
|
||||
* Add DBOptions::skip_checking_sst_file_sizes_on_db_open. It disables potentially expensive checking of all sst file sizes in DB::Open().
|
||||
|
||||
### Public API Change
|
||||
* The BlobDB garbage collector now emits the statistics `BLOB_DB_GC_NUM_FILES` (number of blob files obsoleted during GC), `BLOB_DB_GC_NUM_NEW_FILES` (number of new blob files generated during GC), `BLOB_DB_GC_FAILURES` (number of failed GC passes), `BLOB_DB_GC_NUM_KEYS_RELOCATED` (number of blobs relocated during GC), and `BLOB_DB_GC_BYTES_RELOCATED` (total size of blobs relocated during GC). On the other hand, the following statistics, which are not relevant for the new GC implementation, are now deprecated: `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, and `BLOB_DB_GC_MICROS`.
|
||||
|
5
db/c.cc
5
db/c.cc
@ -2320,6 +2320,11 @@ void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
|
||||
opt->rep.skip_stats_update_on_db_open = val;
|
||||
}
|
||||
|
||||
void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
|
||||
rocksdb_options_t* opt, unsigned char val) {
|
||||
opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
|
||||
}
|
||||
|
||||
void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
|
||||
opt->rep.num_levels = n;
|
||||
}
|
||||
|
@ -3318,27 +3318,67 @@ Status DBImpl::CheckConsistency() {
|
||||
TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
|
||||
|
||||
std::string corruption_messages;
|
||||
for (const auto& md : metadata) {
|
||||
// md.name has a leading "/".
|
||||
std::string file_path = md.db_path + md.name;
|
||||
|
||||
uint64_t fsize = 0;
|
||||
TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
|
||||
Status s = env_->GetFileSize(file_path, &fsize);
|
||||
if (!s.ok() &&
|
||||
env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
|
||||
s = Status::OK();
|
||||
if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
|
||||
// Instead of calling GetFileSize() for each expected file, call
|
||||
// GetChildren() for the DB directory and check that all expected files
|
||||
// are listed, without checking their sizes.
|
||||
// Since sst files might be in different directories, do it for each
|
||||
// directory separately.
|
||||
std::map<std::string, std::vector<std::string>> files_by_directory;
|
||||
for (const auto& md : metadata) {
|
||||
// md.name has a leading "/". Remove it.
|
||||
std::string fname = md.name;
|
||||
if (!fname.empty() && fname[0] == '/') {
|
||||
fname = fname.substr(1);
|
||||
}
|
||||
files_by_directory[md.db_path].push_back(fname);
|
||||
}
|
||||
if (!s.ok()) {
|
||||
corruption_messages +=
|
||||
"Can't access " + md.name + ": " + s.ToString() + "\n";
|
||||
} else if (fsize != md.size) {
|
||||
corruption_messages += "Sst file size mismatch: " + file_path +
|
||||
". Size recorded in manifest " +
|
||||
ToString(md.size) + ", actual size " +
|
||||
ToString(fsize) + "\n";
|
||||
for (const auto& dir_files : files_by_directory) {
|
||||
std::string directory = dir_files.first;
|
||||
std::vector<std::string> existing_files;
|
||||
Status s = env_->GetChildren(directory, &existing_files);
|
||||
if (!s.ok()) {
|
||||
corruption_messages +=
|
||||
"Can't list files in " + directory + ": " + s.ToString() + "\n";
|
||||
continue;
|
||||
}
|
||||
std::sort(existing_files.begin(), existing_files.end());
|
||||
|
||||
for (const std::string& fname : dir_files.second) {
|
||||
if (!std::binary_search(existing_files.begin(), existing_files.end(),
|
||||
fname) &&
|
||||
!std::binary_search(existing_files.begin(), existing_files.end(),
|
||||
Rocks2LevelTableFileName(fname))) {
|
||||
corruption_messages +=
|
||||
"Missing sst file " + fname + " in " + directory + "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (const auto& md : metadata) {
|
||||
// md.name has a leading "/".
|
||||
std::string file_path = md.db_path + md.name;
|
||||
|
||||
uint64_t fsize = 0;
|
||||
TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
|
||||
Status s = env_->GetFileSize(file_path, &fsize);
|
||||
if (!s.ok() &&
|
||||
env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
|
||||
s = Status::OK();
|
||||
}
|
||||
if (!s.ok()) {
|
||||
corruption_messages +=
|
||||
"Can't access " + md.name + ": " + s.ToString() + "\n";
|
||||
} else if (fsize != md.size) {
|
||||
corruption_messages += "Sst file size mismatch: " + file_path +
|
||||
". Size recorded in manifest " +
|
||||
ToString(md.size) + ", actual size " +
|
||||
ToString(fsize) + "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (corruption_messages.size() == 0) {
|
||||
return Status::OK();
|
||||
} else {
|
||||
|
@ -172,6 +172,13 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
|
||||
result.sst_file_manager = sst_file_manager;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!result.paranoid_checks) {
|
||||
result.skip_checking_sst_file_sizes_on_db_open = true;
|
||||
ROCKS_LOG_INFO(result.info_log,
|
||||
"file size check will be skipped during open.");
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -468,6 +468,11 @@ Status DBImplSecondary::CheckConsistency() {
|
||||
// approach and just proceed.
|
||||
TEST_SYNC_POINT_CALLBACK(
|
||||
"DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
|
||||
|
||||
if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::vector<LiveFileMetaData> metadata;
|
||||
versions_->GetLiveFilesMetaData(&metadata);
|
||||
|
||||
|
@ -124,6 +124,20 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
|
||||
Destroy(options);
|
||||
}
|
||||
|
||||
// Check that we don't crash when opening DB with
|
||||
// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
|
||||
TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
|
||||
ASSERT_OK(Put("pika", "choo"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
// Just open the DB with the option set to true and check that we don't crash.
|
||||
Options options;
|
||||
options.skip_checking_sst_file_sizes_on_db_open = true;
|
||||
Reopen(options);
|
||||
|
||||
ASSERT_EQ("choo", Get("pika"));
|
||||
}
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
TEST_F(DBSSTTest, DontDeleteMovedFile) {
|
||||
// This test triggers move compaction and verifies that the file is not
|
||||
|
@ -854,6 +854,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics(
|
||||
extern ROCKSDB_LIBRARY_API void
|
||||
rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
|
||||
unsigned char val);
|
||||
extern ROCKSDB_LIBRARY_API void
|
||||
rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
|
||||
rocksdb_options_t* opt, unsigned char val);
|
||||
|
||||
/* returns a pointer to a malloc()-ed, null terminated string */
|
||||
extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(
|
||||
|
@ -985,6 +985,16 @@ struct DBOptions {
|
||||
// Default: false
|
||||
bool skip_stats_update_on_db_open = false;
|
||||
|
||||
// If true, then DB::Open() will not fetch and check sizes of all sst files.
|
||||
// This may significantly speed up startup if there are many sst files,
|
||||
// especially when using non-default Env with expensive GetFileSize().
|
||||
// We'll still check that all required sst files exist.
|
||||
// If paranoid_checks is false, this option is ignored, and sst files are
|
||||
// not checked at all.
|
||||
//
|
||||
// Default: false
|
||||
bool skip_checking_sst_file_sizes_on_db_open = false;
|
||||
|
||||
// Recovery mode to control the consistency while replaying WAL
|
||||
// Default: kPointInTimeRecovery
|
||||
WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
|
||||
|
@ -1738,6 +1738,30 @@ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(
|
||||
return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_Options
|
||||
* Method: setSkipCheckingSstFileSizesOnDbOpen
|
||||
* Signature: (JZ)V
|
||||
*/
|
||||
void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen(
|
||||
JNIEnv*, jobject, jlong jhandle,
|
||||
jboolean jskip_checking_sst_file_sizes_on_db_open) {
|
||||
auto* opt = reinterpret_cast<rocksdb::Options*>(jhandle);
|
||||
opt->skip_checking_sst_file_sizes_on_db_open =
|
||||
static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_Options
|
||||
* Method: skipCheckingSstFileSizesOnDbOpen
|
||||
* Signature: (J)Z
|
||||
*/
|
||||
jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen(
|
||||
JNIEnv*, jobject, jlong jhandle) {
|
||||
auto* opt = reinterpret_cast<rocksdb::Options*>(jhandle);
|
||||
return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_Options
|
||||
* Method: setWalRecoveryMode
|
||||
@ -6010,6 +6034,30 @@ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(
|
||||
return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_DBOptions
|
||||
* Method: setSkipCheckingSstFileSizesOnDbOpen
|
||||
* Signature: (JZ)V
|
||||
*/
|
||||
void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen(
|
||||
JNIEnv*, jobject, jlong jhandle,
|
||||
jboolean jskip_checking_sst_file_sizes_on_db_open) {
|
||||
auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle);
|
||||
opt->skip_checking_sst_file_sizes_on_db_open =
|
||||
static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_DBOptions
|
||||
* Method: skipCheckingSstFileSizesOnDbOpen
|
||||
* Signature: (J)Z
|
||||
*/
|
||||
jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen(
|
||||
JNIEnv*, jobject, jlong jhandle) {
|
||||
auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle);
|
||||
return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_DBOptions
|
||||
* Method: setWalRecoveryMode
|
||||
|
@ -74,6 +74,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
|
||||
write_thread_max_yield_usec(options.write_thread_max_yield_usec),
|
||||
write_thread_slow_yield_usec(options.write_thread_slow_yield_usec),
|
||||
skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
|
||||
skip_checking_sst_file_sizes_on_db_open(
|
||||
options.skip_checking_sst_file_sizes_on_db_open),
|
||||
wal_recovery_mode(options.wal_recovery_mode),
|
||||
allow_2pc(options.allow_2pc),
|
||||
row_cache(options.row_cache),
|
||||
|
@ -68,6 +68,7 @@ struct ImmutableDBOptions {
|
||||
uint64_t write_thread_max_yield_usec;
|
||||
uint64_t write_thread_slow_yield_usec;
|
||||
bool skip_stats_update_on_db_open;
|
||||
bool skip_checking_sst_file_sizes_on_db_open;
|
||||
WALRecoveryMode wal_recovery_mode;
|
||||
bool allow_2pc;
|
||||
std::shared_ptr<Cache> row_cache;
|
||||
|
@ -119,6 +119,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
|
||||
immutable_db_options.write_thread_slow_yield_usec;
|
||||
options.skip_stats_update_on_db_open =
|
||||
immutable_db_options.skip_stats_update_on_db_open;
|
||||
options.skip_checking_sst_file_sizes_on_db_open =
|
||||
immutable_db_options.skip_checking_sst_file_sizes_on_db_open;
|
||||
options.wal_recovery_mode = immutable_db_options.wal_recovery_mode;
|
||||
options.allow_2pc = immutable_db_options.allow_2pc;
|
||||
options.row_cache = immutable_db_options.row_cache;
|
||||
@ -1473,6 +1475,9 @@ std::unordered_map<std::string, OptionTypeInfo>
|
||||
{"skip_stats_update_on_db_open",
|
||||
{offsetof(struct DBOptions, skip_stats_update_on_db_open),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
||||
{"skip_checking_sst_file_sizes_on_db_open",
|
||||
{offsetof(struct DBOptions, skip_checking_sst_file_sizes_on_db_open),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
||||
{"new_table_reader_for_compaction_inputs",
|
||||
{offsetof(struct DBOptions, new_table_reader_for_compaction_inputs),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
||||
|
@ -248,6 +248,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
|
||||
"new_table_reader_for_compaction_inputs=false;"
|
||||
"keep_log_file_num=4890;"
|
||||
"skip_stats_update_on_db_open=false;"
|
||||
"skip_checking_sst_file_sizes_on_db_open=false;"
|
||||
"max_manifest_file_size=4295009941;"
|
||||
"db_log_dir=path/to/db_log_dir;"
|
||||
"skip_log_error_on_recovery=true;"
|
||||
|
@ -265,6 +265,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
|
||||
db_opt->paranoid_checks = rnd->Uniform(2);
|
||||
db_opt->skip_log_error_on_recovery = rnd->Uniform(2);
|
||||
db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
|
||||
db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2);
|
||||
db_opt->use_adaptive_mutex = rnd->Uniform(2);
|
||||
db_opt->use_fsync = rnd->Uniform(2);
|
||||
db_opt->recycle_log_file_num = rnd->Uniform(2);
|
||||
|
Loading…
Reference in New Issue
Block a user