add VerifyChecksum() to db.h

Summary:
We need a tool to check any sst file corruption in the db.
It will check all the sst files in current version and read all the blocks (data, meta, index) with checksum verification. If any verification fails, the function will return non-OK status.
Closes https://github.com/facebook/rocksdb/pull/2498

Differential Revision: D5324269

Pulled By: lightmark

fbshipit-source-id: 6f8a272008b722402a772acfc804524c9d1a483b
This commit is contained in:
Aaron G 2017-08-09 15:49:40 -07:00 committed by Facebook Github Bot
parent 47ed3bfc3b
commit 7848f0b24c
13 changed files with 185 additions and 2 deletions

View File

@ -24,6 +24,35 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
->DeleteFilesInRange(column_family, begin, end); ->DeleteFilesInRange(column_family, begin, end);
} }
Status VerifySstFileChecksum(const Options& options,
const EnvOptions& env_options,
const std::string& file_path) {
unique_ptr<RandomAccessFile> file;
uint64_t file_size;
InternalKeyComparator internal_comparator(options.comparator);
ImmutableCFOptions ioptions(options);
Status s = ioptions.env->NewRandomAccessFile(file_path, &file, env_options);
if (s.ok()) {
s = ioptions.env->GetFileSize(file_path, &file_size);
} else {
return s;
}
unique_ptr<TableReader> table_reader;
std::unique_ptr<RandomAccessFileReader> file_reader(
new RandomAccessFileReader(std::move(file), file_path));
s = ioptions.table_factory->NewTableReader(
TableReaderOptions(ioptions, env_options, internal_comparator,
false /* skip_filters */, -1 /* level */),
std::move(file_reader), file_size, &table_reader,
false /* prefetch_index_and_filter_in_cache */);
if (!s.ok()) {
return s;
}
s = table_reader->VerifyChecksum();
return s;
}
} // namespace rocksdb } // namespace rocksdb
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE

View File

@ -20,6 +20,7 @@
#include "db/log_format.h" #include "db/log_format.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/convenience.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
@ -179,6 +180,9 @@ class CorruptionTest : public testing::Test {
} }
s = WriteStringToFile(Env::Default(), contents, fname); s = WriteStringToFile(Env::Default(), contents, fname);
ASSERT_TRUE(s.ok()) << s.ToString(); ASSERT_TRUE(s.ok()) << s.ToString();
Options options;
EnvOptions env_options;
ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname));
} }
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
@ -312,6 +316,7 @@ TEST_F(CorruptionTest, TableFile) {
Corrupt(kTableFile, 100, 1); Corrupt(kTableFile, 100, 1);
Check(99, 99); Check(99, 99);
ASSERT_NOK(dbi->VerifyChecksum());
} }
TEST_F(CorruptionTest, TableFileIndexData) { TEST_F(CorruptionTest, TableFileIndexData) {
@ -330,6 +335,7 @@ TEST_F(CorruptionTest, TableFileIndexData) {
// one full file should be readable, since only one was corrupted // one full file should be readable, since only one was corrupted
// the other file should be fully non-readable, since index was corrupted // the other file should be fully non-readable, since index was corrupted
Check(5000, 5000); Check(5000, 5000);
ASSERT_NOK(dbi->VerifyChecksum());
} }
TEST_F(CorruptionTest, MissingDescriptor) { TEST_F(CorruptionTest, MissingDescriptor) {
@ -389,10 +395,12 @@ TEST_F(CorruptionTest, CompactionInputError) {
Corrupt(kTableFile, 100, 1); Corrupt(kTableFile, 100, 1);
Check(9, 9); Check(9, 9);
ASSERT_NOK(dbi->VerifyChecksum());
// Force compactions by writing lots of values // Force compactions by writing lots of values
Build(10000); Build(10000);
Check(10000, 10000); Check(10000, 10000);
ASSERT_NOK(dbi->VerifyChecksum());
} }
TEST_F(CorruptionTest, CompactionInputErrorParanoid) { TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
@ -424,6 +432,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
CorruptTableFileAtLevel(0, 100, 1); CorruptTableFileAtLevel(0, 100, 1);
Check(9, 9); Check(9, 9);
ASSERT_NOK(dbi->VerifyChecksum());
// Write must eventually fail because of corrupted table // Write must eventually fail because of corrupted table
Status s; Status s;
@ -445,6 +454,7 @@ TEST_F(CorruptionTest, UnrelatedKeys) {
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable(); dbi->TEST_FlushMemTable();
Corrupt(kTableFile, 100, 1); Corrupt(kTableFile, 100, 1);
ASSERT_NOK(dbi->VerifyChecksum());
std::string tmp1, tmp2; std::string tmp1, tmp2;
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));

View File

@ -67,6 +67,7 @@
#include "port/port.h" #include "port/port.h"
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/merge_operator.h" #include "rocksdb/merge_operator.h"
@ -80,6 +81,7 @@
#include "table/merging_iterator.h" #include "table/merging_iterator.h"
#include "table/table_builder.h" #include "table/table_builder.h"
#include "table/two_level_iterator.h" #include "table/two_level_iterator.h"
#include "tools/sst_dump_tool_imp.h"
#include "util/auto_roll_logger.h" #include "util/auto_roll_logger.h"
#include "util/autovector.h" #include "util/autovector.h"
#include "util/build_version.h" #include "util/build_version.h"
@ -2740,6 +2742,54 @@ Status DBImpl::IngestExternalFile(
return status; return status;
} }
Status DBImpl::VerifyChecksum() {
Status s;
Options options;
EnvOptions env_options;
std::vector<ColumnFamilyData*> cfd_list;
{
InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->IsDropped() && cfd->initialized()) {
cfd->Ref();
cfd_list.push_back(cfd);
}
}
}
std::vector<SuperVersion*> sv_list;
for (auto cfd : cfd_list) {
sv_list.push_back(cfd->GetReferencedSuperVersion(&mutex_));
}
for (auto& sv : sv_list) {
VersionStorageInfo* vstorage = sv->current->storage_info();
for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
j++) {
const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd;
std::string fname = TableFileName(immutable_db_options_.db_paths,
fd.GetNumber(), fd.GetPathId());
s = rocksdb::VerifySstFileChecksum(options, env_options, fname);
}
}
if (!s.ok()) {
break;
}
}
{
InstrumentedMutexLock l(&mutex_);
for (auto sv : sv_list) {
if (sv && sv->Unref()) {
sv->Cleanup();
delete sv;
}
}
for (auto cfd : cfd_list) {
cfd->Unref();
}
}
return s;
}
void DBImpl::NotifyOnExternalFileIngested( void DBImpl::NotifyOnExternalFileIngested(
ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE

View File

@ -293,6 +293,8 @@ class DBImpl : public DB {
const std::vector<std::string>& external_files, const std::vector<std::string>& external_files,
const IngestExternalFileOptions& ingestion_options) override; const IngestExternalFileOptions& ingestion_options) override;
virtual Status VerifyChecksum() override;
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
// Similar to GetSnapshot(), but also lets the db know that this snapshot // Similar to GetSnapshot(), but also lets the db know that this snapshot

View File

@ -2235,6 +2235,10 @@ class ModelDB : public DB {
return Status::NotSupported("Not implemented."); return Status::NotSupported("Not implemented.");
} }
virtual Status VerifyChecksum() override {
return Status::NotSupported("Not implemented.");
}
using DB::GetPropertiesOfAllTables; using DB::GetPropertiesOfAllTables;
virtual Status GetPropertiesOfAllTables( virtual Status GetPropertiesOfAllTables(
ColumnFamilyHandle* column_family, ColumnFamilyHandle* column_family,

View File

@ -329,6 +329,11 @@ void CancelAllBackgroundWork(DB* db, bool wait = false);
// Snapshots before the delete might not see the data in the given range. // Snapshots before the delete might not see the data in the given range.
Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end); const Slice* begin, const Slice* end);
// Verify the checksum of file
Status VerifySstFileChecksum(const Options& options,
const EnvOptions& env_options,
const std::string& file_path);
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
} // namespace rocksdb } // namespace rocksdb

View File

@ -976,6 +976,8 @@ class DB {
return IngestExternalFile(DefaultColumnFamily(), external_files, options); return IngestExternalFile(DefaultColumnFamily(), external_files, options);
} }
virtual Status VerifyChecksum() = 0;
// AddFile() is deprecated, please use IngestExternalFile() // AddFile() is deprecated, please use IngestExternalFile()
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
ColumnFamilyHandle* column_family, ColumnFamilyHandle* column_family,

View File

@ -95,6 +95,8 @@ class StackableDB : public DB {
return db_->IngestExternalFile(column_family, external_files, options); return db_->IngestExternalFile(column_family, external_files, options);
} }
virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); }
using DB::KeyMayExist; using DB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,

View File

@ -820,7 +820,6 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep,
std::unique_ptr<InternalIterator>* iter) { std::unique_ptr<InternalIterator>* iter) {
// TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
// it is an empty block. // it is an empty block.
// TODO: we never really verify check sum for meta index block
std::unique_ptr<Block> meta; std::unique_ptr<Block> meta;
Status s = ReadBlockFromFile( Status s = ReadBlockFromFile(
rep->file.get(), rep->footer, ReadOptions(), rep->file.get(), rep->footer, ReadOptions(),
@ -1746,6 +1745,60 @@ Status BlockBasedTable::Prefetch(const Slice* const begin,
return Status::OK(); return Status::OK();
} }
Status BlockBasedTable::VerifyChecksum() {
Status s;
// Check Meta blocks
std::unique_ptr<Block> meta;
std::unique_ptr<InternalIterator> meta_iter;
s = ReadMetaBlock(rep_, &meta, &meta_iter);
if (s.ok()) {
s = VerifyChecksumInBlocks(meta_iter.get());
if (!s.ok()) {
return s;
}
} else {
return s;
}
// Check Data blocks
BlockIter iiter_on_stack;
InternalIterator* iiter = NewIndexIterator(ReadOptions(), &iiter_on_stack);
std::unique_ptr<InternalIterator> iiter_unique_ptr;
if (iiter != &iiter_on_stack) {
iiter_unique_ptr = std::unique_ptr<InternalIterator>(iiter);
}
if (!iiter->status().ok()) {
// error opening index iterator
return iiter->status();
}
s = VerifyChecksumInBlocks(iiter);
return s;
}
Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) {
Status s;
for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
s = index_iter->status();
if (!s.ok()) {
break;
}
BlockHandle handle;
Slice input = index_iter->value();
s = handle.DecodeFrom(&input);
if (!s.ok()) {
break;
}
BlockContents contents;
s = ReadBlockContents(rep_->file.get(), rep_->footer, ReadOptions(),
handle, &contents, rep_->ioptions,
false /* decompress */, Slice() /*compression dict*/,
rep_->persistent_cache_options);
if (!s.ok()) {
break;
}
}
return s;
}
bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
const Slice& key) { const Slice& key) {
std::unique_ptr<InternalIterator> iiter(NewIndexIterator(options)); std::unique_ptr<InternalIterator> iiter(NewIndexIterator(options));

View File

@ -139,6 +139,8 @@ class BlockBasedTable : public TableReader {
// convert SST file to a human readable form // convert SST file to a human readable form
Status DumpTable(WritableFile* out_file) override; Status DumpTable(WritableFile* out_file) override;
Status VerifyChecksum() override;
void Close() override; void Close() override;
~BlockBasedTable(); ~BlockBasedTable();
@ -310,6 +312,8 @@ class BlockBasedTable : public TableReader {
static Status ReadMetaBlock(Rep* rep, std::unique_ptr<Block>* meta_block, static Status ReadMetaBlock(Rep* rep, std::unique_ptr<Block>* meta_block,
std::unique_ptr<InternalIterator>* iter); std::unique_ptr<InternalIterator>* iter);
Status VerifyChecksumInBlocks(InternalIterator* index_iter);
// Create the filter from the filter block. // Create the filter from the filter block.
FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, FilterBlockReader* ReadFilter(const BlockHandle& filter_handle,
const bool is_a_filter_partition) const; const bool is_a_filter_partition) const;

View File

@ -98,6 +98,11 @@ class TableReader {
return Status::NotSupported("DumpTable() not supported"); return Status::NotSupported("DumpTable() not supported");
} }
// check whether there is corruption in this db file
virtual Status VerifyChecksum() {
return Status::NotSupported("VerifyChecksum() not supported");
}
virtual void Close() {} virtual void Close() {}
}; };

View File

@ -127,6 +127,10 @@ Status SstFileReader::NewTableReader(
std::move(file_), file_size, &table_reader_); std::move(file_), file_size, &table_reader_);
} }
Status SstFileReader::VerifyChecksum() {
return table_reader_->VerifyChecksum();
}
Status SstFileReader::DumpTable(const std::string& out_filename) { Status SstFileReader::DumpTable(const std::string& out_filename) {
unique_ptr<WritableFile> out_file; unique_ptr<WritableFile> out_file;
Env* env = Env::Default(); Env* env = Env::Default();
@ -349,10 +353,11 @@ void print_help() {
--file=<data_dir_OR_sst_file> --file=<data_dir_OR_sst_file>
Path to SST file or directory containing SST files Path to SST file or directory containing SST files
--command=check|scan|raw --command=check|scan|raw|verify
check: Iterate over entries in files but dont print anything except if an error is encounterd (default command) check: Iterate over entries in files but dont print anything except if an error is encounterd (default command)
scan: Iterate over entries in files and print them to screen scan: Iterate over entries in files and print them to screen
raw: Dump all the table contents to <file_name>_dump.txt raw: Dump all the table contents to <file_name>_dump.txt
verify: Iterate all the blocks in files verifying checksum to detect possible coruption but dont print anything except if a corruption is encountered
--output_hex --output_hex
Can be combined with scan command to print the keys and values in Hex Can be combined with scan command to print the keys and values in Hex
@ -580,6 +585,17 @@ int SSTDumpTool::Run(int argc, char** argv) {
} }
} }
if (command == "verify") {
st = reader.VerifyChecksum();
if (!st.ok()) {
fprintf(stderr, "%s is corrupted: %s\n", filename.c_str(),
st.ToString().c_str());
} else {
fprintf(stdout, "The file is ok\n");
}
continue;
}
if (show_properties || show_summary) { if (show_properties || show_summary) {
const rocksdb::TableProperties* table_properties; const rocksdb::TableProperties* table_properties;

View File

@ -30,6 +30,7 @@ class SstFileReader {
uint64_t GetReadNumber() { return read_num_; } uint64_t GetReadNumber() { return read_num_; }
TableProperties* GetInitTableProperties() { return table_properties_.get(); } TableProperties* GetInitTableProperties() { return table_properties_.get(); }
Status VerifyChecksum();
Status DumpTable(const std::string& out_filename); Status DumpTable(const std::string& out_filename);
Status getStatus() { return init_result_; } Status getStatus() { return init_result_; }