diff --git a/HISTORY.md b/HISTORY.md index 609073a1f..314d9fa1a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,7 +10,7 @@ * In best-efforts recovery, an error that is not Corruption or IOError::kNotFound or IOError::kPathNotFound will be overwritten silently. Fix this by checking all non-ok cases and return early. * When `file_checksum_gen_factory` is set to `GetFileChecksumGenCrc32cFactory()`, BackupEngine will compare the crc32c checksums of table files computed when creating a backup to the expected checksums stored in the DB manifest, and will fail `CreateNewBackup()` on mismatch (corruption). If the `file_checksum_gen_factory` is not set or set to any other customized factory, there is no checksum verification to detect if SST files in a DB are corrupt when read, copied, and independently checksummed by BackupEngine. * When a DB sets `stats_dump_period_sec > 0`, either as the initial value for DB open or as a dynamic option change, the first stats dump is staggered in the following X seconds, where X is an integer in `[0, stats_dump_period_sec)`. Subsequent stats dumps are still spaced `stats_dump_period_sec` seconds apart. - +* When the paranoid_file_checks option is true, a hash is generated of all keys and values are generated when the SST file is written, and then the values are read back in to validate the file. A corruption is signaled if the two hashes do not match. ### Bug fixes * Compressed block cache was automatically disabled with read-only DBs by mistake. Now it is fixed: compressed block cache will be in effective with read-only DB too. * Fix a bug of wrong iterator result if another thread finishes an update and a DB flush between two statement. diff --git a/db/builder.cc b/db/builder.cc index eb3ab5422..510eb36fb 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -93,6 +93,7 @@ Status BuildTable( column_family_name.empty()); // Reports the IOStats for flush for every following bytes. const size_t kReportFlushIOStatsEvery = 1048576; + uint64_t paranoid_hash = 0; Status s; IOStatus io_s; meta->fd.file_size = 0; @@ -110,7 +111,6 @@ Status BuildTable( ioptions.listeners, dbname, column_family_name, fname, job_id, reason); #endif // !ROCKSDB_LITE TableProperties tp; - if (iter->Valid() || !range_del_agg->IsEmpty()) { TableBuilder* builder; std::unique_ptr file_writer; @@ -168,6 +168,11 @@ Status BuildTable( const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); const ParsedInternalKey& ikey = c_iter.ikey(); + if (paranoid_file_checks) { + // Generate a rolling 64-bit hash of the key and values + paranoid_hash = Hash64(key.data(), key.size(), paranoid_hash); + paranoid_hash = Hash64(value.data(), value.size(), paranoid_hash); + } builder->Add(key, value); meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); @@ -256,9 +261,17 @@ Status BuildTable( /*allow_unprepared_value*/ false)); s = it->status(); if (s.ok() && paranoid_file_checks) { + uint64_t check_hash = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { + // Generate a rolling 64-bit hash of the key and values + check_hash = Hash64(it->key().data(), it->key().size(), check_hash); + check_hash = + Hash64(it->value().data(), it->value().size(), check_hash); } s = it->status(); + if (s.ok() && check_hash != paranoid_hash) { + s = Status::Corruption("Paraniod checksums do not match"); + } } } } diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 9eaa76f08..13654ef36 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/compaction/compaction_job.h" + #include #include #include @@ -19,7 +21,6 @@ #include #include "db/builder.h" -#include "db/compaction/compaction_job.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -54,6 +55,7 @@ #include "table/table_builder.h" #include "test_util/sync_point.h" #include "util/coding.h" +#include "util/hash.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" @@ -123,6 +125,7 @@ struct CompactionJob::SubcompactionState { struct Output { FileMetaData meta; bool finished; + uint64_t paranoid_hash; std::shared_ptr table_properties; }; @@ -132,7 +135,7 @@ struct CompactionJob::SubcompactionState { std::unique_ptr builder; Output* current_output() { if (outputs.empty()) { - // This subcompaction's outptut could be empty if compaction was aborted + // This subcompaction's output could be empty if compaction was aborted // before this subcompaction had a chance to generate any output files. // When subcompactions are executed sequentially this is more likely and // will be particulalry likely for the later subcompactions to be empty. @@ -202,6 +205,21 @@ struct CompactionJob::SubcompactionState { SubcompactionState& operator=(const SubcompactionState&) = delete; + // Adds the key and value to the builder + // If paranoid is true, adds the key-value to the paranoid hash + void AddToBuilder(const Slice& key, const Slice& value, bool paranoid) { + auto curr = current_output(); + assert(builder != nullptr); + assert(curr != nullptr); + if (paranoid) { + // Generate a rolling 64-bit hash of the key and values + curr->paranoid_hash = Hash64(key.data(), key.size(), curr->paranoid_hash); + curr->paranoid_hash = + Hash64(value.data(), value.size(), curr->paranoid_hash); + } + builder->Add(key, value); + } + // Returns true iff we should stop building the current output // before processing "internal_key". bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) { @@ -635,20 +653,20 @@ Status CompactionJob::Run() { } if (status.ok()) { thread_pool.clear(); - std::vector files_meta; + std::vector files_output; for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { - files_meta.emplace_back(&output.meta); + files_output.emplace_back(&output); } } ColumnFamilyData* cfd = compact_->compaction->column_family_data(); auto prefix_extractor = compact_->compaction->mutable_cf_options()->prefix_extractor.get(); - std::atomic next_file_meta_idx(0); + std::atomic next_file_idx(0); auto verify_table = [&](Status& output_status) { while (true) { - size_t file_idx = next_file_meta_idx.fetch_add(1); - if (file_idx >= files_meta.size()) { + size_t file_idx = next_file_idx.fetch_add(1); + if (file_idx >= files_output.size()) { break; } // Verify that the table is usable @@ -659,7 +677,8 @@ Status CompactionJob::Run() { // to cache it here for further user reads InternalIterator* iter = cfd->table_cache()->NewIterator( ReadOptions(), file_options_, cfd->internal_comparator(), - *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + files_output[file_idx]->meta, /*range_del_agg=*/nullptr, + prefix_extractor, /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), @@ -673,8 +692,16 @@ Status CompactionJob::Run() { auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {} + uint64_t hash = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + // Generate a rolling 64-bit hash of the key and values, using the + hash = Hash64(iter->key().data(), iter->key().size(), hash); + hash = Hash64(iter->value().data(), iter->value().size(), hash); + } s = iter->status(); + if (s.ok() && hash != files_output[file_idx]->paranoid_hash) { + s = Status::Corruption("Paraniod checksums do not match"); + } } delete iter; @@ -948,9 +975,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { break; } } - assert(sub_compact->builder != nullptr); - assert(sub_compact->current_output() != nullptr); - sub_compact->builder->Add(key, value); + sub_compact->AddToBuilder(key, value, paranoid_file_checks_); + sub_compact->current_output_file_size = sub_compact->builder->EstimatedFileSize(); const ParsedInternalKey& ikey = c_iter->ikey(); @@ -1246,7 +1272,8 @@ Status CompactionJob::FinishCompactionOutputFile( auto kv = tombstone.Serialize(); assert(lower_bound == nullptr || ucmp->Compare(*lower_bound, kv.second) < 0); - sub_compact->builder->Add(kv.first.Encode(), kv.second); + sub_compact->AddToBuilder(kv.first.Encode(), kv.second, + paranoid_file_checks_); InternalKey smallest_candidate = std::move(kv.first); if (lower_bound != nullptr && ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) { @@ -1543,6 +1570,7 @@ Status CompactionJob::OpenCompactionOutputFile( out.meta.oldest_ancester_time = oldest_ancester_time; out.meta.file_creation_time = current_time; out.finished = false; + out.paranoid_hash = 0; sub_compact->outputs.push_back(out); } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index cae3ad728..f1f2e471d 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -30,6 +30,7 @@ #include "rocksdb/write_batch.h" #include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" +#include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/random.h" @@ -560,6 +561,59 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) { } } +static const auto& corruption_modes = {mock::MockTableFactory::kCorruptNone, + mock::MockTableFactory::kCorruptKey, + mock::MockTableFactory::kCorruptValue}; + +TEST_F(CorruptionTest, ParaniodFileChecksOnFlush) { + Options options; + options.paranoid_file_checks = true; + options.create_if_missing = true; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + s = DestroyDB(dbname_, options); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + mock->SetCorruptionMode(mode); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + Build(10); + s = db_->Flush(FlushOptions()); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + +TEST_F(CorruptionTest, ParaniodFileChecksOnCompact) { + Options options; + options.paranoid_file_checks = true; + options.create_if_missing = true; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + s = DestroyDB(dbname_, options); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + Build(100, 2); + // ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + mock->SetCorruptionMode(mode); + s = dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 574e9390a..5c3a19a34 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -643,6 +643,8 @@ struct AdvancedColumnFamilyOptions { bool optimize_filters_for_hits = false; // After writing every SST file, reopen it and read all the keys. + // Checks the hash of all of the keys and values written versus the + // keys in the file and signals a corruption if they do not match // // Default: false // diff --git a/table/mock_table.cc b/table/mock_table.cc index 30ec9a671..eb198a44b 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -27,6 +27,154 @@ stl_wrappers::KVMap MakeMockFile( return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); } +class MockTableReader : public TableReader { + public: + explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} + + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override {} + + std::shared_ptr GetTableProperties() const override; + + ~MockTableReader() {} + + private: + const stl_wrappers::KVMap& table_; +}; + +class MockTableIterator : public InternalIterator { + public: + explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) { + itr_ = table_.end(); + } + + bool Valid() const override { return itr_ != table_.end(); } + + void SeekToFirst() override { itr_ = table_.begin(); } + + void SeekToLast() override { + itr_ = table_.end(); + --itr_; + } + + void Seek(const Slice& target) override { + std::string str_target(target.data(), target.size()); + itr_ = table_.lower_bound(str_target); + } + + void SeekForPrev(const Slice& target) override { + std::string str_target(target.data(), target.size()); + itr_ = table_.upper_bound(str_target); + Prev(); + } + + void Next() override { ++itr_; } + + void Prev() override { + if (itr_ == table_.begin()) { + itr_ = table_.end(); + } else { + --itr_; + } + } + + Slice key() const override { return Slice(itr_->first); } + + Slice value() const override { return Slice(itr_->second); } + + Status status() const override { return Status::OK(); } + + private: + const stl_wrappers::KVMap& table_; + stl_wrappers::KVMap::const_iterator itr_; +}; + +class MockTableBuilder : public TableBuilder { + public: + MockTableBuilder(uint32_t id, MockTableFileSystem* file_system, + MockTableFactory::MockCorruptionMode corrupt_mode = + MockTableFactory::kCorruptNone) + : id_(id), file_system_(file_system), corrupt_mode_(corrupt_mode) { + table_ = MakeMockFile({}); + } + + // REQUIRES: Either Finish() or Abandon() has been called. + ~MockTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override { + if (corrupt_mode_ == MockTableFactory::kCorruptValue) { + // Corrupt the value + table_.insert({key.ToString(), value.ToString() + " "}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else if (corrupt_mode_ == MockTableFactory::kCorruptKey) { + table_.insert({key.ToString() + " ", value.ToString()}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else { + table_.insert({key.ToString(), value.ToString()}); + } + } + + // Return non-ok iff some error has been detected. + Status status() const override { return Status::OK(); } + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return IOStatus::OK(); } + + Status Finish() override { + MutexLock lock_guard(&file_system_->mutex); + file_system_->files.insert({id_, table_}); + return Status::OK(); + } + + void Abandon() override {} + + uint64_t NumEntries() const override { return table_.size(); } + + uint64_t FileSize() const override { return table_.size(); } + + TableProperties GetTableProperties() const override { + return TableProperties(); + } + + // Get file checksum + std::string GetFileChecksum() const override { return kUnknownFileChecksum; } + // Get file checksum function name + const char* GetFileChecksumFuncName() const override { + return kUnknownFileChecksumFuncName; + } + + private: + uint32_t id_; + MockTableFileSystem* file_system_; + int corrupt_mode_; + stl_wrappers::KVMap table_; +}; + InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, @@ -58,7 +206,8 @@ std::shared_ptr MockTableReader::GetTableProperties() return std::shared_ptr(new TableProperties()); } -MockTableFactory::MockTableFactory() : next_id_(1) {} +MockTableFactory::MockTableFactory() + : next_id_(1), corrupt_mode_(MockTableFactory::kCorruptNone) {} Status MockTableFactory::NewTableReader( const ReadOptions& /*ro*/, @@ -85,7 +234,7 @@ TableBuilder* MockTableFactory::NewTableBuilder( uint32_t /*column_family_id*/, WritableFileWriter* file) const { uint32_t id = GetAndWriteNextID(file); - return new MockTableBuilder(id, &file_system_); + return new MockTableBuilder(id, &file_system_, corrupt_mode_); } Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, diff --git a/table/mock_table.h b/table/mock_table.h index 7d89fd382..3484dbd2d 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -36,144 +36,14 @@ struct MockTableFileSystem { std::map files; }; -class MockTableReader : public TableReader { - public: - explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} - - InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, - TableReaderCaller caller, - size_t compaction_readahead_size = 0, - bool allow_unprepared_value = false) override; - - Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context, const SliceTransform* prefix_extractor, - bool skip_filters = false) override; - - uint64_t ApproximateOffsetOf(const Slice& /*key*/, - TableReaderCaller /*caller*/) override { - return 0; - } - - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, - TableReaderCaller /*caller*/) override { - return 0; - } - - size_t ApproximateMemoryUsage() const override { return 0; } - - void SetupForCompaction() override {} - - std::shared_ptr GetTableProperties() const override; - - ~MockTableReader() {} - - private: - const stl_wrappers::KVMap& table_; -}; - -class MockTableIterator : public InternalIterator { - public: - explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) { - itr_ = table_.end(); - } - - bool Valid() const override { return itr_ != table_.end(); } - - void SeekToFirst() override { itr_ = table_.begin(); } - - void SeekToLast() override { - itr_ = table_.end(); - --itr_; - } - - void Seek(const Slice& target) override { - std::string str_target(target.data(), target.size()); - itr_ = table_.lower_bound(str_target); - } - - void SeekForPrev(const Slice& target) override { - std::string str_target(target.data(), target.size()); - itr_ = table_.upper_bound(str_target); - Prev(); - } - - void Next() override { ++itr_; } - - void Prev() override { - if (itr_ == table_.begin()) { - itr_ = table_.end(); - } else { - --itr_; - } - } - - Slice key() const override { return Slice(itr_->first); } - - Slice value() const override { return Slice(itr_->second); } - - Status status() const override { return Status::OK(); } - - private: - const stl_wrappers::KVMap& table_; - stl_wrappers::KVMap::const_iterator itr_; -}; - -class MockTableBuilder : public TableBuilder { - public: - MockTableBuilder(uint32_t id, MockTableFileSystem* file_system) - : id_(id), file_system_(file_system) { - table_ = MakeMockFile({}); - } - - // REQUIRES: Either Finish() or Abandon() has been called. - ~MockTableBuilder() {} - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override { - table_.insert({key.ToString(), value.ToString()}); - } - - // Return non-ok iff some error has been detected. - Status status() const override { return Status::OK(); } - - // Return non-ok iff some error happens during IO. - IOStatus io_status() const override { return IOStatus::OK(); } - - Status Finish() override { - MutexLock lock_guard(&file_system_->mutex); - file_system_->files.insert({id_, table_}); - return Status::OK(); - } - - void Abandon() override {} - - uint64_t NumEntries() const override { return table_.size(); } - - uint64_t FileSize() const override { return table_.size(); } - - TableProperties GetTableProperties() const override { - return TableProperties(); - } - - // Get file checksum - std::string GetFileChecksum() const override { return kUnknownFileChecksum; } - // Get file checksum function name - const char* GetFileChecksumFuncName() const override { - return kUnknownFileChecksumFuncName; - } - - private: - uint32_t id_; - MockTableFileSystem* file_system_; - stl_wrappers::KVMap table_; -}; - class MockTableFactory : public TableFactory { public: + enum MockCorruptionMode { + kCorruptNone, + kCorruptKey, + kCorruptValue, + }; + MockTableFactory(); const char* Name() const override { return "MockTable"; } using TableFactory::NewTableReader; @@ -202,6 +72,7 @@ class MockTableFactory : public TableFactory { return std::string(); } + void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; } // This function will assert that only a single file exists and that the // contents are equal to file_contents void AssertSingleFile(const stl_wrappers::KVMap& file_contents); @@ -213,6 +84,7 @@ class MockTableFactory : public TableFactory { mutable MockTableFileSystem file_system_; mutable std::atomic next_id_; + MockCorruptionMode corrupt_mode_; }; } // namespace mock