diff --git a/HISTORY.md b/HISTORY.md
index bb248d371..fc62add11 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -18,6 +18,7 @@
 * Added more fields to FilterBuildingContext with LSM details, for custom filter policies that vary behavior based on where they are in the LSM-tree.
 * Added DB::Properties::kBlockCacheEntryStats for querying statistics on what percentage of block cache is used by various kinds of blocks, etc. using DB::GetProperty and DB::GetMapProperty. The same information is now dumped to info LOG periodically according to `stats_dump_period_sec`.
 * Add an experimental Remote Compaction feature, which allows the user to run Compaction on a different host or process. The feature is still under development, currently only works on some basic use cases. The interface will be changed without backward/forward compatibility support.
+* During memtable flush, RocksDB now validates the total number of entries read against the number of entries the memtables report were inserted. If `flush_verify_memtable_count = true` (default), a mismatch fails the flush with `Status::Corruption`; otherwise it is only logged to the info LOG.

 ### Performance Improvements
 * BlockPrefetcher is used by iterators to prefetch data if they anticipate more data to be used in future. It is enabled implicitly by rocksdb. Added change to take in account read pattern if reads are sequential. This would disable prefetching for random reads in MultiGet and iterators as readahead_size is increased exponential doing large prefetches.
diff --git a/db/builder.cc b/db/builder.cc
index d4ea5bbf7..6314ea589 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -69,7 +69,7 @@ Status BuildTable(
     int job_id, const Env::IOPriority io_priority,
     TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
     const std::string* full_history_ts_low,
-    BlobFileCompletionCallback* blob_callback) {
+    BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries) {
   assert((tboptions.column_family_id ==
           TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
          tboptions.column_family_name.empty());
@@ -88,7 +88,10 @@ Status BuildTable(
   std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
       new CompactionRangeDelAggregator(&tboptions.internal_comparator,
                                        snapshots));
+  uint64_t num_unfragmented_tombstones = 0;
   for (auto& range_del_iter : range_del_iters) {
+    num_unfragmented_tombstones +=
+        range_del_iter->num_unfragmented_tombstones();
     range_del_agg->AddTombstones(std::move(range_del_iter));
   }

@@ -231,6 +234,10 @@ Status BuildTable(
     TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
     const bool empty = builder->IsEmpty();
+    if (num_input_entries != nullptr) {
+      *num_input_entries =
+          c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
+    }
     if (!s.ok() || empty) {
       builder->Abandon();
     } else {
diff --git a/db/builder.h b/db/builder.h
index 529d618d8..879fb2bd8 100644
--- a/db/builder.h
+++ b/db/builder.h
@@ -65,6 +65,7 @@ extern Status BuildTable(
     TableProperties* table_properties = nullptr,
     Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
     const std::string* full_history_ts_low = nullptr,
-    BlobFileCompletionCallback* blob_callback = nullptr);
+    BlobFileCompletionCallback* blob_callback = nullptr,
+    uint64_t* num_input_entries = nullptr);

 }  // namespace ROCKSDB_NAMESPACE
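Two details of the builder change are worth calling out: range tombstones never pass through the compaction iterator (they are moved into the range-del aggregator before iteration starts), so they must be tallied separately, and the new `num_input_entries` out-parameter defaults to `nullptr` so existing callers are unaffected. Below is a minimal sketch of the counting arithmetic with invented names (`RangeDelIter`, `TotalInputEntries`); in the patch itself the two terms are `c_iter.num_input_entry_scanned()` and the summed `num_unfragmented_tombstones()`.

```cpp
#include <cstdint>
#include <memory>
#include <vector>

// Invented stand-in for FragmentedRangeTombstoneIterator in this sketch.
struct RangeDelIter {
  explicit RangeDelIter(uint64_t n) : n_(n) {}
  uint64_t num_unfragmented_tombstones() const { return n_; }
  uint64_t n_;
};

uint64_t TotalInputEntries(
    uint64_t num_point_entries_scanned,
    const std::vector<std::unique_ptr<RangeDelIter>>& range_del_iters) {
  // Range tombstones are consumed by the range-del aggregator, not by the
  // compaction iterator, so they are tallied separately and added to the
  // point entries the iterator scanned.
  uint64_t num_unfragmented_tombstones = 0;
  for (const auto& iter : range_del_iters) {
    num_unfragmented_tombstones += iter->num_unfragmented_tombstones();
  }
  return num_point_entries_scanned + num_unfragmented_tombstones;
}
```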
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index 426c770df..11db69fb4 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -33,7 +33,6 @@
   (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq)))

 namespace ROCKSDB_NAMESPACE {
-
 CompactionIterator::CompactionIterator(
     InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
     SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
@@ -73,7 +72,10 @@ CompactionIterator::CompactionIterator(
     const std::atomic<int>* manual_compaction_paused,
     const std::shared_ptr<Logger> info_log,
     const std::string* full_history_ts_low)
-    : input_(input),
+    : input_(
+          input, cmp,
+          compaction ==
+              nullptr),  // Currently we only need to count entries in flush.
       cmp_(cmp),
       merge_helper_(merge_helper),
       snapshots_(snapshots),
@@ -130,13 +132,13 @@ CompactionIterator::CompactionIterator(
   assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
          timestamp_size_ == full_history_ts_low_->size());
 #endif
-  input_->SetPinnedItersMgr(&pinned_iters_mgr_);
+  input_.SetPinnedItersMgr(&pinned_iters_mgr_);
   TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
 }

 CompactionIterator::~CompactionIterator() {
   // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime
-  input_->SetPinnedItersMgr(nullptr);
+  input_.SetPinnedItersMgr(nullptr);
 }

 void CompactionIterator::ResetRecordCounts() {
@@ -189,7 +191,7 @@ void CompactionIterator::Next() {
     // Only advance the input iterator if there is no merge output and the
     // iterator is not already at the next record.
     if (!at_next_) {
-      input_->Next();
+      AdvanceInputIter();
     }
     NextFromInput();
   }
@@ -356,10 +358,10 @@ void CompactionIterator::NextFromInput() {
   at_next_ = false;
   valid_ = false;

-  while (!valid_ && input_->Valid() && !IsPausingManualCompaction() &&
+  while (!valid_ && input_.Valid() && !IsPausingManualCompaction() &&
          !IsShuttingDown()) {
-    key_ = input_->key();
-    value_ = input_->value();
+    key_ = input_.key();
+    value_ = input_.value();
     iter_stats_.num_input_records++;

     Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
@@ -559,12 +561,12 @@ void CompactionIterator::NextFromInput() {
       // The easiest way to process a SingleDelete during iteration is to peek
       // ahead at the next key.
       ParsedInternalKey next_ikey;
-      input_->Next();
+      AdvanceInputIter();

       // Check whether the next key exists, is not corrupt, and is the same key
       // as the single delete.
-      if (input_->Valid() &&
-          ParseInternalKey(input_->key(), &next_ikey, allow_data_in_errors_)
+      if (input_.Valid() &&
+          ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
              .ok() &&
          cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
        // Check whether the next key belongs to the same snapshot as the
@@ -578,7 +580,7 @@ void CompactionIterator::NextFromInput() {
           // to handle the second SingleDelete

           // First SingleDelete has been skipped since we already called
-          // input_->Next().
+          // input_.Next().
           ++iter_stats_.num_record_drop_obsolete;
           ++iter_stats_.num_single_del_mismatch;
         } else if (has_outputted_key_ ||
@@ -600,9 +602,9 @@ void CompactionIterator::NextFromInput() {
           ++iter_stats_.num_record_drop_hidden;
           ++iter_stats_.num_record_drop_obsolete;

-          // Already called input_->Next() once. Call it a second time to
+          // Already called input_.Next() once. Call it a second time to
           // skip past the second key.
-          input_->Next();
+          AdvanceInputIter();
         } else {
           // Found a matching value, but we cannot drop both keys since
           // there is an earlier snapshot and we need to leave behind a record
@@ -670,7 +672,7 @@ void CompactionIterator::NextFromInput() {
       }
       ++iter_stats_.num_record_drop_hidden;  // rule (A)
-      input_->Next();
+      AdvanceInputIter();
     } else if (compaction_ != nullptr &&
                (ikey_.type == kTypeDeletion ||
                 (ikey_.type == kTypeDeletionWithTimestamp &&
@@ -706,7 +708,7 @@ void CompactionIterator::NextFromInput() {
       if (!bottommost_level_) {
         ++iter_stats_.num_optimized_del_drop_obsolete;
       }
-      input_->Next();
+      AdvanceInputIter();
     } else if ((ikey_.type == kTypeDeletion ||
                 (ikey_.type == kTypeDeletionWithTimestamp &&
                  cmp_with_history_ts_low_ < 0)) &&
@@ -717,7 +719,7 @@ void CompactionIterator::NextFromInput() {
       assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
                                  ikey_.user_key, &level_ptrs_));
       ParsedInternalKey next_ikey;
-      input_->Next();
+      AdvanceInputIter();
       // Skip over all versions of this key that happen to occur in the same
       // snapshot range as the delete.
       //
@@ -725,18 +727,18 @@ void CompactionIterator::NextFromInput() {
       // considered to have a different user key unless the timestamp is older
      // than *full_history_ts_low_.
       while (!IsPausingManualCompaction() && !IsShuttingDown() &&
-             input_->Valid() &&
-             (ParseInternalKey(input_->key(), &next_ikey, allow_data_in_errors_)
+             input_.Valid() &&
+             (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
                   .ok()) &&
             cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
             (prev_snapshot == 0 ||
              DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) {
-        input_->Next();
+        AdvanceInputIter();
       }
       // If you find you still need to output a row with this key, we need to output the
       // delete too
-      if (input_->Valid() &&
-          (ParseInternalKey(input_->key(), &next_ikey, allow_data_in_errors_)
+      if (input_.Valid() &&
+          (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
               .ok()) &&
          cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
        valid_ = true;
@@ -755,7 +757,7 @@ void CompactionIterator::NextFromInput() {
       // We encapsulate the merge related state machine in a different
       // object to minimize change to the existing flow.
       Status s =
-          merge_helper_->MergeUntil(input_, range_del_agg_, prev_snapshot,
+          merge_helper_->MergeUntil(&input_, range_del_agg_, prev_snapshot,
                                     bottommost_level_, allow_data_in_errors_);
       merge_out_iter_.SeekToFirst();

@@ -799,14 +801,14 @@ void CompactionIterator::NextFromInput() {
       if (should_delete) {
         ++iter_stats_.num_record_drop_hidden;
         ++iter_stats_.num_record_drop_range_del;
-        input_->Next();
+        AdvanceInputIter();
       } else {
         valid_ = true;
       }
     }

     if (need_skip) {
-      input_->Seek(skip_until);
+      SkipUntil(skip_until);
     }
   }
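The mechanical part of this change is that every direct `input_->Next()` and `input_->Seek()` call site is funneled through `AdvanceInputIter()` and `SkipUntil()`, and `MergeUntil` now receives `&input_`, so entries consumed by the merge helper are counted as well. Any call site that bypassed the wrapper would silently undercount. A minimal sketch of the underlying decorator idea, with invented names (`IntIter`, `CountingIter`):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Invented stand-in for an internal iterator.
class IntIter {
 public:
  explicit IntIter(const std::vector<int>& v) : v_(v) {}
  bool Valid() const { return pos_ < v_.size(); }
  void Next() { ++pos_; }
  int value() const { return v_[pos_]; }

 private:
  const std::vector<int>& v_;
  size_t pos_ = 0;
};

// Decorator that counts advancement; all consumers must go through it.
class CountingIter {
 public:
  explicit CountingIter(IntIter* inner) : inner_(inner) {}
  bool Valid() const { return inner_->Valid(); }
  void Next() {
    ++num_itered_;  // the single choke point where counting happens
    inner_->Next();
  }
  int value() const { return inner_->value(); }
  uint64_t num_itered() const { return num_itered_; }

 private:
  IntIter* inner_;  // not owned
  uint64_t num_itered_ = 0;
};
```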
diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index 6d968fec5..51749ce80 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -24,6 +24,49 @@
 namespace ROCKSDB_NAMESPACE {

 class BlobFileBuilder;

+// A wrapper around an internal iterator that counts how many
+// entries have been iterated through.
+class SequenceIterWrapper : public InternalIterator {
+ public:
+  SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
+                      bool need_count_entries)
+      : cmp_(cmp), inner_iter_(iter), need_count_entries_(need_count_entries) {}
+  bool Valid() const override { return inner_iter_->Valid(); }
+  Status status() const override { return inner_iter_->status(); }
+  void Next() override {
+    num_itered_++;
+    inner_iter_->Next();
+  }
+  void Seek(const Slice& target) override {
+    if (!need_count_entries_) {
+      inner_iter_->Seek(target);
+    } else {
+      // For flush cases, we need to count the total number of entries, so we
+      // use Next() rather than Seek().
+      while (inner_iter_->Valid() &&
+             cmp_->Compare(inner_iter_->key(), target) < 0) {
+        Next();
+      }
+    }
+  }
+  Slice key() const override { return inner_iter_->key(); }
+  Slice value() const override { return inner_iter_->value(); }
+
+  // Unused InternalIterator methods
+  void SeekToFirst() override { assert(false); }
+  void Prev() override { assert(false); }
+  void SeekForPrev(const Slice& /* target */) override { assert(false); }
+  void SeekToLast() override { assert(false); }
+
+  uint64_t num_itered() const { return num_itered_; }
+
+ private:
+  const Comparator* cmp_;         // not owned
+  InternalIterator* inner_iter_;  // not owned
+  uint64_t num_itered_ = 0;
+  bool need_count_entries_;
+};
+
 class CompactionIterator {
  public:
   // A wrapper around Compaction. Has a much smaller interface, only what
@@ -162,6 +205,7 @@ class CompactionIterator {
   bool Valid() const { return valid_; }
   const Slice& user_key() const { return current_user_key_; }
   const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+  uint64_t num_input_entry_scanned() const { return input_.num_itered(); }

  private:
   // Processes the input stream to find the next output
@@ -234,7 +278,7 @@ class CompactionIterator {
   static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber(
       const CompactionProxy* compaction);

-  InternalIterator* input_;
+  SequenceIterWrapper input_;
   const Comparator* cmp_;
   MergeHelper* merge_helper_;
   const std::vector<SequenceNumber>* snapshots_;
@@ -342,6 +386,10 @@ class CompactionIterator {

   const int level_;

+  void AdvanceInputIter() { input_.Next(); }
+
+  void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
+
   bool IsShuttingDown() {
     // This is a best-effort facility, so memory_order_relaxed is sufficient.
     return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
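The interesting method here is `Seek`: a real seek jumps over entries without visiting them, which would undercount, so in counting mode (flush, where `need_count_entries` is true) it degrades to a `Next()` loop. Memtables are small, so the linear walk is acceptable; compactions keep the real `Seek`. A standalone sketch of that counted-seek behavior, with invented names (`KeyIter`, `CountedSeek`):

```cpp
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Invented stand-in for an internal iterator over sorted keys.
class KeyIter {
 public:
  explicit KeyIter(const std::vector<std::string>& keys) : keys_(keys) {}
  bool Valid() const { return pos_ < keys_.size(); }
  void Next() { ++pos_; }
  const std::string& key() const { return keys_[pos_]; }

 private:
  const std::vector<std::string>& keys_;
  size_t pos_ = 0;
};

// Steps to the first key >= target and returns how many entries were
// stepped over; every skipped entry is still tallied, mirroring
// SequenceIterWrapper::Seek in counting mode.
uint64_t CountedSeek(KeyIter& it, const std::string& target) {
  uint64_t stepped = 0;
  while (it.Valid() && it.key() < target) {
    it.Next();
    ++stepped;
  }
  return stepped;
}
```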
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 8a3724307..5f87955e0 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -4988,7 +4988,7 @@ TEST_F(DBTest2, SameSmallestInSameLevel) {
   ASSERT_OK(Put("key", "2"));
   ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
   ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
-  Flush();
+  ASSERT_OK(Flush());
   CompactRangeOptions cro;
   cro.change_level = true;
   cro.target_level = 2;
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 92787e728..b55d971f6 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -166,7 +166,6 @@ void FlushJob::RecordFlushIOStats() {
       ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
   IOSTATS_RESET(bytes_written);
 }
-
 void FlushJob::PickMemTable() {
   db_mutex_->AssertHeld();
   assert(!pick_memtable_called);
@@ -403,6 +402,7 @@ Status FlushJob::WriteLevel0Table() {
                               ? current_time
                               : meta_.oldest_ancester_time;

+    uint64_t num_input_entries = 0;
     IOStatus io_s;
     const std::string* const full_history_ts_low =
         (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_;
@@ -420,10 +420,22 @@ Status FlushJob::WriteLevel0Table() {
         earliest_write_conflict_snapshot_, snapshot_checker_,
         mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
         &io_s, io_tracer_, event_logger_, job_context_->job_id, Env::IO_HIGH,
-        &table_properties_, write_hint, full_history_ts_low, blob_callback_);
+        &table_properties_, write_hint, full_history_ts_low, blob_callback_,
+        &num_input_entries);
     if (!io_s.ok()) {
       io_status_ = io_s;
     }
+    if (num_input_entries != total_num_entries && s.ok()) {
+      std::string msg = "Expected " + ToString(total_num_entries) +
+                        " entries in memtables, but read " +
+                        ToString(num_input_entries);
+      ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
+                     cfd_->GetName().c_str(), job_context_->job_id,
+                     msg.c_str());
+      if (db_options_.flush_verify_memtable_count) {
+        s = Status::Corruption(msg);
+      }
+    }
     LogFlush(db_options_.info_log);
   }
   ROCKS_LOG_INFO(db_options_.info_log,
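Stripped of the RocksDB plumbing, the validation in `WriteLevel0Table` reduces to comparing two counters: `total_num_entries`, accumulated earlier in this function from the memtables' own insert counters, and `num_input_entries`, reported back by `BuildTable`. A minimal sketch of that check and its message, using only the logic visible in the hunk above:

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Returns an empty string when the counts agree, otherwise the message that
// is logged and, when flush_verify_memtable_count is set, becomes the text
// of the Status::Corruption that fails the flush.
std::string CheckFlushEntryCount(uint64_t total_num_entries,
                                 uint64_t num_input_entries) {
  if (num_input_entries == total_num_entries) {
    return "";
  }
  return "Expected " + std::to_string(total_num_entries) +
         " entries in memtables, but read " +
         std::to_string(num_input_entries);
}

int main() {
  // Prints: Expected 100 entries in memtables, but read 99
  std::cout << CheckFlushEntryCount(100, 99) << "\n";
  return 0;
}
```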
diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc
index 7721a78c7..e9f353d58 100644
--- a/db/range_tombstone_fragmenter.cc
+++ b/db/range_tombstone_fragmenter.cc
@@ -25,12 +25,12 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
     return;
   }
   bool is_sorted = true;
-  int num_tombstones = 0;
   InternalKey pinned_last_start_key;
   Slice last_start_key;
+  num_unfragmented_tombstones_ = 0;
   for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
-       unfragmented_tombstones->Next(), num_tombstones++) {
-    if (num_tombstones > 0 &&
+       unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) {
+    if (num_unfragmented_tombstones_ > 0 &&
         icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
       is_sorted = false;
       break;
@@ -50,8 +50,8 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(

   // Sort the tombstones before fragmenting them.
   std::vector<std::string> keys, values;
-  keys.reserve(num_tombstones);
-  values.reserve(num_tombstones);
+  keys.reserve(num_unfragmented_tombstones_);
+  values.reserve(num_unfragmented_tombstones_);
   for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
        unfragmented_tombstones->Next()) {
     keys.emplace_back(unfragmented_tombstones->key().data(),
diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h
index 63ec24e64..d0d07e4e2 100644
--- a/db/range_tombstone_fragmenter.h
+++ b/db/range_tombstone_fragmenter.h
@@ -68,6 +68,10 @@ struct FragmentedRangeTombstoneList {
   // number in [lower, upper].
   bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const;

+  uint64_t num_unfragmented_tombstones() const {
+    return num_unfragmented_tombstones_;
+  }
+
  private:
   // Given an ordered range tombstone iterator unfragmented_tombstones,
   // "fragment" the tombstones into non-overlapping pieces, and store them in
@@ -82,6 +86,7 @@ struct FragmentedRangeTombstoneList {
   std::set<SequenceNumber> seq_set_;
   std::list<std::string> pinned_slices_;
   PinnedIteratorsManager pinned_iters_mgr_;
+  uint64_t num_unfragmented_tombstones_;
 };

 // FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
@@ -180,6 +185,10 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
   SequenceNumber upper_bound() const { return upper_bound_; }
   SequenceNumber lower_bound() const { return lower_bound_; }

+  uint64_t num_unfragmented_tombstones() const {
+    return tombstones_->num_unfragmented_tombstones();
+  }
+
  private:
   using RangeTombstoneStack =
       FragmentedRangeTombstoneList::RangeTombstoneStack;
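Note which count the new accessor exposes: the number of tombstones *before* fragmentation. Overlapping tombstones are split into more pieces than were inserted, so the fragmented count cannot be compared against memtable insert counters. An illustrative sketch (invented layout, plain strings as keys, no sequence numbers) where two inserted tombstones become three fragments:

```cpp
#include <cstdint>
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <utility>
#include <vector>

using Tombstone = std::pair<std::string, std::string>;  // [start, end)

int main() {
  // Two overlapping tombstones, as a memtable might hold after two
  // DeleteRange() calls.
  std::vector<Tombstone> unfragmented = {{"a", "d"}, {"b", "e"}};

  // Fragment boundaries are the distinct endpoints; consecutive boundaries
  // delimit candidate fragments.
  std::set<std::string> bounds;
  for (const auto& t : unfragmented) {
    bounds.insert(t.first);
    bounds.insert(t.second);
  }

  uint64_t num_fragments = 0;
  for (auto it = bounds.begin(); std::next(it) != bounds.end(); ++it) {
    const std::string& lo = *it;
    const std::string& hi = *std::next(it);
    for (const auto& t : unfragmented) {
      if (t.first <= lo && hi <= t.second) {  // some tombstone covers [lo, hi)
        ++num_fragments;
        break;
      }
    }
  }

  // Prints "unfragmented: 2, fragments: 3": the two counts diverge, which is
  // why the flush validation must use the pre-fragmentation count.
  std::cout << "unfragmented: " << unfragmented.size()
            << ", fragments: " << num_fragments << "\n";
  return 0;
}
```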
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index e6e212098..d8cbc1156 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -430,6 +430,13 @@ struct DBOptions {
   // Default: true
   bool paranoid_checks = true;

+  // If true, during memtable flush, RocksDB will validate the total number
+  // of entries read during the flush against the number of entries inserted
+  // into the memtables being flushed. The option exists so the new
+  // validation can be turned off in case the feature has a bug.
+  // Default: true
+  bool flush_verify_memtable_count = true;
+
   // If true, the log numbers and sizes of the synced WALs are tracked
   // in MANIFEST, then during DB recovery, if a synced WAL is missing
   // from disk, or the WAL's size does not match the recorded size in
diff --git a/options/db_options.cc b/options/db_options.cc
index b08402599..28d17e151 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -200,6 +200,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {offsetof(struct ImmutableDBOptions, paranoid_checks),
          OptionType::kBoolean, OptionVerificationType::kNormal,
          OptionTypeFlags::kNone}},
+        {"flush_verify_memtable_count",
+         {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
        {"track_and_verify_wals_in_manifest",
         {offsetof(struct ImmutableDBOptions,
                   track_and_verify_wals_in_manifest),
@@ -503,6 +507,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       create_missing_column_families(options.create_missing_column_families),
       error_if_exists(options.error_if_exists),
       paranoid_checks(options.paranoid_checks),
+      flush_verify_memtable_count(options.flush_verify_memtable_count),
       track_and_verify_wals_in_manifest(
           options.track_and_verify_wals_in_manifest),
       env(options.env),
@@ -598,6 +603,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                    create_if_missing);
   ROCKS_LOG_HEADER(log, "                        Options.paranoid_checks: %d",
                    paranoid_checks);
+  ROCKS_LOG_HEADER(log, "            Options.flush_verify_memtable_count: %d",
+                   flush_verify_memtable_count);
   ROCKS_LOG_HEADER(log,
                    "                              "
                    "Options.track_and_verify_wals_in_manifest: %d",
diff --git a/options/db_options.h b/options/db_options.h
index 85968ffcf..edbdbe6a2 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -24,6 +24,7 @@ struct ImmutableDBOptions {
   bool create_missing_column_families;
   bool error_if_exists;
   bool paranoid_checks;
+  bool flush_verify_memtable_count;
   bool track_and_verify_wals_in_manifest;
   Env* env;
   std::shared_ptr<RateLimiter> rate_limiter;
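From the public API, the new option is just a `DBOptions` field. A minimal usage sketch follows (the database path is a placeholder): with the default `true`, a flush whose entry count disagrees with the memtable counters fails with `Status::Corruption`; with `false`, the mismatch is demoted to an info-log warning.

```cpp
#include <cassert>

#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.flush_verify_memtable_count = true;  // the default

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/flush_verify_demo", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());

  // Trigger a manual flush; a count mismatch would surface here.
  s = db->Flush(rocksdb::FlushOptions());
  if (s.IsCorruption()) {
    // The status message reports expected vs. actually-read entry counts.
  }

  delete db;
  return 0;
}
```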
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 68bd6a4c9..823ef4a45 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -65,6 +65,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
       immutable_db_options.create_missing_column_families;
   options.error_if_exists = immutable_db_options.error_if_exists;
   options.paranoid_checks = immutable_db_options.paranoid_checks;
+  options.flush_verify_memtable_count =
+      immutable_db_options.flush_verify_memtable_count;
   options.track_and_verify_wals_in_manifest =
       immutable_db_options.track_and_verify_wals_in_manifest;
   options.env = immutable_db_options.env;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index a9f5d7bcd..3cfa2ecff 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -287,6 +287,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
                        "skip_log_error_on_recovery=true;"
                        "writable_file_max_buffer_size=1048576;"
                        "paranoid_checks=true;"
+                       "flush_verify_memtable_count=true;"
                        "track_and_verify_wals_in_manifest=true;"
                        "is_fd_close_on_exec=false;"
                        "bytes_per_sync=4295013613;"
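Because the option is registered in the `OptionTypeInfo` map above, it is also reachable through the string-based options API, which is what the settable test exercises. A small sketch (error handling elided):

```cpp
#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::DBOptions base;
  rocksdb::DBOptions loaded;
  // Parse the option from a string, as options files and tools do.
  rocksdb::Status s = rocksdb::GetDBOptionsFromString(
      base, "flush_verify_memtable_count=false", &loaded);
  assert(s.ok());
  assert(loaded.flush_verify_memtable_count == false);
  return 0;
}
```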