diff --git a/HISTORY.md b/HISTORY.md index 8f60d2954..b3d41a7ee 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,9 @@ ### Behavior Changes * Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation. +### Bug Fixes +* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`. + ### Bug Fixes * Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. Without this fix, original code introduced in PR7659, when running on big-endian machine, can mistakenly store read_amp_bytes_per_bit (an uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected. diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 8bf19bf3b..f5f97dc36 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -1348,13 +1348,19 @@ TEST_P(DBWALTestWithParams, kPointInTimeRecovery) { size_t recovered_row_count = RecoveryTestHelper::GetData(this); ASSERT_LT(recovered_row_count, row_count); - bool expect_data = true; - for (size_t k = 0; k < maxkeys; ++k) { - bool found = Get("key" + ToString(corrupt_offset)) != "NOT_FOUND"; - if (expect_data && !found) { - expect_data = false; + // Verify a prefix of keys were recovered. But not in the case of full WAL + // truncation, because we have no way to know there was a corruption when + // truncation happened on record boundaries (preventing recovery holes in + // that case requires using `track_and_verify_wals_in_manifest`). + if (!trunc || corrupt_offset != 0) { + bool expect_data = true; + for (size_t k = 0; k < maxkeys; ++k) { + bool found = Get("key" + ToString(k)) != "NOT_FOUND"; + if (expect_data && !found) { + expect_data = false; + } + ASSERT_EQ(found, expect_data); } - ASSERT_EQ(found, expect_data); } const size_t min = RecoveryTestHelper::kKeysPerWALFile * diff --git a/db/log_reader.cc b/db/log_reader.cc index 6e71eaa2f..84082e8ea 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -119,16 +119,26 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, break; case kBadHeader: - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. ReportCorruption(drop_size, "truncated header"); } FALLTHROUGH_INTENDED; case kEof: if (in_fragmented_record) { - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. ReportCorruption(scratch->size(), "error reading trailing data"); } // This can be caused by the writer dying immediately after @@ -142,8 +152,13 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) { // Treat a record from a previous instance of the log as EOF. if (in_fragmented_record) { - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, + // which higher layers can choose to ignore when it's provable + // there is no hole. ReportCorruption(scratch->size(), "error reading trailing data"); } // This can be caused by the writer dying immediately after @@ -164,6 +179,20 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, break; case kBadRecordLen: + if (eof_) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. + ReportCorruption(drop_size, "truncated record body"); + } + return false; + } + FALLTHROUGH_INTENDED; + case kBadRecordChecksum: if (recycled_ && wal_recovery_mode == @@ -355,18 +384,14 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { } } if (header_size + length > buffer_.size()) { + assert(buffer_.size() >= static_cast(header_size)); *drop_size = buffer_.size(); buffer_.clear(); - if (!eof_) { - return kBadRecordLen; - } - // If the end of the file has been reached without reading |length| - // bytes of payload, assume the writer died in the middle of writing the - // record. Don't report a corruption unless requested. - if (*drop_size) { - return kBadHeader; - } - return kEof; + // If the end of the read has been reached without seeing + // `header_size + length` bytes of payload, report a corruption. The + // higher layers can decide how to handle it based on the recovery mode, + // whether this occurred at EOF, whether this is the final WAL, etc. + return kBadRecordLen; } if (type == kZeroType && length == 0) { diff --git a/db/log_test.cc b/db/log_test.cc index 849b89d8a..269761968 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -465,7 +465,7 @@ TEST_P(LogTest, BadLengthAtEndIsNotIgnored) { ShrinkSize(1); ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); - ASSERT_EQ("OK", MatchError("Corruption: truncated header")); + ASSERT_EQ("OK", MatchError("Corruption: truncated record body")); } TEST_P(LogTest, ChecksumMismatch) { @@ -573,9 +573,7 @@ TEST_P(LogTest, PartialLastIsNotIgnored) { ShrinkSize(1); ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); - ASSERT_EQ("OK", MatchError( - "Corruption: truncated headerCorruption: " - "error reading trailing data")); + ASSERT_EQ("OK", MatchError("Corruption: truncated record body")); } TEST_P(LogTest, ErrorJoinsRecords) {