Recover same sequence id from WAL (#1350)

Summary:
Revert the behavior where we don't read sequence id from WAL, but increase it as we replay the log. We still keep the behave for 2PC for now but will fix later.

This change fixes github issue 1339, where some writes come with WAL disabled and we may recover records with wrong sequence id.

Test Plan: Added unit test.

Subscribers: andrewkr, dhruba

Differential Revision: https://reviews.facebook.net/D64275
This commit is contained in:
yiwu-arbug 2016-09-23 16:15:14 -07:00 committed by Islam AbdelRahman
parent 165cec6ef6
commit a34c1e3373
4 changed files with 56 additions and 15 deletions

View File

@ -393,9 +393,10 @@ PARALLEL_TEST = \
db_compaction_filter_test \
db_compaction_test \
db_sst_test \
external_sst_file_test \
db_test \
db_universal_compaction_test \
db_wal_test \
external_sst_file_test \
fault_injection_test \
inlineskiplist_test \
manual_compaction_test \
@ -599,7 +600,7 @@ gen_parallel_tests:
# 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest
#
slow_test_regexp = \
^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$
^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$
prioritize_long_running_tests = \
perl -pe 's,($(slow_test_regexp)),100 $$1,' \
| sort -k1,1gr \

View File

@ -1393,7 +1393,6 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
bool stop_replay_by_wal_filter = false;
bool stop_replay_for_corruption = false;
bool flushed = false;
SequenceNumber recovered_sequence = 0;
for (auto log_number : log_numbers) {
// The previous incarnation may not have written any MANIFEST
// records after allocating this log number. So we manually
@ -1472,13 +1471,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
WriteBatchInternal::SetContents(&batch, record);
SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
// In point-in-time recovery mode, if sequence id of log files are
// consecutive, we continue recovery despite corruption. This could happen
// when we open and write to a corrupted DB, where sequence id will start
// from the last sequence id we recovered.
if (db_options_.wal_recovery_mode ==
WALRecoveryMode::kPointInTimeRecovery) {
if (sequence == recovered_sequence + 1) {
// In point-in-time recovery mode, if sequence id of log files are
// consecutive, we continue recovery despite corruption. This could
// happen when we open and write to a corrupted DB, where sequence id
// will start from the last sequence id we recovered.
if (sequence == *next_sequence) {
stop_replay_for_corruption = false;
}
if (stop_replay_for_corruption) {
@ -1487,13 +1486,16 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
}
}
recovered_sequence = sequence;
bool no_prev_seq = true;
if (*next_sequence == kMaxSequenceNumber) {
if (!db_options_.allow_2pc) {
*next_sequence = sequence;
} else {
no_prev_seq = false;
WriteBatchInternal::SetSequence(&batch, *next_sequence);
if (*next_sequence == kMaxSequenceNumber) {
*next_sequence = sequence;
} else {
no_prev_seq = false;
WriteBatchInternal::SetSequence(&batch, *next_sequence);
}
}
#ifndef ROCKSDB_LITE
@ -1590,8 +1592,10 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
// after replaying the file, this file may be a stale file. We ignore
// sequence IDs from the file. Otherwise, if a newer stale log file that
// has been deleted, the sequenceID may be wrong.
if (no_prev_seq && !has_valid_writes) {
*next_sequence = kMaxSequenceNumber;
if (db_options_.allow_2pc) {
if (no_prev_seq && !has_valid_writes) {
*next_sequence = kMaxSequenceNumber;
}
}
MaybeIgnoreError(&status);
if (!status.ok()) {

View File

@ -620,7 +620,7 @@ class DBImpl : public DB {
// REQUIRES: log_numbers are sorted in ascending order
Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* max_sequence, bool read_only);
SequenceNumber* next_sequence, bool read_only);
// The following two methods are used to flush a memtable to
// storage. The first one is used at database RecoveryTime (when the

View File

@ -9,6 +9,7 @@
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/fault_injection_test_env.h"
#include "util/options_helper.h"
#include "util/sync_point.h"
@ -509,6 +510,41 @@ TEST_F(DBWALTest, SyncMultipleLogs) {
ASSERT_OK(dbfull()->SyncWAL());
}
// Github issue 1339. Prior the fix we read sequence id from the first log to
// a local variable, then keep increase the variable as we replay logs,
// ignoring actual sequence id of the records. This is incorrect if some writes
// come with WAL disabled.
TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
std::unique_ptr<FaultInjectionTestEnv> fault_env(
new FaultInjectionTestEnv(env_));
Options options = CurrentOptions();
options.env = fault_env.get();
options.disable_auto_compactions = true;
// TODO(yiwu): fix for 2PC.
options.allow_2pc = false;
WriteOptions wal_on, wal_off;
wal_on.sync = true;
wal_on.disableWAL = false;
wal_off.disableWAL = true;
CreateAndReopenWithCF({"dummy"}, options);
ASSERT_OK(Put(1, "dummy", "d1", wal_on)); // seq id 1
ASSERT_OK(Put(1, "dummy", "d2", wal_off));
ASSERT_OK(Put(1, "dummy", "d3", wal_off));
ASSERT_OK(Put(0, "key", "v4", wal_on)); // seq id 4
ASSERT_OK(Flush(0));
ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5
ASSERT_EQ("v5", Get(0, "key"));
// Simulate a crash.
fault_env->SetFilesystemActive(false);
Close();
fault_env->ResetState();
ReopenWithColumnFamilies({"default", "dummy"}, options);
// Prior to the fix, we may incorrectly recover "v5" with sequence id = 3.
ASSERT_EQ("v5", Get(0, "key"));
// Destroy DB before destruct fault_env.
Destroy(options);
}
//
// Test WAL recovery for the various modes available
//