Fix 2PC Recovery SeqId Miscount

Summary:
Originally sequence ids were calculated, in recovery, based off of the first seqid found if the first log recovered. The working seqid was then incremented from that value based on every insertion that took place. This was faulty because of the potential for missing log files or inserts that skipped the WAL. The current recovery scheme grabs sequence from current recovering batch and increments using memtableinserter to track how many actual inserts take place. This works for 2PC batches as well scenarios where some logs are missing or inserts that skip the WAL.
Closes https://github.com/facebook/rocksdb/pull/1486

Differential Revision: D4156064

Pulled By: reidHoruff

fbshipit-source-id: a6da8d9
This commit is contained in:
Reid Horuff 2016-11-10 10:59:06 -08:00 committed by Facebook Github Bot
parent e095d0cbc7
commit 1ca5f6d132
3 changed files with 59 additions and 23 deletions

View File

@ -1516,18 +1516,6 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
}
}
bool no_prev_seq = true;
if (!immutable_db_options_.allow_2pc) {
*next_sequence = sequence;
} else {
if (*next_sequence == kMaxSequenceNumber) {
*next_sequence = sequence;
} else {
no_prev_seq = false;
WriteBatchInternal::SetSequence(&batch, *next_sequence);
}
}
#ifndef ROCKSDB_LITE
if (immutable_db_options_.wal_filter != nullptr) {
WriteBatch new_batch;
@ -1614,15 +1602,6 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
&batch, column_family_memtables_.get(), &flush_scheduler_, true,
log_number, this, false /* concurrent_memtable_writes */,
next_sequence, &has_valid_writes);
// If it is the first log file and there is no column family updated
// after replaying the file, this file may be a stale file. We ignore
// sequence IDs from the file. Otherwise, if a newer stale log file that
// has been deleted, the sequenceID may be wrong.
if (immutable_db_options_.allow_2pc) {
if (no_prev_seq && !has_valid_writes) {
*next_sequence = kMaxSequenceNumber;
}
}
MaybeIgnoreError(&status);
if (!status.ok()) {
// We are treating this as a failure while reading since we read valid

View File

@ -645,8 +645,6 @@ TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
Options options = CurrentOptions();
options.env = fault_env.get();
options.disable_auto_compactions = true;
// TODO(yiwu): fix for 2PC.
options.allow_2pc = false;
WriteOptions wal_on, wal_off;
wal_on.sync = true;
wal_on.disableWAL = false;

View File

@ -1178,6 +1178,65 @@ TEST_P(TransactionTest, TwoPhaseLogRollingTest) {
delete cfb;
}
/*
* 1) use prepare to keep first log around to determine starting sequence
* during recovery.
* 2) insert many values, skipping wal, to increase seqid.
* 3) insert final value into wal
* 4) recover and see that final value was properly recovered - not
* hidden behind improperly summed sequence ids
*/
TEST_P(TransactionTest, TwoPhaseOutOfOrderDelete) {
DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
WriteOptions wal_on, wal_off;
wal_on.sync = true;
wal_on.disableWAL = false;
wal_off.disableWAL = true;
ReadOptions read_options;
TransactionOptions txn_options;
std::string value;
Status s;
Transaction* txn1 = db->BeginTransaction(wal_on, txn_options);
s = txn1->SetName("1");
ASSERT_OK(s);
s = db->Put(wal_on, "first", "first");
ASSERT_OK(s);
s = txn1->Put(Slice("dummy"), Slice("dummy"));
ASSERT_OK(s);
s = txn1->Prepare();
ASSERT_OK(s);
s = db->Put(wal_off, "cats", "dogs1");
ASSERT_OK(s);
s = db->Put(wal_off, "cats", "dogs2");
ASSERT_OK(s);
s = db->Put(wal_off, "cats", "dogs3");
ASSERT_OK(s);
s = db_impl->TEST_FlushMemTable(true);
ASSERT_OK(s);
s = db->Put(wal_on, "cats", "dogs4");
ASSERT_OK(s);
// kill and reopen
env->SetFilesystemActive(false);
ReOpenNoDelete();
s = db->Get(read_options, "first", &value);
ASSERT_OK(s);
ASSERT_EQ(value, "first");
s = db->Get(read_options, "cats", &value);
ASSERT_OK(s);
ASSERT_EQ(value, "dogs4");
}
TEST_P(TransactionTest, FirstWriteTest) {
WriteOptions write_options;