WritePrepared Txn: cross-compatibility test

Summary:
Add tests to ensure that WritePrepared and WriteCommitted policies are cross compatible when the db WAL is empty. This is important when the admin want to switch between the policies. In such case, before the switch the admin needs to empty the WAL by i) committing/rollbacking all the pending transactions, ii) FlushMemTables
Closes https://github.com/facebook/rocksdb/pull/3118

Differential Revision: D6227247

Pulled By: maysamyabandeh

fbshipit-source-id: bcde3d92c1e89cda3b9cfa69f6a20af5d8993db7
This commit is contained in:
Maysam Yabandeh 2017-11-11 11:23:43 -08:00 committed by Facebook Github Bot
parent 857adf388f
commit 2edc92bc28
10 changed files with 211 additions and 7 deletions

View File

@ -728,6 +728,12 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
}
if (!status.ok()) {
if (status.IsNotSupported()) {
// We should not treat NotSupported as corruption. It is rather a clear
// sign that we are processing a WAL that is produced by an incompatible
// version of the code.
return status;
}
if (immutable_db_options_.wal_recovery_mode ==
WALRecoveryMode::kSkipAnyCorruptedRecords) {
// We should ignore all errors unconditionally

View File

@ -49,6 +49,11 @@ enum ValueType : unsigned char {
kTypeRangeDeletion = 0xF, // meta block
kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only
kTypeBlobIndex = 0x11, // Blob DB only
// When the prepared record is also persisted in db, we use a different
// record. This is to ensure that the WAL that is generated by a WritePolicy
// is not mistakenly read by another, which would result into data
// inconsistency.
kTypeBeginPersistedPrepareXID = 0x12, // WAL only.
kMaxValue = 0x7F // Not used for storing records.
};

View File

@ -24,6 +24,7 @@
// kTypeEndPrepareXID
// kTypeCommitXID varstring
// kTypeRollbackXID varstring
// kTypeBeginPersistedPrepareXID varstring
// kTypeNoop
// varstring :=
// len: varint32
@ -353,6 +354,9 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag,
break;
case kTypeNoop:
case kTypeBeginPrepareXID:
// This indicates that the prepared batch is also persisted in the db.
// This is used in WritePreparedTxn
case kTypeBeginPersistedPrepareXID:
break;
case kTypeEndPrepareXID:
if (!GetLengthPrefixedSlice(input, xid)) {
@ -457,6 +461,24 @@ Status WriteBatch::Iterate(Handler* handler) const {
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
handler->MarkBeginPrepare();
empty_batch = false;
if (!handler->WriteAfterCommit()) {
s = Status::NotSupported(
"WriteCommitted txn tag when write_after_commit_ is disabled (in "
"WritePrepared mode). If it is not due to corruption, the WAL "
"must be emptied before changing the WritePolicy.");
}
break;
case kTypeBeginPersistedPrepareXID:
assert(content_flags_.load(std::memory_order_relaxed) &
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
handler->MarkBeginPrepare();
empty_batch = false;
if (handler->WriteAfterCommit()) {
s = Status::NotSupported(
"WritePrepared txn tag when write_after_commit_ is enabled (in "
"default WriteCommitted mode). If it is not due to corruption, "
"the WAL must be emptied before changing the WritePolicy.");
}
break;
case kTypeEndPrepareXID:
assert(content_flags_.load(std::memory_order_relaxed) &
@ -575,7 +597,8 @@ Status WriteBatchInternal::InsertNoop(WriteBatch* b) {
return Status::OK();
}
Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid) {
Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid,
bool write_after_commit) {
// a manually constructed batch can only contain one prepare section
assert(b->rep_[12] == static_cast<char>(kTypeNoop));
@ -587,7 +610,9 @@ Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid) {
}
// rewrite noop as begin marker
b->rep_[12] = static_cast<char>(kTypeBeginPrepareXID);
b->rep_[12] =
static_cast<char>(write_after_commit ? kTypeBeginPrepareXID
: kTypeBeginPersistedPrepareXID);
b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
PutLengthPrefixedSlice(&b->rep_, xid);
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
@ -920,6 +945,9 @@ class MemTableInserter : public WriteBatch::Handler {
return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
}
protected:
virtual bool WriteAfterCommit() const override { return write_after_commit_; }
public:
// cf_mems should not be shared with concurrent inserters
MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
@ -957,6 +985,8 @@ class MemTableInserter : public WriteBatch::Handler {
MemTableInserter(const MemTableInserter&) = delete;
MemTableInserter& operator=(const MemTableInserter&) = delete;
virtual bool WriterAfterCommit() const { return write_after_commit_; }
void MaybeAdvanceSeq(bool batch_boundry = false) {
if (batch_boundry == seq_per_batch_) {
sequence_++;

View File

@ -102,7 +102,8 @@ class WriteBatchInternal {
static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
const Slice& key, const Slice& value);
static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid);
static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
const bool write_after_commit = true);
static Status MarkRollback(WriteBatch* batch, const Slice& xid);

View File

@ -267,6 +267,10 @@ class WriteBatch : public WriteBatchBase {
// iteration is halted. Otherwise, it continues iterating. The default
// implementation always returns true.
virtual bool Continue();
protected:
friend class WriteBatch;
virtual bool WriteAfterCommit() const { return true; }
};
Status Iterate(Handler* handler) const;

View File

@ -236,6 +236,125 @@ class TransactionTest : public ::testing::TestWithParam<
}
delete txn;
};
// Test that we can change write policy after a clean shutdown (which would
// empty the WAL)
void CrossCompatibilityTest(TxnDBWritePolicy from_policy,
TxnDBWritePolicy to_policy, bool empty_wal) {
TransactionOptions txn_options;
ReadOptions read_options;
WriteOptions write_options;
uint32_t index = 0;
Random rnd(1103);
options.write_buffer_size = 1024; // To create more sst files
std::unordered_map<std::string, std::string> committed_kvs;
Transaction* txn;
txn_db_options.write_policy = from_policy;
ReOpen();
for (int i = 0; i < 1024; i++) {
auto istr = std::to_string(index);
auto k = Slice("foo-" + istr).ToString();
auto v = Slice("bar-" + istr).ToString();
// For test the duplicate keys
auto v2 = Slice("bar2-" + istr).ToString();
auto type = rnd.Uniform(4);
Status s;
switch (type) {
case 0:
committed_kvs[k] = v;
s = db->Put(write_options, k, v);
ASSERT_OK(s);
committed_kvs[k] = v2;
s = db->Put(write_options, k, v2);
ASSERT_OK(s);
break;
case 1: {
WriteBatch wb;
committed_kvs[k] = v;
wb.Put(k, v);
// TODO(myabandeh): remove this when we supprot duplicate keys in
// db->Write method
if (false) {
committed_kvs[k] = v2;
wb.Put(k, v2);
}
s = db->Write(write_options, &wb);
ASSERT_OK(s);
} break;
case 2:
case 3:
txn = db->BeginTransaction(write_options, txn_options);
s = txn->SetName("xid" + istr);
ASSERT_OK(s);
committed_kvs[k] = v;
s = txn->Put(k, v);
ASSERT_OK(s);
// TODO(myabandeh): remove this when we supprot duplicate keys in
// db->Write method
if (false) {
committed_kvs[k] = v2;
s = txn->Put(k, v2);
}
ASSERT_OK(s);
if (type == 3) {
s = txn->Prepare();
ASSERT_OK(s);
}
s = txn->Commit();
ASSERT_OK(s);
if (type == 2) {
auto pdb = reinterpret_cast<PessimisticTransactionDB*>(db);
// TODO(myabandeh): this is counter-intuitive. The destructor should
// also do the unregistering.
pdb->UnregisterTransaction(txn);
}
delete txn;
break;
default:
assert(0);
}
index++;
} // for i
txn_db_options.write_policy = to_policy;
auto db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
// Before upgrade/downgrade the WAL must be emptied
if (empty_wal) {
db_impl->TEST_FlushMemTable();
} else {
db_impl->FlushWAL(true);
}
auto s = ReOpenNoDelete();
if (empty_wal) {
ASSERT_OK(s);
} else {
// Test that we can detect the WAL that is produced by an incompatbile
// WritePolicy and fail fast before mis-interpreting the WAL.
ASSERT_TRUE(s.IsNotSupported());
return;
}
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
// Check that WAL is empty
VectorLogPtr log_files;
db_impl->GetSortedWalFiles(log_files);
ASSERT_EQ(0, log_files.size());
for (auto& kv : committed_kvs) {
std::string value;
s = db->Get(read_options, kv.first, &value);
if (s.IsNotFound()) {
printf("key = %s\n", kv.first.c_str());
}
ASSERT_OK(s);
if (kv.second != value) {
printf("key = %s\n", kv.first.c_str());
}
ASSERT_EQ(kv.second, value);
}
}
};
class MySQLStyleTransactionTest : public TransactionTest {};

View File

@ -1629,6 +1629,34 @@ TEST_P(WritePreparedTransactionTest, Iterate) {
delete transaction;
}
// Test that we can change write policy from WriteCommitted to WritePrepared
// after a clean shutdown (which would empty the WAL)
TEST_P(WritePreparedTransactionTest, WP_WC_DBBackwardCompatibility) {
bool empty_wal = true;
CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, empty_wal);
}
// Test that we fail fast if WAL is not emptied between changing the write
// policy from WriteCommitted to WritePrepared
TEST_P(WritePreparedTransactionTest, WP_WC_WALBackwardIncompatibility) {
bool empty_wal = true;
CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, !empty_wal);
}
// Test that we can change write policy from WritePrepare back to WriteCommitted
// after a clean shutdown (which would empty the WAL)
TEST_P(WritePreparedTransactionTest, WC_WP_ForwardCompatibility) {
bool empty_wal = true;
CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, empty_wal);
}
// Test that we fail fast if WAL is not emptied between changing the write
// policy from WriteCommitted to WritePrepared
TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) {
bool empty_wal = true;
CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, !empty_wal);
}
} // namespace rocksdb
int main(int argc, char** argv) {

View File

@ -61,7 +61,9 @@ Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options,
Status WritePreparedTxn::PrepareInternal() {
WriteOptions write_options = write_options_;
write_options.disableWAL = false;
WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), name_);
const bool write_after_commit = true;
WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), name_,
!write_after_commit);
const bool disable_memtable = true;
uint64_t seq_used = kMaxSequenceNumber;
bool collapsed = GetWriteBatch()->Collapse();
@ -216,6 +218,9 @@ Status WritePreparedTxn::RollbackInternal() {
Status MarkRollback(const Slice&) override {
return Status::InvalidArgument();
}
protected:
virtual bool WriteAfterCommit() const override { return false; }
} rollback_handler(db_impl_, wpt_db_, last_visible_txn, &rollback_batch);
auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
assert(s.ok());

View File

@ -20,6 +20,7 @@
#include "rocksdb/iterator.h"
#include "util/arena.h"
#include "util/cast_util.h"
#include "util/string_util.h"
#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
namespace rocksdb {
@ -552,13 +553,15 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
break;
case kTypeLogData:
case kTypeBeginPrepareXID:
case kTypeBeginPersistedPrepareXID:
case kTypeEndPrepareXID:
case kTypeCommitXID:
case kTypeRollbackXID:
case kTypeNoop:
break;
default:
return Status::Corruption("unknown WriteBatch tag");
return Status::Corruption("unknown WriteBatch tag in ReBuildIndex",
ToString(static_cast<unsigned int>(tag)));
}
}
@ -622,6 +625,7 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
break;
case kTypeLogData:
case kTypeBeginPrepareXID:
case kTypeBeginPersistedPrepareXID:
case kTypeEndPrepareXID:
case kTypeCommitXID:
case kTypeRollbackXID:

View File

@ -71,13 +71,15 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
break;
case kTypeNoop:
case kTypeBeginPrepareXID:
case kTypeBeginPersistedPrepareXID:
case kTypeEndPrepareXID:
case kTypeCommitXID:
case kTypeRollbackXID:
*type = kXIDRecord;
break;
default:
return Status::Corruption("unknown WriteBatch tag");
return Status::Corruption("unknown WriteBatch tag ",
ToString(static_cast<unsigned int>(tag)));
}
return Status::OK();
}