Modification of WriteBatch to support two phase commit

Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which it applies. There can obviously be multiple Prepare(xid) markers. There should be only one Rollback(xid) or Commit(xid) marker, but not both. None of this logic is currently enforced; it will most likely be implemented further up, such as in the memtableinserter. All three markers are similar to PutLogData in that they are WriteBatch metadata, i.e. stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit and Rollback are all implemented just like PutLogData, and none are tested, just like PutLogData.

Test Plan: single unit test in write_batch_test.

Reviewers: hermanlee4, sdong, anthony

Subscribers: andrewkr, vasilep, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D54093
This commit is contained in:
Reid Horuff 2016-04-07 23:35:51 -07:00
parent 1d2e4ef747
commit 6e56a114be
9 changed files with 241 additions and 22 deletions

View File

@ -39,6 +39,11 @@ enum ValueType : unsigned char {
kTypeColumnFamilyMerge = 0x6, // WAL only. kTypeColumnFamilyMerge = 0x6, // WAL only.
kTypeSingleDeletion = 0x7, kTypeSingleDeletion = 0x7,
kTypeColumnFamilySingleDeletion = 0x8, // WAL only. kTypeColumnFamilySingleDeletion = 0x8, // WAL only.
kTypeBeginPrepareXID = 0x9, // WAL only.
kTypeEndPrepareXID = 0xA, // WAL only.
kTypeCommitXID = 0xB, // WAL only.
kTypeRollbackXID = 0xC, // WAL only.
kTypeNoop = 0xD, // WAL only.
kMaxValue = 0x7F // Not used for storing records. kMaxValue = 0x7F // Not used for storing records.
}; };
@ -478,5 +483,5 @@ extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
// input will be advanced to after the record. // input will be advanced to after the record.
extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
uint32_t* column_family, Slice* key, uint32_t* column_family, Slice* key,
Slice* value, Slice* blob); Slice* value, Slice* blob, Slice* xid);
} // namespace rocksdb } // namespace rocksdb

View File

@ -20,6 +20,11 @@
// kTypeColumnFamilyDeletion varint32 varstring varstring // kTypeColumnFamilyDeletion varint32 varstring varstring
// kTypeColumnFamilySingleDeletion varint32 varstring varstring // kTypeColumnFamilySingleDeletion varint32 varstring varstring
// kTypeColumnFamilyMerge varint32 varstring varstring // kTypeColumnFamilyMerge varint32 varstring varstring
// kTypeBeginPrepareXID
// kTypeEndPrepareXID varstring
// kTypeCommitXID varstring
// kTypeRollbackXID varstring
// kTypeNoop
// varstring := // varstring :=
// len: varint32 // len: varint32
// data: uint8[len] // data: uint8[len]
@ -48,11 +53,15 @@ namespace rocksdb {
namespace { namespace {
enum ContentFlags : uint32_t { enum ContentFlags : uint32_t {
DEFERRED = 1, DEFERRED = 1 << 0,
HAS_PUT = 2, HAS_PUT = 1 << 1,
HAS_DELETE = 4, HAS_DELETE = 1 << 2,
HAS_SINGLE_DELETE = 8, HAS_SINGLE_DELETE = 1 << 3,
HAS_MERGE = 16, HAS_MERGE = 1 << 4,
HAS_BEGIN_PREPARE = 1 << 5,
HAS_END_PREPARE = 1 << 6,
HAS_COMMIT = 1 << 7,
HAS_ROLLBACK = 1 << 8,
}; };
struct BatchContentClassifier : public WriteBatch::Handler { struct BatchContentClassifier : public WriteBatch::Handler {
@ -77,6 +86,26 @@ struct BatchContentClassifier : public WriteBatch::Handler {
content_flags |= ContentFlags::HAS_MERGE; content_flags |= ContentFlags::HAS_MERGE;
return Status::OK(); return Status::OK();
} }
// Handler callback for a begin-prepare marker: records in content_flags that
// the batch contains one (HAS_BEGIN_PREPARE) and reports success.
Status MarkBeginPrepare() override {
content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
return Status::OK();
}
// Handler callback for an end-prepare marker. The xid argument is ignored;
// only the HAS_END_PREPARE bit is recorded in content_flags.
Status MarkEndPrepare(const Slice&) override {
content_flags |= ContentFlags::HAS_END_PREPARE;
return Status::OK();
}
// Handler callback for a commit marker. The xid argument is ignored; only the
// HAS_COMMIT bit is recorded in content_flags.
Status MarkCommit(const Slice&) override {
content_flags |= ContentFlags::HAS_COMMIT;
return Status::OK();
}
// Handler callback for a rollback marker. The xid argument is ignored; only
// the HAS_ROLLBACK bit is recorded in content_flags.
Status MarkRollback(const Slice&) override {
content_flags |= ContentFlags::HAS_ROLLBACK;
return Status::OK();
}
}; };
} // anon namespace } // anon namespace
@ -97,6 +126,7 @@ WriteBatch::WriteBatch(size_t reserved_bytes)
rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ? rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ?
reserved_bytes : WriteBatchInternal::kHeader); reserved_bytes : WriteBatchInternal::kHeader);
rep_.resize(WriteBatchInternal::kHeader); rep_.resize(WriteBatchInternal::kHeader);
rep_.push_back(static_cast<char>(kTypeNoop));
} }
WriteBatch::WriteBatch(const std::string& rep) WriteBatch::WriteBatch(const std::string& rep)
@ -146,6 +176,7 @@ bool WriteBatch::Handler::Continue() {
void WriteBatch::Clear() { void WriteBatch::Clear() {
rep_.clear(); rep_.clear();
rep_.resize(WriteBatchInternal::kHeader); rep_.resize(WriteBatchInternal::kHeader);
rep_.push_back(static_cast<char>(kTypeNoop));
content_flags_.store(0, std::memory_order_relaxed); content_flags_.store(0, std::memory_order_relaxed);
@ -209,9 +240,25 @@ bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) {
return GetLengthPrefixedSlice(input, key); return GetLengthPrefixedSlice(input, key);
} }
// Reports whether the batch's computed content flags include
// HAS_BEGIN_PREPARE.
bool WriteBatch::HasBeginPrepare() const {
  const auto flags = ComputeContentFlags();
  return (flags & ContentFlags::HAS_BEGIN_PREPARE) != 0;
}
// Reports whether the batch's computed content flags include
// HAS_END_PREPARE.
bool WriteBatch::HasEndPrepare() const {
  const auto flags = ComputeContentFlags();
  return (flags & ContentFlags::HAS_END_PREPARE) != 0;
}
// Reports whether the batch's computed content flags include HAS_COMMIT.
bool WriteBatch::HasCommit() const {
  const auto flags = ComputeContentFlags();
  return (flags & ContentFlags::HAS_COMMIT) != 0;
}
// Reports whether the batch's computed content flags include HAS_ROLLBACK.
bool WriteBatch::HasRollback() const {
  const auto flags = ComputeContentFlags();
  return (flags & ContentFlags::HAS_ROLLBACK) != 0;
}
Status ReadRecordFromWriteBatch(Slice* input, char* tag, Status ReadRecordFromWriteBatch(Slice* input, char* tag,
uint32_t* column_family, Slice* key, uint32_t* column_family, Slice* key,
Slice* value, Slice* blob) { Slice* value, Slice* blob, Slice* xid) {
assert(key != nullptr && value != nullptr); assert(key != nullptr && value != nullptr);
*tag = (*input)[0]; *tag = (*input)[0];
input->remove_prefix(1); input->remove_prefix(1);
@ -257,6 +304,24 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag,
return Status::Corruption("bad WriteBatch Blob"); return Status::Corruption("bad WriteBatch Blob");
} }
break; break;
case kTypeNoop:
case kTypeBeginPrepareXID:
break;
case kTypeEndPrepareXID:
if (!GetLengthPrefixedSlice(input, xid)) {
return Status::Corruption("bad EndPrepare XID");
}
break;
case kTypeCommitXID:
if (!GetLengthPrefixedSlice(input, xid)) {
return Status::Corruption("bad Commit XID");
}
break;
case kTypeRollbackXID:
if (!GetLengthPrefixedSlice(input, xid)) {
return Status::Corruption("bad Rollback XID");
}
break;
default: default:
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag");
} }
@ -270,7 +335,7 @@ Status WriteBatch::Iterate(Handler* handler) const {
} }
input.remove_prefix(WriteBatchInternal::kHeader); input.remove_prefix(WriteBatchInternal::kHeader);
Slice key, value, blob; Slice key, value, blob, xid;
int found = 0; int found = 0;
Status s; Status s;
while (s.ok() && !input.empty() && handler->Continue()) { while (s.ok() && !input.empty() && handler->Continue()) {
@ -278,7 +343,7 @@ Status WriteBatch::Iterate(Handler* handler) const {
uint32_t column_family = 0; // default uint32_t column_family = 0; // default
s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
&blob); &blob, &xid);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -315,6 +380,28 @@ Status WriteBatch::Iterate(Handler* handler) const {
case kTypeLogData: case kTypeLogData:
handler->LogData(blob); handler->LogData(blob);
break; break;
case kTypeBeginPrepareXID:
assert(content_flags_.load(std::memory_order_relaxed) &
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
handler->MarkBeginPrepare();
break;
case kTypeEndPrepareXID:
assert(content_flags_.load(std::memory_order_relaxed) &
(ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE));
handler->MarkEndPrepare(xid);
break;
case kTypeCommitXID:
assert(content_flags_.load(std::memory_order_relaxed) &
(ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
handler->MarkCommit(xid);
break;
case kTypeRollbackXID:
assert(content_flags_.load(std::memory_order_relaxed) &
(ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK));
handler->MarkRollback(xid);
break;
case kTypeNoop:
break;
default: default:
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag");
} }
@ -391,6 +478,43 @@ void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value); WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value);
} }
// Closes the prepare section of `b` and tags it with transaction id `xid`.
//
// The placeholder kTypeNoop record stored directly after the batch header is
// rewritten in place as kTypeBeginPrepareXID, and a kTypeEndPrepareXID record
// carrying the length-prefixed xid is appended. Everything written to the
// batch so far therefore ends up between the two markers. This function does
// not modify the batch's entry count.
//
// Any save points recorded before this call are discarded.
void WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid) {
  // The placeholder occupies the first byte after the fixed-size header: the
  // WriteBatch constructor and Clear() push it right after resizing rep_ to
  // kHeader. Use the constant rather than a literal offset so the two places
  // cannot drift apart.
  const size_t noop_offset = WriteBatchInternal::kHeader;

  // a manually constructed batch can only contain one prepare section
  assert(b->rep_[noop_offset] == static_cast<char>(kTypeNoop));

  // all savepoints up to this point are cleared
  if (b->save_points_ != nullptr) {
    while (!b->save_points_->stack.empty()) {
      b->save_points_->stack.pop();
    }
  }

  // rewrite noop as begin marker
  b->rep_[noop_offset] = static_cast<char>(kTypeBeginPrepareXID);
  b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
  PutLengthPrefixedSlice(&b->rep_, xid);
  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
                              ContentFlags::HAS_END_PREPARE |
                              ContentFlags::HAS_BEGIN_PREPARE,
                          std::memory_order_relaxed);
}
// Appends a commit marker for transaction `xid` to `b`: a kTypeCommitXID tag
// byte followed by the length-prefixed xid. Also sets HAS_COMMIT in the
// batch's content flags.
void WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) {
  b->rep_.push_back(static_cast<char>(kTypeCommitXID));
  PutLengthPrefixedSlice(&b->rep_, xid);

  const auto previous_flags =
      b->content_flags_.load(std::memory_order_relaxed);
  b->content_flags_.store(previous_flags | ContentFlags::HAS_COMMIT,
                          std::memory_order_relaxed);
}
// Appends a rollback marker for transaction `xid` to `b`: a kTypeRollbackXID
// tag byte followed by the length-prefixed xid. Also sets HAS_ROLLBACK in the
// batch's content flags.
void WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) {
  b->rep_.push_back(static_cast<char>(kTypeRollbackXID));
  PutLengthPrefixedSlice(&b->rep_, xid);

  const auto previous_flags =
      b->content_flags_.load(std::memory_order_relaxed);
  b->content_flags_.store(previous_flags | ContentFlags::HAS_ROLLBACK,
                          std::memory_order_relaxed);
}
void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, void WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
const Slice& key) { const Slice& key) {
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);

View File

@ -92,6 +92,14 @@ class WriteBatchInternal {
static void Merge(WriteBatch* batch, uint32_t column_family_id, static void Merge(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key, const SliceParts& value); const SliceParts& key, const SliceParts& value);
// Two-phase-commit markers. MarkEndPrepare, MarkCommit and MarkRollback each
// append a marker record carrying the given xid to the batch.
// NOTE(review): no definition of MarkBeginPrepare is visible in this change;
// MarkEndPrepare writes the begin marker itself by overwriting the leading
// Noop record. Confirm this declaration is defined or needed.
static void MarkBeginPrepare(WriteBatch* batch);
static void MarkEndPrepare(WriteBatch* batch, const Slice& xid);
static void MarkRollback(WriteBatch* batch, const Slice& xid);
static void MarkCommit(WriteBatch* batch, const Slice& xid);
// Return the number of entries in the batch. // Return the number of entries in the batch.
static int Count(const WriteBatch* batch); static int Count(const WriteBatch* batch);

View File

@ -231,6 +231,22 @@ namespace {
virtual void LogData(const Slice& blob) override { virtual void LogData(const Slice& blob) override {
seen += "LogData(" + blob.ToString() + ")"; seen += "LogData(" + blob.ToString() + ")";
} }
// Logs that a begin-prepare marker was visited.
virtual Status MarkBeginPrepare() override {
  const char* const event = "MarkBeginPrepare()";
  seen += event;
  return Status::OK();
}
// Logs the end-prepare marker together with the xid it carries.
virtual Status MarkEndPrepare(const Slice& xid) override {
  const auto id = xid.ToString();
  seen += "MarkEndPrepare(" + id + ")";
  return Status::OK();
}
// Logs the commit marker together with the xid it carries.
virtual Status MarkCommit(const Slice& xid) override {
  const auto id = xid.ToString();
  seen += "MarkCommit(" + id + ")";
  return Status::OK();
}
// Logs the rollback marker together with the xid it carries.
virtual Status MarkRollback(const Slice& xid) override {
  const auto id = xid.ToString();
  seen += "MarkRollback(" + id + ")";
  return Status::OK();
}
}; };
} }
@ -308,6 +324,30 @@ TEST_F(WriteBatchTest, Blob) {
handler.seen); handler.seen);
} }
// Exercises the two-phase-commit markers: two Puts are wrapped in a prepare
// section, then commit and rollback markers are appended, and the handler
// must see every record in write order.
TEST_F(WriteBatchTest, PrepareCommit) {
  WriteBatch batch;
  batch.Put(Slice("k1"), Slice("v1"));
  batch.Put(Slice("k2"), Slice("v2"));
  batch.SetSavePoint();
  // MarkEndPrepare discards every save point taken so far, so the rollback
  // below has nothing to return to.
  WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"));
  Status s = batch.RollbackToSavePoint();
  ASSERT_EQ(s, Status::NotFound());
  WriteBatchInternal::MarkCommit(&batch, Slice("xid1"));
  WriteBatchInternal::MarkRollback(&batch, Slice("xid1"));
  // Only the two Puts are counted; the markers are not.
  ASSERT_EQ(2, batch.Count());

  TestHandler handler;
  // Check the iteration status explicitly so that a decoding failure is
  // reported as such instead of surfacing only as a string mismatch below.
  ASSERT_TRUE(batch.Iterate(&handler).ok());
  ASSERT_EQ(
      "MarkBeginPrepare()"
      "Put(k1, v1)"
      "Put(k2, v2)"
      "MarkEndPrepare(xid1)"
      "MarkCommit(xid1)"
      "MarkRollback(xid1)",
      handler.seen);
}
// It requires more than 30GB of memory to run the test. With single memory // It requires more than 30GB of memory to run the test. With single memory
// allocation of more than 30GB. // allocation of more than 30GB.
// Not all platform can run it. Also it runs a long time. So disable it. // Not all platform can run it. Also it runs a long time. So disable it.

View File

@ -34,7 +34,8 @@ enum WriteType {
kMergeRecord, kMergeRecord,
kDeleteRecord, kDeleteRecord,
kSingleDeleteRecord, kSingleDeleteRecord,
kLogDataRecord kLogDataRecord,
kXIDRecord,
}; };
// an entry for Put, Merge, Delete, or SingleDelete entry for write batches. // an entry for Put, Merge, Delete, or SingleDelete entry for write batches.

View File

@ -186,6 +186,23 @@ class WriteBatch : public WriteBatchBase {
// The default implementation of LogData does nothing. // The default implementation of LogData does nothing.
virtual void LogData(const Slice& blob); virtual void LogData(const Slice& blob);
// Callback for a begin-prepare marker. The default implementation returns
// InvalidArgument; handlers that expect prepared batches override it.
virtual Status MarkBeginPrepare() {
  const char* const reason = "MarkBeginPrepare() handler not defined.";
  return Status::InvalidArgument(reason);
}
// Callback for an end-prepare marker; the argument is the xid stored with the
// marker. The default implementation returns InvalidArgument; handlers that
// expect prepared batches override it.
// The parameter name is commented out because the default body does not use
// it, which would otherwise trigger unused-parameter warnings in code that
// includes this header.
virtual Status MarkEndPrepare(const Slice& /*xid*/) {
  return Status::InvalidArgument("MarkEndPrepare() handler not defined.");
}
// Callback for a rollback marker; the argument is the xid stored with the
// marker. The default implementation returns InvalidArgument; handlers that
// expect rollback markers override it.
// The message names this method (it previously referred to a non-existent
// "MarkRollbackPrepare()"). The parameter name is commented out because the
// default body does not use it.
virtual Status MarkRollback(const Slice& /*xid*/) {
  return Status::InvalidArgument("MarkRollback() handler not defined.");
}
// Callback for a commit marker; the argument is the xid stored with the
// marker. The default implementation returns InvalidArgument; handlers that
// expect commit markers override it.
// The parameter name is commented out because the default body does not use
// it, which would otherwise trigger unused-parameter warnings in code that
// includes this header.
virtual Status MarkCommit(const Slice& /*xid*/) {
  return Status::InvalidArgument("MarkCommit() handler not defined.");
}
// Continue is called by WriteBatch::Iterate. If it returns false, // Continue is called by WriteBatch::Iterate. If it returns false,
// iteration is halted. Otherwise, it continues iterating. The default // iteration is halted. Otherwise, it continues iterating. The default
// implementation always returns true. // implementation always returns true.
@ -214,6 +231,18 @@ class WriteBatch : public WriteBatchBase {
// Returns trie if MergeCF will be called during Iterate // Returns trie if MergeCF will be called during Iterate
bool HasMerge() const; bool HasMerge() const;
// Returns true if MarkBeginPrepare will be called during Iterate
bool HasBeginPrepare() const;
// Returns true if MarkEndPrepare will be called during Iterate
bool HasEndPrepare() const;
// Returns true if MarkCommit will be called during Iterate
bool HasCommit() const;
// Returns true if MarkRollback will be called during Iterate
bool HasRollback() const;
using WriteBatchBase::GetWriteBatch; using WriteBatchBase::GetWriteBatch;
WriteBatch* GetWriteBatch() override { return this; } WriteBatch* GetWriteBatch() override { return this; }

View File

@ -337,13 +337,13 @@ class WBWIIteratorImpl : public WBWIIterator {
virtual WriteEntry Entry() const override { virtual WriteEntry Entry() const override {
WriteEntry ret; WriteEntry ret;
Slice blob; Slice blob, xid;
const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key(); const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
// this is guaranteed with Valid() // this is guaranteed with Valid()
assert(iter_entry != nullptr && assert(iter_entry != nullptr &&
iter_entry->column_family == column_family_id_); iter_entry->column_family == column_family_id_);
auto s = write_batch_->GetEntryFromDataOffset(iter_entry->offset, &ret.type, auto s = write_batch_->GetEntryFromDataOffset(
&ret.key, &ret.value, &blob); iter_entry->offset, &ret.type, &ret.key, &ret.value, &blob, &xid);
assert(s.ok()); assert(s.ok());
assert(ret.type == kPutRecord || ret.type == kDeleteRecord || assert(ret.type == kPutRecord || ret.type == kDeleteRecord ||
ret.type == kSingleDeleteRecord || ret.type == kMergeRecord); ret.type == kSingleDeleteRecord || ret.type == kMergeRecord);
@ -501,7 +501,7 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
// Loop through all entries in Rep and add each one to the index // Loop through all entries in Rep and add each one to the index
int found = 0; int found = 0;
while (s.ok() && !input.empty()) { while (s.ok() && !input.empty()) {
Slice key, value, blob; Slice key, value, blob, xid;
uint32_t column_family_id = 0; // default uint32_t column_family_id = 0; // default
char tag = 0; char tag = 0;
@ -509,7 +509,7 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
last_entry_offset = input.data() - write_batch.Data().data(); last_entry_offset = input.data() - write_batch.Data().data();
s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key, s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key,
&value, &blob); &value, &blob, &xid);
if (!s.ok()) { if (!s.ok()) {
break; break;
} }
@ -529,6 +529,11 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
} }
break; break;
case kTypeLogData: case kTypeLogData:
case kTypeBeginPrepareXID:
case kTypeEndPrepareXID:
case kTypeCommitXID:
case kTypeRollbackXID:
case kTypeNoop:
break; break;
default: default:
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag");

View File

@ -24,10 +24,10 @@ class Statistics;
Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
WriteType* type, Slice* Key, WriteType* type, Slice* Key,
Slice* value, Slice* value, Slice* blob,
Slice* blob) const { Slice* xid) const {
if (type == nullptr || Key == nullptr || value == nullptr || if (type == nullptr || Key == nullptr || value == nullptr ||
blob == nullptr) { blob == nullptr || xid == nullptr) {
return Status::InvalidArgument("Output parameters cannot be null"); return Status::InvalidArgument("Output parameters cannot be null");
} }
@ -42,8 +42,8 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset); Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset);
char tag; char tag;
uint32_t column_family; uint32_t column_family;
Status s = Status s = ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value,
ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value, blob); blob, xid);
switch (tag) { switch (tag) {
case kTypeColumnFamilyValue: case kTypeColumnFamilyValue:
@ -65,6 +65,12 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
case kTypeLogData: case kTypeLogData:
*type = kLogDataRecord; *type = kLogDataRecord;
break; break;
case kTypeBeginPrepareXID:
case kTypeEndPrepareXID:
case kTypeCommitXID:
case kTypeRollbackXID:
*type = kXIDRecord;
break;
default: default:
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag");
} }
@ -183,7 +189,8 @@ WriteBatchWithIndexInternal::Result WriteBatchWithIndexInternal::GetFromBatch(
result = WriteBatchWithIndexInternal::Result::kDeleted; result = WriteBatchWithIndexInternal::Result::kDeleted;
break; break;
} }
case kLogDataRecord: { case kLogDataRecord:
case kXIDRecord: {
// ignore // ignore
break; break;
} }

View File

@ -58,7 +58,7 @@ class ReadableWriteBatch : public WriteBatch {
// Retrieve some information from a write entry in the write batch, given // Retrieve some information from a write entry in the write batch, given
// the start offset of the write entry. // the start offset of the write entry.
Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
Slice* value, Slice* blob) const; Slice* value, Slice* blob, Slice* xid) const;
}; };
class WriteBatchEntryComparator { class WriteBatchEntryComparator {