WritePrepared Txn: Duplicate Keys, Txn Part
Summary: This patch takes advantage of memtable being able to detect duplicate <key,seq> and returning TryAgain to handle duplicate keys in WritePrepared Txns. Through WriteBatchWithIndex's index it detects existence of at least a duplicate key in the write batch. If duplicate key was reported, it then pays the cost of counting the number of sub-patches by iterating over the write batch and pass it to DBImpl::Write. DB will make use of the provided batch_count to assign proper sequence numbers before sending them to the WAL. When later inserting the batch to the memtable, it increases the seq each time memtbale reports a duplicate (a sub-patch in our counting) and tries again. Closes https://github.com/facebook/rocksdb/pull/3455 Differential Revision: D6873699 Pulled By: maysamyabandeh fbshipit-source-id: db8487526c3a5dc1ddda0ea49f0f979b26ae648d
This commit is contained in:
parent
4b124fb9d3
commit
88d8b2a2f5
@ -673,10 +673,14 @@ class DBImpl : public DB {
|
|||||||
// batch which will be written to memtable later during the commit, and in
|
// batch which will be written to memtable later during the commit, and in
|
||||||
// WritePrepared it is guaranteed since it will be used only for WAL markers
|
// WritePrepared it is guaranteed since it will be used only for WAL markers
|
||||||
// which will never be written to memtable.
|
// which will never be written to memtable.
|
||||||
|
// batch_cnt is expected to be non-zero in seq_per_batch mode and indicates
|
||||||
|
// the number of sub-patches. A sub-patch is a subset of the write batch that
|
||||||
|
// does not have duplicate keys.
|
||||||
Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
|
Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
|
||||||
WriteCallback* callback = nullptr,
|
WriteCallback* callback = nullptr,
|
||||||
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
|
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
|
||||||
bool disable_memtable = false, uint64_t* seq_used = nullptr,
|
bool disable_memtable = false, uint64_t* seq_used = nullptr,
|
||||||
|
size_t batch_cnt = 0,
|
||||||
PreReleaseCallback* pre_release_callback = nullptr);
|
PreReleaseCallback* pre_release_callback = nullptr);
|
||||||
|
|
||||||
Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
|
Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
|
||||||
@ -685,10 +689,13 @@ class DBImpl : public DB {
|
|||||||
bool disable_memtable = false,
|
bool disable_memtable = false,
|
||||||
uint64_t* seq_used = nullptr);
|
uint64_t* seq_used = nullptr);
|
||||||
|
|
||||||
|
// batch_cnt is expected to be non-zero in seq_per_batch mode and indicates
|
||||||
|
// the number of sub-patches. A sub-patch is a subset of the write batch that
|
||||||
|
// does not have duplicate keys.
|
||||||
Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates,
|
Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates,
|
||||||
WriteCallback* callback = nullptr,
|
WriteCallback* callback = nullptr,
|
||||||
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
|
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
|
||||||
uint64_t* seq_used = nullptr,
|
uint64_t* seq_used = nullptr, size_t batch_cnt = 0,
|
||||||
PreReleaseCallback* pre_release_callback = nullptr);
|
PreReleaseCallback* pre_release_callback = nullptr);
|
||||||
|
|
||||||
uint64_t FindMinLogContainingOutstandingPrep();
|
uint64_t FindMinLogContainingOutstandingPrep();
|
||||||
|
@ -64,7 +64,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
WriteBatch* my_batch, WriteCallback* callback,
|
WriteBatch* my_batch, WriteCallback* callback,
|
||||||
uint64_t* log_used, uint64_t log_ref,
|
uint64_t* log_used, uint64_t log_ref,
|
||||||
bool disable_memtable, uint64_t* seq_used,
|
bool disable_memtable, uint64_t* seq_used,
|
||||||
|
size_t batch_cnt,
|
||||||
PreReleaseCallback* pre_release_callback) {
|
PreReleaseCallback* pre_release_callback) {
|
||||||
|
assert(!seq_per_batch_ || batch_cnt != 0);
|
||||||
if (my_batch == nullptr) {
|
if (my_batch == nullptr) {
|
||||||
return Status::Corruption("Batch is nullptr!");
|
return Status::Corruption("Batch is nullptr!");
|
||||||
}
|
}
|
||||||
@ -76,6 +78,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
"pipelined_writes is not compatible with concurrent prepares");
|
"pipelined_writes is not compatible with concurrent prepares");
|
||||||
}
|
}
|
||||||
if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
|
if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
|
||||||
|
// TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt
|
||||||
return Status::NotSupported(
|
return Status::NotSupported(
|
||||||
"pipelined_writes is not compatible with seq_per_batch");
|
"pipelined_writes is not compatible with seq_per_batch");
|
||||||
}
|
}
|
||||||
@ -93,7 +96,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
|
|
||||||
if (two_write_queues_ && disable_memtable) {
|
if (two_write_queues_ && disable_memtable) {
|
||||||
return WriteImplWALOnly(write_options, my_batch, callback, log_used,
|
return WriteImplWALOnly(write_options, my_batch, callback, log_used,
|
||||||
log_ref, seq_used, pre_release_callback);
|
log_ref, seq_used, batch_cnt, pre_release_callback);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (immutable_db_options_.enable_pipelined_write) {
|
if (immutable_db_options_.enable_pipelined_write) {
|
||||||
@ -103,7 +106,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
|
|
||||||
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
||||||
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
|
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
|
||||||
disable_memtable, pre_release_callback);
|
disable_memtable, batch_cnt, pre_release_callback);
|
||||||
|
|
||||||
if (!write_options.disableWAL) {
|
if (!write_options.disableWAL) {
|
||||||
RecordTick(stats_, WRITE_WITH_WAL);
|
RecordTick(stats_, WRITE_WITH_WAL);
|
||||||
@ -122,7 +125,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
w.status = WriteBatchInternal::InsertInto(
|
w.status = WriteBatchInternal::InsertInto(
|
||||||
&w, w.sequence, &column_family_memtables, &flush_scheduler_,
|
&w, w.sequence, &column_family_memtables, &flush_scheduler_,
|
||||||
write_options.ignore_missing_column_families, 0 /*log_number*/, this,
|
write_options.ignore_missing_column_families, 0 /*log_number*/, this,
|
||||||
true /*concurrent_memtable_writes*/, seq_per_batch_);
|
true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (write_thread_.CompleteParallelMemTableWriter(&w)) {
|
if (write_thread_.CompleteParallelMemTableWriter(&w)) {
|
||||||
@ -214,7 +217,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
uint64_t total_byte_size = 0;
|
uint64_t total_byte_size = 0;
|
||||||
for (auto* writer : write_group) {
|
for (auto* writer : write_group) {
|
||||||
if (writer->CheckCallback(this)) {
|
if (writer->CheckCallback(this)) {
|
||||||
valid_batches++;
|
valid_batches += writer->batch_cnt;
|
||||||
if (writer->ShouldWriteToMemtable()) {
|
if (writer->ShouldWriteToMemtable()) {
|
||||||
total_count += WriteBatchInternal::Count(writer->batch);
|
total_count += WriteBatchInternal::Count(writer->batch);
|
||||||
parallel = parallel && !writer->batch->HasMerge();
|
parallel = parallel && !writer->batch->HasMerge();
|
||||||
@ -303,7 +306,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
}
|
}
|
||||||
writer->sequence = next_sequence;
|
writer->sequence = next_sequence;
|
||||||
if (seq_per_batch_) {
|
if (seq_per_batch_) {
|
||||||
next_sequence++;
|
assert(writer->batch_cnt);
|
||||||
|
next_sequence += writer->batch_cnt;
|
||||||
} else if (writer->ShouldWriteToMemtable()) {
|
} else if (writer->ShouldWriteToMemtable()) {
|
||||||
next_sequence += WriteBatchInternal::Count(writer->batch);
|
next_sequence += WriteBatchInternal::Count(writer->batch);
|
||||||
}
|
}
|
||||||
@ -323,7 +327,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
w.status = WriteBatchInternal::InsertInto(
|
w.status = WriteBatchInternal::InsertInto(
|
||||||
&w, w.sequence, &column_family_memtables, &flush_scheduler_,
|
&w, w.sequence, &column_family_memtables, &flush_scheduler_,
|
||||||
write_options.ignore_missing_column_families, 0 /*log_number*/,
|
write_options.ignore_missing_column_families, 0 /*log_number*/,
|
||||||
this, true /*concurrent_memtable_writes*/, seq_per_batch_);
|
this, true /*concurrent_memtable_writes*/, seq_per_batch_,
|
||||||
|
w.batch_cnt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (seq_used != nullptr) {
|
if (seq_used != nullptr) {
|
||||||
@ -515,12 +520,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|||||||
Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
|
Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
|
||||||
WriteBatch* my_batch, WriteCallback* callback,
|
WriteBatch* my_batch, WriteCallback* callback,
|
||||||
uint64_t* log_used, uint64_t log_ref,
|
uint64_t* log_used, uint64_t log_ref,
|
||||||
uint64_t* seq_used,
|
uint64_t* seq_used, size_t batch_cnt,
|
||||||
PreReleaseCallback* pre_release_callback) {
|
PreReleaseCallback* pre_release_callback) {
|
||||||
Status status;
|
Status status;
|
||||||
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
||||||
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
|
WriteThread::Writer w(write_options, my_batch, callback, log_ref,
|
||||||
true /* disable_memtable */, pre_release_callback);
|
true /* disable_memtable */, batch_cnt,
|
||||||
|
pre_release_callback);
|
||||||
RecordTick(stats_, WRITE_WITH_WAL);
|
RecordTick(stats_, WRITE_WITH_WAL);
|
||||||
StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
|
StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
|
||||||
|
|
||||||
@ -576,7 +582,15 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
|
|||||||
PERF_TIMER_GUARD(write_wal_time);
|
PERF_TIMER_GUARD(write_wal_time);
|
||||||
// LastAllocatedSequence is increased inside WriteToWAL under
|
// LastAllocatedSequence is increased inside WriteToWAL under
|
||||||
// wal_write_mutex_ to ensure ordered events in WAL
|
// wal_write_mutex_ to ensure ordered events in WAL
|
||||||
size_t seq_inc = seq_per_batch_ ? write_group.size : 0 /*total_count*/;
|
size_t seq_inc = 0 /* total_count */;
|
||||||
|
if (seq_per_batch_) {
|
||||||
|
size_t total_batch_cnt = 0;
|
||||||
|
for (auto* writer : write_group) {
|
||||||
|
assert(writer->batch_cnt);
|
||||||
|
total_batch_cnt += writer->batch_cnt;
|
||||||
|
}
|
||||||
|
seq_inc = total_batch_cnt;
|
||||||
|
}
|
||||||
if (!write_options.disableWAL) {
|
if (!write_options.disableWAL) {
|
||||||
status =
|
status =
|
||||||
ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
|
ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
|
||||||
@ -591,7 +605,8 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
|
|||||||
}
|
}
|
||||||
writer->sequence = curr_seq;
|
writer->sequence = curr_seq;
|
||||||
if (seq_per_batch_) {
|
if (seq_per_batch_) {
|
||||||
curr_seq++;
|
assert(writer->batch_cnt);
|
||||||
|
curr_seq += writer->batch_cnt;
|
||||||
}
|
}
|
||||||
// else seq advances only by memtable writes
|
// else seq advances only by memtable writes
|
||||||
}
|
}
|
||||||
|
@ -402,14 +402,21 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
|||||||
bool empty_batch = true;
|
bool empty_batch = true;
|
||||||
int found = 0;
|
int found = 0;
|
||||||
Status s;
|
Status s;
|
||||||
while (s.ok() && !input.empty() && handler->Continue()) {
|
char tag = 0;
|
||||||
char tag = 0;
|
uint32_t column_family = 0; // default
|
||||||
uint32_t column_family = 0; // default
|
while ((s.ok() || s.IsTryAgain()) && !input.empty() && handler->Continue()) {
|
||||||
|
if (!s.IsTryAgain()) {
|
||||||
|
tag = 0;
|
||||||
|
column_family = 0; // default
|
||||||
|
|
||||||
s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
|
s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
|
||||||
&blob, &xid);
|
&blob, &xid);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
assert(s.IsTryAgain());
|
||||||
|
s = Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (tag) {
|
switch (tag) {
|
||||||
@ -418,47 +425,59 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
|||||||
assert(content_flags_.load(std::memory_order_relaxed) &
|
assert(content_flags_.load(std::memory_order_relaxed) &
|
||||||
(ContentFlags::DEFERRED | ContentFlags::HAS_PUT));
|
(ContentFlags::DEFERRED | ContentFlags::HAS_PUT));
|
||||||
s = handler->PutCF(column_family, key, value);
|
s = handler->PutCF(column_family, key, value);
|
||||||
empty_batch = false;
|
if (s.ok()) {
|
||||||
found++;
|
empty_batch = false;
|
||||||
|
found++;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case kTypeColumnFamilyDeletion:
|
case kTypeColumnFamilyDeletion:
|
||||||
case kTypeDeletion:
|
case kTypeDeletion:
|
||||||
assert(content_flags_.load(std::memory_order_relaxed) &
|
assert(content_flags_.load(std::memory_order_relaxed) &
|
||||||
(ContentFlags::DEFERRED | ContentFlags::HAS_DELETE));
|
(ContentFlags::DEFERRED | ContentFlags::HAS_DELETE));
|
||||||
s = handler->DeleteCF(column_family, key);
|
s = handler->DeleteCF(column_family, key);
|
||||||
empty_batch = false;
|
if (s.ok()) {
|
||||||
found++;
|
empty_batch = false;
|
||||||
|
found++;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case kTypeColumnFamilySingleDeletion:
|
case kTypeColumnFamilySingleDeletion:
|
||||||
case kTypeSingleDeletion:
|
case kTypeSingleDeletion:
|
||||||
assert(content_flags_.load(std::memory_order_relaxed) &
|
assert(content_flags_.load(std::memory_order_relaxed) &
|
||||||
(ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE));
|
(ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE));
|
||||||
s = handler->SingleDeleteCF(column_family, key);
|
s = handler->SingleDeleteCF(column_family, key);
|
||||||
empty_batch = false;
|
if (s.ok()) {
|
||||||
found++;
|
empty_batch = false;
|
||||||
|
found++;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case kTypeColumnFamilyRangeDeletion:
|
case kTypeColumnFamilyRangeDeletion:
|
||||||
case kTypeRangeDeletion:
|
case kTypeRangeDeletion:
|
||||||
assert(content_flags_.load(std::memory_order_relaxed) &
|
assert(content_flags_.load(std::memory_order_relaxed) &
|
||||||
(ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE));
|
(ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE));
|
||||||
s = handler->DeleteRangeCF(column_family, key, value);
|
s = handler->DeleteRangeCF(column_family, key, value);
|
||||||
empty_batch = false;
|
if (s.ok()) {
|
||||||
found++;
|
empty_batch = false;
|
||||||
|
found++;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case kTypeColumnFamilyMerge:
|
case kTypeColumnFamilyMerge:
|
||||||
case kTypeMerge:
|
case kTypeMerge:
|
||||||
assert(content_flags_.load(std::memory_order_relaxed) &
|
assert(content_flags_.load(std::memory_order_relaxed) &
|
||||||
(ContentFlags::DEFERRED | ContentFlags::HAS_MERGE));
|
(ContentFlags::DEFERRED | ContentFlags::HAS_MERGE));
|
||||||
s = handler->MergeCF(column_family, key, value);
|
s = handler->MergeCF(column_family, key, value);
|
||||||
empty_batch = false;
|
if (s.ok()) {
|
||||||
found++;
|
empty_batch = false;
|
||||||
|
found++;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case kTypeColumnFamilyBlobIndex:
|
case kTypeColumnFamilyBlobIndex:
|
||||||
case kTypeBlobIndex:
|
case kTypeBlobIndex:
|
||||||
assert(content_flags_.load(std::memory_order_relaxed) &
|
assert(content_flags_.load(std::memory_order_relaxed) &
|
||||||
(ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
|
(ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
|
||||||
s = handler->PutBlobIndexCF(column_family, key, value);
|
s = handler->PutBlobIndexCF(column_family, key, value);
|
||||||
found++;
|
if (s.ok()) {
|
||||||
|
found++;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case kTypeLogData:
|
case kTypeLogData:
|
||||||
handler->LogData(blob);
|
handler->LogData(blob);
|
||||||
@ -1084,12 +1103,23 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||||||
MaybeAdvanceSeq();
|
MaybeAdvanceSeq();
|
||||||
return seek_status;
|
return seek_status;
|
||||||
}
|
}
|
||||||
|
Status ret_status;
|
||||||
|
|
||||||
MemTable* mem = cf_mems_->GetMemTable();
|
MemTable* mem = cf_mems_->GetMemTable();
|
||||||
auto* moptions = mem->GetImmutableMemTableOptions();
|
auto* moptions = mem->GetImmutableMemTableOptions();
|
||||||
|
// inplace_update_support is inconsistent with snapshots, and therefore with
|
||||||
|
// any kind of transactions including the ones that use seq_per_batch
|
||||||
|
assert(!seq_per_batch_ || !moptions->inplace_update_support);
|
||||||
if (!moptions->inplace_update_support) {
|
if (!moptions->inplace_update_support) {
|
||||||
mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_,
|
bool mem_res =
|
||||||
get_post_process_info(mem));
|
mem->Add(sequence_, value_type, key, value,
|
||||||
|
concurrent_memtable_writes_, get_post_process_info(mem));
|
||||||
|
if (!mem_res) {
|
||||||
|
assert(seq_per_batch_);
|
||||||
|
ret_status = Status::TryAgain("key+seq exists");
|
||||||
|
const bool BATCH_BOUNDRY = true;
|
||||||
|
MaybeAdvanceSeq(BATCH_BOUNDRY);
|
||||||
|
}
|
||||||
} else if (moptions->inplace_callback == nullptr) {
|
} else if (moptions->inplace_callback == nullptr) {
|
||||||
assert(!concurrent_memtable_writes_);
|
assert(!concurrent_memtable_writes_);
|
||||||
mem->Update(sequence_, key, value);
|
mem->Update(sequence_, key, value);
|
||||||
@ -1125,11 +1155,15 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||||||
value, &merged_value);
|
value, &merged_value);
|
||||||
if (status == UpdateStatus::UPDATED_INPLACE) {
|
if (status == UpdateStatus::UPDATED_INPLACE) {
|
||||||
// prev_value is updated in-place with final value.
|
// prev_value is updated in-place with final value.
|
||||||
mem->Add(sequence_, value_type, key, Slice(prev_buffer, prev_size));
|
bool mem_res __attribute__((__unused__)) = mem->Add(
|
||||||
|
sequence_, value_type, key, Slice(prev_buffer, prev_size));
|
||||||
|
assert(mem_res);
|
||||||
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
|
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
|
||||||
} else if (status == UpdateStatus::UPDATED) {
|
} else if (status == UpdateStatus::UPDATED) {
|
||||||
// merged_value contains the final value.
|
// merged_value contains the final value.
|
||||||
mem->Add(sequence_, value_type, key, Slice(merged_value));
|
bool mem_res __attribute__((__unused__)) =
|
||||||
|
mem->Add(sequence_, value_type, key, Slice(merged_value));
|
||||||
|
assert(mem_res);
|
||||||
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
|
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1139,7 +1173,7 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||||||
// in memtable add/update.
|
// in memtable add/update.
|
||||||
MaybeAdvanceSeq();
|
MaybeAdvanceSeq();
|
||||||
CheckMemtableFull();
|
CheckMemtableFull();
|
||||||
return Status::OK();
|
return ret_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
|
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
|
||||||
@ -1149,12 +1183,20 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||||||
|
|
||||||
Status DeleteImpl(uint32_t column_family_id, const Slice& key,
|
Status DeleteImpl(uint32_t column_family_id, const Slice& key,
|
||||||
const Slice& value, ValueType delete_type) {
|
const Slice& value, ValueType delete_type) {
|
||||||
|
Status ret_status;
|
||||||
MemTable* mem = cf_mems_->GetMemTable();
|
MemTable* mem = cf_mems_->GetMemTable();
|
||||||
mem->Add(sequence_, delete_type, key, value, concurrent_memtable_writes_,
|
bool mem_res =
|
||||||
get_post_process_info(mem));
|
mem->Add(sequence_, delete_type, key, value,
|
||||||
|
concurrent_memtable_writes_, get_post_process_info(mem));
|
||||||
|
if (!mem_res) {
|
||||||
|
assert(seq_per_batch_);
|
||||||
|
ret_status = Status::TryAgain("key+seq exists");
|
||||||
|
const bool BATCH_BOUNDRY = true;
|
||||||
|
MaybeAdvanceSeq(BATCH_BOUNDRY);
|
||||||
|
}
|
||||||
MaybeAdvanceSeq();
|
MaybeAdvanceSeq();
|
||||||
CheckMemtableFull();
|
CheckMemtableFull();
|
||||||
return Status::OK();
|
return ret_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual Status DeleteCF(uint32_t column_family_id,
|
virtual Status DeleteCF(uint32_t column_family_id,
|
||||||
@ -1246,6 +1288,7 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||||||
return seek_status;
|
return seek_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status ret_status;
|
||||||
MemTable* mem = cf_mems_->GetMemTable();
|
MemTable* mem = cf_mems_->GetMemTable();
|
||||||
auto* moptions = mem->GetImmutableMemTableOptions();
|
auto* moptions = mem->GetImmutableMemTableOptions();
|
||||||
bool perform_merge = false;
|
bool perform_merge = false;
|
||||||
@ -1301,18 +1344,30 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||||||
perform_merge = false;
|
perform_merge = false;
|
||||||
} else {
|
} else {
|
||||||
// 3) Add value to memtable
|
// 3) Add value to memtable
|
||||||
mem->Add(sequence_, kTypeValue, key, new_value);
|
bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value);
|
||||||
|
if (!mem_res) {
|
||||||
|
assert(seq_per_batch_);
|
||||||
|
ret_status = Status::TryAgain("key+seq exists");
|
||||||
|
const bool BATCH_BOUNDRY = true;
|
||||||
|
MaybeAdvanceSeq(BATCH_BOUNDRY);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!perform_merge) {
|
if (!perform_merge) {
|
||||||
// Add merge operator to memtable
|
// Add merge operator to memtable
|
||||||
mem->Add(sequence_, kTypeMerge, key, value);
|
bool mem_res = mem->Add(sequence_, kTypeMerge, key, value);
|
||||||
|
if (!mem_res) {
|
||||||
|
assert(seq_per_batch_);
|
||||||
|
ret_status = Status::TryAgain("key+seq exists");
|
||||||
|
const bool BATCH_BOUNDRY = true;
|
||||||
|
MaybeAdvanceSeq(BATCH_BOUNDRY);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MaybeAdvanceSeq();
|
MaybeAdvanceSeq();
|
||||||
CheckMemtableFull();
|
CheckMemtableFull();
|
||||||
return Status::OK();
|
return ret_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
|
virtual Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
|
||||||
@ -1496,6 +1551,8 @@ Status WriteBatchInternal::InsertInto(
|
|||||||
if (!w->status.ok()) {
|
if (!w->status.ok()) {
|
||||||
return w->status;
|
return w->status;
|
||||||
}
|
}
|
||||||
|
assert(!seq_per_batch || w->batch_cnt != 0);
|
||||||
|
assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt);
|
||||||
}
|
}
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
@ -1504,7 +1561,7 @@ Status WriteBatchInternal::InsertInto(
|
|||||||
WriteThread::Writer* writer, SequenceNumber sequence,
|
WriteThread::Writer* writer, SequenceNumber sequence,
|
||||||
ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
|
ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
|
||||||
bool ignore_missing_column_families, uint64_t log_number, DB* db,
|
bool ignore_missing_column_families, uint64_t log_number, DB* db,
|
||||||
bool concurrent_memtable_writes, bool seq_per_batch) {
|
bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt) {
|
||||||
assert(writer->ShouldWriteToMemtable());
|
assert(writer->ShouldWriteToMemtable());
|
||||||
MemTableInserter inserter(sequence, memtables, flush_scheduler,
|
MemTableInserter inserter(sequence, memtables, flush_scheduler,
|
||||||
ignore_missing_column_families, log_number, db,
|
ignore_missing_column_families, log_number, db,
|
||||||
@ -1513,6 +1570,8 @@ Status WriteBatchInternal::InsertInto(
|
|||||||
SetSequence(writer->batch, sequence);
|
SetSequence(writer->batch, sequence);
|
||||||
inserter.set_log_number_ref(writer->log_ref);
|
inserter.set_log_number_ref(writer->log_ref);
|
||||||
Status s = writer->batch->Iterate(&inserter);
|
Status s = writer->batch->Iterate(&inserter);
|
||||||
|
assert(!seq_per_batch || batch_cnt != 0);
|
||||||
|
assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt);
|
||||||
if (concurrent_memtable_writes) {
|
if (concurrent_memtable_writes) {
|
||||||
inserter.PostProcess();
|
inserter.PostProcess();
|
||||||
}
|
}
|
||||||
|
@ -182,7 +182,7 @@ class WriteBatchInternal {
|
|||||||
bool ignore_missing_column_families = false,
|
bool ignore_missing_column_families = false,
|
||||||
uint64_t log_number = 0, DB* db = nullptr,
|
uint64_t log_number = 0, DB* db = nullptr,
|
||||||
bool concurrent_memtable_writes = false,
|
bool concurrent_memtable_writes = false,
|
||||||
bool seq_per_batch = false);
|
bool seq_per_batch = false, size_t batch_cnt = 0);
|
||||||
|
|
||||||
static Status Append(WriteBatch* dst, const WriteBatch* src,
|
static Status Append(WriteBatch* dst, const WriteBatch* src,
|
||||||
const bool WAL_only = false);
|
const bool WAL_only = false);
|
||||||
|
@ -291,7 +291,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
|
|||||||
woptions.disableWAL = !enable_WAL;
|
woptions.disableWAL = !enable_WAL;
|
||||||
woptions.sync = enable_WAL;
|
woptions.sync = enable_WAL;
|
||||||
Status s;
|
Status s;
|
||||||
if (seq_per_batch && two_queues) {
|
if (seq_per_batch) {
|
||||||
class PublishSeqCallback : public PreReleaseCallback {
|
class PublishSeqCallback : public PreReleaseCallback {
|
||||||
public:
|
public:
|
||||||
PublishSeqCallback(DBImpl* db_impl_in)
|
PublishSeqCallback(DBImpl* db_impl_in)
|
||||||
@ -302,9 +302,13 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
|
|||||||
}
|
}
|
||||||
DBImpl* db_impl_;
|
DBImpl* db_impl_;
|
||||||
} publish_seq_callback(db_impl);
|
} publish_seq_callback(db_impl);
|
||||||
s = db_impl->WriteImpl(woptions, &write_op.write_batch_,
|
// seq_per_batch requires a natural batch separator or Noop
|
||||||
&write_op.callback_, nullptr, 0, false,
|
WriteBatchInternal::InsertNoop(&write_op.write_batch_);
|
||||||
nullptr, &publish_seq_callback);
|
const size_t ONE_BATCH = 1;
|
||||||
|
s = db_impl->WriteImpl(
|
||||||
|
woptions, &write_op.write_batch_, &write_op.callback_,
|
||||||
|
nullptr, 0, false, nullptr, ONE_BATCH,
|
||||||
|
two_queues ? &publish_seq_callback : nullptr);
|
||||||
} else {
|
} else {
|
||||||
s = db_impl->WriteWithCallback(
|
s = db_impl->WriteWithCallback(
|
||||||
woptions, &write_op.write_batch_, &write_op.callback_);
|
woptions, &write_op.write_batch_, &write_op.callback_);
|
||||||
|
@ -118,6 +118,7 @@ class WriteThread {
|
|||||||
bool no_slowdown;
|
bool no_slowdown;
|
||||||
bool disable_wal;
|
bool disable_wal;
|
||||||
bool disable_memtable;
|
bool disable_memtable;
|
||||||
|
size_t batch_cnt; // if non-zero, number of sub-batches in the write batch
|
||||||
PreReleaseCallback* pre_release_callback;
|
PreReleaseCallback* pre_release_callback;
|
||||||
uint64_t log_used; // log number that this batch was inserted into
|
uint64_t log_used; // log number that this batch was inserted into
|
||||||
uint64_t log_ref; // log number that memtable insert should reference
|
uint64_t log_ref; // log number that memtable insert should reference
|
||||||
@ -128,6 +129,7 @@ class WriteThread {
|
|||||||
SequenceNumber sequence; // the sequence number to use for the first key
|
SequenceNumber sequence; // the sequence number to use for the first key
|
||||||
Status status; // status of memtable inserter
|
Status status; // status of memtable inserter
|
||||||
Status callback_status; // status returned by callback->Callback()
|
Status callback_status; // status returned by callback->Callback()
|
||||||
|
|
||||||
std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
|
std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
|
||||||
std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
|
std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
|
||||||
Writer* link_older; // read/write only before linking, or as leader
|
Writer* link_older; // read/write only before linking, or as leader
|
||||||
@ -139,6 +141,7 @@ class WriteThread {
|
|||||||
no_slowdown(false),
|
no_slowdown(false),
|
||||||
disable_wal(false),
|
disable_wal(false),
|
||||||
disable_memtable(false),
|
disable_memtable(false),
|
||||||
|
batch_cnt(0),
|
||||||
pre_release_callback(nullptr),
|
pre_release_callback(nullptr),
|
||||||
log_used(0),
|
log_used(0),
|
||||||
log_ref(0),
|
log_ref(0),
|
||||||
@ -152,12 +155,14 @@ class WriteThread {
|
|||||||
|
|
||||||
Writer(const WriteOptions& write_options, WriteBatch* _batch,
|
Writer(const WriteOptions& write_options, WriteBatch* _batch,
|
||||||
WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
|
WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
|
||||||
|
size_t _batch_cnt = 0,
|
||||||
PreReleaseCallback* _pre_release_callback = nullptr)
|
PreReleaseCallback* _pre_release_callback = nullptr)
|
||||||
: batch(_batch),
|
: batch(_batch),
|
||||||
sync(write_options.sync),
|
sync(write_options.sync),
|
||||||
no_slowdown(write_options.no_slowdown),
|
no_slowdown(write_options.no_slowdown),
|
||||||
disable_wal(write_options.disableWAL),
|
disable_wal(write_options.disableWAL),
|
||||||
disable_memtable(_disable_memtable),
|
disable_memtable(_disable_memtable),
|
||||||
|
batch_cnt(_batch_cnt),
|
||||||
pre_release_callback(_pre_release_callback),
|
pre_release_callback(_pre_release_callback),
|
||||||
log_used(0),
|
log_used(0),
|
||||||
log_ref(_log_ref),
|
log_ref(_log_ref),
|
||||||
|
@ -228,13 +228,8 @@ class WriteBatchWithIndex : public WriteBatchBase {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
friend class WritePreparedTxn;
|
friend class WritePreparedTxn;
|
||||||
// TODO(myabandeh): this is hackish, non-efficient solution to enable the e2e
|
// Returns true if there has been duplicate keys in the batch.
|
||||||
// unit tests. Replace it with a proper solution. Collapse the WriteBatch to
|
bool HasDuplicateKeys();
|
||||||
// remove the duplicate keys. The index will not be updated after this.
|
|
||||||
// Returns false if collapse was not necessary
|
|
||||||
bool Collapse();
|
|
||||||
void DisableDuplicateMergeKeys() { allow_dup_merge_ = false; }
|
|
||||||
bool allow_dup_merge_ = true;
|
|
||||||
|
|
||||||
Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
|
Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
|
||||||
ColumnFamilyHandle* column_family, const Slice& key,
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
|
@ -307,7 +307,7 @@ Status WriteCommittedTxn::CommitWithoutPrepareInternal() {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch) {
|
Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) {
|
||||||
Status s = db_->Write(write_options_, batch);
|
Status s = db_->Write(write_options_, batch);
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
@ -121,7 +121,10 @@ class PessimisticTransaction : public TransactionBaseImpl {
|
|||||||
|
|
||||||
virtual Status CommitWithoutPrepareInternal() = 0;
|
virtual Status CommitWithoutPrepareInternal() = 0;
|
||||||
|
|
||||||
virtual Status CommitBatchInternal(WriteBatch* batch) = 0;
|
// batch_cnt if non-zero is the number of sub-batches. A sub-batch is a batch
|
||||||
|
// with no duplicate keys. If zero, then the number of sub-batches is unknown.
|
||||||
|
virtual Status CommitBatchInternal(WriteBatch* batch,
|
||||||
|
size_t batch_cnt = 0) = 0;
|
||||||
|
|
||||||
virtual Status CommitInternal() = 0;
|
virtual Status CommitInternal() = 0;
|
||||||
|
|
||||||
@ -204,7 +207,7 @@ class WriteCommittedTxn : public PessimisticTransaction {
|
|||||||
|
|
||||||
Status CommitWithoutPrepareInternal() override;
|
Status CommitWithoutPrepareInternal() override;
|
||||||
|
|
||||||
Status CommitBatchInternal(WriteBatch* batch) override;
|
Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override;
|
||||||
|
|
||||||
Status CommitInternal() override;
|
Status CommitInternal() override;
|
||||||
|
|
||||||
|
@ -82,12 +82,29 @@ PessimisticTransactionDB::~PessimisticTransactionDB() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status PessimisticTransactionDB::VerifyCFOptions(const ColumnFamilyOptions&) {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
Status PessimisticTransactionDB::Initialize(
|
Status PessimisticTransactionDB::Initialize(
|
||||||
const std::vector<size_t>& compaction_enabled_cf_indices,
|
const std::vector<size_t>& compaction_enabled_cf_indices,
|
||||||
const std::vector<ColumnFamilyHandle*>& handles) {
|
const std::vector<ColumnFamilyHandle*>& handles) {
|
||||||
for (auto cf_ptr : handles) {
|
for (auto cf_ptr : handles) {
|
||||||
AddColumnFamily(cf_ptr);
|
AddColumnFamily(cf_ptr);
|
||||||
}
|
}
|
||||||
|
// Verify cf options
|
||||||
|
for (auto handle : handles) {
|
||||||
|
ColumnFamilyDescriptor cfd;
|
||||||
|
Status s = handle->GetDescriptor(&cfd);
|
||||||
|
if (!s.ok()) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
s = VerifyCFOptions(cfd.options);
|
||||||
|
if (!s.ok()) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Re-enable compaction for the column families that initially had
|
// Re-enable compaction for the column families that initially had
|
||||||
// compaction enabled.
|
// compaction enabled.
|
||||||
std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
|
std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
|
||||||
@ -158,6 +175,30 @@ TransactionDBOptions PessimisticTransactionDB::ValidateTxnDBOptions(
|
|||||||
return validated;
|
return validated;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PessimisticTransactionDB::UpdateCFComparatorMap(
|
||||||
|
const std::vector<ColumnFamilyHandle*>& handles) {
|
||||||
|
auto cf_map = new std::map<uint32_t, const Comparator*>();
|
||||||
|
for (auto h : handles) {
|
||||||
|
auto id = h->GetID();
|
||||||
|
const Comparator* comparator = h->GetComparator();
|
||||||
|
(*cf_map)[id] = comparator;
|
||||||
|
}
|
||||||
|
cf_map_.store(cf_map);
|
||||||
|
cf_map_gc_.reset(cf_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PessimisticTransactionDB::UpdateCFComparatorMap(
|
||||||
|
const ColumnFamilyHandle* h) {
|
||||||
|
auto old_cf_map_ptr = cf_map_.load();
|
||||||
|
assert(old_cf_map_ptr);
|
||||||
|
auto cf_map = new std::map<uint32_t, const Comparator*>(*old_cf_map_ptr);
|
||||||
|
auto id = h->GetID();
|
||||||
|
const Comparator* comparator = h->GetComparator();
|
||||||
|
(*cf_map)[id] = comparator;
|
||||||
|
cf_map_.store(cf_map);
|
||||||
|
cf_map_gc_.reset(cf_map);
|
||||||
|
}
|
||||||
|
|
||||||
Status TransactionDB::Open(const Options& options,
|
Status TransactionDB::Open(const Options& options,
|
||||||
const TransactionDBOptions& txn_db_options,
|
const TransactionDBOptions& txn_db_options,
|
||||||
const std::string& dbname, TransactionDB** dbptr) {
|
const std::string& dbname, TransactionDB** dbptr) {
|
||||||
@ -245,6 +286,7 @@ Status TransactionDB::WrapDB(
|
|||||||
txn_db = new WriteCommittedTxnDB(
|
txn_db = new WriteCommittedTxnDB(
|
||||||
db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options));
|
db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options));
|
||||||
}
|
}
|
||||||
|
txn_db->UpdateCFComparatorMap(handles);
|
||||||
*dbptr = txn_db;
|
*dbptr = txn_db;
|
||||||
Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
|
Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
|
||||||
return s;
|
return s;
|
||||||
@ -270,6 +312,7 @@ Status TransactionDB::WrapStackableDB(
|
|||||||
txn_db = new WriteCommittedTxnDB(
|
txn_db = new WriteCommittedTxnDB(
|
||||||
db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options));
|
db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options));
|
||||||
}
|
}
|
||||||
|
txn_db->UpdateCFComparatorMap(handles);
|
||||||
*dbptr = txn_db;
|
*dbptr = txn_db;
|
||||||
Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
|
Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
|
||||||
return s;
|
return s;
|
||||||
@ -286,10 +329,15 @@ Status PessimisticTransactionDB::CreateColumnFamily(
|
|||||||
const ColumnFamilyOptions& options, const std::string& column_family_name,
|
const ColumnFamilyOptions& options, const std::string& column_family_name,
|
||||||
ColumnFamilyHandle** handle) {
|
ColumnFamilyHandle** handle) {
|
||||||
InstrumentedMutexLock l(&column_family_mutex_);
|
InstrumentedMutexLock l(&column_family_mutex_);
|
||||||
|
Status s = VerifyCFOptions(options);
|
||||||
|
if (!s.ok()) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
Status s = db_->CreateColumnFamily(options, column_family_name, handle);
|
s = db_->CreateColumnFamily(options, column_family_name, handle);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
lock_mgr_.AddColumnFamily((*handle)->GetID());
|
lock_mgr_.AddColumnFamily((*handle)->GetID());
|
||||||
|
UpdateCFComparatorMap(*handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
@ -439,8 +487,6 @@ Status PessimisticTransactionDB::Write(const WriteOptions& opts,
|
|||||||
// concurrent transactions.
|
// concurrent transactions.
|
||||||
Transaction* txn = BeginInternalTransaction(opts);
|
Transaction* txn = BeginInternalTransaction(opts);
|
||||||
txn->DisableIndexing();
|
txn->DisableIndexing();
|
||||||
// TODO(myabandeh): indexing being disabled we need another machanism to
|
|
||||||
// detect duplicattes in the input patch
|
|
||||||
|
|
||||||
auto txn_impl =
|
auto txn_impl =
|
||||||
static_cast_with_check<PessimisticTransaction, Transaction>(txn);
|
static_cast_with_check<PessimisticTransaction, Transaction>(txn);
|
||||||
|
@ -113,15 +113,27 @@ class PessimisticTransactionDB : public TransactionDB {
|
|||||||
std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
|
std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
|
||||||
void SetDeadlockInfoBufferSize(uint32_t target_size) override;
|
void SetDeadlockInfoBufferSize(uint32_t target_size) override;
|
||||||
|
|
||||||
|
void UpdateCFComparatorMap(const std::vector<ColumnFamilyHandle*>& handles);
|
||||||
|
void UpdateCFComparatorMap(const ColumnFamilyHandle* handle);
|
||||||
|
std::map<uint32_t, const Comparator*>* GetCFComparatorMap() {
|
||||||
|
return cf_map_.load();
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
DBImpl* db_impl_;
|
DBImpl* db_impl_;
|
||||||
std::shared_ptr<Logger> info_log_;
|
std::shared_ptr<Logger> info_log_;
|
||||||
const TransactionDBOptions txn_db_options_;
|
const TransactionDBOptions txn_db_options_;
|
||||||
|
// A cache of the cf comparators
|
||||||
|
std::atomic<std::map<uint32_t, const Comparator*>*> cf_map_;
|
||||||
|
// GC of the object above
|
||||||
|
std::unique_ptr<std::map<uint32_t, const Comparator*>> cf_map_gc_;
|
||||||
|
|
||||||
void ReinitializeTransaction(
|
void ReinitializeTransaction(
|
||||||
Transaction* txn, const WriteOptions& write_options,
|
Transaction* txn, const WriteOptions& write_options,
|
||||||
const TransactionOptions& txn_options = TransactionOptions());
|
const TransactionOptions& txn_options = TransactionOptions());
|
||||||
|
|
||||||
|
virtual Status VerifyCFOptions(const ColumnFamilyOptions& cf_options);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class WritePreparedTxnDB;
|
friend class WritePreparedTxnDB;
|
||||||
friend class WritePreparedTxnDBMock;
|
friend class WritePreparedTxnDBMock;
|
||||||
|
@ -232,7 +232,7 @@ class TransactionBaseImpl : public Transaction {
|
|||||||
|
|
||||||
// iterates over the given batch and makes the appropriate inserts.
|
// iterates over the given batch and makes the appropriate inserts.
|
||||||
// used for rebuilding prepared transactions after recovery.
|
// used for rebuilding prepared transactions after recovery.
|
||||||
Status RebuildFromWriteBatch(WriteBatch* src_batch) override;
|
virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override;
|
||||||
|
|
||||||
WriteBatch* GetCommitTimeWriteBatch() override;
|
WriteBatch* GetCommitTimeWriteBatch() override;
|
||||||
|
|
||||||
|
@ -70,6 +70,25 @@ TEST_P(TransactionTest, DoubleEmptyWrite) {
|
|||||||
|
|
||||||
ASSERT_OK(db->Write(write_options, &batch));
|
ASSERT_OK(db->Write(write_options, &batch));
|
||||||
ASSERT_OK(db->Write(write_options, &batch));
|
ASSERT_OK(db->Write(write_options, &batch));
|
||||||
|
|
||||||
|
// Also test committing empty transactions in 2PC
|
||||||
|
TransactionOptions txn_options;
|
||||||
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
ASSERT_OK(txn0->SetName("xid"));
|
||||||
|
ASSERT_OK(txn0->Prepare());
|
||||||
|
ASSERT_OK(txn0->Commit());
|
||||||
|
delete txn0;
|
||||||
|
|
||||||
|
// Also test that it works during recovery
|
||||||
|
txn0 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
ASSERT_OK(txn0->SetName("xid2"));
|
||||||
|
txn0->Put(Slice("foo0"), Slice("bar0a"));
|
||||||
|
ASSERT_OK(txn0->Prepare());
|
||||||
|
delete txn0;
|
||||||
|
ASSERT_OK(ReOpenNoDelete());
|
||||||
|
txn0 = db->GetTransactionByName("xid2");
|
||||||
|
ASSERT_OK(txn0->Commit());
|
||||||
|
delete txn0;
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_P(TransactionTest, SuccessTest) {
|
TEST_P(TransactionTest, SuccessTest) {
|
||||||
@ -2874,12 +2893,8 @@ TEST_P(TransactionTest, UntrackedWrites) {
|
|||||||
// Untracked writes should succeed even though key was written after snapshot
|
// Untracked writes should succeed even though key was written after snapshot
|
||||||
s = txn->PutUntracked("untracked", "1");
|
s = txn->PutUntracked("untracked", "1");
|
||||||
ASSERT_OK(s);
|
ASSERT_OK(s);
|
||||||
if (txn_db_options.write_policy != WRITE_PREPARED) {
|
s = txn->MergeUntracked("untracked", "2");
|
||||||
// WRITE_PREPARED does not currently support dup merge keys.
|
ASSERT_OK(s);
|
||||||
// TODO(myabandeh): remove this if-then when the support is added
|
|
||||||
s = txn->MergeUntracked("untracked", "2");
|
|
||||||
ASSERT_OK(s);
|
|
||||||
}
|
|
||||||
s = txn->DeleteUntracked("untracked");
|
s = txn->DeleteUntracked("untracked");
|
||||||
ASSERT_OK(s);
|
ASSERT_OK(s);
|
||||||
|
|
||||||
@ -4250,11 +4265,6 @@ TEST_P(TransactionTest, SingleDeleteTest) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST_P(TransactionTest, MergeTest) {
|
TEST_P(TransactionTest, MergeTest) {
|
||||||
if (txn_db_options.write_policy == WRITE_PREPARED) {
|
|
||||||
// WRITE_PREPARED does not currently support dup merge keys.
|
|
||||||
// TODO(myabandeh): remove this if-then when the support is added
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
WriteOptions write_options;
|
WriteOptions write_options;
|
||||||
ReadOptions read_options;
|
ReadOptions read_options;
|
||||||
string value;
|
string value;
|
||||||
@ -4978,65 +4988,273 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Test that the transactional db can handle duplicate keys in the write batch
|
// Test that the transactional db can handle duplicate keys in the write batch
|
||||||
TEST_P(TransactionTest, DuplicateKeyTest) {
|
TEST_P(TransactionTest, DuplicateKeys) {
|
||||||
|
ColumnFamilyOptions cf_options;
|
||||||
|
std::string cf_name = "two";
|
||||||
|
ColumnFamilyHandle* cf_handle = nullptr;
|
||||||
|
{
|
||||||
|
db->CreateColumnFamily(cf_options, cf_name, &cf_handle);
|
||||||
|
WriteOptions write_options;
|
||||||
|
WriteBatch batch;
|
||||||
|
batch.Put(Slice("key"), Slice("value"));
|
||||||
|
batch.Put(Slice("key2"), Slice("value2"));
|
||||||
|
// duplicate the keys
|
||||||
|
batch.Put(Slice("key"), Slice("value3"));
|
||||||
|
// duplicate the 2nd key. It should not be counted duplicate since a
|
||||||
|
// sub-patch is cut after the last duplicate.
|
||||||
|
batch.Put(Slice("key2"), Slice("value4"));
|
||||||
|
// duplicate the keys but in a different cf. It should not be counted as
|
||||||
|
// duplicate keys
|
||||||
|
batch.Put(cf_handle, Slice("key"), Slice("value5"));
|
||||||
|
|
||||||
|
ASSERT_OK(db->Write(write_options, &batch));
|
||||||
|
|
||||||
|
ReadOptions ropt;
|
||||||
|
PinnableSlice pinnable_val;
|
||||||
|
auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("value3"));
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "key2", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("value4"));
|
||||||
|
s = db->Get(ropt, cf_handle, "key", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("value5"));
|
||||||
|
|
||||||
|
delete cf_handle;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test with non-bytewise comparator
|
||||||
|
{
|
||||||
|
// A comparator that uses only the first three bytes
|
||||||
|
class ThreeBytewiseComparator : public Comparator {
|
||||||
|
public:
|
||||||
|
ThreeBytewiseComparator() {}
|
||||||
|
virtual const char* Name() const override {
|
||||||
|
return "test.ThreeBytewiseComparator";
|
||||||
|
}
|
||||||
|
virtual int Compare(const Slice& a, const Slice& b) const override {
|
||||||
|
Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
|
||||||
|
Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
|
||||||
|
return na.compare(nb);
|
||||||
|
}
|
||||||
|
virtual bool Equal(const Slice& a, const Slice& b) const override {
|
||||||
|
Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
|
||||||
|
Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
|
||||||
|
return na == nb;
|
||||||
|
}
|
||||||
|
// This methods below dont seem relevant to this test. Implement them if
|
||||||
|
// proven othersize.
|
||||||
|
void FindShortestSeparator(std::string* start,
|
||||||
|
const Slice& limit) const override {
|
||||||
|
const Comparator* bytewise_comp = BytewiseComparator();
|
||||||
|
bytewise_comp->FindShortestSeparator(start, limit);
|
||||||
|
}
|
||||||
|
void FindShortSuccessor(std::string* key) const override {
|
||||||
|
const Comparator* bytewise_comp = BytewiseComparator();
|
||||||
|
bytewise_comp->FindShortSuccessor(key);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
ReOpen();
|
||||||
|
std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
|
||||||
|
cf_options.comparator = comp_gc.get();
|
||||||
|
ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
|
||||||
|
WriteOptions write_options;
|
||||||
|
WriteBatch batch;
|
||||||
|
batch.Put(cf_handle, Slice("key"), Slice("value"));
|
||||||
|
// The first three bytes are the same, do it must be counted as duplicate
|
||||||
|
batch.Put(cf_handle, Slice("key2"), Slice("value2"));
|
||||||
|
ASSERT_OK(db->Write(write_options, &batch));
|
||||||
|
|
||||||
|
// The value must be the most recent value for all the keys equal to "key",
|
||||||
|
// including "key2"
|
||||||
|
ReadOptions ropt;
|
||||||
|
PinnableSlice pinnable_val;
|
||||||
|
ASSERT_OK(db->Get(ropt, cf_handle, "key", &pinnable_val));
|
||||||
|
ASSERT_TRUE(pinnable_val == ("value2"));
|
||||||
|
|
||||||
|
// Test duplicate keys with rollback
|
||||||
|
TransactionOptions txn_options;
|
||||||
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
ASSERT_OK(txn0->SetName("xid"));
|
||||||
|
ASSERT_OK(txn0->Put(cf_handle, Slice("key3"), Slice("value3")));
|
||||||
|
ASSERT_OK(txn0->Merge(cf_handle, Slice("key4"), Slice("value4")));
|
||||||
|
ASSERT_OK(txn0->Rollback());
|
||||||
|
ASSERT_OK(db->Get(ropt, cf_handle, "key5", &pinnable_val));
|
||||||
|
ASSERT_TRUE(pinnable_val == ("value2"));
|
||||||
|
delete txn0;
|
||||||
|
|
||||||
|
delete cf_handle;
|
||||||
|
cf_options.comparator = BytewiseComparator();
|
||||||
|
}
|
||||||
|
|
||||||
for (bool do_prepare : {true, false}) {
|
for (bool do_prepare : {true, false}) {
|
||||||
|
for (bool do_rollback : {true, false}) {
|
||||||
|
for (bool with_commit_batch : {true, false}) {
|
||||||
|
if (with_commit_batch && !do_prepare) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (with_commit_batch && do_rollback) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
ReOpen();
|
||||||
|
db->CreateColumnFamily(cf_options, cf_name, &cf_handle);
|
||||||
|
TransactionOptions txn_options;
|
||||||
|
txn_options.use_only_the_last_commit_time_batch_for_recovery = false;
|
||||||
|
WriteOptions write_options;
|
||||||
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
||||||
|
auto s = txn0->SetName("xid");
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Put(Slice("foo0"), Slice("bar0a"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Put(Slice("foo0"), Slice("bar0b"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Put(Slice("foo1"), Slice("bar1"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Merge(Slice("foo2"), Slice("bar2a"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
// Repeat a key after the start of a sub-patch. This should not cause a
|
||||||
|
// duplicate in the most recent sub-patch and hence not creating a new
|
||||||
|
// sub-patch.
|
||||||
|
s = txn0->Put(Slice("foo0"), Slice("bar0c"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Merge(Slice("foo2"), Slice("bar2b"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
// duplicate the keys but in a different cf. It should not be counted as
|
||||||
|
// duplicate.
|
||||||
|
s = txn0->Put(cf_handle, Slice("foo0"), Slice("bar0-cf1"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Put(Slice("foo3"), Slice("bar3"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Merge(Slice("foo3"), Slice("bar3"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Put(Slice("foo4"), Slice("bar4"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->Delete(Slice("foo4"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = txn0->SingleDelete(Slice("foo4"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
if (do_prepare) {
|
||||||
|
s = txn0->Prepare();
|
||||||
|
ASSERT_OK(s);
|
||||||
|
}
|
||||||
|
if (do_rollback) {
|
||||||
|
// Test rolling back the batch with duplicates
|
||||||
|
s = txn0->Rollback();
|
||||||
|
ASSERT_OK(s);
|
||||||
|
} else {
|
||||||
|
if (with_commit_batch) {
|
||||||
|
assert(do_prepare);
|
||||||
|
auto cb = txn0->GetCommitTimeWriteBatch();
|
||||||
|
// duplicate a key in the original batch
|
||||||
|
// TODO(myabandeh): the behavior of GetCommitTimeWriteBatch
|
||||||
|
// conflicting with the prepared batch is currently undefined and
|
||||||
|
// gives different results in different implementations.
|
||||||
|
|
||||||
|
// s = cb->Put(Slice("foo0"), Slice("bar0d"));
|
||||||
|
// ASSERT_OK(s);
|
||||||
|
// add a new duplicate key
|
||||||
|
s = cb->Put(Slice("foo6"), Slice("bar6a"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = cb->Put(Slice("foo6"), Slice("bar6b"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
// add a duplicate key that is removed in the same batch
|
||||||
|
s = cb->Put(Slice("foo7"), Slice("bar7a"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
s = cb->Delete(Slice("foo7"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
}
|
||||||
|
s = txn0->Commit();
|
||||||
|
ASSERT_OK(s);
|
||||||
|
}
|
||||||
|
if (!do_prepare && !do_rollback) {
|
||||||
|
auto pdb = reinterpret_cast<PessimisticTransactionDB*>(db);
|
||||||
|
pdb->UnregisterTransaction(txn0);
|
||||||
|
}
|
||||||
|
delete txn0;
|
||||||
|
ReadOptions ropt;
|
||||||
|
PinnableSlice pinnable_val;
|
||||||
|
|
||||||
|
if (do_rollback) {
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
||||||
|
ASSERT_TRUE(s.IsNotFound());
|
||||||
|
s = db->Get(ropt, cf_handle, "foo0", &pinnable_val);
|
||||||
|
ASSERT_TRUE(s.IsNotFound());
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
|
||||||
|
ASSERT_TRUE(s.IsNotFound());
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
|
||||||
|
ASSERT_TRUE(s.IsNotFound());
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
|
||||||
|
ASSERT_TRUE(s.IsNotFound());
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
|
||||||
|
ASSERT_TRUE(s.IsNotFound());
|
||||||
|
} else {
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("bar0c"));
|
||||||
|
s = db->Get(ropt, cf_handle, "foo0", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("bar0-cf1"));
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("bar1"));
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("bar2a,bar2b"));
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("bar3,bar3"));
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
|
||||||
|
ASSERT_TRUE(s.IsNotFound());
|
||||||
|
if (with_commit_batch) {
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo6", &pinnable_val);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_TRUE(pinnable_val == ("bar6b"));
|
||||||
|
s = db->Get(ropt, db->DefaultColumnFamily(), "foo7", &pinnable_val);
|
||||||
|
ASSERT_TRUE(s.IsNotFound());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delete cf_handle;
|
||||||
|
} // with_commit_batch
|
||||||
|
} // do_rollback
|
||||||
|
} // do_prepare
|
||||||
|
|
||||||
|
{
|
||||||
|
// Also test with max_successive_merges > 0. max_successive_merges will not
|
||||||
|
// affect our algorithm for duplicate key insertion but we add the test to
|
||||||
|
// verify that.
|
||||||
|
cf_options.max_successive_merges = 2;
|
||||||
|
cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
||||||
|
ReOpen();
|
||||||
|
db->CreateColumnFamily(cf_options, cf_name, &cf_handle);
|
||||||
|
WriteOptions write_options;
|
||||||
|
// Ensure one value for the key
|
||||||
|
db->Put(write_options, cf_handle, Slice("key"), Slice("value"));
|
||||||
|
WriteBatch batch;
|
||||||
|
// Merge more than max_successive_merges times
|
||||||
|
batch.Merge(cf_handle, Slice("key"), Slice("1"));
|
||||||
|
batch.Merge(cf_handle, Slice("key"), Slice("2"));
|
||||||
|
batch.Merge(cf_handle, Slice("key"), Slice("3"));
|
||||||
|
batch.Merge(cf_handle, Slice("key"), Slice("4"));
|
||||||
|
ASSERT_OK(db->Write(write_options, &batch));
|
||||||
|
ReadOptions read_options;
|
||||||
|
string value;
|
||||||
|
ASSERT_OK(db->Get(read_options, cf_handle, "key", &value));
|
||||||
|
ASSERT_EQ(value, "value,1,2,3,4");
|
||||||
|
delete cf_handle;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Test that the duplicate detection is not compromised after rolling back
|
||||||
|
// to a save point
|
||||||
TransactionOptions txn_options;
|
TransactionOptions txn_options;
|
||||||
WriteOptions write_options;
|
WriteOptions write_options;
|
||||||
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
|
||||||
auto s = txn0->SetName("xid");
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
|
||||||
ASSERT_OK(s);
|
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
|
||||||
s = txn0->Put(Slice("foo0"), Slice("bar0a"));
|
txn0->SetSavePoint();
|
||||||
ASSERT_OK(s);
|
txn0->RollbackToSavePoint();
|
||||||
s = txn0->Put(Slice("foo0"), Slice("bar0b"));
|
ASSERT_OK(txn0->Commit());
|
||||||
ASSERT_OK(s);
|
|
||||||
s = txn0->Put(Slice("foo1"), Slice("bar1"));
|
|
||||||
ASSERT_OK(s);
|
|
||||||
s = txn0->Merge(Slice("foo2"), Slice("bar2a"));
|
|
||||||
ASSERT_OK(s);
|
|
||||||
// TODO(myabandeh): enable this after duplicatae merge keys are supported
|
|
||||||
// s = txn0->Merge(Slice("foo2"), Slice("bar2a"));
|
|
||||||
// ASSERT_OK(s);
|
|
||||||
s = txn0->Put(Slice("foo2"), Slice("bar2b"));
|
|
||||||
ASSERT_OK(s);
|
|
||||||
s = txn0->Put(Slice("foo3"), Slice("bar3"));
|
|
||||||
ASSERT_OK(s);
|
|
||||||
// TODO(myabandeh): enable this after duplicatae merge keys are supported
|
|
||||||
// s = txn0->Merge(Slice("foo3"), Slice("bar3"));
|
|
||||||
// ASSERT_OK(s);
|
|
||||||
s = txn0->Put(Slice("foo4"), Slice("bar4"));
|
|
||||||
ASSERT_OK(s);
|
|
||||||
s = txn0->Delete(Slice("foo4"));
|
|
||||||
ASSERT_OK(s);
|
|
||||||
s = txn0->SingleDelete(Slice("foo4"));
|
|
||||||
ASSERT_OK(s);
|
|
||||||
if (do_prepare) {
|
|
||||||
s = txn0->Prepare();
|
|
||||||
ASSERT_OK(s);
|
|
||||||
}
|
|
||||||
s = txn0->Commit();
|
|
||||||
ASSERT_OK(s);
|
|
||||||
if (!do_prepare) {
|
|
||||||
auto pdb = reinterpret_cast<PessimisticTransactionDB*>(db);
|
|
||||||
pdb->UnregisterTransaction(txn0);
|
|
||||||
}
|
|
||||||
delete txn0;
|
|
||||||
ReadOptions ropt;
|
|
||||||
PinnableSlice pinnable_val;
|
|
||||||
|
|
||||||
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
|
|
||||||
ASSERT_OK(s);
|
|
||||||
ASSERT_TRUE(pinnable_val == ("bar0b"));
|
|
||||||
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
|
|
||||||
ASSERT_OK(s);
|
|
||||||
ASSERT_TRUE(pinnable_val == ("bar1"));
|
|
||||||
s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
|
|
||||||
ASSERT_OK(s);
|
|
||||||
ASSERT_TRUE(pinnable_val == ("bar2b"));
|
|
||||||
s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
|
|
||||||
ASSERT_OK(s);
|
|
||||||
ASSERT_TRUE(pinnable_val == ("bar3"));
|
|
||||||
s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
|
|
||||||
ASSERT_TRUE(s.IsNotFound());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
|
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
#include "db/column_family.h"
|
#include "db/column_family.h"
|
||||||
#include "db/db_impl.h"
|
#include "db/db_impl.h"
|
||||||
@ -30,9 +31,7 @@ WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db,
|
|||||||
const WriteOptions& write_options,
|
const WriteOptions& write_options,
|
||||||
const TransactionOptions& txn_options)
|
const TransactionOptions& txn_options)
|
||||||
: PessimisticTransaction(txn_db, write_options, txn_options),
|
: PessimisticTransaction(txn_db, write_options, txn_options),
|
||||||
wpt_db_(txn_db) {
|
wpt_db_(txn_db) {}
|
||||||
GetWriteBatch()->DisableDuplicateMergeKeys();
|
|
||||||
}
|
|
||||||
|
|
||||||
Status WritePreparedTxn::Get(const ReadOptions& read_options,
|
Status WritePreparedTxn::Get(const ReadOptions& read_options,
|
||||||
ColumnFamilyHandle* column_family,
|
ColumnFamilyHandle* column_family,
|
||||||
@ -63,6 +62,69 @@ Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options,
|
|||||||
return write_batch_.NewIteratorWithBase(column_family, db_iter);
|
return write_batch_.NewIteratorWithBase(column_family, db_iter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
// A wrapper around Comparator to make it usable in std::set
|
||||||
|
struct SetComparator {
|
||||||
|
explicit SetComparator() : user_comparator_(BytewiseComparator()) {}
|
||||||
|
explicit SetComparator(const Comparator* user_comparator)
|
||||||
|
: user_comparator_(user_comparator ? user_comparator
|
||||||
|
: BytewiseComparator()) {}
|
||||||
|
bool operator()(const Slice& lhs, const Slice& rhs) const {
|
||||||
|
return user_comparator_->Compare(lhs, rhs) < 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const Comparator* user_comparator_;
|
||||||
|
};
|
||||||
|
// Count the number of sub-batches inside a batch. A sub-batch does not have
|
||||||
|
// duplicate keys.
|
||||||
|
struct SubBatchCounter : public WriteBatch::Handler {
|
||||||
|
explicit SubBatchCounter(std::map<uint32_t, const Comparator*>& comparators)
|
||||||
|
: comparators_(comparators), batches_(1) {}
|
||||||
|
std::map<uint32_t, const Comparator*>& comparators_;
|
||||||
|
using CFKeys = std::set<Slice, SetComparator>;
|
||||||
|
std::map<uint32_t, CFKeys> keys_;
|
||||||
|
size_t batches_;
|
||||||
|
size_t BatchCount() { return batches_; }
|
||||||
|
void AddKey(uint32_t cf, const Slice& key) {
|
||||||
|
CFKeys& cf_keys = keys_[cf];
|
||||||
|
if (cf_keys.size() == 0) { // just inserted
|
||||||
|
auto cmp = comparators_[cf];
|
||||||
|
keys_[cf] = CFKeys(SetComparator(cmp));
|
||||||
|
}
|
||||||
|
auto it = cf_keys.insert(key);
|
||||||
|
if (it.second == false) { // second is false if a element already existed.
|
||||||
|
batches_++;
|
||||||
|
keys_.clear();
|
||||||
|
keys_[cf].insert(key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Status MarkNoop(bool) override { return Status::OK(); }
|
||||||
|
Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
|
||||||
|
Status MarkCommit(const Slice&) override { return Status::OK(); }
|
||||||
|
|
||||||
|
Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
|
||||||
|
AddKey(cf, key);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
Status DeleteCF(uint32_t cf, const Slice& key) override {
|
||||||
|
AddKey(cf, key);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
|
||||||
|
AddKey(cf, key);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
|
||||||
|
AddKey(cf, key);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
Status MarkBeginPrepare() override { return Status::OK(); }
|
||||||
|
Status MarkRollback(const Slice&) override { return Status::OK(); }
|
||||||
|
bool WriteAfterCommit() const override { return false; }
|
||||||
|
};
|
||||||
|
} // namespace
|
||||||
|
|
||||||
Status WritePreparedTxn::PrepareInternal() {
|
Status WritePreparedTxn::PrepareInternal() {
|
||||||
WriteOptions write_options = write_options_;
|
WriteOptions write_options = write_options_;
|
||||||
write_options.disableWAL = false;
|
write_options.disableWAL = false;
|
||||||
@ -71,15 +133,18 @@ Status WritePreparedTxn::PrepareInternal() {
|
|||||||
!WRITE_AFTER_COMMIT);
|
!WRITE_AFTER_COMMIT);
|
||||||
const bool DISABLE_MEMTABLE = true;
|
const bool DISABLE_MEMTABLE = true;
|
||||||
uint64_t seq_used = kMaxSequenceNumber;
|
uint64_t seq_used = kMaxSequenceNumber;
|
||||||
bool collapsed = GetWriteBatch()->Collapse();
|
// For each duplicate key we account for a new sub-batch
|
||||||
if (collapsed) {
|
prepare_batch_cnt_ = 1;
|
||||||
ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
|
if (GetWriteBatch()->HasDuplicateKeys()) {
|
||||||
"Collapse overhead due to duplicate keys");
|
SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
|
||||||
|
auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&counter);
|
||||||
|
assert(s.ok());
|
||||||
|
prepare_batch_cnt_ = counter.BatchCount();
|
||||||
}
|
}
|
||||||
Status s =
|
Status s =
|
||||||
db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
|
db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
|
||||||
/*callback*/ nullptr, &log_number_, /*log ref*/ 0,
|
/*callback*/ nullptr, &log_number_, /*log ref*/ 0,
|
||||||
!DISABLE_MEMTABLE, &seq_used);
|
!DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_);
|
||||||
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
||||||
auto prepare_seq = seq_used;
|
auto prepare_seq = seq_used;
|
||||||
SetId(prepare_seq);
|
SetId(prepare_seq);
|
||||||
@ -93,18 +158,32 @@ Status WritePreparedTxn::PrepareInternal() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Status WritePreparedTxn::CommitWithoutPrepareInternal() {
|
Status WritePreparedTxn::CommitWithoutPrepareInternal() {
|
||||||
bool collapsed = GetWriteBatch()->Collapse();
|
// For each duplicate key we account for a new sub-batch
|
||||||
if (collapsed) {
|
size_t batch_cnt = 1;
|
||||||
ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
|
if (GetWriteBatch()->HasDuplicateKeys()) {
|
||||||
"Collapse overhead due to duplicate keys");
|
batch_cnt = 0; // this will trigger a batch cnt compute
|
||||||
}
|
}
|
||||||
return CommitBatchInternal(GetWriteBatch()->GetWriteBatch());
|
return CommitBatchInternal(GetWriteBatch()->GetWriteBatch(), batch_cnt);
|
||||||
}
|
}
|
||||||
|
|
||||||
Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch) {
|
Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch,
|
||||||
|
size_t batch_cnt) {
|
||||||
ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
|
ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
|
||||||
"CommitBatchInternal");
|
"CommitBatchInternal");
|
||||||
// TODO(myabandeh): handle the duplicate keys in the batch
|
if (batch->Count() == 0) {
|
||||||
|
// Otherwise our 1 seq per batch logic will break since there is no seq
|
||||||
|
// increased for this batch.
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
if (batch_cnt == 0) { // not provided, then compute it
|
||||||
|
// TODO(myabandeh): add an option to allow user skipping this cost
|
||||||
|
SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
|
||||||
|
auto s = batch->Iterate(&counter);
|
||||||
|
assert(s.ok());
|
||||||
|
batch_cnt = counter.BatchCount();
|
||||||
|
}
|
||||||
|
assert(batch_cnt);
|
||||||
|
|
||||||
bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
|
bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
|
||||||
bool sync = write_options_.sync;
|
bool sync = write_options_.sync;
|
||||||
if (!do_one_write) {
|
if (!do_one_write) {
|
||||||
@ -116,12 +195,12 @@ Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch) {
|
|||||||
const bool DISABLE_MEMTABLE = true;
|
const bool DISABLE_MEMTABLE = true;
|
||||||
const uint64_t no_log_ref = 0;
|
const uint64_t no_log_ref = 0;
|
||||||
uint64_t seq_used = kMaxSequenceNumber;
|
uint64_t seq_used = kMaxSequenceNumber;
|
||||||
const bool INCLUDES_DATA = true;
|
const size_t ZERO_PREPARES = 0;
|
||||||
WritePreparedCommitEntryPreReleaseCallback update_commit_map(
|
WritePreparedCommitEntryPreReleaseCallback update_commit_map(
|
||||||
wpt_db_, db_impl_, kMaxSequenceNumber, INCLUDES_DATA);
|
wpt_db_, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, batch_cnt);
|
||||||
auto s = db_impl_->WriteImpl(write_options_, batch, nullptr, nullptr,
|
auto s = db_impl_->WriteImpl(
|
||||||
no_log_ref, !DISABLE_MEMTABLE, &seq_used,
|
write_options_, batch, nullptr, nullptr, no_log_ref, !DISABLE_MEMTABLE,
|
||||||
do_one_write ? &update_commit_map : nullptr);
|
&seq_used, batch_cnt, do_one_write ? &update_commit_map : nullptr);
|
||||||
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
||||||
uint64_t& prepare_seq = seq_used;
|
uint64_t& prepare_seq = seq_used;
|
||||||
SetId(prepare_seq);
|
SetId(prepare_seq);
|
||||||
@ -144,13 +223,14 @@ Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch) {
|
|||||||
// Commit the batch by writing an empty batch to the 2nd queue that will
|
// Commit the batch by writing an empty batch to the 2nd queue that will
|
||||||
// release the commit sequence number to readers.
|
// release the commit sequence number to readers.
|
||||||
WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare(
|
WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare(
|
||||||
wpt_db_, db_impl_, prepare_seq);
|
wpt_db_, db_impl_, prepare_seq, batch_cnt);
|
||||||
WriteBatch empty_batch;
|
WriteBatch empty_batch;
|
||||||
empty_batch.PutLogData(Slice());
|
empty_batch.PutLogData(Slice());
|
||||||
|
const size_t ONE_BATCH = 1;
|
||||||
// In the absence of Prepare markers, use Noop as a batch separator
|
// In the absence of Prepare markers, use Noop as a batch separator
|
||||||
WriteBatchInternal::InsertNoop(&empty_batch);
|
WriteBatchInternal::InsertNoop(&empty_batch);
|
||||||
s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
|
s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
|
||||||
no_log_ref, DISABLE_MEMTABLE, &seq_used,
|
no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
|
||||||
&update_commit_map_with_prepare);
|
&update_commit_map_with_prepare);
|
||||||
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
||||||
return s;
|
return s;
|
||||||
@ -175,17 +255,26 @@ Status WritePreparedTxn::CommitInternal() {
|
|||||||
|
|
||||||
auto prepare_seq = GetId();
|
auto prepare_seq = GetId();
|
||||||
const bool includes_data = !empty && !for_recovery;
|
const bool includes_data = !empty && !for_recovery;
|
||||||
|
assert(prepare_batch_cnt_);
|
||||||
|
size_t commit_batch_cnt = 0;
|
||||||
|
if (includes_data) {
|
||||||
|
SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
|
||||||
|
auto s = working_batch->Iterate(&counter);
|
||||||
|
assert(s.ok());
|
||||||
|
commit_batch_cnt = counter.BatchCount();
|
||||||
|
}
|
||||||
WritePreparedCommitEntryPreReleaseCallback update_commit_map(
|
WritePreparedCommitEntryPreReleaseCallback update_commit_map(
|
||||||
wpt_db_, db_impl_, prepare_seq, includes_data);
|
wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt);
|
||||||
const bool disable_memtable = !includes_data;
|
const bool disable_memtable = !includes_data;
|
||||||
uint64_t seq_used = kMaxSequenceNumber;
|
uint64_t seq_used = kMaxSequenceNumber;
|
||||||
// Since the prepared batch is directly written to memtable, there is already
|
// Since the prepared batch is directly written to memtable, there is already
|
||||||
// a connection between the memtable and its WAL, so there is no need to
|
// a connection between the memtable and its WAL, so there is no need to
|
||||||
// redundantly reference the log that contains the prepared data.
|
// redundantly reference the log that contains the prepared data.
|
||||||
const uint64_t zero_log_number = 0ull;
|
const uint64_t zero_log_number = 0ull;
|
||||||
|
size_t batch_cnt = commit_batch_cnt ? commit_batch_cnt : 1;
|
||||||
auto s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
|
auto s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
|
||||||
zero_log_number, disable_memtable, &seq_used,
|
zero_log_number, disable_memtable, &seq_used,
|
||||||
&update_commit_map);
|
batch_cnt, &update_commit_map);
|
||||||
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@ -203,16 +292,36 @@ Status WritePreparedTxn::RollbackInternal() {
|
|||||||
ReadOptions roptions;
|
ReadOptions roptions;
|
||||||
WritePreparedTxnReadCallback callback;
|
WritePreparedTxnReadCallback callback;
|
||||||
WriteBatch* rollback_batch_;
|
WriteBatch* rollback_batch_;
|
||||||
RollbackWriteBatchBuilder(DBImpl* db, WritePreparedTxnDB* wpt_db,
|
std::map<uint32_t, const Comparator*>& comparators_;
|
||||||
SequenceNumber snap_seq, WriteBatch* dst_batch)
|
using CFKeys = std::set<Slice, SetComparator>;
|
||||||
: db_(db), callback(wpt_db, snap_seq), rollback_batch_(dst_batch) {}
|
std::map<uint32_t, CFKeys> keys_;
|
||||||
|
RollbackWriteBatchBuilder(
|
||||||
|
DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq,
|
||||||
|
WriteBatch* dst_batch,
|
||||||
|
std::map<uint32_t, const Comparator*>& comparators)
|
||||||
|
: db_(db),
|
||||||
|
callback(wpt_db, snap_seq),
|
||||||
|
rollback_batch_(dst_batch),
|
||||||
|
comparators_(comparators) {}
|
||||||
|
|
||||||
Status Rollback(uint32_t cf, const Slice& key) {
|
Status Rollback(uint32_t cf, const Slice& key) {
|
||||||
|
Status s;
|
||||||
|
CFKeys& cf_keys = keys_[cf];
|
||||||
|
if (cf_keys.size() == 0) { // just inserted
|
||||||
|
auto cmp = comparators_[cf];
|
||||||
|
keys_[cf] = CFKeys(SetComparator(cmp));
|
||||||
|
}
|
||||||
|
auto it = cf_keys.insert(key);
|
||||||
|
if (it.second ==
|
||||||
|
false) { // second is false if a element already existed.
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
PinnableSlice pinnable_val;
|
PinnableSlice pinnable_val;
|
||||||
bool not_used;
|
bool not_used;
|
||||||
auto cf_handle = db_->GetColumnFamilyHandle(cf);
|
auto cf_handle = db_->GetColumnFamilyHandle(cf);
|
||||||
auto s = db_->GetImpl(roptions, cf_handle, key, &pinnable_val, ¬_used,
|
s = db_->GetImpl(roptions, cf_handle, key, &pinnable_val, ¬_used,
|
||||||
&callback);
|
&callback);
|
||||||
assert(s.ok() || s.IsNotFound());
|
assert(s.ok() || s.IsNotFound());
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
s = rollback_batch_->Put(cf_handle, key, pinnable_val);
|
s = rollback_batch_->Put(cf_handle, key, pinnable_val);
|
||||||
@ -254,7 +363,8 @@ Status WritePreparedTxn::RollbackInternal() {
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual bool WriteAfterCommit() const override { return false; }
|
virtual bool WriteAfterCommit() const override { return false; }
|
||||||
} rollback_handler(db_impl_, wpt_db_, last_visible_txn, &rollback_batch);
|
} rollback_handler(db_impl_, wpt_db_, last_visible_txn, &rollback_batch,
|
||||||
|
*wpt_db_->GetCFComparatorMap());
|
||||||
auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
|
auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
|
||||||
assert(s.ok());
|
assert(s.ok());
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
@ -266,11 +376,12 @@ Status WritePreparedTxn::RollbackInternal() {
|
|||||||
const bool DISABLE_MEMTABLE = true;
|
const bool DISABLE_MEMTABLE = true;
|
||||||
const uint64_t no_log_ref = 0;
|
const uint64_t no_log_ref = 0;
|
||||||
uint64_t seq_used = kMaxSequenceNumber;
|
uint64_t seq_used = kMaxSequenceNumber;
|
||||||
const bool INCLUDES_DATA = true;
|
const size_t ZERO_PREPARES = 0;
|
||||||
|
const size_t ONE_BATCH = 1;
|
||||||
WritePreparedCommitEntryPreReleaseCallback update_commit_map(
|
WritePreparedCommitEntryPreReleaseCallback update_commit_map(
|
||||||
wpt_db_, db_impl_, kMaxSequenceNumber, INCLUDES_DATA);
|
wpt_db_, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, ONE_BATCH);
|
||||||
s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr,
|
s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr,
|
||||||
no_log_ref, !DISABLE_MEMTABLE, &seq_used,
|
no_log_ref, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
|
||||||
do_one_write ? &update_commit_map : nullptr);
|
do_one_write ? &update_commit_map : nullptr);
|
||||||
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
@ -289,13 +400,13 @@ Status WritePreparedTxn::RollbackInternal() {
|
|||||||
// Commit the batch by writing an empty batch to the queue that will release
|
// Commit the batch by writing an empty batch to the queue that will release
|
||||||
// the commit sequence number to readers.
|
// the commit sequence number to readers.
|
||||||
WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare(
|
WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare(
|
||||||
wpt_db_, db_impl_, prepare_seq);
|
wpt_db_, db_impl_, prepare_seq, ONE_BATCH);
|
||||||
WriteBatch empty_batch;
|
WriteBatch empty_batch;
|
||||||
empty_batch.PutLogData(Slice());
|
empty_batch.PutLogData(Slice());
|
||||||
// In the absence of Prepare markers, use Noop as a batch separator
|
// In the absence of Prepare markers, use Noop as a batch separator
|
||||||
WriteBatchInternal::InsertNoop(&empty_batch);
|
WriteBatchInternal::InsertNoop(&empty_batch);
|
||||||
s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
|
s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
|
||||||
no_log_ref, DISABLE_MEMTABLE, &seq_used,
|
no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
|
||||||
&update_commit_map_with_prepare);
|
&update_commit_map_with_prepare);
|
||||||
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
assert(!s.ok() || seq_used != kMaxSequenceNumber);
|
||||||
// Mark the txn as rolled back
|
// Mark the txn as rolled back
|
||||||
@ -334,6 +445,18 @@ Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
|
|||||||
&snap_checker);
|
&snap_checker);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) {
|
||||||
|
auto ret = PessimisticTransaction::RebuildFromWriteBatch(src_batch);
|
||||||
|
prepare_batch_cnt_ = 1;
|
||||||
|
if (GetWriteBatch()->HasDuplicateKeys()) {
|
||||||
|
SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
|
||||||
|
auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&counter);
|
||||||
|
assert(s.ok());
|
||||||
|
prepare_batch_cnt_ = counter.BatchCount();
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
#endif // ROCKSDB_LITE
|
#endif // ROCKSDB_LITE
|
||||||
|
@ -68,7 +68,7 @@ class WritePreparedTxn : public PessimisticTransaction {
|
|||||||
|
|
||||||
Status CommitWithoutPrepareInternal() override;
|
Status CommitWithoutPrepareInternal() override;
|
||||||
|
|
||||||
Status CommitBatchInternal(WriteBatch* batch) override;
|
Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override;
|
||||||
|
|
||||||
// Since the data is already written to memtables at the Prepare phase, the
|
// Since the data is already written to memtables at the Prepare phase, the
|
||||||
// commit entails writing only a commit marker in the WAL. The sequence number
|
// commit entails writing only a commit marker in the WAL. The sequence number
|
||||||
@ -84,11 +84,15 @@ class WritePreparedTxn : public PessimisticTransaction {
|
|||||||
const Slice& key,
|
const Slice& key,
|
||||||
SequenceNumber* tracked_at_seq) override;
|
SequenceNumber* tracked_at_seq) override;
|
||||||
|
|
||||||
|
virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override;
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
WritePreparedTxn(const WritePreparedTxn&);
|
WritePreparedTxn(const WritePreparedTxn&);
|
||||||
void operator=(const WritePreparedTxn&);
|
void operator=(const WritePreparedTxn&);
|
||||||
|
|
||||||
WritePreparedTxnDB* wpt_db_;
|
WritePreparedTxnDB* wpt_db_;
|
||||||
|
// Number of sub-batches in prepare
|
||||||
|
size_t prepare_batch_cnt_ = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
@ -11,7 +11,6 @@
|
|||||||
|
|
||||||
#include "utilities/transactions/write_prepared_txn_db.h"
|
#include "utilities/transactions/write_prepared_txn_db.h"
|
||||||
|
|
||||||
#include <inttypes.h>
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
@ -49,6 +48,20 @@ Status WritePreparedTxnDB::Initialize(
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status WritePreparedTxnDB::VerifyCFOptions(
|
||||||
|
const ColumnFamilyOptions& cf_options) {
|
||||||
|
Status s = PessimisticTransactionDB::VerifyCFOptions(cf_options);
|
||||||
|
if (!s.ok()) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
if (!cf_options.memtable_factory->CanHandleDuplicatedKey()) {
|
||||||
|
return Status::InvalidArgument(
|
||||||
|
"memtable_factory->CanHandleDuplicatedKey() cannot be false with "
|
||||||
|
"WritePrpeared transactions");
|
||||||
|
}
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
Transaction* WritePreparedTxnDB::BeginTransaction(
|
Transaction* WritePreparedTxnDB::BeginTransaction(
|
||||||
const WriteOptions& write_options, const TransactionOptions& txn_options,
|
const WriteOptions& write_options, const TransactionOptions& txn_options,
|
||||||
Transaction* old_txn) {
|
Transaction* old_txn) {
|
||||||
|
@ -205,6 +205,10 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
|||||||
// Struct to hold ownership of snapshot and read callback for cleanup.
|
// Struct to hold ownership of snapshot and read callback for cleanup.
|
||||||
struct IteratorState;
|
struct IteratorState;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual Status VerifyCFOptions(
|
||||||
|
const ColumnFamilyOptions& cf_options) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class WritePreparedTransactionTest_IsInSnapshotTest_Test;
|
friend class WritePreparedTransactionTest_IsInSnapshotTest_Test;
|
||||||
friend class WritePreparedTransactionTest_CheckAgainstSnapshotsTest_Test;
|
friend class WritePreparedTransactionTest_CheckAgainstSnapshotsTest_Test;
|
||||||
@ -401,30 +405,48 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
|
|||||||
public:
|
public:
|
||||||
// includes_data indicates that the commit also writes non-empty
|
// includes_data indicates that the commit also writes non-empty
|
||||||
// CommitTimeWriteBatch to memtable, which needs to be committed separately.
|
// CommitTimeWriteBatch to memtable, which needs to be committed separately.
|
||||||
WritePreparedCommitEntryPreReleaseCallback(
|
WritePreparedCommitEntryPreReleaseCallback(WritePreparedTxnDB* db,
|
||||||
WritePreparedTxnDB* db, DBImpl* db_impl,
|
DBImpl* db_impl,
|
||||||
SequenceNumber prep_seq = kMaxSequenceNumber, bool includes_data = false)
|
SequenceNumber prep_seq,
|
||||||
|
size_t prep_batch_cnt,
|
||||||
|
size_t data_batch_cnt = 0)
|
||||||
: db_(db),
|
: db_(db),
|
||||||
db_impl_(db_impl),
|
db_impl_(db_impl),
|
||||||
prep_seq_(prep_seq),
|
prep_seq_(prep_seq),
|
||||||
includes_data_(includes_data) {}
|
prep_batch_cnt_(prep_batch_cnt),
|
||||||
|
data_batch_cnt_(data_batch_cnt),
|
||||||
|
includes_data_(data_batch_cnt_ > 0) {
|
||||||
|
assert((prep_batch_cnt_ > 0) != (prep_seq == kMaxSequenceNumber)); // xor
|
||||||
|
assert(prep_batch_cnt_ > 0 || data_batch_cnt_ > 0);
|
||||||
|
}
|
||||||
|
|
||||||
virtual Status Callback(SequenceNumber commit_seq) {
|
virtual Status Callback(SequenceNumber commit_seq) override {
|
||||||
assert(includes_data_ || prep_seq_ != kMaxSequenceNumber);
|
assert(includes_data_ || prep_seq_ != kMaxSequenceNumber);
|
||||||
|
const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
|
||||||
|
? commit_seq
|
||||||
|
: commit_seq + data_batch_cnt_ - 1;
|
||||||
if (prep_seq_ != kMaxSequenceNumber) {
|
if (prep_seq_ != kMaxSequenceNumber) {
|
||||||
db_->AddCommitted(prep_seq_, commit_seq);
|
for (size_t i = 0; i < prep_batch_cnt_; i++) {
|
||||||
|
db_->AddCommitted(prep_seq_ + i, last_commit_seq);
|
||||||
|
}
|
||||||
} // else there was no prepare phase
|
} // else there was no prepare phase
|
||||||
if (includes_data_) {
|
if (includes_data_) {
|
||||||
|
assert(data_batch_cnt_);
|
||||||
// Commit the data that is accompnaied with the commit request
|
// Commit the data that is accompnaied with the commit request
|
||||||
const bool PREPARE_SKIPPED = true;
|
const bool PREPARE_SKIPPED = true;
|
||||||
db_->AddCommitted(commit_seq, commit_seq, PREPARE_SKIPPED);
|
for (size_t i = 0; i < data_batch_cnt_; i++) {
|
||||||
|
// For commit seq of each batch use the commit seq of the last batch.
|
||||||
|
// This would make debugging easier by having all the batches having
|
||||||
|
// the same sequence number.
|
||||||
|
db_->AddCommitted(commit_seq + i, last_commit_seq, PREPARE_SKIPPED);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (db_impl_->immutable_db_options().two_write_queues) {
|
if (db_impl_->immutable_db_options().two_write_queues) {
|
||||||
// Publish the sequence number. We can do that here assuming the callback
|
// Publish the sequence number. We can do that here assuming the callback
|
||||||
// is invoked only from one write queue, which would guarantee that the
|
// is invoked only from one write queue, which would guarantee that the
|
||||||
// publish sequence numbers will be in order, i.e., once a seq is
|
// publish sequence numbers will be in order, i.e., once a seq is
|
||||||
// published all the seq prior to that are also publishable.
|
// published all the seq prior to that are also publishable.
|
||||||
db_impl_->SetLastPublishedSequence(commit_seq);
|
db_impl_->SetLastPublishedSequence(last_commit_seq);
|
||||||
}
|
}
|
||||||
// else SequenceNumber that is updated as part of the write already does the
|
// else SequenceNumber that is updated as part of the write already does the
|
||||||
// publishing
|
// publishing
|
||||||
@ -436,6 +458,8 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
|
|||||||
DBImpl* db_impl_;
|
DBImpl* db_impl_;
|
||||||
// kMaxSequenceNumber if there was no prepare phase
|
// kMaxSequenceNumber if there was no prepare phase
|
||||||
SequenceNumber prep_seq_;
|
SequenceNumber prep_seq_;
|
||||||
|
size_t prep_batch_cnt_;
|
||||||
|
size_t data_batch_cnt_;
|
||||||
// Either because it is commit without prepare or it has a
|
// Either because it is commit without prepare or it has a
|
||||||
// CommitTimeWriteBatch
|
// CommitTimeWriteBatch
|
||||||
bool includes_data_;
|
bool includes_data_;
|
||||||
|
@ -582,67 +582,8 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
|
|||||||
|
|
||||||
WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; }
|
WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; }
|
||||||
|
|
||||||
bool WriteBatchWithIndex::Collapse() {
|
bool WriteBatchWithIndex::HasDuplicateKeys() {
|
||||||
if (rep->obsolete_offsets.size() == 0) {
|
return rep->obsolete_offsets.size() > 0;
|
||||||
return false;
|
|
||||||
}
|
|
||||||
std::sort(rep->obsolete_offsets.begin(), rep->obsolete_offsets.end());
|
|
||||||
WriteBatch& write_batch = rep->write_batch;
|
|
||||||
assert(write_batch.Count() != 0);
|
|
||||||
size_t offset = WriteBatchInternal::GetFirstOffset(&write_batch);
|
|
||||||
Slice input(write_batch.Data());
|
|
||||||
input.remove_prefix(offset);
|
|
||||||
std::string collapsed_buf;
|
|
||||||
collapsed_buf.resize(WriteBatchInternal::kHeader);
|
|
||||||
|
|
||||||
size_t count = 0;
|
|
||||||
Status s;
|
|
||||||
// Loop through all entries in the write batch and add keep them if they are
|
|
||||||
// not obsolete by a newere entry.
|
|
||||||
while (s.ok() && !input.empty()) {
|
|
||||||
Slice key, value, blob, xid;
|
|
||||||
uint32_t column_family_id = 0; // default
|
|
||||||
char tag = 0;
|
|
||||||
// set offset of current entry for call to AddNewEntry()
|
|
||||||
size_t last_entry_offset = input.data() - write_batch.Data().data();
|
|
||||||
s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key,
|
|
||||||
&value, &blob, &xid);
|
|
||||||
if (!rep->obsolete_offsets.empty() &&
|
|
||||||
rep->obsolete_offsets.front() == last_entry_offset) {
|
|
||||||
rep->obsolete_offsets.erase(rep->obsolete_offsets.begin());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
switch (tag) {
|
|
||||||
case kTypeColumnFamilyValue:
|
|
||||||
case kTypeValue:
|
|
||||||
case kTypeColumnFamilyDeletion:
|
|
||||||
case kTypeDeletion:
|
|
||||||
case kTypeColumnFamilySingleDeletion:
|
|
||||||
case kTypeSingleDeletion:
|
|
||||||
case kTypeColumnFamilyMerge:
|
|
||||||
case kTypeMerge:
|
|
||||||
count++;
|
|
||||||
break;
|
|
||||||
case kTypeLogData:
|
|
||||||
case kTypeBeginPrepareXID:
|
|
||||||
case kTypeBeginPersistedPrepareXID:
|
|
||||||
case kTypeEndPrepareXID:
|
|
||||||
case kTypeCommitXID:
|
|
||||||
case kTypeRollbackXID:
|
|
||||||
case kTypeNoop:
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
size_t entry_offset = input.data() - write_batch.Data().data();
|
|
||||||
const std::string& wb_data = write_batch.Data();
|
|
||||||
Slice entry_ptr = Slice(wb_data.data() + last_entry_offset,
|
|
||||||
entry_offset - last_entry_offset);
|
|
||||||
collapsed_buf.append(entry_ptr.data(), entry_ptr.size());
|
|
||||||
}
|
|
||||||
write_batch.rep_ = std::move(collapsed_buf);
|
|
||||||
WriteBatchInternal::SetCount(&write_batch, static_cast<int>(count));
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
WBWIIterator* WriteBatchWithIndex::NewIterator() {
|
WBWIIterator* WriteBatchWithIndex::NewIterator() {
|
||||||
@ -758,15 +699,7 @@ Status WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family,
|
|||||||
rep->SetLastEntryOffset();
|
rep->SetLastEntryOffset();
|
||||||
auto s = rep->write_batch.Merge(column_family, key, value);
|
auto s = rep->write_batch.Merge(column_family, key, value);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
auto size_before = rep->obsolete_offsets.size();
|
|
||||||
rep->AddOrUpdateIndex(column_family, key);
|
rep->AddOrUpdateIndex(column_family, key);
|
||||||
auto size_after = rep->obsolete_offsets.size();
|
|
||||||
bool duplicate_key = size_before != size_after;
|
|
||||||
if (!allow_dup_merge_ && duplicate_key) {
|
|
||||||
assert(0);
|
|
||||||
return Status::NotSupported(
|
|
||||||
"Duplicate key with merge value is not supported yet");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@ -775,15 +708,7 @@ Status WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) {
|
|||||||
rep->SetLastEntryOffset();
|
rep->SetLastEntryOffset();
|
||||||
auto s = rep->write_batch.Merge(key, value);
|
auto s = rep->write_batch.Merge(key, value);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
auto size_before = rep->obsolete_offsets.size();
|
|
||||||
rep->AddOrUpdateIndex(key);
|
rep->AddOrUpdateIndex(key);
|
||||||
auto size_after = rep->obsolete_offsets.size();
|
|
||||||
bool duplicate_key = size_before != size_after;
|
|
||||||
if (!allow_dup_merge_ && duplicate_key) {
|
|
||||||
assert(0);
|
|
||||||
return Status::NotSupported(
|
|
||||||
"Duplicate key with merge value is not supported yet");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@ -958,8 +883,9 @@ Status WriteBatchWithIndex::RollbackToSavePoint() {
|
|||||||
Status s = rep->write_batch.RollbackToSavePoint();
|
Status s = rep->write_batch.RollbackToSavePoint();
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
s = rep->ReBuildIndex();
|
// obsolete_offsets will be rebuilt by ReBuildIndex
|
||||||
rep->obsolete_offsets.clear();
|
rep->obsolete_offsets.clear();
|
||||||
|
s = rep->ReBuildIndex();
|
||||||
}
|
}
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
|
Loading…
Reference in New Issue
Block a user