Update WriteBatch::AssignTimestamp() and Add (#9205)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9205

Update the WriteBatch::AssignTimestamp() and AssignTimestamps() APIs so that they take an
additional argument: a function object called `checker` that implements the user-specified logic
for checking timestamp sizes.

WriteBatch is a building block used by multiple other RocksDB components, each of which may track
timestamp information in different data structures. For example, a transaction can either write to
a `WriteBatchWithIndex`, which is a `WriteBatch` with an index, or write directly to the raw `WriteBatch` if
`Transaction::DisableIndexing()` is called.
`WriteBatchWithIndex` keeps a mapping from column family id to comparator, and a transaction needs
to keep similar information for the `WriteBatch` if the user calls `Transaction::DisableIndexing()` (dynamically),
so that we know the size of each timestamp later. The bookkeeping info maintained by `WriteBatchWithIndex`
and by `Transaction` should not overlap.
When we later call `WriteBatch::AssignTimestamp()`, we need to use these data structures to guarantee
that we do not accidentally assign timestamps to keys from column families that have timestamps disabled.

Reviewed By: ltamasi

Differential Revision: D31735186

fbshipit-source-id: 8b1709ed880ac72f995aa9e012e5873b290840a7
This commit is contained in:
Yanqin Jin 2021-11-30 22:31:41 -08:00 committed by Facebook GitHub Bot
parent 29954b8b57
commit 924616526a
5 changed files with 369 additions and 120 deletions

View File

@ -4,6 +4,7 @@
### Bug Fixes ### Bug Fixes
### Behavior Changes ### Behavior Changes
### Public API change ### Public API change
* Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
## 6.27.0 (2021-11-19) ## 6.27.0 (2021-11-19)
### New Features ### New Features

View File

@ -140,108 +140,6 @@ struct BatchContentClassifier : public WriteBatch::Handler {
} }
}; };
// WriteBatch::Handler that overwrites the trailing timestamp bytes of every
// key it visits, in place. It is constructed either with a single timestamp
// that is applied to all keys, or with a list holding one timestamp per
// visited entry. An empty timestamp means "leave this key untouched".
class TimestampAssigner : public WriteBatch::Handler {
 public:
  // Applies `ts` to every visited key. `prot_info` may be nullptr, in which
  // case no protection information is updated.
  explicit TimestampAssigner(const Slice& ts,
                             WriteBatch::ProtectionInfo* prot_info)
      : timestamp_(ts),
        timestamps_(kEmptyTimestampList),
        prot_info_(prot_info) {}

  // Applies ts_list[i] to the i-th visited entry. Only a reference to
  // `ts_list` is stored, so the vector must outlive this object.
  explicit TimestampAssigner(const std::vector<Slice>& ts_list,
                             WriteBatch::ProtectionInfo* prot_info)
      : timestamps_(ts_list), prot_info_(prot_info) {}

  ~TimestampAssigner() override {}

  Status PutCF(uint32_t, const Slice& key, const Slice&) override {
    AssignTimestamp(key);
    ++idx_;
    return Status::OK();
  }

  Status DeleteCF(uint32_t, const Slice& key) override {
    AssignTimestamp(key);
    ++idx_;
    return Status::OK();
  }

  Status SingleDeleteCF(uint32_t, const Slice& key) override {
    AssignTimestamp(key);
    ++idx_;
    return Status::OK();
  }

  // Only the begin key of the range receives the timestamp.
  Status DeleteRangeCF(uint32_t, const Slice& begin_key,
                       const Slice& /* end_key */) override {
    AssignTimestamp(begin_key);
    ++idx_;
    return Status::OK();
  }

  Status MergeCF(uint32_t, const Slice& key, const Slice&) override {
    AssignTimestamp(key);
    ++idx_;
    return Status::OK();
  }

  Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
    // TODO (yanqin): support blob db in the future.
    return Status::OK();
  }

  Status MarkBeginPrepare(bool) override {
    // TODO (yanqin): support in the future.
    return Status::OK();
  }

  Status MarkEndPrepare(const Slice&) override {
    // TODO (yanqin): support in the future.
    return Status::OK();
  }

  Status MarkCommit(const Slice&) override {
    // TODO (yanqin): support in the future.
    return Status::OK();
  }

  Status MarkRollback(const Slice&) override {
    // TODO (yanqin): support in the future.
    return Status::OK();
  }

 private:
  // Copies the timestamp selected for the current entry over the last
  // ts.size() bytes of `key` and, if protection info is present, updates the
  // entry at idx_ to reflect the new key contents.
  void AssignTimestamp(const Slice& key) {
    assert(timestamps_.empty() || idx_ < timestamps_.size());
    const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_];
    size_t ts_sz = ts.size();
    if (ts_sz == 0) {
      // This key does not have timestamp, so skip.
      return;
    }
    // The key must already contain room for the timestamp; otherwise the
    // unsigned subtraction below would wrap around.
    assert(key.size() >= ts_sz);
    if (prot_info_ != nullptr) {
      SliceParts old_key(&key, 1);
      Slice key_no_ts(key.data(), key.size() - ts_sz);
      std::array<Slice, 2> new_key_cmpts{{key_no_ts, ts}};
      SliceParts new_key(new_key_cmpts.data(), 2);
      prot_info_->entries_[idx_].UpdateK(old_key, new_key);
    }
    // The key bytes are modified in place, hence the const_cast.
    char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
    memcpy(ptr, ts.data(), ts_sz);
  }

  // Bound to timestamps_ by the single-timestamp constructor.
  static const std::vector<Slice> kEmptyTimestampList;
  const Slice timestamp_;
  const std::vector<Slice>& timestamps_;
  WriteBatch::ProtectionInfo* const prot_info_;
  // Index of the entry currently being visited.
  size_t idx_ = 0;

  // No copy or move.
  TimestampAssigner(const TimestampAssigner&) = delete;
  TimestampAssigner(TimestampAssigner&&) = delete;
  TimestampAssigner& operator=(const TimestampAssigner&) = delete;
  TimestampAssigner& operator=(TimestampAssigner&&) = delete;
};
const std::vector<Slice> TimestampAssigner::kEmptyTimestampList;
} // anon namespace } // anon namespace
struct SavePoints { struct SavePoints {
@ -1292,16 +1190,6 @@ Status WriteBatch::PopSavePoint() {
return Status::OK(); return Status::OK();
} }
// Runs a TimestampAssigner built from `ts` and this batch's protection info
// over every entry of the batch and returns the iteration status.
Status WriteBatch::AssignTimestamp(const Slice& ts) {
  WriteBatch::ProtectionInfo* const protection = prot_info_.get();
  TimestampAssigner assigner(ts, protection);
  return Iterate(&assigner);
}
// Runs a TimestampAssigner built from `ts_list` and this batch's protection
// info over every entry of the batch and returns the iteration status.
Status WriteBatch::AssignTimestamps(const std::vector<Slice>& ts_list) {
  WriteBatch::ProtectionInfo* const protection = prot_info_.get();
  TimestampAssigner assigner(ts_list, protection);
  return Iterate(&assigner);
}
class MemTableInserter : public WriteBatch::Handler { class MemTableInserter : public WriteBatch::Handler {
SequenceNumber sequence_; SequenceNumber sequence_;

View File

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once #pragma once
#include <array>
#include <vector> #include <vector>
#include "db/flush_scheduler.h" #include "db/flush_scheduler.h"
@ -19,6 +20,7 @@
#include "rocksdb/types.h" #include "rocksdb/types.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "util/autovector.h" #include "util/autovector.h"
#include "util/cast_util.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@ -260,4 +262,176 @@ class LocalSavePoint {
#endif #endif
}; };
// CRTP base class for WriteBatch handlers that overwrite the trailing
// timestamp bytes of each visited key in place.
//
// Derived must provide
//   Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t idx);
// which is invoked once per visited entry with the entry's running index.
//
// Checker must be callable as `Status(uint32_t cf, size_t& ts_sz)` on a const
// object. It may rewrite `ts_sz`; a resulting value of 0 makes
// UpdateTimestampIfNeeded() leave the key untouched.
template <typename Derived, typename Checker>
class TimestampAssignerBase : public WriteBatch::Handler {
 public:
  // `prot_info` may be nullptr, in which case no protection information is
  // updated. `checker` is moved into this object.
  explicit TimestampAssignerBase(WriteBatch::ProtectionInfo* prot_info,
                                 Checker&& checker)
      : prot_info_(prot_info), checker_(std::move(checker)) {}

  ~TimestampAssignerBase() override {}

  Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
    return AssignTimestamp(cf, key);
  }

  Status DeleteCF(uint32_t cf, const Slice& key) override {
    return AssignTimestamp(cf, key);
  }

  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
    return AssignTimestamp(cf, key);
  }

  // Only the begin key of the range receives the timestamp.
  Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
                       const Slice&) override {
    return AssignTimestamp(cf, begin_key);
  }

  Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
    return AssignTimestamp(cf, key);
  }

  Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override {
    return AssignTimestamp(cf, key);
  }

  // Transaction markers carry no key, so there is nothing to assign.
  Status MarkBeginPrepare(bool) override { return Status::OK(); }

  Status MarkEndPrepare(const Slice&) override { return Status::OK(); }

  Status MarkCommit(const Slice&) override { return Status::OK(); }

  Status MarkRollback(const Slice&) override { return Status::OK(); }

 protected:
  // Dispatches to Derived::AssignTimestampImpl() for the current entry and
  // then advances the entry index, whether or not the call succeeded.
  Status AssignTimestamp(uint32_t cf, const Slice& key) {
    Status s = static_cast_with_check<Derived>(this)->AssignTimestampImpl(
        cf, key, idx_);
    ++idx_;
    return s;
  }

  // Forwards to the user-supplied checker, which may modify `ts_sz`.
  Status CheckTimestampSize(uint32_t cf, size_t& ts_sz) {
    return checker_(cf, ts_sz);
  }

  // Writes `ts` over the tail of `key` when `ts_sz` is non-zero. Returns
  // InvalidArgument if `key` is too short to contain the timestamp; without
  // this guard the `key.size() - ts.size()` arithmetic in the helpers below
  // would wrap around and memcpy would write outside the key.
  Status UpdateTimestampIfNeeded(size_t ts_sz, const Slice& key,
                                 const Slice& ts) {
    if (ts_sz > 0) {
      assert(ts_sz == ts.size());
      if (key.size() < ts.size()) {
        return Status::InvalidArgument("Key is too short to hold timestamp");
      }
      UpdateProtectionInformationIfNeeded(key, ts);
      UpdateTimestamp(key, ts);
    }
    return Status::OK();
  }

  // Updates the protection info entry at idx_ to reflect the key with its
  // last ts.size() bytes replaced by `ts`. No-op when prot_info_ is nullptr.
  // Requires key.size() >= ts.size().
  void UpdateProtectionInformationIfNeeded(const Slice& key, const Slice& ts) {
    if (prot_info_ != nullptr) {
      const size_t ts_sz = ts.size();
      SliceParts old_key(&key, 1);
      Slice key_no_ts(key.data(), key.size() - ts_sz);
      std::array<Slice, 2> new_key_cmpts{{key_no_ts, ts}};
      SliceParts new_key(new_key_cmpts.data(), 2);
      prot_info_->entries_[idx_].UpdateK(old_key, new_key);
    }
  }

  // Copies `ts` over the last ts.size() bytes of `key`. The key bytes are
  // modified in place, hence the const_cast. Requires
  // key.size() >= ts.size().
  void UpdateTimestamp(const Slice& key, const Slice& ts) {
    const size_t ts_sz = ts.size();
    char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
    assert(ptr);
    memcpy(ptr, ts.data(), ts_sz);
  }

  // No copy or move.
  TimestampAssignerBase(const TimestampAssignerBase&) = delete;
  TimestampAssignerBase(TimestampAssignerBase&&) = delete;
  TimestampAssignerBase& operator=(const TimestampAssignerBase&) = delete;
  TimestampAssignerBase& operator=(TimestampAssignerBase&&) = delete;

  WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
  const Checker checker_{};
  // Index of the entry currently being visited.
  size_t idx_ = 0;
};
// Timestamp assigner that takes the timestamp for the i-th visited entry from
// the i-th element of a caller-provided list. Only a reference to the list is
// stored, so it must outlive this object.
template <typename Checker>
class SimpleListTimestampAssigner
    : public TimestampAssignerBase<SimpleListTimestampAssigner<Checker>,
                                   Checker> {
 public:
  explicit SimpleListTimestampAssigner(WriteBatch::ProtectionInfo* prot_info,
                                       Checker checker,
                                       const std::vector<Slice>& timestamps)
      : TimestampAssignerBase<SimpleListTimestampAssigner<Checker>, Checker>(
            prot_info, std::move(checker)),
        timestamps_(timestamps) {}

  ~SimpleListTimestampAssigner() override {}

 private:
  // The base class invokes AssignTimestampImpl(), which is private.
  friend class TimestampAssignerBase<SimpleListTimestampAssigner<Checker>,
                                     Checker>;

  // Looks up the timestamp for entry `idx`, lets the checker validate (and
  // possibly zero out) its size, then writes it into `key` if needed.
  // Fails with InvalidArgument when the list has no element at `idx`.
  Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t idx) {
    if (timestamps_.size() <= idx) {
      return Status::InvalidArgument("Need more timestamps for the assignment");
    }
    const Slice& entry_ts = timestamps_[idx];
    size_t checked_sz = entry_ts.size();
    const Status check_status = this->CheckTimestampSize(cf, checked_sz);
    if (check_status.ok()) {
      return this->UpdateTimestampIfNeeded(checked_sz, key, entry_ts);
    }
    return check_status;
  }

  const std::vector<Slice>& timestamps_;
};
// Timestamp assigner that applies one and the same timestamp to every visited
// entry. The timestamp Slice is stored by value; the bytes it points to are
// not copied.
template <typename Checker>
class TimestampAssigner
    : public TimestampAssignerBase<TimestampAssigner<Checker>, Checker> {
 public:
  // `ts` is expected to be non-empty (asserted in debug builds).
  explicit TimestampAssigner(WriteBatch::ProtectionInfo* prot_info,
                             Checker checker, const Slice& ts)
      : TimestampAssignerBase<TimestampAssigner<Checker>, Checker>(
            prot_info, std::move(checker)),
        timestamp_(ts) {
    assert(!timestamp_.empty());
  }

  ~TimestampAssigner() override {}

 private:
  // The base class invokes AssignTimestampImpl(), which is private.
  friend class TimestampAssignerBase<TimestampAssigner<Checker>, Checker>;

  // Lets the checker validate (and possibly zero out) the timestamp size for
  // `cf`, then writes the timestamp into `key` if needed. The entry index is
  // irrelevant because every entry receives the same timestamp.
  Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t /*idx*/) {
    if (timestamp_.empty()) {
      return Status::InvalidArgument("Timestamp is empty");
    }
    size_t checked_sz = timestamp_.size();
    const Status check_status = this->CheckTimestampSize(cf, checked_sz);
    if (check_status.ok()) {
      return this->UpdateTimestampIfNeeded(checked_sz, key, timestamp_);
    }
    return check_status;
  }

  const Slice timestamp_;
};
// Walks every entry of this batch with a TimestampAssigner that applies `ts`
// to each key, consulting `checker` per entry. Returns the status produced by
// Iterate().
template <typename Checker>
Status WriteBatch::AssignTimestamp(const Slice& ts, Checker checker) {
  // `checker` is already our own copy; move it instead of copying it again
  // into the assigner's by-value parameter.
  TimestampAssigner<Checker> ts_assigner(prot_info_.get(), std::move(checker),
                                         ts);
  return Iterate(&ts_assigner);
}
// Walks every entry of this batch with a SimpleListTimestampAssigner that
// applies ts_list[i] to the i-th entry, consulting `checker` per entry.
// Returns the status produced by Iterate().
template <typename Checker>
Status WriteBatch::AssignTimestamps(const std::vector<Slice>& ts_list,
                                    Checker checker) {
  // `checker` is already our own copy; move it instead of copying it again
  // into the assigner's by-value parameter.
  SimpleListTimestampAssigner<Checker> ts_assigner(
      prot_info_.get(), std::move(checker), ts_list);
  return Iterate(&ts_assigner);
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -13,6 +13,7 @@
#include "db/db_test_util.h" #include "db/db_test_util.h"
#include "db/memtable.h" #include "db/memtable.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
@ -20,6 +21,7 @@
#include "rocksdb/write_buffer_manager.h" #include "rocksdb/write_buffer_manager.h"
#include "table/scoped_arena_iterator.h" #include "table/scoped_arena_iterator.h"
#include "test_util/testharness.h" #include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/string_util.h" #include "util/string_util.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@ -627,13 +629,16 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
public: public:
explicit ColumnFamilyHandleImplDummy(int id) explicit ColumnFamilyHandleImplDummy(int id)
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
explicit ColumnFamilyHandleImplDummy(int id, const Comparator* ucmp)
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
id_(id),
ucmp_(ucmp) {}
uint32_t GetID() const override { return id_; } uint32_t GetID() const override { return id_; }
const Comparator* GetComparator() const override { const Comparator* GetComparator() const override { return ucmp_; }
return BytewiseComparator();
}
private: private:
uint32_t id_; uint32_t id_;
const Comparator* const ucmp_ = BytewiseComparator();
}; };
} // namespace anonymous } // namespace anonymous
@ -899,6 +904,159 @@ TEST_F(WriteBatchTest, MemoryLimitTest) {
ASSERT_TRUE(s.IsMemoryLimit()); ASSERT_TRUE(s.IsMemoryLimit());
} }
namespace {
// Test-only handler that verifies, for every Put in a batch, that the
// timestamp portion of the key equals the expected timestamp. Column families
// whose comparator reports a timestamp size of 0 are accepted without
// inspection; unknown column families and keys shorter than the timestamp
// size are reported as Corruption.
class TimestampChecker : public WriteBatch::Handler {
 public:
  explicit TimestampChecker(
      std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps, Slice ts)
      : cf_to_ucmps_(std::move(cf_to_ucmps)), timestamp_(std::move(ts)) {}

  Status PutCF(uint32_t cf, const Slice& key, const Slice& /*value*/) override {
    const auto entry = cf_to_ucmps_.find(cf);
    if (cf_to_ucmps_.end() == entry) {
      return Status::Corruption();
    }
    const Comparator* const comparator = entry->second;
    assert(comparator);
    const size_t expected_ts_sz = comparator->timestamp_size();
    if (0 == expected_ts_sz) {
      // This column family does not use timestamps; nothing to verify.
      return Status::OK();
    }
    if (key.size() < expected_ts_sz) {
      return Status::Corruption();
    }
    const Slice actual_ts = ExtractTimestampFromUserKey(key, expected_ts_sz);
    if (actual_ts.compare(timestamp_) == 0) {
      return Status::OK();
    }
    return Status::Corruption();
  }

 private:
  std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps_;
  // Expected timestamp; a non-owning view whose bytes must outlive this
  // object.
  Slice timestamp_;
};
// Iterates over `wb` with a TimestampChecker and returns its verdict: OK when
// every Put key in a timestamp-enabled column family carries `timestamp`,
// otherwise the first non-OK status encountered by Iterate().
Status CheckTimestampsInWriteBatch(
    WriteBatch& wb, Slice timestamp,
    std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps) {
  // The map was already copied into the by-value parameter; hand it over
  // instead of copying it a second time.
  TimestampChecker ts_checker(std::move(cf_to_ucmps), timestamp);
  return wb.Iterate(&ts_checker);
}
} // namespace
// Exercises WriteBatch::AssignTimestamp() and AssignTimestamps() with three
// different checker callables on a batch that mixes column family 0 (treated
// by the checkers as having no timestamp) with column families 4 and 5
// (created with test::ComparatorWithU64Ts()). After each assignment the batch
// is re-read with CheckTimestampsInWriteBatch() to confirm the key bytes.
TEST_F(WriteBatchTest, AssignTimestamps) {
// We assume the last eight bytes of each key is reserved for timestamps.
// Therefore, we must make sure each key is longer than eight bytes.
constexpr size_t key_size = 16;
constexpr size_t num_of_keys = 10;
std::vector<std::string> key_strs(num_of_keys, std::string(key_size, '\0'));
ColumnFamilyHandleImplDummy cf0(0);
ColumnFamilyHandleImplDummy cf4(4, test::ComparatorWithU64Ts());
ColumnFamilyHandleImplDummy cf5(5, test::ComparatorWithU64Ts());
const std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps = {
{0, cf0.GetComparator()},
{4, cf4.GetComparator()},
{5, cf5.GetComparator()}};
WriteBatch batch;
// Write to the batch. We will assign timestamps later.
// Entries are added in the repeating order cf0, cf4, cf5; the timestamp list
// built further below relies on this order.
for (const auto& key_str : key_strs) {
ASSERT_OK(batch.Put(&cf0, key_str, "value"));
ASSERT_OK(batch.Put(&cf4, key_str, "value"));
ASSERT_OK(batch.Put(&cf5, key_str, "value"));
}
static constexpr size_t timestamp_size = sizeof(uint64_t);
// checker1: hard-coded rules. cf 4 and cf 5 must be given a timestamp of
// timestamp_size bytes, cf 0 is reported as timestamp-free by resetting ts_sz
// to 0, and any other cf is rejected.
const auto checker1 = [](uint32_t cf, size_t& ts_sz) {
if (cf == 4 || cf == 5) {
if (ts_sz != timestamp_size) {
return Status::InvalidArgument("Timestamp size mismatch");
}
} else if (cf == 0) {
ts_sz = 0;
return Status::OK();
} else {
return Status::Corruption("Invalid cf");
}
return Status::OK();
};
ASSERT_OK(
batch.AssignTimestamp(std::string(timestamp_size, '\xfe'), checker1));
ASSERT_OK(CheckTimestampsInWriteBatch(
batch, std::string(timestamp_size, '\xfe'), cf_to_ucmps));
// We use indexed_cf_to_ucmps, non_indexed_cfs_with_ts and timestamp_size to
// simulate the case in which a transaction enables indexing for some writes
// while disables indexing for other writes. A transaction uses a
// WriteBatchWithIndex object to buffer writes (we consider Write-committed
// policy only). If indexing is enabled, then writes go through
// WriteBatchWithIndex API populating a WBWI internal data structure, i.e. a
// mapping from cf to user comparators. If indexing is disabled, a transaction
// writes directly to the underlying raw WriteBatch. We will need to track the
// comparator information for the column families to which un-indexed writes
// are performed. When calling AssignTimestamp(s) API of WriteBatch, we need
// indexed_cf_to_ucmps, non_indexed_cfs_with_ts, and timestamp_size to perform
// checking.
std::unordered_map<uint32_t, const Comparator*> indexed_cf_to_ucmps = {
{0, cf0.GetComparator()}, {4, cf4.GetComparator()}};
std::unordered_set<uint32_t> non_indexed_cfs_with_ts = {cf5.GetID()};
// checker2: consults the non-indexed set first, then falls back to the
// comparator map; a comparator with timestamp size 0 resets ts_sz to 0.
const auto checker2 = [&indexed_cf_to_ucmps, &non_indexed_cfs_with_ts](
uint32_t cf, size_t& ts_sz) {
if (non_indexed_cfs_with_ts.count(cf) > 0) {
if (ts_sz != timestamp_size) {
return Status::InvalidArgument("Timestamp size mismatch");
}
return Status::OK();
}
auto cf_iter = indexed_cf_to_ucmps.find(cf);
if (cf_iter == indexed_cf_to_ucmps.end()) {
return Status::Corruption("Unknown cf");
}
const Comparator* const ucmp = cf_iter->second;
assert(ucmp);
if (ucmp->timestamp_size() == 0) {
ts_sz = 0;
} else if (ts_sz != ucmp->timestamp_size()) {
return Status::InvalidArgument("Timestamp size mismatch");
}
return Status::OK();
};
ASSERT_OK(
batch.AssignTimestamp(std::string(timestamp_size, '\xef'), checker2));
ASSERT_OK(CheckTimestampsInWriteBatch(
batch, std::string(timestamp_size, '\xef'), cf_to_ucmps));
// Build one timestamp per batch entry: an empty placeholder for every cf0
// entry (index divisible by 3) and timestamp_size bytes of 0xee otherwise.
std::vector<std::string> ts_strs;
for (size_t i = 0; i < 3 * key_strs.size(); ++i) {
if (0 == (i % 3)) {
ts_strs.emplace_back();
} else {
ts_strs.emplace_back(std::string(timestamp_size, '\xee'));
}
}
// The Slices below alias the strings in ts_strs, which therefore must stay
// alive and unmodified while ts_vec is in use.
std::vector<Slice> ts_vec(ts_strs.size());
for (size_t i = 0; i < ts_vec.size(); ++i) {
ts_vec[i] = ts_strs[i];
}
// checker3: strict comparison. The supplied timestamp size must equal the
// comparator's timestamp size exactly; ts_sz is never rewritten.
const auto checker3 = [&cf_to_ucmps](uint32_t cf, size_t& ts_sz) {
auto cf_iter = cf_to_ucmps.find(cf);
if (cf_iter == cf_to_ucmps.end()) {
return Status::Corruption("Invalid cf");
}
const Comparator* const ucmp = cf_iter->second;
assert(ucmp);
if (ucmp->timestamp_size() != ts_sz) {
return Status::InvalidArgument("Timestamp size mismatch");
}
return Status::OK();
};
ASSERT_OK(batch.AssignTimestamps(ts_vec, checker3));
ASSERT_OK(CheckTimestampsInWriteBatch(
batch, std::string(timestamp_size, '\xee'), cf_to_ucmps));
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) { int main(int argc, char** argv) {

View File

@ -25,10 +25,13 @@
#pragma once #pragma once
#include <stdint.h> #include <stdint.h>
#include <atomic> #include <atomic>
#include <functional>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/write_batch_base.h" #include "rocksdb/write_batch_base.h"
@ -333,18 +336,43 @@ class WriteBatch : public WriteBatchBase {
// Returns true if MarkRollback will be called during Iterate // Returns true if MarkRollback will be called during Iterate
bool HasRollback() const; bool HasRollback() const;
// Assign timestamp to write batch. struct TimestampChecker final {
// This requires that all keys (possibly from multiple column families) in Status operator()(uint32_t /*cf*/, size_t& /*ts_sz*/) const {
// the write batch have timestamps of the same format. return Status::OK();
Status AssignTimestamp(const Slice& ts); }
};
// Experimental.
// Assign timestamp to write batch.
// This requires that all keys with timestamps enabled (possibly from
// multiple column families) in the write batch have timestamps of the same
// format.
// checker: callable object to check the timestamp sizes of column families.
// It is invoked as checker(uint32_t cf, size_t& ts_sz) and should do the
// following:
// 1. find out the timestamp size of cf.
// 2. if cf's timestamp size is 0, then set ts_sz to 0 and return OK.
// 3. otherwise, compare ts_sz with cf's timestamp size and return
// Status::InvalidArgument() if different.
template <typename Checker = TimestampChecker>
Status AssignTimestamp(const Slice& ts, Checker checker = Checker());
// Experimental.
// Assign timestamps to write batch. // Assign timestamps to write batch.
// This API allows the write batch to include keys from multiple column // This API allows the write batch to include keys from multiple column
// families whose timestamps' formats can differ. For example, some column // families whose timestamps' formats can differ. For example, some column
// families can enable timestamp, while others disable the feature. // families can enable timestamp, while others disable the feature.
// If key does not have timestamp, then put an empty Slice in ts_list as // If key does not have timestamp, then put an empty Slice in ts_list as
// a placeholder. // a placeholder.
Status AssignTimestamps(const std::vector<Slice>& ts_list); // checker: callable object specified by caller to check the timestamp sizes
// of column families.
// User can call checker(uint32_t cf, size_t& ts_sz) which does the
// following:
// 1. find out the timestamp size of cf.
// 2. compare ts_sz with cf's timestamp size and return
// Status::InvalidArgument() if different.
template <typename Checker = TimestampChecker>
Status AssignTimestamps(const std::vector<Slice>& ts_list,
Checker checker = Checker());
using WriteBatchBase::GetWriteBatch; using WriteBatchBase::GetWriteBatch;
WriteBatch* GetWriteBatch() override { return this; } WriteBatch* GetWriteBatch() override { return this; }