Avoid user key copying for Get/Put/Write with user-timestamp (#5502)

Summary:
In previous https://github.com/facebook/rocksdb/issues/5079, we added user-specified timestamp to `DB::Get()` and `DB::Put()`. Limitation is that these two functions may cause extra memory allocation and key copy. The reason is that `WriteBatch` does not allocate extra memory for timestamps because it is not aware of timestamp size, and we did not provide an API to assign/update timestamp of each key within a `WriteBatch`.
We address these issues in this PR by doing the following.
1. Add a `timestamp_size_` to `WriteBatch` so that `WriteBatch` can take timestamps into account when calling `WriteBatch::Put`, `WriteBatch::Delete`, etc.
2. Add APIs `WriteBatch::AssignTimestamp` and `WriteBatch::AssignTimestamps` so that application can assign/update timestamps for each key in a `WriteBatch`.
3. Avoid key copy in `GetImpl` by adding new constructor to `LookupKey`.

Test plan (on devserver):
```
$make clean && COMPILE_WITH_ASAN=1 make -j32 all
$./db_basic_test --gtest_filter=Timestamp/DBBasicTestWithTimestampWithParam.PutAndGet/*
$make check
```
If the API extension looks good, I will add more unit tests.

Some simple benchmark using db_bench.
```
$rm -rf /dev/shm/dbbench/* && TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillseq,readrandom -num=1000000
$rm -rf /dev/shm/dbbench/* && TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=1000000 -disable_wal=true
```
Master is at a78503bd6c.
```
|        | readrandom | fillrandom |
| master | 15.53 MB/s | 25.97 MB/s |
| PR5502 | 16.70 MB/s | 25.80 MB/s |
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5502

Differential Revision: D16340894

Pulled By: riversand963

fbshipit-source-id: 51132cf792be07d1efc3ac33f5768c4ee2608bb8
This commit is contained in:
Yanqin Jin 2019-07-25 15:23:46 -07:00 committed by Facebook Github Bot
parent 0d16fad51b
commit ae152ee666
9 changed files with 183 additions and 45 deletions

1
.gitignore vendored
View File

@ -49,6 +49,7 @@ rocksdb_undump
db_test2
trace_analyzer
trace_analyzer_test
block_cache_trace_analyzer
.DS_Store
java/out

View File

@ -1441,16 +1441,7 @@ ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
Status DBImpl::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) {
if (nullptr == read_options.timestamp) {
return GetImpl(read_options, column_family, key, value);
}
Slice akey;
std::string buf;
Status s = AppendTimestamp(key, *(read_options.timestamp), &akey, &buf);
if (s.ok()) {
s = GetImpl(read_options, column_family, akey, value);
}
return s;
}
Status DBImpl::GetImpl(const ReadOptions& read_options,
@ -1528,7 +1519,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
// First look in the memtable, then in the immutable memtable (if any).
// s is both in/out. When in, s could either be OK or MergeInProgress.
// merge_operands will contain the sequence of merges in the latter case.
LookupKey lkey(key, snapshot);
LookupKey lkey(key, snapshot, read_options.timestamp);
PERF_TIMER_STOP(get_snapshot_time);
bool skip_memtable = (read_options.read_tier == kPersistedTier &&

View File

@ -1734,14 +1734,16 @@ Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
}
return Write(opt, &batch);
}
Slice akey;
std::string buf;
Status s = AppendTimestamp(key, *(opt.timestamp), &akey, &buf);
const Slice* ts = opt.timestamp;
assert(nullptr != ts);
size_t ts_sz = ts->size();
WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0,
ts_sz);
Status s = batch.Put(column_family, key, value);
if (!s.ok()) {
return s;
}
WriteBatch batch(akey.size() + value.size() + 24);
s = batch.Put(column_family, akey, value);
s = batch.AssignTimestamp(*ts);
if (!s.ok()) {
return s;
}

View File

@ -159,9 +159,11 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
}
}
LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) {
LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
const Slice* ts) {
size_t usize = _user_key.size();
size_t needed = usize + 13; // A conservative estimate
size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
size_t needed = usize + ts_sz + 13; // A conservative estimate
char* dst;
if (needed <= sizeof(space_)) {
dst = space_;
@ -170,10 +172,14 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) {
}
start_ = dst;
// NOTE: We don't support users keys of more than 2GB :)
dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + 8));
dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
kstart_ = dst;
memcpy(dst, _user_key.data(), usize);
dst += usize;
if (nullptr != ts) {
memcpy(dst, ts->data(), ts_sz);
dst += ts_sz;
}
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
dst += 8;
end_ = dst;

View File

@ -669,20 +669,4 @@ struct ParsedInternalKeyComparator {
const InternalKeyComparator* cmp;
};
// TODO (yanqin): this causes extra memory allocation and copy. Should be
// addressed in the future.
inline Status AppendTimestamp(const Slice& key, const Slice& timestamp,
Slice* ret_key, std::string* ret_buf) {
assert(ret_key != nullptr);
assert(ret_buf != nullptr);
if (key.data() + key.size() == timestamp.data()) {
*ret_key = Slice(key.data(), key.size() + timestamp.size());
} else {
ret_buf->assign(key.data(), key.size());
ret_buf->append(timestamp.data(), timestamp.size());
*ret_key = Slice(*ret_buf);
}
return Status::OK();
}
} // namespace rocksdb

View File

@ -21,7 +21,8 @@ class LookupKey {
public:
// Initialize *this for looking up user_key at a snapshot with
// the specified sequence number.
LookupKey(const Slice& _user_key, SequenceNumber sequence);
LookupKey(const Slice& _user_key, SequenceNumber sequence,
const Slice* ts = nullptr);
~LookupKey();

View File

@ -135,6 +135,105 @@ struct BatchContentClassifier : public WriteBatch::Handler {
}
};
class TimestampAssigner : public WriteBatch::Handler {
public:
explicit TimestampAssigner(const Slice& ts)
: timestamp_(ts), timestamps_(kEmptyTimestampList) {}
explicit TimestampAssigner(const std::vector<Slice>& ts_list)
: timestamps_(ts_list) {
SanityCheck();
}
~TimestampAssigner() override {}
Status PutCF(uint32_t, const Slice& key, const Slice&) override {
AssignTimestamp(key);
++idx_;
return Status::OK();
}
Status DeleteCF(uint32_t, const Slice& key) override {
AssignTimestamp(key);
++idx_;
return Status::OK();
}
Status SingleDeleteCF(uint32_t, const Slice& key) override {
AssignTimestamp(key);
++idx_;
return Status::OK();
}
Status DeleteRangeCF(uint32_t, const Slice& begin_key,
const Slice& end_key) override {
AssignTimestamp(begin_key);
AssignTimestamp(end_key);
++idx_;
return Status::OK();
}
Status MergeCF(uint32_t, const Slice& key, const Slice&) override {
AssignTimestamp(key);
++idx_;
return Status::OK();
}
Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
// TODO (yanqin): support blob db in the future.
return Status::OK();
}
Status MarkBeginPrepare(bool) override {
// TODO (yanqin): support in the future.
return Status::OK();
}
Status MarkEndPrepare(const Slice&) override {
// TODO (yanqin): support in the future.
return Status::OK();
}
Status MarkCommit(const Slice&) override {
// TODO (yanqin): support in the future.
return Status::OK();
}
Status MarkRollback(const Slice&) override {
// TODO (yanqin): support in the future.
return Status::OK();
}
private:
void SanityCheck() const {
assert(!timestamps_.empty());
#ifndef NDEBUG
const size_t ts_sz = timestamps_[0].size();
for (size_t i = 1; i != timestamps_.size(); ++i) {
assert(ts_sz == timestamps_[i].size());
}
#endif // !NDEBUG
}
void AssignTimestamp(const Slice& key) {
assert(timestamps_.empty() || idx_ < timestamps_.size());
const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_];
size_t ts_sz = ts.size();
char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
memcpy(ptr, ts.data(), ts_sz);
}
static const std::vector<Slice> kEmptyTimestampList;
const Slice timestamp_;
const std::vector<Slice>& timestamps_;
size_t idx_ = 0;
// No copy or move.
TimestampAssigner(const TimestampAssigner&) = delete;
TimestampAssigner(TimestampAssigner&&) = delete;
TimestampAssigner& operator=(const TimestampAssigner&) = delete;
TimestampAssigner&& operator=(TimestampAssigner&&) = delete;
};
const std::vector<Slice> TimestampAssigner::kEmptyTimestampList;
} // anon namespace
struct SavePoints {
@ -142,7 +241,15 @@ struct SavePoints {
};
WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
: content_flags_(0), max_bytes_(max_bytes), rep_() {
: content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) {
rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
? reserved_bytes
: WriteBatchInternal::kHeader);
rep_.resize(WriteBatchInternal::kHeader);
}
WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz)
: content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) {
rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ?
reserved_bytes : WriteBatchInternal::kHeader);
rep_.resize(WriteBatchInternal::kHeader);
@ -151,18 +258,21 @@ WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
WriteBatch::WriteBatch(const std::string& rep)
: content_flags_(ContentFlags::DEFERRED),
max_bytes_(0),
rep_(rep) {}
rep_(rep),
timestamp_size_(0) {}
WriteBatch::WriteBatch(std::string&& rep)
: content_flags_(ContentFlags::DEFERRED),
max_bytes_(0),
rep_(std::move(rep)) {}
rep_(std::move(rep)),
timestamp_size_(0) {}
WriteBatch::WriteBatch(const WriteBatch& src)
: wal_term_point_(src.wal_term_point_),
content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
max_bytes_(src.max_bytes_),
rep_(src.rep_) {
rep_(src.rep_),
timestamp_size_(src.timestamp_size_) {
if (src.save_points_ != nullptr) {
save_points_.reset(new SavePoints());
save_points_->stack = src.save_points_->stack;
@ -174,7 +284,8 @@ WriteBatch::WriteBatch(WriteBatch&& src) noexcept
wal_term_point_(std::move(src.wal_term_point_)),
content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
max_bytes_(src.max_bytes_),
rep_(std::move(src.rep_)) {}
rep_(std::move(src.rep_)),
timestamp_size_(src.timestamp_size_) {}
WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
if (&src != this) {
@ -643,7 +754,14 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
PutVarint32(&b->rep_, column_family_id);
}
if (0 == b->timestamp_size_) {
PutLengthPrefixedSlice(&b->rep_, key);
} else {
PutVarint32(&b->rep_,
static_cast<uint32_t>(key.size() + b->timestamp_size_));
b->rep_.append(key.data(), key.size());
b->rep_.append(b->timestamp_size_, '\0');
}
PutLengthPrefixedSlice(&b->rep_, value);
b->content_flags_.store(
b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
@ -692,7 +810,11 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
PutVarint32(&b->rep_, column_family_id);
}
if (0 == b->timestamp_size_) {
PutLengthPrefixedSliceParts(&b->rep_, key);
} else {
PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_);
}
PutLengthPrefixedSliceParts(&b->rep_, value);
b->content_flags_.store(
b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
@ -1038,6 +1160,16 @@ Status WriteBatch::PopSavePoint() {
return Status::OK();
}
Status WriteBatch::AssignTimestamp(const Slice& ts) {
TimestampAssigner ts_assigner(ts);
return Iterate(&ts_assigner);
}
Status WriteBatch::AssignTimestamps(const std::vector<Slice>& ts_list) {
TimestampAssigner ts_assigner(ts_list);
return Iterate(&ts_assigner);
}
class MemTableInserter : public WriteBatch::Handler {
SequenceNumber sequence_;

View File

@ -28,6 +28,7 @@
#include <atomic>
#include <memory>
#include <string>
#include <vector>
#include "rocksdb/status.h"
#include "rocksdb/write_batch_base.h"
@ -60,6 +61,7 @@ struct SavePoint {
class WriteBatch : public WriteBatchBase {
public:
explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0);
explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz);
~WriteBatch() override;
using WriteBatchBase::Put;
@ -311,6 +313,12 @@ class WriteBatch : public WriteBatchBase {
// Returns trie if MarkRollback will be called during Iterate
bool HasRollback() const;
// Assign timestamp to write batch
Status AssignTimestamp(const Slice& ts);
// Assign timestamps to write batch
Status AssignTimestamps(const std::vector<Slice>& ts_list);
using WriteBatchBase::GetWriteBatch;
WriteBatch* GetWriteBatch() override { return this; }
@ -361,6 +369,7 @@ class WriteBatch : public WriteBatchBase {
protected:
std::string rep_; // See comment in write_batch.cc for the format of rep_
const size_t timestamp_size_;
// Intentionally copyable
};

View File

@ -50,6 +50,8 @@ extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1,
extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
extern void PutLengthPrefixedSliceParts(std::string* dst,
const SliceParts& slice_parts);
extern void PutLengthPrefixedSlicePartsWithPadding(
std::string* dst, const SliceParts& slice_parts, size_t pad_sz);
// Standard Get... routines parse a value from the beginning of a Slice
// and advance the slice past the parsed value.
@ -306,9 +308,8 @@ inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
dst->append(value.data(), value.size());
}
inline void PutLengthPrefixedSliceParts(std::string* dst,
inline void PutLengthPrefixedSliceParts(std::string* dst, size_t total_bytes,
const SliceParts& slice_parts) {
size_t total_bytes = 0;
for (int i = 0; i < slice_parts.num_parts; ++i) {
total_bytes += slice_parts.parts[i].size();
}
@ -318,6 +319,17 @@ inline void PutLengthPrefixedSliceParts(std::string* dst,
}
}
inline void PutLengthPrefixedSliceParts(std::string* dst,
const SliceParts& slice_parts) {
PutLengthPrefixedSliceParts(dst, /*total_bytes=*/0, slice_parts);
}
inline void PutLengthPrefixedSlicePartsWithPadding(
std::string* dst, const SliceParts& slice_parts, size_t pad_sz) {
PutLengthPrefixedSliceParts(dst, /*total_bytes=*/pad_sz, slice_parts);
dst->append(pad_sz, '\0');
}
inline int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {