3122cb4358
Summary: ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`. Namely, `WriteOptions` should not include information about "what-to-write", but should just include information about "how-to-write". According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore, this PR removes `WriteOptions::timestamp` for compliance. After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and `SingleDelete(write_opts, key, ts)`. I planned to add `Write(write_opts, batch, ts)`, but its complexity made me reconsider and possibly do it in another PR (maybe). To improve checking and return errors early, we also add a new set of APIs to `WriteBatch` that take extra `timestamp` information when writing to `WriteBatch`es. This set of APIs is currently not supported in `WriteBatchWithIndex`, and adding it is on our TODO list. Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to `WriteBatch::UpdateTimestamps()` since this method requires that all keys have space for timestamps allocated already and multiple timestamps can be updated. The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp size of the default column family. This will be used to allocate space when calling APIs that do not specify a column family handle. Also, updated the `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, and `DB::NewIterators()` methods, replacing some assertions about timestamps with returned Status codes. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946 Test Plan: make check ./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8 ./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0 Make sure there is no perf regression by running the following ``` ./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom ``` Before this PR ``` DB path: [/dev/shm/rocksdb] fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s ``` After this PR ``` DB path: [/dev/shm/rocksdb] fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s ``` Reviewed By: ltamasi Differential Revision: D33721359 Pulled By: riversand963 fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
139 lines
6.0 KiB
C++
139 lines
6.0 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#pragma once
|
|
|
|
#include <cstddef>
|
|
|
|
#include "rocksdb/rocksdb_namespace.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {

// Forward declarations. This header only names these types in function
// signatures (by reference, pointer, or as a return type of a declaration),
// so their full definitions are not needed here.
class Slice;
class Status;
class ColumnFamilyHandle;
class WriteBatch;
struct SliceParts;

// Abstract base class that defines the basic interface for a write batch.
// See WriteBatch for a basic implementation and WriteBatchWithIndex for an
// indexed implementation.
//
// Overload conventions used by the mutating operations below:
//  * Overloads that take a ColumnFamilyHandle* name the column family
//    explicitly; overloads without one do not (presumably they target the
//    default column family -- confirm against the implementation).
//  * Overloads that take an extra `ts` Slice carry a timestamp for the
//    operation. NOTE: the position of `ts` is not uniform. Put() and Merge()
//    take it BEFORE `value`; Delete() and SingleDelete() take it after `key`;
//    DeleteRange() takes it after `end_key`. Because every argument is a
//    Slice, swapping `ts` with a neighbouring argument still compiles, so
//    double-check the order at call sites.
//  * Overloads that take SliceParts are declared without `= 0`, so derived
//    classes are not required to override them. Their definitions live
//    outside this header.
class WriteBatchBase {
 public:
  virtual ~WriteBatchBase() = default;

  // Store the mapping "key->value" in the database.
  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
                     const Slice& value) = 0;
  virtual Status Put(const Slice& key, const Slice& value) = 0;
  // Timestamp-carrying variant. Argument order is (key, ts, value).
  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
                     const Slice& ts, const Slice& value) = 0;

  // Variant of Put() that gathers output like writev(2).  The key and value
  // that will be written to the database are concatenations of arrays of
  // slices.
  virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
                     const SliceParts& value);
  virtual Status Put(const SliceParts& key, const SliceParts& value);

  // Merge "value" with the existing value of "key" in the database.
  // "key->merge(existing, value)"
  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
                       const Slice& value) = 0;
  virtual Status Merge(const Slice& key, const Slice& value) = 0;
  // Timestamp-carrying variant. Argument order is (key, ts, value).
  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
                       const Slice& ts, const Slice& value) = 0;

  // Variant that takes SliceParts.
  virtual Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
                       const SliceParts& value);
  virtual Status Merge(const SliceParts& key, const SliceParts& value);

  // If the database contains a mapping for "key", erase it.  Else do nothing.
  virtual Status Delete(ColumnFamilyHandle* column_family,
                        const Slice& key) = 0;
  virtual Status Delete(const Slice& key) = 0;
  // Timestamp-carrying variant. Argument order is (key, ts).
  virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
                        const Slice& ts) = 0;

  // Variant that takes SliceParts.
  virtual Status Delete(ColumnFamilyHandle* column_family,
                        const SliceParts& key);
  virtual Status Delete(const SliceParts& key);

  // If the database contains a mapping for "key", erase it. Expects that the
  // key was not overwritten. Else do nothing.
  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
                              const Slice& key) = 0;
  virtual Status SingleDelete(const Slice& key) = 0;
  // Timestamp-carrying variant. Argument order is (key, ts).
  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
                              const Slice& key, const Slice& ts) = 0;

  // Variant that takes SliceParts.
  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
                              const SliceParts& key);
  virtual Status SingleDelete(const SliceParts& key);

  // If the database contains mappings in the range ["begin_key", "end_key"),
  // erase them. Else do nothing.
  virtual Status DeleteRange(ColumnFamilyHandle* column_family,
                             const Slice& begin_key, const Slice& end_key) = 0;
  virtual Status DeleteRange(const Slice& begin_key, const Slice& end_key) = 0;
  // Timestamp-carrying variant. Argument order is (begin_key, end_key, ts).
  virtual Status DeleteRange(ColumnFamilyHandle* column_family,
                             const Slice& begin_key, const Slice& end_key,
                             const Slice& ts) = 0;

  // Variant that takes SliceParts.
  virtual Status DeleteRange(ColumnFamilyHandle* column_family,
                             const SliceParts& begin_key,
                             const SliceParts& end_key);
  virtual Status DeleteRange(const SliceParts& begin_key,
                             const SliceParts& end_key);

  // Append a blob of arbitrary size to the records in this batch. The blob will
  // be stored in the transaction log but not in any other file. In particular,
  // it will not be persisted to the SST files. When iterating over this
  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
  // encountered in the same order in which they were inserted. The blob will
  // NOT consume sequence number(s) and will NOT increase the count of the
  // batch.
  //
  // Example application: add timestamps to the transaction log for use in
  // replication.
  virtual Status PutLogData(const Slice& blob) = 0;

  // Clear all updates buffered in this batch.
  virtual void Clear() = 0;

  // Convert this batch into a WriteBatch.  This is an abstracted way of
  // converting any WriteBatchBase (e.g. WriteBatchWithIndex) into a basic
  // WriteBatch.
  virtual WriteBatch* GetWriteBatch() = 0;

  // Records the state of the batch for future calls to RollbackToSavePoint().
  // May be called multiple times to set multiple save points.
  virtual void SetSavePoint() = 0;

  // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the
  // most recent call to SetSavePoint() and removes the most recent save point.
  // If there is no previous call to SetSavePoint(), behaves the same as
  // Clear().
  virtual Status RollbackToSavePoint() = 0;

  // Pop the most recent save point.
  // If there is no previous call to SetSavePoint(), Status::NotFound()
  // will be returned.
  // Otherwise returns Status::OK().
  virtual Status PopSavePoint() = 0;

  // Sets the maximum size of the write batch in bytes. 0 means no limit.
  virtual void SetMaxBytes(size_t max_bytes) = 0;
};

}  // namespace ROCKSDB_NAMESPACE
|