rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
Andres Noetzli 014fd55adc Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).

In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.

Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.

Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
  deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
  this patch supported this so it could be resurrected if needed)

Test Plan: make all check

Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor

Reviewed By: igor

Subscribers: maykov, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00

99 lines
3.8 KiB
C++

// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <limits>
#include <string>
#include <unordered_map>
#include "rocksdb/comparator.h"
#include "rocksdb/iterator.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/write_batch_with_index.h"
namespace rocksdb {
class MergeContext;
struct Options;
// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
struct WriteBatchIndexEntry {
WriteBatchIndexEntry(size_t o, uint32_t c)
: offset(o), column_family(c), search_key(nullptr) {}
WriteBatchIndexEntry(const Slice* sk, uint32_t c)
: offset(0), column_family(c), search_key(sk) {}
// If this flag appears in the offset, it indicates a key that is smaller
// than any other entry for the same column family
static const size_t kFlagMin = std::numeric_limits<size_t>::max();
size_t offset; // offset of an entry in write batch's string buffer.
uint32_t column_family; // column family of the entry
const Slice* search_key; // if not null, instead of reading keys from
// write batch, use it to compare. This is used
// for lookup key.
};
class ReadableWriteBatch : public WriteBatch {
public:
explicit ReadableWriteBatch(size_t reserved_bytes = 0)
: WriteBatch(reserved_bytes) {}
// Retrieve some information from a write entry in the write batch, given
// the start offset of the write entry.
Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
Slice* value, Slice* blob) const;
};
class WriteBatchEntryComparator {
public:
WriteBatchEntryComparator(const Comparator* _default_comparator,
const ReadableWriteBatch* write_batch)
: default_comparator_(_default_comparator), write_batch_(write_batch) {}
// Compare a and b. Return a negative value if a is less than b, 0 if they
// are equal, and a positive value if a is greater than b
int operator()(const WriteBatchIndexEntry* entry1,
const WriteBatchIndexEntry* entry2) const;
int CompareKey(uint32_t column_family, const Slice& key1,
const Slice& key2) const;
void SetComparatorForCF(uint32_t column_family_id,
const Comparator* comparator) {
cf_comparator_map_[column_family_id] = comparator;
}
const Comparator* default_comparator() { return default_comparator_; }
private:
const Comparator* default_comparator_;
std::unordered_map<uint32_t, const Comparator*> cf_comparator_map_;
const ReadableWriteBatch* write_batch_;
};
class WriteBatchWithIndexInternal {
public:
enum Result { kFound, kDeleted, kNotFound, kMergeInProgress, kError };
// If batch contains a value for key, store it in *value and return kFound.
// If batch contains a deletion for key, return Deleted.
// If batch contains Merge operations as the most recent entry for a key,
// and the merge process does not stop (not reaching a value or delete),
// prepend the current merge operands to *operands,
// and return kMergeInProgress
// If batch does not contain this key, return kNotFound
// Else, return kError on error with error Status stored in *s.
static WriteBatchWithIndexInternal::Result GetFromBatch(
const DBOptions& options, WriteBatchWithIndex* batch,
ColumnFamilyHandle* column_family, const Slice& key,
MergeContext* merge_context, WriteBatchEntryComparator* cmp,
std::string* value, Status* s);
};
} // namespace rocksdb
#endif // !ROCKSDB_LITE