138b87eae4
Summary: Fixes the following scenario: 1. Set prefix extractor. Enable bloom filters, with `whole_key_filtering = false`. Use compaction filter that sometimes returns `kRemoveAndSkipUntil`. 2. Do a compaction. 3. Compaction creates an iterator with `total_order_seek = false`, calls `SeekToFirst()` on it, then repeatedly calls `Next()`. 4. At some point compaction filter returns `kRemoveAndSkipUntil`. 5. Compaction calls `Seek(skip_until)` on the iterator. The key that it seeks to happens to have prefix that doesn't match the bloom filter. Since `total_order_seek = false`, iterator becomes invalid, and compaction thinks that it has reached the end. The rest of the compaction input is silently discarded. The fix is to make compaction iterator use `total_order_seek = true`. The implementation for PlainTable is quite awkward. I've made `kRemoveAndSkipUntil` officially incompatible with PlainTable. If you try to use them together, compaction will fail, and DB will enter read-only mode (`bg_error_`). That's not a very graceful way to communicate a misconfiguration, but the alternatives don't seem worth the implementation time and complexity. To be able to check in advance that `kRemoveAndSkipUntil` is not going to be used with PlainTable, we'd need to extend the interface of either `CompactionFilter` or `InternalIterator`. It seems unlikely that anyone will ever want to use `kRemoveAndSkipUntil` with PlainTable: PlainTable probably has very few users, and `kRemoveAndSkipUntil` has only one user so far: us (logdevice). Closes https://github.com/facebook/rocksdb/pull/2349 Differential Revision: D5110388 Pulled By: lightmark fbshipit-source-id: ec29101a99d9dcd97db33923b87f72bce56cc17a
85 lines
2.8 KiB
C++
85 lines
2.8 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under the BSD-style license found in the
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
#pragma once
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include <cstring>
#include <string>

#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "util/murmurhash.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
// Factor multiplied by the hash-function index in CuckooHash() to obtain the
// MurmurHash seed for that function.
constexpr uint32_t kCuckooMurmurSeedMultiplier = 816922183u;
|
|
static inline uint64_t CuckooHash(
|
|
const Slice& user_key, uint32_t hash_cnt, bool use_module_hash,
|
|
uint64_t table_size_, bool identity_as_first_hash,
|
|
uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) {
|
|
#if !defined NDEBUG || defined OS_WIN
|
|
// This part is used only in unit tests but we have to keep it for Windows
|
|
// build as we run test in both debug and release modes under Windows.
|
|
if (get_slice_hash != nullptr) {
|
|
return get_slice_hash(user_key, hash_cnt, table_size_);
|
|
}
|
|
#endif
|
|
|
|
uint64_t value = 0;
|
|
if (hash_cnt == 0 && identity_as_first_hash) {
|
|
value = (*reinterpret_cast<const int64_t*>(user_key.data()));
|
|
} else {
|
|
value = MurmurHash(user_key.data(), static_cast<int>(user_key.size()),
|
|
kCuckooMurmurSeedMultiplier * hash_cnt);
|
|
}
|
|
if (use_module_hash) {
|
|
return value % table_size_;
|
|
} else {
|
|
return value & (table_size_ - 1);
|
|
}
|
|
}
|
|
|
|
// Cuckoo Table is designed for applications that require fast point lookups
|
|
// but not fast range scans.
|
|
//
|
|
// Some assumptions:
|
|
// - Key length and Value length are fixed.
|
|
// - Does not support Snapshot.
|
|
// - Does not support Merge operations.
|
|
// - Does not support prefix bloom filters.
|
|
class CuckooTableFactory : public TableFactory {
|
|
public:
|
|
explicit CuckooTableFactory(const CuckooTableOptions& table_options)
|
|
: table_options_(table_options) {}
|
|
~CuckooTableFactory() {}
|
|
|
|
const char* Name() const override { return "CuckooTable"; }
|
|
|
|
Status NewTableReader(
|
|
const TableReaderOptions& table_reader_options,
|
|
unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
|
|
unique_ptr<TableReader>* table,
|
|
bool prefetch_index_and_filter_in_cache = true) const override;
|
|
|
|
TableBuilder* NewTableBuilder(
|
|
const TableBuilderOptions& table_builder_options,
|
|
uint32_t column_family_id, WritableFileWriter* file) const override;
|
|
|
|
// Sanitizes the specified DB Options.
|
|
Status SanitizeOptions(const DBOptions& db_opts,
|
|
const ColumnFamilyOptions& cf_opts) const override {
|
|
return Status::OK();
|
|
}
|
|
|
|
std::string GetPrintableTableOptions() const override;
|
|
|
|
void* GetOptions() override { return &table_options_; }
|
|
|
|
private:
|
|
CuckooTableOptions table_options_;
|
|
};
|
|
|
|
} // namespace rocksdb
|
|
#endif // ROCKSDB_LITE
|