2018-07-24 11:42:19 -07:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "rocksdb/slice.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/data_block_hash_index.h"
|
2018-07-24 11:42:19 -07:00
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/hash.h"
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2018-07-24 11:42:19 -07:00
|
|
|
|
|
|
|
void DataBlockHashIndexBuilder::Add(const Slice& key,
|
2018-08-15 14:27:47 -07:00
|
|
|
const size_t restart_index) {
|
|
|
|
assert(Valid());
|
|
|
|
if (restart_index > kMaxRestartSupportedByHashIndex) {
|
|
|
|
valid_ = false;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t hash_value = GetSliceHash(key);
|
|
|
|
hash_and_restart_pairs_.emplace_back(hash_value,
|
|
|
|
static_cast<uint8_t>(restart_index));
|
2018-08-20 23:04:08 -07:00
|
|
|
estimated_num_buckets_ += bucket_per_key_;
|
2018-07-24 11:42:19 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
|
2018-08-15 14:27:47 -07:00
|
|
|
assert(Valid());
|
2018-08-20 23:04:08 -07:00
|
|
|
uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_);
|
|
|
|
|
2018-08-15 14:27:47 -07:00
|
|
|
if (num_buckets == 0) {
|
|
|
|
num_buckets = 1; // sanity check
|
|
|
|
}
|
2018-07-24 11:42:19 -07:00
|
|
|
|
2018-08-15 14:27:47 -07:00
|
|
|
// The build-in hash cannot well distribute strings when into different
|
|
|
|
// buckets when num_buckets is power of two, resulting in high hash
|
|
|
|
// collision.
|
|
|
|
// We made the num_buckets to be odd to avoid this issue.
|
|
|
|
num_buckets |= 1;
|
|
|
|
|
|
|
|
std::vector<uint8_t> buckets(num_buckets, kNoEntry);
|
|
|
|
// write the restart_index array
|
|
|
|
for (auto& entry : hash_and_restart_pairs_) {
|
|
|
|
uint32_t hash_value = entry.first;
|
|
|
|
uint8_t restart_index = entry.second;
|
|
|
|
uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets);
|
|
|
|
if (buckets[buck_idx] == kNoEntry) {
|
|
|
|
buckets[buck_idx] = restart_index;
|
|
|
|
} else if (buckets[buck_idx] != restart_index) {
|
|
|
|
// same bucket cannot store two different restart_index, mark collision
|
|
|
|
buckets[buck_idx] = kCollision;
|
2018-07-24 11:42:19 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-08-15 14:27:47 -07:00
|
|
|
for (uint8_t restart_index : buckets) {
|
2018-08-16 18:29:13 -07:00
|
|
|
buffer.append(
|
|
|
|
const_cast<const char*>(reinterpret_cast<char*>(&restart_index)),
|
|
|
|
sizeof(restart_index));
|
2018-07-24 11:42:19 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// write NUM_BUCK
|
2018-08-15 14:27:47 -07:00
|
|
|
PutFixed16(&buffer, num_buckets);
|
2018-07-24 11:42:19 -07:00
|
|
|
|
2018-08-15 14:27:47 -07:00
|
|
|
assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex);
|
2018-07-24 11:42:19 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void DataBlockHashIndexBuilder::Reset() {
|
2018-08-20 23:04:08 -07:00
|
|
|
estimated_num_buckets_ = 0;
|
2018-08-15 14:27:47 -07:00
|
|
|
valid_ = true;
|
2018-08-20 23:04:08 -07:00
|
|
|
hash_and_restart_pairs_.clear();
|
2018-07-24 11:42:19 -07:00
|
|
|
}
|
|
|
|
|
2018-08-15 14:27:47 -07:00
|
|
|
// Reads the bucket count from the tail of a serialized hash index and
// reports where the bucket table begins.
//
// data:       start of the buffer that ends with the hash index.
// size:       number of bytes in `data`; the final two bytes hold the
//             bucket count, decoded with DecodeFixed16().
// map_offset: out-parameter; receives the offset from `data` at which the
//             one-byte-per-bucket table starts.
void DataBlockHashIndex::Initialize(const char* data, uint16_t size,
                                    uint16_t* map_offset) {
  constexpr size_t kFooterBytes = sizeof(uint16_t);  // NUM_BUCKETS
  assert(size >= kFooterBytes);

  num_buckets_ = DecodeFixed16(data + size - kFooterBytes);
  assert(num_buckets_ > 0);

  const size_t table_bytes = num_buckets_ * sizeof(uint8_t);
  assert(size > table_bytes);

  // The table sits immediately before the footer.
  *map_offset = static_cast<uint16_t>(size - kFooterBytes - table_bytes);
}
|
|
|
|
|
2018-08-15 14:27:47 -07:00
|
|
|
uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset,
|
2018-08-16 18:29:13 -07:00
|
|
|
const Slice& key) const {
|
2018-08-15 14:27:47 -07:00
|
|
|
uint32_t hash_value = GetSliceHash(key);
|
|
|
|
uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_);
|
|
|
|
const char* bucket_table = data + map_offset;
|
|
|
|
return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t)));
|
2018-07-24 11:42:19 -07:00
|
|
|
}
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|