f040e536e4
Summary: PlainTableReader to use a more customized hash table. This patch assumes the SST file is smaller than 2GB: (1) Every bucket uses 32-bit integer (2) no key is stored in bucket (3) use the first bit of the bucket value to distinguish it points to the file offset or a second level index. This index schema fits the use case that most of prefixes have very small number of keys Test Plan: plain_table_db_test Reviewers: haobo, kailiu, dhruba Reviewed By: haobo CC: nkg-, leveldb Differential Revision: https://reviews.facebook.net/D14343
184 lines
5.8 KiB
C++
184 lines
5.8 KiB
C++
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#pragma once
|
|
#include <unordered_map>
|
|
#include <memory>
|
|
#include <stdint.h>
|
|
#include "rocksdb/env.h"
|
|
#include "rocksdb/iterator.h"
|
|
#include "rocksdb/table.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
class Block;
|
|
class BlockHandle;
|
|
class Footer;
|
|
struct Options;
|
|
class RandomAccessFile;
|
|
struct ReadOptions;
|
|
class TableCache;
|
|
class TableReader;
|
|
|
|
using std::unique_ptr;
|
|
using std::unordered_map;
|
|
|
|
// Based on following output file format:
// +-------------+
// | version     |
// +-------------+------------------------------+  <= key1_data_offset
// | key1            | value_size (4 bytes)     |
// +----------------------------------------+   |
// | value1                                     |
// |                                            |
// +----------------------------------------+---+  <= key2_data_offset
// | key2            | value_size (4 bytes)     |
// +----------------------------------------+   |
// | value2                                     |
// |                                            |
// |        ......                              |
// +-----------------+--------------------------+  <= index_block_offset
// | key1            | key1 offset (8 bytes)    |
// +-----------------+--------------------------+  <= key2_index_offset
// | key2            | key2 offset (8 bytes)    |
// +-----------------+--------------------------+  <= key3_index_offset
// | key3            | key3 offset (8 bytes)    |
// +-----------------+--------------------------+  <= key4_index_offset
// |        ......                              |
// +-----------------+------------+-------------+
|
|
// When opening the output file, PlainTableReader creates a hash table that
// maps key prefixes to offsets. For each prefix, the reader decides whether
// the hash table entry points directly to the data offset of the first key
// with that prefix, or to the offset of a second-level index for that prefix.
// If too many keys share a prefix, the reader creates a binary-searchable
// index from the key suffix to the offset on disk.
//
// The implementation of PlainTableReader requires the output file to be
// mmaped.
|
|
class PlainTableReader: public TableReader {
|
|
public:
|
|
static Status Open(const Options& options, const EnvOptions& soptions,
|
|
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
|
|
unique_ptr<TableReader>* table, const int user_key_size,
|
|
const int key_prefix_len, const int bloom_num_bits,
|
|
double hash_table_ratio);
|
|
|
|
bool PrefixMayMatch(const Slice& internal_prefix);
|
|
|
|
Iterator* NewIterator(const ReadOptions&);
|
|
|
|
Status Get(
|
|
const ReadOptions&, const Slice& key, void* arg,
|
|
bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
|
|
void (*mark_key_may_exist)(void*) = nullptr);
|
|
|
|
uint64_t ApproximateOffsetOf(const Slice& key);
|
|
|
|
bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
|
|
|
|
void SetupForCompaction();
|
|
|
|
TableProperties& GetTableProperties() {
|
|
return tbl_props;
|
|
}
|
|
|
|
PlainTableReader(const EnvOptions& storage_options, uint64_t file_size,
|
|
int user_key_size, int key_prefix_len, int bloom_num_bits,
|
|
double hash_table_ratio);
|
|
~PlainTableReader();
|
|
|
|
private:
|
|
uint32_t* hash_table_;
|
|
int hash_table_size_;
|
|
std::string sub_index_;
|
|
|
|
Options options_;
|
|
const EnvOptions& soptions_;
|
|
Status status_;
|
|
unique_ptr<RandomAccessFile> file_;
|
|
|
|
Slice file_data_;
|
|
uint32_t version_;
|
|
uint32_t file_size_;
|
|
uint32_t data_start_offset_;
|
|
uint32_t data_end_offset_;
|
|
const size_t user_key_size_;
|
|
const size_t key_prefix_len_;
|
|
const double hash_table_ratio_;
|
|
const FilterPolicy* filter_policy_;
|
|
std::string filter_str_;
|
|
Slice filter_slice_;
|
|
|
|
TableProperties tbl_props;
|
|
|
|
static const size_t kNumInternalBytes = 8;
|
|
static const uint32_t kSubIndexMask = 0x80000000;
|
|
static const size_t kOffsetLen = sizeof(uint32_t);
|
|
|
|
inline size_t GetInternalKeyLength() {
|
|
return user_key_size_ + kNumInternalBytes;
|
|
}
|
|
|
|
friend class TableCache;
|
|
friend class PlainTableIterator;
|
|
|
|
// Populate the internal indexes. It must be called before
|
|
// any query to the table.
|
|
// This query will populate the hash table hash_table_, the second
|
|
// level of indexes sub_index_ and bloom filter filter_slice_ if enabled.
|
|
Status PopulateIndex(uint64_t file_size);
|
|
|
|
// Check bloom filter to see whether it might contain this prefix
|
|
bool MayHavePrefix(const Slice& target_prefix);
|
|
|
|
// Read the key and value at offset to key and value.
|
|
// tmp_slice is a tmp slice.
|
|
// return next_offset as the offset for the next key.
|
|
Status Next(uint32_t offset, Slice* key, Slice* value, uint32_t& next_offset);
|
|
// Get file offset for key target.
|
|
// return value prefix_matched is set to true if the offset is confirmed
|
|
// for a key with the same prefix as target.
|
|
uint32_t GetOffset(const Slice& target, bool& prefix_matched);
|
|
|
|
// No copying allowed
|
|
explicit PlainTableReader(const TableReader&) = delete;
|
|
void operator=(const TableReader&) = delete;
|
|
};
|
|
|
|
// Iterator to iterate IndexedTable
|
|
class PlainTableIterator: public Iterator {
|
|
public:
|
|
explicit PlainTableIterator(PlainTableReader* table);
|
|
~PlainTableIterator();
|
|
|
|
bool Valid() const;
|
|
|
|
void SeekToFirst();
|
|
|
|
void SeekToLast();
|
|
|
|
void Seek(const Slice& target);
|
|
|
|
void Next();
|
|
|
|
void Prev();
|
|
|
|
Slice key() const;
|
|
|
|
Slice value() const;
|
|
|
|
Status status() const;
|
|
|
|
private:
|
|
PlainTableReader* table_;
|
|
uint32_t offset_;
|
|
uint32_t next_offset_;
|
|
Slice key_;
|
|
Slice value_;
|
|
Status status_;
|
|
// No copying allowed
|
|
PlainTableIterator(const PlainTableIterator&) = delete;
|
|
void operator=(const Iterator&) = delete;
|
|
};
|
|
|
|
} // namespace rocksdb
|