2013-10-29 04:34:02 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#pragma once
|
2014-04-25 21:21:34 +02:00
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
2013-10-29 04:34:02 +01:00
|
|
|
#include <unordered_map>
|
|
|
|
#include <memory>
|
2014-01-25 06:10:19 +01:00
|
|
|
#include <vector>
|
2014-01-27 22:53:22 +01:00
|
|
|
#include <string>
|
2013-10-29 04:34:02 +01:00
|
|
|
#include <stdint.h>
|
2014-01-28 06:58:46 +01:00
|
|
|
|
2014-01-27 22:53:22 +01:00
|
|
|
#include "db/dbformat.h"
|
2013-10-29 04:34:02 +01:00
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/iterator.h"
|
2014-01-28 19:35:48 +01:00
|
|
|
#include "rocksdb/slice_transform.h"
|
2014-01-28 06:58:46 +01:00
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "rocksdb/table_properties.h"
|
|
|
|
#include "table/table_reader.h"
|
|
|
|
#include "table/plain_table_factory.h"
|
2014-05-04 22:55:53 +02:00
|
|
|
#include "util/arena.h"
|
2014-06-09 21:30:19 +02:00
|
|
|
#include "util/dynamic_bloom.h"
|
2013-10-29 04:34:02 +01:00
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
// Forward declarations of types that this header only names in declarations.
class Block;
class BlockHandle;
class Footer;
class InternalKeyComparator;
class PlainTableKeyDecoder;
class RandomAccessFile;
class TableCache;
class TableReader;
struct Options;
struct ReadOptions;

// Standard-library names used unqualified in the declarations below.
using std::unique_ptr;
using std::unordered_map;

// Value that PlainTableReader::IsFixedLength() compares the user key length
// against; a user key length equal to it means keys are not fixed-length.
extern const uint32_t kPlainTableVariableLength;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2013-12-20 18:35:24 +01:00
|
|
|
// Based on following output file format shown in plain_table_factory.h
|
2013-10-29 04:34:02 +01:00
|
|
|
// When opening the output file, IndexedTableReader creates a hash table
|
|
|
|
// from key prefixes to offset of the output file. IndexedTable will decide
|
|
|
|
// whether it points to the data offset of the first key with the key prefix
|
|
|
|
// or the offset of it. If there are too many keys share this prefix, it will
|
|
|
|
// create a binary search-able index from the suffix to offset on disk.
|
|
|
|
//
|
|
|
|
// The implementation of IndexedTableReader requires output file is mmaped
|
|
|
|
class PlainTableReader: public TableReader {
|
2014-01-25 06:10:19 +01:00
|
|
|
public:
|
2013-10-29 04:34:02 +01:00
|
|
|
static Status Open(const Options& options, const EnvOptions& soptions,
|
2014-01-27 22:53:22 +01:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
2014-01-28 19:35:48 +01:00
|
|
|
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
|
2014-01-28 06:58:46 +01:00
|
|
|
unique_ptr<TableReader>* table,
|
2014-02-08 01:25:38 +01:00
|
|
|
const int bloom_bits_per_key, double hash_table_ratio,
|
2014-06-19 01:36:48 +02:00
|
|
|
size_t index_sparseness, size_t huge_page_tlb_size,
|
|
|
|
bool full_scan_mode);
|
2013-10-29 04:34:02 +01:00
|
|
|
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2014-06-12 19:06:18 +02:00
|
|
|
void Prepare(const Slice& target);
|
|
|
|
|
2014-01-27 22:53:22 +01:00
|
|
|
Status Get(const ReadOptions&, const Slice& key, void* arg,
|
|
|
|
bool (*result_handler)(void* arg, const ParsedInternalKey& k,
|
2014-06-20 10:23:02 +02:00
|
|
|
const Slice& v),
|
2014-01-27 22:53:22 +01:00
|
|
|
void (*mark_key_may_exist)(void*) = nullptr);
|
2013-10-29 04:34:02 +01:00
|
|
|
|
|
|
|
uint64_t ApproximateOffsetOf(const Slice& key);
|
|
|
|
|
|
|
|
void SetupForCompaction();
|
|
|
|
|
2014-02-08 04:26:49 +01:00
|
|
|
std::shared_ptr<const TableProperties> GetTableProperties() const {
|
|
|
|
return table_properties_;
|
|
|
|
}
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2014-02-14 00:27:59 +01:00
|
|
|
PlainTableReader(const Options& options, unique_ptr<RandomAccessFile>&& file,
|
|
|
|
const EnvOptions& storage_options,
|
2014-01-27 22:53:22 +01:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
2014-06-19 01:36:48 +02:00
|
|
|
EncodingType encoding_type, uint64_t file_size,
|
|
|
|
const TableProperties* table_properties);
|
2014-02-08 01:25:38 +01:00
|
|
|
virtual ~PlainTableReader();
|
|
|
|
|
|
|
|
protected:
|
|
|
|
// Check bloom filter to see whether it might contain this prefix.
|
|
|
|
// The hash of the prefix is given, since it can be reused for index lookup
|
|
|
|
// too.
|
|
|
|
virtual bool MatchBloom(uint32_t hash) const;
|
|
|
|
|
|
|
|
// PopulateIndex() builds index of keys. It must be called before any query
|
|
|
|
// to the table.
|
|
|
|
//
|
2014-04-23 03:31:55 +02:00
|
|
|
// props: the table properties object that need to be stored. Ownership of
|
|
|
|
// the object will be passed.
|
|
|
|
//
|
2014-02-14 00:27:59 +01:00
|
|
|
// index_ contains buckets size of index_size_, each is a
|
|
|
|
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
|
|
|
|
// and the first bit of the integer indicates type of the offset.
|
2014-02-08 01:25:38 +01:00
|
|
|
//
|
|
|
|
// +--------------+------------------------------------------------------+
|
|
|
|
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
|
|
|
|
// +--------------+------------------------------------------------------+
|
|
|
|
//
|
|
|
|
// Explanation for the "flag bit":
|
|
|
|
//
|
|
|
|
// 0 indicates that the bucket contains only one prefix (no conflict when
|
|
|
|
// hashing this prefix), whose first row starts from this offset of the
|
|
|
|
// file.
|
|
|
|
// 1 indicates that the bucket contains more than one prefixes, or there
|
|
|
|
// are too many rows for one prefix so we need a binary search for it. In
|
|
|
|
// this case, the offset indicates the offset of sub_index_ holding the
|
|
|
|
// binary search indexes of keys for those rows. Those binary search indexes
|
|
|
|
// are organized in this way:
|
|
|
|
//
|
|
|
|
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
|
|
|
|
// it, there are N 32-bit integers, each points of an offset of the file,
|
|
|
|
// which
|
|
|
|
// points to starting of a row. Those offsets need to be guaranteed to be in
|
|
|
|
// ascending order so the keys they are pointing to are also in ascending
|
|
|
|
// order
|
|
|
|
// to make sure we can use them to do binary searches. Below is visual
|
|
|
|
// presentation of a bucket.
|
|
|
|
//
|
|
|
|
// <begin>
|
|
|
|
// number_of_records: varint32
|
|
|
|
// record 1 file offset: fixedint32
|
|
|
|
// record 2 file offset: fixedint32
|
|
|
|
// ....
|
|
|
|
// record N file offset: fixedint32
|
|
|
|
// <end>
|
2014-06-09 21:30:19 +02:00
|
|
|
Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
|
|
|
|
double hash_table_ratio, size_t index_sparseness,
|
|
|
|
size_t huge_page_tlb_size);
|
2014-06-19 01:36:48 +02:00
|
|
|
Status MmapDataFile();
|
2014-02-08 01:25:38 +01:00
|
|
|
|
2014-01-25 06:10:19 +01:00
|
|
|
private:
|
2013-12-20 18:35:24 +01:00
|
|
|
struct IndexRecord;
|
|
|
|
class IndexRecordList;
|
|
|
|
|
2014-02-14 00:27:59 +01:00
|
|
|
// Plain table maintains an index and a sub index.
|
|
|
|
// index is implemented by a hash table.
|
|
|
|
// subindex is a big of memory array.
|
|
|
|
// For more details about the in-memory index, please refer to:
|
|
|
|
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
|
|
|
// #wiki-in-memory-index-format
|
2014-05-04 22:55:53 +02:00
|
|
|
uint32_t* index_;
|
2014-02-14 00:27:59 +01:00
|
|
|
int index_size_ = 0;
|
2014-05-04 22:55:53 +02:00
|
|
|
char* sub_index_;
|
2014-01-27 22:53:22 +01:00
|
|
|
const InternalKeyComparator internal_comparator_;
|
2014-06-19 01:36:48 +02:00
|
|
|
EncodingType encoding_type_;
|
2014-02-14 00:27:59 +01:00
|
|
|
// represents plain table's current status.
|
2013-10-29 04:34:02 +01:00
|
|
|
Status status_;
|
2013-11-21 20:11:02 +01:00
|
|
|
Slice file_data_;
|
2013-12-20 18:35:24 +01:00
|
|
|
|
2014-02-14 00:27:59 +01:00
|
|
|
// data_start_offset_ and data_end_offset_ defines the range of the
|
|
|
|
// sst file that stores data.
|
2014-01-25 06:10:19 +01:00
|
|
|
const uint32_t data_start_offset_ = 0;
|
2013-12-20 18:35:24 +01:00
|
|
|
const uint32_t data_end_offset_;
|
|
|
|
const size_t user_key_len_;
|
2014-06-09 21:30:19 +02:00
|
|
|
const SliceTransform* prefix_extractor_;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
|
|
|
static const size_t kNumInternalBytes = 8;
|
2013-11-21 20:11:02 +01:00
|
|
|
static const uint32_t kSubIndexMask = 0x80000000;
|
|
|
|
static const size_t kOffsetLen = sizeof(uint32_t);
|
2014-01-25 06:10:19 +01:00
|
|
|
static const uint64_t kMaxFileSize = 1u << 31;
|
|
|
|
static const size_t kRecordsPerGroup = 256;
|
2014-06-19 01:36:48 +02:00
|
|
|
static const int kFullScanModeFlag = -1;
|
2014-01-25 06:10:19 +01:00
|
|
|
|
2014-06-09 21:30:19 +02:00
|
|
|
// Bloom filter is used to rule out non-existent key
|
|
|
|
bool enable_bloom_;
|
|
|
|
DynamicBloom bloom_;
|
|
|
|
Arena arena_;
|
|
|
|
|
|
|
|
const Options& options_;
|
|
|
|
unique_ptr<RandomAccessFile> file_;
|
|
|
|
uint32_t file_size_;
|
|
|
|
std::shared_ptr<const TableProperties> table_properties_;
|
|
|
|
|
2014-01-25 06:10:19 +01:00
|
|
|
bool IsFixedLength() const {
|
2014-01-28 06:58:46 +01:00
|
|
|
return user_key_len_ != kPlainTableVariableLength;
|
2013-12-20 18:35:24 +01:00
|
|
|
}
|
|
|
|
|
2014-01-25 06:10:19 +01:00
|
|
|
size_t GetFixedInternalKeyLength() const {
|
2013-12-20 18:35:24 +01:00
|
|
|
return user_key_len_ + kNumInternalBytes;
|
2013-10-29 04:34:02 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
friend class TableCache;
|
|
|
|
friend class PlainTableIterator;
|
|
|
|
|
2013-12-20 18:35:24 +01:00
|
|
|
// Internal helper function to generate an IndexRecordList object from all
|
|
|
|
// the rows, which contains index records as a list.
|
2014-02-08 01:25:38 +01:00
|
|
|
// If bloom_ is not null, all the keys' full-key hash will be added to the
|
|
|
|
// bloom filter.
|
|
|
|
Status PopulateIndexRecordList(IndexRecordList* record_list,
|
2014-06-09 21:30:19 +02:00
|
|
|
int* num_prefixes, int bloom_bits_per_key,
|
|
|
|
size_t index_sparseness);
|
2013-12-20 18:35:24 +01:00
|
|
|
|
|
|
|
// Internal helper function to allocate memory for indexes and bloom filters
|
2014-06-09 21:30:19 +02:00
|
|
|
void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key,
|
|
|
|
double hash_table_ratio,
|
|
|
|
size_t huge_page_tlb_size);
|
2013-12-20 18:35:24 +01:00
|
|
|
|
|
|
|
// Internal helper function to bucket index record list to hash buckets.
|
2014-02-14 00:27:59 +01:00
|
|
|
// bucket_header is a vector of size hash_table_size_, with each entry
|
|
|
|
// containing a linklist of IndexRecord hashed to the same bucket, in reverse
|
|
|
|
// order.
|
2013-12-20 18:35:24 +01:00
|
|
|
// of offsets for the hash, in reversed order.
|
2014-03-07 02:30:46 +01:00
|
|
|
// entries_per_bucket is sized of index_size_. The value is how many index
|
2014-02-14 00:27:59 +01:00
|
|
|
// records are there in bucket_headers for the same bucket.
|
2014-03-07 02:30:46 +01:00
|
|
|
size_t BucketizeIndexesAndFillBloom(
|
|
|
|
IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
|
|
|
|
std::vector<uint32_t>* entries_per_bucket);
|
2013-12-20 18:35:24 +01:00
|
|
|
|
|
|
|
// Internal helper class to fill the indexes and bloom filters to internal
|
2014-03-07 02:30:46 +01:00
|
|
|
// data structures. bucket_headers and entries_per_bucket are bucketized
|
|
|
|
// indexes and counts generated by BucketizeIndexesAndFillBloom().
|
|
|
|
void FillIndexes(const size_t kSubIndexSize,
|
2014-02-14 00:27:59 +01:00
|
|
|
const std::vector<IndexRecord*>& bucket_headers,
|
2014-06-09 21:30:19 +02:00
|
|
|
const std::vector<uint32_t>& entries_per_bucket,
|
|
|
|
size_t huge_page_tlb_size);
|
2014-01-25 06:10:19 +01:00
|
|
|
|
2014-06-19 01:36:48 +02:00
|
|
|
// Read the key and value at `offset` to parameters for keys, the and
|
|
|
|
// `seekable`.
|
2014-02-14 00:27:59 +01:00
|
|
|
// On success, `offset` will be updated as the offset for the next key.
|
2014-06-19 01:36:48 +02:00
|
|
|
// `parsed_key` will be key in parsed format.
|
|
|
|
// if `internal_key` is not empty, it will be filled with key with slice
|
|
|
|
// format.
|
|
|
|
// if `seekable` is not null, it will return whether we can directly read
|
|
|
|
// data using this offset.
|
|
|
|
Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
|
|
|
|
ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
|
|
|
|
bool* seekable = nullptr) const;
|
2013-11-21 20:11:02 +01:00
|
|
|
// Get file offset for key target.
|
|
|
|
// return value prefix_matched is set to true if the offset is confirmed
|
|
|
|
// for a key with the same prefix as target.
|
2013-12-20 18:35:24 +01:00
|
|
|
Status GetOffset(const Slice& target, const Slice& prefix,
|
|
|
|
uint32_t prefix_hash, bool& prefix_matched,
|
2014-02-14 00:27:59 +01:00
|
|
|
uint32_t* offset) const;
|
2014-02-08 01:25:38 +01:00
|
|
|
|
|
|
|
Slice GetUserKey(const Slice& key) const {
|
|
|
|
return Slice(key.data(), key.size() - 8);
|
|
|
|
}
|
2013-12-20 18:35:24 +01:00
|
|
|
|
2014-02-08 01:25:38 +01:00
|
|
|
Slice GetPrefix(const Slice& target) const {
|
2014-01-28 19:35:48 +01:00
|
|
|
assert(target.size() >= 8); // target is internal key
|
2014-02-08 01:25:38 +01:00
|
|
|
return GetPrefixFromUserKey(GetUserKey(target));
|
2013-12-20 18:35:24 +01:00
|
|
|
}
|
2013-11-21 20:11:02 +01:00
|
|
|
|
2014-02-08 01:25:38 +01:00
|
|
|
inline Slice GetPrefix(const ParsedInternalKey& target) const;
|
|
|
|
|
|
|
|
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
|
|
|
if (!IsTotalOrderMode()) {
|
2014-06-09 21:30:19 +02:00
|
|
|
return prefix_extractor_->Transform(user_key);
|
2014-02-08 01:25:38 +01:00
|
|
|
} else {
|
|
|
|
// Use empty slice as prefix if prefix_extractor is not set. In that case,
|
|
|
|
// it falls back to pure binary search and total iterator seek is
|
|
|
|
// supported.
|
|
|
|
return Slice();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-06-09 21:30:19 +02:00
|
|
|
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
|
2014-01-27 22:53:22 +01:00
|
|
|
|
2013-10-29 04:34:02 +01:00
|
|
|
// No copying allowed
|
|
|
|
explicit PlainTableReader(const TableReader&) = delete;
|
|
|
|
void operator=(const TableReader&) = delete;
|
|
|
|
};
|
|
|
|
} // namespace rocksdb
|
2014-04-15 22:39:26 +02:00
|
|
|
#endif // ROCKSDB_LITE
|