2019-04-18 19:51:19 +02:00
|
|
|
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
2013-10-29 04:34:02 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#pragma once
|
2014-04-25 21:21:34 +02:00
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
2013-10-29 04:34:02 +01:00
|
|
|
#include <unordered_map>
|
|
|
|
#include <memory>
|
2014-01-25 06:10:19 +01:00
|
|
|
#include <vector>
|
2014-01-27 22:53:22 +01:00
|
|
|
#include <string>
|
2013-10-29 04:34:02 +01:00
|
|
|
#include <stdint.h>
|
2014-01-28 06:58:46 +01:00
|
|
|
|
2014-01-27 22:53:22 +01:00
|
|
|
#include "db/dbformat.h"
|
2019-09-16 19:31:27 +02:00
|
|
|
#include "file/random_access_file_reader.h"
|
2019-05-31 02:39:43 +02:00
|
|
|
#include "memory/arena.h"
|
2013-10-29 04:34:02 +01:00
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/iterator.h"
|
2014-01-28 19:35:48 +01:00
|
|
|
#include "rocksdb/slice_transform.h"
|
2014-01-28 06:58:46 +01:00
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "rocksdb/table_properties.h"
|
2019-09-05 19:03:42 +02:00
|
|
|
#include "table/plain/plain_table_bloom.h"
|
2019-05-30 23:47:29 +02:00
|
|
|
#include "table/plain/plain_table_factory.h"
|
|
|
|
#include "table/plain/plain_table_index.h"
|
2019-05-31 02:39:43 +02:00
|
|
|
#include "table/table_reader.h"
|
2013-10-29 04:34:02 +01:00
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
class Block;
|
2014-07-20 16:56:40 +02:00
|
|
|
struct BlockContents;
|
2013-10-29 04:34:02 +01:00
|
|
|
class BlockHandle;
|
|
|
|
class Footer;
|
|
|
|
struct Options;
|
|
|
|
class RandomAccessFile;
|
|
|
|
struct ReadOptions;
|
|
|
|
class TableCache;
|
|
|
|
class TableReader;
|
2014-01-27 22:53:22 +01:00
|
|
|
class InternalKeyComparator;
|
2014-06-19 01:36:48 +02:00
|
|
|
class PlainTableKeyDecoder;
|
2014-09-29 20:09:09 +02:00
|
|
|
class GetContext;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2014-01-28 06:58:46 +01:00
|
|
|
extern const uint32_t kPlainTableVariableLength;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2015-09-17 01:57:43 +02:00
|
|
|
struct PlainTableReaderFileInfo {
|
|
|
|
bool is_mmap_mode;
|
|
|
|
Slice file_data;
|
|
|
|
uint32_t data_end_offset;
|
2018-11-09 20:17:34 +01:00
|
|
|
std::unique_ptr<RandomAccessFileReader> file;
|
2015-09-17 01:57:43 +02:00
|
|
|
|
2019-03-27 18:18:56 +01:00
|
|
|
PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
|
2015-09-17 01:57:43 +02:00
|
|
|
const EnvOptions& storage_options,
|
|
|
|
uint32_t _data_size_offset)
|
|
|
|
: is_mmap_mode(storage_options.use_mmap_reads),
|
|
|
|
data_end_offset(_data_size_offset),
|
|
|
|
file(std::move(_file)) {}
|
|
|
|
};
|
|
|
|
|
2019-05-24 01:22:13 +02:00
|
|
|
// The reader class of PlainTable. For description of PlainTable format
|
|
|
|
// See comments of class PlainTableFactory, where instances of
|
|
|
|
// PlainTableReader are created.
|
|
|
|
class PlainTableReader: public TableReader {
|
|
|
|
public:
|
2013-12-20 18:35:24 +01:00
|
|
|
// Based on following output file format shown in plain_table_factory.h
|
2019-05-24 01:22:13 +02:00
|
|
|
// When opening the output file, PlainTableReader creates a hash table
|
|
|
|
// from key prefixes to offset of the output file. PlainTable will decide
|
2013-10-29 04:34:02 +01:00
|
|
|
// whether it points to the data offset of the first key with the key prefix
|
|
|
|
// or the offset of it. If there are too many keys share this prefix, it will
|
|
|
|
// create a binary search-able index from the suffix to offset on disk.
|
2014-09-05 01:18:36 +02:00
|
|
|
static Status Open(const ImmutableCFOptions& ioptions,
|
|
|
|
const EnvOptions& env_options,
|
2017-07-27 23:17:10 +02:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
2018-11-09 20:17:34 +01:00
|
|
|
std::unique_ptr<RandomAccessFileReader>&& file,
|
|
|
|
uint64_t file_size, std::unique_ptr<TableReader>* table,
|
2014-02-08 01:25:38 +01:00
|
|
|
const int bloom_bits_per_key, double hash_table_ratio,
|
2014-06-19 01:36:48 +02:00
|
|
|
size_t index_sparseness, size_t huge_page_tlb_size,
|
2019-01-26 02:07:00 +01:00
|
|
|
bool full_scan_mode, const bool immortal_table = false,
|
2018-05-21 23:33:55 +02:00
|
|
|
const SliceTransform* prefix_extractor = nullptr);
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2019-06-19 23:07:36 +02:00
|
|
|
// Returns new iterator over table contents
|
|
|
|
// compaction_readahead_size: its value will only be used if for_compaction =
|
|
|
|
// true
|
2017-05-06 00:01:04 +02:00
|
|
|
InternalIterator* NewIterator(const ReadOptions&,
|
2018-05-21 23:33:55 +02:00
|
|
|
const SliceTransform* prefix_extractor,
|
2019-06-20 23:28:22 +02:00
|
|
|
Arena* arena, bool skip_filters,
|
2019-09-20 21:00:55 +02:00
|
|
|
TableReaderCaller caller,
|
|
|
|
size_t compaction_readahead_size = 0) override;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2015-02-26 20:28:41 +01:00
|
|
|
void Prepare(const Slice& target) override;
|
2014-06-12 19:06:18 +02:00
|
|
|
|
2018-05-21 23:33:55 +02:00
|
|
|
Status Get(const ReadOptions& readOptions, const Slice& key,
|
|
|
|
GetContext* get_context, const SliceTransform* prefix_extractor,
|
Skip bottom-level filter block caching when hit-optimized
Summary:
When Get() or NewIterator() trigger file loads, skip caching the filter block if
(1) optimize_filters_for_hits is set and (2) the file is on the bottommost
level. Also skip checking filters under the same conditions, which means that
for a preloaded file or a file that was trivially-moved to the bottom level, its
filter block will eventually expire from the cache.
- added parameters/instance variables in various places in order to propagate the config ("skip_filters") from version_set to block_based_table_reader
- in BlockBasedTable::Rep, this optimization prevents filter from being loaded when the file is opened simply by setting filter_policy = nullptr
- in BlockBasedTable::Get/BlockBasedTable::NewIterator, this optimization prevents filter from being used (even if it was loaded already) by setting filter = nullptr
Test Plan:
updated unit test:
$ ./db_test --gtest_filter=DBTest.OptimizeFiltersForHits
will also run 'make check'
Reviewers: sdong, igor, paultuckfield, anthony, rven, kradhakrishnan, IslamAbdelRahman, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D51633
2015-12-23 19:15:07 +01:00
|
|
|
bool skip_filters = false) override;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2019-06-11 00:30:05 +02:00
|
|
|
uint64_t ApproximateOffsetOf(const Slice& key,
|
2019-06-20 23:28:22 +02:00
|
|
|
TableReaderCaller caller) override;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2019-08-16 23:16:49 +02:00
|
|
|
uint64_t ApproximateSize(const Slice& start, const Slice& end,
|
|
|
|
TableReaderCaller caller) override;
|
|
|
|
|
2014-07-19 01:58:13 +02:00
|
|
|
uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
|
2015-02-26 20:28:41 +01:00
|
|
|
void SetupForCompaction() override;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2015-02-26 20:28:41 +01:00
|
|
|
std::shared_ptr<const TableProperties> GetTableProperties() const override {
|
2014-02-08 04:26:49 +01:00
|
|
|
return table_properties_;
|
|
|
|
}
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2014-08-05 20:27:34 +02:00
|
|
|
virtual size_t ApproximateMemoryUsage() const override {
|
|
|
|
return arena_.MemoryAllocatedBytes();
|
|
|
|
}
|
|
|
|
|
2014-09-05 01:18:36 +02:00
|
|
|
PlainTableReader(const ImmutableCFOptions& ioptions,
|
2018-11-09 20:17:34 +01:00
|
|
|
std::unique_ptr<RandomAccessFileReader>&& file,
|
2014-09-05 01:18:36 +02:00
|
|
|
const EnvOptions& env_options,
|
2014-01-27 22:53:22 +01:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
2014-06-19 01:36:48 +02:00
|
|
|
EncodingType encoding_type, uint64_t file_size,
|
2018-05-21 23:33:55 +02:00
|
|
|
const TableProperties* table_properties,
|
|
|
|
const SliceTransform* prefix_extractor);
|
2014-02-08 01:25:38 +01:00
|
|
|
virtual ~PlainTableReader();
|
|
|
|
|
|
|
|
protected:
|
|
|
|
// Check bloom filter to see whether it might contain this prefix.
|
|
|
|
// The hash of the prefix is given, since it can be reused for index lookup
|
|
|
|
// too.
|
|
|
|
virtual bool MatchBloom(uint32_t hash) const;
|
|
|
|
|
|
|
|
// PopulateIndex() builds index of keys. It must be called before any query
|
|
|
|
// to the table.
|
|
|
|
//
|
2014-04-23 03:31:55 +02:00
|
|
|
// props: the table properties object that need to be stored. Ownership of
|
|
|
|
// the object will be passed.
|
|
|
|
//
|
2014-07-19 01:58:13 +02:00
|
|
|
|
2014-06-09 21:30:19 +02:00
|
|
|
Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
|
|
|
|
double hash_table_ratio, size_t index_sparseness,
|
|
|
|
size_t huge_page_tlb_size);
|
2014-07-19 01:58:13 +02:00
|
|
|
|
2015-09-17 01:57:43 +02:00
|
|
|
Status MmapDataIfNeeded();
|
2014-02-08 01:25:38 +01:00
|
|
|
|
2014-01-25 06:10:19 +01:00
|
|
|
private:
|
2014-01-27 22:53:22 +01:00
|
|
|
const InternalKeyComparator internal_comparator_;
|
2014-06-19 01:36:48 +02:00
|
|
|
EncodingType encoding_type_;
|
2014-02-14 00:27:59 +01:00
|
|
|
// represents plain table's current status.
|
2013-10-29 04:34:02 +01:00
|
|
|
Status status_;
|
2013-12-20 18:35:24 +01:00
|
|
|
|
2014-07-19 01:58:13 +02:00
|
|
|
PlainTableIndex index_;
|
|
|
|
bool full_scan_mode_;
|
|
|
|
|
2014-02-14 00:27:59 +01:00
|
|
|
// data_start_offset_ and data_end_offset_ defines the range of the
|
|
|
|
// sst file that stores data.
|
2014-01-25 06:10:19 +01:00
|
|
|
const uint32_t data_start_offset_ = 0;
|
2014-11-11 22:47:22 +01:00
|
|
|
const uint32_t user_key_len_;
|
2014-06-09 21:30:19 +02:00
|
|
|
const SliceTransform* prefix_extractor_;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
|
|
|
static const size_t kNumInternalBytes = 8;
|
2014-01-25 06:10:19 +01:00
|
|
|
|
2014-06-09 21:30:19 +02:00
|
|
|
// Bloom filter is used to rule out non-existent key
|
|
|
|
bool enable_bloom_;
|
2019-09-05 19:03:42 +02:00
|
|
|
PlainTableBloomV1 bloom_;
|
2015-09-17 01:57:43 +02:00
|
|
|
PlainTableReaderFileInfo file_info_;
|
2014-06-09 21:30:19 +02:00
|
|
|
Arena arena_;
|
2018-10-03 02:21:54 +02:00
|
|
|
CacheAllocationPtr index_block_alloc_;
|
|
|
|
CacheAllocationPtr bloom_block_alloc_;
|
2014-06-09 21:30:19 +02:00
|
|
|
|
2014-09-05 01:18:36 +02:00
|
|
|
const ImmutableCFOptions& ioptions_;
|
2019-01-26 02:07:00 +01:00
|
|
|
std::unique_ptr<Cleanable> dummy_cleanable_;
|
2014-11-11 22:47:22 +01:00
|
|
|
uint64_t file_size_;
|
2014-06-09 21:30:19 +02:00
|
|
|
std::shared_ptr<const TableProperties> table_properties_;
|
|
|
|
|
2014-01-25 06:10:19 +01:00
|
|
|
bool IsFixedLength() const {
|
2014-01-28 06:58:46 +01:00
|
|
|
return user_key_len_ != kPlainTableVariableLength;
|
2013-12-20 18:35:24 +01:00
|
|
|
}
|
|
|
|
|
2014-01-25 06:10:19 +01:00
|
|
|
size_t GetFixedInternalKeyLength() const {
|
2013-12-20 18:35:24 +01:00
|
|
|
return user_key_len_ + kNumInternalBytes;
|
2013-10-29 04:34:02 +01:00
|
|
|
}
|
|
|
|
|
2014-07-19 01:58:13 +02:00
|
|
|
Slice GetPrefix(const Slice& target) const {
|
|
|
|
assert(target.size() >= 8); // target is internal key
|
|
|
|
return GetPrefixFromUserKey(GetUserKey(target));
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice GetPrefix(const ParsedInternalKey& target) const {
|
|
|
|
return GetPrefixFromUserKey(target.user_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice GetUserKey(const Slice& key) const {
|
|
|
|
return Slice(key.data(), key.size() - 8);
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
|
|
|
if (!IsTotalOrderMode()) {
|
|
|
|
return prefix_extractor_->Transform(user_key);
|
|
|
|
} else {
|
|
|
|
// Use empty slice as prefix if prefix_extractor is not set.
|
|
|
|
// In that case,
|
|
|
|
// it falls back to pure binary search and
|
|
|
|
// total iterator seek is supported.
|
|
|
|
return Slice();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-10-29 04:34:02 +01:00
|
|
|
friend class TableCache;
|
|
|
|
friend class PlainTableIterator;
|
|
|
|
|
2013-12-20 18:35:24 +01:00
|
|
|
// Internal helper function to generate an IndexRecordList object from all
|
|
|
|
// the rows, which contains index records as a list.
|
2014-02-08 01:25:38 +01:00
|
|
|
// If bloom_ is not null, all the keys' full-key hash will be added to the
|
|
|
|
// bloom filter.
|
2014-07-19 01:58:13 +02:00
|
|
|
Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
|
2019-03-27 18:18:56 +01:00
|
|
|
std::vector<uint32_t>* prefix_hashes);
|
2014-07-19 01:58:13 +02:00
|
|
|
|
2019-09-13 19:24:38 +02:00
|
|
|
// Internal helper function to allocate memory for bloom filter
|
|
|
|
void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
|
|
|
|
size_t huge_page_tlb_size);
|
2014-07-19 01:58:13 +02:00
|
|
|
|
2019-09-13 19:24:38 +02:00
|
|
|
void FillBloom(const std::vector<uint32_t>& prefix_hashes);
|
2014-01-25 06:10:19 +01:00
|
|
|
|
2014-06-19 01:36:48 +02:00
|
|
|
// Read the key and value at `offset` to parameters for keys, the and
|
|
|
|
// `seekable`.
|
2014-02-14 00:27:59 +01:00
|
|
|
// On success, `offset` will be updated as the offset for the next key.
|
2014-06-19 01:36:48 +02:00
|
|
|
// `parsed_key` will be key in parsed format.
|
|
|
|
// if `internal_key` is not empty, it will be filled with key with slice
|
|
|
|
// format.
|
|
|
|
// if `seekable` is not null, it will return whether we can directly read
|
|
|
|
// data using this offset.
|
|
|
|
Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
|
|
|
|
ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
|
|
|
|
bool* seekable = nullptr) const;
|
2013-11-21 20:11:02 +01:00
|
|
|
// Get file offset for key target.
|
|
|
|
// return value prefix_matched is set to true if the offset is confirmed
|
|
|
|
// for a key with the same prefix as target.
|
2015-11-18 03:29:40 +01:00
|
|
|
Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
|
|
|
|
const Slice& prefix, uint32_t prefix_hash,
|
|
|
|
bool& prefix_matched, uint32_t* offset) const;
|
2014-02-08 01:25:38 +01:00
|
|
|
|
2014-06-09 21:30:19 +02:00
|
|
|
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
|
2014-01-27 22:53:22 +01:00
|
|
|
|
2013-10-29 04:34:02 +01:00
|
|
|
// No copying allowed
|
|
|
|
explicit PlainTableReader(const TableReader&) = delete;
|
|
|
|
void operator=(const TableReader&) = delete;
|
|
|
|
};
|
|
|
|
} // namespace rocksdb
|
2014-04-15 22:39:26 +02:00
|
|
|
#endif // ROCKSDB_LITE
|