2013-10-29 04:34:02 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#pragma once
|
2014-04-25 21:23:07 +02:00
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
2013-10-29 04:34:02 +01:00
|
|
|
#include <memory>
|
2014-06-19 01:36:48 +02:00
|
|
|
#include <string>
|
2013-10-29 04:34:02 +01:00
|
|
|
#include <stdint.h>
|
|
|
|
|
|
|
|
#include "rocksdb/options.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
// Forward declaration only; EnvOptions is not defined in this header.
struct EnvOptions;
|
|
|
|
|
|
|
|
// Lets code inside namespace rocksdb spell unique_ptr without the std::
// qualifier (the declarations below in this header rely on that).
using std::unique_ptr;
|
|
|
|
// Forward declarations of classes that are defined outside this header.
class Status;
|
|
|
|
class RandomAccessFile;
|
|
|
|
class WritableFile;
|
|
|
|
class Table;
|
|
|
|
class TableBuilder;
|
|
|
|
|
|
|
|
// IndexedTable requires fixed length key, configured as a constructor
|
|
|
|
// parameter of the factory class. Output file format:
|
2013-12-20 18:35:24 +01:00
|
|
|
// +-------------+-----------------+
|
|
|
|
// | version | user_key_length |
|
2014-06-19 01:36:48 +02:00
|
|
|
// +------------++------------+-----------------+ <= key1 offset
|
|
|
|
// | encoded key1 | value_size | |
|
2013-12-20 18:35:24 +01:00
|
|
|
// +------------+-------------+-------------+ |
|
2013-10-29 04:34:02 +01:00
|
|
|
// | value1 |
|
|
|
|
// | |
|
2014-06-19 01:36:48 +02:00
|
|
|
// +--------------------------+-------------+---+ <= key2 offset
|
|
|
|
// | encoded key2 | value_size | |
|
2013-12-20 18:35:24 +01:00
|
|
|
// +------------+-------------+-------------+ |
|
2013-10-29 04:34:02 +01:00
|
|
|
// | value2 |
|
|
|
|
// | |
|
|
|
|
// | ...... |
|
|
|
|
// +-----------------+--------------------------+
|
2014-06-19 01:36:48 +02:00
|
|
|
//
|
|
|
|
// When the key encoding type is kPlain. Key part is encoded as:
|
|
|
|
// +------------+--------------------+
|
|
|
|
// | [key_size] | internal key |
|
|
|
|
// +------------+--------------------+
|
|
|
|
// for the user_key_len = kPlainTableVariableLength case,
|
|
|
|
// and simply:
|
|
|
|
// +----------------------+
|
|
|
|
// | internal key |
|
|
|
|
// +----------------------+
|
|
|
|
// for user_key_len != kPlainTableVariableLength case.
|
|
|
|
//
|
|
|
|
// If the key encoding type is kPrefix, keys are encoded in this format.
|
|
|
|
// There are three ways to encode a key:
|
|
|
|
// (1) Full Key
|
|
|
|
// +---------------+---------------+-------------------+
|
|
|
|
// | Full Key Flag | Full Key Size | Full Internal Key |
|
|
|
|
// +---------------+---------------+-------------------+
|
|
|
|
// which simply encodes a full key
|
|
|
|
//
|
|
|
|
// (2) A key sharing the same prefix as the previous key, which is encoded in the
|
|
|
|
// format of (1).
|
|
|
|
// +-------------+-------------+-------------+-------------+------------+
|
|
|
|
// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
|
|
|
|
// +-------------+-------------+-------------+-------------+------------+
|
|
|
|
// where key is the suffix part of the key, including the internal bytes.
|
|
|
|
// the actual key will be constructed by concatenating prefix part of the
|
|
|
|
// previous key, with the suffix part of the key here, with sizes given here.
|
|
|
|
//
|
|
|
|
// (3) A key sharing the same prefix as the previous key, which is encoded in
|
|
|
|
// the format of (2).
|
|
|
|
// +-----------------+-----------------+------------------------+
|
|
|
|
// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
|
|
|
|
// +-----------------+-----------------+------------------------+
|
|
|
|
// The key will be constructed by concatenating previous key's prefix (which is
|
|
|
|
// also a prefix of the last key encoded in the format of (1)) and the
|
|
|
|
// key given here.
|
|
|
|
//
|
|
|
|
// For example, the following keys (prefix and suffix are separated by
|
|
|
|
// spaces):
|
|
|
|
// 0000 0001
|
|
|
|
// 0000 00021
|
|
|
|
// 0000 0002
|
|
|
|
// 00011 00
|
|
|
|
// 0002 0001
|
|
|
|
// Will be encoded like this:
|
|
|
|
// FK 8 00000001
|
|
|
|
// PF 4 SF 5 00021
|
|
|
|
// SF 4 0002
|
|
|
|
// FK 7 0001100
|
|
|
|
// FK 8 00020001
|
|
|
|
// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
|
|
|
|
//
|
|
|
|
// All those "key flag + key size" shown above are in this format:
|
|
|
|
// The 8 bits of the first byte:
|
|
|
|
// +----+----+----+----+----+----+----+----+
|
|
|
|
// | Type | Size |
|
|
|
|
// +----+----+----+----+----+----+----+----+
|
|
|
|
// Type indicates: full key, prefix, or suffix.
|
|
|
|
// The last 6 bits are for size. If the size bits are not all 1, it means the
|
|
|
|
// size of the key. Otherwise, varint32 is read after this byte. This varint
|
|
|
|
// value + 0x3F (the value of all 1) will be the key size.
|
|
|
|
//
|
|
|
|
// For example, full key with length 16 will be encoded as (binary):
|
|
|
|
// 00 010000
|
|
|
|
// (00 means full key)
|
|
|
|
// and a prefix with 100 bytes will be encoded as:
|
|
|
|
// 01 111111 00100101
|
|
|
|
// (63) (37)
|
|
|
|
// (01 means prefix)
|
|
|
|
//
|
|
|
|
// All the internal keys above (including kPlain and kPrefix) are encoded in
|
|
|
|
// this format:
|
|
|
|
// There are two types:
|
|
|
|
// (1) normal internal key format
|
|
|
|
// +----------- ...... -------------+----+---+---+---+---+---+---+---+
|
|
|
|
// | user key |type| sequence ID |
|
|
|
|
// +----------- ..... --------------+----+---+---+---+---+---+---+---+
|
|
|
|
// (2) Special case for keys whose sequence ID is 0 and is value type
|
|
|
|
// +----------- ...... -------------+----+
|
|
|
|
// | user key |0xFF|
|
|
|
|
// +----------- ..... --------------+----+
|
|
|
|
// To save 7 bytes for the special case where sequence ID = 0.
|
|
|
|
//
|
|
|
|
//
|
2014-01-28 06:58:46 +01:00
|
|
|
class PlainTableFactory : public TableFactory {
|
|
|
|
public:
|
|
|
|
~PlainTableFactory() {}
|
2014-09-08 10:34:04 +02:00
|
|
|
// user_key_len is the length of the user key. If it is set to be
|
2014-01-28 06:58:46 +01:00
|
|
|
// kPlainTableVariableLength, then it means variable length. Otherwise, all
|
|
|
|
// the keys need to have the fix length of this value. bloom_bits_per_key is
|
2013-11-22 08:33:45 +01:00
|
|
|
// number of bits used for bloom filer per key. hash_table_ratio is
|
2013-12-20 18:35:24 +01:00
|
|
|
// the desired utilization of the hash table used for prefix hashing.
|
2013-11-22 00:13:45 +01:00
|
|
|
// hash_table_ratio = number of prefixes / #buckets in the hash table
|
2014-02-08 01:25:38 +01:00
|
|
|
// hash_table_ratio = 0 means skip hash table but only replying on binary
|
|
|
|
// search.
|
|
|
|
// index_sparseness determines index interval for keys
|
|
|
|
// inside the same prefix. It will be the maximum number of linear search
|
|
|
|
// required after hash and binary search.
|
|
|
|
// index_sparseness = 0 means index for every key.
|
2014-05-04 22:55:53 +02:00
|
|
|
// huge_page_tlb_size determines whether to allocate hash indexes from huge
|
|
|
|
// page TLB and the page size if allocating from there. See comments of
|
|
|
|
// Arena::AllocateAligned() for details.
|
2014-07-18 09:08:38 +02:00
|
|
|
explicit PlainTableFactory(const PlainTableOptions& options =
|
|
|
|
PlainTableOptions())
|
|
|
|
: user_key_len_(options.user_key_len),
|
|
|
|
bloom_bits_per_key_(options.bloom_bits_per_key),
|
|
|
|
hash_table_ratio_(options.hash_table_ratio),
|
|
|
|
index_sparseness_(options.index_sparseness),
|
|
|
|
huge_page_tlb_size_(options.huge_page_tlb_size),
|
|
|
|
encoding_type_(options.encoding_type),
|
2014-07-19 01:58:13 +02:00
|
|
|
full_scan_mode_(options.full_scan_mode),
|
|
|
|
store_index_in_file_(options.store_index_in_file) {}
|
2014-01-28 06:58:46 +01:00
|
|
|
const char* Name() const override { return "PlainTable"; }
|
2015-09-11 20:36:33 +02:00
|
|
|
Status NewTableReader(const TableReaderOptions& table_reader_options,
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
unique_ptr<RandomAccessFileReader>&& file,
|
|
|
|
uint64_t file_size,
|
|
|
|
unique_ptr<TableReader>* table) const override;
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 19:04:30 +02:00
|
|
|
TableBuilder* NewTableBuilder(
|
|
|
|
const TableBuilderOptions& table_builder_options,
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
WritableFileWriter* file) const override;
|
2013-10-29 04:34:02 +01:00
|
|
|
|
2014-08-25 23:24:09 +02:00
|
|
|
std::string GetPrintableTableOptions() const override;
|
|
|
|
|
2014-01-27 22:53:22 +01:00
|
|
|
static const char kValueTypeSeqId0 = 0xFF;
|
2013-12-20 18:35:24 +01:00
|
|
|
|
2014-08-21 00:53:39 +02:00
|
|
|
// Sanitizes the specified DB Options.
|
2014-10-18 06:18:36 +02:00
|
|
|
Status SanitizeOptions(const DBOptions& db_opts,
|
|
|
|
const ColumnFamilyOptions& cf_opts) const override {
|
|
|
|
if (db_opts.allow_mmap_reads == false) {
|
2014-08-21 00:53:39 +02:00
|
|
|
return Status::NotSupported(
|
|
|
|
"PlainTable with allow_mmap_reads == false is not supported.");
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2014-01-28 06:58:46 +01:00
|
|
|
private:
|
2013-12-20 18:35:24 +01:00
|
|
|
uint32_t user_key_len_;
|
2014-01-28 06:58:46 +01:00
|
|
|
int bloom_bits_per_key_;
|
2013-11-22 00:13:45 +01:00
|
|
|
double hash_table_ratio_;
|
2014-02-08 01:25:38 +01:00
|
|
|
size_t index_sparseness_;
|
2014-05-04 22:55:53 +02:00
|
|
|
size_t huge_page_tlb_size_;
|
2014-06-19 01:36:48 +02:00
|
|
|
EncodingType encoding_type_;
|
|
|
|
bool full_scan_mode_;
|
2014-07-19 01:58:13 +02:00
|
|
|
bool store_index_in_file_;
|
2013-10-29 04:34:02 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace rocksdb
|
2014-04-15 22:39:26 +02:00
|
|
|
#endif // ROCKSDB_LITE
|