// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_

#include <stddef.h>
#include <stdint.h>
#include <memory>
#include <string>
#include <vector>

#include "rocksdb/memtablerep.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/universal_compaction.h"

namespace rocksdb {

class Cache;
class CompactionFilter;
class CompactionFilterFactory;
class Comparator;
class Env;
class FilterPolicy;
class Logger;
class MergeOperator;
class Snapshot;
class TableFactory;

using std::shared_ptr;

// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs. Each block may be compressed before
// being stored in a file. The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType : char {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression = 0x0,
  kSnappyCompression = 0x1,
  kZlibCompression = 0x2,
  kBZip2Compression = 0x3
};

enum CompactionStyle : char {
  kCompactionStyleLevel = 0x0,     // level based compaction style
  kCompactionStyleUniversal = 0x1  // Universal compaction style
};

// Compression options for different compression algorithms like Zlib
struct CompressionOptions {
  int window_bits;
  int level;
  int strategy;
  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
  CompressionOptions(int wbits, int lev, int strategy)
      : window_bits(wbits), level(lev), strategy(strategy) {}
};
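
// Example (a hedged usage sketch, not part of the API): selecting a
// per-block compression algorithm and tuning its parameters. The zlib
// parameter values below are illustrative assumptions, not recommendations.
//
//   rocksdb::Options options;
//   options.compression = rocksdb::kZlibCompression;
//   // window_bits = -14, level = 6, strategy = 0
//   options.compression_opts = rocksdb::CompressionOptions(-14, 6, 0);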

struct ColumnFamilyOptions {
  // -------------------
  // Parameters that affect behavior

  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator;

  // REQUIRES: The client must provide a merge operator if Merge operation
  // needs to be accessed. Calling Merge on a DB without a merge operator
  // would result in Status::NotSupported. The client must ensure that the
  // merge operator supplied here has the same name and *exactly* the same
  // semantics as the merge operator provided to previous open calls on
  // the same DB. The only exception is reserved for upgrade, where a DB
  // previously without a merge operator is introduced to Merge operation
  // for the first time. It's necessary to specify a merge operator when
  // opening the DB in this case.
  // Default: nullptr
  shared_ptr<MergeOperator> merge_operator;
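
  // Example (hedged sketch): installing a user-defined merge operator.
  // MyCounterMergeOperator is a hypothetical class implementing the
  // MergeOperator interface; it is not part of this library.
  //
  //   rocksdb::Options options;
  //   options.merge_operator.reset(new MyCounterMergeOperator());
  //   // All subsequent open calls on this DB must supply the same operator.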

  // A single CompactionFilter instance to call into during compaction.
  // Allows an application to modify/delete a key-value during background
  // compaction.
  //
  // If the client requires a new compaction filter to be used for different
  // compaction runs, it can specify compaction_filter_factory instead of this
  // option. The client should specify only one of the two.
  // compaction_filter takes precedence over compaction_filter_factory if
  // client specifies both.
  //
  // If multithreaded compaction is being used, the supplied CompactionFilter
  // instance may be used from different threads concurrently and so should be
  // thread-safe.
  //
  // Default: nullptr
  const CompactionFilter* compaction_filter;

  // This is a factory that provides compaction filter objects which allow
  // an application to modify/delete a key-value during background compaction.
  //
  // A new filter will be created on each compaction run. If multithreaded
  // compaction is being used, each created CompactionFilter will only be used
  // from a single thread and so does not need to be thread-safe.
  //
  // Default: a factory that doesn't provide any object
  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;

  // -------------------
  // Parameters that affect performance

  // Amount of data to build up in memory (backed by an unsorted log
  // on disk) before converting to a sorted on-disk file.
  //
  // Larger values increase performance, especially during bulk loads.
  // Up to max_write_buffer_number write buffers may be held in memory
  // at the same time, so you may wish to adjust this parameter to
  // control memory usage.
  // Also, a larger write buffer will result in a longer recovery time
  // the next time the database is opened.
  //
  // Default: 4MB
  size_t write_buffer_size;

  // The maximum number of write buffers that are built up in memory.
  // The default is 2, so that when 1 write buffer is being flushed to
  // storage, new writes can continue to the other write buffer.
  // Default: 2
  int max_write_buffer_number;

  // The minimum number of write buffers that will be merged together
  // before writing to storage. If set to 1, then
  // all write buffers are flushed to L0 as individual files and this increases
  // read amplification because a get request has to check in all of these
  // files. Also, an in-memory merge may result in writing less
  // data to storage if there are duplicate records in each of these
  // individual write buffers. Default: 1
  int min_write_buffer_number_to_merge;
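
  // Example (hedged sketch): tuning the write buffers for heavier write
  // traffic. The values are illustrative assumptions only.
  //
  //   rocksdb::Options options;
  //   options.write_buffer_size = 64 << 20;         // 64MB memtables
  //   options.max_write_buffer_number = 4;          // up to 4 in memory
  //   options.min_write_buffer_number_to_merge = 2; // merge before flush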

  // Control over blocks (user data is stored in a set of blocks, and
  // a block is the unit of reading from disk).

  // If non-NULL use the specified cache for blocks.
  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
  // Default: nullptr
  shared_ptr<Cache> block_cache;

  // If non-NULL use the specified cache for compressed blocks.
  // If NULL, rocksdb will not use a compressed block cache.
  // Default: nullptr
  shared_ptr<Cache> block_cache_compressed;
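
  // Example (hedged sketch): installing an LRU block cache via NewLRUCache()
  // from rocksdb/cache.h. The 64MB capacity is an illustrative assumption.
  //
  //   #include "rocksdb/cache.h"
  //   rocksdb::Options options;
  //   options.block_cache = rocksdb::NewLRUCache(64 << 20);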

  // Approximate size of user data packed per block. Note that the
  // block size specified here corresponds to uncompressed data. The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled. This parameter can be changed dynamically.
  //
  // Default: 4K
  size_t block_size;

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically. Most clients should
  // leave this parameter alone.
  //
  // Default: 16
  int block_restart_interval;

  // Compress blocks using the specified compression algorithm. This
  // parameter can be changed dynamically.
  //
  // Default: kSnappyCompression, which gives lightweight but fast
  // compression.
  //
  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression. Even if the input data is
  // incompressible, the kSnappyCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;

  // Different levels can have different compression policies. There
  // are cases where most lower levels would like to use a quick compression
  // algorithm while the higher levels (which have more data) use
  // compression algorithms that have better compression but could
  // be slower. This array, if non nullptr, should have an entry for
  // each level of the database. This array, if non nullptr, overrides the
  // value specified in the previous field 'compression'. The caller is
  // responsible for allocating memory and initializing the values in it
  // before invoking Open(). The caller is responsible for freeing this
  // array and it could be freed anytime after the return from Open().
  // This could have been a std::vector but that makes the equivalent
  // java/C api hard to construct.
  std::vector<CompressionType> compression_per_level;

  // Different options for compression algorithms
  CompressionOptions compression_opts;
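
  // Example (hedged sketch): no compression for the first two levels,
  // Snappy elsewhere. Assumes a database with num_levels == 7.
  //
  //   rocksdb::Options options;
  //   options.compression_per_level.resize(7);
  //   for (int i = 0; i < 7; i++) {
  //     options.compression_per_level[i] =
  //         (i < 2) ? rocksdb::kNoCompression : rocksdb::kSnappyCompression;
  //   }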

  // If non-nullptr, use the specified filter policy to reduce disk reads.
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  //
  // Default: nullptr
  const FilterPolicy* filter_policy;
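
  // Example (hedged sketch): a bloom filter with roughly 10 bits per key,
  // via NewBloomFilterPolicy() from rocksdb/filter_policy.h.
  //
  //   #include "rocksdb/filter_policy.h"
  //   rocksdb::Options options;
  //   options.filter_policy = rocksdb::NewBloomFilterPolicy(10);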

  // If non-nullptr, use the specified function to determine the
  // prefixes for keys. These prefixes will be placed in the filter.
  // Depending on the workload, this can reduce the read-IOP cost of
  // scans when a prefix is passed via ReadOptions to
  // db.NewIterator(). For prefix filtering to work properly,
  // "prefix_extractor" and "comparator" must be such that the following
  // properties hold:
  //
  // 1) key.starts_with(prefix(key))
  // 2) Compare(prefix(key), key) <= 0.
  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
  // 4) prefix(prefix(key)) == prefix(key)
  //
  // Default: nullptr
  const SliceTransform* prefix_extractor;

  // If true, place whole keys in the filter (not just prefixes).
  // This must generally be true for gets to be efficient.
  //
  // Default: true
  bool whole_key_filtering;
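
  // Example (hedged sketch): fixed-size key prefixes for prefix filtering,
  // via NewFixedPrefixTransform() from rocksdb/slice_transform.h. The 8-byte
  // prefix length is an illustrative assumption.
  //
  //   rocksdb::Options options;
  //   options.prefix_extractor = rocksdb::NewFixedPrefixTransform(8);
  //   options.whole_key_filtering = true;  // keep point lookups efficient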

  // Number of levels for this database
  int num_levels;

  // Number of files to trigger level-0 compaction. A value < 0 means that
  // level-0 compaction will not be triggered by number of files at all.
  int level0_file_num_compaction_trigger;

  // Soft limit on number of level-0 files. We start slowing down writes at this
  // point. A value < 0 means that no writing slow down will be triggered by
  // number of files in level-0.
  int level0_slowdown_writes_trigger;

  // Maximum number of level-0 files. We stop writes at this point.
  int level0_stop_writes_trigger;

  // Maximum level to which a new compacted memtable is pushed if it
  // does not create overlap. We try to push to level 2 to avoid the
  // relatively expensive level 0=>1 compactions and to avoid some
  // expensive manifest file operations. We do not push all the way to
  // the largest level since that can generate a lot of wasted disk
  // space if the same key space is being repeatedly overwritten.
  int max_mem_compaction_level;

  // Target file size for compaction.
  // target_file_size_base is per-file size for level-1.
  // Target file size for level L can be calculated by
  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
  // For example, if target_file_size_base is 2MB and
  // target_file_size_multiplier is 10, then each file on level-1 will
  // be 2MB, each file on level-2 will be 20MB,
  // and each file on level-3 will be 200MB.
  //
  // by default target_file_size_base is 2MB.
  int target_file_size_base;
  // by default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
  int target_file_size_multiplier;

  // Control maximum total data size for a level.
  // max_bytes_for_level_base is the max total for level-1.
  // Maximum number of bytes for level L can be calculated as
  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
  // For example, if max_bytes_for_level_base is 20MB, and if
  // max_bytes_for_level_multiplier is 10, total data size for level-1
  // will be 20MB, total file size for level-2 will be 200MB,
  // and total file size for level-3 will be 2GB.
  //
  // by default 'max_bytes_for_level_base' is 10MB.
  uint64_t max_bytes_for_level_base;
  // by default 'max_bytes_for_level_multiplier' is 10.
  int max_bytes_for_level_multiplier;

  // Different max-size multipliers for different levels.
  // These are multiplied by max_bytes_for_level_multiplier to arrive
  // at the max-size of each level.
  // Default: 1
  std::vector<int> max_bytes_for_level_multiplier_additional;

  // Maximum number of bytes in all compacted files. We avoid expanding
  // the lower level file set of a compaction if it would make the
  // total compaction cover more than
  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
  int expanded_compaction_factor;

  // Maximum number of bytes in all source files to be compacted in a
  // single compaction run. We avoid picking too many files in the
  // source level so that the total source bytes for the compaction
  // do not exceed
  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
  // Default: 1, i.e. pick maxfilesize amount of data as the source of
  // a compaction.
  int source_compaction_factor;

  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
  // stop building a single file in a level->level+1 compaction.
  int max_grandparent_overlap_factor;

  // Disable compaction triggered by seek.
  // With bloomfilter and fast storage, a miss on one level
  // is very cheap if the file handle is cached in table cache
  // (which is true if max_open_files is large).
  bool disable_seek_compaction;

  // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
  // soft_rate_limit. This is ignored when == 0.0.
  // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
  // hold, RocksDB will set soft_rate_limit = hard_rate_limit
  // Default: 0 (disabled)
  double soft_rate_limit;

  // Puts are delayed 1ms at a time when any level has a compaction score that
  // exceeds hard_rate_limit. This is ignored when <= 1.0.
  // Default: 0 (disabled)
  double hard_rate_limit;

  // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
  // there is no limit.
  // Default: 1000
  unsigned int rate_limit_delay_max_milliseconds;

  // Disable block cache. If this is set to true,
  // then no block cache should be used, and the block_cache should
  // point to a nullptr object.
  // Default: false
  bool no_block_cache;

  // Number of shards used for table cache.
  int table_cache_numshardbits;

  // During data eviction of table's LRU cache, it would be inefficient
  // to strictly follow LRU because this piece of memory will not really
  // be released unless its refcount falls to zero. Instead, make two
  // passes: the first pass will release items with refcount = 1,
  // and if not enough space is released after scanning the number of
  // elements specified by this parameter, we will remove items in LRU
  // order.
  int table_cache_remove_scan_count_limit;

  // Disable automatic compactions. Manual compactions can still
  // be issued on this column family
  bool disable_auto_compactions;

  // Purge duplicate/deleted keys when a memtable is flushed to storage.
  // Default: true
  bool purge_redundant_kvs_while_flush;

  // This is used to close a block before it reaches the configured
  // 'block_size'. If the percentage of free space in the current block is less
  // than this specified number and adding a new record to the block will
  // exceed the configured block size, then this block will be closed and the
  // new record will be written to the next block.
  // Default is 10.
  int block_size_deviation;

  // The compaction style. Default: kCompactionStyleLevel
  CompactionStyle compaction_style;

  // The options needed to support Universal Style compactions
  CompactionOptionsUniversal compaction_options_universal;

  // Use KeyMayExist API to filter deletes when this is true.
  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
  // the delete is a noop. KeyMayExist only incurs in-memory look up.
  // This optimization avoids writing the delete to storage when appropriate.
  // Default: false
  bool filter_deletes;

  // An iterator's Next() call sequentially skips over keys with the same
  // user-key unless this option is set. This number specifies the number
  // of keys (with the same userkey) that will be sequentially
  // skipped before a reseek is issued.
  // Default: 8
  uint64_t max_sequential_skip_in_iterations;

  // This is a factory that provides MemTableRep objects.
  // Default: a factory that provides a skip-list-based implementation of
  // MemTableRep.
  std::shared_ptr<MemTableRepFactory> memtable_factory;

  // This is a factory that provides TableFactory objects.
  // Default: a factory that provides a default implementation of
  // Table and TableBuilder.
  std::shared_ptr<TableFactory> table_factory;

  // This option allows the user to collect their own interested statistics of
  // the tables.
  // Default: empty vector -- no user-defined statistics collection will be
  // performed.
  std::vector<std::shared_ptr<TablePropertiesCollector>>
      table_properties_collectors;

  // Allows thread-safe inplace updates. Updates are performed in place iff
  // * key exists in current memtable
  // * sizeof(new_value) <= sizeof(old_value)
  // * old_value for that key is a put i.e. kTypeValue
  // Default: false.
  bool inplace_update_support;

  // Number of locks used for inplace update
  // Default: 10000, if inplace_update_support = true, else 0.
  size_t inplace_update_num_locks;

  // Create ColumnFamilyOptions with default values for all fields
  ColumnFamilyOptions();
};

struct DBOptions {
  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing;

  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists;

  // If true, the implementation will do aggressive checking of the
  // data it is processing and will stop early if it detects any
  // errors. This may have unforeseen ramifications: for example, a
  // corruption of one DB entry may cause a large number of entries to
  // become unreadable or for the entire DB to become unopenable.
  // If any of the writes to the database fails (Put, Delete, Merge, Write),
  // the database will switch to read-only mode and fail all other
  // Write operations.
  // Default: false
  bool paranoid_checks;

  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc.
  // Default: Env::Default()
  Env* env;

  // Any internal progress/error information generated by the db will
  // be written to info_log if it is non-nullptr, or to a file stored
  // in the same directory as the DB contents if info_log is nullptr.
  // Default: nullptr
  shared_ptr<Logger> info_log;

  // Number of open files that can be used by the DB. You may need to
  // increase this if your database has a large working set (budget
  // one open file per 2MB of working set).
  //
  // Default: 1000
  int max_open_files;

  // If non-null, then we should collect metrics about database operations
  // Statistics objects should not be shared between DB instances as
  // it does not use any locks to prevent concurrent updates.
  shared_ptr<Statistics> statistics;

  // If true, then the contents of data files are not synced
  // to stable storage. Their contents remain in the OS buffers till the
  // OS decides to flush them. This option is good for bulk-loading
  // of data. Once the bulk-loading is complete, please issue a
  // sync to the OS to flush all dirty buffers to stable storage.
  // Default: false
  bool disableDataSync;

  // If true, then every store to stable storage will issue a fsync.
  // If false, then every store to stable storage will issue a fdatasync.
  // This parameter should be set to true while storing data to
  // a filesystem like ext3 that can lose files after a reboot.
  // Default: false
  bool use_fsync;

  // This number controls how often a new scribe log about
  // db deploy stats is written out.
  // -1 indicates no logging at all.
  // Default value is 1800 (half an hour).
  int db_stats_log_interval;

  // This specifies the info LOG dir.
  // If it is empty, the log files will be in the same dir as data.
  // If it is non-empty, the log files will be in the specified dir,
  // and the db data dir's absolute path will be used as the log file
  // name's prefix.
  std::string db_log_dir;

  // This specifies the absolute dir path for write-ahead logs (WAL).
  // If it is empty, the log files will be in the same dir as data;
  // dbname is used as the data dir by default.
  // If it is non-empty, the log files will be kept in the specified dir.
  // When destroying the db,
  // all log files in wal_dir and the dir itself are deleted.
  std::string wal_dir;

  // The periodicity when obsolete files get deleted. The default
  // value is 6 hours. Files that go out of scope via the compaction
  // process will still be deleted automatically on every compaction,
  // regardless of this setting.
  uint64_t delete_obsolete_files_period_micros;

  // Maximum number of concurrent background jobs, submitted to
  // the default LOW priority thread pool
  // Default: 1
  int max_background_compactions;

  // Maximum number of concurrent background memtable flush jobs, submitted to
  // the HIGH priority thread pool.
  // By default, all background jobs (major compaction and memtable flush) go
  // to the LOW priority pool. If this option is set to a positive number,
  // memtable flush jobs will be submitted to the HIGH priority pool.
  // It is important when the same Env is shared by multiple db instances.
  // Without a separate pool, long running major compaction jobs could
  // potentially block memtable flush jobs of other db instances, leading to
  // unnecessary Put stalls.
  // Default: 0
  int max_background_flushes;

  // Specify the maximal size of the info log file. If the log file
  // is larger than `max_log_file_size`, a new info log file will
  // be created.
  // If max_log_file_size == 0, all logs will be written to one
  // log file.
  size_t max_log_file_size;

  // Time for the info log file to roll (in seconds).
  // If specified with non-zero value, log file will be rolled
  // if it has been active longer than `log_file_time_to_roll`.
  // Default: 0 (disabled)
  size_t log_file_time_to_roll;

  // Maximal info log files to be kept.
  // Default: 1000
  size_t keep_log_file_num;

  // The manifest file is rolled over on reaching this limit.
  // The older manifest file will be deleted.
  // The default value is MAX_INT so that roll-over does not take place.
  uint64_t max_manifest_file_size;

  // Size of one block in arena memory allocation.
  // If <= 0, a proper value is automatically calculated (usually 1/10 of
  // write_buffer_size).
  //
  // Default: 0
  size_t arena_block_size;

  // The following two fields affect how archived logs will be deleted.
  // 1. If both set to 0, logs will be deleted asap and will not get into
  //    the archive.
  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
  //    WAL files will be checked every 10 min and if total size is greater
  //    than WAL_size_limit_MB, they will be deleted starting with the
  //    earliest until size_limit is met. All empty files will be deleted.
  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
  //    are older than WAL_ttl_seconds will be deleted.
  // 4. If both are not 0, WAL files will be checked every 10 min and both
  //    checks will be performed with ttl being first.
  uint64_t WAL_ttl_seconds;
  uint64_t WAL_size_limit_MB;
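
  // Example (hedged sketch): keep archived WAL files for one day, but cap
  // the archive at 1GB. The values are illustrative assumptions only.
  //
  //   rocksdb::Options options;
  //   options.WAL_ttl_seconds = 24 * 60 * 60;  // 1 day (case 4 above)
  //   options.WAL_size_limit_MB = 1024;        // 1GB cap, both checks apply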

  // Number of bytes to preallocate (via fallocate) the manifest
  // files. Default is 4MB, which is reasonable to reduce random IO
  // as well as prevent overallocation for mounts that preallocate
  // large amounts of data (such as xfs's allocsize option).
  size_t manifest_preallocation_size;

  // Data being read from file storage may be buffered in the OS
  // Default: true
  bool allow_os_buffer;

  // Allow the OS to mmap file for reading sst tables. Default: false
  bool allow_mmap_reads;

  // Allow the OS to mmap file for writing. Default: true
  bool allow_mmap_writes;

  // Disable child process inherit open files. Default: true
  bool is_fd_close_on_exec;

  // Skip log corruption error on recovery (If client is ok with
  // losing most recent changes)
  // Default: false
  bool skip_log_error_on_recovery;

  // If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
  // Default: 3600 (1 hour)
  unsigned int stats_dump_period_sec;

  // If set to true, will hint the underlying file system that the file
  // access pattern is random, when a sst file is opened.
  // Default: true
  bool advise_random_on_open;

  // Specify the file access pattern once a compaction is started.
  // It will be applied to all input files of a compaction.
  // Default: NORMAL
  enum { NONE, NORMAL, SEQUENTIAL, WILLNEED } access_hint_on_compaction_start;

  // Use adaptive mutex, which spins in the user space before resorting
  // to kernel. This could reduce context switch when the mutex is not
  // heavily contended. However, if the mutex is hot, we could end up
  // wasting spin time.
  // Default: false
  bool use_adaptive_mutex;

  // Allows OS to incrementally sync files to disk while they are being
  // written, asynchronously, in the background.
  // Issue one request for every bytes_per_sync written. 0 turns it off.
  // Default: 0
  uint64_t bytes_per_sync;

  // Create DBOptions with default values for all fields
  DBOptions();
};

// Options to control the behavior of a database (passed to DB::Open)
struct Options : public DBOptions, public ColumnFamilyOptions {
  // Create an Options object with default values for all fields.
  Options() : DBOptions(), ColumnFamilyOptions() {}

  Options(const DBOptions& db_options,
          const ColumnFamilyOptions& column_family_options)
      : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}

  void Dump(Logger* log) const;

  // Set appropriate parameters for bulk loading.
  // The reason that this is a function that returns "this" instead of a
  // constructor is to enable chaining of multiple similar calls in the future.
  //
  // All data will be in level 0 without any automatic compaction.
  // It's recommended to manually call CompactRange(NULL, NULL) before reading
  // from the database, because otherwise the read can be very slow.
  Options* PrepareForBulkLoad();
};
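
// Example (hedged sketch): opening a database configured for bulk loading,
// then compacting before serving reads. DB and CompactRange come from
// rocksdb/db.h; "/tmp/bulkdb" is an illustrative path.
//
//   #include "rocksdb/db.h"
//   rocksdb::Options options;
//   options.create_if_missing = true;
//   options.PrepareForBulkLoad();
//   rocksdb::DB* db = nullptr;
//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/bulkdb", &db);
//   // ... load data ...
//   db->CompactRange(nullptr, nullptr);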

// An application can issue a read request (via Get/Iterators) and specify
// if that read should process data that ALREADY resides on a specified cache
// level. For example, if an application specifies kBlockCacheTier then the
// Get call will process data that is already processed in the memtable or
// the block cache. It will not page in data from the OS cache or data that
// resides in storage.
enum ReadTier {
  kReadAllTier = 0x0,    // data in memtable, block cache, OS cache or storage
  kBlockCacheTier = 0x1  // data in memtable or block cache
};

// Options that control read operations
struct ReadOptions {
  // If true, all data read from underlying storage will be
  // verified against corresponding checksums.
  // Default: false
  bool verify_checksums;

  // Should the "data block"/"index block"/"filter block" read for this
  // iteration be cached in memory?
  // Callers may wish to set this field to false for bulk scans.
  // Default: true
  bool fill_cache;

  // If this option is set and memtable implementation allows, Seek
  // might only return keys with the same prefix as the seek-key
  bool prefix_seek;

  // If "snapshot" is non-nullptr, read as of the supplied snapshot
  // (which must belong to the DB that is being read and which must
  // not have been released). If "snapshot" is nullptr, use an implicit
  // snapshot of the state at the beginning of this read operation.
  // Default: nullptr
  const Snapshot* snapshot;

  // If "prefix" is non-nullptr, and ReadOptions is being passed to
  // db.NewIterator, only return results when the key begins with this
  // prefix. This field is ignored by other calls (e.g., Get).
  // Options.prefix_extractor must also be set, and
  // prefix_extractor.InRange(prefix) must be true. The iterator
  // returned by NewIterator when this option is set will behave just
  // as if the underlying store did not contain any non-matching keys,
  // with two exceptions. Seek() only accepts keys starting with the
  // prefix, and SeekToLast() is not supported. A prefix filter with this
  // option will sometimes reduce the number of read IOPs.
  // Default: nullptr
  const Slice* prefix;

  // Specify if this read request should process data that ALREADY
  // resides on a particular cache. If the required data is not
  // found at the specified cache, then Status::Incomplete is returned.
  // Default: kReadAllTier
  ReadTier read_tier;

  ReadOptions()
      : verify_checksums(false),
        fill_cache(true),
        prefix_seek(false),
        snapshot(nullptr),
        prefix(nullptr),
        read_tier(kReadAllTier) {
  }
  ReadOptions(bool cksum, bool cache)
      : verify_checksums(cksum),
        fill_cache(cache),
        prefix_seek(false),
        snapshot(nullptr),
        prefix(nullptr),
        read_tier(kReadAllTier) {
  }
};
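
// Example (hedged sketch): reading from a consistent snapshot without
// polluting the block cache. Get/GetSnapshot/ReleaseSnapshot come from
// rocksdb/db.h and "db" is an already-opened DB*.
//
//   rocksdb::ReadOptions read_options;
//   read_options.fill_cache = false;            // bulk scan friendly
//   read_options.snapshot = db->GetSnapshot();  // pin a consistent view
//   std::string value;
//   rocksdb::Status s = db->Get(read_options, "some-key", &value);
//   db->ReleaseSnapshot(read_options.snapshot);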

// Options that control write operations
struct WriteOptions {
  // If true, the write will be flushed from the operating system
  // buffer cache (by calling WritableFile::Sync()) before the write
  // is considered complete. If this flag is true, writes will be
  // slower.
  //
  // If this flag is false, and the machine crashes, some recent
  // writes may be lost. Note that if it is just the process that
  // crashes (i.e., the machine does not reboot), no writes will be
  // lost even if sync==false.
  //
  // In other words, a DB write with sync==false has similar
  // crash semantics as the "write()" system call. A DB write
  // with sync==true has similar crash semantics to a "write()"
  // system call followed by "fdatasync()".
  //
  // Default: false
  bool sync;

  // If true, writes will not first go to the write ahead log,
  // and the write may be lost after a crash.
  bool disableWAL;

  WriteOptions()
      : sync(false),
        disableWAL(false) {
  }
};
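
// Example (hedged sketch): trading durability for throughput on writes
// that can be regenerated after a crash. Put comes from rocksdb/db.h and
// "db" is an already-opened DB*.
//
//   rocksdb::WriteOptions write_options;
//   write_options.disableWAL = true;  // skip the write-ahead log
//   write_options.sync = false;       // rely on OS buffering
//   db->Put(write_options, "cache-key", "recomputable-value");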

// Options that control flush operations
struct FlushOptions {
  // If true, the flush will wait until the flush is done.
  // Default: true
  bool wait;

  FlushOptions()
      : wait(true) {
  }
};
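
// Example (hedged sketch): forcing a blocking memtable flush. Flush comes
// from rocksdb/db.h and "db" is an already-opened DB*.
//
//   rocksdb::FlushOptions flush_options;
//   flush_options.wait = true;  // block until the memtable hits storage
//   db->Flush(flush_options);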

}  // namespace rocksdb

#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_