2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-16 01:03:42 +02:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 23:59:46 +02:00
|
|
|
//
|
2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2019-05-30 20:21:38 +02:00
|
|
|
#include "test_util/testutil.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2020-04-25 08:58:13 +02:00
|
|
|
#include <fcntl.h>
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 05:50:35 +02:00
|
|
|
#include <array>
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
#include <cctype>
|
2019-07-09 23:48:07 +02:00
|
|
|
#include <fstream>
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
#include <sstream>
|
|
|
|
|
2017-03-13 19:44:50 +01:00
|
|
|
#include "db/memtable_list.h"
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
|
|
|
#include "env/composite_env_wrapper.h"
|
2019-09-16 19:31:27 +02:00
|
|
|
#include "file/random_access_file_reader.h"
|
|
|
|
#include "file/sequence_file_reader.h"
|
|
|
|
#include "file/writable_file_writer.h"
|
2014-08-27 19:39:31 +02:00
|
|
|
#include "port/port.h"
|
2020-04-25 08:58:13 +02:00
|
|
|
#include "test_util/sync_point.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 23:37:00 +01:00
|
|
|
namespace test {
|
|
|
|
|
2018-06-05 04:59:44 +02:00
|
|
|
const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version;
|
New Bloom filter implementation for full and partitioned filters (#6007)
Summary:
Adds an improved, replacement Bloom filter implementation (FastLocalBloom) for full and partitioned filters in the block-based table. This replacement is faster and more accurate, especially for high bits per key or millions of keys in a single filter.
Speed
The improved speed, at least on recent x86_64, comes from
* Using fastrange instead of modulo (%)
* Using our new hash function (XXH3 preview, added in a previous commit), which is much faster for large keys and only *slightly* slower on keys around 12 bytes if hashing the same size many thousands of times in a row.
* Optimizing the Bloom filter queries with AVX2 SIMD operations. (Added AVX2 to the USE_SSE=1 build.) Careful design was required to support (a) SIMD-optimized queries, (b) compatible non-SIMD code that's simple and efficient, (c) flexible choice of number of probes, and (d) essentially maximized accuracy for a cache-local Bloom filter. Probes are made eight at a time, so any number of probes up to 8 is the same speed, then up to 16, etc.
* Prefetching cache lines when building the filter. Although this optimization could be applied to the old structure as well, it seems to balance out the small added cost of accumulating 64 bit hashes for adding to the filter rather than 32 bit hashes.
Here's nominal speed data from filter_bench (200MB in filters, about 10k keys each, 10 bits filter data / key, 6 probes, avg key size 24 bytes, includes hashing time) on Skylake DE (relatively low clock speed):
$ ./filter_bench -quick -impl=2 -net_includes_hashing # New Bloom filter
Build avg ns/key: 47.7135
Mixed inside/outside queries...
Single filter net ns/op: 26.2825
Random filter net ns/op: 150.459
Average FP rate %: 0.954651
$ ./filter_bench -quick -impl=0 -net_includes_hashing # Old Bloom filter
Build avg ns/key: 47.2245
Mixed inside/outside queries...
Single filter net ns/op: 63.2978
Random filter net ns/op: 188.038
Average FP rate %: 1.13823
Similar build time but dramatically faster query times on hot data (63 ns to 26 ns), and somewhat faster on stale data (188 ns to 150 ns). Performance differences on batched and skewed query loads are between these extremes as expected.
The only other interesting thing about speed is "inside" (query key was added to filter) vs. "outside" (query key was not added to filter) query times. The non-SIMD implementations are substantially slower when most queries are "outside" vs. "inside". This goes against what one might expect or would have observed years ago, as "outside" queries only need about two probes on average, due to short-circuiting, while "inside" always have num_probes (say 6). The problem is probably the nastily unpredictable branch. The SIMD implementation has few branches (very predictable) and has pretty consistent running time regardless of query outcome.
Accuracy
The generally improved accuracy (re: Issue https://github.com/facebook/rocksdb/issues/5857) comes from a better design for probing indices
within a cache line (re: Issue https://github.com/facebook/rocksdb/issues/4120) and improved accuracy for millions of keys in a single filter from using a 64-bit hash function (XXH3p). Design details in code comments.
Accuracy data (generalizes, except old impl gets worse with millions of keys):
Memory bits per key: FP rate percent old impl -> FP rate percent new impl
6: 5.70953 -> 5.69888
8: 2.45766 -> 2.29709
10: 1.13977 -> 0.959254
12: 0.662498 -> 0.411593
16: 0.353023 -> 0.0873754
24: 0.261552 -> 0.0060971
50: 0.225453 -> ~0.00003 (less than 1 in a million queries are FP)
Fixes https://github.com/facebook/rocksdb/issues/5857
Fixes https://github.com/facebook/rocksdb/issues/4120
Unlike the old implementation, this implementation has a fixed cache line size (64 bytes). At 10 bits per key, the accuracy of this new implementation is very close to the old implementation with 128-byte cache line size. If there's sufficient demand, this implementation could be generalized.
Compatibility
Although old releases would see the new structure as corrupt filter data and read the table as if there's no filter, we've decided only to enable the new Bloom filter with new format_version=5. This provides a smooth path for automatic adoption over time, with an option for early opt-in.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6007
Test Plan: filter_bench has been used thoroughly to validate speed, accuracy, and correctness. Unit tests have been carefully updated to exercise new and old implementations, as well as the logic to select an implementation based on context (format_version).
Differential Revision: D18294749
Pulled By: pdillinger
fbshipit-source-id: d44c9db3696e4d0a17caaec47075b7755c262c5f
2019-11-14 01:31:26 +01:00
|
|
|
const uint32_t kLatestFormatVersion = 5u;
|
2018-06-05 04:59:44 +02:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
Slice RandomString(Random* rnd, int len, std::string* dst) {
|
|
|
|
dst->resize(len);
|
|
|
|
for (int i = 0; i < len; i++) {
|
2018-04-13 02:55:14 +02:00
|
|
|
(*dst)[i] = static_cast<char>(' ' + rnd->Uniform(95)); // ' ' .. '~'
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
return Slice(*dst);
|
|
|
|
}
|
|
|
|
|
2014-09-09 07:24:40 +02:00
|
|
|
extern std::string RandomHumanReadableString(Random* rnd, int len) {
|
|
|
|
std::string ret;
|
|
|
|
ret.resize(len);
|
|
|
|
for (int i = 0; i < len; ++i) {
|
|
|
|
ret[i] = static_cast<char>('a' + rnd->Uniform(26));
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:24:45 +02:00
|
|
|
std::string RandomKey(Random* rnd, int len, RandomKeyType type) {
|
2011-03-18 23:37:00 +01:00
|
|
|
// Make sure to generate a wide variety of characters so we
|
|
|
|
// test the boundary conditions for short-key optimizations.
|
2018-04-13 02:55:14 +02:00
|
|
|
static const char kTestChars[] = {'\0', '\1', 'a', 'b', 'c',
|
|
|
|
'd', 'e', '\xfd', '\xfe', '\xff'};
|
2011-03-18 23:37:00 +01:00
|
|
|
std::string result;
|
|
|
|
for (int i = 0; i < len; i++) {
|
2015-10-13 23:24:45 +02:00
|
|
|
std::size_t indx = 0;
|
|
|
|
switch (type) {
|
|
|
|
case RandomKeyType::RANDOM:
|
|
|
|
indx = rnd->Uniform(sizeof(kTestChars));
|
|
|
|
break;
|
|
|
|
case RandomKeyType::LARGEST:
|
|
|
|
indx = sizeof(kTestChars) - 1;
|
|
|
|
break;
|
|
|
|
case RandomKeyType::MIDDLE:
|
|
|
|
indx = sizeof(kTestChars) / 2;
|
|
|
|
break;
|
|
|
|
case RandomKeyType::SMALLEST:
|
|
|
|
indx = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
result += kTestChars[indx];
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
extern Slice CompressibleString(Random* rnd, double compressed_fraction,
|
|
|
|
int len, std::string* dst) {
|
|
|
|
int raw = static_cast<int>(len * compressed_fraction);
|
|
|
|
if (raw < 1) raw = 1;
|
|
|
|
std::string raw_data;
|
|
|
|
RandomString(rnd, raw, &raw_data);
|
|
|
|
|
|
|
|
// Duplicate the random data until we have filled "len" bytes
|
|
|
|
dst->clear();
|
2012-11-06 21:02:18 +01:00
|
|
|
while (dst->size() < (unsigned int)len) {
|
2011-03-18 23:37:00 +01:00
|
|
|
dst->append(raw_data);
|
|
|
|
}
|
|
|
|
dst->resize(len);
|
|
|
|
return Slice(*dst);
|
|
|
|
}
|
|
|
|
|
2014-08-27 19:39:31 +02:00
|
|
|
namespace {
|
|
|
|
class Uint64ComparatorImpl : public Comparator {
|
|
|
|
public:
|
2018-04-13 02:55:14 +02:00
|
|
|
Uint64ComparatorImpl() {}
|
2014-08-27 19:39:31 +02:00
|
|
|
|
2019-02-14 22:52:47 +01:00
|
|
|
const char* Name() const override { return "rocksdb.Uint64Comparator"; }
|
2014-08-27 19:39:31 +02:00
|
|
|
|
2019-02-14 22:52:47 +01:00
|
|
|
int Compare(const Slice& a, const Slice& b) const override {
|
2014-08-27 19:39:31 +02:00
|
|
|
assert(a.size() == sizeof(uint64_t) && b.size() == sizeof(uint64_t));
|
|
|
|
const uint64_t* left = reinterpret_cast<const uint64_t*>(a.data());
|
|
|
|
const uint64_t* right = reinterpret_cast<const uint64_t*>(b.data());
|
2017-04-22 05:41:37 +02:00
|
|
|
uint64_t leftValue;
|
|
|
|
uint64_t rightValue;
|
|
|
|
GetUnaligned(left, &leftValue);
|
|
|
|
GetUnaligned(right, &rightValue);
|
|
|
|
if (leftValue == rightValue) {
|
2014-08-27 19:39:31 +02:00
|
|
|
return 0;
|
2017-04-22 05:41:37 +02:00
|
|
|
} else if (leftValue < rightValue) {
|
2014-08-27 19:39:31 +02:00
|
|
|
return -1;
|
|
|
|
} else {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-14 22:52:47 +01:00
|
|
|
void FindShortestSeparator(std::string* /*start*/,
|
|
|
|
const Slice& /*limit*/) const override {
|
2014-08-27 19:39:31 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-02-14 22:52:47 +01:00
|
|
|
void FindShortSuccessor(std::string* /*key*/) const override { return; }
|
2014-08-27 19:39:31 +02:00
|
|
|
};
|
2020-04-11 01:03:33 +02:00
|
|
|
|
|
|
|
// A test implementation of comparator with 64-bit integer timestamp.
|
|
|
|
class ComparatorWithU64TsImpl : public Comparator {
|
|
|
|
public:
|
|
|
|
ComparatorWithU64TsImpl()
|
|
|
|
: Comparator(/*ts_sz=*/sizeof(uint64_t)),
|
|
|
|
cmp_without_ts_(BytewiseComparator()) {
|
|
|
|
assert(cmp_without_ts_);
|
|
|
|
assert(cmp_without_ts_->timestamp_size() == 0);
|
|
|
|
}
|
|
|
|
const char* Name() const override { return "ComparatorWithU64Ts"; }
|
|
|
|
void FindShortSuccessor(std::string*) const override {}
|
|
|
|
void FindShortestSeparator(std::string*, const Slice&) const override {}
|
|
|
|
int Compare(const Slice& a, const Slice& b) const override {
|
|
|
|
int ret = CompareWithoutTimestamp(a, b);
|
|
|
|
size_t ts_sz = timestamp_size();
|
|
|
|
if (ret != 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
// Compare timestamp.
|
|
|
|
// For the same user key with different timestamps, larger (newer) timestamp
|
|
|
|
// comes first.
|
|
|
|
return -CompareTimestamp(ExtractTimestampFromUserKey(a, ts_sz),
|
|
|
|
ExtractTimestampFromUserKey(b, ts_sz));
|
|
|
|
}
|
|
|
|
using Comparator::CompareWithoutTimestamp;
|
|
|
|
int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
|
|
|
|
bool b_has_ts) const override {
|
|
|
|
const size_t ts_sz = timestamp_size();
|
|
|
|
assert(!a_has_ts || a.size() >= ts_sz);
|
|
|
|
assert(!b_has_ts || b.size() >= ts_sz);
|
|
|
|
Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, ts_sz) : a;
|
|
|
|
Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, ts_sz) : b;
|
|
|
|
return cmp_without_ts_->Compare(lhs, rhs);
|
|
|
|
}
|
|
|
|
int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
|
2020-04-12 07:02:17 +02:00
|
|
|
assert(ts1.size() == sizeof(uint64_t));
|
|
|
|
assert(ts2.size() == sizeof(uint64_t));
|
2020-04-11 01:03:33 +02:00
|
|
|
uint64_t lhs = DecodeFixed64(ts1.data());
|
|
|
|
uint64_t rhs = DecodeFixed64(ts2.data());
|
|
|
|
if (lhs < rhs) {
|
|
|
|
return -1;
|
|
|
|
} else if (lhs > rhs) {
|
|
|
|
return 1;
|
|
|
|
} else {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
const Comparator* cmp_without_ts_{nullptr};
|
|
|
|
};
|
|
|
|
|
2014-08-27 19:39:31 +02:00
|
|
|
} // namespace
|
|
|
|
|
|
|
|
const Comparator* Uint64Comparator() {
|
2018-06-05 04:59:44 +02:00
|
|
|
static Uint64ComparatorImpl uint64comp;
|
|
|
|
return &uint64comp;
|
2014-08-27 19:39:31 +02:00
|
|
|
}
|
|
|
|
|
2020-04-11 01:03:33 +02:00
|
|
|
const Comparator* ComparatorWithU64Ts() {
|
|
|
|
static ComparatorWithU64TsImpl comp_with_u64_ts;
|
|
|
|
return &comp_with_u64_ts;
|
|
|
|
}
|
|
|
|
|
2018-08-23 19:04:10 +02:00
|
|
|
WritableFileWriter* GetWritableFileWriter(WritableFile* wf,
|
|
|
|
const std::string& fname) {
|
2018-11-09 20:17:34 +01:00
|
|
|
std::unique_ptr<WritableFile> file(wf);
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
|
|
|
return new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)),
|
|
|
|
fname, EnvOptions());
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) {
|
2018-11-09 20:17:34 +01:00
|
|
|
std::unique_ptr<RandomAccessFile> file(raf);
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
|
|
|
return new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file),
|
2017-06-29 06:26:03 +02:00
|
|
|
"[test RandomAccessFileReader]");
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
2018-06-21 17:34:24 +02:00
|
|
|
SequentialFileReader* GetSequentialFileReader(SequentialFile* se,
|
|
|
|
const std::string& fname) {
|
2018-11-09 20:17:34 +01:00
|
|
|
std::unique_ptr<SequentialFile> file(se);
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
|
|
|
return new SequentialFileReader(NewLegacySequentialFileWrapper(file), fname);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
Simplify querying of merge results
Summary:
While working on supporting mixing merge operators with
single deletes ( https://reviews.facebook.net/D43179 ),
I realized that returning and dealing with merge results
can be made simpler. Submitting this as a separate diff
because it is not directly related to single deletes.
Before, callers of merge helper had to retrieve the merge
result in one of two ways depending on whether the merge
was successful or not (success = result of merge was single
kTypeValue). For successful merges, the caller could query
the resulting key/value pair and for unsuccessful merges,
the result could be retrieved in the form of two deques of
keys and values. However, with single deletes, a successful merge
does not return a single key/value pair (if merge
operands are merged with a single delete, we have to generate
a value and keep the original single delete around to make
sure that we are not accidentially producing a key overwrite).
In addition, the two existing call sites of the merge
helper were taking the same actions independently from whether
the merge was successful or not, so this patch simplifies that.
Test Plan: make clean all check
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43353
2015-08-18 02:34:38 +02:00
|
|
|
void CorruptKeyType(InternalKey* ikey) {
|
|
|
|
std::string keystr = ikey->Encode().ToString();
|
|
|
|
keystr[keystr.size() - 8] = kTypeLogData;
|
|
|
|
ikey->DecodeFrom(Slice(keystr.data(), keystr.size()));
|
|
|
|
}
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 20:42:56 +02:00
|
|
|
std::string KeyStr(const std::string& user_key, const SequenceNumber& seq,
|
|
|
|
const ValueType& t, bool corrupt) {
|
|
|
|
InternalKey k(user_key, seq, t);
|
|
|
|
if (corrupt) {
|
|
|
|
CorruptKeyType(&k);
|
|
|
|
}
|
|
|
|
return k.Encode().ToString();
|
|
|
|
}
|
|
|
|
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
std::string RandomName(Random* rnd, const size_t len) {
|
|
|
|
std::stringstream ss;
|
|
|
|
for (size_t i = 0; i < len; ++i) {
|
|
|
|
ss << static_cast<char>(rnd->Uniform(26) + 'a');
|
|
|
|
}
|
|
|
|
return ss.str();
|
|
|
|
}
|
|
|
|
|
|
|
|
CompressionType RandomCompressionType(Random* rnd) {
|
2019-06-04 04:47:02 +02:00
|
|
|
auto ret = static_cast<CompressionType>(rnd->Uniform(6));
|
|
|
|
while (!CompressionTypeSupported(ret)) {
|
|
|
|
ret = static_cast<CompressionType>((static_cast<int>(ret) + 1) % 6);
|
|
|
|
}
|
|
|
|
return ret;
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void RandomCompressionTypeVector(const size_t count,
|
|
|
|
std::vector<CompressionType>* types,
|
|
|
|
Random* rnd) {
|
|
|
|
types->clear();
|
|
|
|
for (size_t i = 0; i < count; ++i) {
|
|
|
|
types->emplace_back(RandomCompressionType(rnd));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined) {
|
|
|
|
int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4);
|
|
|
|
switch (random_num) {
|
|
|
|
case 0:
|
|
|
|
return NewFixedPrefixTransform(rnd->Uniform(20) + 1);
|
|
|
|
case 1:
|
|
|
|
return NewCappedPrefixTransform(rnd->Uniform(20) + 1);
|
|
|
|
case 2:
|
|
|
|
return NewNoopTransform();
|
|
|
|
default:
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) {
|
|
|
|
BlockBasedTableOptions opt;
|
|
|
|
opt.cache_index_and_filter_blocks = rnd->Uniform(2);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 19:42:39 +02:00
|
|
|
opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2);
|
2018-06-23 00:14:05 +02:00
|
|
|
opt.pin_top_level_index_and_filter = rnd->Uniform(2);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 05:50:35 +02:00
|
|
|
using IndexType = BlockBasedTableOptions::IndexType;
|
|
|
|
const std::array<IndexType, 4> index_types = {
|
|
|
|
{IndexType::kBinarySearch, IndexType::kHashSearch,
|
|
|
|
IndexType::kTwoLevelIndexSearch, IndexType::kBinarySearchWithFirstKey}};
|
|
|
|
opt.index_type =
|
|
|
|
index_types[rnd->Uniform(static_cast<int>(index_types.size()))];
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
opt.hash_index_allow_collision = rnd->Uniform(2);
|
|
|
|
opt.checksum = static_cast<ChecksumType>(rnd->Uniform(3));
|
|
|
|
opt.block_size = rnd->Uniform(10000000);
|
|
|
|
opt.block_size_deviation = rnd->Uniform(100);
|
|
|
|
opt.block_restart_interval = rnd->Uniform(100);
|
2016-02-05 19:22:37 +01:00
|
|
|
opt.index_block_restart_interval = rnd->Uniform(100);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
opt.whole_key_filtering = rnd->Uniform(2);
|
|
|
|
|
|
|
|
return opt;
|
|
|
|
}
|
|
|
|
|
|
|
|
TableFactory* RandomTableFactory(Random* rnd, int pre_defined) {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4);
|
|
|
|
switch (random_num) {
|
|
|
|
case 0:
|
|
|
|
return NewPlainTableFactory();
|
|
|
|
case 1:
|
|
|
|
return NewCuckooTableFactory();
|
|
|
|
default:
|
|
|
|
return NewBlockBasedTableFactory();
|
|
|
|
}
|
|
|
|
#else
|
2018-04-13 02:55:14 +02:00
|
|
|
(void)rnd;
|
|
|
|
(void)pre_defined;
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
return NewBlockBasedTableFactory();
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
|
|
|
MergeOperator* RandomMergeOperator(Random* rnd) {
|
|
|
|
return new ChanglingMergeOperator(RandomName(rnd, 10));
|
|
|
|
}
|
|
|
|
|
|
|
|
CompactionFilter* RandomCompactionFilter(Random* rnd) {
|
|
|
|
return new ChanglingCompactionFilter(RandomName(rnd, 10));
|
|
|
|
}
|
|
|
|
|
|
|
|
CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd) {
|
|
|
|
return new ChanglingCompactionFilterFactory(RandomName(rnd, 10));
|
|
|
|
}
|
|
|
|
|
|
|
|
void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
|
|
|
|
// boolean options
|
|
|
|
db_opt->advise_random_on_open = rnd->Uniform(2);
|
|
|
|
db_opt->allow_mmap_reads = rnd->Uniform(2);
|
|
|
|
db_opt->allow_mmap_writes = rnd->Uniform(2);
|
2016-12-22 21:51:29 +01:00
|
|
|
db_opt->use_direct_reads = rnd->Uniform(2);
|
2017-04-13 22:07:33 +02:00
|
|
|
db_opt->use_direct_io_for_flush_and_compaction = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
db_opt->create_if_missing = rnd->Uniform(2);
|
|
|
|
db_opt->create_missing_column_families = rnd->Uniform(2);
|
|
|
|
db_opt->enable_thread_tracking = rnd->Uniform(2);
|
|
|
|
db_opt->error_if_exists = rnd->Uniform(2);
|
|
|
|
db_opt->is_fd_close_on_exec = rnd->Uniform(2);
|
|
|
|
db_opt->paranoid_checks = rnd->Uniform(2);
|
|
|
|
db_opt->skip_log_error_on_recovery = rnd->Uniform(2);
|
|
|
|
db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
|
Add an option to prevent DB::Open() from querying sizes of all sst files (#6353)
Summary:
When paranoid_checks is on, DBImpl::CheckConsistency() iterates over all sst files and calls Env::GetFileSize() for each of them. As far as I could understand, this is pretty arbitrary and doesn't affect correctness - if filesystem doesn't corrupt fsynced files, the file sizes will always match; if it does, it may as well corrupt contents as well as sizes, and rocksdb doesn't check contents on open.
If there are thousands of sst files, getting all their sizes takes a while. If, on top of that, Env is overridden to use some remote storage instead of local filesystem, it can be *really* slow and overload the remote storage service. This PR adds an option to not do GetFileSize(); instead it does GetChildren() for parent directory to check that all the expected sst files are at least present, but doesn't check their sizes.
We can't just disable paranoid_checks instead because paranoid_checks do a few other important things: make the DB read-only on write errors, print error messages on read errors, etc.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6353
Test Plan: ran the added sanity check unit test. Will try it out in a LogDevice test cluster where the GetFileSize() calls are causing a lot of trouble.
Differential Revision: D19656425
Pulled By: al13n321
fbshipit-source-id: c2c421b367633033760d1f56747bad206d1fbf82
2020-02-04 10:24:29 +01:00
|
|
|
db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
db_opt->use_adaptive_mutex = rnd->Uniform(2);
|
|
|
|
db_opt->use_fsync = rnd->Uniform(2);
|
|
|
|
db_opt->recycle_log_file_num = rnd->Uniform(2);
|
2016-11-02 23:22:13 +01:00
|
|
|
db_opt->avoid_flush_during_recovery = rnd->Uniform(2);
|
|
|
|
db_opt->avoid_flush_during_shutdown = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
|
|
|
|
// int options
|
|
|
|
db_opt->max_background_compactions = rnd->Uniform(100);
|
|
|
|
db_opt->max_background_flushes = rnd->Uniform(100);
|
|
|
|
db_opt->max_file_opening_threads = rnd->Uniform(100);
|
|
|
|
db_opt->max_open_files = rnd->Uniform(100);
|
|
|
|
db_opt->table_cache_numshardbits = rnd->Uniform(100);
|
|
|
|
|
|
|
|
// size_t options
|
|
|
|
db_opt->db_write_buffer_size = rnd->Uniform(10000);
|
|
|
|
db_opt->keep_log_file_num = rnd->Uniform(10000);
|
|
|
|
db_opt->log_file_time_to_roll = rnd->Uniform(10000);
|
|
|
|
db_opt->manifest_preallocation_size = rnd->Uniform(10000);
|
|
|
|
db_opt->max_log_file_size = rnd->Uniform(10000);
|
|
|
|
|
|
|
|
// std::string options
|
|
|
|
db_opt->db_log_dir = "path/to/db_log_dir";
|
|
|
|
db_opt->wal_dir = "path/to/wal_dir";
|
|
|
|
|
|
|
|
// uint32_t options
|
|
|
|
db_opt->max_subcompactions = rnd->Uniform(100000);
|
|
|
|
|
|
|
|
// uint64_t options
|
|
|
|
static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX);
|
|
|
|
db_opt->WAL_size_limit_MB = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->WAL_ttl_seconds = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->bytes_per_sync = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->delayed_write_rate = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->delete_obsolete_files_period_micros = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->max_manifest_file_size = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->max_total_wal_size = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->wal_bytes_per_sync = uint_max + rnd->Uniform(100000);
|
|
|
|
|
|
|
|
// unsigned int options
|
|
|
|
db_opt->stats_dump_period_sec = rnd->Uniform(100000);
|
|
|
|
}
|
|
|
|
|
2019-06-04 04:47:02 +02:00
|
|
|
void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
|
|
|
|
Random* rnd) {
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4));
|
|
|
|
|
|
|
|
// boolean options
|
2016-04-14 22:56:29 +02:00
|
|
|
cf_opt->report_bg_io_stats = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
cf_opt->disable_auto_compactions = rnd->Uniform(2);
|
|
|
|
cf_opt->inplace_update_support = rnd->Uniform(2);
|
|
|
|
cf_opt->level_compaction_dynamic_level_bytes = rnd->Uniform(2);
|
|
|
|
cf_opt->optimize_filters_for_hits = rnd->Uniform(2);
|
|
|
|
cf_opt->paranoid_file_checks = rnd->Uniform(2);
|
|
|
|
cf_opt->purge_redundant_kvs_while_flush = rnd->Uniform(2);
|
2016-10-08 02:21:45 +02:00
|
|
|
cf_opt->force_consistency_checks = rnd->Uniform(2);
|
2017-10-20 00:19:20 +02:00
|
|
|
cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2);
|
2019-02-19 21:12:25 +01:00
|
|
|
cf_opt->memtable_whole_key_filtering = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
|
|
|
|
// double options
|
|
|
|
cf_opt->hard_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13;
|
|
|
|
cf_opt->soft_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13;
|
2016-06-04 02:02:10 +02:00
|
|
|
cf_opt->memtable_prefix_bloom_size_ratio =
|
|
|
|
static_cast<double>(rnd->Uniform(10000)) / 20000.0;
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
|
|
|
|
// int options
|
|
|
|
cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100);
|
|
|
|
cf_opt->level0_slowdown_writes_trigger = rnd->Uniform(100);
|
|
|
|
cf_opt->level0_stop_writes_trigger = rnd->Uniform(100);
|
|
|
|
cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100);
|
|
|
|
cf_opt->max_mem_compaction_level = rnd->Uniform(100);
|
|
|
|
cf_opt->max_write_buffer_number = rnd->Uniform(100);
|
|
|
|
cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100);
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 22:54:09 +02:00
|
|
|
cf_opt->max_write_buffer_size_to_maintain = rnd->Uniform(10000);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100);
|
|
|
|
cf_opt->num_levels = rnd->Uniform(100);
|
|
|
|
cf_opt->target_file_size_multiplier = rnd->Uniform(100);
|
|
|
|
|
2016-09-14 06:12:43 +02:00
|
|
|
// vector int options
|
|
|
|
cf_opt->max_bytes_for_level_multiplier_additional.resize(cf_opt->num_levels);
|
|
|
|
for (int i = 0; i < cf_opt->num_levels; i++) {
|
|
|
|
cf_opt->max_bytes_for_level_multiplier_additional[i] = rnd->Uniform(100);
|
|
|
|
}
|
|
|
|
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
// size_t options
|
|
|
|
cf_opt->arena_block_size = rnd->Uniform(10000);
|
|
|
|
cf_opt->inplace_update_num_locks = rnd->Uniform(10000);
|
2020-01-11 01:51:34 +01:00
|
|
|
cf_opt->max_successive_merges = rnd->Uniform(10000);
|
2016-07-27 03:05:30 +02:00
|
|
|
cf_opt->memtable_huge_page_size = rnd->Uniform(10000);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
cf_opt->write_buffer_size = rnd->Uniform(10000);
|
|
|
|
|
|
|
|
// uint32_t options
|
|
|
|
cf_opt->bloom_locality = rnd->Uniform(10000);
|
|
|
|
cf_opt->max_bytes_for_level_base = rnd->Uniform(10000);
|
|
|
|
|
|
|
|
// uint64_t options
|
|
|
|
static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX);
|
2019-06-04 04:47:02 +02:00
|
|
|
cf_opt->ttl =
|
|
|
|
db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0;
|
|
|
|
cf_opt->periodic_compaction_seconds =
|
|
|
|
db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0;
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000);
|
|
|
|
cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000);
|
2016-06-17 01:02:52 +02:00
|
|
|
cf_opt->max_compaction_bytes =
|
|
|
|
cf_opt->target_file_size_base * rnd->Uniform(100);
|
2017-10-20 00:19:20 +02:00
|
|
|
cf_opt->compaction_options_fifo.max_table_files_size =
|
|
|
|
uint_max + rnd->Uniform(10000);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 15:52:43 +01:00
|
|
|
|
|
|
|
// unsigned int options
|
|
|
|
cf_opt->rate_limit_delay_max_milliseconds = rnd->Uniform(10000);
|
|
|
|
|
|
|
|
// pointer typed options
|
|
|
|
cf_opt->prefix_extractor.reset(RandomSliceTransform(rnd));
|
|
|
|
cf_opt->table_factory.reset(RandomTableFactory(rnd));
|
|
|
|
cf_opt->merge_operator.reset(RandomMergeOperator(rnd));
|
|
|
|
if (cf_opt->compaction_filter) {
|
|
|
|
delete cf_opt->compaction_filter;
|
|
|
|
}
|
|
|
|
cf_opt->compaction_filter = RandomCompactionFilter(rnd);
|
|
|
|
cf_opt->compaction_filter_factory.reset(RandomCompactionFilterFactory(rnd));
|
|
|
|
|
|
|
|
// custom typed options
|
|
|
|
cf_opt->compression = RandomCompressionType(rnd);
|
|
|
|
RandomCompressionTypeVector(cf_opt->num_levels,
|
|
|
|
&cf_opt->compression_per_level, rnd);
|
|
|
|
}
|
|
|
|
|
2016-09-08 00:41:54 +02:00
|
|
|
Status DestroyDir(Env* env, const std::string& dir) {
|
|
|
|
Status s;
|
|
|
|
if (env->FileExists(dir).IsNotFound()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
std::vector<std::string> files_in_dir;
|
|
|
|
s = env->GetChildren(dir, &files_in_dir);
|
|
|
|
if (s.ok()) {
|
|
|
|
for (auto& file_in_dir : files_in_dir) {
|
|
|
|
if (file_in_dir == "." || file_in_dir == "..") {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
s = env->DeleteFile(dir + "/" + file_in_dir);
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
s = env->DeleteDir(dir);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2018-11-13 20:14:41 +01:00
|
|
|
bool IsDirectIOSupported(Env* env, const std::string& dir) {
|
|
|
|
EnvOptions env_options;
|
|
|
|
env_options.use_mmap_writes = false;
|
|
|
|
env_options.use_direct_writes = true;
|
|
|
|
std::string tmp = TempFileName(dir, 999);
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
std::unique_ptr<WritableFile> file;
|
|
|
|
s = env->NewWritableFile(tmp, &file, env_options);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = env->DeleteFile(tmp);
|
|
|
|
}
|
|
|
|
return s.ok();
|
|
|
|
}
|
|
|
|
|
2019-07-09 23:48:07 +02:00
|
|
|
size_t GetLinesCount(const std::string& fname, const std::string& pattern) {
|
|
|
|
std::stringstream ssbuf;
|
|
|
|
std::string line;
|
|
|
|
size_t count = 0;
|
|
|
|
|
|
|
|
std::ifstream inFile(fname.c_str());
|
|
|
|
ssbuf << inFile.rdbuf();
|
|
|
|
|
|
|
|
while (getline(ssbuf, line)) {
|
|
|
|
if (line.find(pattern) != std::string::npos) {
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
2020-04-25 08:58:13 +02:00
|
|
|
void SetupSyncPointsToMockDirectIO() {
|
|
|
|
#if !defined(NDEBUG) && !defined(OS_MACOSX) && !defined(OS_WIN) && \
|
|
|
|
!defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD)
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"NewWritableFile:O_DIRECT", [&](void* arg) {
|
|
|
|
int* val = static_cast<int*>(arg);
|
|
|
|
*val &= ~O_DIRECT;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"NewRandomAccessFile:O_DIRECT", [&](void* arg) {
|
|
|
|
int* val = static_cast<int*>(arg);
|
|
|
|
*val &= ~O_DIRECT;
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2011-10-31 18:22:06 +01:00
|
|
|
} // namespace test
|
2020-02-20 21:07:53 +01:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|