2013-10-16 14:59:46 -07:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
2012-10-03 09:58:45 -07:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
2013-02-23 11:11:16 -08:00
|
|
|
//
|
2013-08-13 13:58:02 -07:00
|
|
|
// The test uses an array to compare against values written to the database.
|
|
|
|
// Keys written to the array are in 1:1 correspondence to the actual values in
|
|
|
|
// the database according to the formula in the function GenerateValue.
|
|
|
|
|
|
|
|
// Space is reserved in the array from 0 to FLAGS_max_key and values are
|
|
|
|
// randomly written/deleted/read from those positions. During verification we
|
|
|
|
// compare all the positions in the array. To shorten/elongate the running
|
|
|
|
// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread,
|
|
|
|
// (sometimes also FLAGS_threads).
|
|
|
|
//
|
|
|
|
// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
|
|
|
|
// different behavior. See comment of the flag for details.
|
2013-02-20 17:37:13 -08:00
|
|
|
|
2014-05-08 17:25:13 -07:00
|
|
|
#ifndef GFLAGS
|
|
|
|
#include <cstdio>
|
|
|
|
// Fallback entry point used when GFLAGS is not defined: writes an install
// hint to stderr and reports failure through the exit status.
int main() {
  fputs("Please install gflags to run rocksdb tools\n", stderr);
  return 1;
}
|
|
|
|
#else
|
|
|
|
|
2014-12-04 11:59:29 -08:00
|
|
|
#define __STDC_FORMAT_MACROS
|
|
|
|
#include <inttypes.h>
|
2012-10-03 09:58:45 -07:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
2015-05-29 13:17:49 -07:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <chrono>
|
2014-09-02 13:21:59 -07:00
|
|
|
#include <exception>
|
2015-05-29 13:17:49 -07:00
|
|
|
#include <thread>
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
#include <gflags/gflags.h>
|
2012-10-03 09:58:45 -07:00
|
|
|
#include "db/db_impl.h"
|
|
|
|
#include "db/version_set.h"
|
2015-08-11 11:46:15 -07:00
|
|
|
#include "hdfs/env_hdfs.h"
|
|
|
|
#include "port/port.h"
|
2013-08-23 08:38:13 -07:00
|
|
|
#include "rocksdb/cache.h"
|
|
|
|
#include "rocksdb/env.h"
|
2014-01-24 16:15:05 -08:00
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "rocksdb/slice_transform.h"
|
2015-08-11 11:46:15 -07:00
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/utilities/db_ttl.h"
|
|
|
|
#include "rocksdb/write_batch.h"
|
2013-08-14 16:58:36 -07:00
|
|
|
#include "util/coding.h"
|
2015-08-11 11:46:15 -07:00
|
|
|
#include "util/compression.h"
|
2012-10-03 09:58:45 -07:00
|
|
|
#include "util/crc32c.h"
|
|
|
|
#include "util/histogram.h"
|
2015-08-11 11:46:15 -07:00
|
|
|
#include "util/logging.h"
|
2012-10-03 09:58:45 -07:00
|
|
|
#include "util/mutexlock.h"
|
|
|
|
#include "util/random.h"
|
2015-08-11 11:46:15 -07:00
|
|
|
#include "util/string_util.h"
|
2012-10-03 09:58:45 -07:00
|
|
|
#include "util/testutil.h"
|
Benchmarking for Merge Operator
Summary:
Updated db_bench and utilities/merge_operators.h to allow for dynamic benchmarking
of merge operators in db_bench. Added a new test (--benchmarks=mergerandom), which performs
a bunch of random Merge() operations over random keys. Also added a "--merge_operator=" flag
so that the tester can easily benchmark different merge operators. Currently supports
the PutOperator and UInt64Add operator. Support for stringappend or list append may come later.
Test Plan:
1. make db_bench
2. Test the PutOperator (simulating Put) as follows:
./db_bench --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom --merge_operator=put
--threads=2
3. Test the UInt64AddOperator (simulating numeric addition) similarly:
./db_bench --value_size=8 --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom
--merge_operator=uint64add --threads=2
Reviewers: haobo, dhruba, zshao, MarkCallaghan
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11535
2013-08-15 17:13:07 -07:00
|
|
|
#include "utilities/merge_operators.h"
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2014-05-08 17:25:13 -07:00
|
|
|
using GFLAGS::ParseCommandLineFlags;
|
|
|
|
using GFLAGS::RegisterFlagValidator;
|
|
|
|
using GFLAGS::SetUsageMessage;
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2014-05-08 17:25:13 -07:00
|
|
|
static const long KB = 1024;
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
// Flag validator for uint64 flags whose value must fit in 32 bits.
//
// flagname: name of the flag being validated (used only in the message).
// value:    value supplied for the flag.
//
// Returns true when `value` does not exceed the uint32_t maximum; otherwise
// prints a diagnostic to stderr and returns false.
static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  if (value > std::numeric_limits<uint32_t>::max()) {
    // Print with PRIu64 so the full 64-bit value is reported; casting to
    // unsigned long would truncate it on platforms where long is 32 bits.
    fprintf(stderr, "Invalid value for --%s: %" PRIu64 ", overflow\n",
            flagname, value);
    return false;
  }
  return true;
}
|
hints for narrowing down FindFile range and avoiding checking irrelevant L0 files
Summary:
The file tree structure in Version is prebuilt and the range of each file is known.
On the Get() code path, we do binary search in FindFile() by comparing
target key with each file's largest key and also check the range for each L0 file.
With some pre-calculated knowledge, each key comparison that has been done can serve
as a hint to narrow down further searches:
(1) If a key falls within a L0 file's range, we can safely skip the next
file if its range does not overlap with the current one.
(2) If a key falls within a file's range in level L0 - Ln-1, we should only
need to binary search in the next level for files that overlap with the current one.
(1) will be able to skip some files depending on the key distribution.
(2) can greatly reduce the range of binary search, especially for bottom
levels, given that one file most likely only overlaps with N files from
the level below (where N is max_bytes_for_level_multiplier). So on level
L, we will only look at ~N files instead of N^L files.
Some initial results: measured with 500M key DB, when write is light (10k/s = 1.2M/s), this
improves QPS ~7% on top of blocked bloom. When write is heavier (80k/s =
9.6M/s), it gives us ~13% improvement.
Test Plan: make all check
Reviewers: haobo, igor, dhruba, sdong, yhchiang
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D17205
2014-04-21 09:10:12 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_uint64(seed, 2341234, "Seed for PRNG");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_seed_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
DEFINE_int64(max_key, 1 * KB* KB,
|
2013-10-24 07:43:14 -07:00
|
|
|
"Max number of key/values to place in database");
|
2013-04-08 12:35:40 -07:00
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
DEFINE_int32(column_families, 10, "Number of column families");
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
// TODO(noetzli) Add support for single deletes
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(test_batches_snapshots, false,
|
2015-07-31 14:11:43 -07:00
|
|
|
"If set, the test uses MultiGet(), MultiPut() and MultiDelete()"
|
2013-10-24 07:43:14 -07:00
|
|
|
" which read/write/delete multiple keys in a batch. In this mode,"
|
|
|
|
" we do not verify db content by comparing the content with the "
|
|
|
|
"pre-allocated array. Instead, we do partial verification inside"
|
|
|
|
" MultiGet() by checking various values in a batch. Benefit of"
|
|
|
|
" this mode:\n"
|
|
|
|
"\t(a) No need to acquire mutexes during writes (less cache "
|
|
|
|
"flushes in multi-core leading to speed up)\n"
|
|
|
|
"\t(b) No long validation at the end (more speed up)\n"
|
|
|
|
"\t(c) Test snapshot and atomicity of batch writes");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(threads, 32, "Number of concurrent threads to run.");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(ttl, -1,
|
|
|
|
"Opens the db with this ttl value if this is not -1. "
|
|
|
|
"Carefully specify a large value such that verifications on "
|
|
|
|
"deleted values don't fail");
|
2012-10-19 14:00:53 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(value_size_mult, 8,
|
|
|
|
"Size of value will be this number times rand_int(1,3) bytes");
|
2013-09-09 16:06:10 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(verify_before_write, false, "Verify before write");
|
2012-10-19 14:00:53 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(histogram, false, "Print histogram of operation timings");
|
2013-06-13 22:09:08 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(destroy_db_initially, true,
|
|
|
|
"Destroys the database dir before start if this is true");
|
2013-09-09 16:06:10 -07:00
|
|
|
|
2014-03-19 09:58:41 -07:00
|
|
|
DEFINE_bool(verbose, false, "Verbose");
|
|
|
|
|
|
|
|
DEFINE_bool(progress_reports, true,
|
|
|
|
"If true, db_stress will report number of finished operations");
|
2013-09-09 16:06:10 -07:00
|
|
|
|
2014-12-02 12:09:20 -08:00
|
|
|
DEFINE_uint64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size,
|
|
|
|
"Number of bytes to buffer in all memtables before compacting");
|
|
|
|
|
2014-11-11 16:47:22 -05:00
|
|
|
DEFINE_int32(write_buffer_size,
|
|
|
|
static_cast<int32_t>(rocksdb::Options().write_buffer_size),
|
2013-10-24 07:43:14 -07:00
|
|
|
"Number of bytes to buffer in memtable before compacting");
|
2013-09-09 16:06:10 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(max_write_buffer_number,
|
|
|
|
rocksdb::Options().max_write_buffer_number,
|
|
|
|
"The number of in-memory memtables. "
|
|
|
|
"Each memtable is of size FLAGS_write_buffer_size.");
|
2013-09-09 16:06:10 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(min_write_buffer_number_to_merge,
|
|
|
|
rocksdb::Options().min_write_buffer_number_to_merge,
|
|
|
|
"The minimum number of write buffers that will be merged together "
|
|
|
|
"before writing to storage. This is cheap because it is an "
|
|
|
|
"in-memory merge. If this feature is not enabled, then all these "
|
|
|
|
"write buffers are flushed to L0 as separate files and this "
|
|
|
|
"increases read amplification because a get request has to check "
|
|
|
|
"in all of these files. Also, an in-memory merge may result in "
|
|
|
|
"writing less data to storage if there are duplicate records in"
|
|
|
|
" each of these individual write buffers.");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to somehow keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-28 16:34:24 -07:00
|
|
|
DEFINE_int32(max_write_buffer_number_to_maintain,
|
|
|
|
rocksdb::Options().max_write_buffer_number_to_maintain,
|
|
|
|
"The total maximum number of write buffers to maintain in memory "
|
|
|
|
"including copies of buffers that have already been flushed. "
|
|
|
|
"Unlike max_write_buffer_number, this parameter does not affect "
|
|
|
|
"flushing. This controls the minimum amount of write history "
|
|
|
|
"that will be available in memory for conflict checking when "
|
|
|
|
"Transactions are used. If this value is too low, some "
|
|
|
|
"transactions may fail at commit time due to not being able to "
|
|
|
|
"determine whether there were any write conflicts. Setting this "
|
|
|
|
"value to 0 will cause write buffers to be freed immediately "
|
|
|
|
"after they are flushed. If this value is set to -1, "
|
|
|
|
"'max_write_buffer_number' will be used.");
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(open_files, rocksdb::Options().max_open_files,
|
|
|
|
"Maximum number of files to keep open at the same time "
|
|
|
|
"(use default if == 0)");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-09-01 23:23:40 -07:00
|
|
|
DEFINE_int64(compressed_cache_size, -1,
|
|
|
|
"Number of bytes to use as a cache of compressed data."
|
|
|
|
" Negative means use default settings.");
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(compaction_style, rocksdb::Options().compaction_style, "");
|
2012-11-09 13:04:12 -08:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(level0_file_num_compaction_trigger,
|
|
|
|
rocksdb::Options().level0_file_num_compaction_trigger,
|
|
|
|
"Level0 compaction start trigger");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(level0_slowdown_writes_trigger,
|
|
|
|
rocksdb::Options().level0_slowdown_writes_trigger,
|
|
|
|
"Number of files in level-0 that will slow down writes");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(level0_stop_writes_trigger,
|
|
|
|
rocksdb::Options().level0_stop_writes_trigger,
|
|
|
|
"Number of files in level-0 that will trigger put stop.");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2014-11-11 16:47:22 -05:00
|
|
|
DEFINE_int32(block_size,
|
|
|
|
static_cast<int32_t>(rocksdb::BlockBasedTableOptions().block_size),
|
2013-10-24 07:43:14 -07:00
|
|
|
"Number of bytes in a block.");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(max_background_compactions,
|
|
|
|
rocksdb::Options().max_background_compactions,
|
|
|
|
"The maximum number of concurrent background compactions "
|
|
|
|
"that can occur in parallel.");
|
2013-06-17 16:13:32 -07:00
|
|
|
|
2014-06-02 10:12:41 -07:00
|
|
|
DEFINE_int32(compaction_thread_pool_adjust_interval, 0,
|
|
|
|
"The interval (in milliseconds) to adjust compaction thread pool "
|
|
|
|
"size. Don't change it periodically if the value is 0.");
|
|
|
|
|
2015-04-25 18:14:27 +09:00
|
|
|
DEFINE_int32(compaction_thread_pool_variations, 2,
|
2015-07-31 14:11:43 -07:00
|
|
|
"Range of background thread pool size variations when adjusted "
|
2014-06-02 10:12:41 -07:00
|
|
|
"periodically.");
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes,
|
|
|
|
"The maximum number of concurrent background flushes "
|
|
|
|
"that can occur in parallel.");
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
|
|
|
|
" compaction in universal style");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files to "
|
|
|
|
"compact in universal style compaction");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
|
|
|
|
" in universal style compaction");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(universal_max_size_amplification_percent, 0,
|
|
|
|
"The max size amplification for universal style compaction");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
DEFINE_int32(clear_column_family_one_in, 1000000,
|
|
|
|
"With a chance of 1/N, delete a column family and then recreate "
|
|
|
|
"it again. If N == 0, never drop/create column families. "
|
|
|
|
"When test_batches_snapshots is true, this flag has no effect");
|
|
|
|
|
2014-10-27 12:11:16 -07:00
|
|
|
DEFINE_int32(set_options_one_in, 0,
|
|
|
|
"With a chance of 1/N, change some random options");
|
|
|
|
|
2014-10-31 12:02:14 -07:00
|
|
|
DEFINE_int32(set_in_place_one_in, 0,
|
|
|
|
"With a chance of 1/N, toggle in place support option");
|
|
|
|
|
2015-09-15 14:41:00 +03:00
|
|
|
DEFINE_int64(cache_size, 2LL * KB * KB * KB,
|
2013-10-24 07:43:14 -07:00
|
|
|
"Number of bytes to use as a cache of uncompressed data.");
|
2013-04-04 23:49:43 -07:00
|
|
|
|
2015-08-18 11:06:23 -07:00
|
|
|
DEFINE_uint64(subcompactions, 1,
|
|
|
|
"Maximum number of subcompactions to divide L0-L1 compactions "
|
|
|
|
"into.");
|
|
|
|
static const bool FLAGS_subcompactions_dummy __attribute__((unused)) =
|
|
|
|
RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
// Flag validator that accepts zero and positive values. A negative `value`
// is reported on stderr together with `flagname` and rejected (false).
static bool ValidateInt32Positive(const char* flagname, int32_t value) {
  const bool non_negative = (value >= 0);
  if (!non_negative) {
    fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n",
            flagname, value);
  }
  return non_negative;
}
|
|
|
|
DEFINE_int32(reopen, 10, "Number of times database reopens");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_reopen_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
|
|
|
|
"Negative means use default settings.");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2014-09-08 10:37:05 -07:00
|
|
|
// Help text is built from adjacent string literals, which the compiler
// concatenates verbatim; the first literal therefore needs a trailing space.
DEFINE_bool(use_block_based_filter, false, "use block based filter "
            "instead of full filter for block based table");
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_string(db, "", "Use the db with the following name.");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(verify_checksum, false,
|
|
|
|
"Verify checksum for every block read from storage");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
|
|
|
|
"Allow reads to occur via mmap-ing files");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
// Database statistics
|
|
|
|
static std::shared_ptr<rocksdb::Statistics> dbstats;
|
|
|
|
DEFINE_bool(statistics, false, "Create database statistics");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(sync, false, "Sync all writes to disk");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(disable_data_sync, false,
|
|
|
|
"If true, do not wait until data is synced to disk.");
|
2012-11-07 15:35:08 -08:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
|
2013-08-14 16:58:36 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(kill_random_test, 0,
|
|
|
|
"If non-zero, kill at various points in source code with "
|
|
|
|
"probability 1/this");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_kill_random_test, &ValidateInt32Positive);
|
2013-10-24 07:43:14 -07:00
|
|
|
extern int rocksdb_kill_odds;
|
2013-08-14 16:58:36 -07:00
|
|
|
|
2015-10-14 14:08:50 -07:00
|
|
|
DEFINE_string(kill_prefix_blacklist, "",
|
|
|
|
"If non-empty, kill points with prefix in the list given will be"
|
|
|
|
" skipped. Items are comma-separated.");
|
|
|
|
extern std::vector<std::string> rocksdb_kill_prefix_blacklist;
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(target_file_size_base, 64 * KB,
|
|
|
|
"Target level-1 file size for compaction");
|
2013-09-19 16:47:24 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(target_file_size_multiplier, 1,
|
2015-07-31 14:11:43 -07:00
|
|
|
"A multiplier to compute target level-N file size (N >= 2)");
|
2013-09-19 16:47:24 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1");
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(max_bytes_for_level_multiplier, 2,
|
|
|
|
"A multiplier to compute max bytes for level-N (N >= 2)");
|
2012-10-19 14:00:53 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
// Flag validator for percentage flags: accepts values in [0, 100].
// Anything outside that range is reported on stderr and rejected (false).
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  if (0 <= value && value <= 100) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n",
          flagname, value);
  return false;
}
|
|
|
|
DEFINE_int32(readpercent, 10,
|
|
|
|
"Ratio of reads to total workload (expressed as a percentage)");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_readpercent_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
|
2013-10-24 07:43:14 -07:00
|
|
|
|
|
|
|
DEFINE_int32(prefixpercent, 20,
|
|
|
|
"Ratio of prefix iterators to total workload (expressed as a"
|
|
|
|
" percentage)");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_prefixpercent_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
|
2013-10-24 07:43:14 -07:00
|
|
|
|
|
|
|
DEFINE_int32(writepercent, 45,
|
2015-07-31 14:11:43 -07:00
|
|
|
"Ratio of writes to total workload (expressed as a percentage)");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_writepercent_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
|
2013-10-24 07:43:14 -07:00
|
|
|
|
|
|
|
DEFINE_int32(delpercent, 15,
|
|
|
|
"Ratio of deletes to total workload (expressed as a percentage)");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_delpercent_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
|
2013-10-24 07:43:14 -07:00
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
// The help text literals are concatenated verbatim, so only one of them may
// carry the separating space (previously both did, yielding a double space).
DEFINE_int32(nooverwritepercent, 60,
             "Ratio of keys without overwrite to total workload (expressed as "
             "a percentage)");
|
|
|
|
static const bool FLAGS_nooverwritepercent_dummy __attribute__((__unused__)) =
|
|
|
|
RegisterFlagValidator(&FLAGS_nooverwritepercent, &ValidateInt32Percent);
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
|
|
|
|
" (expressed as a percentage)");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_iterpercent_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
|
2013-10-24 07:43:14 -07:00
|
|
|
|
|
|
|
DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
|
2013-10-24 07:43:14 -07:00
|
|
|
|
2014-04-09 21:17:14 -07:00
|
|
|
namespace {
|
2013-10-24 07:43:14 -07:00
|
|
|
enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
|
|
|
|
assert(ctype);
|
|
|
|
|
|
|
|
if (!strcasecmp(ctype, "none"))
|
|
|
|
return rocksdb::kNoCompression;
|
|
|
|
else if (!strcasecmp(ctype, "snappy"))
|
|
|
|
return rocksdb::kSnappyCompression;
|
|
|
|
else if (!strcasecmp(ctype, "zlib"))
|
|
|
|
return rocksdb::kZlibCompression;
|
|
|
|
else if (!strcasecmp(ctype, "bzip2"))
|
|
|
|
return rocksdb::kBZip2Compression;
|
2014-02-07 18:12:30 -08:00
|
|
|
else if (!strcasecmp(ctype, "lz4"))
|
|
|
|
return rocksdb::kLZ4Compression;
|
|
|
|
else if (!strcasecmp(ctype, "lz4hc"))
|
|
|
|
return rocksdb::kLZ4HCCompression;
|
2015-08-27 15:40:42 -07:00
|
|
|
else if (!strcasecmp(ctype, "zstd"))
|
|
|
|
return rocksdb::kZSTDNotFinalCompression;
|
2013-10-24 07:43:14 -07:00
|
|
|
|
|
|
|
fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
|
|
|
|
return rocksdb::kSnappyCompression; //default value
|
|
|
|
}
|
2015-10-14 14:08:50 -07:00
|
|
|
|
|
|
|
// Splits `src` on ',' and returns the pieces in order.
//
// An empty input yields an empty vector. Otherwise every comma produces a
// boundary, so leading, trailing, or consecutive commas yield empty strings
// (e.g. "a," -> {"a", ""}).
//
// The argument is taken by const reference to avoid copying the string on
// every call; callers passing a std::string or a string literal are
// unaffected.
std::vector<std::string> SplitString(const std::string& src) {
  std::vector<std::string> ret;
  if (src.empty()) {
    return ret;
  }

  size_t pos = 0;
  size_t pos_comma;
  while ((pos_comma = src.find(',', pos)) != std::string::npos) {
    ret.push_back(src.substr(pos, pos_comma - pos));
    pos = pos_comma + 1;
  }
  // Remainder after the last comma (possibly empty).
  ret.push_back(src.substr(pos));
  return ret;
}
|
2014-04-09 21:17:14 -07:00
|
|
|
} // namespace
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_string(compression_type, "snappy",
|
|
|
|
"Algorithm to use to compress the database");
|
|
|
|
static enum rocksdb::CompressionType FLAGS_compression_type_e =
|
2013-10-03 21:49:15 -07:00
|
|
|
rocksdb::kSnappyCompression;
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_string(hdfs, "", "Name of hdfs environment");
|
2012-10-03 09:58:45 -07:00
|
|
|
// posix or hdfs environment
|
2013-10-03 21:49:15 -07:00
|
|
|
static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
|
|
|
|
static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
|
2014-02-27 12:13:48 -08:00
|
|
|
static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_log2_keys_per_lock, &ValidateUint32Range);
|
2013-03-06 12:54:55 -08:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
|
|
|
|
" the delete if key not present");
|
2013-06-17 13:51:12 -07:00
|
|
|
|
2014-10-31 12:02:14 -07:00
|
|
|
DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable");
|
|
|
|
|
2013-08-22 23:10:02 -07:00
|
|
|
// Memtable representation choices. StringToRepFactory produces these from
// the names "skip_list", "prefix_hash" and "vector" respectively.
enum RepFactory { kSkipList, kHashSkipList, kVectorRep };
|
2014-04-09 21:17:14 -07:00
|
|
|
|
|
|
|
namespace {
|
2013-10-24 07:43:14 -07:00
|
|
|
enum RepFactory StringToRepFactory(const char* ctype) {
|
|
|
|
assert(ctype);
|
|
|
|
|
|
|
|
if (!strcasecmp(ctype, "skip_list"))
|
|
|
|
return kSkipList;
|
|
|
|
else if (!strcasecmp(ctype, "prefix_hash"))
|
2013-12-03 12:42:15 -08:00
|
|
|
return kHashSkipList;
|
2013-10-24 07:43:14 -07:00
|
|
|
else if (!strcasecmp(ctype, "vector"))
|
|
|
|
return kVectorRep;
|
|
|
|
|
|
|
|
fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
|
|
|
|
return kSkipList;
|
|
|
|
}
|
2014-04-09 21:17:14 -07:00
|
|
|
} // namespace
|
|
|
|
|
2013-08-22 23:10:02 -07:00
|
|
|
static enum RepFactory FLAGS_rep_factory;
|
2014-03-11 13:44:33 -07:00
|
|
|
DEFINE_string(memtablerep, "prefix_hash", "");
|
2013-08-22 23:10:02 -07:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
// Flag validator for a prefix size: accepts values in [0, 8]. Any other value
// is reported on stderr (naming the offending flag) and rejected.
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  const bool in_range = (value >= 0 && value <= 8);
  if (!in_range) {
    fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize <= 8\n",
            flagname, value);
  }
  return in_range;
}
|
2014-03-12 09:31:06 -07:00
|
|
|
DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep");
|
2014-10-27 12:11:16 -07:00
|
|
|
static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
|
2014-05-08 17:25:13 -07:00
|
|
|
RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
|
2013-10-24 07:43:14 -07:00
|
|
|
|
|
|
|
DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge "
|
|
|
|
"that behaves like a Put");
|
2013-08-22 23:10:02 -07:00
|
|
|
|
Benchmarking for Merge Operator
Summary:
Updated db_bench and utilities/merge_operators.h to allow for dynamic benchmarking
of merge operators in db_bench. Added a new test (--benchmarks=mergerandom), which performs
a bunch of random Merge() operations over random keys. Also added a "--merge_operator=" flag
so that the tester can easily benchmark different merge operators. Currently supports
the PutOperator and UInt64Add operator. Support for stringappend or list append may come later.
Test Plan:
1. make db_bench
2. Test the PutOperator (simulating Put) as follows:
./db_bench --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom --merge_operator=put
--threads=2
3. Test the UInt64AddOperator (simulating numeric addition) similarly:
./db_bench --value_size=8 --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom
--merge_operator=uint64add --threads=2
Reviewers: haobo, dhruba, zshao, MarkCallaghan
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11535
2013-08-15 17:13:07 -07:00
|
|
|
|
2013-10-03 21:49:15 -07:00
|
|
|
namespace rocksdb {
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2013-08-14 16:58:36 -07:00
|
|
|
// convert long to a big-endian slice key
|
|
|
|
static std::string Key(long val) {
|
|
|
|
std::string little_endian_key;
|
|
|
|
std::string big_endian_key;
|
|
|
|
PutFixed64(&little_endian_key, val);
|
|
|
|
assert(little_endian_key.size() == sizeof(val));
|
|
|
|
big_endian_key.resize(sizeof(val));
|
|
|
|
for (int i=0; i<(int)sizeof(val); i++) {
|
|
|
|
big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
|
|
|
|
}
|
|
|
|
return big_endian_key;
|
|
|
|
}
|
|
|
|
|
hints for narrowing down FindFile range and avoiding checking irrelevant L0 files
Summary:
The file tree structure in Version is prebuilt and the range of each file is known.
On the Get() code path, we do binary search in FindFile() by comparing
target key with each file's largest key and also check the range for each L0 file.
With some pre-calculated knowledge, each key comparison that has been done can serve
as a hint to narrow down further searches:
(1) If a key falls within a L0 file's range, we can safely skip the next
file if its range does not overlap with the current one.
(2) If a key falls within a file's range in level L0 - Ln-1, we should only
need to binary search in the next level for files that overlap with the current one.
(1) will be able to skip some files depending on the key distribution.
(2) can greatly reduce the range of binary search, especially for bottom
levels, given that one file most likely only overlaps with N files from
the level below (where N is max_bytes_for_level_multiplier). So on level
L, we will only look at ~N files instead of N^L files.
Some initial results: measured with 500M key DB, when write is light (10k/s = 1.2M/s), this
improves QPS ~7% on top of blocked bloom. When write is heavier (80k/s =
9.6M/s), it gives us ~13% improvement.
Test Plan: make all check
Reviewers: haobo, igor, dhruba, sdong, yhchiang
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D17205
2014-04-21 09:10:12 -07:00
|
|
|
// Renders every byte of `str` as two upper-case hex digits, prefixed by "0x".
// An empty input yields just "0x".
static std::string StringToHex(const std::string& str) {
  static const char kHexDigits[] = "0123456789ABCDEF";

  std::string result("0x");
  result.reserve(result.size() + 2 * str.size());
  for (const char ch : str) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    const unsigned char byte = static_cast<unsigned char>(ch);
    result.push_back(kHexDigits[byte >> 4]);
    result.push_back(kHexDigits[byte & 0x0F]);
  }
  return result;
}
|
|
|
|
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
// Forward declaration: SharedState below only stores and returns a
// StressTest*, so the full definition is not needed at this point.
class StressTest;
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
class Stats {
|
|
|
|
private:
|
|
|
|
double start_;
|
|
|
|
double finish_;
|
|
|
|
double seconds_;
|
|
|
|
long done_;
|
2013-08-14 16:58:36 -07:00
|
|
|
long gets_;
|
|
|
|
long prefixes_;
|
2012-10-03 09:58:45 -07:00
|
|
|
long writes_;
|
2012-11-07 15:35:08 -08:00
|
|
|
long deletes_;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
size_t single_deletes_;
|
2013-08-14 16:58:36 -07:00
|
|
|
long iterator_size_sums_;
|
2013-02-20 15:57:27 -08:00
|
|
|
long founds_;
|
2013-09-19 16:47:24 -07:00
|
|
|
long iterations_;
|
2013-02-20 15:57:27 -08:00
|
|
|
long errors_;
|
2012-10-03 09:58:45 -07:00
|
|
|
int next_report_;
|
|
|
|
size_t bytes_;
|
|
|
|
double last_op_finish_;
|
2013-02-15 11:53:17 -08:00
|
|
|
HistogramImpl hist_;
|
2012-10-03 09:58:45 -07:00
|
|
|
|
|
|
|
public:
|
|
|
|
Stats() { }
|
|
|
|
|
|
|
|
void Start() {
|
|
|
|
next_report_ = 100;
|
|
|
|
hist_.Clear();
|
|
|
|
done_ = 0;
|
2013-08-14 16:58:36 -07:00
|
|
|
gets_ = 0;
|
|
|
|
prefixes_ = 0;
|
2012-10-03 09:58:45 -07:00
|
|
|
writes_ = 0;
|
2012-11-07 15:35:08 -08:00
|
|
|
deletes_ = 0;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
single_deletes_ = 0;
|
2013-08-14 16:58:36 -07:00
|
|
|
iterator_size_sums_ = 0;
|
2013-02-20 15:57:27 -08:00
|
|
|
founds_ = 0;
|
2013-09-19 16:47:24 -07:00
|
|
|
iterations_ = 0;
|
2013-02-20 15:57:27 -08:00
|
|
|
errors_ = 0;
|
2012-10-03 09:58:45 -07:00
|
|
|
bytes_ = 0;
|
|
|
|
seconds_ = 0;
|
|
|
|
start_ = FLAGS_env->NowMicros();
|
|
|
|
last_op_finish_ = start_;
|
|
|
|
finish_ = start_;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Merge(const Stats& other) {
|
|
|
|
hist_.Merge(other.hist_);
|
|
|
|
done_ += other.done_;
|
2013-08-14 16:58:36 -07:00
|
|
|
gets_ += other.gets_;
|
|
|
|
prefixes_ += other.prefixes_;
|
2012-10-03 09:58:45 -07:00
|
|
|
writes_ += other.writes_;
|
2012-11-07 15:35:08 -08:00
|
|
|
deletes_ += other.deletes_;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
single_deletes_ += other.single_deletes_;
|
2013-08-14 16:58:36 -07:00
|
|
|
iterator_size_sums_ += other.iterator_size_sums_;
|
2013-02-23 11:11:16 -08:00
|
|
|
founds_ += other.founds_;
|
2013-09-19 16:47:24 -07:00
|
|
|
iterations_ += other.iterations_;
|
2013-02-23 11:11:16 -08:00
|
|
|
errors_ += other.errors_;
|
2012-10-03 09:58:45 -07:00
|
|
|
bytes_ += other.bytes_;
|
|
|
|
seconds_ += other.seconds_;
|
|
|
|
if (other.start_ < start_) start_ = other.start_;
|
|
|
|
if (other.finish_ > finish_) finish_ = other.finish_;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Stop() {
|
|
|
|
finish_ = FLAGS_env->NowMicros();
|
|
|
|
seconds_ = (finish_ - start_) * 1e-6;
|
|
|
|
}
|
|
|
|
|
|
|
|
void FinishedSingleOp() {
|
|
|
|
if (FLAGS_histogram) {
|
|
|
|
double now = FLAGS_env->NowMicros();
|
|
|
|
double micros = now - last_op_finish_;
|
|
|
|
hist_.Add(micros);
|
|
|
|
if (micros > 20000) {
|
2013-03-12 23:20:14 -07:00
|
|
|
fprintf(stdout, "long op: %.1f micros%30s\r", micros, "");
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
last_op_finish_ = now;
|
|
|
|
}
|
|
|
|
|
2014-03-19 09:58:41 -07:00
|
|
|
done_++;
|
2014-03-20 16:52:59 -07:00
|
|
|
if (FLAGS_progress_reports) {
|
2014-03-19 09:58:41 -07:00
|
|
|
if (done_ >= next_report_) {
|
|
|
|
if (next_report_ < 1000) next_report_ += 100;
|
|
|
|
else if (next_report_ < 5000) next_report_ += 500;
|
|
|
|
else if (next_report_ < 10000) next_report_ += 1000;
|
|
|
|
else if (next_report_ < 50000) next_report_ += 5000;
|
|
|
|
else if (next_report_ < 100000) next_report_ += 10000;
|
|
|
|
else if (next_report_ < 500000) next_report_ += 50000;
|
|
|
|
else next_report_ += 100000;
|
|
|
|
fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
|
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-02-20 15:57:27 -08:00
|
|
|
void AddBytesForWrites(int nwrites, size_t nbytes) {
|
|
|
|
writes_ += nwrites;
|
|
|
|
bytes_ += nbytes;
|
|
|
|
}
|
|
|
|
|
2013-02-23 11:11:16 -08:00
|
|
|
void AddGets(int ngets, int nfounds) {
|
|
|
|
founds_ += nfounds;
|
|
|
|
gets_ += ngets;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
2013-08-14 16:58:36 -07:00
|
|
|
void AddPrefixes(int nprefixes, int count) {
|
|
|
|
prefixes_ += nprefixes;
|
|
|
|
iterator_size_sums_ += count;
|
|
|
|
}
|
|
|
|
|
2013-09-19 16:47:24 -07:00
|
|
|
void AddIterations(int n) {
|
|
|
|
iterations_ += n;
|
|
|
|
}
|
|
|
|
|
2013-08-14 16:58:36 -07:00
|
|
|
void AddDeletes(int n) {
|
|
|
|
deletes_ += n;
|
|
|
|
}
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
void AddSingleDeletes(size_t n) { single_deletes_ += n; }
|
|
|
|
|
2013-02-20 15:57:27 -08:00
|
|
|
void AddErrors(int n) {
|
|
|
|
errors_ += n;
|
2012-11-07 15:35:08 -08:00
|
|
|
}
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
void Report(const char* name) {
|
|
|
|
std::string extra;
|
|
|
|
if (bytes_ < 1 || done_ < 1) {
|
|
|
|
fprintf(stderr, "No writes or ops?\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
double elapsed = (finish_ - start_) * 1e-6;
|
|
|
|
double bytes_mb = bytes_ / 1048576.0;
|
|
|
|
double rate = bytes_mb / elapsed;
|
|
|
|
double throughput = (double)done_/elapsed;
|
|
|
|
|
|
|
|
fprintf(stdout, "%-12s: ", name);
|
|
|
|
fprintf(stdout, "%.3f micros/op %ld ops/sec\n",
|
|
|
|
seconds_ * 1e6 / done_, (long)throughput);
|
|
|
|
fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n",
|
|
|
|
"", bytes_mb, rate, (100*writes_)/done_, done_);
|
2013-02-23 11:11:16 -08:00
|
|
|
fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_);
|
2012-11-07 15:35:08 -08:00
|
|
|
fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_);
|
2015-12-08 08:38:21 -08:00
|
|
|
fprintf(stdout, "%-12s: Single deleted %" ROCKSDB_PRIszt " times\n", "",
|
|
|
|
single_deletes_);
|
2013-08-13 13:58:02 -07:00
|
|
|
fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "",
|
|
|
|
gets_, founds_);
|
2013-08-14 16:58:36 -07:00
|
|
|
fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_);
|
|
|
|
fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "",
|
|
|
|
iterator_size_sums_);
|
2013-09-19 16:47:24 -07:00
|
|
|
fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_);
|
2013-02-20 15:57:27 -08:00
|
|
|
fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_);
|
2012-10-03 09:58:45 -07:00
|
|
|
|
|
|
|
if (FLAGS_histogram) {
|
|
|
|
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
|
|
|
|
}
|
|
|
|
fflush(stdout);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// State shared by all concurrent executions of the same benchmark.
|
|
|
|
class SharedState {
|
|
|
|
public:
|
2014-03-11 13:08:48 -07:00
|
|
|
static const uint32_t SENTINEL;
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2014-04-24 06:46:50 -07:00
|
|
|
explicit SharedState(StressTest* stress_test)
|
|
|
|
: cv_(&mu_),
|
2014-11-11 16:47:22 -05:00
|
|
|
seed_(static_cast<uint32_t>(FLAGS_seed)),
|
2014-04-24 06:46:50 -07:00
|
|
|
max_key_(FLAGS_max_key),
|
2014-11-11 16:47:22 -05:00
|
|
|
log2_keys_per_lock_(static_cast<uint32_t>(FLAGS_log2_keys_per_lock)),
|
2014-04-24 06:46:50 -07:00
|
|
|
num_threads_(FLAGS_threads),
|
|
|
|
num_initialized_(0),
|
|
|
|
num_populated_(0),
|
|
|
|
vote_reopen_(0),
|
|
|
|
num_done_(0),
|
|
|
|
start_(false),
|
|
|
|
start_verify_(false),
|
2014-06-02 10:12:41 -07:00
|
|
|
should_stop_bg_thread_(false),
|
|
|
|
bg_thread_finished_(false),
|
2014-04-24 06:46:50 -07:00
|
|
|
stress_test_(stress_test),
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
verification_failure_(false),
|
|
|
|
no_overwrite_ids_(FLAGS_column_families) {
|
|
|
|
// Pick random keys in each column family that will not experience
|
|
|
|
// overwrite
|
|
|
|
|
|
|
|
printf("Choosing random keys with no overwrite\n");
|
|
|
|
Random rnd(seed_);
|
|
|
|
size_t num_no_overwrite_keys = (max_key_ * FLAGS_nooverwritepercent) / 100;
|
|
|
|
for (auto& cf_ids : no_overwrite_ids_) {
|
|
|
|
for (size_t i = 0; i < num_no_overwrite_keys; i++) {
|
|
|
|
size_t rand_key;
|
|
|
|
do {
|
|
|
|
rand_key = rnd.Next() % max_key_;
|
|
|
|
} while (cf_ids.find(rand_key) != cf_ids.end());
|
|
|
|
cf_ids.insert(rand_key);
|
|
|
|
}
|
|
|
|
assert(cf_ids.size() == num_no_overwrite_keys);
|
|
|
|
}
|
|
|
|
|
2013-02-20 15:57:27 -08:00
|
|
|
if (FLAGS_test_batches_snapshots) {
|
|
|
|
fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
|
|
|
|
return;
|
|
|
|
}
|
2014-02-27 12:13:48 -08:00
|
|
|
values_.resize(FLAGS_column_families);
|
|
|
|
|
|
|
|
for (int i = 0; i < FLAGS_column_families; ++i) {
|
|
|
|
values_[i] = std::vector<uint32_t>(max_key_, SENTINEL);
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
2013-02-20 15:57:27 -08:00
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
long num_locks = (max_key_ >> log2_keys_per_lock_);
|
|
|
|
if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
|
2014-02-27 12:13:48 -08:00
|
|
|
num_locks++;
|
|
|
|
}
|
|
|
|
fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
|
|
|
|
key_locks_.resize(FLAGS_column_families);
|
2015-07-01 16:13:49 -07:00
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
for (int i = 0; i < FLAGS_column_families; ++i) {
|
2015-07-01 16:13:49 -07:00
|
|
|
key_locks_[i].resize(num_locks);
|
|
|
|
for (auto& ptr : key_locks_[i]) {
|
|
|
|
ptr.reset(new port::Mutex);
|
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
~SharedState() {}
|
2012-10-03 09:58:45 -07:00
|
|
|
|
|
|
|
port::Mutex* GetMutex() {
|
|
|
|
return &mu_;
|
|
|
|
}
|
|
|
|
|
|
|
|
port::CondVar* GetCondVar() {
|
|
|
|
return &cv_;
|
|
|
|
}
|
|
|
|
|
|
|
|
StressTest* GetStressTest() const {
|
|
|
|
return stress_test_;
|
|
|
|
}
|
|
|
|
|
|
|
|
long GetMaxKey() const {
|
|
|
|
return max_key_;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t GetNumThreads() const {
|
|
|
|
return num_threads_;
|
|
|
|
}
|
|
|
|
|
|
|
|
void IncInitialized() {
|
|
|
|
num_initialized_++;
|
|
|
|
}
|
|
|
|
|
2012-11-07 15:35:08 -08:00
|
|
|
void IncOperated() {
|
2012-10-03 09:58:45 -07:00
|
|
|
num_populated_++;
|
|
|
|
}
|
|
|
|
|
|
|
|
void IncDone() {
|
|
|
|
num_done_++;
|
|
|
|
}
|
|
|
|
|
2012-11-09 13:04:12 -08:00
|
|
|
void IncVotedReopen() {
|
|
|
|
vote_reopen_ = (vote_reopen_ + 1) % num_threads_;
|
|
|
|
}
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
bool AllInitialized() const {
|
|
|
|
return num_initialized_ >= num_threads_;
|
|
|
|
}
|
|
|
|
|
2012-11-07 15:35:08 -08:00
|
|
|
bool AllOperated() const {
|
2012-10-03 09:58:45 -07:00
|
|
|
return num_populated_ >= num_threads_;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AllDone() const {
|
|
|
|
return num_done_ >= num_threads_;
|
|
|
|
}
|
|
|
|
|
2012-11-09 13:04:12 -08:00
|
|
|
bool AllVotedReopen() {
|
|
|
|
return (vote_reopen_ == 0);
|
|
|
|
}
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
void SetStart() {
|
|
|
|
start_ = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetStartVerify() {
|
|
|
|
start_verify_ = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Started() const {
|
|
|
|
return start_;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool VerifyStarted() const {
|
|
|
|
return start_verify_;
|
|
|
|
}
|
|
|
|
|
2014-04-24 09:22:58 -04:00
|
|
|
void SetVerificationFailure() { verification_failure_.store(true); }
|
|
|
|
|
|
|
|
bool HasVerificationFailedYet() { return verification_failure_.load(); }
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
port::Mutex* GetMutexForKey(int cf, long key) {
|
2015-07-01 16:13:49 -07:00
|
|
|
return key_locks_[cf][key >> log2_keys_per_lock_].get();
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
void LockColumnFamily(int cf) {
|
|
|
|
for (auto& mutex : key_locks_[cf]) {
|
2015-07-01 16:13:49 -07:00
|
|
|
mutex->Lock();
|
2014-02-27 12:13:48 -08:00
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
void UnlockColumnFamily(int cf) {
|
|
|
|
for (auto& mutex : key_locks_[cf]) {
|
2015-07-01 16:13:49 -07:00
|
|
|
mutex->Unlock();
|
2014-02-27 12:13:48 -08:00
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
void ClearColumnFamily(int cf) {
|
|
|
|
std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL);
|
2012-11-07 15:35:08 -08:00
|
|
|
}
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
void Put(int cf, long key, uint32_t value_base) {
|
|
|
|
values_[cf][key] = value_base;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
uint32_t Get(int cf, long key) const { return values_[cf][key]; }
|
|
|
|
|
|
|
|
void Delete(int cf, long key) { values_[cf][key] = SENTINEL; }
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
void SingleDelete(int cf, size_t key) { values_[cf][key] = SENTINEL; }
|
|
|
|
|
|
|
|
bool AllowsOverwrite(int cf, size_t key) {
|
|
|
|
return no_overwrite_ids_[cf].find(key) == no_overwrite_ids_[cf].end();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Exists(int cf, size_t key) { return values_[cf][key] != SENTINEL; }
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
uint32_t GetSeed() const { return seed_; }
|
|
|
|
|
2014-06-02 10:12:41 -07:00
|
|
|
void SetShouldStopBgThread() { should_stop_bg_thread_ = true; }
|
|
|
|
|
|
|
|
bool ShoudStopBgThread() { return should_stop_bg_thread_; }
|
|
|
|
|
|
|
|
void SetBgThreadFinish() { bg_thread_finished_ = true; }
|
|
|
|
|
|
|
|
bool BgThreadFinished() const { return bg_thread_finished_; }
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
private:
|
|
|
|
port::Mutex mu_;
|
|
|
|
port::CondVar cv_;
|
|
|
|
const uint32_t seed_;
|
|
|
|
const long max_key_;
|
|
|
|
const uint32_t log2_keys_per_lock_;
|
|
|
|
const int num_threads_;
|
|
|
|
long num_initialized_;
|
|
|
|
long num_populated_;
|
2012-11-09 13:04:12 -08:00
|
|
|
long vote_reopen_;
|
2012-10-03 09:58:45 -07:00
|
|
|
long num_done_;
|
|
|
|
bool start_;
|
|
|
|
bool start_verify_;
|
2014-06-02 10:12:41 -07:00
|
|
|
bool should_stop_bg_thread_;
|
|
|
|
bool bg_thread_finished_;
|
2012-10-03 09:58:45 -07:00
|
|
|
StressTest* stress_test_;
|
2014-04-24 09:22:58 -04:00
|
|
|
std::atomic<bool> verification_failure_;
|
2012-10-03 09:58:45 -07:00
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
// Keys that should not be overwritten
|
|
|
|
std::vector<std::set<size_t> > no_overwrite_ids_;
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
std::vector<std::vector<uint32_t>> values_;
|
2015-07-01 16:13:49 -07:00
|
|
|
// Has to make it owned by a smart ptr as port::Mutex is not copyable
|
|
|
|
// and storing it in the container may require copying depending on the impl.
|
2015-07-13 12:11:05 -07:00
|
|
|
std::vector<std::vector<std::unique_ptr<port::Mutex> > > key_locks_;
|
2012-10-03 09:58:45 -07:00
|
|
|
};
|
|
|
|
|
2014-03-11 13:08:48 -07:00
|
|
|
const uint32_t SharedState::SENTINEL = 0xffffffff;
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
// Per-thread state for concurrent executions of the same benchmark.
|
|
|
|
struct ThreadState {
|
|
|
|
uint32_t tid; // 0..n-1
|
|
|
|
Random rand; // Has different seeds for different threads
|
|
|
|
SharedState* shared;
|
|
|
|
Stats stats;
|
|
|
|
|
2014-11-07 15:04:30 -08:00
|
|
|
ThreadState(uint32_t index, SharedState* _shared)
|
2014-11-08 13:01:31 -08:00
|
|
|
: tid(index), rand(1000 + index + _shared->GetSeed()), shared(_shared) {}
|
2012-10-03 09:58:45 -07:00
|
|
|
};
|
|
|
|
|
2015-05-29 13:17:49 -07:00
|
|
|
class DbStressListener : public EventListener {
|
|
|
|
public:
|
|
|
|
DbStressListener(
|
|
|
|
const std::string& db_name,
|
2015-05-30 13:00:23 -07:00
|
|
|
const std::vector<DbPath>& db_paths) :
|
2015-05-29 13:17:49 -07:00
|
|
|
db_name_(db_name),
|
|
|
|
db_paths_(db_paths),
|
|
|
|
rand_(301) {}
|
|
|
|
virtual ~DbStressListener() {}
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
virtual void OnFlushCompleted(
|
2015-06-05 12:28:51 -07:00
|
|
|
DB* db, const FlushJobInfo& info) override {
|
2015-05-29 13:17:49 -07:00
|
|
|
assert(db);
|
|
|
|
assert(db->GetName() == db_name_);
|
2015-06-05 12:28:51 -07:00
|
|
|
assert(IsValidColumnFamilyName(info.cf_name));
|
|
|
|
VerifyFilePath(info.file_path);
|
2015-05-29 13:17:49 -07:00
|
|
|
// pretending doing some work here
|
|
|
|
std::this_thread::sleep_for(
|
|
|
|
std::chrono::microseconds(rand_.Uniform(5000)));
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual void OnCompactionCompleted(
|
2015-05-29 13:36:13 -07:00
|
|
|
DB *db, const CompactionJobInfo& ci) override {
|
2015-05-29 13:17:49 -07:00
|
|
|
assert(db);
|
|
|
|
assert(db->GetName() == db_name_);
|
|
|
|
assert(IsValidColumnFamilyName(ci.cf_name));
|
|
|
|
assert(ci.input_files.size() + ci.output_files.size() > 0U);
|
|
|
|
for (const auto& file_path : ci.input_files) {
|
|
|
|
VerifyFilePath(file_path);
|
|
|
|
}
|
|
|
|
for (const auto& file_path : ci.output_files) {
|
|
|
|
VerifyFilePath(file_path);
|
|
|
|
}
|
|
|
|
// pretending doing some work here
|
|
|
|
std::this_thread::sleep_for(
|
|
|
|
std::chrono::microseconds(rand_.Uniform(5000)));
|
|
|
|
}
|
|
|
|
|
2015-06-02 14:12:23 -07:00
|
|
|
virtual void OnTableFileCreated(
|
|
|
|
const TableFileCreationInfo& info) override {
|
|
|
|
assert(info.db_name == db_name_);
|
|
|
|
assert(IsValidColumnFamilyName(info.cf_name));
|
|
|
|
VerifyFilePath(info.file_path);
|
|
|
|
assert(info.file_size > 0);
|
|
|
|
assert(info.job_id > 0);
|
|
|
|
assert(info.table_properties.data_size > 0);
|
|
|
|
assert(info.table_properties.raw_key_size > 0);
|
|
|
|
assert(info.table_properties.num_entries > 0);
|
|
|
|
}
|
|
|
|
|
2015-05-29 13:17:49 -07:00
|
|
|
protected:
|
|
|
|
bool IsValidColumnFamilyName(const std::string& cf_name) const {
|
|
|
|
if (cf_name == kDefaultColumnFamilyName) {
|
|
|
|
return true;
|
|
|
|
}
|
2015-05-30 13:00:23 -07:00
|
|
|
// The column family names in the stress tests are numbers.
|
|
|
|
for (size_t i = 0; i < cf_name.size(); ++i) {
|
|
|
|
if (cf_name[i] < '0' || cf_name[i] > '9') {
|
|
|
|
return false;
|
2015-05-29 13:17:49 -07:00
|
|
|
}
|
|
|
|
}
|
2015-05-30 13:00:23 -07:00
|
|
|
return true;
|
2015-05-29 13:17:49 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyFileDir(const std::string& file_dir) {
|
2015-05-29 14:51:21 -07:00
|
|
|
#ifndef NDEBUG
|
2015-05-29 13:17:49 -07:00
|
|
|
if (db_name_ == file_dir) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
for (const auto& db_path : db_paths_) {
|
|
|
|
if (db_path.path == file_dir) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(false);
|
2015-05-29 14:51:21 -07:00
|
|
|
#endif // !NDEBUG
|
2015-05-29 13:17:49 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyFileName(const std::string& file_name) {
|
2015-05-29 14:51:21 -07:00
|
|
|
#ifndef NDEBUG
|
2015-05-29 13:17:49 -07:00
|
|
|
uint64_t file_number;
|
|
|
|
FileType file_type;
|
|
|
|
bool result = ParseFileName(file_name, &file_number, &file_type);
|
|
|
|
assert(result);
|
|
|
|
assert(file_type == kTableFile);
|
2015-05-29 14:51:21 -07:00
|
|
|
#endif // !NDEBUG
|
2015-05-29 13:17:49 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyFilePath(const std::string& file_path) {
|
2015-05-29 14:51:21 -07:00
|
|
|
#ifndef NDEBUG
|
2015-05-29 13:17:49 -07:00
|
|
|
size_t pos = file_path.find_last_of("/");
|
|
|
|
if (pos == std::string::npos) {
|
|
|
|
VerifyFileName(file_path);
|
|
|
|
} else {
|
|
|
|
if (pos > 0) {
|
|
|
|
VerifyFileDir(file_path.substr(0, pos));
|
|
|
|
}
|
|
|
|
VerifyFileName(file_path.substr(pos));
|
|
|
|
}
|
2015-05-29 14:51:21 -07:00
|
|
|
#endif // !NDEBUG
|
2015-05-29 13:17:49 -07:00
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::string db_name_;
|
|
|
|
std::vector<DbPath> db_paths_;
|
|
|
|
Random rand_;
|
|
|
|
};
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
} // namespace
|
|
|
|
|
|
|
|
class StressTest {
|
|
|
|
public:
|
|
|
|
StressTest()
|
|
|
|
: cache_(NewLRUCache(FLAGS_cache_size)),
|
2014-02-27 12:13:48 -08:00
|
|
|
compressed_cache_(FLAGS_compressed_cache_size >= 0
|
|
|
|
? NewLRUCache(FLAGS_compressed_cache_size)
|
|
|
|
: nullptr),
|
2012-10-03 09:58:45 -07:00
|
|
|
filter_policy_(FLAGS_bloom_bits >= 0
|
2014-09-08 10:37:05 -07:00
|
|
|
? FLAGS_use_block_based_filter
|
|
|
|
? NewBloomFilterPolicy(FLAGS_bloom_bits, true)
|
|
|
|
: NewBloomFilterPolicy(FLAGS_bloom_bits, false)
|
|
|
|
: nullptr),
|
2013-02-15 11:53:17 -08:00
|
|
|
db_(nullptr),
|
2014-09-02 13:21:59 -07:00
|
|
|
new_column_family_name_(1),
|
2012-11-16 15:28:14 -08:00
|
|
|
num_times_reopened_(0) {
|
2013-04-08 12:35:40 -07:00
|
|
|
if (FLAGS_destroy_db_initially) {
|
|
|
|
std::vector<std::string> files;
|
|
|
|
FLAGS_env->GetChildren(FLAGS_db, &files);
|
|
|
|
for (unsigned int i = 0; i < files.size(); i++) {
|
|
|
|
if (Slice(files[i]).starts_with("heap-")) {
|
2013-10-24 07:43:14 -07:00
|
|
|
FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
|
2013-04-08 12:35:40 -07:00
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
2013-04-08 12:35:40 -07:00
|
|
|
DestroyDB(FLAGS_db, Options());
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Deletes every column family handle first, then the database itself.
~StressTest() {
  for (size_t i = 0; i < column_families_.size(); ++i) {
    delete column_families_[i];
  }
  column_families_.clear();
  delete db_;
}
|
|
|
|
|
2014-10-27 12:11:16 -07:00
|
|
|
bool BuildOptionsTable() {
|
|
|
|
if (FLAGS_set_options_one_in <= 0) {
|
|
|
|
return true;
|
|
|
|
}
|
2015-07-01 16:13:49 -07:00
|
|
|
|
2015-07-13 12:11:05 -07:00
|
|
|
std::unordered_map<std::string, std::vector<std::string> > options_tbl = {
|
|
|
|
{"write_buffer_size",
|
|
|
|
{ToString(FLAGS_write_buffer_size),
|
2014-11-24 20:44:49 -08:00
|
|
|
ToString(FLAGS_write_buffer_size * 2),
|
2015-07-13 12:11:05 -07:00
|
|
|
ToString(FLAGS_write_buffer_size * 4)}},
|
|
|
|
{"max_write_buffer_number",
|
|
|
|
{ToString(FLAGS_max_write_buffer_number),
|
2014-11-24 20:44:49 -08:00
|
|
|
ToString(FLAGS_max_write_buffer_number * 2),
|
2015-07-13 12:11:05 -07:00
|
|
|
ToString(FLAGS_max_write_buffer_number * 4)}},
|
|
|
|
{"arena_block_size",
|
|
|
|
{
|
|
|
|
ToString(Options().arena_block_size),
|
|
|
|
ToString(FLAGS_write_buffer_size / 4),
|
|
|
|
ToString(FLAGS_write_buffer_size / 8),
|
|
|
|
}},
|
|
|
|
{"memtable_prefix_bloom_bits", {"0", "8", "10"}},
|
|
|
|
{"memtable_prefix_bloom_probes", {"4", "5", "6"}},
|
|
|
|
{"memtable_prefix_bloom_huge_page_tlb_size",
|
|
|
|
{"0", ToString(2 * 1024 * 1024)}},
|
|
|
|
{"max_successive_merges", {"0", "2", "4"}},
|
|
|
|
{"filter_deletes", {"0", "1"}},
|
|
|
|
{"inplace_update_num_locks", {"100", "200", "300"}},
|
|
|
|
// TODO(ljin): enable test for this option
|
|
|
|
// {"disable_auto_compactions", {"100", "200", "300"}},
|
|
|
|
{"soft_rate_limit", {"0", "0.5", "0.9"}},
|
|
|
|
{"hard_rate_limit", {"0", "1.1", "2.0"}},
|
|
|
|
{"level0_file_num_compaction_trigger",
|
|
|
|
{
|
|
|
|
ToString(FLAGS_level0_file_num_compaction_trigger),
|
|
|
|
ToString(FLAGS_level0_file_num_compaction_trigger + 2),
|
|
|
|
ToString(FLAGS_level0_file_num_compaction_trigger + 4),
|
|
|
|
}},
|
|
|
|
{"level0_slowdown_writes_trigger",
|
|
|
|
{
|
|
|
|
ToString(FLAGS_level0_slowdown_writes_trigger),
|
|
|
|
ToString(FLAGS_level0_slowdown_writes_trigger + 2),
|
|
|
|
ToString(FLAGS_level0_slowdown_writes_trigger + 4),
|
|
|
|
}},
|
|
|
|
{"level0_stop_writes_trigger",
|
|
|
|
{
|
|
|
|
ToString(FLAGS_level0_stop_writes_trigger),
|
|
|
|
ToString(FLAGS_level0_stop_writes_trigger + 2),
|
|
|
|
ToString(FLAGS_level0_stop_writes_trigger + 4),
|
|
|
|
}},
|
|
|
|
{"max_grandparent_overlap_factor",
|
|
|
|
{
|
|
|
|
ToString(Options().max_grandparent_overlap_factor - 5),
|
|
|
|
ToString(Options().max_grandparent_overlap_factor),
|
|
|
|
ToString(Options().max_grandparent_overlap_factor + 5),
|
|
|
|
}},
|
|
|
|
{"expanded_compaction_factor",
|
|
|
|
{
|
|
|
|
ToString(Options().expanded_compaction_factor - 5),
|
|
|
|
ToString(Options().expanded_compaction_factor),
|
|
|
|
ToString(Options().expanded_compaction_factor + 5),
|
|
|
|
}},
|
|
|
|
{"source_compaction_factor",
|
|
|
|
{
|
|
|
|
ToString(Options().source_compaction_factor),
|
|
|
|
ToString(Options().source_compaction_factor * 2),
|
|
|
|
ToString(Options().source_compaction_factor * 4),
|
|
|
|
}},
|
|
|
|
{"target_file_size_base",
|
|
|
|
{
|
|
|
|
ToString(FLAGS_target_file_size_base),
|
|
|
|
ToString(FLAGS_target_file_size_base * 2),
|
|
|
|
ToString(FLAGS_target_file_size_base * 4),
|
|
|
|
}},
|
|
|
|
{"target_file_size_multiplier",
|
|
|
|
{
|
|
|
|
ToString(FLAGS_target_file_size_multiplier), "1", "2",
|
|
|
|
}},
|
|
|
|
{"max_bytes_for_level_base",
|
|
|
|
{
|
|
|
|
ToString(FLAGS_max_bytes_for_level_base / 2),
|
|
|
|
ToString(FLAGS_max_bytes_for_level_base),
|
|
|
|
ToString(FLAGS_max_bytes_for_level_base * 2),
|
|
|
|
}},
|
|
|
|
{"max_bytes_for_level_multiplier",
|
|
|
|
{
|
|
|
|
ToString(FLAGS_max_bytes_for_level_multiplier), "1", "2",
|
|
|
|
}},
|
|
|
|
{"max_sequential_skip_in_iterations", {"4", "8", "12"}},
|
2014-10-27 12:11:16 -07:00
|
|
|
};
|
2015-07-01 16:13:49 -07:00
|
|
|
|
|
|
|
options_table_ = std::move(options_tbl);
|
|
|
|
|
2014-10-27 12:11:16 -07:00
|
|
|
for (const auto& iter : options_table_) {
|
|
|
|
options_index_.push_back(iter.first);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-04-24 09:22:58 -04:00
|
|
|
bool Run() {
|
2012-10-03 09:58:45 -07:00
|
|
|
PrintEnv();
|
2014-10-27 12:11:16 -07:00
|
|
|
BuildOptionsTable();
|
2012-10-03 09:58:45 -07:00
|
|
|
Open();
|
|
|
|
SharedState shared(this);
|
|
|
|
uint32_t n = shared.GetNumThreads();
|
|
|
|
|
|
|
|
std::vector<ThreadState*> threads(n);
|
|
|
|
for (uint32_t i = 0; i < n; i++) {
|
|
|
|
threads[i] = new ThreadState(i, &shared);
|
|
|
|
FLAGS_env->StartThread(ThreadBody, threads[i]);
|
|
|
|
}
|
2014-06-02 10:12:41 -07:00
|
|
|
ThreadState bg_thread(0, &shared);
|
|
|
|
if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
|
|
|
|
FLAGS_env->StartThread(PoolSizeChangeThread, &bg_thread);
|
|
|
|
}
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
// Each thread goes through the following states:
|
2012-11-07 15:35:08 -08:00
|
|
|
// initializing -> wait for others to init -> read/populate/depopulate
|
|
|
|
// wait for others to operate -> verify -> done
|
2012-10-03 09:58:45 -07:00
|
|
|
|
|
|
|
{
|
|
|
|
MutexLock l(shared.GetMutex());
|
|
|
|
while (!shared.AllInitialized()) {
|
|
|
|
shared.GetCondVar()->Wait();
|
|
|
|
}
|
|
|
|
|
2012-11-16 15:28:14 -08:00
|
|
|
double now = FLAGS_env->NowMicros();
|
|
|
|
fprintf(stdout, "%s Starting database operations\n",
|
|
|
|
FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
|
2012-11-07 15:35:08 -08:00
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
shared.SetStart();
|
|
|
|
shared.GetCondVar()->SignalAll();
|
2012-11-07 15:35:08 -08:00
|
|
|
while (!shared.AllOperated()) {
|
2012-10-03 09:58:45 -07:00
|
|
|
shared.GetCondVar()->Wait();
|
|
|
|
}
|
|
|
|
|
2012-11-16 15:28:14 -08:00
|
|
|
now = FLAGS_env->NowMicros();
|
2013-02-23 11:11:16 -08:00
|
|
|
if (FLAGS_test_batches_snapshots) {
|
|
|
|
fprintf(stdout, "%s Limited verification already done during gets\n",
|
|
|
|
FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
|
|
|
|
} else {
|
|
|
|
fprintf(stdout, "%s Starting verification\n",
|
|
|
|
FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
|
|
|
|
}
|
2012-11-16 15:28:14 -08:00
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
shared.SetStartVerify();
|
|
|
|
shared.GetCondVar()->SignalAll();
|
|
|
|
while (!shared.AllDone()) {
|
|
|
|
shared.GetCondVar()->Wait();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-11-06 12:02:18 -08:00
|
|
|
for (unsigned int i = 1; i < n; i++) {
|
2012-10-03 09:58:45 -07:00
|
|
|
threads[0]->stats.Merge(threads[i]->stats);
|
|
|
|
}
|
|
|
|
threads[0]->stats.Report("Stress Test");
|
|
|
|
|
2012-11-06 12:02:18 -08:00
|
|
|
for (unsigned int i = 0; i < n; i++) {
|
2012-10-03 09:58:45 -07:00
|
|
|
delete threads[i];
|
2013-02-15 11:53:17 -08:00
|
|
|
threads[i] = nullptr;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
2012-11-16 15:28:14 -08:00
|
|
|
double now = FLAGS_env->NowMicros();
|
2013-02-23 11:11:16 -08:00
|
|
|
if (!FLAGS_test_batches_snapshots) {
|
|
|
|
fprintf(stdout, "%s Verification successful\n",
|
|
|
|
FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
|
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
PrintStatistics();
|
2014-04-24 09:22:58 -04:00
|
|
|
|
2014-06-02 10:12:41 -07:00
|
|
|
if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
|
|
|
|
MutexLock l(shared.GetMutex());
|
|
|
|
shared.SetShouldStopBgThread();
|
|
|
|
while (!shared.BgThreadFinished()) {
|
|
|
|
shared.GetCondVar()->Wait();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-24 09:22:58 -04:00
|
|
|
if (shared.HasVerificationFailedYet()) {
|
|
|
|
printf("Verification failed :(\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
static void ThreadBody(void* v) {
|
|
|
|
ThreadState* thread = reinterpret_cast<ThreadState*>(v);
|
|
|
|
SharedState* shared = thread->shared;
|
|
|
|
|
|
|
|
{
|
|
|
|
MutexLock l(shared->GetMutex());
|
|
|
|
shared->IncInitialized();
|
|
|
|
if (shared->AllInitialized()) {
|
|
|
|
shared->GetCondVar()->SignalAll();
|
|
|
|
}
|
|
|
|
while (!shared->Started()) {
|
|
|
|
shared->GetCondVar()->Wait();
|
|
|
|
}
|
|
|
|
}
|
2012-11-07 15:35:08 -08:00
|
|
|
thread->shared->GetStressTest()->OperateDb(thread);
|
2012-10-03 09:58:45 -07:00
|
|
|
|
|
|
|
{
|
|
|
|
MutexLock l(shared->GetMutex());
|
2012-11-07 15:35:08 -08:00
|
|
|
shared->IncOperated();
|
|
|
|
if (shared->AllOperated()) {
|
2012-10-03 09:58:45 -07:00
|
|
|
shared->GetCondVar()->SignalAll();
|
|
|
|
}
|
|
|
|
while (!shared->VerifyStarted()) {
|
|
|
|
shared->GetCondVar()->Wait();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-02-20 15:57:27 -08:00
|
|
|
if (!FLAGS_test_batches_snapshots) {
|
2013-10-01 23:19:51 -07:00
|
|
|
thread->shared->GetStressTest()->VerifyDb(thread);
|
2013-02-20 15:57:27 -08:00
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
|
|
|
|
{
|
|
|
|
MutexLock l(shared->GetMutex());
|
|
|
|
shared->IncDone();
|
|
|
|
if (shared->AllDone()) {
|
|
|
|
shared->GetCondVar()->SignalAll();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2014-06-02 10:12:41 -07:00
|
|
|
static void PoolSizeChangeThread(void* v) {
|
|
|
|
assert(FLAGS_compaction_thread_pool_adjust_interval > 0);
|
|
|
|
ThreadState* thread = reinterpret_cast<ThreadState*>(v);
|
|
|
|
SharedState* shared = thread->shared;
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
{
|
|
|
|
MutexLock l(shared->GetMutex());
|
|
|
|
if (shared->ShoudStopBgThread()) {
|
|
|
|
shared->SetBgThreadFinish();
|
|
|
|
shared->GetCondVar()->SignalAll();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
auto thread_pool_size_base = FLAGS_max_background_compactions;
|
2015-04-25 21:09:45 -07:00
|
|
|
auto thread_pool_size_var = FLAGS_compaction_thread_pool_variations;
|
2014-06-02 10:12:41 -07:00
|
|
|
int new_thread_pool_size =
|
|
|
|
thread_pool_size_base - thread_pool_size_var +
|
|
|
|
thread->rand.Next() % (thread_pool_size_var * 2 + 1);
|
|
|
|
if (new_thread_pool_size < 1) {
|
|
|
|
new_thread_pool_size = 1;
|
|
|
|
}
|
|
|
|
FLAGS_env->SetBackgroundThreads(new_thread_pool_size);
|
|
|
|
// Sleep up to 3 seconds
|
|
|
|
FLAGS_env->SleepForMicroseconds(
|
|
|
|
thread->rand.Next() % FLAGS_compaction_thread_pool_adjust_interval *
|
|
|
|
1000 +
|
|
|
|
1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-02-20 15:57:27 -08:00
|
|
|
// Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
|
|
|
|
// ("9"+K, "9"+V) in DB atomically i.e in a single batch.
|
|
|
|
// Also refer MultiGet.
|
2014-02-27 12:13:48 -08:00
|
|
|
Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value, size_t sz) {
|
2013-02-20 15:57:27 -08:00
|
|
|
std::string keys[10] = {"9", "8", "7", "6", "5",
|
|
|
|
"4", "3", "2", "1", "0"};
|
|
|
|
std::string values[10] = {"9", "8", "7", "6", "5",
|
|
|
|
"4", "3", "2", "1", "0"};
|
|
|
|
Slice value_slices[10];
|
|
|
|
WriteBatch batch;
|
|
|
|
Status s;
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
keys[i] += key.ToString();
|
|
|
|
values[i] += value.ToString();
|
|
|
|
value_slices[i] = values[i];
|
2013-10-24 07:43:14 -07:00
|
|
|
if (FLAGS_use_merge) {
|
2014-03-14 13:40:06 -07:00
|
|
|
batch.Merge(column_family, keys[i], value_slices[i]);
|
Benchmarking for Merge Operator
Summary:
Updated db_bench and utilities/merge_operators.h to allow for dynamic benchmarking
of merge operators in db_bench. Added a new test (--benchmarks=mergerandom), which performs
a bunch of random Merge() operations over random keys. Also added a "--merge_operator=" flag
so that the tester can easily benchmark different merge operators. Currently supports
the PutOperator and UInt64Add operator. Support for stringappend or list append may come later.
Test Plan:
1. make db_bench
2. Test the PutOperator (simulating Put) as follows:
./db_bench --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom --merge_operator=put
--threads=2
3. Test the UInt64AddOperator (simulating numeric addition) similarly:
./db_bench --value_size=8 --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom
--merge_operator=uint64add --threads=2
Reviewers: haobo, dhruba, zshao, MarkCallaghan
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11535
2013-08-15 17:13:07 -07:00
|
|
|
} else {
|
2014-03-14 13:40:06 -07:00
|
|
|
batch.Put(column_family, keys[i], value_slices[i]);
|
Benchmarking for Merge Operator
Summary:
Updated db_bench and utilities/merge_operators.h to allow for dynamic benchmarking
of merge operators in db_bench. Added a new test (--benchmarks=mergerandom), which performs
a bunch of random Merge() operations over random keys. Also added a "--merge_operator=" flag
so that the tester can easily benchmark different merge operators. Currently supports
the PutOperator and UInt64Add operator. Support for stringappend or list append may come later.
Test Plan:
1. make db_bench
2. Test the PutOperator (simulating Put) as follows:
./db_bench --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom --merge_operator=put
--threads=2
3. Test the UInt64AddOperator (simulating numeric addition) similarly:
./db_bench --value_size=8 --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom
--merge_operator=uint64add --threads=2
Reviewers: haobo, dhruba, zshao, MarkCallaghan
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11535
2013-08-15 17:13:07 -07:00
|
|
|
}
|
2013-02-20 15:57:27 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
s = db_->Write(writeoptions, &batch);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "multiput error: %s\n", s.ToString().c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
} else {
|
|
|
|
// we did 10 writes each of size sz + 1
|
|
|
|
thread->stats.AddBytesForWrites(10, (sz + 1) * 10);
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
|
|
|
|
// in DB atomically i.e in a single batch. Also refer MultiGet.
|
2014-02-27 12:13:48 -08:00
|
|
|
Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
2013-02-20 15:57:27 -08:00
|
|
|
std::string keys[10] = {"9", "7", "5", "3", "1",
|
|
|
|
"8", "6", "4", "2", "0"};
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
Status s;
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
keys[i] += key.ToString();
|
2014-03-14 13:40:06 -07:00
|
|
|
batch.Delete(column_family, keys[i]);
|
2013-02-20 15:57:27 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
s = db_->Write(writeoptions, &batch);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
} else {
|
|
|
|
thread->stats.AddDeletes(10);
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Given a key K, this gets values for "0"+K, "1"+K,..."9"+K
|
|
|
|
// in the same snapshot, and verifies that all the values are of the form
|
|
|
|
// "0"+V, "1"+V,..."9"+V.
|
|
|
|
// ASSUMES that MultiPut was used to put (K, V) into the DB.
|
2014-02-27 12:13:48 -08:00
|
|
|
Status MultiGet(ThreadState* thread, const ReadOptions& readoptions,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
std::string* value) {
|
2013-02-20 15:57:27 -08:00
|
|
|
std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
|
|
|
|
Slice key_slices[10];
|
|
|
|
std::string values[10];
|
|
|
|
ReadOptions readoptionscopy = readoptions;
|
|
|
|
readoptionscopy.snapshot = db_->GetSnapshot();
|
|
|
|
Status s;
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
keys[i] += key.ToString();
|
|
|
|
key_slices[i] = keys[i];
|
2014-02-27 12:13:48 -08:00
|
|
|
s = db_->Get(readoptionscopy, column_family, key_slices[i], value);
|
2013-02-20 15:57:27 -08:00
|
|
|
if (!s.ok() && !s.IsNotFound()) {
|
|
|
|
fprintf(stderr, "get error: %s\n", s.ToString().c_str());
|
|
|
|
values[i] = "";
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
// we continue after error rather than exiting so that we can
|
|
|
|
// find more errors if any
|
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
values[i] = "";
|
2013-02-23 11:11:16 -08:00
|
|
|
thread->stats.AddGets(1, 0);
|
2013-02-20 15:57:27 -08:00
|
|
|
} else {
|
|
|
|
values[i] = *value;
|
|
|
|
|
|
|
|
char expected_prefix = (keys[i])[0];
|
|
|
|
char actual_prefix = (values[i])[0];
|
|
|
|
if (actual_prefix != expected_prefix) {
|
2013-04-04 23:49:43 -07:00
|
|
|
fprintf(stderr, "error expected prefix = %c actual = %c\n",
|
2013-02-20 15:57:27 -08:00
|
|
|
expected_prefix, actual_prefix);
|
|
|
|
}
|
|
|
|
(values[i])[0] = ' '; // blank out the differing character
|
2013-02-23 11:11:16 -08:00
|
|
|
thread->stats.AddGets(1, 1);
|
2013-02-20 15:57:27 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
db_->ReleaseSnapshot(readoptionscopy.snapshot);
|
|
|
|
|
|
|
|
// Now that we retrieved all values, check that they all match
|
|
|
|
for (int i = 1; i < 10; i++) {
|
|
|
|
if (values[i] != values[0]) {
|
2013-04-04 23:49:43 -07:00
|
|
|
fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
|
hints for narrowing down FindFile range and avoiding checking unrelevant L0 files
Summary:
The file tree structure in Version is prebuilt and the range of each file is known.
On the Get() code path, we do binary search in FindFile() by comparing
target key with each file's largest key and also check the range for each L0 file.
With some pre-calculated knowledge, each key comparision that has been done can serve
as a hint to narrow down further searches:
(1) If a key falls within a L0 file's range, we can safely skip the next
file if its range does not overlap with the current one.
(2) If a key falls within a file's range in level L0 - Ln-1, we should only
need to binary search in the next level for files that overlap with the current one.
(1) will be able to skip some files depending one the key distribution.
(2) can greatly reduce the range of binary search, especially for bottom
levels, given that one file most likely only overlaps with N files from
the level below (where N is max_bytes_for_level_multiplier). So on level
L, we will only look at ~N files instead of N^L files.
Some inital results: measured with 500M key DB, when write is light (10k/s = 1.2M/s), this
improves QPS ~7% on top of blocked bloom. When write is heavier (80k/s =
9.6M/s), it gives us ~13% improvement.
Test Plan: make all check
Reviewers: haobo, igor, dhruba, sdong, yhchiang
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D17205
2014-04-21 09:10:12 -07:00
|
|
|
key.ToString(true).c_str(), StringToHex(values[0]).c_str(),
|
|
|
|
StringToHex(values[i]).c_str());
|
2013-02-20 15:57:27 -08:00
|
|
|
// we continue after error rather than exiting so that we can
|
|
|
|
// find more errors if any
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-03-11 13:44:33 -07:00
|
|
|
// Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
|
|
|
|
// in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes
|
|
|
|
// of the key. Each of these 10 scans returns a series of values;
|
|
|
|
// each series should be the same length, and it is verified for each
|
|
|
|
// index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
|
2013-08-14 16:58:36 -07:00
|
|
|
// ASSUMES that MultiPut was used to put (K, V)
|
2014-02-27 12:13:48 -08:00
|
|
|
Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
|
|
|
|
ColumnFamilyHandle* column_family,
|
2014-03-11 13:44:33 -07:00
|
|
|
const Slice& key) {
|
2013-08-14 16:58:36 -07:00
|
|
|
std::string prefixes[10] = {"0", "1", "2", "3", "4",
|
|
|
|
"5", "6", "7", "8", "9"};
|
|
|
|
Slice prefix_slices[10];
|
|
|
|
ReadOptions readoptionscopy[10];
|
|
|
|
const Snapshot* snapshot = db_->GetSnapshot();
|
|
|
|
Iterator* iters[10];
|
|
|
|
Status s = Status::OK();
|
|
|
|
for (int i = 0; i < 10; i++) {
|
2014-03-11 13:44:33 -07:00
|
|
|
prefixes[i] += key.ToString();
|
|
|
|
prefixes[i].resize(FLAGS_prefix_size);
|
|
|
|
prefix_slices[i] = Slice(prefixes[i]);
|
2013-08-14 16:58:36 -07:00
|
|
|
readoptionscopy[i] = readoptions;
|
|
|
|
readoptionscopy[i].snapshot = snapshot;
|
2014-02-27 12:13:48 -08:00
|
|
|
iters[i] = db_->NewIterator(readoptionscopy[i], column_family);
|
2014-03-17 10:02:46 -07:00
|
|
|
iters[i]->Seek(prefix_slices[i]);
|
2013-08-14 16:58:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
int count = 0;
|
2014-03-17 17:02:34 -07:00
|
|
|
while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) {
|
2013-08-14 16:58:36 -07:00
|
|
|
count++;
|
|
|
|
std::string values[10];
|
|
|
|
// get list of all values for this iteration
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
// no iterator should finish before the first one
|
2014-03-17 17:02:34 -07:00
|
|
|
assert(iters[i]->Valid() &&
|
|
|
|
iters[i]->key().starts_with(prefix_slices[i]));
|
2013-08-14 16:58:36 -07:00
|
|
|
values[i] = iters[i]->value().ToString();
|
|
|
|
|
|
|
|
char expected_first = (prefixes[i])[0];
|
|
|
|
char actual_first = (values[i])[0];
|
|
|
|
|
|
|
|
if (actual_first != expected_first) {
|
|
|
|
fprintf(stderr, "error expected first = %c actual = %c\n",
|
|
|
|
expected_first, actual_first);
|
|
|
|
}
|
|
|
|
(values[i])[0] = ' '; // blank out the differing character
|
|
|
|
}
|
|
|
|
// make sure all values are equivalent
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
if (values[i] != values[0]) {
|
hints for narrowing down FindFile range and avoiding checking unrelevant L0 files
Summary:
The file tree structure in Version is prebuilt and the range of each file is known.
On the Get() code path, we do binary search in FindFile() by comparing
target key with each file's largest key and also check the range for each L0 file.
With some pre-calculated knowledge, each key comparision that has been done can serve
as a hint to narrow down further searches:
(1) If a key falls within a L0 file's range, we can safely skip the next
file if its range does not overlap with the current one.
(2) If a key falls within a file's range in level L0 - Ln-1, we should only
need to binary search in the next level for files that overlap with the current one.
(1) will be able to skip some files depending one the key distribution.
(2) can greatly reduce the range of binary search, especially for bottom
levels, given that one file most likely only overlaps with N files from
the level below (where N is max_bytes_for_level_multiplier). So on level
L, we will only look at ~N files instead of N^L files.
Some inital results: measured with 500M key DB, when write is light (10k/s = 1.2M/s), this
improves QPS ~7% on top of blocked bloom. When write is heavier (80k/s =
9.6M/s), it gives us ~13% improvement.
Test Plan: make all check
Reviewers: haobo, igor, dhruba, sdong, yhchiang
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D17205
2014-04-21 09:10:12 -07:00
|
|
|
fprintf(stderr, "error : %d, inconsistent values for prefix %s: %s, %s\n",
|
|
|
|
i, prefixes[i].c_str(), StringToHex(values[0]).c_str(),
|
|
|
|
StringToHex(values[i]).c_str());
|
2013-08-14 16:58:36 -07:00
|
|
|
// we continue after error rather than exiting so that we can
|
|
|
|
// find more errors if any
|
|
|
|
}
|
|
|
|
iters[i]->Next();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// cleanup iterators and snapshot
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
// if the first iterator finished, they should have all finished
|
2014-03-17 17:02:34 -07:00
|
|
|
assert(!iters[i]->Valid() ||
|
|
|
|
!iters[i]->key().starts_with(prefix_slices[i]));
|
2013-08-14 16:58:36 -07:00
|
|
|
assert(iters[i]->status().ok());
|
|
|
|
delete iters[i];
|
|
|
|
}
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
thread->stats.AddPrefixes(1, count);
|
|
|
|
} else {
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2013-09-19 16:47:24 -07:00
|
|
|
// Given a key K, this creates an iterator which scans to K and then
|
|
|
|
// does a random sequence of Next/Prev operations.
|
2014-02-27 12:13:48 -08:00
|
|
|
Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
2013-09-19 16:47:24 -07:00
|
|
|
Status s;
|
|
|
|
const Snapshot* snapshot = db_->GetSnapshot();
|
|
|
|
ReadOptions readoptionscopy = readoptions;
|
|
|
|
readoptionscopy.snapshot = snapshot;
|
2014-02-27 12:13:48 -08:00
|
|
|
unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy, column_family));
|
2013-09-19 16:47:24 -07:00
|
|
|
|
|
|
|
iter->Seek(key);
|
2013-10-24 07:43:14 -07:00
|
|
|
for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
|
2013-09-19 16:47:24 -07:00
|
|
|
if (thread->rand.OneIn(2)) {
|
|
|
|
iter->Next();
|
|
|
|
} else {
|
|
|
|
iter->Prev();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
thread->stats.AddIterations(1);
|
|
|
|
} else {
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-11-04 16:23:05 -08:00
|
|
|
// Picks a random option from options_index_ and a random candidate position
// in that option's value list, then applies it to a randomly chosen column
// family through DB::SetOptions. The two rate limit options are always set
// together, as are the three level0 trigger options, each group using the
// same position in every member's value list.
Status SetOptions(ThreadState* thread) {
  assert(FLAGS_set_options_one_in > 0);

  const std::string name =
      options_index_[thread->rand.Next() % options_index_.size()];
  const int value_idx = thread->rand.Next() % options_table_[name].size();

  static const char* const kRateLimitGroup[] = {"soft_rate_limit",
                                                "hard_rate_limit"};
  static const char* const kLevel0TriggerGroup[] = {
      "level0_file_num_compaction_trigger", "level0_slowdown_writes_trigger",
      "level0_stop_writes_trigger"};

  const bool in_rate_limit_group =
      (name == "soft_rate_limit" || name == "hard_rate_limit");
  const bool in_level0_trigger_group =
      (name == "level0_file_num_compaction_trigger" ||
       name == "level0_slowdown_writes_trigger" ||
       name == "level0_stop_writes_trigger");

  std::unordered_map<std::string, std::string> opts;
  if (in_rate_limit_group) {
    for (const char* member : kRateLimitGroup) {
      opts[member] = options_table_[member][value_idx];
    }
  } else if (in_level0_trigger_group) {
    for (const char* member : kLevel0TriggerGroup) {
      opts[member] = options_table_[member][value_idx];
    }
  } else {
    opts[name] = options_table_[name][value_idx];
  }

  const int rand_cf_idx = thread->rand.Next() % FLAGS_column_families;
  auto cfh = column_families_[rand_cf_idx];
  return db_->SetOptions(cfh, opts);
}
|
|
|
|
|
2012-11-07 15:35:08 -08:00
|
|
|
void OperateDb(ThreadState* thread) {
|
2012-10-03 09:58:45 -07:00
|
|
|
ReadOptions read_opts(FLAGS_verify_checksum, true);
|
|
|
|
WriteOptions write_opts;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
auto shared = thread->shared;
|
2012-11-07 15:35:08 -08:00
|
|
|
char value[100];
|
2012-10-03 09:58:45 -07:00
|
|
|
long max_key = thread->shared->GetMaxKey();
|
|
|
|
std::string from_db;
|
|
|
|
if (FLAGS_sync) {
|
|
|
|
write_opts.sync = true;
|
|
|
|
}
|
|
|
|
write_opts.disableWAL = FLAGS_disable_wal;
|
2013-09-19 16:47:24 -07:00
|
|
|
const int prefixBound = (int)FLAGS_readpercent + (int)FLAGS_prefixpercent;
|
|
|
|
const int writeBound = prefixBound + (int)FLAGS_writepercent;
|
|
|
|
const int delBound = writeBound + (int)FLAGS_delpercent;
|
2012-10-03 09:58:45 -07:00
|
|
|
|
|
|
|
thread->stats.Start();
|
2013-10-24 07:43:14 -07:00
|
|
|
for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
|
2014-04-24 09:22:58 -04:00
|
|
|
if (thread->shared->HasVerificationFailedYet()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) {
|
2012-11-09 13:04:12 -08:00
|
|
|
{
|
2012-11-16 15:28:14 -08:00
|
|
|
thread->stats.FinishedSingleOp();
|
2012-11-09 13:04:12 -08:00
|
|
|
MutexLock l(thread->shared->GetMutex());
|
|
|
|
thread->shared->IncVotedReopen();
|
|
|
|
if (thread->shared->AllVotedReopen()) {
|
|
|
|
thread->shared->GetStressTest()->Reopen();
|
|
|
|
thread->shared->GetCondVar()->SignalAll();
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
thread->shared->GetCondVar()->Wait();
|
|
|
|
}
|
2013-02-23 11:11:16 -08:00
|
|
|
// Commenting this out as we don't want to reset stats on each open.
|
|
|
|
// thread->stats.Start();
|
2012-11-09 13:04:12 -08:00
|
|
|
}
|
|
|
|
}
|
2013-02-20 15:57:27 -08:00
|
|
|
|
2014-10-27 12:11:16 -07:00
|
|
|
// Change Options
|
|
|
|
if (FLAGS_set_options_one_in > 0 &&
|
|
|
|
thread->rand.OneIn(FLAGS_set_options_one_in)) {
|
|
|
|
SetOptions(thread);
|
|
|
|
}
|
|
|
|
|
2014-10-31 12:02:14 -07:00
|
|
|
if (FLAGS_set_in_place_one_in > 0 &&
|
|
|
|
thread->rand.OneIn(FLAGS_set_in_place_one_in)) {
|
|
|
|
options_.inplace_update_support ^= options_.inplace_update_support;
|
|
|
|
}
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
if (!FLAGS_test_batches_snapshots &&
|
2014-07-14 07:56:07 -07:00
|
|
|
FLAGS_clear_column_family_one_in != 0 && FLAGS_column_families > 1) {
|
2014-02-27 12:13:48 -08:00
|
|
|
if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) {
|
|
|
|
// drop column family and then create it again (can't drop default)
|
|
|
|
int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
|
|
|
|
std::string new_name =
|
2014-11-24 20:44:49 -08:00
|
|
|
ToString(new_column_family_name_.fetch_add(1));
|
2014-02-27 12:13:48 -08:00
|
|
|
{
|
|
|
|
MutexLock l(thread->shared->GetMutex());
|
|
|
|
fprintf(
|
|
|
|
stdout,
|
|
|
|
"[CF %d] Dropping and recreating column family. new name: %s\n",
|
|
|
|
cf, new_name.c_str());
|
|
|
|
}
|
|
|
|
thread->shared->LockColumnFamily(cf);
|
|
|
|
Status s __attribute__((unused));
|
|
|
|
s = db_->DropColumnFamily(column_families_[cf]);
|
|
|
|
delete column_families_[cf];
|
2014-09-02 13:21:59 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "dropping column family error: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
std::terminate();
|
|
|
|
}
|
2014-02-27 12:13:48 -08:00
|
|
|
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
|
|
|
|
&column_families_[cf]);
|
|
|
|
column_family_names_[cf] = new_name;
|
|
|
|
thread->shared->ClearColumnFamily(cf);
|
2014-09-02 13:21:59 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "creating column family error: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
std::terminate();
|
|
|
|
}
|
2014-02-27 12:13:48 -08:00
|
|
|
thread->shared->UnlockColumnFamily(cf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
long rand_key = thread->rand.Next() % max_key;
|
2014-02-27 12:13:48 -08:00
|
|
|
int rand_column_family = thread->rand.Next() % FLAGS_column_families;
|
2013-08-14 16:58:36 -07:00
|
|
|
std::string keystr = Key(rand_key);
|
|
|
|
Slice key = keystr;
|
2014-02-27 12:13:48 -08:00
|
|
|
std::unique_ptr<MutexLock> l;
|
|
|
|
if (!FLAGS_test_batches_snapshots) {
|
|
|
|
l.reset(new MutexLock(
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
shared->GetMutexForKey(rand_column_family, rand_key)));
|
2014-02-27 12:13:48 -08:00
|
|
|
}
|
|
|
|
auto column_family = column_families_[rand_column_family];
|
2013-08-14 16:58:36 -07:00
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
int prob_op = thread->rand.Uniform(100);
|
2013-08-14 16:58:36 -07:00
|
|
|
if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) {
|
2013-09-19 16:47:24 -07:00
|
|
|
// OPERATION read
|
2013-02-23 11:11:16 -08:00
|
|
|
if (!FLAGS_test_batches_snapshots) {
|
2014-02-27 12:13:48 -08:00
|
|
|
Status s = db_->Get(read_opts, column_family, key, &from_db);
|
2013-02-23 11:11:16 -08:00
|
|
|
if (s.ok()) {
|
|
|
|
// found case
|
|
|
|
thread->stats.AddGets(1, 1);
|
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
// not found case
|
|
|
|
thread->stats.AddGets(1, 0);
|
|
|
|
} else {
|
|
|
|
// errors case
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
}
|
|
|
|
} else {
|
2014-02-27 12:13:48 -08:00
|
|
|
MultiGet(thread, read_opts, column_family, key, &from_db);
|
2013-02-23 11:11:16 -08:00
|
|
|
}
|
2013-09-19 16:47:24 -07:00
|
|
|
} else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
|
|
|
|
// OPERATION prefix scan
|
2014-03-11 16:33:42 -07:00
|
|
|
// keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are
|
|
|
|
// (8 - FLAGS_prefix_size) bytes besides the prefix. So there will
|
2014-03-12 09:31:06 -07:00
|
|
|
// be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same
|
|
|
|
// prefix
|
2013-02-20 15:57:27 -08:00
|
|
|
if (!FLAGS_test_batches_snapshots) {
|
2014-03-11 13:44:33 -07:00
|
|
|
Slice prefix = Slice(key.data(), FLAGS_prefix_size);
|
2014-02-27 12:13:48 -08:00
|
|
|
Iterator* iter = db_->NewIterator(read_opts, column_family);
|
2014-03-11 16:33:42 -07:00
|
|
|
int64_t count = 0;
|
2014-03-17 10:02:46 -07:00
|
|
|
for (iter->Seek(prefix);
|
|
|
|
iter->Valid() && iter->key().starts_with(prefix); iter->Next()) {
|
2014-03-11 16:33:42 -07:00
|
|
|
++count;
|
2013-08-14 16:58:36 -07:00
|
|
|
}
|
2014-03-12 09:31:06 -07:00
|
|
|
assert(count <=
|
|
|
|
(static_cast<int64_t>(1) << ((8 - FLAGS_prefix_size) * 8)));
|
2013-08-14 16:58:36 -07:00
|
|
|
if (iter->status().ok()) {
|
2014-11-11 16:47:22 -05:00
|
|
|
thread->stats.AddPrefixes(1, static_cast<int>(count));
|
2013-08-14 16:58:36 -07:00
|
|
|
} else {
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
}
|
|
|
|
delete iter;
|
2013-02-20 15:57:27 -08:00
|
|
|
} else {
|
2014-03-12 10:17:41 -07:00
|
|
|
MultiPrefixScan(thread, read_opts, column_family, key);
|
2012-11-07 15:35:08 -08:00
|
|
|
}
|
2013-09-19 16:47:24 -07:00
|
|
|
} else if (prefixBound <= prob_op && prob_op < writeBound) {
|
|
|
|
// OPERATION write
|
2012-10-03 09:58:45 -07:00
|
|
|
uint32_t value_base = thread->rand.Next();
|
|
|
|
size_t sz = GenerateValue(value_base, value, sizeof(value));
|
|
|
|
Slice v(value, sz);
|
2013-02-20 15:57:27 -08:00
|
|
|
if (!FLAGS_test_batches_snapshots) {
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
// If the chosen key does not allow overwrite and it already
|
|
|
|
// exists, choose another key.
|
|
|
|
while (!shared->AllowsOverwrite(rand_column_family, rand_key) &&
|
|
|
|
shared->Exists(rand_column_family, rand_key)) {
|
|
|
|
l.reset();
|
|
|
|
rand_key = thread->rand.Next() % max_key;
|
|
|
|
rand_column_family = thread->rand.Next() % FLAGS_column_families;
|
|
|
|
l.reset(new MutexLock(
|
|
|
|
shared->GetMutexForKey(rand_column_family, rand_key)));
|
|
|
|
}
|
|
|
|
|
|
|
|
keystr = Key(rand_key);
|
|
|
|
key = keystr;
|
|
|
|
column_family = column_families_[rand_column_family];
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
if (FLAGS_verify_before_write) {
|
Fix two nasty use-after-free-bugs
Summary:
These bugs were caught by ASAN crash test.
1. The first one, in table/filter_block.cc is very nasty. We first reference entries_ and store the reference to Slice prev. Then, we call entries_.append(), which can change the reference. The Slice prev now points to junk.
2. The second one is a bug in a test, so it's not very serious. Once we set read_opts.prefix, we never clear it, so some other function might still reference it.
Test Plan: asan crash test now runs more than 5 mins. Before, it failed immediately. I will run the full one, but the full one takes quite some time (5 hours)
Reviewers: dhruba, haobo, kailiu
Reviewed By: dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14223
2013-11-19 21:01:48 -08:00
|
|
|
std::string keystr2 = Key(rand_key);
|
|
|
|
Slice k = keystr2;
|
2014-02-27 12:13:48 -08:00
|
|
|
Status s = db_->Get(read_opts, column_family, k, &from_db);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
if (!VerifyValue(rand_column_family, rand_key, read_opts,
|
|
|
|
thread->shared, from_db, s, true)) {
|
2014-04-24 09:22:58 -04:00
|
|
|
break;
|
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
shared->Put(rand_column_family, rand_key, value_base);
|
2014-09-02 13:21:59 -07:00
|
|
|
Status s;
|
2013-10-24 07:43:14 -07:00
|
|
|
if (FLAGS_use_merge) {
|
2014-09-02 13:21:59 -07:00
|
|
|
s = db_->Merge(write_opts, column_family, key, v);
|
Benchmarking for Merge Operator
Summary:
Updated db_bench and utilities/merge_operators.h to allow for dynamic benchmarking
of merge operators in db_bench. Added a new test (--benchmarks=mergerandom), which performs
a bunch of random Merge() operations over random keys. Also added a "--merge_operator=" flag
so that the tester can easily benchmark different merge operators. Currently supports
the PutOperator and UInt64Add operator. Support for stringappend or list append may come later.
Test Plan:
1. make db_bench
2. Test the PutOperator (simulating Put) as follows:
./db_bench --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom --merge_operator=put
--threads=2
3. Test the UInt64AddOperator (simulating numeric addition) similarly:
./db_bench --value_size=8 --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom
--merge_operator=uint64add --threads=2
Reviewers: haobo, dhruba, zshao, MarkCallaghan
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11535
2013-08-15 17:13:07 -07:00
|
|
|
} else {
|
2014-09-02 13:21:59 -07:00
|
|
|
s = db_->Put(write_opts, column_family, key, v);
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
|
|
|
|
std::terminate();
|
Benchmarking for Merge Operator
Summary:
Updated db_bench and utilities/merge_operators.h to allow for dynamic benchmarking
of merge operators in db_bench. Added a new test (--benchmarks=mergerandom), which performs
a bunch of random Merge() operations over random keys. Also added a "--merge_operator=" flag
so that the tester can easily benchmark different merge operators. Currently supports
the PutOperator and UInt64Add operator. Support for stringappend or list append may come later.
Test Plan:
1. make db_bench
2. Test the PutOperator (simulating Put) as follows:
./db_bench --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom --merge_operator=put
--threads=2
3. Test the UInt64AddOperator (simulating numeric addition) similarly:
./db_bench --value_size=8 --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom
--merge_operator=uint64add --threads=2
Reviewers: haobo, dhruba, zshao, MarkCallaghan
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11535
2013-08-15 17:13:07 -07:00
|
|
|
}
|
2013-02-20 15:57:27 -08:00
|
|
|
thread->stats.AddBytesForWrites(1, sz);
|
|
|
|
} else {
|
2014-02-27 12:13:48 -08:00
|
|
|
MultiPut(thread, write_opts, column_family, key, v, sz);
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
2014-11-11 16:47:22 -05:00
|
|
|
PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key),
|
|
|
|
value, sz);
|
2013-09-19 16:47:24 -07:00
|
|
|
} else if (writeBound <= prob_op && prob_op < delBound) {
|
|
|
|
// OPERATION delete
|
2013-08-14 16:58:36 -07:00
|
|
|
if (!FLAGS_test_batches_snapshots) {
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
// If the chosen key does not allow overwrite and it does not exist,
|
|
|
|
// choose another key.
|
|
|
|
while (!shared->AllowsOverwrite(rand_column_family, rand_key) &&
|
|
|
|
!shared->Exists(rand_column_family, rand_key)) {
|
|
|
|
l.reset();
|
|
|
|
rand_key = thread->rand.Next() % max_key;
|
|
|
|
rand_column_family = thread->rand.Next() % FLAGS_column_families;
|
|
|
|
l.reset(new MutexLock(
|
|
|
|
shared->GetMutexForKey(rand_column_family, rand_key)));
|
|
|
|
}
|
|
|
|
|
|
|
|
keystr = Key(rand_key);
|
|
|
|
key = keystr;
|
|
|
|
column_family = column_families_[rand_column_family];
|
|
|
|
|
|
|
|
// Use delete if the key may be overwritten and a single deletion
|
|
|
|
// otherwise.
|
|
|
|
if (shared->AllowsOverwrite(rand_column_family, rand_key)) {
|
|
|
|
shared->Delete(rand_column_family, rand_key);
|
|
|
|
Status s = db_->Delete(write_opts, column_family, key);
|
|
|
|
thread->stats.AddDeletes(1);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
|
|
|
|
std::terminate();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
shared->SingleDelete(rand_column_family, rand_key);
|
|
|
|
Status s = db_->SingleDelete(write_opts, column_family, key);
|
|
|
|
thread->stats.AddSingleDeletes(1);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "single delete error: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
std::terminate();
|
|
|
|
}
|
2014-09-02 13:21:59 -07:00
|
|
|
}
|
2013-08-14 16:58:36 -07:00
|
|
|
} else {
|
2014-02-27 12:13:48 -08:00
|
|
|
MultiDelete(thread, write_opts, column_family, key);
|
2013-08-14 16:58:36 -07:00
|
|
|
}
|
2013-09-19 16:47:24 -07:00
|
|
|
} else {
|
|
|
|
// OPERATION iterate
|
2014-02-27 12:13:48 -08:00
|
|
|
MultiIterate(thread, read_opts, column_family, key);
|
2013-08-14 16:58:36 -07:00
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
thread->stats.FinishedSingleOp();
|
|
|
|
}
|
2013-09-19 16:47:24 -07:00
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
thread->stats.Stop();
|
|
|
|
}
|
|
|
|
|
2013-10-01 23:19:51 -07:00
|
|
|
// Verifies this thread's share of the key space against the expected state
// held in thread->shared.
//
// The key range [0, max_key) is split evenly by thread id; the last thread
// also takes the remainder. For every column family the range is checked
// either by walking an iterator or by issuing one Get per key; the method is
// chosen by thread->rand.OneIn(2). Each key is passed to VerifyValue(), and
// any value found in the DB is passed to PrintKeyValue(). Verification stops
// early once a failure has been recorded in the shared state.
void VerifyDb(ThreadState* thread) const {
  ReadOptions options(FLAGS_verify_checksum, true);
  auto shared = thread->shared;
  const int64_t max_key = shared->GetMaxKey();
  const int64_t keys_per_thread = max_key / shared->GetNumThreads();
  int64_t start = keys_per_thread * thread->tid;
  int64_t end = start + keys_per_thread;
  if (thread->tid == shared->GetNumThreads() - 1) {
    end = max_key;
  }

  // Number of key bits that are not part of the prefix (keys are treated as
  // 8 bytes here). The iterator is re-seeked whenever the key index is a
  // multiple of 2^suffix_bits. Shifting by a negative count or by 64 or more
  // is undefined, so those cases are handled explicitly:
  //   * suffix_bits <= 0  -> every key has its own prefix: always reseek.
  //   * suffix_bits >= 63 -> the modulus exceeds any key index, so only
  //                          index 0 is a multiple.
  const int64_t suffix_bits =
      8 * (8 - static_cast<int64_t>(FLAGS_prefix_size));
  const auto at_prefix_boundary = [suffix_bits](int64_t key_index) -> bool {
    if (suffix_bits <= 0) {
      return true;
    }
    if (suffix_bits >= 63) {
      return key_index == 0;
    }
    return key_index % (static_cast<int64_t>(1) << suffix_bits) == 0;
  };

  for (size_t cf = 0; cf < column_families_.size(); ++cf) {
    if (thread->shared->HasVerificationFailedYet()) {
      break;
    }
    if (!thread->rand.OneIn(2)) {
      // Use iterator to verify this range
      std::unique_ptr<Iterator> iter(
          db_->NewIterator(options, column_families_[cf]));
      iter->Seek(Key(start));
      for (long i = start; i < end; i++) {
        if (thread->shared->HasVerificationFailedYet()) {
          break;
        }
        // TODO(ljin): update "long" to uint64_t
        // Reseek when the prefix changes
        if (at_prefix_boundary(i)) {
          iter->Seek(Key(i));
        }
        std::string from_db;
        std::string keystr = Key(i);
        Slice k = keystr;
        Status s = iter->status();
        if (iter->Valid()) {
          const int cmp = iter->key().compare(k);
          if (cmp > 0) {
            // The iterator is already past this key, so the key is absent.
            s = Status::NotFound(Slice());
          } else if (cmp == 0) {
            from_db = iter->value().ToString();
            iter->Next();
          } else {
            // The iterator is positioned before the key being checked.
            VerificationAbort(shared, "An out of range key was found",
                              static_cast<int>(cf), i);
          }
        } else {
          // The iterator found no value for the key in question, so do not
          // move to the next item in the iterator
          s = Status::NotFound(Slice());
        }
        VerifyValue(static_cast<int>(cf), i, options, shared, from_db, s,
                    true);
        if (from_db.length()) {
          PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
                        from_db.data(), from_db.length());
        }
      }
    } else {
      // Use Get to verify this range
      for (long i = start; i < end; i++) {
        if (thread->shared->HasVerificationFailedYet()) {
          break;
        }
        std::string from_db;
        std::string keystr = Key(i);
        Slice k = keystr;
        Status s = db_->Get(options, column_families_[cf], k, &from_db);
        VerifyValue(static_cast<int>(cf), i, options, shared, from_db, s,
                    true);
        if (from_db.length()) {
          PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
                        from_db.data(), from_db.length());
        }
      }
    }
  }
}
|
|
|
|
|
2014-04-24 09:22:58 -04:00
|
|
|
// Reports a verification failure for `key` in column family `cf`.
//
// Writes one line to stdout containing the column family, the key and `msg`,
// then records the failure on `shared` via SetVerificationFailure(). Nothing
// in this function itself terminates the process.
void VerificationAbort(SharedState* shared, std::string msg, int cf,
                       long key) const {
  const char* reason = msg.c_str();
  fprintf(stdout, "Verification failed for column family %d key %ld: %s\n",
          cf, key, reason);
  shared->SetVerificationFailure();
}
|
|
|
|
|
2014-04-24 09:22:58 -04:00
|
|
|
bool VerifyValue(int cf, long key, const ReadOptions& opts,
|
|
|
|
SharedState* shared, const std::string& value_from_db,
|
2014-02-27 12:13:48 -08:00
|
|
|
Status s, bool strict = false) const {
|
2014-04-24 09:22:58 -04:00
|
|
|
if (shared->HasVerificationFailedYet()) {
|
|
|
|
return false;
|
|
|
|
}
|
Phase 2 of iterator stress test
Summary: Using an iterator instead of the Get method, each thread goes through a portion of the database and verifies values by comparing to the shared state.
Test Plan:
./db_stress --db=/tmp/tmppp --max_key=10000 --ops_per_thread=10000
To test some basic cases, the following lines can be added (each set in turn) to the verifyDb method with the following expected results:
// Should abort with "Unexpected value found"
shared.Delete(start);
// Should abort with "Value not found"
WriteOptions write_opts;
db_->Delete(write_opts, Key(start));
// Should succeed
WriteOptions write_opts;
shared.Delete(start);
db_->Delete(write_opts, Key(start));
// Should abort with "Value not found"
WriteOptions write_opts;
db_->Delete(write_opts, Key(start + (end-start)/2));
// Should abort with "Value not found"
db_->Delete(write_opts, Key(end-1));
// Should abort with "Unexpected value"
shared.Delete(end-1);
// Should abort with "Unexpected value"
shared.Delete(start + (end-start)/2);
// Should abort with "Value not found"
db_->Delete(write_opts, Key(start));
shared.Delete(start);
db_->Delete(write_opts, Key(end-1));
db_->Delete(write_opts, Key(end-2));
To test the out of range abort, change the key in the for loop to Key(i+1), so that the key defined by the index i is now outside of the supposed range of the database.
Reviewers: emayanke
Reviewed By: emayanke
CC: dhruba, xjin
Differential Revision: https://reviews.facebook.net/D13071
2013-09-30 16:48:00 -07:00
|
|
|
// compare value_from_db with the value in the shared state
|
2012-11-07 15:35:08 -08:00
|
|
|
char value[100];
|
2014-04-24 09:22:58 -04:00
|
|
|
uint32_t value_base = shared->Get(cf, key);
|
2012-10-03 09:58:45 -07:00
|
|
|
if (value_base == SharedState::SENTINEL && !strict) {
|
2014-04-24 09:22:58 -04:00
|
|
|
return true;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
Phase 2 of iterator stress test
Summary: Using an iterator instead of the Get method, each thread goes through a portion of the database and verifies values by comparing to the shared state.
Test Plan:
./db_stress --db=/tmp/tmppp --max_key=10000 --ops_per_thread=10000
To test some basic cases, the following lines can be added (each set in turn) to the verifyDb method with the following expected results:
// Should abort with "Unexpected value found"
shared.Delete(start);
// Should abort with "Value not found"
WriteOptions write_opts;
db_->Delete(write_opts, Key(start));
// Should succeed
WriteOptions write_opts;
shared.Delete(start);
db_->Delete(write_opts, Key(start));
// Should abort with "Value not found"
WriteOptions write_opts;
db_->Delete(write_opts, Key(start + (end-start)/2));
// Should abort with "Value not found"
db_->Delete(write_opts, Key(end-1));
// Should abort with "Unexpected value"
shared.Delete(end-1);
// Should abort with "Unexpected value"
shared.Delete(start + (end-start)/2);
// Should abort with "Value not found"
db_->Delete(write_opts, Key(start));
shared.Delete(start);
db_->Delete(write_opts, Key(end-1));
db_->Delete(write_opts, Key(end-2));
To test the out of range abort, change the key in the for loop to Key(i+1), so that the key defined by the index i is now outside of the supposed range of the database.
Reviewers: emayanke
Reviewed By: emayanke
CC: dhruba, xjin
Differential Revision: https://reviews.facebook.net/D13071
2013-09-30 16:48:00 -07:00
|
|
|
if (s.ok()) {
|
2012-10-03 09:58:45 -07:00
|
|
|
if (value_base == SharedState::SENTINEL) {
|
2014-04-24 09:22:58 -04:00
|
|
|
VerificationAbort(shared, "Unexpected value found", cf, key);
|
|
|
|
return false;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
2013-02-20 17:37:13 -08:00
|
|
|
size_t sz = GenerateValue(value_base, value, sizeof(value));
|
Phase 2 of iterator stress test
Summary: Using an iterator instead of the Get method, each thread goes through a portion of the database and verifies values by comparing to the shared state.
Test Plan:
./db_stress --db=/tmp/tmppp --max_key=10000 --ops_per_thread=10000
To test some basic cases, the following lines can be added (each set in turn) to the verifyDb method with the following expected results:
// Should abort with "Unexpected value found"
shared.Delete(start);
// Should abort with "Value not found"
WriteOptions write_opts;
db_->Delete(write_opts, Key(start));
// Should succeed
WriteOptions write_opts;
shared.Delete(start);
db_->Delete(write_opts, Key(start));
// Should abort with "Value not found"
WriteOptions write_opts;
db_->Delete(write_opts, Key(start + (end-start)/2));
// Should abort with "Value not found"
db_->Delete(write_opts, Key(end-1));
// Should abort with "Unexpected value"
shared.Delete(end-1);
// Should abort with "Unexpected value"
shared.Delete(start + (end-start)/2);
// Should abort with "Value not found"
db_->Delete(write_opts, Key(start));
shared.Delete(start);
db_->Delete(write_opts, Key(end-1));
db_->Delete(write_opts, Key(end-2));
To test the out of range abort, change the key in the for loop to Key(i+1), so that the key defined by the index i is now outside of the supposed range of the database.
Reviewers: emayanke
Reviewed By: emayanke
CC: dhruba, xjin
Differential Revision: https://reviews.facebook.net/D13071
2013-09-30 16:48:00 -07:00
|
|
|
if (value_from_db.length() != sz) {
|
2014-04-24 09:22:58 -04:00
|
|
|
VerificationAbort(shared, "Length of value read is not equal", cf, key);
|
|
|
|
return false;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
Phase 2 of iterator stress test
Summary: Using an iterator instead of the Get method, each thread goes through a portion of the database and verifies values by comparing to the shared state.
Test Plan:
./db_stress --db=/tmp/tmppp --max_key=10000 --ops_per_thread=10000
To test some basic cases, the following lines can be added (each set in turn) to the verifyDb method with the following expected results:
// Should abort with "Unexpected value found"
shared.Delete(start);
// Should abort with "Value not found"
WriteOptions write_opts;
db_->Delete(write_opts, Key(start));
// Should succeed
WriteOptions write_opts;
shared.Delete(start);
db_->Delete(write_opts, Key(start));
// Should abort with "Value not found"
WriteOptions write_opts;
db_->Delete(write_opts, Key(start + (end-start)/2));
// Should abort with "Value not found"
db_->Delete(write_opts, Key(end-1));
// Should abort with "Unexpected value"
shared.Delete(end-1);
// Should abort with "Unexpected value"
shared.Delete(start + (end-start)/2);
// Should abort with "Value not found"
db_->Delete(write_opts, Key(start));
shared.Delete(start);
db_->Delete(write_opts, Key(end-1));
db_->Delete(write_opts, Key(end-2));
To test the out of range abort, change the key in the for loop to Key(i+1), so that the key defined by the index i is now outside of the supposed range of the database.
Reviewers: emayanke
Reviewed By: emayanke
CC: dhruba, xjin
Differential Revision: https://reviews.facebook.net/D13071
2013-09-30 16:48:00 -07:00
|
|
|
if (memcmp(value_from_db.data(), value, sz) != 0) {
|
2014-04-24 09:22:58 -04:00
|
|
|
VerificationAbort(shared, "Contents of value read don't match", cf,
|
|
|
|
key);
|
|
|
|
return false;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (value_base != SharedState::SENTINEL) {
|
2014-04-24 09:22:58 -04:00
|
|
|
VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key);
|
|
|
|
return false;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
}
|
2014-04-24 09:22:58 -04:00
|
|
|
return true;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
2014-02-27 12:13:48 -08:00
|
|
|
static void PrintKeyValue(int cf, uint32_t key, const char* value,
|
|
|
|
size_t sz) {
|
|
|
|
if (!FLAGS_verbose) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
fprintf(stdout, "[CF %d] %u ==> (%u) ", cf, key, (unsigned int)sz);
|
|
|
|
for (size_t i = 0; i < sz; i++) {
|
2012-10-03 09:58:45 -07:00
|
|
|
fprintf(stdout, "%X", value[i]);
|
|
|
|
}
|
|
|
|
fprintf(stdout, "\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) {
|
|
|
|
size_t value_sz = ((rand % 3) + 1) * FLAGS_value_size_mult;
|
|
|
|
assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t));
|
|
|
|
*((uint32_t*)v) = rand;
|
|
|
|
for (size_t i=sizeof(uint32_t); i < value_sz; i++) {
|
|
|
|
v[i] = (char)(rand ^ i);
|
|
|
|
}
|
2013-05-20 14:45:22 -07:00
|
|
|
v[value_sz] = '\0';
|
2012-10-03 09:58:45 -07:00
|
|
|
return value_sz; // the size of the value set.
|
|
|
|
}
|
|
|
|
|
|
|
|
void PrintEnv() const {
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "RocksDB version : %d.%d\n", kMajorVersion,
|
2014-02-27 12:13:48 -08:00
|
|
|
kMinorVersion);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Column families : %d\n", FLAGS_column_families);
|
2014-02-27 12:13:48 -08:00
|
|
|
if (!FLAGS_test_batches_snapshots) {
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Clear CFs one in : %d\n",
|
2014-02-27 12:13:48 -08:00
|
|
|
FLAGS_clear_column_family_one_in);
|
|
|
|
}
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Number of threads : %d\n", FLAGS_threads);
|
|
|
|
fprintf(stdout, "Ops per thread : %lu\n",
|
2013-11-16 23:44:39 -08:00
|
|
|
(unsigned long)FLAGS_ops_per_thread);
|
Timestamp and TTL Wrapper for rocksdb
Summary:
When opened with DBTimestamp::Open call, timestamps are prepended to and stripped from the value during subsequent Put and Get calls respectively. The Timestamp is used to discard values in Get and custom compaction filter which have exceeded their TTL which is specified during Open.
Have made a temporary change to Makefile to let us test with the temporary file TestTime.cc. Have also changed the private members of db_impl.h to protected to let them be inherited by the new class DBTimestamp
Test Plan: make db_timestamp; TestTime.cc(will not check it in) shows how to use the apis currently, but I will write unit-tests shortly
Reviewers: dhruba, vamsi, haobo, sheki, heyongqiang, vkrest
Reviewed By: vamsi
CC: zshao, xjin, vkrest, MarkCallaghan
Differential Revision: https://reviews.facebook.net/D10311
2013-04-15 13:33:13 -07:00
|
|
|
std::string ttl_state("unused");
|
|
|
|
if (FLAGS_ttl > 0) {
|
|
|
|
ttl_state = NumberToString(FLAGS_ttl);
|
|
|
|
}
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Time to live(sec) : %s\n", ttl_state.c_str());
|
|
|
|
fprintf(stdout, "Read percentage : %d%%\n", FLAGS_readpercent);
|
|
|
|
fprintf(stdout, "Prefix percentage : %d%%\n", FLAGS_prefixpercent);
|
|
|
|
fprintf(stdout, "Write percentage : %d%%\n", FLAGS_writepercent);
|
|
|
|
fprintf(stdout, "Delete percentage : %d%%\n", FLAGS_delpercent);
|
|
|
|
fprintf(stdout, "No overwrite percentage : %d%%\n",
|
|
|
|
FLAGS_nooverwritepercent);
|
|
|
|
fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent);
|
|
|
|
fprintf(stdout, "DB-write-buffer-size : %" PRIu64 "\n",
|
|
|
|
FLAGS_db_write_buffer_size);
|
|
|
|
fprintf(stdout, "Write-buffer-size : %d\n",
|
|
|
|
FLAGS_write_buffer_size);
|
|
|
|
fprintf(stdout, "Iterations : %lu\n",
|
2013-11-16 23:44:39 -08:00
|
|
|
(unsigned long)FLAGS_num_iterations);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Max key : %lu\n",
|
2013-11-16 23:44:39 -08:00
|
|
|
(unsigned long)FLAGS_max_key);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Ratio #ops/#keys : %f\n",
|
|
|
|
(1.0 * FLAGS_ops_per_thread * FLAGS_threads) / FLAGS_max_key);
|
|
|
|
fprintf(stdout, "Num times DB reopens : %d\n", FLAGS_reopen);
|
|
|
|
fprintf(stdout, "Batches/snapshots : %d\n",
|
2013-02-20 15:57:27 -08:00
|
|
|
FLAGS_test_batches_snapshots);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Deletes use filter : %d\n", FLAGS_filter_deletes);
|
|
|
|
fprintf(stdout, "Do update in place : %d\n", FLAGS_in_place_update);
|
|
|
|
fprintf(stdout, "Num keys per lock : %d\n",
|
2012-10-03 09:58:45 -07:00
|
|
|
1 << FLAGS_log2_keys_per_lock);
|
2015-08-11 11:46:15 -07:00
|
|
|
std::string compression = CompressionTypeToString(FLAGS_compression_type_e);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Compression : %s\n", compression.c_str());
|
2013-08-22 23:10:02 -07:00
|
|
|
|
|
|
|
const char* memtablerep = "";
|
|
|
|
switch (FLAGS_rep_factory) {
|
|
|
|
case kSkipList:
|
|
|
|
memtablerep = "skip_list";
|
|
|
|
break;
|
2013-12-03 12:42:15 -08:00
|
|
|
case kHashSkipList:
|
2013-08-22 23:10:02 -07:00
|
|
|
memtablerep = "prefix_hash";
|
|
|
|
break;
|
|
|
|
case kVectorRep:
|
|
|
|
memtablerep = "vector";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
fprintf(stdout, "Memtablerep : %s\n", memtablerep);
|
2013-08-22 23:10:02 -07:00
|
|
|
|
2015-10-14 14:08:50 -07:00
|
|
|
fprintf(stdout, "Test kill odd : %d\n", rocksdb_kill_odds);
|
|
|
|
if (!rocksdb_kill_prefix_blacklist.empty()) {
|
|
|
|
fprintf(stdout, "Skipping kill points prefixes:\n");
|
|
|
|
for (auto& p : rocksdb_kill_prefix_blacklist) {
|
|
|
|
fprintf(stdout, " %s\n", p.c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
fprintf(stdout, "------------------------------------------------\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
void Open() {
|
2013-02-15 11:53:17 -08:00
|
|
|
assert(db_ == nullptr);
|
2014-08-25 14:22:05 -07:00
|
|
|
BlockBasedTableOptions block_based_options;
|
|
|
|
block_based_options.block_cache = cache_;
|
|
|
|
block_based_options.block_cache_compressed = compressed_cache_;
|
|
|
|
block_based_options.block_size = FLAGS_block_size;
|
2015-01-14 16:25:36 -08:00
|
|
|
block_based_options.format_version = 2;
|
2014-08-25 14:22:05 -07:00
|
|
|
block_based_options.filter_policy = filter_policy_;
|
|
|
|
options_.table_factory.reset(
|
|
|
|
NewBlockBasedTableFactory(block_based_options));
|
2014-12-02 12:09:20 -08:00
|
|
|
options_.db_write_buffer_size = FLAGS_db_write_buffer_size;
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.write_buffer_size = FLAGS_write_buffer_size;
|
|
|
|
options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
|
|
|
|
options_.min_write_buffer_number_to_merge =
|
|
|
|
FLAGS_min_write_buffer_number_to_merge;
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-28 16:34:24 -07:00
|
|
|
options_.max_write_buffer_number_to_maintain =
|
|
|
|
FLAGS_max_write_buffer_number_to_maintain;
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.max_background_compactions = FLAGS_max_background_compactions;
|
|
|
|
options_.max_background_flushes = FLAGS_max_background_flushes;
|
|
|
|
options_.compaction_style =
|
|
|
|
static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
|
|
|
|
options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
|
|
|
|
options_.max_open_files = FLAGS_open_files;
|
|
|
|
options_.statistics = dbstats;
|
|
|
|
options_.env = FLAGS_env;
|
|
|
|
options_.disableDataSync = FLAGS_disable_data_sync;
|
|
|
|
options_.use_fsync = FLAGS_use_fsync;
|
|
|
|
options_.allow_mmap_reads = FLAGS_mmap_read;
|
|
|
|
options_.target_file_size_base = FLAGS_target_file_size_base;
|
|
|
|
options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
|
|
|
|
options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
|
|
|
|
options_.max_bytes_for_level_multiplier =
|
2012-10-03 09:58:45 -07:00
|
|
|
FLAGS_max_bytes_for_level_multiplier;
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
|
|
|
|
options_.level0_slowdown_writes_trigger =
|
|
|
|
FLAGS_level0_slowdown_writes_trigger;
|
|
|
|
options_.level0_file_num_compaction_trigger =
|
|
|
|
FLAGS_level0_file_num_compaction_trigger;
|
|
|
|
options_.compression = FLAGS_compression_type_e;
|
|
|
|
options_.create_if_missing = true;
|
|
|
|
options_.max_manifest_file_size = 10 * 1024;
|
|
|
|
options_.filter_deletes = FLAGS_filter_deletes;
|
2014-10-31 12:02:14 -07:00
|
|
|
options_.inplace_update_support = FLAGS_in_place_update;
|
2015-08-21 14:25:34 -07:00
|
|
|
options_.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
|
2013-12-03 12:42:15 -08:00
|
|
|
if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
|
2013-08-22 23:10:02 -07:00
|
|
|
fprintf(stderr,
|
|
|
|
"prefix_size should be non-zero iff memtablerep == prefix_hash\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
switch (FLAGS_rep_factory) {
|
|
|
|
case kSkipList:
|
|
|
|
// no need to do anything
|
|
|
|
break;
|
2014-11-12 13:05:12 -08:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
case kHashSkipList:
|
|
|
|
options_.memtable_factory.reset(NewHashSkipListRepFactory(10000));
|
|
|
|
break;
|
2013-08-22 23:10:02 -07:00
|
|
|
case kVectorRep:
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.memtable_factory.reset(new VectorRepFactory());
|
2013-08-22 23:10:02 -07:00
|
|
|
break;
|
2014-11-12 13:05:12 -08:00
|
|
|
#else
|
|
|
|
default:
|
|
|
|
fprintf(stderr,
|
|
|
|
"RocksdbLite only supports skip list mem table. Skip "
|
|
|
|
"--rep_factory\n");
|
|
|
|
#endif // ROCKSDB_LITE
|
2013-08-22 23:10:02 -07:00
|
|
|
}
|
2014-11-12 13:05:12 -08:00
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
if (FLAGS_use_merge) {
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.merge_operator = MergeOperators::CreatePutOperator();
|
Benchmarking for Merge Operator
Summary:
Updated db_bench and utilities/merge_operators.h to allow for dynamic benchmarking
of merge operators in db_bench. Added a new test (--benchmarks=mergerandom), which performs
a bunch of random Merge() operations over random keys. Also added a "--merge_operator=" flag
so that the tester can easily benchmark different merge operators. Currently supports
the PutOperator and UInt64Add operator. Support for stringappend or list append may come later.
Test Plan:
1. make db_bench
2. Test the PutOperator (simulating Put) as follows:
./db_bench --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom --merge_operator=put
--threads=2
3. Test the UInt64AddOperator (simulating numeric addition) similarly:
./db_bench --value_size=8 --benchmarks=fillrandom,readrandom,updaterandom,readrandom,mergerandom,readrandom
--merge_operator=uint64add --threads=2
Reviewers: haobo, dhruba, zshao, MarkCallaghan
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11535
2013-08-15 17:13:07 -07:00
|
|
|
}
|
|
|
|
|
2013-09-09 16:06:10 -07:00
|
|
|
// set universal style compaction configurations, if applicable
|
|
|
|
if (FLAGS_universal_size_ratio != 0) {
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.compaction_options_universal.size_ratio =
|
|
|
|
FLAGS_universal_size_ratio;
|
2013-09-09 16:06:10 -07:00
|
|
|
}
|
|
|
|
if (FLAGS_universal_min_merge_width != 0) {
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.compaction_options_universal.min_merge_width =
|
|
|
|
FLAGS_universal_min_merge_width;
|
2013-09-09 16:06:10 -07:00
|
|
|
}
|
|
|
|
if (FLAGS_universal_max_merge_width != 0) {
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.compaction_options_universal.max_merge_width =
|
|
|
|
FLAGS_universal_max_merge_width;
|
2013-09-09 16:06:10 -07:00
|
|
|
}
|
|
|
|
if (FLAGS_universal_max_size_amplification_percent != 0) {
|
2014-02-27 12:13:48 -08:00
|
|
|
options_.compaction_options_universal.max_size_amplification_percent =
|
|
|
|
FLAGS_universal_max_size_amplification_percent;
|
2013-09-09 16:06:10 -07:00
|
|
|
}
|
|
|
|
|
2013-10-24 07:43:14 -07:00
|
|
|
fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
|
2013-06-17 16:13:32 -07:00
|
|
|
|
Timestamp and TTL Wrapper for rocksdb
Summary:
When opened with DBTimestamp::Open call, timestamps are prepended to and stripped from the value during subsequent Put and Get calls respectively. The Timestamp is used to discard values in Get and custom compaction filter which have exceeded their TTL which is specified during Open.
Have made a temporary change to Makefile to let us test with the temporary file TestTime.cc. Have also changed the private members of db_impl.h to protected to let them be inherited by the new class DBTimestamp
Test Plan: make db_timestamp; TestTime.cc(will not check it in) shows how to use the apis currently, but I will write unit-tests shortly
Reviewers: dhruba, vamsi, haobo, sheki, heyongqiang, vkrest
Reviewed By: vamsi
CC: zshao, xjin, vkrest, MarkCallaghan
Differential Revision: https://reviews.facebook.net/D10311
2013-04-15 13:33:13 -07:00
|
|
|
Status s;
|
|
|
|
if (FLAGS_ttl == -1) {
|
2014-02-27 12:13:48 -08:00
|
|
|
std::vector<std::string> existing_column_families;
|
|
|
|
s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
|
|
|
|
&existing_column_families); // ignore errors
|
|
|
|
if (!s.ok()) {
|
|
|
|
// DB doesn't exist
|
|
|
|
assert(existing_column_families.empty());
|
|
|
|
assert(column_family_names_.empty());
|
2014-04-09 09:56:17 -07:00
|
|
|
column_family_names_.push_back(kDefaultColumnFamilyName);
|
2014-02-27 12:13:48 -08:00
|
|
|
} else if (column_family_names_.empty()) {
|
|
|
|
// this is the first call to the function Open()
|
|
|
|
column_family_names_ = existing_column_families;
|
|
|
|
} else {
|
|
|
|
// this is a reopen. just assert that existing column_family_names are
|
|
|
|
// equivalent to what we remember
|
|
|
|
auto sorted_cfn = column_family_names_;
|
|
|
|
sort(sorted_cfn.begin(), sorted_cfn.end());
|
|
|
|
sort(existing_column_families.begin(), existing_column_families.end());
|
|
|
|
if (sorted_cfn != existing_column_families) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Expected column families differ from the existing:\n");
|
|
|
|
printf("Expected: {");
|
|
|
|
for (auto cf : sorted_cfn) {
|
|
|
|
printf("%s ", cf.c_str());
|
|
|
|
}
|
|
|
|
printf("}\n");
|
|
|
|
printf("Existing: {");
|
|
|
|
for (auto cf : existing_column_families) {
|
|
|
|
printf("%s ", cf.c_str());
|
|
|
|
}
|
|
|
|
printf("}\n");
|
|
|
|
}
|
|
|
|
assert(sorted_cfn == existing_column_families);
|
|
|
|
}
|
|
|
|
std::vector<ColumnFamilyDescriptor> cf_descriptors;
|
|
|
|
for (auto name : column_family_names_) {
|
2014-04-09 09:56:17 -07:00
|
|
|
if (name != kDefaultColumnFamilyName) {
|
2014-02-27 12:13:48 -08:00
|
|
|
new_column_family_name_ =
|
|
|
|
std::max(new_column_family_name_.load(), std::stoi(name) + 1);
|
|
|
|
}
|
|
|
|
cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
|
|
|
|
}
|
2014-06-06 18:04:56 -07:00
|
|
|
while (cf_descriptors.size() < (size_t)FLAGS_column_families) {
|
2014-11-24 20:44:49 -08:00
|
|
|
std::string name = ToString(new_column_family_name_.load());
|
2014-06-06 18:04:56 -07:00
|
|
|
new_column_family_name_++;
|
|
|
|
cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
|
|
|
|
column_family_names_.push_back(name);
|
|
|
|
}
|
2015-05-29 13:17:49 -07:00
|
|
|
options_.listeners.clear();
|
|
|
|
options_.listeners.emplace_back(
|
2015-05-30 13:00:23 -07:00
|
|
|
new DbStressListener(FLAGS_db, options_.db_paths));
|
2014-06-06 18:04:56 -07:00
|
|
|
options_.create_missing_column_families = true;
|
2014-02-27 12:13:48 -08:00
|
|
|
s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
|
|
|
|
&column_families_, &db_);
|
|
|
|
assert(!s.ok() || column_families_.size() ==
|
|
|
|
static_cast<size_t>(FLAGS_column_families));
|
Timestamp and TTL Wrapper for rocksdb
Summary:
When opened with DBTimestamp::Open call, timestamps are prepended to and stripped from the value during subsequent Put and Get calls respectively. The Timestamp is used to discard values in Get and custom compaction filter which have exceeded their TTL which is specified during Open.
Have made a temporary change to Makefile to let us test with the temporary file TestTime.cc. Have also changed the private members of db_impl.h to protected to let them be inherited by the new class DBTimestamp
Test Plan: make db_timestamp; TestTime.cc(will not check it in) shows how to use the apis currently, but I will write unit-tests shortly
Reviewers: dhruba, vamsi, haobo, sheki, heyongqiang, vkrest
Reviewed By: vamsi
CC: zshao, xjin, vkrest, MarkCallaghan
Differential Revision: https://reviews.facebook.net/D10311
2013-04-15 13:33:13 -07:00
|
|
|
} else {
|
2014-11-12 13:05:12 -08:00
|
|
|
#ifndef ROCKSDB_LITE
|
2014-04-28 23:44:33 -04:00
|
|
|
DBWithTTL* db_with_ttl;
|
|
|
|
s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl);
|
|
|
|
db_ = db_with_ttl;
|
2014-11-12 13:05:12 -08:00
|
|
|
#else
|
|
|
|
fprintf(stderr, "TTL is not supported in RocksDBLite\n");
|
|
|
|
exit(1);
|
|
|
|
#endif
|
Timestamp and TTL Wrapper for rocksdb
Summary:
When opened with DBTimestamp::Open call, timestamps are prepended to and stripped from the value during subsequent Put and Get calls respectively. The Timestamp is used to discard values in Get and custom compaction filter which have exceeded their TTL which is specified during Open.
Have made a temporary change to Makefile to let us test with the temporary file TestTime.cc. Have also changed the private members of db_impl.h to protected to let them be inherited by the new class DBTimestamp
Test Plan: make db_timestamp; TestTime.cc(will not check it in) shows how to use the apis currently, but I will write unit-tests shortly
Reviewers: dhruba, vamsi, haobo, sheki, heyongqiang, vkrest
Reviewed By: vamsi
CC: zshao, xjin, vkrest, MarkCallaghan
Differential Revision: https://reviews.facebook.net/D10311
2013-04-15 13:33:13 -07:00
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-11-09 13:04:12 -08:00
|
|
|
void Reopen() {
|
2014-02-27 12:13:48 -08:00
|
|
|
for (auto cf : column_families_) {
|
|
|
|
delete cf;
|
Timestamp and TTL Wrapper for rocksdb
Summary:
When opened with DBTimestamp::Open call, timestamps are prepended to and stripped from the value during subsequent Put and Get calls respectively. The Timestamp is used to discard values in Get and custom compaction filter which have exceeded their TTL which is specified during Open.
Have made a temporary change to Makefile to let us test with the temporary file TestTime.cc. Have also changed the private members of db_impl.h to protected to let them be inherited by the new class DBTimestamp
Test Plan: make db_timestamp; TestTime.cc(will not check it in) shows how to use the apis currently, but I will write unit-tests shortly
Reviewers: dhruba, vamsi, haobo, sheki, heyongqiang, vkrest
Reviewed By: vamsi
CC: zshao, xjin, vkrest, MarkCallaghan
Differential Revision: https://reviews.facebook.net/D10311
2013-04-15 13:33:13 -07:00
|
|
|
}
|
2014-02-27 12:13:48 -08:00
|
|
|
column_families_.clear();
|
|
|
|
delete db_;
|
2013-02-15 11:53:17 -08:00
|
|
|
db_ = nullptr;
|
2012-11-16 15:28:14 -08:00
|
|
|
|
|
|
|
num_times_reopened_++;
|
|
|
|
double now = FLAGS_env->NowMicros();
|
|
|
|
fprintf(stdout, "%s Reopening database for the %dth time\n",
|
|
|
|
FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
|
|
|
|
num_times_reopened_);
|
2012-11-09 13:04:12 -08:00
|
|
|
Open();
|
|
|
|
}
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
void PrintStatistics() {
|
|
|
|
if (dbstats) {
|
2013-07-10 13:17:51 -07:00
|
|
|
fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2014-08-25 14:22:05 -07:00
|
|
|
std::shared_ptr<Cache> cache_;
|
|
|
|
std::shared_ptr<Cache> compressed_cache_;
|
|
|
|
std::shared_ptr<const FilterPolicy> filter_policy_;
|
2012-10-03 09:58:45 -07:00
|
|
|
DB* db_;
|
2014-02-27 12:13:48 -08:00
|
|
|
Options options_;
|
|
|
|
std::vector<ColumnFamilyHandle*> column_families_;
|
|
|
|
std::vector<std::string> column_family_names_;
|
|
|
|
std::atomic<int> new_column_family_name_;
|
2012-11-16 15:28:14 -08:00
|
|
|
int num_times_reopened_;
|
2014-10-27 12:11:16 -07:00
|
|
|
std::unordered_map<std::string, std::vector<std::string>> options_table_;
|
|
|
|
std::vector<std::string> options_index_;
|
2012-10-03 09:58:45 -07:00
|
|
|
};
|
|
|
|
|
2013-10-03 21:49:15 -07:00
|
|
|
} // namespace rocksdb
|
2012-10-03 09:58:45 -07:00
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2014-05-08 17:25:13 -07:00
|
|
|
SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
|
|
|
|
" [OPTIONS]...");
|
|
|
|
ParseCommandLineFlags(&argc, &argv, true);
|
2013-10-24 07:43:14 -07:00
|
|
|
|
|
|
|
if (FLAGS_statistics) {
|
|
|
|
dbstats = rocksdb::CreateDBStatistics();
|
|
|
|
}
|
|
|
|
FLAGS_compression_type_e =
|
|
|
|
StringToCompressionType(FLAGS_compression_type.c_str());
|
|
|
|
if (!FLAGS_hdfs.empty()) {
|
|
|
|
FLAGS_env = new rocksdb::HdfsEnv(FLAGS_hdfs);
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
2013-10-24 07:43:14 -07:00
|
|
|
FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
|
2012-10-03 09:58:45 -07:00
|
|
|
|
2012-11-14 22:00:11 -08:00
|
|
|
// The number of background threads should be at least as much the
|
|
|
|
// max number of concurrent compactions.
|
|
|
|
FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
|
|
|
|
|
2014-03-11 13:44:33 -07:00
|
|
|
if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size <= 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: prefixpercent is non-zero while prefix_size is "
|
|
|
|
"not positive!\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: please specify prefix_size for "
|
|
|
|
"test_batches_snapshots test!\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
2013-08-14 16:58:36 -07:00
|
|
|
if ((FLAGS_readpercent + FLAGS_prefixpercent +
|
2013-09-19 16:47:24 -07:00
|
|
|
FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: Read+Prefix+Write+Delete+Iterate percents != 100!\n");
|
2012-11-07 15:35:08 -08:00
|
|
|
exit(1);
|
|
|
|
}
|
2013-05-21 11:27:23 -07:00
|
|
|
if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) {
|
|
|
|
fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
2013-04-08 11:50:44 -07:00
|
|
|
if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) {
|
2013-11-16 23:44:39 -08:00
|
|
|
fprintf(stderr,
|
|
|
|
"Error: #DB-reopens should be < ops_per_thread\n"
|
|
|
|
"Provided reopens = %d and ops_per_thread = %lu\n",
|
|
|
|
FLAGS_reopen,
|
|
|
|
(unsigned long)FLAGS_ops_per_thread);
|
2013-04-08 11:50:44 -07:00
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2012-10-03 09:58:45 -07:00
|
|
|
// Choose a location for the test database if none given with --db=<path>
|
2013-10-24 07:43:14 -07:00
|
|
|
if (FLAGS_db.empty()) {
|
|
|
|
std::string default_db_path;
|
2013-10-03 21:49:15 -07:00
|
|
|
rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
|
2012-10-03 09:58:45 -07:00
|
|
|
default_db_path += "/dbstress";
|
2013-10-24 07:43:14 -07:00
|
|
|
FLAGS_db = default_db_path;
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
|
|
|
|
2015-10-14 14:08:50 -07:00
|
|
|
rocksdb_kill_odds = FLAGS_kill_random_test;
|
|
|
|
rocksdb_kill_prefix_blacklist = SplitString(FLAGS_kill_prefix_blacklist);
|
|
|
|
|
2013-10-03 21:49:15 -07:00
|
|
|
rocksdb::StressTest stress;
|
2014-04-24 09:22:58 -04:00
|
|
|
if (stress.Run()) {
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
return 1;
|
|
|
|
}
|
2012-10-03 09:58:45 -07:00
|
|
|
}
|
2014-05-08 17:25:13 -07:00
|
|
|
|
|
|
|
#endif // GFLAGS
|