2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2013-10-16 23:59:46 +02:00
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/db_impl.h"
|
|
|
|
|
2014-09-05 08:14:37 +02:00
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
2014-02-26 23:16:23 +01:00
|
|
|
#define __STDC_FORMAT_MACROS
|
2014-09-05 08:14:37 +02:00
|
|
|
#endif
|
|
|
|
|
2014-02-26 23:16:23 +01:00
|
|
|
#include <inttypes.h>
|
2015-09-02 22:58:22 +02:00
|
|
|
#include <stdint.h>
|
2016-01-19 07:17:31 +01:00
|
|
|
#ifdef OS_SOLARIS
|
2016-01-19 05:45:21 +01:00
|
|
|
#include <alloca.h>
|
2016-01-19 07:17:31 +01:00
|
|
|
#endif
|
2015-09-02 22:58:22 +02:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
#include <algorithm>
|
2012-11-26 22:56:45 +01:00
|
|
|
#include <climits>
|
|
|
|
#include <cstdio>
|
2015-11-11 07:58:01 +01:00
|
|
|
#include <map>
|
2011-03-18 23:37:00 +01:00
|
|
|
#include <set>
|
2013-06-21 01:58:59 +02:00
|
|
|
#include <stdexcept>
|
2013-10-16 20:50:50 +02:00
|
|
|
#include <string>
|
2014-01-28 20:05:04 +01:00
|
|
|
#include <unordered_map>
|
2015-09-02 22:58:22 +02:00
|
|
|
#include <unordered_set>
|
2014-01-17 06:56:26 +01:00
|
|
|
#include <utility>
|
2013-10-16 20:50:50 +02:00
|
|
|
#include <vector>
|
2012-11-26 22:56:45 +01:00
|
|
|
|
2016-01-26 02:49:58 +01:00
|
|
|
#include "db/auto_roll_logger.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/builder.h"
|
2014-11-01 00:31:25 +01:00
|
|
|
#include "db/compaction_job.h"
|
2016-01-26 02:49:58 +01:00
|
|
|
#include "db/db_info_dumper.h"
|
2013-10-16 20:50:50 +02:00
|
|
|
#include "db/db_iter.h"
|
2013-12-31 03:33:57 +01:00
|
|
|
#include "db/dbformat.h"
|
2015-05-28 22:37:47 +02:00
|
|
|
#include "db/event_helpers.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/filename.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "db/flush_job.h"
|
|
|
|
#include "db/forward_iterator.h"
|
2014-11-15 00:43:10 +01:00
|
|
|
#include "db/job_context.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/log_reader.h"
|
|
|
|
#include "db/log_writer.h"
|
2015-02-18 20:49:31 +01:00
|
|
|
#include "db/managed_iterator.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/memtable.h"
|
2013-12-11 04:03:13 +01:00
|
|
|
#include "db/memtable_list.h"
|
2013-12-03 03:34:05 +01:00
|
|
|
#include "db/merge_context.h"
|
2013-03-21 23:59:47 +01:00
|
|
|
#include "db/merge_helper.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/table_cache.h"
|
2013-11-20 01:29:42 +01:00
|
|
|
#include "db/table_properties_collector.h"
|
2013-10-16 20:50:50 +02:00
|
|
|
#include "db/transaction_log_impl.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "db/write_batch_internal.h"
|
2015-05-29 23:36:35 +02:00
|
|
|
#include "db/write_callback.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "db/writebuffer.h"
|
2016-01-25 22:27:50 +01:00
|
|
|
#include "db/xfunc_test_points.h"
|
2015-10-16 23:10:33 +02:00
|
|
|
#include "memtable/hash_linklist_rep.h"
|
|
|
|
#include "memtable/hash_skiplist_rep.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "port/likely.h"
|
2013-10-16 20:50:50 +02:00
|
|
|
#include "port/port.h"
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
2014-02-05 18:07:55 +01:00
|
|
|
#include "rocksdb/cache.h"
|
2013-08-23 17:38:13 +02:00
|
|
|
#include "rocksdb/compaction_filter.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/merge_operator.h"
|
2015-09-23 21:42:43 +02:00
|
|
|
#include "rocksdb/sst_file_writer.h"
|
2013-08-23 17:38:13 +02:00
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/status.h"
|
2013-10-29 01:54:09 +01:00
|
|
|
#include "rocksdb/table.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "rocksdb/version.h"
|
2015-10-13 02:03:03 +02:00
|
|
|
#include "rocksdb/wal_filter.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/block.h"
|
2013-11-20 07:00:48 +01:00
|
|
|
#include "table/block_based_table_factory.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/merger.h"
|
2014-01-28 06:58:46 +01:00
|
|
|
#include "table/table_builder.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "table/two_level_iterator.h"
|
2014-01-28 06:58:46 +01:00
|
|
|
#include "util/autovector.h"
|
2013-02-15 20:53:17 +01:00
|
|
|
#include "util/build_version.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "util/coding.h"
|
2015-04-06 21:50:44 +02:00
|
|
|
#include "util/compression.h"
|
2015-07-11 02:35:27 +02:00
|
|
|
#include "util/crc32c.h"
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
#include "util/file_reader_writer.h"
|
2014-12-16 06:48:16 +01:00
|
|
|
#include "util/file_util.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "util/iostats_context_imp.h"
|
2014-03-10 06:01:13 +01:00
|
|
|
#include "util/log_buffer.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "util/logging.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "util/mutexlock.h"
|
2016-01-29 03:35:01 +01:00
|
|
|
#include "util/sst_file_manager_impl.h"
|
2015-11-11 07:58:01 +01:00
|
|
|
#include "util/options_helper.h"
|
|
|
|
#include "util/options_parser.h"
|
2013-10-23 08:26:51 +02:00
|
|
|
#include "util/perf_context_imp.h"
|
2013-02-15 20:53:17 +01:00
|
|
|
#include "util/stop_watch.h"
|
2014-11-25 05:44:49 +01:00
|
|
|
#include "util/string_util.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "util/sync_point.h"
|
2014-12-22 21:20:17 +01:00
|
|
|
#include "util/thread_status_updater.h"
|
|
|
|
#include "util/thread_status_util.h"
|
2015-02-18 20:49:31 +01:00
|
|
|
#include "util/xfunc.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
namespace rocksdb {
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-04-09 18:56:17 +02:00
|
|
|
const std::string kDefaultColumnFamilyName("default");
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
|
2014-10-07 19:40:57 +02:00
|
|
|
void DumpRocksDBBuildVersion(Logger * log);
|
2012-09-18 22:18:32 +02:00
|
|
|
|
2014-08-12 07:10:32 +02:00
|
|
|
struct DBImpl::WriteContext {
|
|
|
|
autovector<SuperVersion*> superversions_to_free_;
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
autovector<MemTable*> memtables_to_free_;
|
2014-08-12 07:10:32 +02:00
|
|
|
|
|
|
|
~WriteContext() {
|
|
|
|
for (auto& sv : superversions_to_free_) {
|
|
|
|
delete sv;
|
|
|
|
}
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
for (auto& m : memtables_to_free_) {
|
|
|
|
delete m;
|
|
|
|
}
|
2014-08-12 07:10:32 +02:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
Options SanitizeOptions(const std::string& dbname,
|
|
|
|
const InternalKeyComparator* icmp,
|
|
|
|
const Options& src) {
|
2014-02-05 01:31:18 +01:00
|
|
|
auto db_options = SanitizeOptions(dbname, DBOptions(src));
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
auto cf_options = SanitizeOptions(db_options, icmp, ColumnFamilyOptions(src));
|
2014-02-05 01:31:18 +01:00
|
|
|
return Options(db_options, cf_options);
|
|
|
|
}
|
|
|
|
|
|
|
|
DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
|
|
|
|
DBOptions result = src;
|
2014-07-28 21:10:49 +02:00
|
|
|
|
2014-01-07 05:29:17 +01:00
|
|
|
// result.max_open_files means an "infinite" open files.
|
|
|
|
if (result.max_open_files != -1) {
|
2015-09-10 19:49:28 +02:00
|
|
|
int max_max_open_files = port::GetMaxOpenFiles();
|
|
|
|
if (max_max_open_files == -1) {
|
|
|
|
max_max_open_files = 1000000;
|
|
|
|
}
|
|
|
|
ClipToRange(&result.max_open_files, 20, max_max_open_files);
|
2014-01-07 05:29:17 +01:00
|
|
|
}
|
2014-02-05 01:31:18 +01:00
|
|
|
|
2013-02-15 20:53:17 +01:00
|
|
|
if (result.info_log == nullptr) {
|
2015-10-30 02:07:37 +01:00
|
|
|
Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
|
2011-03-18 23:37:00 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
// No place suitable for logging
|
2013-02-15 20:53:17 +01:00
|
|
|
result.info_log = nullptr;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
2016-01-28 20:56:16 +01:00
|
|
|
if (result.base_background_compactions == -1) {
|
|
|
|
result.base_background_compactions = result.max_background_compactions;
|
|
|
|
}
|
|
|
|
if (result.base_background_compactions > result.max_background_compactions) {
|
|
|
|
result.base_background_compactions = result.max_background_compactions;
|
|
|
|
}
|
2014-11-03 23:11:33 +01:00
|
|
|
result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes,
|
|
|
|
Env::Priority::HIGH);
|
2013-10-01 23:46:52 +02:00
|
|
|
|
2015-03-06 23:21:15 +01:00
|
|
|
if (result.rate_limiter.get() != nullptr) {
|
2014-08-13 01:42:18 +02:00
|
|
|
if (result.bytes_per_sync == 0) {
|
|
|
|
result.bytes_per_sync = 1024 * 1024;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-28 21:29:31 +02:00
|
|
|
if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
|
|
|
|
result.recycle_log_file_num = false;
|
|
|
|
}
|
|
|
|
|
2013-10-01 23:46:52 +02:00
|
|
|
if (result.wal_dir.empty()) {
|
|
|
|
// Use dbname as default
|
|
|
|
result.wal_dir = dbname;
|
|
|
|
}
|
2014-04-07 19:25:38 +02:00
|
|
|
if (result.wal_dir.back() == '/') {
|
2014-04-07 20:36:03 +02:00
|
|
|
result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
|
2014-04-07 19:25:38 +02:00
|
|
|
}
|
2013-10-16 20:50:50 +02:00
|
|
|
|
2014-07-02 18:54:20 +02:00
|
|
|
if (result.db_paths.size() == 0) {
|
2014-07-15 00:34:30 +02:00
|
|
|
result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
|
2014-07-02 18:54:20 +02:00
|
|
|
}
|
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
if (result.compaction_readahead_size > 0) {
|
|
|
|
result.new_table_reader_for_compaction_inputs = true;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2014-09-02 23:42:23 +02:00
|
|
|
namespace {
|
|
|
|
|
2014-10-18 06:18:36 +02:00
|
|
|
Status SanitizeOptionsByTable(
|
|
|
|
const DBOptions& db_opts,
|
2014-08-21 00:53:39 +02:00
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families) {
|
|
|
|
Status s;
|
|
|
|
for (auto cf : column_families) {
|
2014-10-18 06:18:36 +02:00
|
|
|
s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options);
|
2014-08-21 00:53:39 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2014-09-09 00:04:34 +02:00
|
|
|
CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) {
|
2013-12-19 19:02:53 +01:00
|
|
|
// Compressing memtable flushes might not help unless the sequential load
|
|
|
|
// optimization is used for leveled compaction. Otherwise the CPU and
|
|
|
|
// latency overhead is not offset by saving much space.
|
|
|
|
|
|
|
|
bool can_compress;
|
|
|
|
|
2014-09-09 00:04:34 +02:00
|
|
|
if (ioptions.compaction_style == kCompactionStyleUniversal) {
|
2013-12-19 19:02:53 +01:00
|
|
|
can_compress =
|
2014-09-09 00:04:34 +02:00
|
|
|
(ioptions.compaction_options_universal.compression_size_percent < 0);
|
2013-12-19 19:02:53 +01:00
|
|
|
} else {
|
|
|
|
// For leveled compress when min_level_to_compress == 0.
|
2014-09-09 00:04:34 +02:00
|
|
|
can_compress = ioptions.compression_per_level.empty() ||
|
|
|
|
ioptions.compression_per_level[0] != kNoCompression;
|
2013-12-19 19:02:53 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (can_compress) {
|
2014-09-09 00:04:34 +02:00
|
|
|
return ioptions.compression;
|
2013-12-19 19:02:53 +01:00
|
|
|
} else {
|
|
|
|
return kNoCompression;
|
|
|
|
}
|
|
|
|
}
|
2015-03-26 19:22:20 +01:00
|
|
|
|
2015-07-11 02:35:27 +02:00
|
|
|
void DumpSupportInfo(Logger* logger) {
|
2015-04-06 21:50:44 +02:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, logger, "Compression algorithms supported:");
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, logger, "\tSnappy supported: %d",
|
|
|
|
Snappy_Supported());
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, logger, "\tZlib supported: %d",
|
|
|
|
Zlib_Supported());
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, logger, "\tBzip supported: %d",
|
|
|
|
BZip2_Supported());
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, logger, "\tLZ4 supported: %d", LZ4_Supported());
|
2015-07-11 02:35:27 +02:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, logger, "Fast CRC32 supported: %d",
|
|
|
|
crc32c::IsFastCrc32Supported());
|
2015-03-26 19:22:20 +01:00
|
|
|
}
|
|
|
|
|
2014-07-02 20:40:57 +02:00
|
|
|
} // namespace
|
2013-12-19 19:02:53 +01:00
|
|
|
|
2014-02-05 22:12:23 +01:00
|
|
|
DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
|
2011-03-18 23:37:00 +01:00
|
|
|
: env_(options.env),
|
2012-11-06 04:18:49 +01:00
|
|
|
dbname_(dbname),
|
2014-09-05 20:48:17 +02:00
|
|
|
db_options_(SanitizeOptions(dbname, options)),
|
|
|
|
stats_(db_options_.statistics.get()),
|
2013-02-15 20:53:17 +01:00
|
|
|
db_lock_(nullptr),
|
EventLogger
Summary:
Here's my proposal for making our LOGs easier to read by machines.
The idea is to dump all events as JSON objects. JSON is easy to read by humans, but more importantly, it's easy to read by machines. That way, we can parse this, load into SQLite/mongo and then query or visualize.
I started with table_create and table_delete events, but if everybody agrees, I'll continue by adding more events (flush/compaction/etc etc)
Test Plan:
Ran db_bench. Observed:
2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 {"time_micros": 1421360005788015, "event": "table_file_creation", "file_number": 12, "file_size": 1909699}
2015/01/15-14:13:25.956500 110740000 EVENT_LOG_v1 {"time_micros": 1421360005956498, "event": "table_file_deletion", "file_number": 12}
Reviewers: yhchiang, rven, dhruba, MarkCallaghan, lgalanis, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31647
2015-03-13 18:15:54 +01:00
|
|
|
mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, options.use_adaptive_mutex),
|
2014-10-27 22:50:21 +01:00
|
|
|
shutting_down_(false),
|
2011-03-18 23:37:00 +01:00
|
|
|
bg_cv_(&mutex_),
|
2011-06-22 04:36:45 +02:00
|
|
|
logfile_number_(0),
|
2015-01-27 00:48:59 +01:00
|
|
|
log_dir_synced_(false),
|
2014-04-15 18:57:25 +02:00
|
|
|
log_empty_(true),
|
2014-02-11 02:04:44 +01:00
|
|
|
default_cf_handle_(nullptr),
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 12:28:08 +02:00
|
|
|
log_sync_cv_(&mutex_),
|
2014-04-30 20:33:40 +02:00
|
|
|
total_log_size_(0),
|
|
|
|
max_total_in_memory_state_(0),
|
2014-12-11 03:39:09 +01:00
|
|
|
is_snapshot_supported_(true),
|
2014-12-02 21:09:20 +01:00
|
|
|
write_buffer_(options.db_write_buffer_size),
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
write_thread_(options.enable_write_thread_adaptive_yield
|
|
|
|
? options.write_thread_max_yield_usec
|
|
|
|
: 0,
|
|
|
|
options.write_thread_slow_yield_usec),
|
2015-05-16 00:52:51 +02:00
|
|
|
write_controller_(options.delayed_write_rate),
|
|
|
|
last_batch_group_size_(0),
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
unscheduled_flushes_(0),
|
|
|
|
unscheduled_compactions_(0),
|
2012-10-19 23:00:53 +02:00
|
|
|
bg_compaction_scheduled_(0),
|
2015-10-17 09:16:36 +02:00
|
|
|
num_running_compactions_(0),
|
2013-09-13 23:38:37 +02:00
|
|
|
bg_flush_scheduled_(0),
|
2015-10-17 09:16:36 +02:00
|
|
|
num_running_flushes_(0),
|
2014-01-02 12:33:42 +01:00
|
|
|
disable_delete_obsolete_files_(0),
|
2014-12-22 12:04:45 +01:00
|
|
|
delete_obsolete_files_next_run_(
|
|
|
|
options.env->NowMicros() +
|
|
|
|
db_options_.delete_obsolete_files_period_micros),
|
2013-05-11 00:21:04 +02:00
|
|
|
last_stats_dump_time_microsec_(0),
|
2015-02-12 18:54:48 +01:00
|
|
|
next_job_id_(1),
|
2016-02-09 20:20:22 +01:00
|
|
|
has_unpersisted_data_(false),
|
2015-04-08 22:48:02 +02:00
|
|
|
env_options_(db_options_),
|
2014-10-30 01:43:37 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
wal_manager_(db_options_, env_options_),
|
|
|
|
#endif // ROCKSDB_LITE
|
EventLogger
Summary:
Here's my proposal for making our LOGs easier to read by machines.
The idea is to dump all events as JSON objects. JSON is easy to read by humans, but more importantly, it's easy to read by machines. That way, we can parse this, load into SQLite/mongo and then query or visualize.
I started with table_create and table_delete events, but if everybody agrees, I'll continue by adding more events (flush/compaction/etc etc)
Test Plan:
Ran db_bench. Observed:
2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 {"time_micros": 1421360005788015, "event": "table_file_creation", "file_number": 12, "file_size": 1909699}
2015/01/15-14:13:25.956500 110740000 EVENT_LOG_v1 {"time_micros": 1421360005956498, "event": "table_file_deletion", "file_number": 12}
Reviewers: yhchiang, rven, dhruba, MarkCallaghan, lgalanis, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31647
2015-03-13 18:15:54 +01:00
|
|
|
event_logger_(db_options_.info_log.get()),
|
2015-10-02 22:17:34 +02:00
|
|
|
bg_work_paused_(0),
|
2014-02-27 20:38:55 +01:00
|
|
|
refitting_level_(false),
|
2015-06-02 00:32:23 +02:00
|
|
|
opened_successfully_(false) {
|
2012-09-06 02:44:13 +02:00
|
|
|
env_->GetAbsolutePath(dbname, &db_absolute_path_);
|
2013-03-02 21:56:04 +01:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Reserve ten files or so for other uses and give the rest to TableCache.
|
2014-01-07 05:29:17 +01:00
|
|
|
// Give a large number for setting of "infinite" open files.
|
2014-09-05 20:48:17 +02:00
|
|
|
const int table_cache_size = (db_options_.max_open_files == -1) ?
|
|
|
|
4194304 : db_options_.max_open_files - 10;
|
2014-02-07 00:42:16 +01:00
|
|
|
table_cache_ =
|
2015-03-17 23:04:37 +01:00
|
|
|
NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits);
|
2014-02-07 00:42:16 +01:00
|
|
|
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
|
2014-12-02 21:09:20 +01:00
|
|
|
table_cache_.get(), &write_buffer_,
|
|
|
|
&write_controller_));
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
column_family_memtables_.reset(
|
|
|
|
new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
|
2012-08-15 00:20:36 +02:00
|
|
|
|
2014-10-07 19:40:57 +02:00
|
|
|
DumpRocksDBBuildVersion(db_options_.info_log.get());
|
2014-09-05 20:48:17 +02:00
|
|
|
DumpDBFileSummary(db_options_, dbname_);
|
|
|
|
db_options_.Dump(db_options_.info_log.get());
|
2015-07-11 02:35:27 +02:00
|
|
|
DumpSupportInfo(db_options_.info_log.get());
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2015-06-25 23:43:25 +02:00
|
|
|
// Will lock the mutex_, will wait for completion if wait is true
|
2015-05-06 01:54:47 +02:00
|
|
|
void DBImpl::CancelAllBackgroundWork(bool wait) {
|
2015-06-25 23:43:25 +02:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2015-03-11 18:31:02 +01:00
|
|
|
shutting_down_.store(true, std::memory_order_release);
|
2015-06-25 23:43:25 +02:00
|
|
|
bg_cv_.SignalAll();
|
2015-05-06 01:54:47 +02:00
|
|
|
if (!wait) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// Wait for background work to finish
|
2015-06-02 00:32:23 +02:00
|
|
|
while (bg_compaction_scheduled_ || bg_flush_scheduled_) {
|
2015-05-06 01:54:47 +02:00
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
2015-03-11 18:31:02 +01:00
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
DBImpl::~DBImpl() {
|
2014-03-12 19:50:10 +01:00
|
|
|
mutex_.Lock();
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
|
2016-02-09 20:20:22 +01:00
|
|
|
if (!shutting_down_.load(std::memory_order_acquire) &&
|
|
|
|
has_unpersisted_data_) {
|
2014-03-12 19:50:10 +01:00
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
2015-03-20 01:04:29 +01:00
|
|
|
if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) {
|
2014-03-12 19:50:10 +01:00
|
|
|
cfd->Ref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
FlushMemTable(cfd, FlushOptions());
|
|
|
|
mutex_.Lock();
|
2014-04-07 23:21:25 +02:00
|
|
|
cfd->Unref();
|
2014-03-12 19:50:10 +01:00
|
|
|
}
|
|
|
|
}
|
2014-04-07 23:21:25 +02:00
|
|
|
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
2012-11-06 20:21:57 +01:00
|
|
|
}
|
2015-06-25 23:43:25 +02:00
|
|
|
mutex_.Unlock();
|
|
|
|
// CancelAllBackgroundWork called with false means we just set the shutdown
|
|
|
|
// marker. After this we do a variant of the waiting and unschedule work
|
2015-05-06 01:54:47 +02:00
|
|
|
// (to consider: moving all the waiting into CancelAllBackgroundWork(true))
|
|
|
|
CancelAllBackgroundWork(false);
|
2015-03-17 02:49:14 +01:00
|
|
|
int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
|
|
|
|
int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
|
|
|
|
mutex_.Lock();
|
|
|
|
bg_compaction_scheduled_ -= compactions_unscheduled;
|
|
|
|
bg_flush_scheduled_ -= flushes_unscheduled;
|
|
|
|
|
|
|
|
// Wait for background work to finish
|
2015-06-02 00:32:23 +02:00
|
|
|
while (bg_compaction_scheduled_ || bg_flush_scheduled_) {
|
2015-03-17 02:49:14 +01:00
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
2015-06-11 23:18:02 +02:00
|
|
|
EraseThreadStatusDbInfo();
|
2014-09-11 03:46:09 +02:00
|
|
|
flush_scheduler_.Clear();
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
while (!flush_queue_.empty()) {
|
|
|
|
auto cfd = PopFirstFromFlushQueue();
|
|
|
|
if (cfd->Unref()) {
|
|
|
|
delete cfd;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
while (!compaction_queue_.empty()) {
|
|
|
|
auto cfd = PopFirstFromCompactionQueue();
|
|
|
|
if (cfd->Unref()) {
|
|
|
|
delete cfd;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-11 22:52:17 +01:00
|
|
|
if (default_cf_handle_ != nullptr) {
|
|
|
|
// we need to delete handle outside of lock because it does its own locking
|
|
|
|
mutex_.Unlock();
|
|
|
|
delete default_cf_handle_;
|
|
|
|
mutex_.Lock();
|
2014-03-04 18:03:56 +01:00
|
|
|
}
|
|
|
|
|
2014-09-24 22:12:16 +02:00
|
|
|
// Clean up obsolete files due to SuperVersion release.
|
|
|
|
// (1) Need to delete to obsolete files before closing because RepairDB()
|
|
|
|
// scans all existing files in the file system and builds manifest file.
|
|
|
|
// Keeping obsolete files confuses the repair process.
|
|
|
|
// (2) Need to check if we Open()/Recover() the DB successfully before
|
|
|
|
// deleting because if VersionSet recover fails (may be due to corrupted
|
|
|
|
// manifest file), it is not able to identify live files correctly. As a
|
|
|
|
// result, all "live" files can get deleted by accident. However, corrupted
|
|
|
|
// manifest is recoverable by RepairDB().
|
|
|
|
if (opened_successfully_) {
|
2015-02-12 18:54:48 +01:00
|
|
|
JobContext job_context(next_job_id_.fetch_add(1));
|
2014-10-28 19:54:33 +01:00
|
|
|
FindObsoleteFiles(&job_context, true);
|
2015-06-04 04:57:01 +02:00
|
|
|
|
|
|
|
mutex_.Unlock();
|
2014-09-24 22:12:16 +02:00
|
|
|
// manifest number starting from 2
|
2014-10-28 19:54:33 +01:00
|
|
|
job_context.manifest_file_number = 1;
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
2014-02-27 20:38:55 +01:00
|
|
|
}
|
2014-11-15 00:43:10 +01:00
|
|
|
job_context.Clean();
|
2015-06-04 04:57:01 +02:00
|
|
|
mutex_.Lock();
|
2014-02-27 20:38:55 +01:00
|
|
|
}
|
|
|
|
|
2015-03-30 21:04:10 +02:00
|
|
|
for (auto l : logs_to_free_) {
|
|
|
|
delete l;
|
|
|
|
}
|
2015-08-06 05:51:27 +02:00
|
|
|
for (auto& log : logs_) {
|
|
|
|
log.ClearWriter();
|
|
|
|
}
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 12:28:08 +02:00
|
|
|
logs_.clear();
|
2015-03-30 21:04:10 +02:00
|
|
|
|
2014-02-07 00:42:16 +01:00
|
|
|
// versions need to be destroyed before table_cache since it can hold
|
2014-01-07 05:29:17 +01:00
|
|
|
// references to table_cache.
|
|
|
|
versions_.reset();
|
2014-03-04 18:03:56 +01:00
|
|
|
mutex_.Unlock();
|
2014-03-11 22:52:17 +01:00
|
|
|
if (db_lock_ != nullptr) {
|
|
|
|
env_->UnlockFile(db_lock_);
|
|
|
|
}
|
2014-02-07 00:42:16 +01:00
|
|
|
|
2014-09-05 20:48:17 +02:00
|
|
|
LogFlush(db_options_.info_log);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::NewDB() {
|
2014-01-15 00:27:09 +01:00
|
|
|
VersionEdit new_db;
|
2011-04-12 21:38:58 +02:00
|
|
|
new_db.SetLogNumber(0);
|
2011-03-18 23:37:00 +01:00
|
|
|
new_db.SetNextFile(2);
|
|
|
|
new_db.SetLastSequence(0);
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
Status s;
|
|
|
|
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL,
|
|
|
|
db_options_.info_log, "Creating manifest 1 \n");
|
2011-03-18 23:37:00 +01:00
|
|
|
const std::string manifest = DescriptorFileName(dbname_, 1);
|
|
|
|
{
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
unique_ptr<WritableFile> file;
|
|
|
|
EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_);
|
2015-10-16 23:33:47 +02:00
|
|
|
s = NewWritableFile(env_, manifest, &file, env_options);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size);
|
|
|
|
unique_ptr<WritableFileWriter> file_writer(
|
|
|
|
new WritableFileWriter(std::move(file), env_options));
|
2015-10-08 19:07:15 +02:00
|
|
|
log::Writer log(std::move(file_writer), 0, false);
|
2011-03-18 23:37:00 +01:00
|
|
|
std::string record;
|
|
|
|
new_db.EncodeTo(&record);
|
|
|
|
s = log.AddRecord(record);
|
2015-01-22 20:43:38 +01:00
|
|
|
if (s.ok()) {
|
|
|
|
s = SyncManifest(env_, &db_options_, log.file());
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
// Make "CURRENT" file that points to the new manifest file.
|
2015-01-26 22:59:38 +01:00
|
|
|
s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir());
|
2011-03-18 23:37:00 +01:00
|
|
|
} else {
|
|
|
|
env_->DeleteFile(manifest);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::MaybeIgnoreError(Status* s) const {
|
2014-09-05 20:48:17 +02:00
|
|
|
if (s->ok() || db_options_.paranoid_checks) {
|
2011-03-18 23:37:00 +01:00
|
|
|
// No change needed
|
|
|
|
} else {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL,
|
|
|
|
db_options_.info_log, "Ignoring error %s", s->ToString().c_str());
|
2011-03-18 23:37:00 +01:00
|
|
|
*s = Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-11-26 22:56:45 +01:00
|
|
|
const Status DBImpl::CreateArchivalDirectory() {
|
2014-09-05 20:48:17 +02:00
|
|
|
if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) {
|
|
|
|
std::string archivalPath = ArchivalDirectory(db_options_.wal_dir);
|
2012-11-26 22:56:45 +01:00
|
|
|
return env_->CreateDirIfMissing(archivalPath);
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2013-05-28 21:35:43 +02:00
|
|
|
void DBImpl::PrintStatistics() {
|
2014-09-05 20:48:17 +02:00
|
|
|
auto dbstats = db_options_.statistics.get();
|
2013-05-28 21:35:43 +02:00
|
|
|
if (dbstats) {
|
2015-05-20 01:38:31 +02:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
2015-03-02 18:35:50 +01:00
|
|
|
"STATISTICS:\n %s",
|
2013-06-19 05:28:41 +02:00
|
|
|
dbstats->ToString().c_str());
|
2013-05-28 21:35:43 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-05-11 00:21:04 +02:00
|
|
|
void DBImpl::MaybeDumpStats() {
|
2014-09-05 20:48:17 +02:00
|
|
|
if (db_options_.stats_dump_period_sec == 0) return;
|
2013-05-24 21:52:45 +02:00
|
|
|
|
|
|
|
const uint64_t now_micros = env_->NowMicros();
|
|
|
|
|
|
|
|
if (last_stats_dump_time_microsec_ +
|
2014-09-05 20:48:17 +02:00
|
|
|
db_options_.stats_dump_period_sec * 1000000
|
2013-05-24 21:52:45 +02:00
|
|
|
<= now_micros) {
|
|
|
|
// Multiple threads could race in here simultaneously.
|
|
|
|
// However, the last one will update last_stats_dump_time_microsec_
|
|
|
|
// atomically. We could see more than one dump during one dump
|
|
|
|
// period in rare cases.
|
|
|
|
last_stats_dump_time_microsec_ = now_micros;
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
|
2015-04-10 02:05:17 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
const DBPropertyInfo* cf_property_info =
|
|
|
|
GetPropertyInfo(DB::Properties::kCFStats);
|
|
|
|
assert(cf_property_info != nullptr);
|
|
|
|
const DBPropertyInfo* db_property_info =
|
|
|
|
GetPropertyInfo(DB::Properties::kDBStats);
|
|
|
|
assert(db_property_info != nullptr);
|
|
|
|
|
2013-05-24 21:52:45 +02:00
|
|
|
std::string stats;
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
cfd->internal_stats()->GetStringProperty(
|
|
|
|
*cf_property_info, DB::Properties::kCFStats, &stats);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
}
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
default_cf_internal_stats_->GetStringProperty(
|
|
|
|
*db_property_info, DB::Properties::kDBStats, &stats);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
}
|
2015-05-20 01:38:31 +02:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL,
|
2014-11-04 19:28:08 +01:00
|
|
|
db_options_.info_log, "------- DUMPING STATS -------");
|
2015-05-20 01:38:31 +02:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL,
|
2014-11-04 19:28:08 +01:00
|
|
|
db_options_.info_log, "%s", stats.c_str());
|
2015-04-10 02:05:17 +02:00
|
|
|
#endif // !ROCKSDB_LITE
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
|
2013-05-28 21:35:43 +02:00
|
|
|
PrintStatistics();
|
2013-05-11 00:21:04 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-06 17:44:30 +01:00
|
|
|
// * Returns the list of live files in 'sst_live'
|
2014-12-22 12:04:45 +01:00
|
|
|
// If it's doing full scan:
|
2015-02-06 17:44:30 +01:00
|
|
|
// * Returns the list of all files in the filesystem in
|
|
|
|
// 'full_scan_candidate_files'.
|
2014-12-22 12:04:45 +01:00
|
|
|
// Otherwise, gets obsolete files from VersionSet.
|
2013-11-15 03:03:57 +01:00
|
|
|
// no_full_scan = true -- never do the full scan using GetChildren()
|
|
|
|
// force = false -- don't force the full scan, except every
|
2014-09-05 20:48:17 +02:00
|
|
|
// db_options_.delete_obsolete_files_period_micros
|
2013-11-15 03:03:57 +01:00
|
|
|
// force = true -- force the full scan
|
2014-10-28 19:54:33 +01:00
|
|
|
void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
2013-11-15 03:03:57 +01:00
|
|
|
bool no_full_scan) {
|
2012-10-21 10:49:48 +02:00
|
|
|
mutex_.AssertHeld();
|
|
|
|
|
2012-09-15 02:11:35 +02:00
|
|
|
// if deletion is disabled, do nothing
|
2014-01-02 12:33:42 +01:00
|
|
|
if (disable_delete_obsolete_files_ > 0) {
|
2012-09-15 02:11:35 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2013-11-15 07:42:02 +01:00
|
|
|
bool doing_the_full_scan = false;
|
|
|
|
|
|
|
|
// logic for figurint out if we're doing the full scan
|
|
|
|
if (no_full_scan) {
|
|
|
|
doing_the_full_scan = false;
|
2014-09-05 20:48:17 +02:00
|
|
|
} else if (force || db_options_.delete_obsolete_files_period_micros == 0) {
|
2013-11-15 07:42:02 +01:00
|
|
|
doing_the_full_scan = true;
|
|
|
|
} else {
|
|
|
|
const uint64_t now_micros = env_->NowMicros();
|
2014-12-22 12:04:45 +01:00
|
|
|
if (delete_obsolete_files_next_run_ < now_micros) {
|
2013-11-15 07:42:02 +01:00
|
|
|
doing_the_full_scan = true;
|
2014-12-22 12:04:45 +01:00
|
|
|
delete_obsolete_files_next_run_ =
|
|
|
|
now_micros + db_options_.delete_obsolete_files_period_micros;
|
2013-11-15 07:42:02 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-10 02:38:32 +01:00
|
|
|
// don't delete files that might be currently written to from compaction
|
|
|
|
// threads
|
2015-09-14 23:01:13 +02:00
|
|
|
// Since job_context->min_pending_output is set, until file scan finishes,
|
|
|
|
// mutex_ cannot be released. Otherwise, we might see no min_pending_output
|
|
|
|
// here but later find newer generated unfinalized files while scannint.
|
2015-02-10 02:38:32 +01:00
|
|
|
if (!pending_outputs_.empty()) {
|
|
|
|
job_context->min_pending_output = *pending_outputs_.begin();
|
|
|
|
} else {
|
|
|
|
// delete all of them
|
|
|
|
job_context->min_pending_output = std::numeric_limits<uint64_t>::max();
|
|
|
|
}
|
|
|
|
|
2015-06-04 04:57:01 +02:00
|
|
|
// Get obsolete files. This function will also update the list of
|
|
|
|
// pending files in VersionSet().
|
2015-02-10 02:38:32 +01:00
|
|
|
versions_->GetObsoleteFiles(&job_context->sst_delete_files,
|
|
|
|
job_context->min_pending_output);
|
2013-11-15 03:03:57 +01:00
|
|
|
|
2013-11-09 00:23:46 +01:00
|
|
|
// store the current filenum, lognum, etc
|
2014-11-04 02:45:55 +01:00
|
|
|
job_context->manifest_file_number = versions_->manifest_file_number();
|
2014-10-28 19:54:33 +01:00
|
|
|
job_context->pending_manifest_file_number =
|
2014-11-04 02:45:55 +01:00
|
|
|
versions_->pending_manifest_file_number();
|
2014-10-28 19:54:33 +01:00
|
|
|
job_context->log_number = versions_->MinLogNumber();
|
2014-11-04 02:45:55 +01:00
|
|
|
job_context->prev_log_number = versions_->prev_log_number();
|
2013-11-09 00:23:46 +01:00
|
|
|
|
2015-02-06 17:44:30 +01:00
|
|
|
versions_->AddLiveFiles(&job_context->sst_live);
|
2013-11-15 07:42:02 +01:00
|
|
|
if (doing_the_full_scan) {
|
2015-12-16 00:26:20 +01:00
|
|
|
for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) {
|
2014-07-02 18:54:20 +02:00
|
|
|
// set of all files in the directory. We'll exclude files that are still
|
|
|
|
// alive in the subsequent processings.
|
|
|
|
std::vector<std::string> files;
|
2014-09-05 20:48:17 +02:00
|
|
|
env_->GetChildren(db_options_.db_paths[path_id].path,
|
2014-08-14 19:05:16 +02:00
|
|
|
&files); // Ignore errors
|
2014-07-02 18:54:20 +02:00
|
|
|
for (std::string file : files) {
|
2014-09-25 20:08:16 +02:00
|
|
|
// TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes
|
2015-12-28 23:28:19 +01:00
|
|
|
job_context->full_scan_candidate_files.emplace_back(
|
|
|
|
"/" + file, static_cast<uint32_t>(path_id));
|
2014-07-02 18:54:20 +02:00
|
|
|
}
|
|
|
|
}
|
2013-11-15 07:42:02 +01:00
|
|
|
|
|
|
|
//Add log files in wal_dir
|
2014-09-05 20:48:17 +02:00
|
|
|
if (db_options_.wal_dir != dbname_) {
|
2013-11-15 07:42:02 +01:00
|
|
|
std::vector<std::string> log_files;
|
2014-09-05 20:48:17 +02:00
|
|
|
env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors
|
2014-07-02 18:54:20 +02:00
|
|
|
for (std::string log_file : log_files) {
|
2014-12-22 12:04:45 +01:00
|
|
|
job_context->full_scan_candidate_files.emplace_back(log_file, 0);
|
2014-08-08 21:03:30 +02:00
|
|
|
}
|
|
|
|
}
|
2014-08-14 19:05:16 +02:00
|
|
|
// Add info log files in db_log_dir
|
2014-09-05 20:48:17 +02:00
|
|
|
if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) {
|
2014-08-14 19:05:16 +02:00
|
|
|
std::vector<std::string> info_log_files;
|
2014-09-05 20:48:17 +02:00
|
|
|
// Ignore errors
|
|
|
|
env_->GetChildren(db_options_.db_log_dir, &info_log_files);
|
2014-08-14 19:05:16 +02:00
|
|
|
for (std::string log_file : info_log_files) {
|
2014-12-22 12:04:45 +01:00
|
|
|
job_context->full_scan_candidate_files.emplace_back(log_file, 0);
|
2014-08-14 19:05:16 +02:00
|
|
|
}
|
|
|
|
}
|
2013-10-01 23:46:52 +02:00
|
|
|
}
|
2015-09-14 23:01:13 +02:00
|
|
|
|
|
|
|
if (!alive_log_files_.empty()) {
|
|
|
|
uint64_t min_log_number = versions_->MinLogNumber();
|
|
|
|
// find newly obsoleted log files
|
|
|
|
while (alive_log_files_.begin()->number < min_log_number) {
|
|
|
|
auto& earliest = *alive_log_files_.begin();
|
2015-10-01 15:12:04 +02:00
|
|
|
if (db_options_.recycle_log_file_num > log_recycle_files.size()) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"adding log %" PRIu64 " to recycle list\n", earliest.number);
|
|
|
|
log_recycle_files.push_back(earliest.number);
|
|
|
|
} else {
|
|
|
|
job_context->log_delete_files.push_back(earliest.number);
|
|
|
|
}
|
2015-09-14 23:01:13 +02:00
|
|
|
total_log_size_ -= earliest.size;
|
|
|
|
alive_log_files_.pop_front();
|
|
|
|
// Current log should always stay alive since it can't have
|
|
|
|
// number < MinLogNumber().
|
|
|
|
assert(alive_log_files_.size());
|
|
|
|
}
|
|
|
|
while (!logs_.empty() && logs_.front().number < min_log_number) {
|
|
|
|
auto& log = logs_.front();
|
|
|
|
if (log.getting_synced) {
|
|
|
|
log_sync_cv_.Wait();
|
|
|
|
// logs_ could have changed while we were waiting.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
logs_to_free_.push_back(log.ReleaseWriter());
|
|
|
|
logs_.pop_front();
|
|
|
|
}
|
|
|
|
// Current log cannot be obsolete.
|
|
|
|
assert(!logs_.empty());
|
|
|
|
}
|
|
|
|
|
|
|
|
// We're just cleaning up for DB::Write().
|
|
|
|
assert(job_context->logs_to_free.empty());
|
|
|
|
job_context->logs_to_free = logs_to_free_;
|
|
|
|
logs_to_free_.clear();
|
2013-07-16 20:56:46 +02:00
|
|
|
}
|
|
|
|
|
2014-07-02 18:54:20 +02:00
|
|
|
namespace {
|
2014-10-28 19:54:33 +01:00
|
|
|
bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
|
|
|
|
const JobContext::CandidateFileInfo& second) {
|
2014-07-02 18:54:20 +02:00
|
|
|
if (first.file_name > second.file_name) {
|
|
|
|
return true;
|
|
|
|
} else if (first.file_name < second.file_name) {
|
|
|
|
return false;
|
|
|
|
} else {
|
2014-08-31 09:54:15 +02:00
|
|
|
return (first.path_id > second.path_id);
|
2014-07-02 18:54:20 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}; // namespace
|
|
|
|
|
2012-10-21 10:49:48 +02:00
|
|
|
// Diffs the files listed in filenames and those that do not
|
2013-11-09 00:23:46 +01:00
|
|
|
// belong to live files are posibly removed. Also, removes all the
|
2013-11-12 20:53:26 +01:00
|
|
|
// files in sst_delete_files and log_delete_files.
|
2013-10-01 23:46:52 +02:00
|
|
|
// It is not necessary to hold the mutex when invoking this method.
|
2014-10-28 19:54:33 +01:00
|
|
|
void DBImpl::PurgeObsoleteFiles(const JobContext& state) {
|
2014-03-09 23:11:34 +01:00
|
|
|
// we'd better have sth to delete
|
|
|
|
assert(state.HaveSomethingToDelete());
|
2013-11-27 23:56:20 +01:00
|
|
|
|
2013-11-15 03:03:57 +01:00
|
|
|
// this checks if FindObsoleteFiles() was run before. If not, don't do
|
|
|
|
// PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also
|
|
|
|
// run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
|
|
|
|
if (state.manifest_file_number == 0) {
|
2013-11-09 00:23:46 +01:00
|
|
|
return;
|
|
|
|
}
|
2014-03-09 23:11:34 +01:00
|
|
|
|
2014-07-02 18:54:20 +02:00
|
|
|
// Now, convert live list to an unordered map, WITHOUT mutex held;
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex proteced, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std:unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
2013-04-12 01:49:53 +02:00
|
|
|
// set is slow.
|
2014-07-02 18:54:20 +02:00
|
|
|
std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map;
|
2015-02-06 17:44:30 +01:00
|
|
|
for (const FileDescriptor& fd : state.sst_live) {
|
2014-07-02 18:54:20 +02:00
|
|
|
sst_live_map[fd.GetNumber()] = &fd;
|
|
|
|
}
|
2013-11-09 00:23:46 +01:00
|
|
|
|
2014-12-22 12:04:45 +01:00
|
|
|
auto candidate_files = state.full_scan_candidate_files;
|
2014-10-28 19:54:33 +01:00
|
|
|
candidate_files.reserve(candidate_files.size() +
|
|
|
|
state.sst_delete_files.size() +
|
|
|
|
state.log_delete_files.size());
|
2013-12-31 03:33:57 +01:00
|
|
|
// We may ignore the dbname when generating the file names.
|
|
|
|
const char* kDumbDbName = "";
|
2013-11-12 20:53:26 +01:00
|
|
|
for (auto file : state.sst_delete_files) {
|
2014-07-02 18:54:20 +02:00
|
|
|
candidate_files.emplace_back(
|
|
|
|
MakeTableFileName(kDumbDbName, file->fd.GetNumber()),
|
|
|
|
file->fd.GetPathId());
|
2013-11-12 20:53:26 +01:00
|
|
|
delete file;
|
2013-11-09 00:23:46 +01:00
|
|
|
}
|
|
|
|
|
2013-12-31 03:33:57 +01:00
|
|
|
for (auto file_num : state.log_delete_files) {
|
|
|
|
if (file_num > 0) {
|
2014-07-02 18:54:20 +02:00
|
|
|
candidate_files.emplace_back(LogFileName(kDumbDbName, file_num).substr(1),
|
|
|
|
0);
|
2013-11-09 00:23:46 +01:00
|
|
|
}
|
|
|
|
}
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex proteced, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std:unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
2013-04-12 01:49:53 +02:00
|
|
|
|
2013-12-31 03:33:57 +01:00
|
|
|
// dedup state.candidate_files so we don't try to delete the same
|
2013-11-13 05:32:07 +01:00
|
|
|
// file twice
|
2014-07-02 18:54:20 +02:00
|
|
|
sort(candidate_files.begin(), candidate_files.end(), CompareCandidateFile);
|
2014-03-18 05:50:15 +01:00
|
|
|
candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()),
|
|
|
|
candidate_files.end());
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-03-09 23:11:34 +01:00
|
|
|
std::vector<std::string> old_info_log_files;
|
2014-09-05 20:48:17 +02:00
|
|
|
InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_);
|
2014-07-02 18:54:20 +02:00
|
|
|
for (const auto& candidate_file : candidate_files) {
|
|
|
|
std::string to_delete = candidate_file.file_name;
|
|
|
|
uint32_t path_id = candidate_file.path_id;
|
2013-12-31 03:33:57 +01:00
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
// Ignore file if we cannot recognize it.
|
2014-08-14 19:05:16 +02:00
|
|
|
if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
|
2013-12-31 03:33:57 +01:00
|
|
|
continue;
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2013-12-31 03:33:57 +01:00
|
|
|
bool keep = true;
|
|
|
|
switch (type) {
|
|
|
|
case kLogFile:
|
|
|
|
keep = ((number >= state.log_number) ||
|
|
|
|
(number == state.prev_log_number));
|
|
|
|
break;
|
|
|
|
case kDescriptorFile:
|
|
|
|
// Keep my manifest file, and any newer incarnations'
|
2014-03-18 05:50:15 +01:00
|
|
|
// (can happen during manifest roll)
|
2013-12-31 03:33:57 +01:00
|
|
|
keep = (number >= state.manifest_file_number);
|
|
|
|
break;
|
|
|
|
case kTableFile:
|
2014-11-07 20:50:34 +01:00
|
|
|
// If the second condition is not there, this makes
|
|
|
|
// DontDeletePendingOutputs fail
|
|
|
|
keep = (sst_live_map.find(number) != sst_live_map.end()) ||
|
|
|
|
number >= state.min_pending_output;
|
2013-12-31 03:33:57 +01:00
|
|
|
break;
|
|
|
|
case kTempFile:
|
|
|
|
// Any temp files that are currently being written to must
|
2014-03-18 05:50:15 +01:00
|
|
|
// be recorded in pending_outputs_, which is inserted into "live".
|
|
|
|
// Also, SetCurrentFile creates a temp file when writing out new
|
|
|
|
// manifest, which is equal to state.pending_manifest_file_number. We
|
|
|
|
// should not delete that file
|
2015-11-11 07:58:01 +01:00
|
|
|
//
|
|
|
|
// TODO(yhchiang): carefully modify the third condition to safely
|
|
|
|
// remove the temp options files.
|
2014-07-02 18:54:20 +02:00
|
|
|
keep = (sst_live_map.find(number) != sst_live_map.end()) ||
|
2015-11-11 07:58:01 +01:00
|
|
|
(number == state.pending_manifest_file_number) ||
|
|
|
|
(to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
|
2013-12-31 03:33:57 +01:00
|
|
|
break;
|
|
|
|
case kInfoLogFile:
|
|
|
|
keep = true;
|
|
|
|
if (number != 0) {
|
2014-03-09 23:11:34 +01:00
|
|
|
old_info_log_files.push_back(to_delete);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2013-12-31 03:33:57 +01:00
|
|
|
break;
|
|
|
|
case kCurrentFile:
|
|
|
|
case kDBLockFile:
|
|
|
|
case kIdentityFile:
|
|
|
|
case kMetaDatabase:
|
2015-11-11 07:58:01 +01:00
|
|
|
case kOptionsFile:
|
2013-12-31 03:33:57 +01:00
|
|
|
keep = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (keep) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-07-02 18:54:20 +02:00
|
|
|
std::string fname;
|
2013-12-31 03:33:57 +01:00
|
|
|
if (type == kTableFile) {
|
|
|
|
// evict from cache
|
2014-02-07 00:42:16 +01:00
|
|
|
TableCache::Evict(table_cache_.get(), number);
|
2014-09-05 20:48:17 +02:00
|
|
|
fname = TableFileName(db_options_.db_paths, number, path_id);
|
2014-07-02 18:54:20 +02:00
|
|
|
} else {
|
2014-09-05 20:48:17 +02:00
|
|
|
fname = ((type == kLogFile) ?
|
|
|
|
db_options_.wal_dir : dbname_) + "/" + to_delete;
|
2013-12-31 03:33:57 +01:00
|
|
|
}
|
2014-03-09 23:11:34 +01:00
|
|
|
|
2015-05-22 21:10:51 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
2014-10-30 01:43:37 +01:00
|
|
|
if (type == kLogFile && (db_options_.WAL_ttl_seconds > 0 ||
|
2015-05-22 21:10:51 +02:00
|
|
|
db_options_.WAL_size_limit_MB > 0)) {
|
2014-10-30 01:43:37 +01:00
|
|
|
wal_manager_.ArchiveWALFile(fname, number);
|
2015-05-22 21:10:51 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
2015-08-05 05:45:27 +02:00
|
|
|
Status file_deletion_status;
|
2016-01-29 03:35:01 +01:00
|
|
|
if (type == kTableFile) {
|
|
|
|
file_deletion_status = DeleteSSTFile(&db_options_, fname, path_id);
|
2015-08-05 05:45:27 +02:00
|
|
|
} else {
|
|
|
|
file_deletion_status = env_->DeleteFile(fname);
|
|
|
|
}
|
2015-05-22 21:10:51 +02:00
|
|
|
if (file_deletion_status.ok()) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
2015-02-12 18:54:48 +01:00
|
|
|
"[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", state.job_id,
|
2015-05-22 21:10:51 +02:00
|
|
|
fname.c_str(), type, number,
|
|
|
|
file_deletion_status.ToString().c_str());
|
2015-09-17 19:21:34 +02:00
|
|
|
} else if (env_->FileExists(fname).IsNotFound()) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
|
|
|
|
" -- %s\n",
|
|
|
|
state.job_id, fname.c_str(), type, number,
|
|
|
|
file_deletion_status.ToString().c_str());
|
2015-05-22 21:10:51 +02:00
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
|
|
|
|
state.job_id, fname.c_str(), type, number,
|
|
|
|
file_deletion_status.ToString().c_str());
|
|
|
|
}
|
|
|
|
if (type == kTableFile) {
|
2015-06-04 04:57:01 +02:00
|
|
|
EventHelpers::LogAndNotifyTableFileDeletion(
|
|
|
|
&event_logger_, state.job_id, number, fname,
|
|
|
|
file_deletion_status, GetName(),
|
|
|
|
db_options_.listeners);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
2012-08-18 01:06:05 +02:00
|
|
|
|
2013-07-16 20:56:46 +02:00
|
|
|
// Delete old info log files.
|
2014-03-09 23:11:34 +01:00
|
|
|
size_t old_info_log_file_count = old_info_log_files.size();
|
2014-09-05 20:48:17 +02:00
|
|
|
if (old_info_log_file_count >= db_options_.keep_log_file_num) {
|
2014-03-09 23:11:34 +01:00
|
|
|
std::sort(old_info_log_files.begin(), old_info_log_files.end());
|
2014-09-05 20:48:17 +02:00
|
|
|
size_t end = old_info_log_file_count - db_options_.keep_log_file_num;
|
2013-03-15 02:32:01 +01:00
|
|
|
for (unsigned int i = 0; i <= end; i++) {
|
2014-03-09 23:11:34 +01:00
|
|
|
std::string& to_delete = old_info_log_files.at(i);
|
2014-09-05 20:48:17 +02:00
|
|
|
std::string full_path_to_delete = (db_options_.db_log_dir.empty() ?
|
|
|
|
dbname_ : db_options_.db_log_dir) + "/" + to_delete;
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
2015-02-12 18:54:48 +01:00
|
|
|
"[JOB %d] Delete info log file %s\n", state.job_id,
|
2014-08-14 19:05:16 +02:00
|
|
|
full_path_to_delete.c_str());
|
|
|
|
Status s = env_->DeleteFile(full_path_to_delete);
|
2014-03-09 23:11:34 +01:00
|
|
|
if (!s.ok()) {
|
2015-09-17 19:21:34 +02:00
|
|
|
if (env_->FileExists(full_path_to_delete).IsNotFound()) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[JOB %d] Tried to delete non-existing info log file %s FAILED "
|
|
|
|
"-- %s\n",
|
|
|
|
state.job_id, to_delete.c_str(), s.ToString().c_str());
|
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"[JOB %d] Delete info log file %s FAILED -- %s\n", state.job_id,
|
|
|
|
to_delete.c_str(), s.ToString().c_str());
|
|
|
|
}
|
2014-03-09 23:11:34 +01:00
|
|
|
}
|
2012-08-18 01:06:05 +02:00
|
|
|
}
|
|
|
|
}
|
2014-10-30 01:43:37 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
wal_manager_.PurgeObsoleteWALFiles();
|
|
|
|
#endif // ROCKSDB_LITE
|
2014-09-05 20:48:17 +02:00
|
|
|
LogFlush(db_options_.info_log);
|
2012-10-21 10:49:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::DeleteObsoleteFiles() {
|
|
|
|
mutex_.AssertHeld();
|
2015-02-12 18:54:48 +01:00
|
|
|
JobContext job_context(next_job_id_.fetch_add(1));
|
2014-10-28 19:54:33 +01:00
|
|
|
FindObsoleteFiles(&job_context, true);
|
2015-06-04 04:57:01 +02:00
|
|
|
|
|
|
|
mutex_.Unlock();
|
2014-10-28 19:54:33 +01:00
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
2014-03-10 23:42:14 +01:00
|
|
|
}
|
2014-11-15 00:43:10 +01:00
|
|
|
job_context.Clean();
|
2015-06-04 04:57:01 +02:00
|
|
|
mutex_.Lock();
|
2012-11-26 22:56:45 +01:00
|
|
|
}
|
|
|
|
|
2015-01-26 22:59:38 +01:00
|
|
|
Status DBImpl::Directories::CreateAndNewDirectory(
|
|
|
|
Env* env, const std::string& dirname,
|
|
|
|
std::unique_ptr<Directory>* directory) const {
|
|
|
|
// We call CreateDirIfMissing() as the directory may already exist (if we
|
|
|
|
// are reopening a DB), when this happens we don't want creating the
|
|
|
|
// directory to cause an error. However, we need to check if creating the
|
|
|
|
// directory fails or else we may get an obscure message about the lock
|
|
|
|
// file not existing. One real-world example of this occurring is if
|
|
|
|
// env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
|
|
|
|
// when dbname_ is "dir/db" but when "dir" doesn't exist.
|
|
|
|
Status s = env->CreateDirIfMissing(dirname);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return env->NewDirectory(dirname, directory);
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2015-01-26 22:59:38 +01:00
|
|
|
Status DBImpl::Directories::SetDirectories(
|
|
|
|
Env* env, const std::string& dbname, const std::string& wal_dir,
|
|
|
|
const std::vector<DbPath>& data_paths) {
|
|
|
|
Status s = CreateAndNewDirectory(env, dbname, &db_dir_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
if (!wal_dir.empty() && dbname != wal_dir) {
|
|
|
|
s = CreateAndNewDirectory(env, wal_dir, &wal_dir_);
|
2013-01-07 19:11:18 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2015-01-26 22:59:38 +01:00
|
|
|
}
|
2013-01-07 19:11:18 +01:00
|
|
|
|
2015-01-26 22:59:38 +01:00
|
|
|
data_dirs_.clear();
|
|
|
|
for (auto& p : data_paths) {
|
|
|
|
const std::string db_path = p.path;
|
|
|
|
if (db_path == dbname) {
|
|
|
|
data_dirs_.emplace_back(nullptr);
|
|
|
|
} else {
|
|
|
|
std::unique_ptr<Directory> path_directory;
|
|
|
|
s = CreateAndNewDirectory(env, db_path, &path_directory);
|
2014-07-02 18:54:20 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2015-01-26 22:59:38 +01:00
|
|
|
data_dirs_.emplace_back(path_directory.release());
|
2014-07-02 18:54:20 +02:00
|
|
|
}
|
2015-01-26 22:59:38 +01:00
|
|
|
}
|
|
|
|
assert(data_dirs_.size() == data_paths.size());
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Directory* DBImpl::Directories::GetDataDir(size_t path_id) {
|
|
|
|
assert(path_id < data_dirs_.size());
|
|
|
|
Directory* ret_dir = data_dirs_[path_id].get();
|
|
|
|
if (ret_dir == nullptr) {
|
|
|
|
// Should use db_dir_
|
|
|
|
return db_dir_.get();
|
|
|
|
}
|
|
|
|
return ret_dir;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Recover(
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
|
|
|
|
bool error_if_log_file_exist) {
|
|
|
|
mutex_.AssertHeld();
|
2014-07-02 18:54:20 +02:00
|
|
|
|
2015-01-26 22:59:38 +01:00
|
|
|
bool is_new_db = false;
|
|
|
|
assert(db_lock_ == nullptr);
|
|
|
|
if (!read_only) {
|
|
|
|
Status s = directories_.SetDirectories(env_, dbname_, db_options_.wal_dir,
|
|
|
|
db_options_.db_paths);
|
2014-01-27 20:02:21 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2013-01-07 19:11:18 +01:00
|
|
|
s = env_->LockFile(LockFileName(dbname_), &db_lock_);
|
2012-12-17 06:01:02 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2015-07-21 02:20:40 +02:00
|
|
|
s = env_->FileExists(CurrentFileName(dbname_));
|
|
|
|
if (s.IsNotFound()) {
|
2014-09-05 20:48:17 +02:00
|
|
|
if (db_options_.create_if_missing) {
|
2012-12-17 06:01:02 +01:00
|
|
|
s = NewDB();
|
2014-04-15 18:06:13 +02:00
|
|
|
is_new_db = true;
|
2012-12-17 06:01:02 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
dbname_, "does not exist (create_if_missing is false)");
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2015-07-21 02:20:40 +02:00
|
|
|
} else if (s.ok()) {
|
2014-09-05 20:48:17 +02:00
|
|
|
if (db_options_.error_if_exists) {
|
2012-12-17 06:01:02 +01:00
|
|
|
return Status::InvalidArgument(
|
|
|
|
dbname_, "exists (error_if_exists is true)");
|
|
|
|
}
|
2015-07-21 02:20:40 +02:00
|
|
|
} else {
|
|
|
|
// Unexpected error reading file
|
|
|
|
assert(s.IsIOError());
|
|
|
|
return s;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2013-10-18 23:50:54 +02:00
|
|
|
// Check for the IDENTITY file and create it if not there
|
2015-07-21 02:20:40 +02:00
|
|
|
s = env_->FileExists(IdentityFileName(dbname_));
|
|
|
|
if (s.IsNotFound()) {
|
2013-10-18 23:50:54 +02:00
|
|
|
s = SetIdentityFile(env_, dbname_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2015-07-21 02:20:40 +02:00
|
|
|
} else if (!s.ok()) {
|
|
|
|
assert(s.IsIOError());
|
|
|
|
return s;
|
2013-10-18 23:50:54 +02:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-04-09 18:56:17 +02:00
|
|
|
Status s = versions_->Recover(column_families, read_only);
|
2014-09-05 20:48:17 +02:00
|
|
|
if (db_options_.paranoid_checks && s.ok()) {
|
2014-03-20 22:18:29 +01:00
|
|
|
s = CheckConsistency();
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
if (s.ok()) {
|
2015-05-29 23:36:35 +02:00
|
|
|
SequenceNumber max_sequence(kMaxSequenceNumber);
|
2014-02-11 02:04:44 +01:00
|
|
|
default_cf_handle_ = new ColumnFamilyHandleImpl(
|
|
|
|
versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
|
2014-06-07 02:26:23 +02:00
|
|
|
single_column_family_mode_ =
|
|
|
|
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
|
2011-06-22 04:36:45 +02:00
|
|
|
|
|
|
|
// Recover from all newer log files than the ones named in the
|
|
|
|
// descriptor (new log files may have been added by the previous
|
|
|
|
// incarnation without registering them in the descriptor).
|
|
|
|
//
|
2014-11-04 02:45:55 +01:00
|
|
|
// Note that prev_log_number() is no longer used, but we pay
|
2011-06-22 04:36:45 +02:00
|
|
|
// attention to it in case we are recovering a database
|
2013-10-05 07:32:05 +02:00
|
|
|
// produced by an older version of rocksdb.
|
2014-01-28 20:05:04 +01:00
|
|
|
const uint64_t min_log = versions_->MinLogNumber();
|
2014-11-04 02:45:55 +01:00
|
|
|
const uint64_t prev_log = versions_->prev_log_number();
|
2011-06-22 04:36:45 +02:00
|
|
|
std::vector<std::string> filenames;
|
2014-09-05 20:48:17 +02:00
|
|
|
s = env_->GetChildren(db_options_.wal_dir, &filenames);
|
2011-06-22 04:36:45 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
2011-04-12 21:38:58 +02:00
|
|
|
}
|
2013-12-31 03:33:57 +01:00
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
std::vector<uint64_t> logs;
|
|
|
|
for (size_t i = 0; i < filenames.size(); i++) {
|
2013-12-31 03:33:57 +01:00
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
2014-04-15 18:06:13 +02:00
|
|
|
if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
|
|
|
|
if (is_new_db) {
|
|
|
|
return Status::Corruption(
|
|
|
|
"While creating a new Db, wal_dir contains "
|
|
|
|
"existing log file: ",
|
|
|
|
filenames[i]);
|
|
|
|
} else if ((number >= min_log) || (number == prev_log)) {
|
|
|
|
logs.push_back(number);
|
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
|
2012-11-06 04:18:49 +01:00
|
|
|
if (logs.size() > 0 && error_if_log_file_exist) {
|
|
|
|
return Status::Corruption(""
|
|
|
|
"The db was opened in readonly mode with error_if_log_file_exist"
|
|
|
|
"flag but a log file already exists");
|
|
|
|
}
|
|
|
|
|
2014-09-09 20:18:50 +02:00
|
|
|
if (!logs.empty()) {
|
|
|
|
// Recover in the order in which the logs were generated
|
|
|
|
std::sort(logs.begin(), logs.end());
|
|
|
|
s = RecoverLogFiles(logs, &max_sequence, read_only);
|
|
|
|
if (!s.ok()) {
|
|
|
|
// Clear memtables if recovery failed
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
2015-05-29 23:36:35 +02:00
|
|
|
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
|
|
|
|
kMaxSequenceNumber);
|
2014-09-09 20:18:50 +02:00
|
|
|
}
|
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
2014-07-28 21:05:36 +02:00
|
|
|
SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence());
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
// Initial value
|
|
|
|
max_total_in_memory_state_ = 0;
|
2014-04-30 20:33:40 +02:00
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
2014-10-17 01:57:59 +02:00
|
|
|
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
|
|
|
max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
|
|
|
|
mutable_cf_options->max_write_buffer_number;
|
2014-04-30 20:33:40 +02:00
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-09-09 20:18:50 +02:00
|
|
|
// REQUIRES: log_numbers are sorted in ascending order
|
|
|
|
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
|
|
|
SequenceNumber* max_sequence, bool read_only) {
|
2011-03-18 23:37:00 +01:00
|
|
|
struct LogReporter : public log::Reader::Reporter {
|
|
|
|
Env* env;
|
2011-07-21 04:40:18 +02:00
|
|
|
Logger* info_log;
|
2011-03-18 23:37:00 +01:00
|
|
|
const char* fname;
|
2015-01-05 22:35:56 +01:00
|
|
|
Status* status; // nullptr if db_options_.paranoid_checks==false
|
2015-02-26 20:28:41 +01:00
|
|
|
virtual void Corruption(size_t bytes, const Status& s) override {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL,
|
|
|
|
info_log, "%s%s: dropping %d bytes; %s",
|
2013-02-15 20:53:17 +01:00
|
|
|
(this->status == nullptr ? "(ignoring error) " : ""),
|
2011-03-18 23:37:00 +01:00
|
|
|
fname, static_cast<int>(bytes), s.ToString().c_str());
|
2015-06-15 21:03:13 +02:00
|
|
|
if (this->status != nullptr && this->status->ok()) {
|
|
|
|
*this->status = s;
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
mutex_.AssertHeld();
|
2014-09-09 20:18:50 +02:00
|
|
|
Status status;
|
2014-01-28 20:05:04 +01:00
|
|
|
std::unordered_map<int, VersionEdit> version_edits;
|
2014-02-11 02:04:44 +01:00
|
|
|
// no need to refcount because iteration is under mutex
|
2014-01-28 20:05:04 +01:00
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
VersionEdit edit;
|
2014-01-29 22:28:50 +01:00
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
|
|
|
version_edits.insert({cfd->GetID(), edit});
|
2014-01-28 20:05:04 +01:00
|
|
|
}
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
int job_id = next_job_id_.fetch_add(1);
|
|
|
|
{
|
|
|
|
auto stream = event_logger_.Log();
|
|
|
|
stream << "job" << job_id << "event"
|
|
|
|
<< "recovery_started";
|
|
|
|
stream << "log_files";
|
|
|
|
stream.StartArray();
|
|
|
|
for (auto log_number : log_numbers) {
|
|
|
|
stream << log_number;
|
|
|
|
}
|
|
|
|
stream.EndArray();
|
|
|
|
}
|
Refactor Recover() code
Summary:
This diff does two things:
* Rethinks how we call Recover() with read_only option. Before, we call it with pointer to memtable where we'd like to apply those changes to. This memtable is set in db_impl_readonly.cc and it's actually DBImpl::mem_. Why don't we just apply updates to mem_ right away? It seems more intuitive.
* Changes when we apply updates to manifest. Before, the process is to recover all the logs, flush it to sst files and then do one giant commit that atomically adds all recovered sst files and sets the next log number. This works good enough, but causes some small troubles for my column family approach, since I can't have one VersionEdit apply to more than single column family[1]. The change here is to commit the files recovered from logs right away. Here is the state of the world before the change:
1. Recover log 5, add new sst files to edit
2. Recover log 7, add new sst files to edit
3. Recover log 8, add new sst files to edit
4. Commit all added sst files to manifest and mark log files 5, 7 and 8 as recoverd (via SetLogNumber(9) function)
After the change, we'll do:
1. Recover log 5, commit the new sst files and set log 5 as recovered
2. Recover log 7, commit the new sst files and set log 7 as recovered
3. Recover log 8, commit the new sst files and set log 8 as recovered
The added (small) benefit is that if we fail after (2), the new recovery will only have to recover log 8. In previous case, we'll have to restart the recovery from the beginning. The bigger benefit will be to enable easier integration of multiple column families in Recovery code path.
[1] I'm happy to dicuss this decison, but I believe this is the cleanest way to go. It also makes backward compatibility much easier. We don't have a requirement of adding multiple column families atomically.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15237
2014-01-22 19:45:26 +01:00
|
|
|
|
2015-06-15 21:03:13 +02:00
|
|
|
bool continue_replay_log = true;
|
2014-09-09 20:18:50 +02:00
|
|
|
for (auto log_number : log_numbers) {
|
|
|
|
// The previous incarnation may not have written any MANIFEST
|
|
|
|
// records after allocating this log number. So we manually
|
|
|
|
// update the file number allocation counter in VersionSet.
|
2014-11-08 00:44:12 +01:00
|
|
|
versions_->MarkFileNumberUsedDuringRecovery(log_number);
|
2014-09-09 20:18:50 +02:00
|
|
|
// Open the log file
|
|
|
|
std::string fname = LogFileName(db_options_.wal_dir, log_number);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
unique_ptr<SequentialFileReader> file_reader;
|
|
|
|
{
|
|
|
|
unique_ptr<SequentialFile> file;
|
|
|
|
status = env_->NewSequentialFile(fname, &file, env_options_);
|
2014-09-09 20:18:50 +02:00
|
|
|
if (!status.ok()) {
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
MaybeIgnoreError(&status);
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
} else {
|
|
|
|
// Fail with one log file, but that's ok.
|
|
|
|
// Try next one.
|
|
|
|
continue;
|
|
|
|
}
|
2014-09-09 20:18:50 +02:00
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
file_reader.reset(new SequentialFileReader(std::move(file)));
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-09-09 20:18:50 +02:00
|
|
|
// Create the log reader.
|
|
|
|
LogReporter reporter;
|
|
|
|
reporter.env = env_;
|
|
|
|
reporter.info_log = db_options_.info_log.get();
|
|
|
|
reporter.fname = fname.c_str();
|
2015-06-15 21:03:13 +02:00
|
|
|
if (!db_options_.paranoid_checks ||
|
|
|
|
db_options_.wal_recovery_mode ==
|
|
|
|
WALRecoveryMode::kSkipAnyCorruptedRecords) {
|
|
|
|
reporter.status = nullptr;
|
|
|
|
} else {
|
|
|
|
reporter.status = &status;
|
|
|
|
}
|
2014-09-09 20:18:50 +02:00
|
|
|
// We intentially make log::Reader do checksumming even if
|
|
|
|
// paranoid_checks==false so that corruptions cause entire commits
|
|
|
|
// to be skipped instead of propagating bad information (like overly
|
|
|
|
// large sequence numbers).
|
2015-10-08 19:06:16 +02:00
|
|
|
log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
|
|
|
|
true /*checksum*/, 0 /*initial_offset*/, log_number);
|
2015-06-15 21:03:13 +02:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number,
|
|
|
|
db_options_.wal_recovery_mode, !continue_replay_log);
|
|
|
|
|
|
|
|
// Determine if we should tolerate incomplete records at the tail end of the
|
2014-09-09 20:18:50 +02:00
|
|
|
// Read all the records and add to a memtable
|
|
|
|
std::string scratch;
|
|
|
|
Slice record;
|
|
|
|
WriteBatch batch;
|
2015-06-15 21:03:13 +02:00
|
|
|
|
|
|
|
if (!continue_replay_log) {
|
|
|
|
uint64_t bytes;
|
|
|
|
if (env_->GetFileSize(fname, &bytes).ok()) {
|
|
|
|
auto info_log = db_options_.info_log.get();
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, info_log, "%s: dropping %d bytes",
|
|
|
|
fname.c_str(), static_cast<int>(bytes));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-02 21:53:59 +02:00
|
|
|
while (
|
|
|
|
continue_replay_log &&
|
|
|
|
reader.ReadRecord(&record, &scratch, db_options_.wal_recovery_mode) &&
|
|
|
|
status.ok()) {
|
2014-09-09 20:18:50 +02:00
|
|
|
if (record.size() < 12) {
|
|
|
|
reporter.Corruption(record.size(),
|
|
|
|
Status::Corruption("log record too small"));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
WriteBatchInternal::SetContents(&batch, record);
|
|
|
|
|
2015-10-13 20:10:14 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
2015-10-13 02:03:03 +02:00
|
|
|
if (db_options_.wal_filter != nullptr) {
|
2015-10-19 22:27:40 +02:00
|
|
|
WriteBatch new_batch;
|
|
|
|
bool batch_changed = false;
|
|
|
|
|
2015-10-27 02:11:18 +01:00
|
|
|
WalFilter::WalProcessingOption wal_processing_option =
|
2015-10-29 23:52:32 +01:00
|
|
|
db_options_.wal_filter->LogRecord(batch, &new_batch,
|
|
|
|
&batch_changed);
|
2015-10-13 02:03:03 +02:00
|
|
|
|
2015-10-27 02:11:18 +01:00
|
|
|
switch (wal_processing_option) {
|
2015-10-29 23:52:32 +01:00
|
|
|
case WalFilter::WalProcessingOption::kContinueProcessing:
|
|
|
|
// do nothing, proceeed normally
|
|
|
|
break;
|
|
|
|
case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
|
|
|
|
// skip current record
|
2015-10-19 22:27:40 +02:00
|
|
|
continue;
|
2015-10-29 23:52:32 +01:00
|
|
|
case WalFilter::WalProcessingOption::kStopReplay:
|
|
|
|
// skip current record and stop replay
|
|
|
|
continue_replay_log = false;
|
2015-10-19 22:27:40 +02:00
|
|
|
continue;
|
2015-10-29 23:52:32 +01:00
|
|
|
case WalFilter::WalProcessingOption::kCorruptedRecord: {
|
|
|
|
status = Status::Corruption("Corruption reported by Wal Filter ",
|
|
|
|
db_options_.wal_filter->Name());
|
|
|
|
MaybeIgnoreError(&status);
|
|
|
|
if (!status.ok()) {
|
|
|
|
reporter.Corruption(record.size(), status);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default: {
|
|
|
|
assert(false); // unhandled case
|
|
|
|
status = Status::NotSupported(
|
|
|
|
"Unknown WalProcessingOption returned"
|
|
|
|
" by Wal Filter ",
|
|
|
|
db_options_.wal_filter->Name());
|
|
|
|
MaybeIgnoreError(&status);
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
} else {
|
|
|
|
// Ignore the error with current record processing.
|
|
|
|
continue;
|
|
|
|
}
|
2015-10-19 22:27:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (batch_changed) {
|
|
|
|
// Make sure that the count in the new batch is
|
|
|
|
// within the orignal count.
|
|
|
|
int new_count = WriteBatchInternal::Count(&new_batch);
|
|
|
|
int original_count = WriteBatchInternal::Count(&batch);
|
|
|
|
if (new_count > original_count) {
|
2015-10-27 02:11:18 +01:00
|
|
|
Log(InfoLogLevel::FATAL_LEVEL, db_options_.info_log,
|
2015-10-29 23:52:32 +01:00
|
|
|
"Recovering log #%" PRIu64
|
|
|
|
" mode %d log filter %s returned "
|
|
|
|
"more records (%d) than original (%d) which is not allowed. "
|
|
|
|
"Aborting recovery.",
|
|
|
|
log_number, db_options_.wal_recovery_mode,
|
|
|
|
db_options_.wal_filter->Name(), new_count, original_count);
|
|
|
|
status = Status::NotSupported(
|
|
|
|
"More than original # of records "
|
|
|
|
"returned by Wal Filter ",
|
|
|
|
db_options_.wal_filter->Name());
|
2015-10-27 02:11:18 +01:00
|
|
|
return status;
|
2015-10-19 22:27:40 +02:00
|
|
|
}
|
|
|
|
// Set the same sequence number in the new_batch
|
|
|
|
// as the original batch.
|
2015-10-29 23:52:32 +01:00
|
|
|
WriteBatchInternal::SetSequence(&new_batch,
|
|
|
|
WriteBatchInternal::Sequence(&batch));
|
2015-10-19 22:27:40 +02:00
|
|
|
batch = new_batch;
|
2015-10-13 02:03:03 +02:00
|
|
|
}
|
|
|
|
}
|
2015-10-29 23:52:32 +01:00
|
|
|
#endif // ROCKSDB_LITE
|
2015-10-13 02:03:03 +02:00
|
|
|
|
2014-09-09 20:18:50 +02:00
|
|
|
// If column family was not found, it might mean that the WAL write
|
|
|
|
// batch references to the column family that was dropped after the
|
|
|
|
// insert. We don't want to fail the whole write batch in that case --
|
|
|
|
// we just ignore the update.
|
|
|
|
// That's why we set ignore missing column families to true
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
status =
|
|
|
|
WriteBatchInternal::InsertInto(&batch, column_family_memtables_.get(),
|
|
|
|
&flush_scheduler_, true, log_number);
|
2014-09-09 20:18:50 +02:00
|
|
|
|
|
|
|
MaybeIgnoreError(&status);
|
|
|
|
if (!status.ok()) {
|
2015-09-10 06:59:28 +02:00
|
|
|
// We are treating this as a failure while reading since we read valid
|
|
|
|
// blocks that do not form coherent data
|
|
|
|
reporter.Corruption(record.size(), status);
|
|
|
|
continue;
|
2014-09-09 20:18:50 +02:00
|
|
|
}
|
2015-09-10 06:59:28 +02:00
|
|
|
|
2014-09-09 20:18:50 +02:00
|
|
|
const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
|
|
|
|
WriteBatchInternal::Count(&batch) - 1;
|
2015-05-29 23:36:35 +02:00
|
|
|
if ((*max_sequence == kMaxSequenceNumber) || (last_seq > *max_sequence)) {
|
2014-09-09 20:18:50 +02:00
|
|
|
*max_sequence = last_seq;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!read_only) {
|
2014-09-11 03:46:09 +02:00
|
|
|
// we can do this because this is called before client has access to the
|
|
|
|
// DB and there is only a single thread operating on DB
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
|
2014-09-11 03:46:09 +02:00
|
|
|
cfd->Unref();
|
|
|
|
// If this asserts, it means that InsertInto failed in
|
|
|
|
// filtering updates to already-flushed column families
|
|
|
|
assert(cfd->GetLogNumber() <= log_number);
|
|
|
|
auto iter = version_edits.find(cfd->GetID());
|
|
|
|
assert(iter != version_edits.end());
|
|
|
|
VersionEdit* edit = &iter->second;
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
|
2014-09-11 03:46:09 +02:00
|
|
|
if (!status.ok()) {
|
|
|
|
// Reflect errors immediately so that conditions like full
|
|
|
|
// file-systems cause the DB::Open() to fail.
|
|
|
|
return status;
|
2014-01-28 20:05:04 +01:00
|
|
|
}
|
2015-05-29 23:36:35 +02:00
|
|
|
|
|
|
|
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
|
|
|
|
*max_sequence);
|
2014-01-28 20:05:04 +01:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-05 19:49:41 +01:00
|
|
|
if (!status.ok()) {
|
2015-06-15 21:03:13 +02:00
|
|
|
if (db_options_.wal_recovery_mode ==
|
2015-09-10 06:59:28 +02:00
|
|
|
WALRecoveryMode::kSkipAnyCorruptedRecords) {
|
|
|
|
// We should ignore all errors unconditionally
|
|
|
|
status = Status::OK();
|
|
|
|
} else if (db_options_.wal_recovery_mode ==
|
2015-06-15 21:03:13 +02:00
|
|
|
WALRecoveryMode::kPointInTimeRecovery) {
|
|
|
|
// We should ignore the error but not continue replaying
|
|
|
|
status = Status::OK();
|
|
|
|
continue_replay_log = false;
|
|
|
|
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"Point in time recovered to log #%" PRIu64 " seq #%" PRIu64,
|
|
|
|
log_number, *max_sequence);
|
2015-09-10 06:59:28 +02:00
|
|
|
} else {
|
|
|
|
assert(db_options_.wal_recovery_mode ==
|
|
|
|
WALRecoveryMode::kTolerateCorruptedTailRecords
|
|
|
|
|| db_options_.wal_recovery_mode ==
|
|
|
|
WALRecoveryMode::kAbsoluteConsistency);
|
2015-06-15 21:03:13 +02:00
|
|
|
return status;
|
|
|
|
}
|
2015-01-05 19:49:41 +01:00
|
|
|
}
|
|
|
|
|
2014-09-11 03:46:09 +02:00
|
|
|
flush_scheduler_.Clear();
|
2015-05-29 23:36:35 +02:00
|
|
|
if ((*max_sequence != kMaxSequenceNumber) &&
|
|
|
|
(versions_->LastSequence() < *max_sequence)) {
|
2014-09-09 20:18:50 +02:00
|
|
|
versions_->SetLastSequence(*max_sequence);
|
|
|
|
}
|
2014-03-12 18:52:32 +01:00
|
|
|
}
|
|
|
|
|
2014-01-28 20:05:04 +01:00
|
|
|
if (!read_only) {
|
2014-02-11 02:04:44 +01:00
|
|
|
// no need to refcount since client still doesn't have access
|
|
|
|
// to the DB and can not drop column families while we iterate
|
2014-09-09 20:18:50 +02:00
|
|
|
auto max_log_number = log_numbers.back();
|
2014-01-28 20:05:04 +01:00
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
2014-01-29 22:28:50 +01:00
|
|
|
auto iter = version_edits.find(cfd->GetID());
|
2014-01-28 20:05:04 +01:00
|
|
|
assert(iter != version_edits.end());
|
|
|
|
VersionEdit* edit = &iter->second;
|
|
|
|
|
2014-09-09 20:18:50 +02:00
|
|
|
if (cfd->GetLogNumber() > max_log_number) {
|
2014-01-31 02:48:42 +01:00
|
|
|
// Column family cfd has already flushed the data
|
2014-09-09 20:18:50 +02:00
|
|
|
// from all logs. Memtable has to be empty because
|
2014-01-31 02:48:42 +01:00
|
|
|
// we filter the updates based on log_number
|
2014-02-06 20:44:50 +01:00
|
|
|
// (in WriteBatch::InsertInto)
|
2014-01-31 02:48:42 +01:00
|
|
|
assert(cfd->mem()->GetFirstSequenceNumber() == 0);
|
|
|
|
assert(edit->NumEntries() == 0);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// flush the final memtable (if non-empty)
|
|
|
|
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
|
2014-09-09 20:18:50 +02:00
|
|
|
if (!status.ok()) {
|
|
|
|
// Recovery failed
|
|
|
|
break;
|
|
|
|
}
|
2015-05-29 23:36:35 +02:00
|
|
|
|
|
|
|
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
|
|
|
|
*max_sequence);
|
2014-01-28 20:05:04 +01:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-01-28 20:05:04 +01:00
|
|
|
// write MANIFEST with update
|
2014-09-09 20:18:50 +02:00
|
|
|
// writing log_number in the manifest means that any log file
|
2014-01-28 20:05:04 +01:00
|
|
|
// with number strongly less than (log_number + 1) is already
|
|
|
|
// recovered and should be ignored on next reincarnation.
|
2014-09-09 20:18:50 +02:00
|
|
|
// Since we already recovered max_log_number, we want all logs
|
|
|
|
// with numbers `<= max_log_number` (includes this one) to be ignored
|
|
|
|
edit->SetLogNumber(max_log_number + 1);
|
2014-01-31 02:48:42 +01:00
|
|
|
// we must mark the next log number as used, even though it's
|
|
|
|
// not actually used. that is because VersionSet assumes
|
|
|
|
// VersionSet::next_file_number_ always to be strictly greater than any
|
2014-03-11 22:52:17 +01:00
|
|
|
// log number
|
2014-11-08 00:44:12 +01:00
|
|
|
versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1);
|
2014-10-02 01:19:16 +02:00
|
|
|
status = versions_->LogAndApply(
|
|
|
|
cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_);
|
2014-01-28 20:05:04 +01:00
|
|
|
if (!status.ok()) {
|
2014-09-09 20:18:50 +02:00
|
|
|
// Recovery failed
|
|
|
|
break;
|
2014-01-28 20:05:04 +01:00
|
|
|
}
|
|
|
|
}
|
2013-11-25 20:55:36 +01:00
|
|
|
}
|
Refactor Recover() code
Summary:
This diff does two things:
* Rethinks how we call Recover() with read_only option. Before, we call it with pointer to memtable where we'd like to apply those changes to. This memtable is set in db_impl_readonly.cc and it's actually DBImpl::mem_. Why don't we just apply updates to mem_ right away? It seems more intuitive.
* Changes when we apply updates to manifest. Before, the process is to recover all the logs, flush it to sst files and then do one giant commit that atomically adds all recovered sst files and sets the next log number. This works good enough, but causes some small troubles for my column family approach, since I can't have one VersionEdit apply to more than single column family[1]. The change here is to commit the files recovered from logs right away. Here is the state of the world before the change:
1. Recover log 5, add new sst files to edit
2. Recover log 7, add new sst files to edit
3. Recover log 8, add new sst files to edit
4. Commit all added sst files to manifest and mark log files 5, 7 and 8 as recoverd (via SetLogNumber(9) function)
After the change, we'll do:
1. Recover log 5, commit the new sst files and set log 5 as recovered
2. Recover log 7, commit the new sst files and set log 7 as recovered
3. Recover log 8, commit the new sst files and set log 8 as recovered
The added (small) benefit is that if we fail after (2), the new recovery will only have to recover log 8. In previous case, we'll have to restart the recovery from the beginning. The bigger benefit will be to enable easier integration of multiple column families in Recovery code path.
[1] I'm happy to dicuss this decison, but I believe this is the cleanest way to go. It also makes backward compatibility much easier. We don't have a requirement of adding multiple column families atomically.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15237
2014-01-22 19:45:26 +01:00
|
|
|
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
event_logger_.Log() << "job" << job_id << "event"
|
|
|
|
<< "recovery_finished";
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
|
MemTable* mem, VersionEdit* edit) {
|
2011-03-18 23:37:00 +01:00
|
|
|
mutex_.AssertHeld();
|
2011-04-12 21:38:58 +02:00
|
|
|
const uint64_t start_micros = env_->NowMicros();
|
2011-03-18 23:37:00 +01:00
|
|
|
FileMetaData meta;
|
2014-07-02 18:54:20 +02:00
|
|
|
meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
|
2014-11-07 20:50:34 +01:00
|
|
|
auto pending_outputs_inserted_elem =
|
|
|
|
CaptureCurrentFileNumberInPendingOutputs();
|
2014-08-26 01:14:30 +02:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
2014-09-05 02:40:41 +02:00
|
|
|
Arena arena;
|
2011-04-12 21:38:58 +02:00
|
|
|
Status s;
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-13 00:53:55 +02:00
|
|
|
TableProperties table_properties;
|
2011-04-12 21:38:58 +02:00
|
|
|
{
|
2014-09-05 02:40:41 +02:00
|
|
|
ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] [WriteLevel0TableForRecovery]"
|
|
|
|
" Level-0 table #%" PRIu64 ": started",
|
2014-09-05 02:40:41 +02:00
|
|
|
cfd->GetName().c_str(), meta.fd.GetNumber());
|
2011-04-12 21:38:58 +02:00
|
|
|
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-13 00:53:55 +02:00
|
|
|
bool paranoid_file_checks =
|
|
|
|
cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
|
2014-09-05 02:40:41 +02:00
|
|
|
{
|
|
|
|
mutex_.Unlock();
|
2015-06-02 23:12:23 +02:00
|
|
|
TableFileCreationInfo info;
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
|
|
|
|
SequenceNumber earliest_write_conflict_snapshot;
|
|
|
|
std::vector<SequenceNumber> snapshot_seqs =
|
|
|
|
snapshots_.GetAll(&earliest_write_conflict_snapshot);
|
|
|
|
|
2014-09-05 02:40:41 +02:00
|
|
|
s = BuildTable(
|
|
|
|
dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(),
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 19:04:30 +02:00
|
|
|
iter.get(), &meta, cfd->internal_comparator(),
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
cfd->int_tbl_prop_collector_factories(), cfd->GetID(), snapshot_seqs,
|
|
|
|
earliest_write_conflict_snapshot,
|
|
|
|
GetCompressionFlush(*cfd->ioptions()),
|
Measure file read latency histogram per level
Summary: In internal stats, remember read latency histogram, if statistics is enabled. It can be retrieved from DB::GetProperty() with "rocksdb.dbstats" property, if it is enabled.
Test Plan: Manually run db_bench and prints out "rocksdb.dbstats" by hand and make sure it prints out as expected
Reviewers: igor, IslamAbdelRahman, rven, kradhakrishnan, anthony, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44193
2015-08-13 23:35:54 +02:00
|
|
|
cfd->ioptions()->compression_opts, paranoid_file_checks,
|
|
|
|
cfd->internal_stats(), Env::IO_HIGH, &info.table_properties);
|
2014-09-05 20:48:17 +02:00
|
|
|
LogFlush(db_options_.info_log);
|
2015-05-22 20:24:12 +02:00
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] [WriteLevel0TableForRecovery]"
|
|
|
|
" Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
|
|
|
|
cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(),
|
|
|
|
s.ToString().c_str());
|
|
|
|
|
|
|
|
// output to event logger
|
|
|
|
if (s.ok()) {
|
2015-06-02 23:12:23 +02:00
|
|
|
info.db_name = dbname_;
|
|
|
|
info.cf_name = cfd->GetName();
|
|
|
|
info.file_path = TableFileName(db_options_.db_paths,
|
|
|
|
meta.fd.GetNumber(),
|
|
|
|
meta.fd.GetPathId());
|
|
|
|
info.file_size = meta.fd.GetFileSize();
|
|
|
|
info.job_id = job_id;
|
|
|
|
EventHelpers::LogAndNotifyTableFileCreation(
|
|
|
|
&event_logger_, db_options_.listeners, meta.fd, info);
|
2015-05-22 20:24:12 +02:00
|
|
|
}
|
2014-09-05 02:40:41 +02:00
|
|
|
mutex_.Lock();
|
|
|
|
}
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-13 00:53:55 +02:00
|
|
|
}
|
2014-11-07 20:50:34 +01:00
|
|
|
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
|
2011-06-22 04:36:45 +02:00
|
|
|
|
|
|
|
// Note that if file_size is zero, the file has been deleted and
|
|
|
|
// should not be added to the manifest.
|
|
|
|
int level = 0;
|
2014-06-14 00:54:19 +02:00
|
|
|
if (s.ok() && meta.fd.GetFileSize() > 0) {
|
2014-07-02 18:54:20 +02:00
|
|
|
edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
|
|
|
|
meta.fd.GetFileSize(), meta.smallest, meta.largest,
|
2015-06-04 21:03:40 +02:00
|
|
|
meta.smallest_seqno, meta.largest_seqno,
|
|
|
|
meta.marked_for_compaction);
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
|
|
|
|
2014-07-12 00:03:30 +02:00
|
|
|
InternalStats::CompactionStats stats(1);
|
2011-04-12 21:38:58 +02:00
|
|
|
stats.micros = env_->NowMicros() - start_micros;
|
2014-06-14 00:54:19 +02:00
|
|
|
stats.bytes_written = meta.fd.GetFileSize();
|
2015-06-18 08:40:34 +02:00
|
|
|
stats.num_output_files = 1;
|
2014-02-05 02:45:19 +01:00
|
|
|
cfd->internal_stats()->AddCompactionStats(level, stats);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
cfd->internal_stats()->AddCFStats(
|
|
|
|
InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize());
|
2014-07-28 21:05:36 +02:00
|
|
|
RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
|
2011-03-18 23:37:00 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
Status DBImpl::FlushMemTableToOutputFile(
|
|
|
|
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) {
|
2012-10-19 23:00:53 +02:00
|
|
|
mutex_.AssertHeld();
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
assert(cfd->imm()->NumNotFlushed() != 0);
|
2014-02-12 23:01:30 +01:00
|
|
|
assert(cfd->imm()->IsFlushPending());
|
2012-10-19 23:00:53 +02:00
|
|
|
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
SequenceNumber earliest_write_conflict_snapshot;
|
|
|
|
std::vector<SequenceNumber> snapshot_seqs =
|
|
|
|
snapshots_.GetAll(&earliest_write_conflict_snapshot);
|
|
|
|
|
|
|
|
FlushJob flush_job(
|
|
|
|
dbname_, cfd, db_options_, mutable_cf_options, env_options_,
|
|
|
|
versions_.get(), &mutex_, &shutting_down_, snapshot_seqs,
|
|
|
|
earliest_write_conflict_snapshot, job_context, log_buffer,
|
|
|
|
directories_.GetDbDir(), directories_.GetDataDir(0U),
|
|
|
|
GetCompressionFlush(*cfd->ioptions()), stats_, &event_logger_);
|
2012-10-19 23:00:53 +02:00
|
|
|
|
2015-06-12 00:22:22 +02:00
|
|
|
FileMetaData file_meta;
|
2015-06-02 23:12:23 +02:00
|
|
|
|
|
|
|
// Within flush_job.Run, rocksdb may call event listener to notify
|
|
|
|
// file creation and deletion.
|
|
|
|
//
|
|
|
|
// Note that flush_job.Run will unlock and lock the db_mutex,
|
|
|
|
// and EventListener callback will be called when the db_mutex
|
|
|
|
// is unlocked by the current thread.
|
2015-06-12 00:22:22 +02:00
|
|
|
Status s = flush_job.Run(&file_meta);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
if (s.ok()) {
|
2015-06-17 21:37:59 +02:00
|
|
|
InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context,
|
|
|
|
mutable_cf_options);
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
if (made_progress) {
|
|
|
|
*made_progress = 1;
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
2014-10-27 23:49:46 +01:00
|
|
|
VersionStorageInfo::LevelSummaryStorage tmp;
|
2014-04-30 20:33:40 +02:00
|
|
|
LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(),
|
2014-10-31 16:48:19 +01:00
|
|
|
cfd->current()->storage_info()->LevelSummary(&tmp));
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2014-03-20 21:10:02 +01:00
|
|
|
|
2014-09-05 20:48:17 +02:00
|
|
|
if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks &&
|
2014-03-20 21:10:02 +01:00
|
|
|
bg_error_.ok()) {
|
|
|
|
// if a bad error happened (not ShutdownInProgress) and paranoid_checks is
|
|
|
|
// true, mark DB read-only
|
|
|
|
bg_error_ = s;
|
|
|
|
}
|
2014-07-04 01:28:03 +02:00
|
|
|
RecordFlushIOStats();
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (s.ok()) {
|
|
|
|
// may temporarily unlock and lock the mutex.
|
2015-06-12 00:22:22 +02:00
|
|
|
NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options,
|
2015-09-15 18:03:08 +02:00
|
|
|
job_context->job_id, flush_job.GetTableProperties());
|
2016-01-29 03:35:01 +01:00
|
|
|
auto sfm =
|
|
|
|
static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
|
|
|
|
if (sfm) {
|
|
|
|
// Notify sst_file_manager that a new file was added
|
|
|
|
std::string file_path = MakeTableFileName(db_options_.db_paths[0].path,
|
|
|
|
file_meta.fd.GetNumber());
|
|
|
|
sfm->OnAddFile(file_path);
|
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
2011-03-18 23:37:00 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2015-09-15 18:03:08 +02:00
|
|
|
void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd,
|
|
|
|
FileMetaData* file_meta,
|
|
|
|
const MutableCFOptions& mutable_cf_options,
|
|
|
|
int job_id, TableProperties prop) {
|
2014-12-24 01:17:53 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2015-05-28 22:21:39 +02:00
|
|
|
if (db_options_.listeners.size() == 0U) {
|
2014-12-17 02:10:23 +01:00
|
|
|
return;
|
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
mutex_.AssertHeld();
|
|
|
|
if (shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
return;
|
|
|
|
}
|
2015-06-05 21:28:51 +02:00
|
|
|
bool triggered_writes_slowdown =
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
(cfd->current()->storage_info()->NumLevelFiles(0) >=
|
2014-11-18 19:19:48 +01:00
|
|
|
mutable_cf_options.level0_slowdown_writes_trigger);
|
2015-06-05 21:28:51 +02:00
|
|
|
bool triggered_writes_stop =
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
(cfd->current()->storage_info()->NumLevelFiles(0) >=
|
2014-11-18 19:19:48 +01:00
|
|
|
mutable_cf_options.level0_stop_writes_trigger);
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
// release lock while notifying events
|
|
|
|
mutex_.Unlock();
|
2015-05-28 22:21:39 +02:00
|
|
|
{
|
2015-06-05 21:28:51 +02:00
|
|
|
FlushJobInfo info;
|
|
|
|
info.cf_name = cfd->GetName();
|
|
|
|
// TODO(yhchiang): make db_paths dynamic in case flush does not
|
|
|
|
// go to L0 in the future.
|
|
|
|
info.file_path = MakeTableFileName(db_options_.db_paths[0].path,
|
2015-06-12 00:22:22 +02:00
|
|
|
file_meta->fd.GetNumber());
|
2015-06-11 23:18:02 +02:00
|
|
|
info.thread_id = env_->GetThreadID();
|
2015-06-05 21:28:51 +02:00
|
|
|
info.job_id = job_id;
|
|
|
|
info.triggered_writes_slowdown = triggered_writes_slowdown;
|
|
|
|
info.triggered_writes_stop = triggered_writes_stop;
|
2015-06-12 00:22:22 +02:00
|
|
|
info.smallest_seqno = file_meta->smallest_seqno;
|
|
|
|
info.largest_seqno = file_meta->largest_seqno;
|
2015-09-15 18:03:08 +02:00
|
|
|
info.table_properties = prop;
|
2015-05-28 22:21:39 +02:00
|
|
|
for (auto listener : db_options_.listeners) {
|
2015-06-05 21:28:51 +02:00
|
|
|
listener->OnFlushCompleted(this, info);
|
2015-05-28 22:21:39 +02:00
|
|
|
}
|
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
mutex_.Lock();
|
|
|
|
// no need to signal bg_cv_ as it will be signaled at the end of the
|
|
|
|
// flush process.
|
2014-12-24 01:17:53 +01:00
|
|
|
#endif // ROCKSDB_LITE
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
}
|
|
|
|
|
2015-06-17 23:36:14 +02:00
|
|
|
Status DBImpl::CompactRange(const CompactRangeOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice* begin, const Slice* end) {
|
|
|
|
if (options.target_path_id >= db_options_.db_paths.size()) {
|
2014-07-17 02:39:18 +02:00
|
|
|
return Status::InvalidArgument("Invalid target path ID");
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
bool exclusive = options.exclusive_manual_compaction;
|
2014-02-01 01:45:20 +01:00
|
|
|
|
|
|
|
Status s = FlushMemTable(cfd, FlushOptions());
|
2014-01-22 21:46:24 +01:00
|
|
|
if (!s.ok()) {
|
2014-09-05 20:48:17 +02:00
|
|
|
LogFlush(db_options_.info_log);
|
2014-01-22 21:46:24 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-05-21 20:43:35 +02:00
|
|
|
int max_level_with_files = 0;
|
2011-10-06 01:30:28 +02:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-02-01 01:45:20 +01:00
|
|
|
Version* base = cfd->current();
|
2015-04-29 19:52:31 +02:00
|
|
|
for (int level = 1; level < base->storage_info()->num_non_empty_levels();
|
|
|
|
level++) {
|
2014-10-31 16:48:19 +01:00
|
|
|
if (base->storage_info()->OverlapInLevel(level, begin, end)) {
|
2011-10-06 01:30:28 +02:00
|
|
|
max_level_with_files = level;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-04-23 01:55:22 +02:00
|
|
|
|
2015-06-11 23:15:52 +02:00
|
|
|
int final_output_level = 0;
|
2015-04-23 01:55:22 +02:00
|
|
|
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
|
|
|
|
cfd->NumberLevels() > 1) {
|
|
|
|
// Always compact all files together.
|
|
|
|
s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
|
2015-06-17 23:36:14 +02:00
|
|
|
cfd->NumberLevels() - 1, options.target_path_id,
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
begin, end, exclusive);
|
2015-06-11 23:15:52 +02:00
|
|
|
final_output_level = cfd->NumberLevels() - 1;
|
2015-04-23 01:55:22 +02:00
|
|
|
} else {
|
|
|
|
for (int level = 0; level <= max_level_with_files; level++) {
|
2015-06-11 23:15:52 +02:00
|
|
|
int output_level;
|
2015-08-25 21:29:44 +02:00
|
|
|
// in case the compaction is universal or if we're compacting the
|
2015-04-23 01:55:22 +02:00
|
|
|
// bottom-most level, the output level will be the same as input one.
|
|
|
|
// level 0 can never be the bottommost level (i.e. if all files are in
|
|
|
|
// level 0, we will compact to level 1)
|
|
|
|
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
|
2015-06-18 20:03:31 +02:00
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
|
|
|
|
output_level = level;
|
|
|
|
} else if (level == max_level_with_files && level > 0) {
|
2015-06-23 22:32:40 +02:00
|
|
|
if (options.bottommost_level_compaction ==
|
|
|
|
BottommostLevelCompaction::kSkip) {
|
|
|
|
// Skip bottommost level compaction
|
|
|
|
continue;
|
|
|
|
} else if (options.bottommost_level_compaction ==
|
|
|
|
BottommostLevelCompaction::kIfHaveCompactionFilter &&
|
|
|
|
cfd->ioptions()->compaction_filter == nullptr &&
|
2015-07-17 18:59:11 +02:00
|
|
|
cfd->ioptions()->compaction_filter_factory == nullptr) {
|
2015-08-25 21:29:44 +02:00
|
|
|
// Skip bottommost level compaction since we don't have a compaction
|
|
|
|
// filter
|
2015-06-18 20:03:31 +02:00
|
|
|
continue;
|
|
|
|
}
|
2015-06-11 23:15:52 +02:00
|
|
|
output_level = level;
|
2015-04-23 01:55:22 +02:00
|
|
|
} else {
|
2015-06-11 23:15:52 +02:00
|
|
|
output_level = level + 1;
|
2015-04-15 06:45:20 +02:00
|
|
|
if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
|
|
|
|
cfd->ioptions()->level_compaction_dynamic_level_bytes &&
|
|
|
|
level == 0) {
|
|
|
|
output_level = ColumnFamilyData::kCompactToBaseLevel;
|
|
|
|
}
|
2015-04-23 01:55:22 +02:00
|
|
|
}
|
2015-06-17 23:36:14 +02:00
|
|
|
s = RunManualCompaction(cfd, level, output_level, options.target_path_id,
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
begin, end, exclusive);
|
2015-04-23 01:55:22 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
2015-06-11 23:15:52 +02:00
|
|
|
if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
|
|
|
|
final_output_level = cfd->NumberLevels() - 1;
|
|
|
|
} else if (output_level > final_output_level) {
|
|
|
|
final_output_level = output_level;
|
|
|
|
}
|
2015-05-19 00:34:33 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
|
|
|
|
TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
|
2014-01-15 01:19:09 +01:00
|
|
|
}
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
2015-04-23 01:55:22 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
return s;
|
|
|
|
}
|
2013-06-30 08:21:36 +02:00
|
|
|
|
2015-06-17 23:36:14 +02:00
|
|
|
if (options.change_level) {
|
2015-10-02 22:17:34 +02:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[RefitLevel] waiting for background threads to stop");
|
|
|
|
s = PauseBackgroundWork();
|
|
|
|
if (s.ok()) {
|
|
|
|
s = ReFitLevel(cfd, final_output_level, options.target_level);
|
|
|
|
}
|
|
|
|
ContinueBackgroundWork();
|
2013-06-30 08:21:36 +02:00
|
|
|
}
|
2014-09-05 20:48:17 +02:00
|
|
|
LogFlush(db_options_.info_log);
|
2014-01-22 21:46:24 +01:00
|
|
|
|
2014-08-28 19:06:28 +02:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-08-28 19:06:28 +02:00
|
|
|
// an automatic compaction that has been scheduled might have been
|
|
|
|
// preempted by the manual compactions. Need to schedule it back.
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
|
2014-01-22 21:46:24 +01:00
|
|
|
return s;
|
2013-06-30 08:21:36 +02:00
|
|
|
}
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
Status DBImpl::CompactFiles(
|
|
|
|
const CompactionOptions& compact_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const std::vector<std::string>& input_file_names,
|
|
|
|
const int output_level, const int output_path_id) {
|
2014-11-13 22:45:33 +01:00
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
// not supported in lite version
|
|
|
|
return Status::NotSupported("Not supported in ROCKSDB LITE");
|
|
|
|
#else
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
if (column_family == nullptr) {
|
|
|
|
return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
|
|
|
|
}
|
|
|
|
|
|
|
|
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
|
|
|
|
assert(cfd);
|
2015-03-11 21:06:59 +01:00
|
|
|
|
|
|
|
Status s;
|
|
|
|
JobContext job_context(0, true);
|
|
|
|
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
|
|
|
|
db_options_.info_log.get());
|
|
|
|
|
|
|
|
// Perform CompactFiles
|
|
|
|
SuperVersion* sv = GetAndRefSuperVersion(cfd);
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
|
|
|
|
s = CompactFilesImpl(compact_options, cfd, sv->current,
|
|
|
|
input_file_names, output_level,
|
|
|
|
output_path_id, &job_context, &log_buffer);
|
|
|
|
}
|
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
|
|
|
|
|
|
// Find and delete obsolete files
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
// If !s.ok(), this means that Compaction failed. In that case, we want
|
|
|
|
// to delete all obsolete files we might have created and we force
|
|
|
|
// FindObsoleteFiles(). This is because job_context does not
|
|
|
|
// catch all created files if compaction failed.
|
|
|
|
FindObsoleteFiles(&job_context, !s.ok());
|
2015-06-04 04:57:01 +02:00
|
|
|
} // release the mutex
|
2015-03-11 21:06:59 +01:00
|
|
|
|
|
|
|
// delete unnecessary files if any, this is done outside the mutex
|
|
|
|
if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
|
|
|
|
// Have to flush the info logs before bg_compaction_scheduled_--
|
|
|
|
// because if bg_flush_scheduled_ becomes 0 and the lock is
|
|
|
|
// released, the deconstructor of DB can kick in and destroy all the
|
|
|
|
// states of DB so info_log might not be available after that point.
|
|
|
|
// It also applies to access other states that DB owns.
|
|
|
|
log_buffer.FlushBufferToLog();
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
2015-06-04 04:57:01 +02:00
|
|
|
// no mutex is locked here. No need to Unlock() and Lock() here.
|
2015-03-11 21:06:59 +01:00
|
|
|
PurgeObsoleteFiles(job_context);
|
|
|
|
}
|
|
|
|
job_context.Clean();
|
|
|
|
}
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
return s;
|
2014-11-13 22:45:33 +01:00
|
|
|
#endif // ROCKSDB_LITE
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
}
|
|
|
|
|
2014-11-13 22:45:33 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
Status DBImpl::CompactFilesImpl(
|
|
|
|
const CompactionOptions& compact_options, ColumnFamilyData* cfd,
|
|
|
|
Version* version, const std::vector<std::string>& input_file_names,
|
2015-03-11 21:06:59 +01:00
|
|
|
const int output_level, int output_path_id, JobContext* job_context,
|
|
|
|
LogBuffer* log_buffer) {
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
mutex_.AssertHeld();
|
|
|
|
|
|
|
|
if (shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
return Status::ShutdownInProgress();
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unordered_set<uint64_t> input_set;
|
|
|
|
for (auto file_name : input_file_names) {
|
|
|
|
input_set.insert(TableFileNameToNumber(file_name));
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyMetaData cf_meta;
|
|
|
|
// TODO(yhchiang): can directly use version here if none of the
|
|
|
|
// following functions call is pluggable to external developers.
|
|
|
|
version->GetColumnFamilyMetaData(&cf_meta);
|
|
|
|
|
|
|
|
if (output_path_id < 0) {
|
|
|
|
if (db_options_.db_paths.size() == 1U) {
|
|
|
|
output_path_id = 0;
|
|
|
|
} else {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Automatic output path selection is not "
|
|
|
|
"yet supported in CompactFiles()");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
|
|
|
|
&input_set, cf_meta, output_level);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
2015-04-11 00:01:54 +02:00
|
|
|
std::vector<CompactionInputFiles> input_files;
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
|
|
|
|
&input_files, &input_set, version->storage_info(), compact_options);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto inputs : input_files) {
|
|
|
|
if (cfd->compaction_picker()->FilesInCompaction(inputs.files)) {
|
|
|
|
return Status::Aborted(
|
|
|
|
"Some of the necessary compaction input "
|
|
|
|
"files are already being compacted");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// At this point, CompactFiles will be run.
|
|
|
|
bg_compaction_scheduled_++;
|
|
|
|
|
|
|
|
unique_ptr<Compaction> c;
|
|
|
|
assert(cfd->compaction_picker());
|
|
|
|
c.reset(cfd->compaction_picker()->FormCompaction(
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
2015-04-11 00:01:54 +02:00
|
|
|
compact_options, input_files, output_level, version->storage_info(),
|
|
|
|
*cfd->GetLatestMutableCFOptions(), output_path_id));
|
2015-11-13 21:01:00 +01:00
|
|
|
if (!c) {
|
|
|
|
return Status::Aborted("Another Level 0 compaction is running");
|
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
c->SetInputVersion(version);
|
|
|
|
// deletion compaction currently not allowed in CompactFiles.
|
2015-07-09 00:21:10 +02:00
|
|
|
assert(!c->deletion_compaction());
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
|
2015-12-08 21:25:48 +01:00
|
|
|
SequenceNumber earliest_write_conflict_snapshot;
|
|
|
|
std::vector<SequenceNumber> snapshot_seqs =
|
|
|
|
snapshots_.GetAll(&earliest_write_conflict_snapshot);
|
|
|
|
|
2015-05-06 04:01:12 +02:00
|
|
|
assert(is_snapshot_supported_ || snapshots_.empty());
|
2015-01-26 22:59:38 +01:00
|
|
|
CompactionJob compaction_job(
|
2015-05-06 04:01:12 +02:00
|
|
|
job_context->job_id, c.get(), db_options_, env_options_, versions_.get(),
|
|
|
|
&shutting_down_, log_buffer, directories_.GetDbDir(),
|
2015-12-08 21:25:48 +01:00
|
|
|
directories_.GetDataDir(c->output_path_id()), stats_, snapshot_seqs,
|
|
|
|
earliest_write_conflict_snapshot, table_cache_, &event_logger_,
|
Add options.compaction_measure_io_stats to print write I/O stats in compactions
Summary:
Add options.compaction_measure_io_stats to print out / pass to listener accumulated time spent on write calls. Example outputs in info logs:
2015/08/12-16:27:59.463944 7fd428bff700 (Original Log Time 2015/08/12-16:27:59.463922) EVENT_LOG_v1 {"time_micros": 1439422079463897, "job": 6, "event": "compaction_finished", "output_level": 1, "num_output_files": 4, "total_output_size": 6900525, "num_input_records": 111483, "num_output_records": 106877, "file_write_nanos": 15663206, "file_range_sync_nanos": 649588, "file_fsync_nanos": 349614797, "file_prepare_write_nanos": 1505812, "lsm_state": [2, 4, 0, 0, 0, 0, 0]}
Add two more counters in iostats_context.
Also add a parameter of db_bench.
Test Plan: Add a unit test. Also manually verify LOG outputs in db_bench
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44115
2015-08-13 02:24:45 +02:00
|
|
|
c->mutable_cf_options()->paranoid_file_checks,
|
|
|
|
c->mutable_cf_options()->compaction_measure_io_stats, dbname_,
|
2015-06-03 02:07:16 +02:00
|
|
|
nullptr); // Here we pass a nullptr for CompactionJobStats because
|
|
|
|
// CompactFiles does not trigger OnCompactionCompleted(),
|
|
|
|
// which is the only place where CompactionJobStats is
|
|
|
|
// returned. The idea of not triggering OnCompationCompleted()
|
|
|
|
// is that CompactFiles runs in the caller thread, so the user
|
|
|
|
// should always know when it completes. As a result, it makes
|
|
|
|
// less sense to notify the users something they should already
|
|
|
|
// know.
|
|
|
|
//
|
|
|
|
// In the future, if we would like to add CompactionJobStats
|
|
|
|
// support for CompactFiles, we should have CompactFiles API
|
|
|
|
// pass a pointer of CompactionJobStats as the out-value
|
|
|
|
// instead of using EventListener.
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
compaction_job.Prepare();
|
|
|
|
|
|
|
|
mutex_.Unlock();
|
2015-11-13 21:01:00 +01:00
|
|
|
TEST_SYNC_POINT("CompactFilesImpl:0");
|
|
|
|
TEST_SYNC_POINT("CompactFilesImpl:1");
|
2015-08-18 20:06:23 +02:00
|
|
|
compaction_job.Run();
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
mutex_.Lock();
|
2015-06-02 23:12:23 +02:00
|
|
|
|
2015-08-18 20:06:23 +02:00
|
|
|
Status status = compaction_job.Install(*c->mutable_cf_options(), &mutex_);
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
if (status.ok()) {
|
2015-06-17 21:37:59 +02:00
|
|
|
InstallSuperVersionAndScheduleWorkWrapper(
|
|
|
|
c->column_family_data(), job_context, *c->mutable_cf_options());
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
}
|
|
|
|
c->ReleaseCompactionFiles(s);
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
// Done
|
|
|
|
} else if (status.IsShutdownInProgress()) {
|
|
|
|
// Ignore compaction errors found during shutting down
|
|
|
|
} else {
|
2015-02-12 18:54:48 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] [JOB %d] Compaction error: %s",
|
2015-03-11 21:06:59 +01:00
|
|
|
c->column_family_data()->GetName().c_str(), job_context->job_id,
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
status.ToString().c_str());
|
|
|
|
if (db_options_.paranoid_checks && bg_error_.ok()) {
|
|
|
|
bg_error_ = status;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-02-16 20:11:16 +01:00
|
|
|
c.reset();
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
bg_compaction_scheduled_--;
|
2015-06-02 00:32:23 +02:00
|
|
|
if (bg_compaction_scheduled_ == 0) {
|
|
|
|
bg_cv_.SignalAll();
|
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
2014-11-13 22:45:33 +01:00
|
|
|
#endif // ROCKSDB_LITE
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
|
2015-10-02 22:17:34 +02:00
|
|
|
Status DBImpl::PauseBackgroundWork() {
|
|
|
|
InstrumentedMutexLock guard_lock(&mutex_);
|
|
|
|
bg_work_paused_++;
|
|
|
|
while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_ > 0) {
|
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::ContinueBackgroundWork() {
|
|
|
|
InstrumentedMutexLock guard_lock(&mutex_);
|
2015-11-16 21:34:55 +01:00
|
|
|
if (bg_work_paused_ == 0) {
|
|
|
|
return Status::InvalidArgument();
|
|
|
|
}
|
2015-10-02 22:17:34 +02:00
|
|
|
assert(bg_work_paused_ > 0);
|
|
|
|
bg_work_paused_--;
|
|
|
|
if (bg_work_paused_ == 0) {
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2015-01-27 23:44:02 +01:00
|
|
|
void DBImpl::NotifyOnCompactionCompleted(
|
2015-06-03 02:07:16 +02:00
|
|
|
ColumnFamilyData* cfd, Compaction *c, const Status &st,
|
|
|
|
const CompactionJobStats& compaction_job_stats,
|
2015-06-03 02:35:39 +02:00
|
|
|
const int job_id) {
|
2015-01-27 23:44:02 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2015-05-28 22:21:39 +02:00
|
|
|
if (db_options_.listeners.size() == 0U) {
|
2015-01-27 23:44:02 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
if (shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// release lock while notifying events
|
|
|
|
mutex_.Unlock();
|
2015-05-28 22:21:39 +02:00
|
|
|
{
|
|
|
|
CompactionJobInfo info;
|
|
|
|
info.cf_name = cfd->GetName();
|
|
|
|
info.status = st;
|
2015-06-11 23:18:02 +02:00
|
|
|
info.thread_id = env_->GetThreadID();
|
2015-06-03 02:07:16 +02:00
|
|
|
info.job_id = job_id;
|
|
|
|
info.base_input_level = c->start_level();
|
2015-05-28 22:21:39 +02:00
|
|
|
info.output_level = c->output_level();
|
2015-06-03 02:07:16 +02:00
|
|
|
info.stats = compaction_job_stats;
|
2015-09-15 18:03:08 +02:00
|
|
|
info.table_properties = c->GetOutputTableProperties();
|
2015-12-22 20:37:19 +01:00
|
|
|
info.compaction_reason = c->compaction_reason();
|
2015-05-28 22:21:39 +02:00
|
|
|
for (size_t i = 0; i < c->num_input_levels(); ++i) {
|
|
|
|
for (const auto fmd : *c->inputs(i)) {
|
2015-09-15 18:03:08 +02:00
|
|
|
auto fn = TableFileName(db_options_.db_paths, fmd->fd.GetNumber(),
|
|
|
|
fmd->fd.GetPathId());
|
|
|
|
info.input_files.push_back(fn);
|
|
|
|
if (info.table_properties.count(fn) == 0) {
|
|
|
|
std::shared_ptr<const TableProperties> tp;
|
2015-10-28 22:07:22 +01:00
|
|
|
auto s = cfd->current()->GetTableProperties(&tp, fmd, &fn);
|
2015-09-15 18:03:08 +02:00
|
|
|
if (s.ok()) {
|
|
|
|
info.table_properties[fn] = tp;
|
|
|
|
}
|
|
|
|
}
|
2015-05-28 22:21:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
for (const auto newf : c->edit()->GetNewFiles()) {
|
|
|
|
info.output_files.push_back(
|
|
|
|
TableFileName(db_options_.db_paths,
|
|
|
|
newf.second.fd.GetNumber(),
|
|
|
|
newf.second.fd.GetPathId()));
|
|
|
|
}
|
|
|
|
for (auto listener : db_options_.listeners) {
|
|
|
|
listener->OnCompactionCompleted(this, info);
|
|
|
|
}
|
|
|
|
}
|
2015-01-27 23:44:02 +01:00
|
|
|
mutex_.Lock();
|
|
|
|
// no need to signal bg_cv_ as it will be signaled at the end of the
|
|
|
|
// flush process.
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
2014-11-05 01:23:05 +01:00
|
|
|
Status DBImpl::SetOptions(ColumnFamilyHandle* column_family,
|
2014-09-17 21:49:13 +02:00
|
|
|
const std::unordered_map<std::string, std::string>& options_map) {
|
2014-11-13 22:45:33 +01:00
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
return Status::NotSupported("Not supported in ROCKSDB LITE");
|
|
|
|
#else
|
2014-10-17 02:22:28 +02:00
|
|
|
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
|
|
|
|
if (options_map.empty()) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL,
|
|
|
|
db_options_.info_log, "SetOptions() on column family [%s], empty input",
|
2014-10-17 02:22:28 +02:00
|
|
|
cfd->GetName().c_str());
|
2014-11-05 01:23:05 +01:00
|
|
|
return Status::InvalidArgument("empty input");
|
2014-10-17 02:22:28 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
MutableCFOptions new_options;
|
2014-11-05 01:23:05 +01:00
|
|
|
Status s;
|
2015-12-09 02:01:02 +01:00
|
|
|
Status persist_options_status;
|
2014-10-17 02:22:28 +02:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-11-05 01:23:05 +01:00
|
|
|
s = cfd->SetOptions(options_map);
|
|
|
|
if (s.ok()) {
|
2014-10-17 02:22:28 +02:00
|
|
|
new_options = *cfd->GetLatestMutableCFOptions();
|
|
|
|
}
|
2015-12-09 02:01:02 +01:00
|
|
|
if (s.ok()) {
|
|
|
|
// Persist RocksDB options under the single write thread
|
|
|
|
WriteThread::Writer w;
|
|
|
|
write_thread_.EnterUnbatched(&w, &mutex_);
|
|
|
|
|
|
|
|
persist_options_status = WriteOptionsFile();
|
|
|
|
|
|
|
|
write_thread_.ExitUnbatched(&w);
|
2015-11-11 07:58:01 +01:00
|
|
|
}
|
|
|
|
}
|
2014-10-17 02:22:28 +02:00
|
|
|
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"SetOptions() on column family [%s], inputs:",
|
2014-10-17 02:22:28 +02:00
|
|
|
cfd->GetName().c_str());
|
|
|
|
for (const auto& o : options_map) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"%s: %s\n", o.first.c_str(), o.second.c_str());
|
2014-10-17 02:22:28 +02:00
|
|
|
}
|
2014-11-05 01:23:05 +01:00
|
|
|
if (s.ok()) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL,
|
|
|
|
db_options_.info_log, "[%s] SetOptions succeeded",
|
2014-10-17 02:22:28 +02:00
|
|
|
cfd->GetName().c_str());
|
|
|
|
new_options.Dump(db_options_.info_log.get());
|
2015-12-09 02:01:02 +01:00
|
|
|
if (!persist_options_status.ok()) {
|
|
|
|
if (db_options_.fail_if_options_file_error) {
|
|
|
|
s = Status::IOError(
|
|
|
|
"SetOptions succeeded, but unable to persist options",
|
|
|
|
persist_options_status.ToString());
|
|
|
|
}
|
|
|
|
Warn(db_options_.info_log,
|
|
|
|
"Unable to persist options in SetOptions() -- %s",
|
|
|
|
persist_options_status.ToString().c_str());
|
|
|
|
}
|
2014-10-17 02:22:28 +02:00
|
|
|
} else {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] SetOptions failed", cfd->GetName().c_str());
|
2014-10-17 02:22:28 +02:00
|
|
|
}
|
2015-05-16 00:52:51 +02:00
|
|
|
LogFlush(db_options_.info_log);
|
2014-11-05 01:23:05 +01:00
|
|
|
return s;
|
2014-11-13 22:45:33 +01:00
|
|
|
#endif // ROCKSDB_LITE
|
2014-09-17 21:49:13 +02:00
|
|
|
}
|
|
|
|
|
2013-06-30 08:21:36 +02:00
|
|
|
// return the same level if it cannot be moved
|
2014-10-02 01:19:16 +02:00
|
|
|
int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
|
|
|
|
const MutableCFOptions& mutable_cf_options, int level) {
|
2013-06-30 08:21:36 +02:00
|
|
|
mutex_.AssertHeld();
|
2014-10-31 16:48:19 +01:00
|
|
|
const auto* vstorage = cfd->current()->storage_info();
|
2013-06-30 08:21:36 +02:00
|
|
|
int minimum_level = level;
|
2013-07-22 21:19:46 +02:00
|
|
|
for (int i = level - 1; i > 0; --i) {
|
2013-06-30 08:21:36 +02:00
|
|
|
// stop if level i is not empty
|
2014-10-27 23:49:46 +01:00
|
|
|
if (vstorage->NumLevelFiles(i) > 0) break;
|
2013-06-30 08:21:36 +02:00
|
|
|
// stop if level i is too small (cannot fit the level files)
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
|
2014-02-01 00:30:27 +01:00
|
|
|
break;
|
|
|
|
}
|
2013-06-30 08:21:36 +02:00
|
|
|
|
|
|
|
minimum_level = i;
|
|
|
|
}
|
|
|
|
return minimum_level;
|
|
|
|
}
|
|
|
|
|
2015-10-02 22:17:34 +02:00
|
|
|
// REQUIREMENT: block all background work by calling PauseBackgroundWork()
|
|
|
|
// before calling this function
|
2014-02-01 01:45:20 +01:00
|
|
|
Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
|
|
|
|
assert(level < cfd->NumberLevels());
|
2015-05-28 22:48:12 +02:00
|
|
|
if (target_level >= cfd->NumberLevels()) {
|
|
|
|
return Status::InvalidArgument("Target level exceeds number of levels");
|
|
|
|
}
|
2013-06-30 08:21:36 +02:00
|
|
|
|
2015-10-02 22:17:34 +02:00
|
|
|
std::unique_ptr<SuperVersion> superversion_to_free;
|
|
|
|
std::unique_ptr<SuperVersion> new_superversion(new SuperVersion());
|
|
|
|
|
|
|
|
Status status;
|
2013-12-20 18:57:58 +01:00
|
|
|
|
2015-06-05 20:06:14 +02:00
|
|
|
InstrumentedMutexLock guard_lock(&mutex_);
|
2013-06-30 08:21:36 +02:00
|
|
|
|
|
|
|
// only allow one thread refitting
|
|
|
|
if (refitting_level_) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[ReFitLevel] another thread is refitting");
|
2014-01-22 21:46:24 +01:00
|
|
|
return Status::NotSupported("another thread is refitting");
|
2013-06-30 08:21:36 +02:00
|
|
|
}
|
|
|
|
refitting_level_ = true;
|
|
|
|
|
2015-10-02 22:17:34 +02:00
|
|
|
const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
|
2013-06-30 08:21:36 +02:00
|
|
|
// move to a smaller level
|
2013-09-04 22:13:08 +02:00
|
|
|
int to_level = target_level;
|
|
|
|
if (target_level < 0) {
|
2014-10-02 01:19:16 +02:00
|
|
|
to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
|
2013-09-04 22:13:08 +02:00
|
|
|
}
|
2013-06-30 08:21:36 +02:00
|
|
|
|
2015-05-28 22:48:12 +02:00
|
|
|
auto* vstorage = cfd->current()->storage_info();
|
|
|
|
if (to_level > level) {
|
|
|
|
if (level == 0) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Cannot change from level 0 to other levels.");
|
|
|
|
}
|
|
|
|
// Check levels are empty for a trivial move
|
|
|
|
for (int l = level + 1; l <= to_level; l++) {
|
|
|
|
if (vstorage->NumLevelFiles(l) > 0) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Levels between source and target are not empty for a move.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (to_level != level) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
2015-10-02 22:17:34 +02:00
|
|
|
"[%s] Before refitting:\n%s", cfd->GetName().c_str(),
|
|
|
|
cfd->current()->DebugString().data());
|
2013-06-30 08:21:36 +02:00
|
|
|
|
2014-01-15 00:27:09 +01:00
|
|
|
VersionEdit edit;
|
2014-02-01 01:45:20 +01:00
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
2015-05-28 22:48:12 +02:00
|
|
|
for (const auto& f : vstorage->LevelFiles(level)) {
|
2014-06-14 00:54:19 +02:00
|
|
|
edit.DeleteFile(level, f->fd.GetNumber());
|
2014-07-02 18:54:20 +02:00
|
|
|
edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
|
|
|
|
f->fd.GetFileSize(), f->smallest, f->largest,
|
2015-06-04 21:03:40 +02:00
|
|
|
f->smallest_seqno, f->largest_seqno,
|
|
|
|
f->marked_for_compaction);
|
2013-06-30 08:21:36 +02:00
|
|
|
}
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
2015-10-02 22:17:34 +02:00
|
|
|
"[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
|
|
|
|
edit.DebugString().data());
|
2013-06-30 08:21:36 +02:00
|
|
|
|
2015-01-26 22:59:38 +01:00
|
|
|
status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
|
|
|
|
directories_.GetDbDir());
|
2015-10-02 22:17:34 +02:00
|
|
|
superversion_to_free.reset(InstallSuperVersionAndScheduleWork(
|
|
|
|
cfd, new_superversion.release(), mutable_cf_options));
|
2013-06-30 08:21:36 +02:00
|
|
|
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] LogAndApply: %s\n", cfd->GetName().c_str(),
|
2014-04-25 15:51:16 +02:00
|
|
|
status.ToString().data());
|
2013-06-30 08:21:36 +02:00
|
|
|
|
|
|
|
if (status.ok()) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
2015-10-02 22:17:34 +02:00
|
|
|
"[%s] After refitting:\n%s", cfd->GetName().c_str(),
|
|
|
|
cfd->current()->DebugString().data());
|
2013-06-30 08:21:36 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
refitting_level_ = false;
|
2013-12-20 18:57:58 +01:00
|
|
|
|
2014-01-22 21:46:24 +01:00
|
|
|
return status;
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
return cfh->cfd()->NumberLevels();
|
2012-06-23 04:30:03 +02:00
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
|
2015-07-17 21:02:52 +02:00
|
|
|
return 0;
|
2012-06-23 04:30:03 +02:00
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-10-24 00:37:14 +02:00
|
|
|
return cfh->cfd()->GetSuperVersion()->
|
|
|
|
mutable_cf_options.level0_stop_writes_trigger;
|
2012-06-23 04:30:03 +02:00
|
|
|
}
|
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
Status DBImpl::Flush(const FlushOptions& flush_options,
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
2014-10-17 01:57:59 +02:00
|
|
|
return FlushMemTable(cfh->cfd(), flush_options);
|
2012-07-06 20:42:09 +02:00
|
|
|
}
|
|
|
|
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
Status DBImpl::SyncWAL() {
|
|
|
|
autovector<log::Writer*, 1> logs_to_sync;
|
|
|
|
bool need_log_dir_sync;
|
|
|
|
uint64_t current_log_number;
|
|
|
|
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
assert(!logs_.empty());
|
|
|
|
|
|
|
|
// This SyncWAL() call only cares about logs up to this number.
|
|
|
|
current_log_number = logfile_number_;
|
|
|
|
|
|
|
|
while (logs_.front().number <= current_log_number &&
|
|
|
|
logs_.front().getting_synced) {
|
|
|
|
log_sync_cv_.Wait();
|
|
|
|
}
|
|
|
|
// First check that logs are safe to sync in background.
|
|
|
|
for (auto it = logs_.begin();
|
|
|
|
it != logs_.end() && it->number <= current_log_number; ++it) {
|
|
|
|
if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"SyncWAL() is not supported for this implementation of WAL file",
|
|
|
|
db_options_.allow_mmap_writes
|
|
|
|
? "try setting Options::allow_mmap_writes to false"
|
|
|
|
: Slice());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (auto it = logs_.begin();
|
|
|
|
it != logs_.end() && it->number <= current_log_number; ++it) {
|
|
|
|
auto& log = *it;
|
|
|
|
assert(!log.getting_synced);
|
|
|
|
log.getting_synced = true;
|
2015-08-06 05:51:27 +02:00
|
|
|
logs_to_sync.push_back(log.writer);
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
need_log_dir_sync = !log_dir_synced_;
|
|
|
|
}
|
|
|
|
|
2015-09-14 23:32:30 +02:00
|
|
|
RecordTick(stats_, WAL_FILE_SYNCED);
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
Status status;
|
|
|
|
for (log::Writer* log : logs_to_sync) {
|
|
|
|
status = log->file()->SyncWithoutFlush(db_options_.use_fsync);
|
|
|
|
if (!status.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (status.ok() && need_log_dir_sync) {
|
|
|
|
status = directories_.GetWalDir()->Fsync();
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
MarkLogsSynced(current_log_number, need_log_dir_sync, status);
|
|
|
|
}
|
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::MarkLogsSynced(
|
|
|
|
uint64_t up_to, bool synced_dir, const Status& status) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
if (synced_dir &&
|
|
|
|
logfile_number_ == up_to &&
|
|
|
|
status.ok()) {
|
|
|
|
log_dir_synced_ = true;
|
|
|
|
}
|
|
|
|
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
|
|
|
|
auto& log = *it;
|
|
|
|
assert(log.getting_synced);
|
|
|
|
if (status.ok() && logs_.size() > 1) {
|
2015-08-06 05:51:27 +02:00
|
|
|
logs_to_free_.push_back(log.ReleaseWriter());
|
Fix deadlock in WAL sync
Summary:
MarkLogsSynced() was doing `logs_.erase(it++);`. The standard is saying:
```
all iterators and references are invalidated, unless the erased members are at an end (front or back) of the deque (in which case only iterators and references to the erased members are invalidated)
```
Because `it` is an iterator to the first element of the container, it is
invalidated, only one iteration is executed and `log.getting_synced = false;`
is not being done, so `while (logs_.front().getting_synced)` in `WriteImpl()`
is not terminating.
Test Plan: make db_bench && ./db_bench --benchmarks=fillsync
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, yhchiang, sdong, tnovak
Reviewed By: tnovak
Subscribers: kolmike, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45807
2015-08-29 03:06:32 +02:00
|
|
|
it = logs_.erase(it);
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
} else {
|
|
|
|
log.getting_synced = false;
|
|
|
|
++it;
|
|
|
|
}
|
|
|
|
}
|
Fix deadlock in WAL sync
Summary:
MarkLogsSynced() was doing `logs_.erase(it++);`. The standard is saying:
```
all iterators and references are invalidated, unless the erased members are at an end (front or back) of the deque (in which case only iterators and references to the erased members are invalidated)
```
Because `it` is an iterator to the first element of the container, it is
invalidated, only one iteration is executed and `log.getting_synced = false;`
is not being done, so `while (logs_.front().getting_synced)` in `WriteImpl()`
is not terminating.
Test Plan: make db_bench && ./db_bench --benchmarks=fillsync
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, yhchiang, sdong, tnovak
Reviewed By: tnovak
Subscribers: kolmike, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45807
2015-08-29 03:06:32 +02:00
|
|
|
assert(logs_.empty() || (logs_.size() == 1 && !logs_[0].getting_synced));
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
log_sync_cv_.SignalAll();
|
|
|
|
}
|
|
|
|
|
2013-10-25 04:09:02 +02:00
|
|
|
SequenceNumber DBImpl::GetLatestSequenceNumber() const {
|
2012-12-11 00:37:00 +01:00
|
|
|
return versions_->LastSequence();
|
|
|
|
}
|
|
|
|
|
2014-02-01 01:45:20 +01:00
|
|
|
Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
|
2014-07-17 02:39:18 +02:00
|
|
|
int output_level, uint32_t output_path_id,
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
2015-06-05 01:51:25 +02:00
|
|
|
const Slice* begin, const Slice* end,
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
bool exclusive, bool disallow_trivial_move) {
|
2015-04-23 01:55:22 +02:00
|
|
|
assert(input_level == ColumnFamilyData::kCompactAllLevels ||
|
|
|
|
input_level >= 0);
|
2011-06-22 04:36:45 +02:00
|
|
|
|
2011-10-06 01:30:28 +02:00
|
|
|
InternalKey begin_storage, end_storage;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
CompactionArg* ca;
|
2011-10-06 01:30:28 +02:00
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
bool scheduled = false;
|
|
|
|
bool manual_conflict = false;
|
2011-06-07 16:40:26 +02:00
|
|
|
ManualCompaction manual;
|
2014-02-01 01:45:20 +01:00
|
|
|
manual.cfd = cfd;
|
2014-01-15 01:19:09 +01:00
|
|
|
manual.input_level = input_level;
|
|
|
|
manual.output_level = output_level;
|
2014-07-17 02:39:18 +02:00
|
|
|
manual.output_path_id = output_path_id;
|
2011-10-06 01:30:28 +02:00
|
|
|
manual.done = false;
|
2012-10-19 23:00:53 +02:00
|
|
|
manual.in_progress = false;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
manual.incomplete = false;
|
|
|
|
manual.exclusive = exclusive;
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
2015-06-05 01:51:25 +02:00
|
|
|
manual.disallow_trivial_move = disallow_trivial_move;
|
2013-08-08 00:20:41 +02:00
|
|
|
// For universal compaction, we enforce every manual compaction to compact
|
|
|
|
// all files.
|
|
|
|
if (begin == nullptr ||
|
2014-09-09 00:04:34 +02:00
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
|
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
|
2013-02-15 20:53:17 +01:00
|
|
|
manual.begin = nullptr;
|
2011-10-06 01:30:28 +02:00
|
|
|
} else {
|
2015-04-24 03:08:37 +02:00
|
|
|
begin_storage.SetMaxPossibleForUserKey(*begin);
|
2011-10-06 01:30:28 +02:00
|
|
|
manual.begin = &begin_storage;
|
|
|
|
}
|
2013-08-08 00:20:41 +02:00
|
|
|
if (end == nullptr ||
|
2014-09-09 00:04:34 +02:00
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
|
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
|
2013-02-15 20:53:17 +01:00
|
|
|
manual.end = nullptr;
|
2011-10-06 01:30:28 +02:00
|
|
|
} else {
|
2015-04-24 03:08:37 +02:00
|
|
|
end_storage.SetMinPossibleForUserKey(*end);
|
2011-10-06 01:30:28 +02:00
|
|
|
manual.end = &end_storage;
|
|
|
|
}
|
|
|
|
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2012-10-19 23:00:53 +02:00
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-22 00:10:39 +01:00
|
|
|
// When a manual compaction arrives, temporarily disable scheduling of
|
|
|
|
// non-manual compactions and wait until the number of scheduled compaction
|
|
|
|
// jobs drops to zero. This is needed to ensure that this manual compaction
|
|
|
|
// can compact any range of keys/files.
|
|
|
|
//
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
// HasPendingManualCompaction() is true when at least one thread is inside
|
2014-01-15 01:19:09 +01:00
|
|
|
// RunManualCompaction(), i.e. during that time no other compaction will
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-22 00:10:39 +01:00
|
|
|
// get scheduled (see MaybeScheduleFlushOrCompaction).
|
|
|
|
//
|
|
|
|
// Note that the following loop doesn't stop more that one thread calling
|
2014-01-15 01:19:09 +01:00
|
|
|
// RunManualCompaction() from getting to the second while loop below.
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-22 00:10:39 +01:00
|
|
|
// However, only one of them will actually schedule compaction, while
|
|
|
|
// others will wait on a condition variable until it completes.
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
AddManualCompaction(&manual);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
|
|
|
|
if (exclusive) {
|
|
|
|
while (bg_compaction_scheduled_ > 0) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] Manual compaction waiting for all other scheduled background "
|
|
|
|
"compactions to finish",
|
|
|
|
cfd->GetName().c_str());
|
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-22 00:10:39 +01:00
|
|
|
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] Manual compaction starting",
|
2014-04-25 15:51:16 +02:00
|
|
|
cfd->GetName().c_str());
|
2012-10-19 23:00:53 +02:00
|
|
|
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
2014-09-12 01:24:16 +02:00
|
|
|
// We don't check bg_error_ here, because if we get the error in compaction,
|
|
|
|
// the compaction will set manual.status to bg_error_ and set manual.done to
|
|
|
|
// true.
|
|
|
|
while (!manual.done) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
assert(HasPendingManualCompaction());
|
|
|
|
manual_conflict = false;
|
2015-12-18 01:59:00 +01:00
|
|
|
if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
scheduled ||
|
|
|
|
((manual.manual_end = &manual.tmp_storage1)&&(
|
|
|
|
(manual.compaction = manual.cfd->CompactRange(
|
|
|
|
*manual.cfd->GetLatestMutableCFOptions(), manual.input_level,
|
|
|
|
manual.output_level, manual.output_path_id, manual.begin,
|
|
|
|
manual.end, &manual.manual_end, &manual_conflict)) ==
|
|
|
|
nullptr) &&
|
|
|
|
manual_conflict)) {
|
|
|
|
// exclusive manual compactions should not see a conflict during
|
|
|
|
// CompactRange
|
|
|
|
assert(!exclusive || !manual_conflict);
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-22 00:10:39 +01:00
|
|
|
// Running either this or some other manual compaction
|
2011-10-06 01:30:28 +02:00
|
|
|
bg_cv_.Wait();
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
if (scheduled && manual.incomplete == true) {
|
|
|
|
assert(!manual.in_progress);
|
|
|
|
scheduled = false;
|
|
|
|
manual.incomplete = false;
|
|
|
|
}
|
|
|
|
} else if (!scheduled) {
|
|
|
|
if (manual.compaction == nullptr) {
|
|
|
|
manual.done = true;
|
|
|
|
bg_cv_.SignalAll();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
ca = new CompactionArg;
|
|
|
|
ca->db = this;
|
|
|
|
ca->m = &manual;
|
|
|
|
manual.incomplete = false;
|
2014-08-28 19:06:28 +02:00
|
|
|
bg_compaction_scheduled_++;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
|
|
|
|
&DBImpl::UnscheduleCallback);
|
|
|
|
scheduled = true;
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
2011-06-07 16:40:26 +02:00
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-22 00:10:39 +01:00
|
|
|
assert(!manual.in_progress);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
assert(HasPendingManualCompaction());
|
|
|
|
RemoveManualCompaction(&manual);
|
2015-12-18 01:59:00 +01:00
|
|
|
bg_cv_.SignalAll();
|
2014-01-22 21:46:24 +01:00
|
|
|
return manual.status;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2015-10-14 02:33:23 +02:00
|
|
|
InternalIterator* DBImpl::NewInternalIterator(
|
|
|
|
Arena* arena, ColumnFamilyHandle* column_family) {
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
if (column_family == nullptr) {
|
|
|
|
cfd = default_cf_handle_->cfd();
|
|
|
|
} else {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
cfd = cfh->cfd();
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_.Lock();
|
|
|
|
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
ReadOptions roptions;
|
|
|
|
return NewInternalIterator(roptions, cfd, super_version, arena);
|
|
|
|
}
|
|
|
|
|
2014-01-31 02:48:42 +01:00
|
|
|
Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
|
2014-10-17 01:57:59 +02:00
|
|
|
const FlushOptions& flush_options) {
|
2014-08-12 21:13:13 +02:00
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
WriteContext context;
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock guard_lock(&mutex_);
|
2014-09-05 21:01:01 +02:00
|
|
|
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty()) {
|
2014-09-05 21:01:01 +02:00
|
|
|
// Nothing to flush
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2015-08-06 01:56:28 +02:00
|
|
|
WriteThread::Writer w;
|
|
|
|
write_thread_.EnterUnbatched(&w, &mutex_);
|
2014-08-12 21:13:13 +02:00
|
|
|
|
2015-06-17 21:37:59 +02:00
|
|
|
// SwitchMemtable() will release and reacquire mutex
|
2014-08-12 21:13:13 +02:00
|
|
|
// during execution
|
2015-06-17 21:37:59 +02:00
|
|
|
s = SwitchMemtable(cfd, &context);
|
2015-08-06 01:56:28 +02:00
|
|
|
write_thread_.ExitUnbatched(&w);
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
|
2014-08-12 21:13:13 +02:00
|
|
|
cfd->imm()->FlushRequested();
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// schedule flush
|
|
|
|
SchedulePendingFlush(cfd);
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
2014-08-12 21:13:13 +02:00
|
|
|
}
|
2014-08-12 07:10:32 +02:00
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
if (s.ok() && flush_options.wait) {
|
2011-04-12 21:38:58 +02:00
|
|
|
// Wait until the compaction completes
|
2014-01-31 02:48:42 +01:00
|
|
|
s = WaitForFlushMemTable(cfd);
|
2011-04-12 21:38:58 +02:00
|
|
|
}
|
|
|
|
return s;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-01-31 02:48:42 +01:00
|
|
|
Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
|
2012-08-15 00:20:36 +02:00
|
|
|
Status s;
|
|
|
|
// Wait until the compaction completes
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
while (cfd->imm()->NumNotFlushed() > 0 && bg_error_.ok()) {
|
2015-06-25 23:43:25 +02:00
|
|
|
if (shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
return Status::ShutdownInProgress();
|
|
|
|
}
|
2012-08-15 00:20:36 +02:00
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
2014-01-31 02:48:42 +01:00
|
|
|
if (!bg_error_.ok()) {
|
2012-08-15 00:20:36 +02:00
|
|
|
s = bg_error_;
|
|
|
|
}
|
|
|
|
return s;
|
2012-07-06 20:42:09 +02:00
|
|
|
}
|
|
|
|
|
2015-11-20 10:07:24 +01:00
|
|
|
Status DBImpl::EnableAutoCompaction(
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family_handles) {
|
|
|
|
Status s;
|
|
|
|
for (auto cf_ptr : column_family_handles) {
|
add call to install superversion and schedule work in enableautocompactions
Summary:
This patch fixes https://github.com/facebook/mysql-5.6/issues/121
There is a recent change in rocksdb to disable auto compactions on startup: https://reviews.facebook.net/D51147. However, there is a small timing window where a column family needs to be compacted and schedules a compaction, but the scheduled compaction fails when it checks the disable_auto_compactions setting. The expectation is once the application is ready, it will call EnableAutoCompactions() to allow new compactions to go through. However, if the Column family is stalled because L0 is full, and no writes can go through, it is possible the column family may never have a new compaction request get scheduled. EnableAutoCompaction() should probably schedule an new flush and compaction event when it resets disable_auto_compaction.
Using InstallSuperVersionAndScheduleWork, we call SchedulePendingFlush,
SchedulePendingCompaction, as well as MaybeScheduleFlushOrcompaction on all the
column families to avoid the situation above.
This is still a first pass for feedback.
Could also just call SchedePendingFlush and SchedulePendingCompaction directly.
Test Plan:
Run on Asan build
cd _build-5.6-ASan/ && ./mysql-test/mtr --mem --big --testcase-timeout=36000 --suite-timeout=12000 --parallel=16 --suite=rocksdb,rocksdb_rpl,rocksdb_sys_vars --mysqld=--default-storage-engine=rocksdb --mysqld=--skip-innodb --mysqld=--default-tmp-storage-engine=MyISAM --mysqld=--rocksdb rocksdb_rpl.rpl_rocksdb_stress_crash --repeat=1000
Ensure that it no longer hangs during the test.
Reviewers: hermanlee4, yhchiang, anthony
Reviewed By: anthony
Subscribers: leveldb, yhchiang, dhruba
Differential Revision: https://reviews.facebook.net/D51747
2015-12-09 09:29:35 +01:00
|
|
|
Status status =
|
|
|
|
this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
|
|
|
|
if (status.ok()) {
|
|
|
|
ColumnFamilyData* cfd =
|
|
|
|
reinterpret_cast<ColumnFamilyHandleImpl*>(cf_ptr)->cfd();
|
|
|
|
InstrumentedMutexLock guard_lock(&mutex_);
|
|
|
|
delete this->InstallSuperVersionAndScheduleWork(
|
|
|
|
cfd, nullptr, *cfd->GetLatestMutableCFOptions());
|
|
|
|
} else {
|
|
|
|
s = status;
|
2015-11-20 10:07:24 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2013-10-15 00:12:15 +02:00
|
|
|
void DBImpl::MaybeScheduleFlushOrCompaction() {
|
2011-03-18 23:37:00 +01:00
|
|
|
mutex_.AssertHeld();
|
2015-07-16 04:58:28 +02:00
|
|
|
if (!opened_successfully_) {
|
|
|
|
// Compaction may introduce data race to DB open
|
|
|
|
return;
|
|
|
|
}
|
2015-10-02 22:17:34 +02:00
|
|
|
if (bg_work_paused_ > 0) {
|
|
|
|
// we paused the background work
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
return;
|
2014-10-27 22:50:21 +01:00
|
|
|
} else if (shutting_down_.load(std::memory_order_acquire)) {
|
2011-03-18 23:37:00 +01:00
|
|
|
// DB is being deleted; no more background compactions
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
return;
|
|
|
|
}
|
2013-09-13 23:38:37 +02:00
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
while (unscheduled_flushes_ > 0 &&
|
|
|
|
bg_flush_scheduled_ < db_options_.max_background_flushes) {
|
|
|
|
unscheduled_flushes_--;
|
|
|
|
bg_flush_scheduled_++;
|
2015-03-17 02:49:14 +01:00
|
|
|
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this);
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
}
|
|
|
|
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
2016-01-28 20:56:16 +01:00
|
|
|
auto bg_compactions_allowed = BGCompactionsAllowed();
|
|
|
|
|
2015-07-17 21:02:52 +02:00
|
|
|
// special case -- if max_background_flushes == 0, then schedule flush on a
|
|
|
|
// compaction thread
|
|
|
|
if (db_options_.max_background_flushes == 0) {
|
|
|
|
while (unscheduled_flushes_ > 0 &&
|
|
|
|
bg_flush_scheduled_ + bg_compaction_scheduled_ <
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
2016-01-28 20:56:16 +01:00
|
|
|
bg_compactions_allowed) {
|
2015-07-17 21:02:52 +02:00
|
|
|
unscheduled_flushes_--;
|
|
|
|
bg_flush_scheduled_++;
|
|
|
|
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
if (HasExclusiveManualCompaction()) {
|
2015-05-19 00:34:33 +02:00
|
|
|
// only manual compactions are allowed to run. don't schedule automatic
|
|
|
|
// compactions
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
2016-01-28 20:56:16 +01:00
|
|
|
while (bg_compaction_scheduled_ < bg_compactions_allowed &&
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
unscheduled_compactions_ > 0) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
CompactionArg* ca = new CompactionArg;
|
|
|
|
ca->db = this;
|
|
|
|
ca->m = nullptr;
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
bg_compaction_scheduled_++;
|
|
|
|
unscheduled_compactions_--;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
|
|
|
|
&DBImpl::UnscheduleCallback);
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
2016-01-28 20:56:16 +01:00
|
|
|
int DBImpl::BGCompactionsAllowed() const {
|
|
|
|
if (write_controller_.NeedSpeedupCompaction()) {
|
|
|
|
return db_options_.max_background_compactions;
|
|
|
|
} else {
|
|
|
|
return db_options_.base_background_compactions;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
|
|
|
|
assert(!cfd->pending_compaction());
|
|
|
|
cfd->Ref();
|
|
|
|
compaction_queue_.push_back(cfd);
|
|
|
|
cfd->set_pending_compaction(true);
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
|
|
|
|
assert(!compaction_queue_.empty());
|
|
|
|
auto cfd = *compaction_queue_.begin();
|
|
|
|
compaction_queue_.pop_front();
|
|
|
|
assert(cfd->pending_compaction());
|
|
|
|
cfd->set_pending_compaction(false);
|
|
|
|
return cfd;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) {
|
|
|
|
assert(!cfd->pending_flush());
|
|
|
|
cfd->Ref();
|
|
|
|
flush_queue_.push_back(cfd);
|
|
|
|
cfd->set_pending_flush(true);
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() {
|
|
|
|
assert(!flush_queue_.empty());
|
|
|
|
auto cfd = *flush_queue_.begin();
|
|
|
|
flush_queue_.pop_front();
|
|
|
|
assert(cfd->pending_flush());
|
|
|
|
cfd->set_pending_flush(false);
|
|
|
|
return cfd;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) {
|
|
|
|
if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()) {
|
|
|
|
AddToFlushQueue(cfd);
|
|
|
|
++unscheduled_flushes_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
|
|
|
|
if (!cfd->pending_compaction() && cfd->NeedsCompaction()) {
|
|
|
|
AddToCompactionQueue(cfd);
|
|
|
|
++unscheduled_compactions_;
|
2013-09-13 23:38:37 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-04 01:28:03 +02:00
|
|
|
void DBImpl::RecordFlushIOStats() {
|
2014-08-28 18:46:05 +02:00
|
|
|
RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
|
2014-07-04 01:28:03 +02:00
|
|
|
IOSTATS_RESET(bytes_written);
|
|
|
|
}
|
|
|
|
|
2013-09-13 23:38:37 +02:00
|
|
|
void DBImpl::BGWorkFlush(void* db) {
|
2014-07-04 01:28:03 +02:00
|
|
|
IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
|
2015-07-02 23:27:00 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::BGWorkFlush");
|
2013-09-13 23:38:37 +02:00
|
|
|
reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush();
|
2015-07-02 23:27:00 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
|
2013-09-13 23:38:37 +02:00
|
|
|
}
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
void DBImpl::BGWorkCompaction(void* arg) {
|
|
|
|
CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
|
|
|
|
delete reinterpret_cast<CompactionArg*>(arg);
|
2014-07-04 01:28:03 +02:00
|
|
|
IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
|
2015-03-23 23:05:58 +01:00
|
|
|
TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
reinterpret_cast<DBImpl*>(ca.db)->BackgroundCallCompaction(ca.m);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::UnscheduleCallback(void* arg) {
|
|
|
|
CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
|
|
|
|
delete reinterpret_cast<CompactionArg*>(arg);
|
|
|
|
if ((ca.m != nullptr) && (ca.m->compaction != nullptr)) {
|
|
|
|
delete ca.m->compaction;
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT("DBImpl::UnscheduleCallback");
|
2013-09-13 23:38:37 +02:00
|
|
|
}
|
|
|
|
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
|
2014-03-10 06:01:13 +01:00
|
|
|
LogBuffer* log_buffer) {
|
2014-02-11 02:04:44 +01:00
|
|
|
mutex_.AssertHeld();
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
2014-09-12 01:24:16 +02:00
|
|
|
|
2015-03-11 18:31:02 +01:00
|
|
|
Status status = bg_error_;
|
|
|
|
if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
status = Status::ShutdownInProgress();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
2014-09-12 01:24:16 +02:00
|
|
|
}
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
ColumnFamilyData* cfd = nullptr;
|
|
|
|
while (!flush_queue_.empty()) {
|
|
|
|
// This cfd is already referenced
|
2014-12-21 09:23:28 +01:00
|
|
|
auto first_cfd = PopFirstFromFlushQueue();
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
|
2014-12-21 09:23:28 +01:00
|
|
|
if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) {
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// can't flush this CF, try next one
|
2014-12-21 09:23:28 +01:00
|
|
|
if (first_cfd->Unref()) {
|
|
|
|
delete first_cfd;
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
}
|
|
|
|
continue;
|
2014-02-11 02:04:44 +01:00
|
|
|
}
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
|
|
|
|
// found a flush!
|
2014-12-21 09:23:28 +01:00
|
|
|
cfd = first_cfd;
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cfd != nullptr) {
|
|
|
|
const MutableCFOptions mutable_cf_options =
|
|
|
|
*cfd->GetLatestMutableCFOptions();
|
|
|
|
LogToBuffer(
|
|
|
|
log_buffer,
|
|
|
|
"Calling FlushMemTableToOutputFile with column "
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
2016-01-28 20:56:16 +01:00
|
|
|
"family [%s], flush slots available %d, compaction slots allowed %d, "
|
|
|
|
"compaction slots scheduled %d",
|
|
|
|
cfd->GetName().c_str(), db_options_.max_background_flushes,
|
|
|
|
bg_flush_scheduled_, BGCompactionsAllowed() - bg_compaction_scheduled_);
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress,
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
job_context, log_buffer);
|
|
|
|
if (cfd->Unref()) {
|
|
|
|
delete cfd;
|
2014-01-31 02:48:42 +01:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
return status;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2013-09-13 23:38:37 +02:00
|
|
|
void DBImpl::BackgroundCallFlush() {
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
bool made_progress = false;
|
2015-02-12 18:54:48 +01:00
|
|
|
JobContext job_context(next_job_id_.fetch_add(1), true);
|
2013-09-13 23:38:37 +02:00
|
|
|
assert(bg_flush_scheduled_);
|
|
|
|
|
2014-09-05 20:48:17 +02:00
|
|
|
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
|
2014-03-10 06:01:13 +01:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2015-10-17 09:16:36 +02:00
|
|
|
num_running_flushes_++;
|
2014-03-10 06:01:13 +01:00
|
|
|
|
2014-11-07 20:50:34 +01:00
|
|
|
auto pending_outputs_inserted_elem =
|
|
|
|
CaptureCurrentFileNumberInPendingOutputs();
|
|
|
|
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer);
|
2015-03-11 18:31:02 +01:00
|
|
|
if (!s.ok() && !s.IsShutdownInProgress()) {
|
|
|
|
// Wait a little bit before retrying background flush in
|
|
|
|
// case this is an environmental problem and we do not want to
|
|
|
|
// chew up resources for failed flushes for the duration of
|
|
|
|
// the problem.
|
|
|
|
uint64_t error_cnt =
|
|
|
|
default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
|
|
|
|
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
|
|
|
mutex_.Unlock();
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Waiting after background flush error: %s"
|
|
|
|
"Accumulated background error counts: %" PRIu64,
|
|
|
|
s.ToString().c_str(), error_cnt);
|
|
|
|
log_buffer.FlushBufferToLog();
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
env_->SleepForMicroseconds(1000000);
|
|
|
|
mutex_.Lock();
|
2014-03-10 06:01:13 +01:00
|
|
|
}
|
|
|
|
|
2014-11-07 20:50:34 +01:00
|
|
|
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
|
|
|
|
|
2015-03-20 01:04:29 +01:00
|
|
|
// If flush failed, we want to delete all temporary files that we might have
|
|
|
|
// created. Thus, we force full scan in FindObsoleteFiles()
|
|
|
|
FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
|
2014-03-10 06:01:13 +01:00
|
|
|
// delete unnecessary files if any, this is done outside the mutex
|
2014-10-28 19:54:33 +01:00
|
|
|
if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
|
2013-09-13 23:38:37 +02:00
|
|
|
mutex_.Unlock();
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
2014-03-11 18:36:30 +01:00
|
|
|
// Have to flush the info logs before bg_flush_scheduled_--
|
|
|
|
// because if bg_flush_scheduled_ becomes 0 and the lock is
|
|
|
|
// released, the deconstructor of DB can kick in and destroy all the
|
|
|
|
// states of DB so info_log might not be available after that point.
|
|
|
|
// It also applies to access other states that DB owns.
|
2014-03-10 06:01:13 +01:00
|
|
|
log_buffer.FlushBufferToLog();
|
2014-10-28 19:54:33 +01:00
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
2014-03-11 18:36:30 +01:00
|
|
|
}
|
2014-11-15 00:43:10 +01:00
|
|
|
job_context.Clean();
|
2013-09-13 23:38:37 +02:00
|
|
|
mutex_.Lock();
|
|
|
|
}
|
2013-11-09 00:23:46 +01:00
|
|
|
|
2015-10-17 09:16:36 +02:00
|
|
|
assert(num_running_flushes_ > 0);
|
|
|
|
num_running_flushes_--;
|
2014-03-10 06:01:13 +01:00
|
|
|
bg_flush_scheduled_--;
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// See if there's more work to be done
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
2014-08-08 19:24:00 +02:00
|
|
|
RecordFlushIOStats();
|
2014-03-10 06:01:13 +01:00
|
|
|
bg_cv_.SignalAll();
|
2014-03-11 19:27:19 +01:00
|
|
|
// IMPORTANT: there should be no code after calling SignalAll. This call may
|
|
|
|
// signal the DB destructor that it's OK to proceed with destruction. In
|
|
|
|
// that case, all DB variables will be dealloacated and referencing them
|
|
|
|
// will cause trouble.
|
2013-10-16 22:32:53 +02:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
void DBImpl::BackgroundCallCompaction(void* arg) {
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
bool made_progress = false;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
ManualCompaction* m = reinterpret_cast<ManualCompaction*>(arg);
|
2015-02-12 18:54:48 +01:00
|
|
|
JobContext job_context(next_job_id_.fetch_add(1), true);
|
2015-11-13 21:01:00 +01:00
|
|
|
TEST_SYNC_POINT("BackgroundCallCompaction:0");
|
2013-05-24 21:52:45 +02:00
|
|
|
MaybeDumpStats();
|
2014-09-05 20:48:17 +02:00
|
|
|
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
2014-03-04 23:32:55 +01:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2015-10-17 09:16:36 +02:00
|
|
|
num_running_compactions_++;
|
2014-11-07 20:50:34 +01:00
|
|
|
|
|
|
|
auto pending_outputs_inserted_elem =
|
|
|
|
CaptureCurrentFileNumberInPendingOutputs();
|
|
|
|
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
2014-03-04 23:32:55 +01:00
|
|
|
assert(bg_compaction_scheduled_);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
Status s =
|
|
|
|
BackgroundCompaction(&made_progress, &job_context, &log_buffer, m);
|
2015-11-13 21:01:00 +01:00
|
|
|
TEST_SYNC_POINT("BackgroundCallCompaction:1");
|
2015-03-11 18:31:02 +01:00
|
|
|
if (!s.ok() && !s.IsShutdownInProgress()) {
|
|
|
|
// Wait a little bit before retrying background compaction in
|
|
|
|
// case this is an environmental problem and we do not want to
|
|
|
|
// chew up resources for failed compactions for the duration of
|
|
|
|
// the problem.
|
|
|
|
uint64_t error_cnt =
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
|
2015-03-11 18:31:02 +01:00
|
|
|
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
|
|
|
mutex_.Unlock();
|
|
|
|
log_buffer.FlushBufferToLog();
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Waiting after background compaction error: %s, "
|
|
|
|
"Accumulated background error counts: %" PRIu64,
|
|
|
|
s.ToString().c_str(), error_cnt);
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
env_->SleepForMicroseconds(1000000);
|
|
|
|
mutex_.Lock();
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
2014-03-04 23:32:55 +01:00
|
|
|
}
|
2013-05-24 21:52:45 +02:00
|
|
|
|
2014-11-07 20:50:34 +01:00
|
|
|
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
|
|
|
|
|
2015-03-20 01:04:29 +01:00
|
|
|
// If compaction failed, we want to delete all temporary files that we might
|
|
|
|
// have created (they might not be all recorded in job_context in case of a
|
|
|
|
// failure). Thus, we force full scan in FindObsoleteFiles()
|
|
|
|
FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
2014-03-04 23:32:55 +01:00
|
|
|
|
|
|
|
// delete unnecessary files if any, this is done outside the mutex
|
2014-10-28 19:54:33 +01:00
|
|
|
if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
|
2012-08-23 01:57:51 +02:00
|
|
|
mutex_.Unlock();
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
2014-03-11 18:36:30 +01:00
|
|
|
// Have to flush the info logs before bg_compaction_scheduled_--
|
|
|
|
// because if bg_flush_scheduled_ becomes 0 and the lock is
|
|
|
|
// released, the deconstructor of DB can kick in and destroy all the
|
|
|
|
// states of DB so info_log might not be available after that point.
|
|
|
|
// It also applies to access other states that DB owns.
|
2014-03-10 06:01:13 +01:00
|
|
|
log_buffer.FlushBufferToLog();
|
2014-10-28 19:54:33 +01:00
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
2014-03-11 18:36:30 +01:00
|
|
|
}
|
2014-11-15 00:43:10 +01:00
|
|
|
job_context.Clean();
|
2012-08-23 01:57:51 +02:00
|
|
|
mutex_.Lock();
|
|
|
|
}
|
2012-10-21 10:49:48 +02:00
|
|
|
|
2015-10-17 09:16:36 +02:00
|
|
|
assert(num_running_compactions_ > 0);
|
|
|
|
num_running_compactions_--;
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
2014-03-04 23:32:55 +01:00
|
|
|
bg_compaction_scheduled_--;
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-04-07 23:21:25 +02:00
|
|
|
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// See if there's more work to be done
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
if (made_progress || bg_compaction_scheduled_ == 0 ||
|
|
|
|
HasPendingManualCompaction()) {
|
2014-06-03 02:23:55 +02:00
|
|
|
// signal if
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
// * made_progress -- need to wakeup DelayWrite
|
2014-06-03 02:23:55 +02:00
|
|
|
// * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
// * HasPendingManualCompaction -- need to wakeup RunManualCompaction
|
2014-06-03 02:23:55 +02:00
|
|
|
// If none of this is true, there is no need to signal since nobody is
|
|
|
|
// waiting for it
|
|
|
|
bg_cv_.SignalAll();
|
|
|
|
}
|
2014-03-11 19:27:19 +01:00
|
|
|
// IMPORTANT: there should be no code after calling SignalAll. This call may
|
|
|
|
// signal the DB destructor that it's OK to proceed with destruction. In
|
|
|
|
// that case, all DB variables will be dealloacated and referencing them
|
|
|
|
// will cause trouble.
|
2012-10-19 23:00:53 +02:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
Status DBImpl::BackgroundCompaction(bool* made_progress,
|
|
|
|
JobContext* job_context,
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
LogBuffer* log_buffer, void* arg) {
|
|
|
|
ManualCompaction* manual_compaction =
|
|
|
|
reinterpret_cast<ManualCompaction*>(arg);
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
*made_progress = false;
|
2011-03-18 23:37:00 +01:00
|
|
|
mutex_.AssertHeld();
|
2011-04-12 21:38:58 +02:00
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
bool is_manual = (manual_compaction != nullptr);
|
|
|
|
|
|
|
|
// (manual_compaction->in_progress == false);
|
|
|
|
bool trivial_move_disallowed =
|
|
|
|
is_manual && manual_compaction->disallow_trivial_move;
|
2014-04-07 22:46:51 +02:00
|
|
|
|
2015-06-03 02:07:16 +02:00
|
|
|
CompactionJobStats compaction_job_stats;
|
2015-03-11 18:31:02 +01:00
|
|
|
Status status = bg_error_;
|
|
|
|
if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
status = Status::ShutdownInProgress();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!status.ok()) {
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
2014-09-12 01:24:16 +02:00
|
|
|
if (is_manual) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
manual_compaction->status = status;
|
|
|
|
manual_compaction->done = true;
|
|
|
|
manual_compaction->in_progress = false;
|
|
|
|
delete manual_compaction->compaction;
|
|
|
|
manual_compaction = nullptr;
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
2014-09-12 01:24:16 +02:00
|
|
|
}
|
2015-03-11 18:31:02 +01:00
|
|
|
return status;
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
2014-09-12 01:24:16 +02:00
|
|
|
}
|
|
|
|
|
2014-04-07 22:46:51 +02:00
|
|
|
if (is_manual) {
|
|
|
|
// another thread cannot pick up the same work
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
manual_compaction->in_progress = true;
|
2014-04-07 22:46:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
unique_ptr<Compaction> c;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
// InternalKey manual_end_storage;
|
|
|
|
// InternalKey* manual_end = &manual_end_storage;
|
2011-06-07 16:40:26 +02:00
|
|
|
if (is_manual) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
ManualCompaction* m = manual_compaction;
|
2014-04-07 22:46:51 +02:00
|
|
|
assert(m->in_progress);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
c.reset(std::move(m->compaction));
|
2014-01-15 01:19:09 +01:00
|
|
|
if (!c) {
|
2013-01-20 11:07:13 +01:00
|
|
|
m->done = true;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
m->manual_end = nullptr;
|
2015-04-15 06:45:20 +02:00
|
|
|
LogToBuffer(log_buffer,
|
|
|
|
"[%s] Manual compaction from level-%d from %s .. "
|
|
|
|
"%s; nothing to do\n",
|
|
|
|
m->cfd->GetName().c_str(), m->input_level,
|
|
|
|
(m->begin ? m->begin->DebugString().c_str() : "(begin)"),
|
|
|
|
(m->end ? m->end->DebugString().c_str() : "(end)"));
|
|
|
|
} else {
|
|
|
|
LogToBuffer(log_buffer,
|
|
|
|
"[%s] Manual compaction from level-%d to level-%d from %s .. "
|
|
|
|
"%s; will stop at %s\n",
|
|
|
|
m->cfd->GetName().c_str(), m->input_level, c->output_level(),
|
|
|
|
(m->begin ? m->begin->DebugString().c_str() : "(begin)"),
|
|
|
|
(m->end ? m->end->DebugString().c_str() : "(end)"),
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
((m->done || m->manual_end == nullptr)
|
2015-04-15 06:45:20 +02:00
|
|
|
? "(end)"
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
: m->manual_end->DebugString().c_str()));
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
} else if (!compaction_queue_.empty()) {
|
|
|
|
// cfd is referenced here
|
|
|
|
auto cfd = PopFirstFromCompactionQueue();
|
|
|
|
// We unreference here because the following code will take a Ref() on
|
|
|
|
// this cfd if it is going to use it (Compaction class holds a
|
|
|
|
// reference).
|
|
|
|
// This will all happen under a mutex so we don't have to be afraid of
|
|
|
|
// somebody else deleting it.
|
|
|
|
if (cfd->Unref()) {
|
|
|
|
delete cfd;
|
|
|
|
// This was the last reference of the column family, so no need to
|
|
|
|
// compact.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
if (HaveManualCompaction(cfd)) {
|
|
|
|
// Can't compact right now, but try again later
|
|
|
|
TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// Pick up latest mutable CF Options and use it throughout the
|
|
|
|
// compaction job
|
|
|
|
// Compaction makes a copy of the latest MutableCFOptions. It should be used
|
|
|
|
// throughout the compaction procedure to make sure consistency. It will
|
|
|
|
// eventually be installed into SuperVersion
|
|
|
|
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
|
|
|
if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
|
|
|
|
// NOTE: try to avoid unnecessary copy of MutableCFOptions if
|
|
|
|
// compaction is not necessary. Need to make sure mutex is held
|
|
|
|
// until we make a copy in the following code
|
|
|
|
c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer));
|
|
|
|
if (c != nullptr) {
|
|
|
|
// update statistics
|
|
|
|
MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION,
|
|
|
|
c->inputs(0)->size());
|
|
|
|
// There are three things that can change compaction score:
|
|
|
|
// 1) When flush or compaction finish. This case is covered by
|
2015-06-17 21:37:59 +02:00
|
|
|
// InstallSuperVersionAndScheduleWork
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// 2) When MutableCFOptions changes. This case is also covered by
|
2015-06-17 21:37:59 +02:00
|
|
|
// InstallSuperVersionAndScheduleWork, because this is when the new
|
|
|
|
// options take effect.
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// 3) When we Pick a new compaction, we "remove" those files being
|
|
|
|
// compacted from the calculation, which then influences compaction
|
|
|
|
// score. Here we check if we need the new compaction even without the
|
|
|
|
// files that are currently being compacted. If we need another
|
|
|
|
// compaction, we might be able to execute it in parallel, so we add it
|
|
|
|
// to the queue and schedule a new thread.
|
|
|
|
if (cfd->NeedsCompaction()) {
|
|
|
|
// Yes, we need more compactions!
|
|
|
|
AddToCompactionQueue(cfd);
|
|
|
|
++unscheduled_compactions_;
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
2014-02-05 22:12:23 +01:00
|
|
|
}
|
2014-02-01 01:45:20 +01:00
|
|
|
}
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2013-01-20 11:07:13 +01:00
|
|
|
if (!c) {
|
2011-06-07 16:40:26 +02:00
|
|
|
// Nothing to do
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
2014-03-04 23:32:55 +01:00
|
|
|
LogToBuffer(log_buffer, "Compaction nothing to do");
|
2015-07-09 00:21:10 +02:00
|
|
|
} else if (c->deletion_compaction()) {
|
2014-05-21 20:43:35 +02:00
|
|
|
// TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
|
|
|
|
// file if there is alive snapshot pointing to it
|
|
|
|
assert(c->num_input_files(1) == 0);
|
|
|
|
assert(c->level() == 0);
|
2014-09-09 00:04:34 +02:00
|
|
|
assert(c->column_family_data()->ioptions()->compaction_style ==
|
2014-05-21 20:43:35 +02:00
|
|
|
kCompactionStyleFIFO);
|
2015-06-03 02:07:16 +02:00
|
|
|
|
|
|
|
compaction_job_stats.num_input_files = c->num_input_files(0);
|
|
|
|
|
2014-05-21 20:43:35 +02:00
|
|
|
for (const auto& f : *c->inputs(0)) {
|
2014-06-14 00:54:19 +02:00
|
|
|
c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
|
2014-05-21 20:43:35 +02:00
|
|
|
}
|
2015-01-26 22:59:38 +01:00
|
|
|
status = versions_->LogAndApply(c->column_family_data(),
|
|
|
|
*c->mutable_cf_options(), c->edit(),
|
|
|
|
&mutex_, directories_.GetDbDir());
|
2015-06-17 21:37:59 +02:00
|
|
|
InstallSuperVersionAndScheduleWorkWrapper(
|
|
|
|
c->column_family_data(), job_context, *c->mutable_cf_options());
|
2014-05-21 20:43:35 +02:00
|
|
|
LogToBuffer(log_buffer, "[%s] Deleted %d files\n",
|
|
|
|
c->column_family_data()->GetName().c_str(),
|
|
|
|
c->num_input_files(0));
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
*made_progress = true;
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
2015-06-05 01:51:25 +02:00
|
|
|
} else if (!trivial_move_disallowed && c->IsTrivialMove()) {
|
2015-04-02 20:06:30 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
|
2015-01-13 09:04:08 +01:00
|
|
|
// Instrument for event update
|
|
|
|
// TODO(yhchiang): add op details for showing trivial-move.
|
2016-01-26 01:26:53 +01:00
|
|
|
ThreadStatusUtil::SetColumnFamily(
|
|
|
|
c->column_family_data(), c->column_family_data()->ioptions()->env,
|
|
|
|
c->column_family_data()->options()->enable_thread_tracking);
|
2015-01-13 09:04:08 +01:00
|
|
|
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
|
|
|
|
|
2015-06-03 02:07:16 +02:00
|
|
|
compaction_job_stats.num_input_files = c->num_input_files(0);
|
|
|
|
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
2015-06-05 01:51:25 +02:00
|
|
|
// Move files to next level
|
|
|
|
int32_t moved_files = 0;
|
|
|
|
int64_t moved_bytes = 0;
|
2015-07-07 23:18:55 +02:00
|
|
|
for (unsigned int l = 0; l < c->num_input_levels(); l++) {
|
2015-07-27 23:25:57 +02:00
|
|
|
if (c->level(l) == c->output_level()) {
|
2015-07-07 23:18:55 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < c->num_input_files(l); i++) {
|
|
|
|
FileMetaData* f = c->input(l, i);
|
2015-07-15 21:28:22 +02:00
|
|
|
c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
|
2015-07-07 23:18:55 +02:00
|
|
|
c->edit()->AddFile(c->output_level(), f->fd.GetNumber(),
|
|
|
|
f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest,
|
|
|
|
f->largest, f->smallest_seqno, f->largest_seqno,
|
|
|
|
f->marked_for_compaction);
|
|
|
|
|
|
|
|
LogToBuffer(log_buffer,
|
|
|
|
"[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
|
|
|
|
c->column_family_data()->GetName().c_str(),
|
|
|
|
f->fd.GetNumber(), c->output_level(), f->fd.GetFileSize());
|
|
|
|
++moved_files;
|
|
|
|
moved_bytes += f->fd.GetFileSize();
|
|
|
|
}
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
2015-06-05 01:51:25 +02:00
|
|
|
}
|
2015-07-07 23:18:55 +02:00
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
status = versions_->LogAndApply(c->column_family_data(),
|
2015-01-26 22:59:38 +01:00
|
|
|
*c->mutable_cf_options(), c->edit(),
|
|
|
|
&mutex_, directories_.GetDbDir());
|
2014-09-17 21:49:13 +02:00
|
|
|
// Use latest MutableCFOptions
|
2015-06-17 21:37:59 +02:00
|
|
|
InstallSuperVersionAndScheduleWorkWrapper(
|
|
|
|
c->column_family_data(), job_context, *c->mutable_cf_options());
|
2014-01-24 23:30:28 +01:00
|
|
|
|
2014-10-27 23:49:46 +01:00
|
|
|
VersionStorageInfo::LevelSummaryStorage tmp;
|
2015-06-23 00:20:30 +02:00
|
|
|
c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
2015-06-05 01:51:25 +02:00
|
|
|
moved_bytes);
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
{
|
|
|
|
event_logger_.LogToBuffer(log_buffer)
|
|
|
|
<< "job" << job_context->job_id << "event"
|
|
|
|
<< "trivial_move"
|
2015-06-23 00:20:30 +02:00
|
|
|
<< "destination_level" << c->output_level() << "files" << moved_files
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
2015-06-05 01:51:25 +02:00
|
|
|
<< "total_files_size" << moved_bytes;
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
}
|
2015-01-06 02:32:49 +01:00
|
|
|
LogToBuffer(
|
|
|
|
log_buffer,
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
2015-06-05 01:51:25 +02:00
|
|
|
"[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
|
2015-06-23 00:20:30 +02:00
|
|
|
c->column_family_data()->GetName().c_str(), moved_files,
|
|
|
|
c->output_level(), moved_bytes, status.ToString().c_str(),
|
2015-01-06 02:32:49 +01:00
|
|
|
c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
*made_progress = true;
|
2015-01-13 09:04:08 +01:00
|
|
|
|
|
|
|
// Clear Instrument
|
|
|
|
ThreadStatusUtil::ResetThreadStatus();
|
2011-03-18 23:37:00 +01:00
|
|
|
} else {
|
2015-07-27 23:25:57 +02:00
|
|
|
int output_level __attribute__((unused)) = c->output_level();
|
|
|
|
TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
|
|
|
|
&output_level);
|
2015-12-08 21:25:48 +01:00
|
|
|
|
|
|
|
SequenceNumber earliest_write_conflict_snapshot;
|
|
|
|
std::vector<SequenceNumber> snapshot_seqs =
|
|
|
|
snapshots_.GetAll(&earliest_write_conflict_snapshot);
|
|
|
|
|
2015-05-06 04:01:12 +02:00
|
|
|
assert(is_snapshot_supported_ || snapshots_.empty());
|
2015-01-26 22:59:38 +01:00
|
|
|
CompactionJob compaction_job(
|
2015-05-06 04:01:12 +02:00
|
|
|
job_context->job_id, c.get(), db_options_, env_options_,
|
|
|
|
versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(),
|
2015-12-08 21:25:48 +01:00
|
|
|
directories_.GetDataDir(c->output_path_id()), stats_, snapshot_seqs,
|
|
|
|
earliest_write_conflict_snapshot, table_cache_, &event_logger_,
|
Add options.compaction_measure_io_stats to print write I/O stats in compactions
Summary:
Add options.compaction_measure_io_stats to print out / pass to listener accumulated time spent on write calls. Example outputs in info logs:
2015/08/12-16:27:59.463944 7fd428bff700 (Original Log Time 2015/08/12-16:27:59.463922) EVENT_LOG_v1 {"time_micros": 1439422079463897, "job": 6, "event": "compaction_finished", "output_level": 1, "num_output_files": 4, "total_output_size": 6900525, "num_input_records": 111483, "num_output_records": 106877, "file_write_nanos": 15663206, "file_range_sync_nanos": 649588, "file_fsync_nanos": 349614797, "file_prepare_write_nanos": 1505812, "lsm_state": [2, 4, 0, 0, 0, 0, 0]}
Add two more counters in iostats_context.
Also add a parameter of db_bench.
Test Plan: Add a unit test. Also manually verify LOG outputs in db_bench
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44115
2015-08-13 02:24:45 +02:00
|
|
|
c->mutable_cf_options()->paranoid_file_checks,
|
|
|
|
c->mutable_cf_options()->compaction_measure_io_stats, dbname_,
|
2015-07-13 21:11:05 +02:00
|
|
|
&compaction_job_stats);
|
2014-11-01 00:31:25 +01:00
|
|
|
compaction_job.Prepare();
|
2015-06-02 23:12:23 +02:00
|
|
|
|
2014-11-01 00:31:25 +01:00
|
|
|
mutex_.Unlock();
|
2015-08-18 20:06:23 +02:00
|
|
|
compaction_job.Run();
|
2015-07-16 04:58:28 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
|
2014-11-01 00:31:25 +01:00
|
|
|
mutex_.Lock();
|
2015-06-02 23:12:23 +02:00
|
|
|
|
2015-08-18 20:06:23 +02:00
|
|
|
status = compaction_job.Install(*c->mutable_cf_options(), &mutex_);
|
2014-11-01 00:31:25 +01:00
|
|
|
if (status.ok()) {
|
2015-06-17 21:37:59 +02:00
|
|
|
InstallSuperVersionAndScheduleWorkWrapper(
|
|
|
|
c->column_family_data(), job_context, *c->mutable_cf_options());
|
2014-11-01 00:31:25 +01:00
|
|
|
}
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
*made_progress = true;
|
2015-01-27 23:44:02 +01:00
|
|
|
}
|
|
|
|
if (c != nullptr) {
|
2015-06-03 02:07:16 +02:00
|
|
|
NotifyOnCompactionCompleted(
|
|
|
|
c->column_family_data(), c.get(), status,
|
|
|
|
compaction_job_stats, job_context->job_id);
|
2014-02-01 01:45:20 +01:00
|
|
|
c->ReleaseCompactionFiles(status);
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
*made_progress = true;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2015-01-15 21:44:19 +01:00
|
|
|
// this will unref its input_version and column_family_data
|
2013-01-20 11:07:13 +01:00
|
|
|
c.reset();
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
// Done
|
2014-09-02 21:25:58 +02:00
|
|
|
} else if (status.IsShutdownInProgress()) {
|
2011-03-18 23:37:00 +01:00
|
|
|
// Ignore compaction errors found during shutting down
|
|
|
|
} else {
|
2014-09-05 20:48:17 +02:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s",
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
2014-03-04 23:32:55 +01:00
|
|
|
status.ToString().c_str());
|
2014-09-05 20:48:17 +02:00
|
|
|
if (db_options_.paranoid_checks && bg_error_.ok()) {
|
2011-03-18 23:37:00 +01:00
|
|
|
bg_error_ = status;
|
|
|
|
}
|
|
|
|
}
|
2011-06-07 16:40:26 +02:00
|
|
|
|
|
|
|
if (is_manual) {
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
ManualCompaction* m = manual_compaction;
|
2012-01-25 23:56:52 +01:00
|
|
|
if (!status.ok()) {
|
2014-01-22 21:46:24 +01:00
|
|
|
m->status = status;
|
2012-01-25 23:56:52 +01:00
|
|
|
m->done = true;
|
|
|
|
}
|
2013-08-02 20:46:47 +02:00
|
|
|
// For universal compaction:
|
|
|
|
// Because universal compaction always happens at level 0, so one
|
|
|
|
// compaction will pick up all overlapped files. No files will be
|
|
|
|
// filtered out due to size limit and left for a successive compaction.
|
|
|
|
// So we can safely conclude the current compaction.
|
|
|
|
//
|
|
|
|
// Also note that, if we don't stop here, then the current compaction
|
|
|
|
// writes a new file back to level 0, which will be used in successive
|
|
|
|
// compaction. Hence the manual compaction will never finish.
|
2014-01-15 01:19:09 +01:00
|
|
|
//
|
|
|
|
// Stop the compaction if manual_end points to nullptr -- this means
|
|
|
|
// that we compacted the whole range. manual_end should always point
|
|
|
|
// to nullptr in case of universal compaction
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
if (m->manual_end == nullptr) {
|
2013-08-02 20:46:47 +02:00
|
|
|
m->done = true;
|
|
|
|
}
|
2011-10-06 01:30:28 +02:00
|
|
|
if (!m->done) {
|
|
|
|
// We only compacted part of the requested range. Update *m
|
|
|
|
// to the range that is left to be compacted.
|
2014-05-21 20:43:35 +02:00
|
|
|
// Universal and FIFO compactions should always compact the whole range
|
2015-03-30 23:04:21 +02:00
|
|
|
assert(m->cfd->ioptions()->compaction_style !=
|
|
|
|
kCompactionStyleUniversal ||
|
|
|
|
m->cfd->ioptions()->num_levels > 1);
|
2014-09-09 00:04:34 +02:00
|
|
|
assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
m->tmp_storage = *m->manual_end;
|
2011-10-06 01:30:28 +02:00
|
|
|
m->begin = &m->tmp_storage;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
m->incomplete = true;
|
2011-10-06 01:30:28 +02:00
|
|
|
}
|
2012-10-19 23:00:53 +02:00
|
|
|
m->in_progress = false; // not being processed anymore
|
2011-06-07 16:40:26 +02:00
|
|
|
}
|
2012-08-23 01:57:51 +02:00
|
|
|
return status;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
bool DBImpl::HasPendingManualCompaction() {
|
|
|
|
return (!manual_compaction_dequeue_.empty());
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::AddManualCompaction(DBImpl::ManualCompaction* m) {
|
|
|
|
manual_compaction_dequeue_.push_back(m);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) {
|
|
|
|
// Remove from queue
|
|
|
|
std::deque<ManualCompaction*>::iterator it =
|
|
|
|
manual_compaction_dequeue_.begin();
|
|
|
|
while (it != manual_compaction_dequeue_.end()) {
|
|
|
|
if (m == (*it)) {
|
|
|
|
it = manual_compaction_dequeue_.erase(it);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
it++;
|
|
|
|
}
|
|
|
|
assert(false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2015-12-18 01:59:00 +01:00
|
|
|
bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) {
|
|
|
|
if (m->exclusive) {
|
|
|
|
return (bg_compaction_scheduled_ > 0);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 20:20:34 +01:00
|
|
|
}
|
|
|
|
std::deque<ManualCompaction*>::iterator it =
|
|
|
|
manual_compaction_dequeue_.begin();
|
|
|
|
bool seen = false;
|
|
|
|
while (it != manual_compaction_dequeue_.end()) {
|
|
|
|
if (m == (*it)) {
|
|
|
|
it++;
|
|
|
|
seen = true;
|
|
|
|
continue;
|
|
|
|
} else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
|
|
|
|
// Consider the other manual compaction *it, conflicts if:
|
|
|
|
// overlaps with m
|
|
|
|
// and (*it) is ahead in the queue and is not yet in progress
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
it++;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
|
|
|
|
// Remove from priority queue
|
|
|
|
std::deque<ManualCompaction*>::iterator it =
|
|
|
|
manual_compaction_dequeue_.begin();
|
|
|
|
while (it != manual_compaction_dequeue_.end()) {
|
|
|
|
if ((*it)->exclusive) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
|
|
|
|
// Allow automatic compaction if manual compaction is
|
|
|
|
// is in progress
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
it++;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool DBImpl::HasExclusiveManualCompaction() {
|
|
|
|
// Remove from priority queue
|
|
|
|
std::deque<ManualCompaction*>::iterator it =
|
|
|
|
manual_compaction_dequeue_.begin();
|
|
|
|
while (it != manual_compaction_dequeue_.end()) {
|
|
|
|
if ((*it)->exclusive) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
it++;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool DBImpl::MCOverlap(ManualCompaction* m, ManualCompaction* m1) {
|
|
|
|
if ((m->exclusive) || (m1->exclusive)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (m->cfd != m1->cfd) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-05-28 02:53:58 +02:00
|
|
|
namespace {
|
|
|
|
struct IterState {
|
2015-02-05 06:39:45 +01:00
|
|
|
IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version)
|
2014-10-31 19:59:54 +01:00
|
|
|
: db(_db), mu(_mu), super_version(_super_version) {}
|
2014-02-03 22:13:36 +01:00
|
|
|
|
|
|
|
DBImpl* db;
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutex* mu;
|
2014-02-03 22:44:47 +01:00
|
|
|
SuperVersion* super_version;
|
2011-05-28 02:53:58 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
static void CleanupIteratorState(void* arg1, void* arg2) {
|
|
|
|
IterState* state = reinterpret_cast<IterState*>(arg1);
|
2014-02-03 22:13:36 +01:00
|
|
|
|
2014-04-14 18:34:59 +02:00
|
|
|
if (state->super_version->Unref()) {
|
2015-02-12 18:54:48 +01:00
|
|
|
// Job id == 0 means that this is not our background process, but rather
|
|
|
|
// user thread
|
|
|
|
JobContext job_context(0);
|
2014-03-05 03:17:27 +01:00
|
|
|
|
2014-02-03 22:13:36 +01:00
|
|
|
state->mu->Lock();
|
|
|
|
state->super_version->Cleanup();
|
2014-10-28 19:54:33 +01:00
|
|
|
state->db->FindObsoleteFiles(&job_context, false, true);
|
2014-02-03 22:13:36 +01:00
|
|
|
state->mu->Unlock();
|
|
|
|
|
|
|
|
delete state->super_version;
|
2014-10-28 19:54:33 +01:00
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
state->db->PurgeObsoleteFiles(job_context);
|
2014-03-10 23:42:14 +01:00
|
|
|
}
|
2014-11-15 00:43:10 +01:00
|
|
|
job_context.Clean();
|
MemTableListVersion
Summary:
MemTableListVersion is to MemTableList what Version is to VersionSet. I took almost the same ideas to develop MemTableListVersion. The reason is to have copying std::list done in background, while flushing, rather than in foreground (MultiGet() and NewIterator()) under a mutex! Also, whenever we copied MemTableList, we copied also some MemTableList metadata (flush_requested_, commit_in_progress_, etc.), which was wasteful.
This diff avoids std::list copy under a mutex in both MultiGet() and NewIterator(). I created a small database with some number of immutable memtables, and creating 100.000 iterators in a single-thread (!) decreased from {188739, 215703, 198028} to {154352, 164035, 159817}. A lot of the savings come from code under a mutex, so we should see much higher savings with multiple threads. Creating new iterator is very important to LogDevice team.
I also think this diff will make SuperVersion obsolete for performance reasons. I will try it in the next diff. SuperVersion gave us huge savings on Get() code path, but I think that most of the savings came from copying MemTableList under a mutex. If we had MemTableListVersion, we would never need to copy the entire object (like we still do in NewIterator() and MultiGet())
Test Plan: `make check` works. I will also do `make valgrind_check` before commit
Reviewers: dhruba, haobo, kailiu, sdong, emayanke, tnovak
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15255
2014-01-24 23:52:08 +01:00
|
|
|
}
|
2014-01-17 06:56:26 +01:00
|
|
|
|
2011-05-28 02:53:58 +02:00
|
|
|
delete state;
|
|
|
|
}
|
2011-10-31 18:22:06 +01:00
|
|
|
} // namespace
|
2011-05-28 02:53:58 +02:00
|
|
|
|
2015-10-13 00:06:38 +02:00
|
|
|
InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyData* cfd,
|
|
|
|
SuperVersion* super_version,
|
|
|
|
Arena* arena) {
|
|
|
|
InternalIterator* internal_iter;
|
2014-09-05 02:40:41 +02:00
|
|
|
assert(arena != nullptr);
|
|
|
|
// Need to create internal iterator from the arena.
|
|
|
|
MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena);
|
|
|
|
// Collect iterator for mutable mem
|
|
|
|
merge_iter_builder.AddIterator(
|
2014-10-17 01:57:59 +02:00
|
|
|
super_version->mem->NewIterator(read_options, arena));
|
2014-09-05 02:40:41 +02:00
|
|
|
// Collect all needed child iterators for immutable memtables
|
2014-10-17 01:57:59 +02:00
|
|
|
super_version->imm->AddIterators(read_options, &merge_iter_builder);
|
2014-09-05 02:40:41 +02:00
|
|
|
// Collect iterators for files in L0 - Ln
|
2014-10-17 01:57:59 +02:00
|
|
|
super_version->current->AddIterators(read_options, env_options_,
|
2014-09-05 02:40:41 +02:00
|
|
|
&merge_iter_builder);
|
|
|
|
internal_iter = merge_iter_builder.Finish();
|
2014-02-03 22:13:36 +01:00
|
|
|
IterState* cleanup = new IterState(this, &mutex_, super_version);
|
2013-02-15 20:53:17 +01:00
|
|
|
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
return internal_iter;
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
|
|
|
|
return default_cf_handle_;
|
|
|
|
}
|
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
Status DBImpl::Get(const ReadOptions& read_options,
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
2011-03-18 23:37:00 +01:00
|
|
|
std::string* value) {
|
2014-10-17 01:57:59 +02:00
|
|
|
return GetImpl(read_options, column_family, key, value);
|
2013-07-06 03:49:18 +02:00
|
|
|
}
|
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
// JobContext gets created and destructed outside of the lock --
|
|
|
|
// we
|
2013-12-20 18:57:58 +01:00
|
|
|
// use this convinently to:
|
|
|
|
// * malloc one SuperVersion() outside of the lock -- new_superversion
|
2014-02-27 20:38:55 +01:00
|
|
|
// * delete SuperVersion()s outside of the lock -- superversions_to_free
|
2013-12-20 18:57:58 +01:00
|
|
|
//
|
2015-06-17 21:37:59 +02:00
|
|
|
// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
|
|
|
|
// same job_context, we can't reuse the SuperVersion() that got
|
|
|
|
// malloced because
|
2013-12-20 18:57:58 +01:00
|
|
|
// first call already used it. In that rare case, we take a hit and create a
|
|
|
|
// new SuperVersion() inside of the mutex. We do similar thing
|
|
|
|
// for superversion_to_free
|
2015-06-17 21:37:59 +02:00
|
|
|
void DBImpl::InstallSuperVersionAndScheduleWorkWrapper(
|
2014-10-28 19:54:33 +01:00
|
|
|
ColumnFamilyData* cfd, JobContext* job_context,
|
2014-10-02 01:19:16 +02:00
|
|
|
const MutableCFOptions& mutable_cf_options) {
|
2014-01-29 22:28:50 +01:00
|
|
|
mutex_.AssertHeld();
|
2015-06-17 21:37:59 +02:00
|
|
|
SuperVersion* old_superversion = InstallSuperVersionAndScheduleWork(
|
2014-10-28 19:54:33 +01:00
|
|
|
cfd, job_context->new_superversion, mutable_cf_options);
|
|
|
|
job_context->new_superversion = nullptr;
|
|
|
|
job_context->superversions_to_free.push_back(old_superversion);
|
2013-12-20 18:57:58 +01:00
|
|
|
}
|
|
|
|
|
2015-06-17 21:37:59 +02:00
|
|
|
SuperVersion* DBImpl::InstallSuperVersionAndScheduleWork(
|
2014-10-17 01:57:59 +02:00
|
|
|
ColumnFamilyData* cfd, SuperVersion* new_sv,
|
2015-06-17 21:37:59 +02:00
|
|
|
const MutableCFOptions& mutable_cf_options) {
|
2014-10-17 01:57:59 +02:00
|
|
|
mutex_.AssertHeld();
|
2014-12-08 21:52:18 +01:00
|
|
|
|
|
|
|
// Update max_total_in_memory_state_
|
|
|
|
size_t old_memtable_size = 0;
|
|
|
|
auto* old_sv = cfd->GetSuperVersion();
|
|
|
|
if (old_sv) {
|
|
|
|
old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
|
|
|
|
old_sv->mutable_cf_options.max_write_buffer_number;
|
|
|
|
}
|
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
auto* old = cfd->InstallSuperVersion(
|
|
|
|
new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options);
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// Whenever we install new SuperVersion, we might need to issue new flushes or
|
2015-06-17 21:37:59 +02:00
|
|
|
// compactions.
|
|
|
|
SchedulePendingFlush(cfd);
|
|
|
|
SchedulePendingCompaction(cfd);
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
2014-10-17 01:57:59 +02:00
|
|
|
|
|
|
|
// Update max_total_in_memory_state_
|
|
|
|
max_total_in_memory_state_ =
|
|
|
|
max_total_in_memory_state_ - old_memtable_size +
|
|
|
|
mutable_cf_options.write_buffer_size *
|
|
|
|
mutable_cf_options.max_write_buffer_number;
|
|
|
|
return old;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::GetImpl(const ReadOptions& read_options,
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
std::string* value, bool* value_found) {
|
2014-07-28 21:05:36 +02:00
|
|
|
StopWatch sw(env_, stats_, DB_GET);
|
2014-08-23 00:28:58 +02:00
|
|
|
PERF_TIMER_GUARD(get_snapshot_time);
|
2014-01-28 20:05:04 +01:00
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
|
2011-06-22 04:36:45 +02:00
|
|
|
SequenceNumber snapshot;
|
2014-10-17 01:57:59 +02:00
|
|
|
if (read_options.snapshot != nullptr) {
|
|
|
|
snapshot = reinterpret_cast<const SnapshotImpl*>(
|
|
|
|
read_options.snapshot)->number_;
|
2011-06-22 04:36:45 +02:00
|
|
|
} else {
|
|
|
|
snapshot = versions_->LastSequence();
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2014-02-27 20:38:55 +01:00
|
|
|
// Acquire SuperVersion
|
2014-08-05 20:27:34 +02:00
|
|
|
SuperVersion* sv = GetAndRefSuperVersion(cfd);
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// Prepare to store a list of merge operations if merge occurs.
|
2013-12-03 03:34:05 +01:00
|
|
|
MergeContext merge_context;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
|
2014-01-28 20:05:04 +01:00
|
|
|
Status s;
|
2013-03-26 02:01:47 +01:00
|
|
|
// First look in the memtable, then in the immutable memtable (if any).
|
2013-03-21 23:59:47 +01:00
|
|
|
// s is both in/out. When in, s could either be OK or MergeInProgress.
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// merge_operands will contain the sequence of merges in the latter case.
|
2013-03-26 02:01:47 +01:00
|
|
|
LookupKey lkey(key, snapshot);
|
2014-04-08 19:58:07 +02:00
|
|
|
PERF_TIMER_STOP(get_snapshot_time);
|
2014-08-23 00:28:58 +02:00
|
|
|
|
2016-02-09 20:20:22 +01:00
|
|
|
bool skip_memtable =
|
|
|
|
(read_options.read_tier == kPersistedTier && has_unpersisted_data_);
|
|
|
|
bool done = false;
|
|
|
|
if (!skip_memtable) {
|
|
|
|
if (sv->mem->Get(lkey, value, &s, &merge_context)) {
|
|
|
|
done = true;
|
|
|
|
RecordTick(stats_, MEMTABLE_HIT);
|
|
|
|
} else if (sv->imm->Get(lkey, value, &s, &merge_context)) {
|
|
|
|
done = true;
|
|
|
|
RecordTick(stats_, MEMTABLE_HIT);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!done) {
|
2014-08-23 00:28:58 +02:00
|
|
|
PERF_TIMER_GUARD(get_from_output_files_time);
|
2014-10-17 01:57:59 +02:00
|
|
|
sv->current->Get(read_options, lkey, value, &s, &merge_context,
|
|
|
|
value_found);
|
2014-07-28 21:05:36 +02:00
|
|
|
RecordTick(stats_, MEMTABLE_MISS);
|
2011-06-22 04:36:45 +02:00
|
|
|
}
|
2011-08-22 23:08:51 +02:00
|
|
|
|
2014-08-23 00:28:58 +02:00
|
|
|
{
|
|
|
|
PERF_TIMER_GUARD(get_post_process_time);
|
2011-08-22 23:08:51 +02:00
|
|
|
|
2014-08-23 00:28:58 +02:00
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
2013-11-25 20:55:36 +01:00
|
|
|
|
2014-08-23 00:28:58 +02:00
|
|
|
RecordTick(stats_, NUMBER_KEYS_READ);
|
|
|
|
RecordTick(stats_, BYTES_READ, value->size());
|
2016-02-01 03:09:24 +01:00
|
|
|
MeasureTime(stats_, BYTES_PER_READ, value->size());
|
2014-08-23 00:28:58 +02:00
|
|
|
}
|
2011-06-22 04:36:45 +02:00
|
|
|
return s;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
std::vector<Status> DBImpl::MultiGet(
|
2014-10-17 01:57:59 +02:00
|
|
|
const ReadOptions& read_options,
|
2014-02-11 02:04:44 +01:00
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
2013-06-05 20:22:38 +02:00
|
|
|
|
2014-07-28 21:05:36 +02:00
|
|
|
StopWatch sw(env_, stats_, DB_MULTIGET);
|
2014-08-23 00:28:58 +02:00
|
|
|
PERF_TIMER_GUARD(get_snapshot_time);
|
2013-11-27 20:47:40 +01:00
|
|
|
|
2013-06-05 20:22:38 +02:00
|
|
|
SequenceNumber snapshot;
|
2013-11-25 20:55:36 +01:00
|
|
|
|
2014-02-04 00:28:03 +01:00
|
|
|
struct MultiGetColumnFamilyData {
|
2014-02-05 22:12:23 +01:00
|
|
|
ColumnFamilyData* cfd;
|
2014-02-04 00:28:03 +01:00
|
|
|
SuperVersion* super_version;
|
|
|
|
};
|
|
|
|
std::unordered_map<uint32_t, MultiGetColumnFamilyData*> multiget_cf_data;
|
|
|
|
// fill up and allocate outside of mutex
|
|
|
|
for (auto cf : column_family) {
|
2014-02-11 02:04:44 +01:00
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cf);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
|
|
|
|
auto mgcfd = new MultiGetColumnFamilyData();
|
|
|
|
mgcfd->cfd = cfd;
|
|
|
|
multiget_cf_data.insert({cfd->GetID(), mgcfd});
|
2014-02-04 00:28:03 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-08-27 07:23:47 +02:00
|
|
|
mutex_.Lock();
|
2014-10-17 01:57:59 +02:00
|
|
|
if (read_options.snapshot != nullptr) {
|
|
|
|
snapshot = reinterpret_cast<const SnapshotImpl*>(
|
|
|
|
read_options.snapshot)->number_;
|
2013-06-05 20:22:38 +02:00
|
|
|
} else {
|
|
|
|
snapshot = versions_->LastSequence();
|
|
|
|
}
|
2014-02-04 00:28:03 +01:00
|
|
|
for (auto mgd_iter : multiget_cf_data) {
|
2014-02-11 02:04:44 +01:00
|
|
|
mgd_iter.second->super_version =
|
|
|
|
mgd_iter.second->cfd->GetSuperVersion()->Ref();
|
2014-02-04 00:28:03 +01:00
|
|
|
}
|
2013-06-05 20:22:38 +02:00
|
|
|
mutex_.Unlock();
|
2014-02-03 22:13:36 +01:00
|
|
|
|
2013-12-03 03:34:05 +01:00
|
|
|
// Contain a list of merge operations if merge occurs.
|
|
|
|
MergeContext merge_context;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
|
2013-06-05 20:22:38 +02:00
|
|
|
// Note: this always resizes the values array
|
2014-02-04 00:28:03 +01:00
|
|
|
size_t num_keys = keys.size();
|
|
|
|
std::vector<Status> stat_list(num_keys);
|
|
|
|
values->resize(num_keys);
|
2013-06-05 20:22:38 +02:00
|
|
|
|
|
|
|
// Keep track of bytes that we read for statistics-recording later
|
2014-02-04 00:28:03 +01:00
|
|
|
uint64_t bytes_read = 0;
|
2014-04-08 19:58:07 +02:00
|
|
|
PERF_TIMER_STOP(get_snapshot_time);
|
2013-06-05 20:22:38 +02:00
|
|
|
|
|
|
|
// For each of the given keys, apply the entire "get" process as follows:
|
|
|
|
// First look in the memtable, then in the immutable memtable (if any).
|
|
|
|
// s is both in/out. When in, s could either be OK or MergeInProgress.
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
|
|
|
// merge_operands will contain the sequence of merges in the latter case.
|
2014-02-04 00:28:03 +01:00
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
2013-12-03 03:34:05 +01:00
|
|
|
merge_context.Clear();
|
2014-02-04 00:28:03 +01:00
|
|
|
Status& s = stat_list[i];
|
2013-06-05 20:22:38 +02:00
|
|
|
std::string* value = &(*values)[i];
|
|
|
|
|
|
|
|
LookupKey lkey(keys[i], snapshot);
|
2014-02-11 02:04:44 +01:00
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
|
|
|
|
auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
|
2014-02-04 00:28:03 +01:00
|
|
|
assert(mgd_iter != multiget_cf_data.end());
|
|
|
|
auto mgd = mgd_iter->second;
|
|
|
|
auto super_version = mgd->super_version;
|
2016-02-09 20:20:22 +01:00
|
|
|
bool skip_memtable =
|
|
|
|
(read_options.read_tier == kPersistedTier && has_unpersisted_data_);
|
|
|
|
bool done = false;
|
|
|
|
if (!skip_memtable) {
|
|
|
|
if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
|
|
|
|
done = true;
|
|
|
|
// TODO(?): RecordTick(stats_, MEMTABLE_HIT)?
|
|
|
|
} else if (super_version->imm->Get(lkey, value, &s, &merge_context)) {
|
|
|
|
done = true;
|
|
|
|
// TODO(?): RecordTick(stats_, MEMTABLE_HIT)?
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!done) {
|
2014-10-03 02:02:30 +02:00
|
|
|
PERF_TIMER_GUARD(get_from_output_files_time);
|
2014-10-17 01:57:59 +02:00
|
|
|
super_version->current->Get(read_options, lkey, value, &s,
|
|
|
|
&merge_context);
|
2016-02-09 20:20:22 +01:00
|
|
|
// TODO(?): RecordTick(stats_, MEMTABLE_MISS)?
|
2013-06-05 20:22:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
2014-02-04 00:28:03 +01:00
|
|
|
bytes_read += value->size();
|
2013-06-05 20:22:38 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Post processing (decrement reference counts and record statistics)
|
2014-08-23 00:28:58 +02:00
|
|
|
PERF_TIMER_GUARD(get_post_process_time);
|
2014-02-04 00:28:03 +01:00
|
|
|
autovector<SuperVersion*> superversions_to_delete;
|
|
|
|
|
2014-06-20 10:23:02 +02:00
|
|
|
// TODO(icanadi) do we need lock here or just around Cleanup()?
|
2014-02-04 00:28:03 +01:00
|
|
|
mutex_.Lock();
|
|
|
|
for (auto mgd_iter : multiget_cf_data) {
|
|
|
|
auto mgd = mgd_iter.second;
|
|
|
|
if (mgd->super_version->Unref()) {
|
|
|
|
mgd->super_version->Cleanup();
|
|
|
|
superversions_to_delete.push_back(mgd->super_version);
|
2014-02-03 22:13:36 +01:00
|
|
|
}
|
|
|
|
}
|
2014-02-04 00:28:03 +01:00
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
for (auto td : superversions_to_delete) {
|
|
|
|
delete td;
|
|
|
|
}
|
|
|
|
for (auto mgd : multiget_cf_data) {
|
|
|
|
delete mgd.second;
|
2013-06-05 20:22:38 +02:00
|
|
|
}
|
2013-11-25 20:55:36 +01:00
|
|
|
|
2014-07-28 21:05:36 +02:00
|
|
|
RecordTick(stats_, NUMBER_MULTIGET_CALLS);
|
|
|
|
RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
|
|
|
|
RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
|
2016-02-01 03:09:24 +01:00
|
|
|
MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read);
|
2014-04-08 19:58:07 +02:00
|
|
|
PERF_TIMER_STOP(get_post_process_time);
|
2013-06-05 20:22:38 +02:00
|
|
|
|
2014-02-04 00:28:03 +01:00
|
|
|
return stat_list;
|
2013-06-05 20:22:38 +02:00
|
|
|
}
|
|
|
|
|
2015-09-23 21:42:43 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
Status DBImpl::AddFile(ColumnFamilyHandle* column_family,
|
|
|
|
const std::string& file_path, bool move_file) {
|
|
|
|
Status status;
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
ColumnFamilyData* cfd = cfh->cfd();
|
|
|
|
|
|
|
|
ExternalSstFileInfo file_info;
|
|
|
|
file_info.file_path = file_path;
|
|
|
|
status = env_->GetFileSize(file_path, &file_info.file_size);
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Access the file using TableReader to extract
|
|
|
|
// version, number of entries, smallest user key, largest user key
|
|
|
|
std::unique_ptr<RandomAccessFile> sst_file;
|
|
|
|
status = env_->NewRandomAccessFile(file_path, &sst_file, env_options_);
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
std::unique_ptr<RandomAccessFileReader> sst_file_reader;
|
|
|
|
sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file)));
|
|
|
|
|
|
|
|
std::unique_ptr<TableReader> table_reader;
|
|
|
|
status = cfd->ioptions()->table_factory->NewTableReader(
|
|
|
|
TableReaderOptions(*cfd->ioptions(), env_options_,
|
|
|
|
cfd->internal_comparator()),
|
|
|
|
std::move(sst_file_reader), file_info.file_size, &table_reader);
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get the external sst file version from table properties
|
|
|
|
const UserCollectedProperties& user_collected_properties =
|
|
|
|
table_reader->GetTableProperties()->user_collected_properties;
|
|
|
|
UserCollectedProperties::const_iterator external_sst_file_version_iter =
|
|
|
|
user_collected_properties.find(ExternalSstFilePropertyNames::kVersion);
|
|
|
|
if (external_sst_file_version_iter == user_collected_properties.end()) {
|
|
|
|
return Status::InvalidArgument("Generated table version not found");
|
|
|
|
}
|
|
|
|
|
|
|
|
file_info.version =
|
|
|
|
DecodeFixed32(external_sst_file_version_iter->second.c_str());
|
|
|
|
if (file_info.version == 1) {
|
|
|
|
// version 1 imply that all sequence numbers in table equal 0
|
|
|
|
file_info.sequence_number = 0;
|
|
|
|
} else {
|
|
|
|
return Status::InvalidArgument("Generated table version is not supported");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get number of entries in table
|
|
|
|
file_info.num_entries = table_reader->GetTableProperties()->num_entries;
|
|
|
|
|
|
|
|
ParsedInternalKey key;
|
2015-10-13 00:06:38 +02:00
|
|
|
std::unique_ptr<InternalIterator> iter(
|
|
|
|
table_reader->NewIterator(ReadOptions()));
|
2015-09-23 21:42:43 +02:00
|
|
|
|
|
|
|
// Get first (smallest) key from file
|
|
|
|
iter->SeekToFirst();
|
|
|
|
if (!ParseInternalKey(iter->key(), &key)) {
|
|
|
|
return Status::Corruption("Generated table have corrupted keys");
|
|
|
|
}
|
|
|
|
if (key.sequence != 0) {
|
|
|
|
return Status::Corruption("Generated table have non zero sequence number");
|
|
|
|
}
|
|
|
|
file_info.smallest_key = key.user_key.ToString();
|
|
|
|
|
|
|
|
// Get last (largest) key from file
|
|
|
|
iter->SeekToLast();
|
|
|
|
if (!ParseInternalKey(iter->key(), &key)) {
|
|
|
|
return Status::Corruption("Generated table have corrupted keys");
|
|
|
|
}
|
|
|
|
if (key.sequence != 0) {
|
|
|
|
return Status::Corruption("Generated table have non zero sequence number");
|
|
|
|
}
|
|
|
|
file_info.largest_key = key.user_key.ToString();
|
|
|
|
|
|
|
|
return AddFile(column_family, &file_info, move_file);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::AddFile(ColumnFamilyHandle* column_family,
|
|
|
|
const ExternalSstFileInfo* file_info, bool move_file) {
|
|
|
|
Status status;
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
ColumnFamilyData* cfd = cfh->cfd();
|
|
|
|
|
2016-01-25 22:47:07 +01:00
|
|
|
if (file_info->num_entries == 0) {
|
|
|
|
return Status::InvalidArgument("File contain no entries");
|
|
|
|
}
|
2015-09-23 21:42:43 +02:00
|
|
|
if (file_info->version != 1) {
|
|
|
|
return Status::InvalidArgument("Generated table version is not supported");
|
|
|
|
}
|
|
|
|
// version 1 imply that file have only Put Operations with Sequence Number = 0
|
|
|
|
|
|
|
|
FileMetaData meta;
|
|
|
|
meta.smallest =
|
|
|
|
InternalKey(file_info->smallest_key, file_info->sequence_number,
|
|
|
|
ValueType::kTypeValue);
|
|
|
|
meta.largest = InternalKey(file_info->largest_key, file_info->sequence_number,
|
|
|
|
ValueType::kTypeValue);
|
|
|
|
if (!meta.smallest.Valid() || !meta.largest.Valid()) {
|
|
|
|
return Status::Corruption("Generated table have corrupted keys");
|
|
|
|
}
|
|
|
|
meta.smallest_seqno = file_info->sequence_number;
|
|
|
|
meta.largest_seqno = file_info->sequence_number;
|
|
|
|
if (meta.smallest_seqno != 0 || meta.largest_seqno != 0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Non zero sequence numbers are not supported");
|
|
|
|
}
|
|
|
|
// Generate a location for the new table
|
|
|
|
meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, file_info->file_size);
|
|
|
|
std::string db_fname = TableFileName(
|
|
|
|
db_options_.db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
|
|
|
|
|
|
|
|
if (move_file) {
|
|
|
|
status = env_->LinkFile(file_info->file_path, db_fname);
|
|
|
|
if (status.IsNotSupported()) {
|
|
|
|
// Original file is on a different FS, use copy instead of hard linking
|
|
|
|
status = CopyFile(env_, file_info->file_path, db_fname, 0);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
status = CopyFile(env_, file_info->file_path, db_fname, 0);
|
|
|
|
}
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
const MutableCFOptions mutable_cf_options =
|
|
|
|
*cfd->GetLatestMutableCFOptions();
|
|
|
|
|
|
|
|
WriteThread::Writer w;
|
|
|
|
write_thread_.EnterUnbatched(&w, &mutex_);
|
|
|
|
|
2015-10-31 00:38:10 +01:00
|
|
|
if (!snapshots_.empty()) {
|
|
|
|
// Check that no snapshots are being held
|
|
|
|
status =
|
|
|
|
Status::NotSupported("Cannot add a file while holding snapshots");
|
2015-09-23 21:42:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (status.ok()) {
|
2015-10-31 00:38:10 +01:00
|
|
|
// Verify that added file key range dont overlap with any keys in DB
|
|
|
|
SuperVersion* sv = cfd->GetSuperVersion()->Ref();
|
|
|
|
Arena arena;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
ScopedArenaIterator iter(NewInternalIterator(ro, cfd, sv, &arena));
|
|
|
|
|
|
|
|
InternalKey range_start(file_info->smallest_key, kMaxSequenceNumber,
|
|
|
|
kTypeValue);
|
|
|
|
iter->Seek(range_start.Encode());
|
|
|
|
status = iter->status();
|
|
|
|
|
|
|
|
if (status.ok() && iter->Valid()) {
|
|
|
|
ParsedInternalKey seek_result;
|
|
|
|
if (ParseInternalKey(iter->key(), &seek_result)) {
|
|
|
|
auto* vstorage = cfd->current()->storage_info();
|
|
|
|
if (vstorage->InternalComparator()->user_comparator()->Compare(
|
|
|
|
seek_result.user_key, file_info->largest_key) <= 0) {
|
|
|
|
status = Status::NotSupported("Cannot add overlapping range");
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
status = Status::Corruption("DB have corrupted keys");
|
2015-09-23 21:42:43 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status.ok()) {
|
2015-10-31 00:38:10 +01:00
|
|
|
// Add file to L0
|
2015-09-23 21:42:43 +02:00
|
|
|
VersionEdit edit;
|
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
2015-10-31 00:38:10 +01:00
|
|
|
edit.AddFile(0, meta.fd.GetNumber(), meta.fd.GetPathId(),
|
2015-09-23 21:42:43 +02:00
|
|
|
meta.fd.GetFileSize(), meta.smallest, meta.largest,
|
|
|
|
meta.smallest_seqno, meta.largest_seqno,
|
|
|
|
meta.marked_for_compaction);
|
|
|
|
|
|
|
|
status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
|
|
|
|
directories_.GetDbDir());
|
|
|
|
}
|
|
|
|
write_thread_.ExitUnbatched(&w);
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
delete InstallSuperVersionAndScheduleWork(cfd, nullptr,
|
|
|
|
mutable_cf_options);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!status.ok()) {
|
|
|
|
// We failed to add the file to the database
|
|
|
|
Status s = env_->DeleteFile(db_fname);
|
|
|
|
if (!s.ok()) {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"AddFile() clean up for file %s failed : %s", db_fname.c_str(),
|
|
|
|
s.ToString().c_str());
|
|
|
|
}
|
|
|
|
} else if (status.ok() && move_file) {
|
|
|
|
// The file was moved and added successfully, remove original file link
|
|
|
|
Status s = env_->DeleteFile(file_info->file_path);
|
|
|
|
if (!s.ok()) {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"%s was added to DB successfully but failed to remove original file "
|
|
|
|
"link : %s",
|
|
|
|
file_info->file_path.c_str(), s.ToString().c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
|
2014-01-06 22:31:06 +01:00
|
|
|
const std::string& column_family_name,
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle** handle) {
|
2014-11-20 19:49:32 +01:00
|
|
|
Status s;
|
2015-12-09 02:01:02 +01:00
|
|
|
Status persist_options_status;
|
2014-02-28 19:29:37 +01:00
|
|
|
*handle = nullptr;
|
2015-06-18 23:55:05 +02:00
|
|
|
|
|
|
|
s = CheckCompressionSupported(cf_options);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (s.ok() && db_options_.allow_concurrent_memtable_write) {
|
|
|
|
s = CheckConcurrentWritesSupported(cf_options);
|
|
|
|
}
|
2015-06-18 23:55:05 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-11-20 19:49:32 +01:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-02-28 19:29:37 +01:00
|
|
|
|
2014-11-20 19:49:32 +01:00
|
|
|
if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
|
|
|
|
nullptr) {
|
|
|
|
return Status::InvalidArgument("Column family already exists");
|
|
|
|
}
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.AddColumnFamily(column_family_name);
|
|
|
|
uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
|
|
|
|
edit.SetColumnFamily(new_id);
|
|
|
|
edit.SetLogNumber(logfile_number_);
|
|
|
|
edit.SetComparatorName(cf_options.comparator->Name());
|
|
|
|
|
|
|
|
// LogAndApply will both write the creation in MANIFEST and create
|
|
|
|
// ColumnFamilyData object
|
|
|
|
Options opt(db_options_, cf_options);
|
2015-01-06 21:44:21 +01:00
|
|
|
{ // write thread
|
2015-08-06 01:56:28 +02:00
|
|
|
WriteThread::Writer w;
|
|
|
|
write_thread_.EnterUnbatched(&w, &mutex_);
|
2015-01-06 21:44:21 +01:00
|
|
|
// LogAndApply will both write the creation in MANIFEST and create
|
|
|
|
// ColumnFamilyData object
|
|
|
|
s = versions_->LogAndApply(
|
|
|
|
nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit,
|
2015-01-26 22:59:38 +01:00
|
|
|
&mutex_, directories_.GetDbDir(), false, &cf_options);
|
2015-12-09 02:01:02 +01:00
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
// If the column family was created successfully, we then persist
|
|
|
|
// the updated RocksDB options under the same single write thread
|
|
|
|
persist_options_status = WriteOptionsFile();
|
|
|
|
}
|
2015-08-06 01:56:28 +02:00
|
|
|
write_thread_.ExitUnbatched(&w);
|
2015-01-06 21:44:21 +01:00
|
|
|
}
|
2014-11-20 19:49:32 +01:00
|
|
|
if (s.ok()) {
|
|
|
|
single_column_family_mode_ = false;
|
|
|
|
auto* cfd =
|
|
|
|
versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
|
|
|
|
assert(cfd != nullptr);
|
2015-06-17 21:37:59 +02:00
|
|
|
delete InstallSuperVersionAndScheduleWork(
|
2014-11-20 19:49:32 +01:00
|
|
|
cfd, nullptr, *cfd->GetLatestMutableCFOptions());
|
2014-12-11 03:39:09 +01:00
|
|
|
|
|
|
|
if (!cfd->mem()->IsSnapshotSupported()) {
|
|
|
|
is_snapshot_supported_ = false;
|
|
|
|
}
|
|
|
|
|
2014-11-20 19:49:32 +01:00
|
|
|
*handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"Created column family [%s] (ID %u)",
|
|
|
|
column_family_name.c_str(), (unsigned)cfd->GetID());
|
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Creating column family [%s] FAILED -- %s",
|
|
|
|
column_family_name.c_str(), s.ToString().c_str());
|
|
|
|
}
|
2015-02-05 06:39:45 +01:00
|
|
|
} // InstrumentedMutexLock l(&mutex_)
|
2014-11-20 19:49:32 +01:00
|
|
|
|
|
|
|
// this is outside the mutex
|
2014-01-06 22:31:06 +01:00
|
|
|
if (s.ok()) {
|
2015-12-09 02:01:02 +01:00
|
|
|
NewThreadStatusCfInfo(
|
|
|
|
reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
|
2015-11-11 07:58:01 +01:00
|
|
|
if (!persist_options_status.ok()) {
|
|
|
|
if (db_options_.fail_if_options_file_error) {
|
|
|
|
s = Status::IOError(
|
|
|
|
"ColumnFamily has been created, but unable to persist"
|
|
|
|
"options in CreateColumnFamily()",
|
|
|
|
persist_options_status.ToString().c_str());
|
|
|
|
}
|
|
|
|
Warn(db_options_.info_log,
|
|
|
|
"Unable to persist options in CreateColumnFamily() -- %s",
|
|
|
|
persist_options_status.ToString().c_str());
|
|
|
|
}
|
2014-02-28 01:44:26 +01:00
|
|
|
}
|
2014-01-06 22:31:06 +01:00
|
|
|
return s;
|
2014-01-02 18:08:12 +01:00
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
if (cfd->GetID() == 0) {
|
2014-01-11 00:12:34 +01:00
|
|
|
return Status::InvalidArgument("Can't drop default column family");
|
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
|
2014-12-11 03:39:09 +01:00
|
|
|
bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
VersionEdit edit;
|
|
|
|
edit.DropColumnFamily();
|
2014-02-11 02:04:44 +01:00
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
|
|
|
|
2014-02-28 01:44:26 +01:00
|
|
|
Status s;
|
2015-12-09 02:01:02 +01:00
|
|
|
Status options_persist_status;
|
2014-03-05 02:16:40 +01:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-03-05 02:16:40 +01:00
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
s = Status::InvalidArgument("Column family already dropped!\n");
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
2014-09-06 00:20:05 +02:00
|
|
|
// we drop column family from a single write thread
|
2015-08-06 01:56:28 +02:00
|
|
|
WriteThread::Writer w;
|
|
|
|
write_thread_.EnterUnbatched(&w, &mutex_);
|
2014-10-02 01:19:16 +02:00
|
|
|
s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
|
|
|
|
&edit, &mutex_);
|
2015-12-09 02:01:02 +01:00
|
|
|
if (s.ok()) {
|
|
|
|
// If the column family was dropped successfully, we then persist
|
|
|
|
// the updated RocksDB options under the same single write thread
|
|
|
|
options_persist_status = WriteOptionsFile();
|
|
|
|
}
|
2015-08-06 01:56:28 +02:00
|
|
|
write_thread_.ExitUnbatched(&w);
|
2014-03-05 02:16:40 +01:00
|
|
|
}
|
2014-12-11 03:39:09 +01:00
|
|
|
|
|
|
|
if (!cf_support_snapshot) {
|
|
|
|
// Dropped Column Family doesn't support snapshot. Need to recalculate
|
|
|
|
// is_snapshot_supported_.
|
|
|
|
bool new_is_snapshot_supported = true;
|
|
|
|
for (auto c : *versions_->GetColumnFamilySet()) {
|
2015-03-20 01:04:29 +01:00
|
|
|
if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
|
2014-12-11 03:39:09 +01:00
|
|
|
new_is_snapshot_supported = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
is_snapshot_supported_ = new_is_snapshot_supported;
|
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
}
|
2014-02-06 20:44:50 +01:00
|
|
|
|
2014-02-28 01:44:26 +01:00
|
|
|
if (s.ok()) {
|
2014-11-20 19:49:32 +01:00
|
|
|
// Note that here we erase the associated cf_info of the to-be-dropped
|
|
|
|
// cfd before its ref-count goes to zero to avoid having to erase cf_info
|
|
|
|
// later inside db_mutex.
|
|
|
|
EraseThreadStatusCfInfo(cfd);
|
2014-03-11 22:52:17 +01:00
|
|
|
assert(cfd->IsDropped());
|
2014-10-17 01:57:59 +02:00
|
|
|
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
|
|
|
max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
|
|
|
|
mutable_cf_options->max_write_buffer_number;
|
2015-12-09 02:01:02 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"Dropped column family with id %u\n", cfd->GetID());
|
|
|
|
|
2015-11-11 07:58:01 +01:00
|
|
|
if (!options_persist_status.ok()) {
|
|
|
|
if (db_options_.fail_if_options_file_error) {
|
|
|
|
s = Status::IOError(
|
|
|
|
"ColumnFamily has been dropped, but unable to persist "
|
|
|
|
"options in DropColumnFamily()",
|
|
|
|
options_persist_status.ToString().c_str());
|
|
|
|
}
|
|
|
|
Warn(db_options_.info_log,
|
|
|
|
"Unable to persist options in DropColumnFamily() -- %s",
|
|
|
|
options_persist_status.ToString().c_str());
|
|
|
|
}
|
2014-02-28 01:44:26 +01:00
|
|
|
} else {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
2014-09-05 20:48:17 +02:00
|
|
|
"Dropping column family with id %u FAILED -- %s\n",
|
2014-02-28 01:44:26 +01:00
|
|
|
cfd->GetID(), s.ToString().c_str());
|
|
|
|
}
|
|
|
|
|
2014-01-06 22:31:06 +01:00
|
|
|
return s;
|
2014-01-02 18:08:12 +01:00
|
|
|
}
|
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
bool DBImpl::KeyMayExist(const ReadOptions& read_options,
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
std::string* value, bool* value_found) {
|
2013-07-26 21:57:01 +02:00
|
|
|
if (value_found != nullptr) {
|
2013-11-17 07:59:22 +01:00
|
|
|
// falsify later if key-may-exist but can't fetch value
|
|
|
|
*value_found = true;
|
2013-07-26 21:57:01 +02:00
|
|
|
}
|
2014-10-17 01:57:59 +02:00
|
|
|
ReadOptions roptions = read_options;
|
2013-08-25 07:48:51 +02:00
|
|
|
roptions.read_tier = kBlockCacheTier; // read from block cache only
|
2014-01-28 20:05:04 +01:00
|
|
|
auto s = GetImpl(roptions, column_family, key, value, value_found);
|
2013-11-17 07:59:22 +01:00
|
|
|
|
2014-08-25 23:22:05 +02:00
|
|
|
// If block_cache is enabled and the index block of the table didn't
|
2013-11-17 07:59:22 +01:00
|
|
|
// not present in block_cache, the return value will be Status::Incomplete.
|
|
|
|
// In this case, key may still exist in the table.
|
|
|
|
return s.ok() || s.IsIncomplete();
|
2013-07-06 03:49:18 +02:00
|
|
|
}
|
|
|
|
|
2014-09-09 00:04:34 +02:00
|
|
|
Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* column_family) {
|
2016-02-09 20:20:22 +01:00
|
|
|
if (read_options.read_tier == kPersistedTier) {
|
|
|
|
return NewErrorIterator(Status::NotSupported(
|
|
|
|
"ReadTier::kPersistedData is not yet supported in iterators."));
|
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
2014-01-17 06:56:26 +01:00
|
|
|
|
2015-02-18 20:49:31 +01:00
|
|
|
XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
|
|
|
|
reinterpret_cast<DBImpl*>(this),
|
|
|
|
const_cast<ReadOptions*>(&read_options), is_snapshot_supported_);
|
|
|
|
if (read_options.managed) {
|
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
// not supported in lite version
|
|
|
|
return NewErrorIterator(Status::InvalidArgument(
|
|
|
|
"Managed Iterators not supported in RocksDBLite."));
|
|
|
|
#else
|
|
|
|
if ((read_options.tailing) || (read_options.snapshot != nullptr) ||
|
|
|
|
(is_snapshot_supported_)) {
|
|
|
|
return new ManagedIterator(this, read_options, cfd);
|
|
|
|
}
|
|
|
|
// Managed iter not supported
|
|
|
|
return NewErrorIterator(Status::InvalidArgument(
|
|
|
|
"Managed Iterators not supported without snapshots."));
|
|
|
|
#endif
|
|
|
|
} else if (read_options.tailing) {
|
2014-04-15 22:39:26 +02:00
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
// not supported in lite version
|
|
|
|
return nullptr;
|
|
|
|
#else
|
2014-10-24 00:34:21 +02:00
|
|
|
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
|
|
|
auto iter = new ForwardIterator(this, read_options, cfd, sv);
|
2015-11-05 22:24:05 +01:00
|
|
|
return NewDBIterator(
|
|
|
|
env_, *cfd->ioptions(), cfd->user_comparator(), iter,
|
2014-10-24 00:34:21 +02:00
|
|
|
kMaxSequenceNumber,
|
|
|
|
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
2015-12-16 21:08:30 +01:00
|
|
|
read_options.iterate_upper_bound, read_options.prefix_same_as_start,
|
|
|
|
read_options.pin_data);
|
2014-04-15 22:39:26 +02:00
|
|
|
#endif
|
2014-01-17 06:56:26 +01:00
|
|
|
} else {
|
2014-04-14 18:34:59 +02:00
|
|
|
SequenceNumber latest_snapshot = versions_->LastSequence();
|
2014-10-24 00:34:21 +02:00
|
|
|
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
2014-04-14 18:34:59 +02:00
|
|
|
|
2014-03-08 01:12:34 +01:00
|
|
|
auto snapshot =
|
2014-09-09 00:04:34 +02:00
|
|
|
read_options.snapshot != nullptr
|
|
|
|
? reinterpret_cast<const SnapshotImpl*>(
|
|
|
|
read_options.snapshot)->number_
|
2014-03-08 01:12:34 +01:00
|
|
|
: latest_snapshot;
|
2014-01-17 06:56:26 +01:00
|
|
|
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
// Try to generate a DB iterator tree in continuous memory area to be
|
|
|
|
// cache friendly. Here is an example of result:
|
|
|
|
// +-------------------------------+
|
|
|
|
// | |
|
|
|
|
// | ArenaWrappedDBIter |
|
|
|
|
// | + |
|
|
|
|
// | +---> Inner Iterator ------------+
|
|
|
|
// | | | |
|
|
|
|
// | | +-- -- -- -- -- -- -- --+ |
|
|
|
|
// | +--- | Arena | |
|
|
|
|
// | | | |
|
|
|
|
// | Allocated Memory: | |
|
|
|
|
// | | +-------------------+ |
|
|
|
|
// | | | DBIter | <---+
|
|
|
|
// | | + |
|
|
|
|
// | | | +-> iter_ ------------+
|
|
|
|
// | | | | |
|
|
|
|
// | | +-------------------+ |
|
|
|
|
// | | | MergingIterator | <---+
|
|
|
|
// | | + |
|
|
|
|
// | | | +->child iter1 ------------+
|
|
|
|
// | | | | | |
|
|
|
|
// | | +->child iter2 ----------+ |
|
|
|
|
// | | | | | | |
|
|
|
|
// | | | +->child iter3 --------+ | |
|
|
|
|
// | | | | | |
|
|
|
|
// | | +-------------------+ | | |
|
|
|
|
// | | | Iterator1 | <--------+
|
|
|
|
// | | +-------------------+ | |
|
|
|
|
// | | | Iterator2 | <------+
|
|
|
|
// | | +-------------------+ |
|
|
|
|
// | | | Iterator3 | <----+
|
|
|
|
// | | +-------------------+
|
|
|
|
// | | |
|
|
|
|
// +-------+-----------------------+
|
|
|
|
//
|
2015-09-02 22:58:22 +02:00
|
|
|
// ArenaWrappedDBIter inlines an arena area where all the iterators in
|
|
|
|
// the iterator tree are allocated in the order of being accessed when
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
// querying.
|
|
|
|
// Laying out the iterators in the order of being accessed makes it more
|
|
|
|
// likely that any iterator pointer is close to the iterator it points to so
|
|
|
|
// that they are likely to be in the same cache line and/or page.
|
|
|
|
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
|
2015-11-05 22:24:05 +01:00
|
|
|
env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
|
|
|
|
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
2015-12-16 21:08:30 +01:00
|
|
|
read_options.iterate_upper_bound, read_options.prefix_same_as_start,
|
|
|
|
read_options.pin_data);
|
2014-09-04 19:48:24 +02:00
|
|
|
|
2015-10-13 00:06:38 +02:00
|
|
|
InternalIterator* internal_iter =
|
2014-09-09 00:04:34 +02:00
|
|
|
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-03 01:38:00 +02:00
|
|
|
db_iter->SetIterUnderDBIter(internal_iter);
|
|
|
|
|
|
|
|
return db_iter;
|
|
|
|
}
|
2014-09-09 00:04:34 +02:00
|
|
|
// To stop compiler from complaining
|
|
|
|
return nullptr;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
Status DBImpl::NewIterators(
|
2014-09-09 00:04:34 +02:00
|
|
|
const ReadOptions& read_options,
|
2014-03-08 01:12:34 +01:00
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
std::vector<Iterator*>* iterators) {
|
2016-02-09 20:20:22 +01:00
|
|
|
if (read_options.read_tier == kPersistedTier) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"ReadTier::kPersistedData is not yet supported in iterators.");
|
|
|
|
}
|
2014-03-08 01:12:34 +01:00
|
|
|
iterators->clear();
|
|
|
|
iterators->reserve(column_families.size());
|
2015-02-18 20:49:31 +01:00
|
|
|
XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
|
|
|
|
reinterpret_cast<DBImpl*>(this),
|
|
|
|
const_cast<ReadOptions*>(&read_options), is_snapshot_supported_);
|
|
|
|
if (read_options.managed) {
|
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Managed interator not supported in RocksDB lite");
|
|
|
|
#else
|
|
|
|
if ((!read_options.tailing) && (read_options.snapshot == nullptr) &&
|
|
|
|
(!is_snapshot_supported_)) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Managed interator not supported without snapshots");
|
|
|
|
}
|
|
|
|
for (auto cfh : column_families) {
|
|
|
|
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
|
|
|
|
auto iter = new ManagedIterator(this, read_options, cfd);
|
|
|
|
iterators->push_back(iter);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
} else if (read_options.tailing) {
|
2014-04-15 22:39:26 +02:00
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Tailing interator not supported in RocksDB lite");
|
|
|
|
#else
|
2014-03-08 01:12:34 +01:00
|
|
|
for (auto cfh : column_families) {
|
|
|
|
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
|
2014-10-24 00:34:21 +02:00
|
|
|
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
|
|
|
auto iter = new ForwardIterator(this, read_options, cfd, sv);
|
2015-12-16 21:08:30 +01:00
|
|
|
iterators->push_back(NewDBIterator(
|
|
|
|
env_, *cfd->ioptions(), cfd->user_comparator(), iter,
|
|
|
|
kMaxSequenceNumber,
|
|
|
|
sv->mutable_cf_options.max_sequential_skip_in_iterations, nullptr,
|
|
|
|
false, read_options.pin_data));
|
2014-03-08 01:12:34 +01:00
|
|
|
}
|
2014-04-15 22:39:26 +02:00
|
|
|
#endif
|
2014-03-08 01:12:34 +01:00
|
|
|
} else {
|
2014-10-24 00:34:21 +02:00
|
|
|
SequenceNumber latest_snapshot = versions_->LastSequence();
|
|
|
|
|
2014-03-08 01:12:34 +01:00
|
|
|
for (size_t i = 0; i < column_families.size(); ++i) {
|
2014-10-24 00:34:21 +02:00
|
|
|
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
|
|
|
|
column_families[i])->cfd();
|
|
|
|
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
2014-03-08 01:12:34 +01:00
|
|
|
|
|
|
|
auto snapshot =
|
2014-09-09 00:04:34 +02:00
|
|
|
read_options.snapshot != nullptr
|
|
|
|
? reinterpret_cast<const SnapshotImpl*>(
|
|
|
|
read_options.snapshot)->number_
|
2014-03-08 01:12:34 +01:00
|
|
|
: latest_snapshot;
|
|
|
|
|
2014-09-05 02:40:41 +02:00
|
|
|
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
|
2014-09-09 00:04:34 +02:00
|
|
|
env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
|
2015-12-16 21:08:30 +01:00
|
|
|
sv->mutable_cf_options.max_sequential_skip_in_iterations, nullptr,
|
|
|
|
false, read_options.pin_data);
|
2015-10-13 00:06:38 +02:00
|
|
|
InternalIterator* internal_iter =
|
|
|
|
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
|
2014-09-05 02:40:41 +02:00
|
|
|
db_iter->SetIterUnderDBIter(internal_iter);
|
|
|
|
iterators->push_back(db_iter);
|
2014-03-08 01:12:34 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
}
|
|
|
|
|
2015-12-08 21:25:48 +01:00
|
|
|
const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
|
|
|
|
|
2015-11-05 05:52:22 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2015-12-08 21:25:48 +01:00
|
|
|
const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
|
|
|
|
return GetSnapshotImpl(true);
|
|
|
|
}
|
2015-11-05 05:52:22 +01:00
|
|
|
#endif // ROCKSDB_LITE
|
2015-12-08 21:25:48 +01:00
|
|
|
|
|
|
|
const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) {
|
2014-12-06 01:12:10 +01:00
|
|
|
int64_t unix_time = 0;
|
|
|
|
env_->GetCurrentTime(&unix_time); // Ignore error
|
2015-06-09 01:57:44 +02:00
|
|
|
SnapshotImpl* s = new SnapshotImpl;
|
2014-12-06 01:12:10 +01:00
|
|
|
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
Add a new mem-table representation based on cuckoo hash.
Summary:
= Major Changes =
* Add a new mem-table representation, HashCuckooRep, which is based cuckoo hash.
Cuckoo hash uses multiple hash functions. This allows each key to have multiple
possible locations in the mem-table.
- Put: When insert a key, it will try to find whether one of its possible
locations is vacant and store the key. If none of its possible
locations are available, then it will kick out a victim key and
store at that location. The kicked-out victim key will then be
stored at a vacant space of its possible locations or kick-out
another victim. In this diff, the kick-out path (known as
cuckoo-path) is found using BFS, which guarantees to be the shortest.
- Get: Simply tries all possible locations of a key --- this guarantees
worst-case constant time complexity.
- Time complexity: O(1) for Get, and average O(1) for Put if the
fullness of the mem-table is below 80%.
- Default using two hash functions, the number of hash functions used
by the cuckoo-hash may dynamically increase if it fails to find a
short-enough kick-out path.
- Currently, HashCuckooRep does not support iteration and snapshots,
as our current main purpose of this is to optimize point access.
= Minor Changes =
* Add IsSnapshotSupported() to DB to indicate whether the current DB
supports snapshots. If it returns false, then DB::GetSnapshot() will
always return nullptr.
Test Plan:
Run existing tests. Will develop a test specifically for cuckoo hash in
the next diff.
Reviewers: sdong, haobo
Reviewed By: sdong
CC: leveldb, dhruba, igor
Differential Revision: https://reviews.facebook.net/D16155
2014-04-30 02:13:46 +02:00
|
|
|
// returns null if the underlying memtable does not support snapshot.
|
2015-06-09 01:57:44 +02:00
|
|
|
if (!is_snapshot_supported_) {
|
|
|
|
delete s;
|
|
|
|
return nullptr;
|
|
|
|
}
|
2015-12-08 21:25:48 +01:00
|
|
|
return snapshots_.New(s, versions_->LastSequence(), unix_time,
|
|
|
|
is_write_conflict_boundary);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
|
2015-06-09 01:57:44 +02:00
|
|
|
const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
snapshots_.Delete(casted_s);
|
|
|
|
}
|
|
|
|
delete casted_s;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Convenience methods
|
2014-02-11 02:04:44 +01:00
|
|
|
Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& val) {
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
return DB::Put(o, column_family, key, val);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& val) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
2014-09-09 00:04:34 +02:00
|
|
|
if (!cfh->cfd()->ioptions()->merge_operator) {
|
2013-03-21 23:59:47 +01:00
|
|
|
return Status::NotSupported("Provide a merge_operator when opening DB");
|
|
|
|
} else {
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
return DB::Merge(o, column_family, key, val);
|
2013-03-21 23:59:47 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
Status DBImpl::Delete(const WriteOptions& write_options,
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
2014-10-17 01:57:59 +02:00
|
|
|
return DB::Delete(write_options, column_family, key);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 20:42:56 +02:00
|
|
|
Status DBImpl::SingleDelete(const WriteOptions& write_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) {
|
|
|
|
return DB::SingleDelete(write_options, column_family, key);
|
|
|
|
}
|
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
|
2015-05-29 23:36:35 +02:00
|
|
|
return WriteImpl(write_options, my_batch, nullptr);
|
|
|
|
}
|
|
|
|
|
2015-05-30 00:22:00 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
2015-05-29 23:36:35 +02:00
|
|
|
Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch,
|
|
|
|
WriteCallback* callback) {
|
|
|
|
return WriteImpl(write_options, my_batch, callback);
|
|
|
|
}
|
2015-05-30 00:22:00 +02:00
|
|
|
#endif // ROCKSDB_LITE
|
2015-05-29 23:36:35 +02:00
|
|
|
|
|
|
|
Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch, WriteCallback* callback) {
|
2014-08-12 07:10:32 +02:00
|
|
|
if (my_batch == nullptr) {
|
|
|
|
return Status::Corruption("Batch is nullptr!");
|
|
|
|
}
|
Deprecate WriteOptions::timeout_hint_us
Summary:
In one of our recent meetings, we discussed deprecating features that are not being actively used. One of those features, at least within Facebook, is timeout_hint. The feature is really nicely implemented, but if nobody needs it, we should remove it from our code-base (until we get a valid use-case). Some arguments:
* Less code == better icache hit rate, smaller builds, simpler code
* The motivation for adding timeout_hint_us was to work-around RocksDB's stall issue. However, we're currently addressing the stall issue itself (see @sdong's recent work on stall write_rate), so we should never see sharp lock-ups in the future.
* Nobody is using the feature within Facebook's code-base. Googling for `timeout_hint_us` also doesn't yield any users.
Test Plan: make check
Reviewers: anthony, kradhakrishnan, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D41937
2015-07-14 09:35:48 +02:00
|
|
|
if (write_options.timeout_hint_us != 0) {
|
|
|
|
return Status::InvalidArgument("timeout_hint_us is deprecated");
|
|
|
|
}
|
2015-05-29 23:36:35 +02:00
|
|
|
|
|
|
|
Status status;
|
2015-07-31 05:31:36 +02:00
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
bool xfunc_attempted_write = false;
|
|
|
|
XFUNC_TEST("transaction", "transaction_xftest_write_impl",
|
|
|
|
xf_transaction_write1, xf_transaction_write, write_options,
|
|
|
|
db_options_, my_batch, callback, this, &status,
|
|
|
|
&xfunc_attempted_write);
|
|
|
|
if (xfunc_attempted_write) {
|
|
|
|
// Test already did the write
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2014-08-23 00:28:58 +02:00
|
|
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
2015-08-06 01:56:28 +02:00
|
|
|
WriteThread::Writer w;
|
2014-08-12 07:10:32 +02:00
|
|
|
w.batch = my_batch;
|
2014-10-17 01:57:59 +02:00
|
|
|
w.sync = write_options.sync;
|
|
|
|
w.disableWAL = write_options.disableWAL;
|
2014-08-12 07:10:32 +02:00
|
|
|
w.in_batch_group = false;
|
2016-02-05 19:44:13 +01:00
|
|
|
w.callback = callback;
|
2014-08-12 07:10:32 +02:00
|
|
|
|
2014-10-17 01:57:59 +02:00
|
|
|
if (!write_options.disableWAL) {
|
2014-08-12 07:10:32 +02:00
|
|
|
RecordTick(stats_, WRITE_WITH_WAL);
|
|
|
|
}
|
|
|
|
|
2015-06-02 21:35:12 +02:00
|
|
|
StopWatch write_sw(env_, db_options_.statistics.get(), DB_WRITE);
|
|
|
|
|
2015-08-06 01:56:28 +02:00
|
|
|
write_thread_.JoinBatchGroup(&w);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (w.state == WriteThread::STATE_PARALLEL_FOLLOWER) {
|
|
|
|
// we are a non-leader in a parallel group
|
|
|
|
PERF_TIMER_GUARD(write_memtable_time);
|
|
|
|
|
2016-02-05 19:44:13 +01:00
|
|
|
if (!w.CallbackFailed()) {
|
|
|
|
ColumnFamilyMemTablesImpl column_family_memtables(
|
|
|
|
versions_->GetColumnFamilySet());
|
|
|
|
WriteBatchInternal::SetSequence(w.batch, w.sequence);
|
|
|
|
w.status = WriteBatchInternal::InsertInto(
|
|
|
|
w.batch, &column_family_memtables, &flush_scheduler_,
|
|
|
|
write_options.ignore_missing_column_families, 0 /*log_number*/, this,
|
|
|
|
true /*dont_filter_deletes*/, true /*concurrent_memtable_writes*/);
|
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
|
|
|
|
if (write_thread_.CompleteParallelWorker(&w)) {
|
|
|
|
// we're responsible for early exit
|
2016-02-05 19:44:13 +01:00
|
|
|
auto last_sequence = w.parallel_group->last_sequence;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
|
|
|
|
versions_->SetLastSequence(last_sequence);
|
|
|
|
write_thread_.EarlyExitParallelGroup(&w);
|
|
|
|
}
|
|
|
|
assert(w.state == WriteThread::STATE_COMPLETED);
|
|
|
|
// STATE_COMPLETED conditional below handles exit
|
2016-02-05 19:44:13 +01:00
|
|
|
|
|
|
|
status = w.FinalStatus();
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
}
|
|
|
|
if (w.state == WriteThread::STATE_COMPLETED) {
|
|
|
|
// write is complete and leader has updated sequence
|
2015-08-06 01:56:28 +02:00
|
|
|
RecordTick(stats_, WRITE_DONE_BY_OTHER);
|
2016-02-05 19:44:13 +01:00
|
|
|
return w.FinalStatus();
|
2015-08-06 01:56:28 +02:00
|
|
|
}
|
|
|
|
// else we are the leader of the write batch group
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
assert(w.state == WriteThread::STATE_GROUP_LEADER);
|
2015-08-06 01:56:28 +02:00
|
|
|
|
2014-08-12 07:10:32 +02:00
|
|
|
WriteContext context;
|
|
|
|
mutex_.Lock();
|
2015-01-24 03:04:39 +01:00
|
|
|
|
|
|
|
if (!write_options.disableWAL) {
|
|
|
|
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1);
|
|
|
|
}
|
|
|
|
|
2014-08-12 07:10:32 +02:00
|
|
|
RecordTick(stats_, WRITE_DONE_BY_SELF);
|
|
|
|
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
|
2012-03-09 01:23:21 +01:00
|
|
|
|
2014-07-04 00:47:02 +02:00
|
|
|
// Once reaches this point, the current writer "w" will try to do its write
|
|
|
|
// job. It may also pick up some of the remaining writers in the "writers_"
|
|
|
|
// when it finds suitable, and finish them in the same write batch.
|
|
|
|
// This is how a write job could be done by the other writer.
|
2014-06-07 02:26:23 +02:00
|
|
|
assert(!single_column_family_mode_ ||
|
|
|
|
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
|
|
|
|
|
2014-09-05 20:48:17 +02:00
|
|
|
uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0)
|
2014-06-03 00:33:54 +02:00
|
|
|
? 4 * max_total_in_memory_state_
|
2014-09-05 20:48:17 +02:00
|
|
|
: db_options_.max_total_wal_size;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (UNLIKELY(!single_column_family_mode_ &&
|
|
|
|
alive_log_files_.begin()->getting_flushed == false &&
|
|
|
|
total_log_size_ > max_total_wal_size)) {
|
2014-09-11 03:46:09 +02:00
|
|
|
uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number;
|
2014-04-30 20:33:40 +02:00
|
|
|
alive_log_files_.begin()->getting_flushed = true;
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
2014-06-03 00:33:54 +02:00
|
|
|
"Flushing all column families with data in WAL number %" PRIu64
|
|
|
|
". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
|
|
|
|
flush_column_family_if_log_file, total_log_size_, max_total_wal_size);
|
2014-09-11 03:46:09 +02:00
|
|
|
// no need to refcount because drop is happening in write thread, so can't
|
|
|
|
// happen while we're in the write thread
|
2014-06-07 02:26:23 +02:00
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
2015-03-20 01:04:29 +01:00
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
2014-09-11 03:46:09 +02:00
|
|
|
if (cfd->GetLogNumber() <= flush_column_family_if_log_file) {
|
2015-06-17 21:37:59 +02:00
|
|
|
status = SwitchMemtable(cfd, &context);
|
2014-09-11 03:46:09 +02:00
|
|
|
if (!status.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
2014-08-12 07:10:32 +02:00
|
|
|
cfd->imm()->FlushRequested();
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
SchedulePendingFlush(cfd);
|
2014-06-07 02:26:23 +02:00
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
}
|
2015-06-17 21:37:59 +02:00
|
|
|
MaybeScheduleFlushOrCompaction();
|
2014-12-02 21:09:20 +01:00
|
|
|
} else if (UNLIKELY(write_buffer_.ShouldFlush())) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
2015-11-25 00:53:42 +01:00
|
|
|
"Flushing column family with largest mem table size. Write buffer is "
|
|
|
|
"using %" PRIu64 " bytes out of a total of %" PRIu64 ".",
|
2014-12-02 21:09:20 +01:00
|
|
|
write_buffer_.memory_usage(), write_buffer_.buffer_size());
|
|
|
|
// no need to refcount because drop is happening in write thread, so can't
|
|
|
|
// happen while we're in the write thread
|
2015-11-25 00:53:42 +01:00
|
|
|
ColumnFamilyData* largest_cfd = nullptr;
|
|
|
|
size_t largest_cfd_size = 0;
|
|
|
|
|
2014-12-02 21:09:20 +01:00
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
2015-03-20 01:04:29 +01:00
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
2014-12-02 21:09:20 +01:00
|
|
|
if (!cfd->mem()->IsEmpty()) {
|
2015-11-25 00:53:42 +01:00
|
|
|
// We only consider active mem table, hoping immutable memtable is
|
|
|
|
// already in the process of flushing.
|
|
|
|
size_t cfd_size = cfd->mem()->ApproximateMemoryUsage();
|
|
|
|
if (largest_cfd == nullptr || cfd_size > largest_cfd_size) {
|
|
|
|
largest_cfd = cfd;
|
|
|
|
largest_cfd_size = cfd_size;
|
2014-12-02 21:09:20 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-11-25 00:53:42 +01:00
|
|
|
if (largest_cfd != nullptr) {
|
|
|
|
status = SwitchMemtable(largest_cfd, &context);
|
|
|
|
if (status.ok()) {
|
|
|
|
largest_cfd->imm()->FlushRequested();
|
|
|
|
SchedulePendingFlush(largest_cfd);
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
}
|
2014-09-11 03:46:09 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && !bg_error_.ok())) {
|
|
|
|
status = bg_error_;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
|
|
|
|
status = ScheduleFlushes(&context);
|
|
|
|
}
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
|
|
|
|
write_controller_.NeedsDelay()))) {
|
2015-06-02 11:07:58 +02:00
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
PERF_TIMER_GUARD(write_delay_time);
|
2015-05-16 00:52:51 +02:00
|
|
|
// We don't know size of curent batch so that we always use the size
|
|
|
|
// for previous one. It might create a fairness issue that expiration
|
|
|
|
// might happen for smaller writes but larger writes can go through.
|
|
|
|
// Can optimize it if it is an issue.
|
Deprecate WriteOptions::timeout_hint_us
Summary:
In one of our recent meetings, we discussed deprecating features that are not being actively used. One of those features, at least within Facebook, is timeout_hint. The feature is really nicely implemented, but if nobody needs it, we should remove it from our code-base (until we get a valid use-case). Some arguments:
* Less code == better icache hit rate, smaller builds, simpler code
* The motivation for adding timeout_hint_us was to work-around RocksDB's stall issue. However, we're currently addressing the stall issue itself (see @sdong's recent work on stall write_rate), so we should never see sharp lock-ups in the future.
* Nobody is using the feature within Facebook's code-base. Googling for `timeout_hint_us` also doesn't yield any users.
Test Plan: make check
Reviewers: anthony, kradhakrishnan, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D41937
2015-07-14 09:35:48 +02:00
|
|
|
status = DelayWrite(last_batch_group_size_);
|
2015-06-02 11:07:58 +02:00
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
2014-09-11 03:46:09 +02:00
|
|
|
}
|
|
|
|
|
2011-04-21 00:48:11 +02:00
|
|
|
uint64_t last_sequence = versions_->LastSequence();
|
2014-09-13 01:23:58 +02:00
|
|
|
WriteThread::Writer* last_writer = &w;
|
2016-02-05 19:44:13 +01:00
|
|
|
autovector<WriteThread::Writer*> write_group;
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
bool need_log_sync = !write_options.disableWAL && write_options.sync;
|
|
|
|
bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
|
2015-05-29 23:36:35 +02:00
|
|
|
|
2014-08-12 07:10:32 +02:00
|
|
|
if (status.ok()) {
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
if (need_log_sync) {
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 12:28:08 +02:00
|
|
|
while (logs_.front().getting_synced) {
|
|
|
|
log_sync_cv_.Wait();
|
|
|
|
}
|
|
|
|
for (auto& log : logs_) {
|
|
|
|
assert(!log.getting_synced);
|
|
|
|
log.getting_synced = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-03-09 01:23:21 +01:00
|
|
|
// Add to log and apply to memtable. We can release the lock
|
|
|
|
// during this phase since &w is currently responsible for logging
|
|
|
|
// and protects against concurrent loggers and concurrent writes
|
2014-01-28 20:05:04 +01:00
|
|
|
// into memtables
|
2015-05-29 23:36:35 +02:00
|
|
|
}
|
|
|
|
|
2016-02-05 19:44:13 +01:00
|
|
|
mutex_.Unlock();
|
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
// At this point the mutex is unlocked
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
bool exit_completed_early = false;
|
2016-02-05 19:44:13 +01:00
|
|
|
last_batch_group_size_ =
|
|
|
|
write_thread_.EnterAsBatchGroupLeader(&w, &last_writer, &write_group);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
if (status.ok()) {
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
// Rules for when we can update the memtable concurrently
|
|
|
|
// 1. supported by memtable
|
|
|
|
// 2. Puts are not okay if inplace_update_support
|
|
|
|
// 3. Deletes or SingleDeletes are not okay if filtering deletes
|
|
|
|
// (controlled by both batch and memtable setting)
|
|
|
|
// 4. Merges are not okay
|
|
|
|
//
|
|
|
|
// Rules 1..3 are enforced by checking the options
|
|
|
|
// during startup (CheckConcurrentWritesSupported), so if
|
|
|
|
// options.allow_concurrent_memtable_write is true then they can be
|
|
|
|
// assumed to be true. Rule 4 is checked for each batch. We could
|
|
|
|
// relax rules 2 and 3 if we could prevent write batches from referring
|
|
|
|
// more than once to a particular key.
|
2016-02-05 19:44:13 +01:00
|
|
|
bool parallel =
|
|
|
|
db_options_.allow_concurrent_memtable_write && write_group.size() > 1;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
int total_count = 0;
|
|
|
|
uint64_t total_byte_size = 0;
|
2016-02-05 19:44:13 +01:00
|
|
|
for (auto writer : write_group) {
|
|
|
|
if (writer->CheckCallback(this)) {
|
|
|
|
total_count += WriteBatchInternal::Count(writer->batch);
|
|
|
|
total_byte_size = WriteBatchInternal::AppendedByteSize(
|
|
|
|
total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
|
|
|
|
parallel = parallel && !writer->batch->HasMerge();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (total_count == 0) {
|
|
|
|
write_thread_.ExitAsBatchGroupLeader(&w, last_writer, status);
|
|
|
|
return w.FinalStatus();
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
const SequenceNumber current_sequence = last_sequence + 1;
|
|
|
|
last_sequence += total_count;
|
|
|
|
|
|
|
|
// Record statistics
|
|
|
|
RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
|
|
|
|
RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
|
2016-02-01 03:09:24 +01:00
|
|
|
MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
2014-01-14 23:49:31 +01:00
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (write_options.disableWAL) {
|
2016-02-09 20:20:22 +01:00
|
|
|
has_unpersisted_data_ = true;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
}
|
2015-11-06 16:29:10 +01:00
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
uint64_t log_size = 0;
|
|
|
|
if (!write_options.disableWAL) {
|
|
|
|
PERF_TIMER_GUARD(write_wal_time);
|
2015-11-06 16:29:10 +01:00
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
WriteBatch* merged_batch = nullptr;
|
2016-02-05 19:44:13 +01:00
|
|
|
if (write_group.size() == 1) {
|
|
|
|
merged_batch = write_group[0]->batch;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
} else {
|
|
|
|
// WAL needs all of the batches flattened into a single batch.
|
|
|
|
// We could avoid copying here with an iov-like AddRecord
|
|
|
|
// interface
|
|
|
|
merged_batch = &tmp_batch_;
|
2016-02-05 19:44:13 +01:00
|
|
|
for (auto writer : write_group) {
|
|
|
|
if (!writer->CallbackFailed()) {
|
|
|
|
WriteBatchInternal::Append(merged_batch, writer->batch);
|
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
WriteBatchInternal::SetSequence(merged_batch, current_sequence);
|
|
|
|
|
|
|
|
assert(WriteBatchInternal::Count(merged_batch) == total_count);
|
|
|
|
assert(WriteBatchInternal::ByteSize(merged_batch) == total_byte_size);
|
|
|
|
|
|
|
|
Slice log_entry = WriteBatchInternal::Contents(merged_batch);
|
|
|
|
status = logs_.back().writer->AddRecord(log_entry);
|
|
|
|
total_log_size_ += log_entry.size();
|
|
|
|
alive_log_files_.back().AddSize(log_entry.size());
|
|
|
|
log_empty_ = false;
|
|
|
|
log_size = log_entry.size();
|
|
|
|
RecordTick(stats_, WAL_FILE_BYTES, log_size);
|
|
|
|
if (status.ok() && need_log_sync) {
|
|
|
|
RecordTick(stats_, WAL_FILE_SYNCED);
|
|
|
|
StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
|
|
|
|
// It's safe to access logs_ with unlocked mutex_ here because:
|
|
|
|
// - we've set getting_synced=true for all logs,
|
|
|
|
// so other threads won't pop from logs_ while we're here,
|
|
|
|
// - only writer thread can push to logs_, and we're in
|
|
|
|
// writer thread, so no one will push to logs_,
|
|
|
|
// - as long as other threads don't modify it, it's safe to read
|
|
|
|
// from std::deque from multiple threads concurrently.
|
|
|
|
for (auto& log : logs_) {
|
|
|
|
status = log.writer->file()->Sync(db_options_.use_fsync);
|
|
|
|
if (!status.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (status.ok() && need_log_dir_sync) {
|
|
|
|
// We only sync WAL directory the first time WAL syncing is
|
|
|
|
// requested, so that in case users never turn on WAL sync,
|
|
|
|
// we can avoid the disk I/O in the write code path.
|
|
|
|
status = directories_.GetWalDir()->Fsync();
|
|
|
|
}
|
2013-07-24 19:01:13 +02:00
|
|
|
}
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (merged_batch == &tmp_batch_) {
|
|
|
|
tmp_batch_.Clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (status.ok()) {
|
|
|
|
PERF_TIMER_GUARD(write_memtable_time);
|
2015-11-06 16:29:10 +01:00
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
{
|
|
|
|
// Update stats while we are an exclusive group leader, so we know
|
|
|
|
// that nobody else can be writing to these particular stats.
|
|
|
|
// We're optimistic, updating the stats before we successfully
|
|
|
|
// commit. That lets us release our leader status early in
|
|
|
|
// some cases.
|
|
|
|
auto stats = default_cf_internal_stats_;
|
|
|
|
stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size);
|
|
|
|
stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count);
|
|
|
|
if (!write_options.disableWAL) {
|
|
|
|
if (write_options.sync) {
|
|
|
|
stats->AddDBStats(InternalStats::WAL_FILE_SYNCED, 1);
|
2015-11-06 16:29:10 +01:00
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size);
|
2015-11-06 16:29:10 +01:00
|
|
|
}
|
2016-02-05 19:44:13 +01:00
|
|
|
uint64_t for_other = write_group.size() - 1;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (for_other > 0) {
|
|
|
|
stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, for_other);
|
|
|
|
if (!write_options.disableWAL) {
|
|
|
|
stats->AddDBStats(InternalStats::WRITE_WITH_WAL, for_other);
|
2015-01-26 22:59:38 +01:00
|
|
|
}
|
2012-07-05 22:39:28 +02:00
|
|
|
}
|
2011-09-01 21:08:02 +02:00
|
|
|
}
|
2014-06-19 23:53:03 +02:00
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (!parallel) {
|
2014-01-28 20:05:04 +01:00
|
|
|
status = WriteBatchInternal::InsertInto(
|
2016-02-05 19:44:13 +01:00
|
|
|
write_group, current_sequence, column_family_memtables_.get(),
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
&flush_scheduler_, write_options.ignore_missing_column_families,
|
|
|
|
0 /*log_number*/, this, false /*dont_filter_deletes*/);
|
2016-02-05 19:44:13 +01:00
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
// There were no write failures. Set leader's status
|
|
|
|
// in case the write callback returned a non-ok status.
|
|
|
|
status = w.FinalStatus();
|
|
|
|
}
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
} else {
|
2015-12-30 14:49:06 +01:00
|
|
|
WriteThread::ParallelGroup pg;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
pg.leader = &w;
|
|
|
|
pg.last_writer = last_writer;
|
2016-02-05 19:44:13 +01:00
|
|
|
pg.last_sequence = last_sequence;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
pg.early_exit_allowed = !need_log_sync;
|
2016-02-05 19:44:13 +01:00
|
|
|
pg.running.store(static_cast<uint32_t>(write_group.size()),
|
2015-12-28 18:50:49 +01:00
|
|
|
std::memory_order_relaxed);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
write_thread_.LaunchParallelFollowers(&pg, current_sequence);
|
|
|
|
|
2016-02-05 19:44:13 +01:00
|
|
|
if (!w.CallbackFailed()) {
|
|
|
|
// do leader write
|
|
|
|
ColumnFamilyMemTablesImpl column_family_memtables(
|
|
|
|
versions_->GetColumnFamilySet());
|
|
|
|
assert(w.sequence == current_sequence);
|
|
|
|
WriteBatchInternal::SetSequence(w.batch, w.sequence);
|
|
|
|
w.status = WriteBatchInternal::InsertInto(
|
|
|
|
w.batch, &column_family_memtables, &flush_scheduler_,
|
|
|
|
write_options.ignore_missing_column_families, 0 /*log_number*/,
|
|
|
|
this, true /*dont_filter_deletes*/,
|
|
|
|
true /*concurrent_memtable_writes*/);
|
|
|
|
}
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
// CompleteParallelWorker returns true if this thread should
|
|
|
|
// handle exit, false means somebody else did
|
|
|
|
exit_completed_early = !write_thread_.CompleteParallelWorker(&w);
|
2016-02-05 19:44:13 +01:00
|
|
|
status = w.FinalStatus();
|
2013-09-27 01:04:12 +02:00
|
|
|
}
|
2015-05-29 23:36:35 +02:00
|
|
|
|
2016-02-05 19:44:13 +01:00
|
|
|
if (!exit_completed_early && w.status.ok()) {
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
|
|
|
|
versions_->SetLastSequence(last_sequence);
|
|
|
|
if (!need_log_sync) {
|
2016-02-05 19:44:13 +01:00
|
|
|
write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
exit_completed_early = true;
|
2015-09-22 01:11:26 +02:00
|
|
|
}
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
|
|
|
|
// A non-OK status here indicates that the state implied by the
|
|
|
|
// WAL has diverged from the in-memory state. This could be
|
|
|
|
// because of a corrupt write_batch (very bad), or because the
|
|
|
|
// client specified an invalid column family and didn't specify
|
|
|
|
// ignore_missing_column_families.
|
|
|
|
//
|
|
|
|
// Is setting bg_error_ enough here? This will at least stop
|
|
|
|
// compaction and fail any further writes.
|
2016-02-05 19:44:13 +01:00
|
|
|
if (!status.ok() && bg_error_.ok() && !w.CallbackFailed()) {
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
bg_error_ = status;
|
2011-09-01 21:08:02 +02:00
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
}
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 12:28:08 +02:00
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
2015-05-29 23:36:35 +02:00
|
|
|
|
2016-02-05 19:44:13 +01:00
|
|
|
if (db_options_.paranoid_checks && !status.ok() && !w.CallbackFailed() &&
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
!status.IsBusy()) {
|
|
|
|
mutex_.Lock();
|
|
|
|
if (bg_error_.ok()) {
|
|
|
|
bg_error_ = status; // stop compaction & fail any further writes
|
|
|
|
}
|
|
|
|
mutex_.Unlock();
|
2013-10-28 20:36:02 +01:00
|
|
|
}
|
2012-03-09 01:23:21 +01:00
|
|
|
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
if (need_log_sync) {
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
mutex_.Lock();
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
mutex_.Unlock();
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 12:28:08 +02:00
|
|
|
}
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (!exit_completed_early) {
|
2016-02-05 19:44:13 +01:00
|
|
|
write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status);
|
2015-08-06 01:56:28 +02:00
|
|
|
}
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
// REQUIRES: mutex_ is held
|
|
|
|
// REQUIRES: this thread is currently at the front of the writer queue
|
Deprecate WriteOptions::timeout_hint_us
Summary:
In one of our recent meetings, we discussed deprecating features that are not being actively used. One of those features, at least within Facebook, is timeout_hint. The feature is really nicely implemented, but if nobody needs it, we should remove it from our code-base (until we get a valid use-case). Some arguments:
* Less code == better icache hit rate, smaller builds, simpler code
* The motivation for adding timeout_hint_us was to work-around RocksDB's stall issue. However, we're currently addressing the stall issue itself (see @sdong's recent work on stall write_rate), so we should never see sharp lock-ups in the future.
* Nobody is using the feature within Facebook's code-base. Googling for `timeout_hint_us` also doesn't yield any users.
Test Plan: make check
Reviewers: anthony, kradhakrishnan, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D41937
2015-07-14 09:35:48 +02:00
|
|
|
Status DBImpl::DelayWrite(uint64_t num_bytes) {
|
DB Stats Dump to print total stall time
Summary:
Add printing of stall time in DB Stats:
Sample outputs:
** DB Stats **
Uptime(secs): 53.2 total, 1.7 interval
Cumulative writes: 625940 writes, 625939 keys, 625940 batches, 1.0 writes per batch, 0.49 GB user ingest, stall micros: 50691070
Cumulative WAL: 625940 writes, 625939 syncs, 1.00 writes per sync, 0.49 GB written
Interval writes: 10859 writes, 10859 keys, 10859 batches, 1.0 writes per batch, 8.7 MB user ingest, stall micros: 1692319
Interval WAL: 10859 writes, 10859 syncs, 1.00 writes per sync, 0.01 MB written
Test Plan:
make all check
verify printing using db_bench
Reviewers: igor, yhchiang, rven, MarkCallaghan
Reviewed By: MarkCallaghan
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D31239
2015-01-09 02:51:08 +01:00
|
|
|
uint64_t time_delayed = 0;
|
|
|
|
bool delayed = false;
|
|
|
|
{
|
|
|
|
StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed);
|
2015-05-16 00:52:51 +02:00
|
|
|
auto delay = write_controller_.GetDelay(env_, num_bytes);
|
|
|
|
if (delay > 0) {
|
DB Stats Dump to print total stall time
Summary:
Add printing of stall time in DB Stats:
Sample outputs:
** DB Stats **
Uptime(secs): 53.2 total, 1.7 interval
Cumulative writes: 625940 writes, 625939 keys, 625940 batches, 1.0 writes per batch, 0.49 GB user ingest, stall micros: 50691070
Cumulative WAL: 625940 writes, 625939 syncs, 1.00 writes per sync, 0.49 GB written
Interval writes: 10859 writes, 10859 keys, 10859 batches, 1.0 writes per batch, 8.7 MB user ingest, stall micros: 1692319
Interval WAL: 10859 writes, 10859 syncs, 1.00 writes per sync, 0.01 MB written
Test Plan:
make all check
verify printing using db_bench
Reviewers: igor, yhchiang, rven, MarkCallaghan
Reviewed By: MarkCallaghan
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D31239
2015-01-09 02:51:08 +01:00
|
|
|
mutex_.Unlock();
|
|
|
|
delayed = true;
|
2015-03-30 23:04:21 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
|
Deprecate WriteOptions::timeout_hint_us
Summary:
In one of our recent meetings, we discussed deprecating features that are not being actively used. One of those features, at least within Facebook, is timeout_hint. The feature is really nicely implemented, but if nobody needs it, we should remove it from our code-base (until we get a valid use-case). Some arguments:
* Less code == better icache hit rate, smaller builds, simpler code
* The motivation for adding timeout_hint_us was to work-around RocksDB's stall issue. However, we're currently addressing the stall issue itself (see @sdong's recent work on stall write_rate), so we should never see sharp lock-ups in the future.
* Nobody is using the feature within Facebook's code-base. Googling for `timeout_hint_us` also doesn't yield any users.
Test Plan: make check
Reviewers: anthony, kradhakrishnan, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D41937
2015-07-14 09:35:48 +02:00
|
|
|
// hopefully we don't have to sleep more than 2 billion microseconds
|
|
|
|
env_->SleepForMicroseconds(static_cast<int>(delay));
|
DB Stats Dump to print total stall time
Summary:
Add printing of stall time in DB Stats:
Sample outputs:
** DB Stats **
Uptime(secs): 53.2 total, 1.7 interval
Cumulative writes: 625940 writes, 625939 keys, 625940 batches, 1.0 writes per batch, 0.49 GB user ingest, stall micros: 50691070
Cumulative WAL: 625940 writes, 625939 syncs, 1.00 writes per sync, 0.49 GB written
Interval writes: 10859 writes, 10859 keys, 10859 batches, 1.0 writes per batch, 8.7 MB user ingest, stall micros: 1692319
Interval WAL: 10859 writes, 10859 syncs, 1.00 writes per sync, 0.01 MB written
Test Plan:
make all check
verify printing using db_bench
Reviewers: igor, yhchiang, rven, MarkCallaghan
Reviewed By: MarkCallaghan
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D31239
2015-01-09 02:51:08 +01:00
|
|
|
mutex_.Lock();
|
|
|
|
}
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
|
Deprecate WriteOptions::timeout_hint_us
Summary:
In one of our recent meetings, we discussed deprecating features that are not being actively used. One of those features, at least within Facebook, is timeout_hint. The feature is really nicely implemented, but if nobody needs it, we should remove it from our code-base (until we get a valid use-case). Some arguments:
* Less code == better icache hit rate, smaller builds, simpler code
* The motivation for adding timeout_hint_us was to work-around RocksDB's stall issue. However, we're currently addressing the stall issue itself (see @sdong's recent work on stall write_rate), so we should never see sharp lock-ups in the future.
* Nobody is using the feature within Facebook's code-base. Googling for `timeout_hint_us` also doesn't yield any users.
Test Plan: make check
Reviewers: anthony, kradhakrishnan, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D41937
2015-07-14 09:35:48 +02:00
|
|
|
while (bg_error_.ok() && write_controller_.IsStopped()) {
|
DB Stats Dump to print total stall time
Summary:
Add printing of stall time in DB Stats:
Sample outputs:
** DB Stats **
Uptime(secs): 53.2 total, 1.7 interval
Cumulative writes: 625940 writes, 625939 keys, 625940 batches, 1.0 writes per batch, 0.49 GB user ingest, stall micros: 50691070
Cumulative WAL: 625940 writes, 625939 syncs, 1.00 writes per sync, 0.49 GB written
Interval writes: 10859 writes, 10859 keys, 10859 batches, 1.0 writes per batch, 8.7 MB user ingest, stall micros: 1692319
Interval WAL: 10859 writes, 10859 syncs, 1.00 writes per sync, 0.01 MB written
Test Plan:
make all check
verify printing using db_bench
Reviewers: igor, yhchiang, rven, MarkCallaghan
Reviewed By: MarkCallaghan
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D31239
2015-01-09 02:51:08 +01:00
|
|
|
delayed = true;
|
Deprecate WriteOptions::timeout_hint_us
Summary:
In one of our recent meetings, we discussed deprecating features that are not being actively used. One of those features, at least within Facebook, is timeout_hint. The feature is really nicely implemented, but if nobody needs it, we should remove it from our code-base (until we get a valid use-case). Some arguments:
* Less code == better icache hit rate, smaller builds, simpler code
* The motivation for adding timeout_hint_us was to work-around RocksDB's stall issue. However, we're currently addressing the stall issue itself (see @sdong's recent work on stall write_rate), so we should never see sharp lock-ups in the future.
* Nobody is using the feature within Facebook's code-base. Googling for `timeout_hint_us` also doesn't yield any users.
Test Plan: make check
Reviewers: anthony, kradhakrishnan, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D41937
2015-07-14 09:35:48 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
|
|
|
|
bg_cv_.Wait();
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
}
|
|
|
|
}
|
DB Stats Dump to print total stall time
Summary:
Add printing of stall time in DB Stats:
Sample outputs:
** DB Stats **
Uptime(secs): 53.2 total, 1.7 interval
Cumulative writes: 625940 writes, 625939 keys, 625940 batches, 1.0 writes per batch, 0.49 GB user ingest, stall micros: 50691070
Cumulative WAL: 625940 writes, 625939 syncs, 1.00 writes per sync, 0.49 GB written
Interval writes: 10859 writes, 10859 keys, 10859 batches, 1.0 writes per batch, 8.7 MB user ingest, stall micros: 1692319
Interval WAL: 10859 writes, 10859 syncs, 1.00 writes per sync, 0.01 MB written
Test Plan:
make all check
verify printing using db_bench
Reviewers: igor, yhchiang, rven, MarkCallaghan
Reviewed By: MarkCallaghan
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D31239
2015-01-09 02:51:08 +01:00
|
|
|
if (delayed) {
|
|
|
|
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS,
|
|
|
|
time_delayed);
|
|
|
|
RecordTick(stats_, STALL_MICROS, time_delayed);
|
|
|
|
}
|
2014-10-13 23:25:55 +02:00
|
|
|
|
|
|
|
return bg_error_;
|
2013-08-01 01:20:48 +02:00
|
|
|
}
|
|
|
|
|
2014-09-11 03:46:09 +02:00
|
|
|
Status DBImpl::ScheduleFlushes(WriteContext* context) {
|
|
|
|
ColumnFamilyData* cfd;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
|
2015-06-17 21:37:59 +02:00
|
|
|
auto status = SwitchMemtable(cfd, context);
|
2014-09-11 03:46:09 +02:00
|
|
|
if (cfd->Unref()) {
|
|
|
|
delete cfd;
|
|
|
|
}
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
2014-08-12 07:10:32 +02:00
|
|
|
}
|
|
|
|
}
|
2014-09-11 03:46:09 +02:00
|
|
|
return Status::OK();
|
2014-08-12 07:10:32 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: mutex_ is held
|
|
|
|
// REQUIRES: this thread is currently at the front of the writer queue
|
2015-06-17 21:37:59 +02:00
|
|
|
Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
2014-08-12 07:10:32 +02:00
|
|
|
mutex_.AssertHeld();
|
|
|
|
unique_ptr<WritableFile> lfile;
|
|
|
|
log::Writer* new_log = nullptr;
|
|
|
|
MemTable* new_mem = nullptr;
|
|
|
|
|
|
|
|
// Attempt to switch to a new memtable and trigger flush of old.
|
|
|
|
// Do this without holding the dbmutex lock.
|
2014-11-04 02:45:55 +01:00
|
|
|
assert(versions_->prev_log_number() == 0);
|
2014-08-12 07:10:32 +02:00
|
|
|
bool creating_new_log = !log_empty_;
|
2015-10-01 15:12:04 +02:00
|
|
|
uint64_t recycle_log_number = 0;
|
|
|
|
if (creating_new_log && db_options_.recycle_log_file_num &&
|
|
|
|
!log_recycle_files.empty()) {
|
|
|
|
recycle_log_number = log_recycle_files.front();
|
|
|
|
log_recycle_files.pop_front();
|
|
|
|
}
|
2014-08-12 07:10:32 +02:00
|
|
|
uint64_t new_log_number =
|
|
|
|
creating_new_log ? versions_->NewFileNumber() : logfile_number_;
|
|
|
|
SuperVersion* new_superversion = nullptr;
|
2014-09-17 21:49:13 +02:00
|
|
|
const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
|
2014-08-12 07:10:32 +02:00
|
|
|
mutex_.Unlock();
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
if (creating_new_log) {
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
EnvOptions opt_env_opt =
|
|
|
|
env_->OptimizeForLogWrite(env_options_, db_options_);
|
2015-10-01 15:12:04 +02:00
|
|
|
if (recycle_log_number) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"reusing log %" PRIu64 " from recycle list\n", recycle_log_number);
|
|
|
|
s = env_->ReuseWritableFile(
|
|
|
|
LogFileName(db_options_.wal_dir, new_log_number),
|
|
|
|
LogFileName(db_options_.wal_dir, recycle_log_number), &lfile,
|
|
|
|
opt_env_opt);
|
|
|
|
} else {
|
2015-10-29 23:52:32 +01:00
|
|
|
s = NewWritableFile(env_,
|
|
|
|
LogFileName(db_options_.wal_dir, new_log_number),
|
|
|
|
&lfile, opt_env_opt);
|
2015-10-01 15:12:04 +02:00
|
|
|
}
|
2014-08-12 07:10:32 +02:00
|
|
|
if (s.ok()) {
|
|
|
|
// Our final size should be less than write_buffer_size
|
|
|
|
// (compression, etc) but err on the side of caution.
|
2015-11-19 20:47:12 +01:00
|
|
|
lfile->SetPreallocationBlockSize(
|
|
|
|
mutable_cf_options.write_buffer_size / 10 +
|
|
|
|
mutable_cf_options.write_buffer_size);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
unique_ptr<WritableFileWriter> file_writer(
|
|
|
|
new WritableFileWriter(std::move(lfile), opt_env_opt));
|
2015-10-29 23:52:32 +01:00
|
|
|
new_log = new log::Writer(std::move(file_writer), new_log_number,
|
|
|
|
db_options_.recycle_log_file_num > 0);
|
2014-04-15 18:57:25 +02:00
|
|
|
}
|
2014-08-12 07:10:32 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
2015-05-29 23:36:35 +02:00
|
|
|
SequenceNumber seq = versions_->LastSequence();
|
|
|
|
new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
|
2014-08-12 07:10:32 +02:00
|
|
|
new_superversion = new SuperVersion();
|
|
|
|
}
|
|
|
|
}
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
2014-10-15 19:56:50 +02:00
|
|
|
"[%s] New memtable created with log file: #%" PRIu64 "\n",
|
|
|
|
cfd->GetName().c_str(), new_log_number);
|
2014-08-12 07:10:32 +02:00
|
|
|
mutex_.Lock();
|
|
|
|
if (!s.ok()) {
|
|
|
|
// how do we fail if we're not creating new log?
|
|
|
|
assert(creating_new_log);
|
|
|
|
assert(!new_mem);
|
|
|
|
assert(!new_log);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
if (creating_new_log) {
|
|
|
|
logfile_number_ = new_log_number;
|
|
|
|
assert(new_log != nullptr);
|
|
|
|
log_empty_ = true;
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
log_dir_synced_ = false;
|
2015-08-06 05:51:27 +02:00
|
|
|
logs_.emplace_back(logfile_number_, new_log);
|
2014-08-12 07:10:32 +02:00
|
|
|
alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
|
2014-10-31 19:59:54 +01:00
|
|
|
for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
|
2014-08-12 07:10:32 +02:00
|
|
|
// all this is just optimization to delete logs that
|
|
|
|
// are no longer needed -- if CF is empty, that means it
|
|
|
|
// doesn't need that particular log to stay alive, so we just
|
|
|
|
// advance the log number. no need to persist this in the manifest
|
2014-10-31 19:59:54 +01:00
|
|
|
if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
loop_cfd->imm()->NumNotFlushed() == 0) {
|
2014-10-31 19:59:54 +01:00
|
|
|
loop_cfd->SetLogNumber(logfile_number_);
|
2013-09-05 02:24:35 +02:00
|
|
|
}
|
2011-04-12 21:38:58 +02:00
|
|
|
}
|
|
|
|
}
|
2014-08-12 07:10:32 +02:00
|
|
|
cfd->mem()->SetNextLogNumber(logfile_number_);
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
|
2014-08-12 07:10:32 +02:00
|
|
|
new_mem->Ref();
|
|
|
|
cfd->SetMemtable(new_mem);
|
2015-06-17 21:37:59 +02:00
|
|
|
context->superversions_to_free_.push_back(InstallSuperVersionAndScheduleWork(
|
|
|
|
cfd, new_superversion, mutable_cf_options));
|
2011-04-12 21:38:58 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-04-15 22:39:26 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
2014-02-15 02:02:10 +01:00
|
|
|
Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
|
|
|
TablePropertiesCollection* props) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
|
2014-02-14 01:28:21 +01:00
|
|
|
// Increment the ref count
|
|
|
|
mutex_.Lock();
|
2014-02-15 02:02:10 +01:00
|
|
|
auto version = cfd->current();
|
2014-02-14 01:28:21 +01:00
|
|
|
version->Ref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
auto s = version->GetPropertiesOfAllTables(props);
|
|
|
|
|
|
|
|
// Decrement the ref count
|
|
|
|
mutex_.Lock();
|
|
|
|
version->Unref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
2015-10-13 23:24:45 +02:00
|
|
|
|
|
|
|
Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
|
2015-10-19 19:34:55 +02:00
|
|
|
const Range* range, std::size_t n,
|
2015-10-13 23:24:45 +02:00
|
|
|
TablePropertiesCollection* props) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
|
|
|
|
// Increment the ref count
|
|
|
|
mutex_.Lock();
|
|
|
|
auto version = cfd->current();
|
|
|
|
version->Ref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
auto s = version->GetPropertiesOfTablesInRange(range, n, props);
|
|
|
|
|
|
|
|
// Decrement the ref count
|
|
|
|
mutex_.Lock();
|
|
|
|
version->Unref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-04-15 22:39:26 +02:00
|
|
|
#endif // ROCKSDB_LITE
|
2014-02-14 01:28:21 +01:00
|
|
|
|
[RocksDB] BackupableDB
Summary:
In this diff I present you BackupableDB v1. You can easily use it to backup your DB and it will do incremental snapshots for you.
Let's first describe how you would use BackupableDB. It's inheriting StackableDB interface so you can easily construct it with your DB object -- it will add a method RollTheSnapshot() to the DB object. When you call RollTheSnapshot(), current snapshot of the DB will be stored in the backup dir. To restore, you can just call RestoreDBFromBackup() on a BackupableDB (which is a static method) and it will restore all files from the backup dir. In the next version, it will even support automatic backuping every X minutes.
There are multiple things you can configure:
1. backup_env and db_env can be different, which is awesome because then you can easily backup to HDFS or wherever you feel like.
2. sync - if true, it *guarantees* backup consistency on machine reboot
3. number of snapshots to keep - this will keep last N snapshots around if you want, for some reason, be able to restore from an earlier snapshot. All the backuping is done in incremental fashion - if we already have 00010.sst, we will not copy it again. *IMPORTANT* -- This is based on assumption that 00010.sst never changes - two files named 00010.sst from the same DB will always be exactly the same. Is this true? I always copy manifest, current and log files.
4. You can decide if you want to flush the memtables before you backup, or you're fine with backing up the log files -- either way, you get a complete and consistent view of the database at a time of backup.
5. More things you can find in BackupableDBOptions
Here is the directory structure I use:
backup_dir/CURRENT_SNAPSHOT - just 4 bytes holding the latest snapshot
0, 1, 2, ... - files containing serialized version of each snapshot - containing a list of files
files/*.sst - sst files shared between snapshots - if one snapshot references 00010.sst and another one needs to backup it from the DB, it will just reference the same file
files/ 0/, 1/, 2/, ... - snapshot directories containing private snapshot files - current, manifest and log files
All the files are ref counted and deleted immediatelly when they get out of scope.
Some other stuff in this diff:
1. Added GetEnv() method to the DB. Discussed with @haobo and we agreed that it seems right thing to do.
2. Fixed StackableDB interface. The way it was set up before, I was not able to implement BackupableDB.
Test Plan:
I have a unittest, but please don't look at this yet. I just hacked it up to help me with debugging. I will write a lot of good tests and update the diff.
Also, `make asan_check`
Reviewers: dhruba, haobo, emayanke
Reviewed By: dhruba
CC: leveldb, haobo
Differential Revision: https://reviews.facebook.net/D14295
2013-12-09 23:06:52 +01:00
|
|
|
const std::string& DBImpl::GetName() const {
|
|
|
|
return dbname_;
|
|
|
|
}
|
|
|
|
|
2013-11-25 21:39:23 +01:00
|
|
|
Env* DBImpl::GetEnv() const {
|
|
|
|
return env_;
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
const Options& DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
2014-03-11 22:52:17 +01:00
|
|
|
return *cfh->cfd()->options();
|
2013-11-26 00:51:50 +01:00
|
|
|
}
|
|
|
|
|
2015-05-11 23:51:51 +02:00
|
|
|
const DBOptions& DBImpl::GetDBOptions() const { return db_options_; }
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
const Slice& property, std::string* value) {
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
const DBPropertyInfo* property_info = GetPropertyInfo(property);
|
2011-04-12 21:38:58 +02:00
|
|
|
value->clear();
|
2015-11-04 00:54:18 +01:00
|
|
|
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
if (property_info == nullptr) {
|
|
|
|
return false;
|
|
|
|
} else if (property_info->handle_int) {
|
2014-08-05 20:27:34 +02:00
|
|
|
uint64_t int_value;
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
bool ret_value =
|
|
|
|
GetIntPropertyInternal(cfd, *property_info, false, &int_value);
|
2014-08-05 20:27:34 +02:00
|
|
|
if (ret_value) {
|
2014-11-25 05:44:49 +01:00
|
|
|
*value = ToString(int_value);
|
2014-08-05 20:27:34 +02:00
|
|
|
}
|
|
|
|
return ret_value;
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
} else if (property_info->handle_string) {
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
return cfd->internal_stats()->GetStringProperty(*property_info, property,
|
2014-08-05 20:27:34 +02:00
|
|
|
value);
|
|
|
|
}
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
// Shouldn't reach here since exactly one of handle_string and handle_int
|
|
|
|
// should be non-nullptr.
|
|
|
|
assert(false);
|
|
|
|
return false;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-07-29 00:28:53 +02:00
|
|
|
bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& property, uint64_t* value) {
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
const DBPropertyInfo* property_info = GetPropertyInfo(property);
|
|
|
|
if (property_info == nullptr || property_info->handle_int == nullptr) {
|
2014-08-05 20:27:34 +02:00
|
|
|
return false;
|
|
|
|
}
|
2015-11-04 00:54:18 +01:00
|
|
|
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
return GetIntPropertyInternal(cfd, *property_info, false, value);
|
2014-08-05 20:27:34 +02:00
|
|
|
}
|
|
|
|
|
2015-11-04 00:54:18 +01:00
|
|
|
bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
const DBPropertyInfo& property_info,
|
|
|
|
bool is_locked, uint64_t* value) {
|
|
|
|
assert(property_info.handle_int != nullptr);
|
|
|
|
if (!property_info.need_out_of_mutex) {
|
2015-11-04 00:54:18 +01:00
|
|
|
if (is_locked) {
|
|
|
|
mutex_.AssertHeld();
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
return cfd->internal_stats()->GetIntProperty(property_info, value, this);
|
2015-11-04 00:54:18 +01:00
|
|
|
} else {
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
return cfd->internal_stats()->GetIntProperty(property_info, value, this);
|
2015-11-04 00:54:18 +01:00
|
|
|
}
|
2014-08-05 20:27:34 +02:00
|
|
|
} else {
|
2015-11-04 00:54:18 +01:00
|
|
|
SuperVersion* sv = nullptr;
|
|
|
|
if (!is_locked) {
|
|
|
|
sv = GetAndRefSuperVersion(cfd);
|
|
|
|
} else {
|
|
|
|
sv = cfd->GetSuperVersion();
|
|
|
|
}
|
2014-08-05 20:27:34 +02:00
|
|
|
|
|
|
|
bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
property_info, sv->current, value);
|
2014-08-05 20:27:34 +02:00
|
|
|
|
2015-11-04 00:54:18 +01:00
|
|
|
if (!is_locked) {
|
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
|
|
}
|
2014-08-05 20:27:34 +02:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-11-04 00:54:18 +01:00
|
|
|
bool DBImpl::GetAggregatedIntProperty(const Slice& property,
|
|
|
|
uint64_t* aggregated_value) {
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
const DBPropertyInfo* property_info = GetPropertyInfo(property);
|
|
|
|
if (property_info == nullptr || property_info->handle_int == nullptr) {
|
2015-11-04 00:54:18 +01:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t sum = 0;
|
|
|
|
{
|
|
|
|
// Needs mutex to protect the list of column families.
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
uint64_t value;
|
|
|
|
for (auto* cfd : *versions_->GetColumnFamilySet()) {
|
Eliminate duplicated property constants
Summary:
Before this diff, there were duplicated constants to refer to properties (user-
facing API had strings and InternalStats had an enum). I noticed these were
inconsistent in terms of which constants are provided, names of constants, and
documentation of constants. Overall it seemed annoying/error-prone to maintain
these duplicated constants.
So, this diff gets rid of InternalStats's constants and replaces them with a map
keyed on the user-facing constant. The value in that map contains a function
pointer to get the property value, so we don't need to do string matching while
holding db->mutex_. This approach has a side benefit of making many small
handler functions rather than a giant switch-statement.
Test Plan: db_properties_test passes, running "make commit-prereq -j32"
Reviewers: sdong, yhchiang, kradhakrishnan, IslamAbdelRahman, rven, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D53253
2016-02-03 04:14:56 +01:00
|
|
|
if (GetIntPropertyInternal(cfd, *property_info, true, &value)) {
|
2015-11-04 00:54:18 +01:00
|
|
|
sum += value;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*aggregated_value = sum;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-08-05 20:27:34 +02:00
|
|
|
SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
|
|
|
|
// TODO(ljin): consider using GetReferencedSuperVersion() directly
|
2014-09-24 22:12:16 +02:00
|
|
|
return cfd->GetThreadLocalSuperVersion(&mutex_);
|
2014-08-05 20:27:34 +02:00
|
|
|
}
|
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
// REQUIRED: this function should only be called on the write thread or if the
|
|
|
|
// mutex is held.
|
|
|
|
SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
|
|
|
|
auto column_family_set = versions_->GetColumnFamilySet();
|
|
|
|
auto cfd = column_family_set->GetColumnFamily(column_family_id);
|
|
|
|
if (!cfd) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
return GetAndRefSuperVersion(cfd);
|
|
|
|
}
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 02:37:33 +02:00
|
|
|
// REQUIRED: mutex is NOT held
|
|
|
|
SuperVersion* DBImpl::GetAndRefSuperVersionUnlocked(uint32_t column_family_id) {
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
auto column_family_set = versions_->GetColumnFamilySet();
|
|
|
|
cfd = column_family_set->GetColumnFamily(column_family_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!cfd) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
return GetAndRefSuperVersion(cfd);
|
|
|
|
}
|
|
|
|
|
2014-08-05 20:27:34 +02:00
|
|
|
void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
|
|
|
|
SuperVersion* sv) {
|
2014-09-24 22:12:16 +02:00
|
|
|
bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv);
|
2014-08-05 20:27:34 +02:00
|
|
|
|
|
|
|
if (unref_sv) {
|
|
|
|
// Release SuperVersion
|
|
|
|
if (sv->Unref()) {
|
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-08-05 20:27:34 +02:00
|
|
|
sv->Cleanup();
|
|
|
|
}
|
|
|
|
delete sv;
|
|
|
|
RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
|
|
|
|
}
|
|
|
|
RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
|
|
|
|
}
|
2014-07-29 00:28:53 +02:00
|
|
|
}
|
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
// REQUIRED: this function should only be called on the write thread.
|
|
|
|
void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
|
|
|
|
SuperVersion* sv) {
|
|
|
|
auto column_family_set = versions_->GetColumnFamilySet();
|
|
|
|
auto cfd = column_family_set->GetColumnFamily(column_family_id);
|
|
|
|
|
|
|
|
// If SuperVersion is held, and we successfully fetched a cfd using
|
|
|
|
// GetAndRefSuperVersion(), it must still exist.
|
|
|
|
assert(cfd != nullptr);
|
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
|
|
}
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 02:37:33 +02:00
|
|
|
// REQUIRED: Mutex should NOT be held.
|
|
|
|
void DBImpl::ReturnAndCleanupSuperVersionUnlocked(uint32_t column_family_id,
|
|
|
|
SuperVersion* sv) {
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
auto column_family_set = versions_->GetColumnFamilySet();
|
|
|
|
cfd = column_family_set->GetColumnFamily(column_family_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If SuperVersion is held, and we successfully fetched a cfd using
|
|
|
|
// GetAndRefSuperVersion(), it must still exist.
|
|
|
|
assert(cfd != nullptr);
|
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
|
|
}
|
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
// REQUIRED: this function should only be called on the write thread or if the
|
|
|
|
// mutex is held.
|
|
|
|
ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
|
|
|
|
ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
|
|
|
|
|
|
|
|
if (!cf_memtables->Seek(column_family_id)) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
return cf_memtables->GetColumnFamilyHandle();
|
|
|
|
}
|
|
|
|
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 02:37:33 +02:00
|
|
|
// REQUIRED: mutex is NOT held.
|
|
|
|
ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked(
|
|
|
|
uint32_t column_family_id) {
|
|
|
|
ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
|
|
|
|
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
|
|
|
|
if (!cf_memtables->Seek(column_family_id)) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
return cf_memtables->GetColumnFamilyHandle();
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
|
2015-06-13 03:04:30 +02:00
|
|
|
const Range* range, int n, uint64_t* sizes,
|
|
|
|
bool include_memtable) {
|
2011-03-18 23:37:00 +01:00
|
|
|
Version* v;
|
2014-02-11 02:04:44 +01:00
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
2015-06-13 03:04:30 +02:00
|
|
|
SuperVersion* sv = GetAndRefSuperVersion(cfd);
|
|
|
|
v = sv->current;
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
// Convert user_key into a corresponding internal key.
|
2015-04-30 00:36:21 +02:00
|
|
|
InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
|
|
|
|
InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
|
|
|
|
sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode());
|
2015-06-13 03:04:30 +02:00
|
|
|
if (include_memtable) {
|
|
|
|
sizes[i] += sv->mem->ApproximateSize(k1.Encode(), k2.Encode());
|
|
|
|
sizes[i] += sv->imm->ApproximateSize(k1.Encode(), k2.Encode());
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2015-06-13 03:04:30 +02:00
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-11-07 20:50:34 +01:00
|
|
|
std::list<uint64_t>::iterator
|
|
|
|
DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
|
|
|
|
// We need to remember the iterator of our insert, because after the
|
|
|
|
// background job is done, we need to remove that element from
|
|
|
|
// pending_outputs_.
|
|
|
|
pending_outputs_.push_back(versions_->current_next_file_number());
|
|
|
|
auto pending_outputs_inserted_elem = pending_outputs_.end();
|
|
|
|
--pending_outputs_inserted_elem;
|
|
|
|
return pending_outputs_inserted_elem;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::ReleaseFileNumberFromPendingOutputs(
|
|
|
|
std::list<uint64_t>::iterator v) {
|
|
|
|
pending_outputs_.erase(v);
|
|
|
|
}
|
|
|
|
|
2014-04-15 22:39:26 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
Status DBImpl::GetUpdatesSince(
|
|
|
|
SequenceNumber seq, unique_ptr<TransactionLogIterator>* iter,
|
|
|
|
const TransactionLogIterator::ReadOptions& read_options) {
|
|
|
|
|
2014-07-28 21:05:36 +02:00
|
|
|
RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
|
2014-04-15 22:39:26 +02:00
|
|
|
if (seq > versions_->LastSequence()) {
|
|
|
|
return Status::NotFound("Requested sequence not yet written in the db");
|
|
|
|
}
|
2014-10-30 01:43:37 +01:00
|
|
|
return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
|
2014-04-15 22:39:26 +02:00
|
|
|
}
|
|
|
|
|
2013-08-22 23:32:53 +02:00
|
|
|
Status DBImpl::DeleteFile(std::string name) {
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
2013-10-24 08:39:23 +02:00
|
|
|
WalFileType log_type;
|
|
|
|
if (!ParseFileName(name, &number, &type, &log_type) ||
|
|
|
|
(type != kTableFile && type != kLogFile)) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed.\n", name.c_str());
|
2013-08-22 23:32:53 +02:00
|
|
|
return Status::InvalidArgument("Invalid file name");
|
|
|
|
}
|
|
|
|
|
2013-10-24 08:39:23 +02:00
|
|
|
Status status;
|
|
|
|
if (type == kLogFile) {
|
|
|
|
// Only allow deleting archived log files
|
|
|
|
if (log_type != kArchivedLogFile) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed - not archived log.\n",
|
2014-03-09 23:11:34 +01:00
|
|
|
name.c_str());
|
2013-10-24 08:39:23 +02:00
|
|
|
return Status::NotSupported("Delete only supported for archived logs");
|
|
|
|
}
|
2014-09-05 20:48:17 +02:00
|
|
|
status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str());
|
2013-10-24 08:39:23 +02:00
|
|
|
if (!status.ok()) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed -- %s.\n",
|
2014-03-09 23:11:34 +01:00
|
|
|
name.c_str(), status.ToString().c_str());
|
2013-10-24 08:39:23 +02:00
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2013-08-22 23:32:53 +02:00
|
|
|
int level;
|
2014-04-08 00:38:53 +02:00
|
|
|
FileMetaData* metadata;
|
2014-01-27 23:33:50 +01:00
|
|
|
ColumnFamilyData* cfd;
|
2014-01-15 00:27:09 +01:00
|
|
|
VersionEdit edit;
|
2015-02-12 18:54:48 +01:00
|
|
|
JobContext job_context(next_job_id_.fetch_add(1), true);
|
2013-08-27 23:54:06 +02:00
|
|
|
{
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-01-27 23:33:50 +01:00
|
|
|
status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
|
2013-08-27 23:54:06 +02:00
|
|
|
if (!status.ok()) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed. File not found\n", name.c_str());
|
2014-11-15 01:20:24 +01:00
|
|
|
job_context.Clean();
|
2013-08-27 23:54:06 +02:00
|
|
|
return Status::InvalidArgument("File not found");
|
|
|
|
}
|
2014-10-21 20:23:06 +02:00
|
|
|
assert(level < cfd->NumberLevels());
|
2013-08-22 23:32:53 +02:00
|
|
|
|
2013-08-27 23:54:06 +02:00
|
|
|
// If the file is being compacted no need to delete.
|
2014-01-07 05:29:17 +01:00
|
|
|
if (metadata->being_compacted) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
2013-10-24 08:39:23 +02:00
|
|
|
"DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
|
2014-11-15 01:20:24 +01:00
|
|
|
job_context.Clean();
|
2013-08-27 23:54:06 +02:00
|
|
|
return Status::OK();
|
2013-08-22 23:32:53 +02:00
|
|
|
}
|
|
|
|
|
2013-08-27 23:54:06 +02:00
|
|
|
// Only the files in the last level can be deleted externally.
|
|
|
|
// This is to make sure that any deletion tombstones are not
|
|
|
|
// lost. Check that the level passed is the last level.
|
2014-10-31 16:48:19 +01:00
|
|
|
auto* vstoreage = cfd->current()->storage_info();
|
2014-02-03 21:08:33 +01:00
|
|
|
for (int i = level + 1; i < cfd->NumberLevels(); i++) {
|
2014-10-27 23:49:46 +01:00
|
|
|
if (vstoreage->NumLevelFiles(i) != 0) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
2013-10-24 08:39:23 +02:00
|
|
|
"DeleteFile %s FAILED. File not in last level\n", name.c_str());
|
2014-11-15 01:20:24 +01:00
|
|
|
job_context.Clean();
|
2013-08-27 23:54:06 +02:00
|
|
|
return Status::InvalidArgument("File not in last level");
|
|
|
|
}
|
|
|
|
}
|
2014-10-21 20:23:06 +02:00
|
|
|
// if level == 0, it has to be the oldest file
|
2014-10-31 16:48:19 +01:00
|
|
|
if (level == 0 &&
|
|
|
|
vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) {
|
2014-11-04 19:28:08 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed ---"
|
2014-11-05 00:58:14 +01:00
|
|
|
" target file in level 0 must be the oldest.", name.c_str());
|
2014-11-15 01:20:24 +01:00
|
|
|
job_context.Clean();
|
2014-10-21 20:23:06 +02:00
|
|
|
return Status::InvalidArgument("File in level 0, but not oldest");
|
|
|
|
}
|
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
2013-08-27 23:54:06 +02:00
|
|
|
edit.DeleteFile(level, number);
|
2014-10-02 01:19:16 +02:00
|
|
|
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
|
2015-01-26 22:59:38 +01:00
|
|
|
&edit, &mutex_, directories_.GetDbDir());
|
2013-12-20 18:57:58 +01:00
|
|
|
if (status.ok()) {
|
2015-06-17 21:37:59 +02:00
|
|
|
InstallSuperVersionAndScheduleWorkWrapper(
|
|
|
|
cfd, &job_context, *cfd->GetLatestMutableCFOptions());
|
2013-12-20 18:57:58 +01:00
|
|
|
}
|
2014-10-28 19:54:33 +01:00
|
|
|
FindObsoleteFiles(&job_context, false);
|
|
|
|
} // lock released here
|
2015-06-04 04:57:01 +02:00
|
|
|
|
2015-12-29 22:22:13 +01:00
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
// remove files outside the db-lock
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
// Call PurgeObsoleteFiles() without holding mutex.
|
|
|
|
PurgeObsoleteFiles(job_context);
|
|
|
|
}
|
|
|
|
job_context.Clean();
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice* begin, const Slice* end) {
|
|
|
|
Status status;
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
ColumnFamilyData* cfd = cfh->cfd();
|
|
|
|
VersionEdit edit;
|
2016-01-07 23:48:45 +01:00
|
|
|
std::vector<FileMetaData*> deleted_files;
|
2015-12-29 22:22:13 +01:00
|
|
|
JobContext job_context(next_job_id_.fetch_add(1), true);
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2016-01-07 23:48:45 +01:00
|
|
|
Version* input_version = cfd->current();
|
2015-12-29 22:22:13 +01:00
|
|
|
|
2016-01-07 23:48:45 +01:00
|
|
|
auto* vstorage = input_version->storage_info();
|
|
|
|
for (int i = 1; i < cfd->NumberLevels(); i++) {
|
2015-12-29 22:22:13 +01:00
|
|
|
if (vstorage->LevelFiles(i).empty() ||
|
|
|
|
!vstorage->OverlapInLevel(i, begin, end)) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
std::vector<FileMetaData*> level_files;
|
|
|
|
InternalKey begin_storage, end_storage, *begin_key, *end_key;
|
|
|
|
if (begin == nullptr) {
|
|
|
|
begin_key = nullptr;
|
|
|
|
} else {
|
|
|
|
begin_storage.SetMaxPossibleForUserKey(*begin);
|
|
|
|
begin_key = &begin_storage;
|
|
|
|
}
|
|
|
|
if (end == nullptr) {
|
|
|
|
end_key = nullptr;
|
|
|
|
} else {
|
|
|
|
end_storage.SetMinPossibleForUserKey(*end);
|
|
|
|
end_key = &end_storage;
|
|
|
|
}
|
|
|
|
|
|
|
|
vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1,
|
|
|
|
nullptr, false);
|
2016-01-07 23:48:45 +01:00
|
|
|
FileMetaData* level_file;
|
|
|
|
for (uint32_t j = 0; j < level_files.size(); j++) {
|
|
|
|
level_file = level_files[j];
|
2015-12-29 22:22:13 +01:00
|
|
|
if (((begin == nullptr) ||
|
|
|
|
(cfd->internal_comparator().user_comparator()->Compare(
|
|
|
|
level_file->smallest.user_key(), *begin) >= 0)) &&
|
|
|
|
((end == nullptr) ||
|
|
|
|
(cfd->internal_comparator().user_comparator()->Compare(
|
|
|
|
level_file->largest.user_key(), *end) <= 0))) {
|
|
|
|
if (level_file->being_compacted) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
|
|
|
edit.DeleteFile(i, level_file->fd.GetNumber());
|
2016-01-07 23:48:45 +01:00
|
|
|
deleted_files.push_back(level_file);
|
|
|
|
level_file->being_compacted = true;
|
2015-12-29 22:22:13 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (edit.GetDeletedFiles().empty()) {
|
2016-01-04 19:55:31 +01:00
|
|
|
job_context.Clean();
|
2015-12-29 22:22:13 +01:00
|
|
|
return Status::OK();
|
|
|
|
}
|
2016-01-07 23:48:45 +01:00
|
|
|
input_version->Ref();
|
2015-12-29 22:22:13 +01:00
|
|
|
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
|
|
|
|
&edit, &mutex_, directories_.GetDbDir());
|
|
|
|
if (status.ok()) {
|
|
|
|
InstallSuperVersionAndScheduleWorkWrapper(
|
|
|
|
cfd, &job_context, *cfd->GetLatestMutableCFOptions());
|
|
|
|
}
|
2016-01-07 23:48:45 +01:00
|
|
|
for (auto* deleted_file : deleted_files) {
|
|
|
|
deleted_file->being_compacted = false;
|
|
|
|
}
|
|
|
|
input_version->Unref();
|
2015-12-29 22:22:13 +01:00
|
|
|
FindObsoleteFiles(&job_context, false);
|
|
|
|
} // lock released here
|
|
|
|
|
2014-09-05 20:48:17 +02:00
|
|
|
LogFlush(db_options_.info_log);
|
2013-12-20 18:57:58 +01:00
|
|
|
// remove files outside the db-lock
|
2014-10-28 19:54:33 +01:00
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
2015-06-04 04:57:01 +02:00
|
|
|
// Call PurgeObsoleteFiles() without holding mutex.
|
2014-10-28 19:54:33 +01:00
|
|
|
PurgeObsoleteFiles(job_context);
|
2014-03-10 23:42:14 +01:00
|
|
|
}
|
2014-11-15 00:43:10 +01:00
|
|
|
job_context.Clean();
|
2013-08-22 23:32:53 +02:00
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutexLock l(&mutex_);
|
2014-02-11 02:04:44 +01:00
|
|
|
versions_->GetLiveFilesMetaData(metadata);
|
2013-08-22 23:32:53 +02:00
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
|
|
|
|
void DBImpl::GetColumnFamilyMetaData(
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
ColumnFamilyMetaData* cf_meta) {
|
|
|
|
assert(column_family);
|
|
|
|
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
|
|
|
|
auto* sv = GetAndRefSuperVersion(cfd);
|
|
|
|
sv->current->GetColumnFamilyMetaData(cf_meta);
|
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
|
|
}
|
|
|
|
|
2014-04-15 22:39:26 +02:00
|
|
|
#endif // ROCKSDB_LITE
|
2013-08-22 23:32:53 +02:00
|
|
|
|
2014-03-20 22:18:29 +01:00
|
|
|
Status DBImpl::CheckConsistency() {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
std::vector<LiveFileMetaData> metadata;
|
|
|
|
versions_->GetLiveFilesMetaData(&metadata);
|
|
|
|
|
|
|
|
std::string corruption_messages;
|
|
|
|
for (const auto& md : metadata) {
|
2015-01-17 00:44:55 +01:00
|
|
|
// md.name has a leading "/".
|
|
|
|
std::string file_path = md.db_path + md.name;
|
2014-07-02 18:54:20 +02:00
|
|
|
|
2014-03-20 22:18:29 +01:00
|
|
|
uint64_t fsize = 0;
|
|
|
|
Status s = env_->GetFileSize(file_path, &fsize);
|
2015-10-07 02:46:22 +02:00
|
|
|
if (!s.ok() &&
|
|
|
|
env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
|
|
|
|
s = Status::OK();
|
|
|
|
}
|
2014-03-20 22:18:29 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
corruption_messages +=
|
|
|
|
"Can't access " + md.name + ": " + s.ToString() + "\n";
|
|
|
|
} else if (fsize != md.size) {
|
2014-07-02 18:54:20 +02:00
|
|
|
corruption_messages += "Sst file size mismatch: " + file_path +
|
2014-03-20 22:18:29 +01:00
|
|
|
". Size recorded in manifest " +
|
2014-11-25 05:44:49 +01:00
|
|
|
ToString(md.size) + ", actual size " +
|
|
|
|
ToString(fsize) + "\n";
|
2014-03-20 22:18:29 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (corruption_messages.size() == 0) {
|
|
|
|
return Status::OK();
|
|
|
|
} else {
|
|
|
|
return Status::Corruption(corruption_messages);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-21 20:01:48 +02:00
|
|
|
Status DBImpl::GetDbIdentity(std::string& identity) const {
|
2013-12-03 15:39:07 +01:00
|
|
|
std::string idfilename = IdentityFileName(dbname_);
|
|
|
|
const EnvOptions soptions;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
unique_ptr<SequentialFileReader> id_file_reader;
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
unique_ptr<SequentialFile> idfile;
|
|
|
|
s = env_->NewSequentialFile(idfilename, &idfile, soptions);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
id_file_reader.reset(new SequentialFileReader(std::move(idfile)));
|
2013-12-03 15:39:07 +01:00
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
|
2013-12-03 15:39:07 +01:00
|
|
|
uint64_t file_size;
|
|
|
|
s = env_->GetFileSize(idfilename, &file_size);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2015-07-02 01:13:49 +02:00
|
|
|
char* buffer = reinterpret_cast<char*>(alloca(file_size));
|
2013-12-03 15:39:07 +01:00
|
|
|
Slice id;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
s = id_file_reader->Read(static_cast<size_t>(file_size), &id, buffer);
|
2013-12-03 15:39:07 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
identity.assign(id.ToString());
|
|
|
|
// If last character is '\n' remove it from identity
|
|
|
|
if (identity.size() > 0 && identity.back() == '\n') {
|
|
|
|
identity.pop_back();
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Default implementations of convenience methods that subclasses of DB
|
|
|
|
// can call if they wish
|
2014-02-11 02:04:44 +01:00
|
|
|
Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
const Slice& key, const Slice& value) {
|
2014-01-14 19:42:36 +01:00
|
|
|
// Pre-allocate size of write batch conservatively.
|
|
|
|
// 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
|
|
|
|
// and we allocate 11 extra bytes for key length, as well as value length.
|
|
|
|
WriteBatch batch(key.size() + value.size() + 24);
|
2014-03-14 19:26:13 +01:00
|
|
|
batch.Put(column_family, key, value);
|
2011-03-18 23:37:00 +01:00
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) {
|
2011-03-18 23:37:00 +01:00
|
|
|
WriteBatch batch;
|
2014-03-14 19:26:13 +01:00
|
|
|
batch.Delete(column_family, key);
|
2011-03-18 23:37:00 +01:00
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 20:42:56 +02:00
|
|
|
Status DB::SingleDelete(const WriteOptions& opt,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
|
|
|
WriteBatch batch;
|
|
|
|
batch.SingleDelete(column_family, key);
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& value) {
|
2013-03-21 23:59:47 +01:00
|
|
|
WriteBatch batch;
|
2014-03-14 19:26:13 +01:00
|
|
|
batch.Merge(column_family, key, value);
|
2013-03-21 23:59:47 +01:00
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
2014-01-02 18:08:12 +01:00
|
|
|
// Default implementation -- returns not supported status
|
2014-10-17 01:57:59 +02:00
|
|
|
Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
|
2014-01-06 22:31:06 +01:00
|
|
|
const std::string& column_family_name,
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle** handle) {
|
2014-01-02 18:08:12 +01:00
|
|
|
return Status::NotSupported("");
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) {
|
2014-01-02 18:08:12 +01:00
|
|
|
return Status::NotSupported("");
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
DB::~DB() { }
|
|
|
|
|
2013-07-23 23:42:27 +02:00
|
|
|
Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
|
2014-01-06 22:31:06 +01:00
|
|
|
DBOptions db_options(options);
|
|
|
|
ColumnFamilyOptions cf_options(options);
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
column_families.push_back(
|
2014-04-09 18:56:17 +02:00
|
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
|
2014-02-11 02:04:44 +01:00
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
2014-09-02 23:42:23 +02:00
|
|
|
Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
|
2014-02-11 02:04:44 +01:00
|
|
|
if (s.ok()) {
|
|
|
|
assert(handles.size() == 1);
|
|
|
|
// i can delete the handle since DBImpl is always holding a reference to
|
|
|
|
// default column family
|
|
|
|
delete handles[0];
|
|
|
|
}
|
|
|
|
return s;
|
2014-01-06 22:31:06 +01:00
|
|
|
}
|
|
|
|
|
2014-02-07 23:48:48 +01:00
|
|
|
Status DB::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
2014-02-11 02:04:44 +01:00
|
|
|
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
|
2014-10-18 06:18:36 +02:00
|
|
|
Status s = SanitizeOptionsByTable(db_options, column_families);
|
2014-09-02 23:42:23 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2014-12-16 06:48:16 +01:00
|
|
|
|
2015-06-18 23:55:05 +02:00
|
|
|
for (auto& cfd : column_families) {
|
|
|
|
s = CheckCompressionSupported(cfd.options);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
if (s.ok() && db_options.allow_concurrent_memtable_write) {
|
|
|
|
s = CheckConcurrentWritesSupported(cfd.options);
|
|
|
|
}
|
2015-06-18 23:55:05 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
if (db_options.db_paths.size() > 1) {
|
2014-12-16 06:48:16 +01:00
|
|
|
if ((cfd.options.compaction_style != kCompactionStyleUniversal) &&
|
|
|
|
(cfd.options.compaction_style != kCompactionStyleLevel)) {
|
2014-07-15 00:34:30 +02:00
|
|
|
return Status::NotSupported(
|
|
|
|
"More than one DB paths are only supported in "
|
2014-12-16 06:48:16 +01:00
|
|
|
"universal and level compaction styles. ");
|
2014-07-15 00:34:30 +02:00
|
|
|
}
|
|
|
|
}
|
2015-06-18 23:55:05 +02:00
|
|
|
}
|
2014-07-15 00:34:30 +02:00
|
|
|
|
2015-06-18 23:55:05 +02:00
|
|
|
if (db_options.db_paths.size() > 4) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"More than four DB paths are not supported yet. ");
|
2014-07-02 18:54:20 +02:00
|
|
|
}
|
|
|
|
|
2013-02-15 20:53:17 +01:00
|
|
|
*dbptr = nullptr;
|
2014-02-11 02:04:44 +01:00
|
|
|
handles->clear();
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-02-05 22:12:23 +01:00
|
|
|
size_t max_write_buffer_size = 0;
|
|
|
|
for (auto cf : column_families) {
|
|
|
|
max_write_buffer_size =
|
|
|
|
std::max(max_write_buffer_size, cf.options.write_buffer_size);
|
2012-11-01 01:02:24 +01:00
|
|
|
}
|
2013-10-01 23:46:52 +02:00
|
|
|
|
2014-02-05 22:12:23 +01:00
|
|
|
DBImpl* impl = new DBImpl(db_options, dbname);
|
2014-09-05 20:48:17 +02:00
|
|
|
s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir);
|
2014-07-02 18:54:20 +02:00
|
|
|
if (s.ok()) {
|
2014-09-05 20:48:17 +02:00
|
|
|
for (auto db_path : impl->db_options_.db_paths) {
|
2014-07-15 00:34:30 +02:00
|
|
|
s = impl->env_->CreateDirIfMissing(db_path.path);
|
2014-07-02 18:54:20 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-10-01 23:46:52 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
delete impl;
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
s = impl->CreateArchivalDirectory();
|
2012-11-26 22:56:45 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
delete impl;
|
|
|
|
return s;
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
impl->mutex_.Lock();
|
2014-01-24 23:30:28 +01:00
|
|
|
// Handles create_if_missing, error_if_exists
|
|
|
|
s = impl->Recover(column_families);
|
2011-03-18 23:37:00 +01:00
|
|
|
if (s.ok()) {
|
2011-04-12 21:38:58 +02:00
|
|
|
uint64_t new_log_number = impl->versions_->NewFileNumber();
|
2013-01-20 11:07:13 +01:00
|
|
|
unique_ptr<WritableFile> lfile;
|
2014-03-04 02:54:04 +01:00
|
|
|
EnvOptions soptions(db_options);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
EnvOptions opt_env_options =
|
|
|
|
impl->db_options_.env->OptimizeForLogWrite(soptions, impl->db_options_);
|
2015-10-16 23:33:47 +02:00
|
|
|
s = NewWritableFile(impl->db_options_.env,
|
|
|
|
LogFileName(impl->db_options_.wal_dir, new_log_number),
|
|
|
|
&lfile, opt_env_options);
|
2011-03-18 23:37:00 +01:00
|
|
|
if (s.ok()) {
|
2015-11-19 20:47:12 +01:00
|
|
|
lfile->SetPreallocationBlockSize((max_write_buffer_size / 10) + max_write_buffer_size);
|
2011-06-22 04:36:45 +02:00
|
|
|
impl->logfile_number_ = new_log_number;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
unique_ptr<WritableFileWriter> file_writer(
|
|
|
|
new WritableFileWriter(std::move(lfile), opt_env_options));
|
2015-10-08 19:07:15 +02:00
|
|
|
impl->logs_.emplace_back(
|
|
|
|
new_log_number,
|
2015-10-29 23:52:32 +01:00
|
|
|
new log::Writer(std::move(file_writer), new_log_number,
|
|
|
|
impl->db_options_.recycle_log_file_num > 0));
|
2014-03-11 22:52:17 +01:00
|
|
|
|
2014-01-06 22:31:06 +01:00
|
|
|
// set column family handles
|
|
|
|
for (auto cf : column_families) {
|
2014-03-11 22:52:17 +01:00
|
|
|
auto cfd =
|
|
|
|
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
|
2014-06-07 03:04:56 +02:00
|
|
|
if (cfd != nullptr) {
|
|
|
|
handles->push_back(
|
|
|
|
new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
|
2014-11-22 09:04:41 +01:00
|
|
|
impl->NewThreadStatusCfInfo(cfd);
|
2014-06-07 03:04:56 +02:00
|
|
|
} else {
|
|
|
|
if (db_options.create_missing_column_families) {
|
|
|
|
// missing column family, create it
|
|
|
|
ColumnFamilyHandle* handle;
|
|
|
|
impl->mutex_.Unlock();
|
|
|
|
s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
|
|
|
|
impl->mutex_.Lock();
|
|
|
|
if (s.ok()) {
|
|
|
|
handles->push_back(handle);
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
s = Status::InvalidArgument("Column family not found: ", cf.name);
|
|
|
|
break;
|
|
|
|
}
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
2014-01-06 22:31:06 +01:00
|
|
|
}
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
|
|
|
if (s.ok()) {
|
2014-01-24 23:30:28 +01:00
|
|
|
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
2015-06-17 21:37:59 +02:00
|
|
|
delete impl->InstallSuperVersionAndScheduleWork(
|
2014-10-17 01:57:59 +02:00
|
|
|
cfd, nullptr, *cfd->GetLatestMutableCFOptions());
|
2014-01-24 23:30:28 +01:00
|
|
|
}
|
2014-04-30 20:33:40 +02:00
|
|
|
impl->alive_log_files_.push_back(
|
|
|
|
DBImpl::LogFileNumberSize(impl->logfile_number_));
|
2011-03-18 23:37:00 +01:00
|
|
|
impl->DeleteObsoleteFiles();
|
2015-01-26 22:59:38 +01:00
|
|
|
s = impl->directories_.GetDbDir()->Fsync();
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
}
|
2013-09-04 22:13:08 +02:00
|
|
|
|
2014-02-03 21:08:33 +01:00
|
|
|
if (s.ok()) {
|
|
|
|
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
2015-03-30 23:04:21 +02:00
|
|
|
if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
|
2014-10-31 16:48:19 +01:00
|
|
|
auto* vstorage = cfd->current()->storage_info();
|
2014-11-04 02:45:55 +01:00
|
|
|
for (int i = 1; i < vstorage->num_levels(); ++i) {
|
2014-10-27 23:49:46 +01:00
|
|
|
int num_files = vstorage->NumLevelFiles(i);
|
2014-02-03 21:08:33 +01:00
|
|
|
if (num_files > 0) {
|
2014-05-21 20:43:35 +02:00
|
|
|
s = Status::InvalidArgument(
|
|
|
|
"Not all files are at level 0. Cannot "
|
2015-03-30 23:04:21 +02:00
|
|
|
"open with FIFO compaction style.");
|
2014-02-03 21:08:33 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2014-12-11 03:39:09 +01:00
|
|
|
if (!cfd->mem()->IsSnapshotSupported()) {
|
|
|
|
impl->is_snapshot_supported_ = false;
|
|
|
|
}
|
2014-09-09 00:04:34 +02:00
|
|
|
if (cfd->ioptions()->merge_operator != nullptr &&
|
Add a new mem-table representation based on cuckoo hash.
Summary:
= Major Changes =
* Add a new mem-table representation, HashCuckooRep, which is based cuckoo hash.
Cuckoo hash uses multiple hash functions. This allows each key to have multiple
possible locations in the mem-table.
- Put: When insert a key, it will try to find whether one of its possible
locations is vacant and store the key. If none of its possible
locations are available, then it will kick out a victim key and
store at that location. The kicked-out victim key will then be
stored at a vacant space of its possible locations or kick-out
another victim. In this diff, the kick-out path (known as
cuckoo-path) is found using BFS, which guarantees to be the shortest.
- Get: Simply tries all possible locations of a key --- this guarantees
worst-case constant time complexity.
- Time complexity: O(1) for Get, and average O(1) for Put if the
fullness of the mem-table is below 80%.
- Default using two hash functions, the number of hash functions used
by the cuckoo-hash may dynamically increase if it fails to find a
short-enough kick-out path.
- Currently, HashCuckooRep does not support iteration and snapshots,
as our current main purpose of this is to optimize point access.
= Minor Changes =
* Add IsSnapshotSupported() to DB to indicate whether the current DB
supports snapshots. If it returns false, then DB::GetSnapshot() will
always return nullptr.
Test Plan:
Run existing tests. Will develop a test specifically for cuckoo hash in
the next diff.
Reviewers: sdong, haobo
Reviewed By: sdong
CC: leveldb, dhruba, igor
Differential Revision: https://reviews.facebook.net/D16155
2014-04-30 02:13:46 +02:00
|
|
|
!cfd->mem()->IsMergeOperatorSupported()) {
|
|
|
|
s = Status::InvalidArgument(
|
|
|
|
"The memtable of column family %s does not support merge operator "
|
|
|
|
"its options.merge_operator is non-null", cfd->GetName().c_str());
|
|
|
|
}
|
2014-02-03 21:08:33 +01:00
|
|
|
if (!s.ok()) {
|
2013-09-04 22:13:08 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-07-16 04:58:28 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::Open:Opened");
|
2015-12-09 02:01:02 +01:00
|
|
|
Status persist_options_status;
|
2015-07-16 04:58:28 +02:00
|
|
|
if (s.ok()) {
|
2015-12-09 02:01:02 +01:00
|
|
|
// Persist RocksDB Options before scheduling the compaction.
|
|
|
|
// The WriteOptionsFile() will release and lock the mutex internally.
|
|
|
|
persist_options_status = impl->WriteOptionsFile();
|
|
|
|
|
2015-11-20 10:07:24 +01:00
|
|
|
*dbptr = impl;
|
2015-07-16 04:58:28 +02:00
|
|
|
impl->opened_successfully_ = true;
|
|
|
|
impl->MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
2014-01-16 01:15:43 +01:00
|
|
|
impl->mutex_.Unlock();
|
|
|
|
|
2016-01-29 03:35:01 +01:00
|
|
|
auto sfm = static_cast<SstFileManagerImpl*>(
|
|
|
|
impl->db_options_.sst_file_manager.get());
|
|
|
|
if (s.ok() && sfm) {
|
|
|
|
// Notify SstFileManager about all sst files that already exist in
|
|
|
|
// db_paths[0] when the DB is opened.
|
|
|
|
auto& db_path = impl->db_options_.db_paths[0];
|
|
|
|
std::vector<std::string> existing_files;
|
|
|
|
impl->db_options_.env->GetChildren(db_path.path, &existing_files);
|
|
|
|
for (auto& file_name : existing_files) {
|
|
|
|
uint64_t file_number;
|
|
|
|
FileType file_type;
|
|
|
|
std::string file_path = db_path.path + "/" + file_name;
|
|
|
|
if (ParseFileName(file_name, &file_number, &file_type) &&
|
|
|
|
file_type == kTableFile) {
|
|
|
|
sfm->OnAddFile(file_path);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
if (s.ok()) {
|
2015-02-09 21:03:45 +01:00
|
|
|
Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p",
|
|
|
|
impl);
|
2015-09-17 01:04:31 +02:00
|
|
|
LogFlush(impl->db_options_.info_log);
|
2015-11-11 07:58:01 +01:00
|
|
|
if (!persist_options_status.ok()) {
|
|
|
|
if (db_options.fail_if_options_file_error) {
|
|
|
|
s = Status::IOError(
|
|
|
|
"DB::Open() failed --- Unable to persist Options file",
|
|
|
|
persist_options_status.ToString());
|
|
|
|
}
|
|
|
|
Warn(impl->db_options_.info_log,
|
|
|
|
"Unable to persist options in DB::Open() -- %s",
|
|
|
|
persist_options_status.ToString().c_str());
|
|
|
|
}
|
|
|
|
}
|
2015-11-20 10:07:24 +01:00
|
|
|
if (!s.ok()) {
|
2014-11-20 19:49:32 +01:00
|
|
|
for (auto* h : *handles) {
|
2014-02-11 02:04:44 +01:00
|
|
|
delete h;
|
|
|
|
}
|
2014-01-06 22:31:06 +01:00
|
|
|
handles->clear();
|
2011-03-18 23:37:00 +01:00
|
|
|
delete impl;
|
2015-11-11 07:58:01 +01:00
|
|
|
*dbptr = nullptr;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2014-01-02 18:08:12 +01:00
|
|
|
Status DB::ListColumnFamilies(const DBOptions& db_options,
|
|
|
|
const std::string& name,
|
|
|
|
std::vector<std::string>* column_families) {
|
2014-01-22 20:44:53 +01:00
|
|
|
return VersionSet::ListColumnFamilies(column_families, name, db_options.env);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 20:14:09 +01:00
|
|
|
}
|
|
|
|
|
2011-05-21 04:17:43 +02:00
|
|
|
Snapshot::~Snapshot() {
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
Status DestroyDB(const std::string& dbname, const Options& options) {
|
2013-10-01 23:46:52 +02:00
|
|
|
const InternalKeyComparator comparator(options.comparator);
|
2014-08-25 23:22:05 +02:00
|
|
|
const Options& soptions(SanitizeOptions(dbname, &comparator, options));
|
2013-10-01 23:46:52 +02:00
|
|
|
Env* env = soptions.env;
|
2011-03-18 23:37:00 +01:00
|
|
|
std::vector<std::string> filenames;
|
2012-12-10 20:02:07 +01:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Ignore error in case directory does not exist
|
|
|
|
env->GetChildren(dbname, &filenames);
|
2013-10-01 23:46:52 +02:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
FileLock* lock;
|
2011-07-15 02:20:57 +02:00
|
|
|
const std::string lockname = LockFileName(dbname);
|
|
|
|
Status result = env->LockFile(lockname, &lock);
|
2011-03-18 23:37:00 +01:00
|
|
|
if (result.ok()) {
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
2014-08-20 19:29:32 +02:00
|
|
|
InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), dbname);
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < filenames.size(); i++) {
|
2014-08-20 19:29:32 +02:00
|
|
|
if (ParseFileName(filenames[i], &number, info_log_prefix.prefix, &type) &&
|
2012-03-09 16:51:04 +01:00
|
|
|
type != kDBLockFile) { // Lock file will be deleted at end
|
2012-12-17 20:26:59 +01:00
|
|
|
Status del;
|
2015-08-20 00:02:17 +02:00
|
|
|
std::string path_to_delete = dbname + "/" + filenames[i];
|
2012-12-17 20:26:59 +01:00
|
|
|
if (type == kMetaDatabase) {
|
2015-08-20 00:02:17 +02:00
|
|
|
del = DestroyDB(path_to_delete, options);
|
|
|
|
} else if (type == kTableFile) {
|
2016-01-29 03:35:01 +01:00
|
|
|
del = DeleteSSTFile(&options, path_to_delete, 0);
|
2012-12-17 20:26:59 +01:00
|
|
|
} else {
|
2015-08-20 00:02:17 +02:00
|
|
|
del = env->DeleteFile(path_to_delete);
|
2012-12-17 20:26:59 +01:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
if (result.ok() && !del.ok()) {
|
|
|
|
result = del;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2012-12-10 20:02:07 +01:00
|
|
|
|
2015-08-20 00:02:17 +02:00
|
|
|
for (size_t path_id = 0; path_id < options.db_paths.size(); path_id++) {
|
|
|
|
const auto& db_path = options.db_paths[path_id];
|
2014-07-15 00:34:30 +02:00
|
|
|
env->GetChildren(db_path.path, &filenames);
|
2014-07-02 18:54:20 +02:00
|
|
|
for (size_t i = 0; i < filenames.size(); i++) {
|
|
|
|
if (ParseFileName(filenames[i], &number, &type) &&
|
|
|
|
type == kTableFile) { // Lock file will be deleted at end
|
2015-08-20 00:02:17 +02:00
|
|
|
std::string table_path = db_path.path + "/" + filenames[i];
|
2016-01-29 03:35:01 +01:00
|
|
|
Status del = DeleteSSTFile(&options, table_path,
|
|
|
|
static_cast<uint32_t>(path_id));
|
2014-07-02 18:54:20 +02:00
|
|
|
if (result.ok() && !del.ok()) {
|
|
|
|
result = del;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-06 05:09:42 +01:00
|
|
|
std::vector<std::string> walDirFiles;
|
|
|
|
std::string archivedir = ArchivalDirectory(dbname);
|
|
|
|
if (dbname != soptions.wal_dir) {
|
|
|
|
env->GetChildren(soptions.wal_dir, &walDirFiles);
|
|
|
|
archivedir = ArchivalDirectory(soptions.wal_dir);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete log files in the WAL dir
|
|
|
|
for (const auto& file : walDirFiles) {
|
|
|
|
if (ParseFileName(file, &number, &type) && type == kLogFile) {
|
|
|
|
Status del = env->DeleteFile(soptions.wal_dir + "/" + file);
|
|
|
|
if (result.ok() && !del.ok()) {
|
|
|
|
result = del;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> archiveFiles;
|
2013-10-01 23:46:52 +02:00
|
|
|
env->GetChildren(archivedir, &archiveFiles);
|
2012-12-10 20:02:07 +01:00
|
|
|
// Delete archival files.
|
|
|
|
for (size_t i = 0; i < archiveFiles.size(); ++i) {
|
2013-10-11 03:02:10 +02:00
|
|
|
if (ParseFileName(archiveFiles[i], &number, &type) &&
|
|
|
|
type == kLogFile) {
|
2013-10-01 23:46:52 +02:00
|
|
|
Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]);
|
2012-12-10 20:02:07 +01:00
|
|
|
if (result.ok() && !del.ok()) {
|
|
|
|
result = del;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-11-11 07:58:01 +01:00
|
|
|
|
2012-12-14 22:21:53 +01:00
|
|
|
// ignore case where no archival directory is present.
|
2013-10-01 23:46:52 +02:00
|
|
|
env->DeleteDir(archivedir);
|
2012-12-10 20:02:07 +01:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
env->UnlockFile(lock); // Ignore error since state is already gone
|
2011-07-15 02:20:57 +02:00
|
|
|
env->DeleteFile(lockname);
|
2011-03-18 23:37:00 +01:00
|
|
|
env->DeleteDir(dbname); // Ignore error in case dir contains other files
|
2013-10-01 23:46:52 +02:00
|
|
|
env->DeleteDir(soptions.wal_dir);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2015-11-11 07:58:01 +01:00
|
|
|
Status DBImpl::WriteOptionsFile() {
|
|
|
|
#ifndef ROCKSDB_LITE
|
2015-12-09 02:01:02 +01:00
|
|
|
mutex_.AssertHeld();
|
2015-11-11 07:58:01 +01:00
|
|
|
|
|
|
|
std::vector<std::string> cf_names;
|
|
|
|
std::vector<ColumnFamilyOptions> cf_opts;
|
2015-12-09 02:01:02 +01:00
|
|
|
|
|
|
|
// This part requires mutex to protect the column family options
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
2015-11-11 07:58:01 +01:00
|
|
|
}
|
2015-12-09 02:01:02 +01:00
|
|
|
cf_names.push_back(cfd->GetName());
|
|
|
|
cf_opts.push_back(BuildColumnFamilyOptions(
|
|
|
|
*cfd->options(), *cfd->GetLatestMutableCFOptions()));
|
2015-11-11 07:58:01 +01:00
|
|
|
}
|
|
|
|
|
2015-12-09 02:01:02 +01:00
|
|
|
// Unlock during expensive operations. New writes cannot get here
|
|
|
|
// because the single write thread ensures all new writes get queued.
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
std::string file_name =
|
|
|
|
TempOptionsFileName(GetName(), versions_->NewFileNumber());
|
|
|
|
Status s = PersistRocksDBOptions(GetDBOptions(), cf_names, cf_opts, file_name,
|
|
|
|
GetEnv());
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
s = RenameTempFileToOptionsFile(file_name);
|
|
|
|
}
|
|
|
|
mutex_.Lock();
|
2015-11-11 07:58:01 +01:00
|
|
|
return s;
|
|
|
|
#else
|
|
|
|
return Status::OK();
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
namespace {
|
|
|
|
void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
|
|
|
|
const size_t num_files_to_keep,
|
|
|
|
const std::shared_ptr<Logger>& info_log,
|
|
|
|
Env* env) {
|
|
|
|
if (filenames.size() <= num_files_to_keep) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
for (auto iter = std::next(filenames.begin(), num_files_to_keep);
|
|
|
|
iter != filenames.end(); ++iter) {
|
|
|
|
if (!env->DeleteFile(iter->second).ok()) {
|
|
|
|
Warn(info_log, "Unable to delete options file %s", iter->second.c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
|
|
|
Status DBImpl::DeleteObsoleteOptionsFiles() {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
std::vector<std::string> filenames;
|
|
|
|
// use ordered map to store keep the filenames sorted from the newest
|
|
|
|
// to the oldest.
|
|
|
|
std::map<uint64_t, std::string> options_filenames;
|
|
|
|
Status s;
|
|
|
|
s = GetEnv()->GetChildren(GetName(), &filenames);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
for (auto& filename : filenames) {
|
|
|
|
uint64_t file_number;
|
|
|
|
FileType type;
|
|
|
|
if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
|
|
|
|
options_filenames.insert(
|
|
|
|
{std::numeric_limits<uint64_t>::max() - file_number,
|
|
|
|
GetName() + "/" + filename});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Keeps the latest 2 Options file
|
|
|
|
const size_t kNumOptionsFilesKept = 2;
|
|
|
|
DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
|
|
|
|
db_options_.info_log, GetEnv());
|
|
|
|
return Status::OK();
|
|
|
|
#else
|
|
|
|
return Status::OK();
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
Status s;
|
|
|
|
std::string options_file_name =
|
|
|
|
OptionsFileName(GetName(), versions_->NewFileNumber());
|
|
|
|
// Retry if the file name happen to conflict with an existing one.
|
|
|
|
s = GetEnv()->RenameFile(file_name, options_file_name);
|
|
|
|
|
|
|
|
DeleteObsoleteOptionsFiles();
|
|
|
|
return s;
|
|
|
|
#else
|
|
|
|
return Status::OK();
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
2014-11-20 19:49:32 +01:00
|
|
|
#if ROCKSDB_USING_THREAD_STATUS
|
2014-12-22 21:20:17 +01:00
|
|
|
|
2014-11-20 19:49:32 +01:00
|
|
|
void DBImpl::NewThreadStatusCfInfo(
|
|
|
|
ColumnFamilyData* cfd) const {
|
2014-11-21 06:13:18 +01:00
|
|
|
if (db_options_.enable_thread_tracking) {
|
2016-01-26 01:26:53 +01:00
|
|
|
ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
|
|
|
|
cfd->ioptions()->env);
|
2014-11-21 06:13:18 +01:00
|
|
|
}
|
2014-11-20 19:49:32 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::EraseThreadStatusCfInfo(
|
|
|
|
ColumnFamilyData* cfd) const {
|
2014-11-21 06:13:18 +01:00
|
|
|
if (db_options_.enable_thread_tracking) {
|
2014-12-22 21:20:17 +01:00
|
|
|
ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
|
2014-11-21 06:13:18 +01:00
|
|
|
}
|
2014-11-20 19:49:32 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::EraseThreadStatusDbInfo() const {
|
2014-11-21 06:13:18 +01:00
|
|
|
if (db_options_.enable_thread_tracking) {
|
2014-12-22 21:20:17 +01:00
|
|
|
ThreadStatusUtil::EraseDatabaseInfo(this);
|
2014-11-21 06:13:18 +01:00
|
|
|
}
|
2014-11-20 19:49:32 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
void DBImpl::NewThreadStatusCfInfo(
|
|
|
|
ColumnFamilyData* cfd) const {
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::EraseThreadStatusCfInfo(
|
|
|
|
ColumnFamilyData* cfd) const {
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::EraseThreadStatusDbInfo() const {
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_USING_THREAD_STATUS
|
|
|
|
|
2012-08-21 09:33:21 +02:00
|
|
|
//
|
|
|
|
// A global method that can dump out the build version
|
2014-10-07 19:40:57 +02:00
|
|
|
void DumpRocksDBBuildVersion(Logger * log) {
|
2014-04-11 19:19:58 +02:00
|
|
|
#if !defined(IOS_CROSS_COMPILE)
|
2014-10-07 19:40:57 +02:00
|
|
|
// if we compile with Xcode, we don't run build_detect_vesion, so we don't
|
|
|
|
// generate util/build_version.cc
|
2015-09-16 20:31:45 +02:00
|
|
|
Header(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR,
|
|
|
|
ROCKSDB_PATCH);
|
|
|
|
Header(log, "Git sha %s", rocksdb_build_git_sha);
|
|
|
|
Header(log, "Compile date %s", rocksdb_build_compile_date);
|
2014-04-11 19:19:58 +02:00
|
|
|
#endif
|
2012-08-21 09:33:21 +02:00
|
|
|
}
|
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
|
|
|
|
bool include_history) {
|
|
|
|
// Find the earliest sequence number that we know we can rely on reading
|
|
|
|
// from the memtable without needing to check sst files.
|
|
|
|
SequenceNumber earliest_seq =
|
|
|
|
sv->imm->GetEarliestSequenceNumber(include_history);
|
|
|
|
if (earliest_seq == kMaxSequenceNumber) {
|
|
|
|
earliest_seq = sv->mem->GetEarliestSequenceNumber();
|
|
|
|
}
|
|
|
|
assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
|
|
|
|
|
|
|
|
return earliest_seq;
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
|
|
|
bool cache_only, SequenceNumber* seq,
|
|
|
|
bool* found_record_for_key) {
|
2015-05-29 23:36:35 +02:00
|
|
|
Status s;
|
|
|
|
MergeContext merge_context;
|
|
|
|
|
|
|
|
SequenceNumber current_seq = versions_->LastSequence();
|
|
|
|
LookupKey lkey(key, current_seq);
|
|
|
|
|
|
|
|
*seq = kMaxSequenceNumber;
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
*found_record_for_key = false;
|
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
// Check if there is a record for this key in the latest memtable
|
2015-12-22 01:57:04 +01:00
|
|
|
sv->mem->Get(lkey, nullptr, &s, &merge_context, seq);
|
2015-05-29 23:36:35 +02:00
|
|
|
|
|
|
|
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
|
|
|
// unexpected error reading memtable.
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Unexpected status returned from MemTable::Get: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*seq != kMaxSequenceNumber) {
|
|
|
|
// Found a sequence number, no need to check immutable memtables
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
*found_record_for_key = true;
|
2015-05-29 23:36:35 +02:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if there is a record for this key in the immutable memtables
|
2015-12-22 01:57:04 +01:00
|
|
|
sv->imm->Get(lkey, nullptr, &s, &merge_context, seq);
|
2015-05-29 23:36:35 +02:00
|
|
|
|
|
|
|
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
|
|
|
// unexpected error reading memtable.
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Unexpected status returned from MemTableList::Get: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*seq != kMaxSequenceNumber) {
|
|
|
|
// Found a sequence number, no need to check memtable history
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
*found_record_for_key = true;
|
2015-05-29 23:36:35 +02:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if there is a record for this key in the immutable memtables
|
2015-12-22 01:57:04 +01:00
|
|
|
sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, seq);
|
2015-05-29 23:36:35 +02:00
|
|
|
|
|
|
|
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
|
|
|
// unexpected error reading memtable.
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Unexpected status returned from MemTableList::GetFromHistory: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
if (*seq != kMaxSequenceNumber) {
|
|
|
|
// Found a sequence number, no need to check SST files
|
|
|
|
*found_record_for_key = true;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO(agiardullo): possible optimization: consider checking cached
|
|
|
|
// SST files if cache_only=true?
|
|
|
|
if (!cache_only) {
|
|
|
|
// Check tables
|
|
|
|
ReadOptions read_options;
|
|
|
|
|
2015-12-22 01:57:04 +01:00
|
|
|
sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-16 01:37:15 +02:00
|
|
|
nullptr /* value_found */, found_record_for_key, seq);
|
|
|
|
|
|
|
|
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
|
|
|
// unexpected error reading SST files
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Unexpected status returned from Version::Get: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-05-29 23:36:35 +02:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
} // namespace rocksdb
|