rocksdb/db/db_impl/db_impl.cc

4457 lines
156 KiB
C++
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_impl/db_impl.h"
#include <stdint.h>
2016-01-19 07:17:31 +01:00
#ifdef OS_SOLARIS
#include <alloca.h>
2016-01-19 07:17:31 +01:00
#endif
#include <algorithm>
add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5
2019-07-15 20:39:18 +02:00
#include <cinttypes>
#include <cstdio>
#include <map>
#include <set>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "db/arena_wrapped_db_iter.h"
#include "db/builder.h"
#include "db/compaction/compaction_job.h"
#include "db/db_info_dumper.h"
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/error_handler.h"
#include "db/event_helpers.h"
#include "db/external_sst_file_ingestion_job.h"
#include "db/flush_job.h"
#include "db/forward_iterator.h"
#include "db/import_column_family_job.h"
#include "db/job_context.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
db: avoid `#include`ing malloc and jemalloc simultaneously Summary: This fixes a compilation failure on Linux when the system libc is not glibc. jemalloc's configure script incorrectly assumes that glibc is always used on Linux systems, producing glibc-style signatures; when the system libc is e.g. musl, the following error is observed: ``` [ 0%] Building CXX object CMakeFiles/rocksdb.dir/db/db_impl.cc.o In file included from /go/src/github.com/cockroachdb/cockroach/c-deps/rocksdb.src/table/block.h:19:0, from /go/src/github.com/cockroachdb/cockroach/c-deps/rocksdb.src/db/db_impl.cc:77: /x-tools/x86_64-unknown-linux-musl/x86_64-unknown-linux-musl/sysroot/usr/include/malloc.h:19:8: error: declaration of 'size_t malloc_usable_size(void*)' has a different exception specifier size_t malloc_usable_size(void *); ^~~~~~~~~~~~~~~~~~ In file included from /go/src/github.com/cockroachdb/cockroach/c-deps/rocksdb.src/db/db_impl.cc:20:0: /go/native/x86_64-unknown-linux-musl/jemalloc/include/jemalloc/jemalloc.h:78:33: note: from previous declaration 'size_t malloc_usable_size(void*) throw ()' # define je_malloc_usable_size malloc_usable_size ^ /go/native/x86_64-unknown-linux-musl/jemalloc/include/jemalloc/jemalloc.h:239:41: note: in expansion of macro 'je_malloc_usable_size' JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( ^~~~~~~~~~~~~~~~~~~~~ CMakeFiles/rocksdb.dir/build.make:350: recipe for target 'CMakeFiles/rocksdb.dir/db/db_impl.cc.o' failed ``` This works around the issue by rearranging the sources such that jemalloc's headers are never in the same scope as the system's malloc header. The jemalloc issue has been reported as well, see: https://github.com/jemalloc/jemalloc/issues/778. cc tschottdorf Closes https://github.com/facebook/rocksdb/pull/2188 Differential Revision: D5163048 Pulled By: siying fbshipit-source-id: c553125458892def175c1be5682b0330d80b2a0d
2017-06-01 07:41:44 +02:00
#include "db/malloc_stats.h"
#include "db/memtable.h"
2013-12-11 04:03:13 +01:00
#include "db/memtable_list.h"
#include "db/merge_context.h"
#include "db/merge_helper.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/table_cache.h"
#include "db/table_properties_collector.h"
#include "db/transaction_log_impl.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "db/write_callback.h"
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
#include "env/composite_env_wrapper.h"
#include "file/file_util.h"
#include "file/filename.h"
#include "file/random_access_file_reader.h"
#include "file/sst_file_manager_impl.h"
#include "logging/auto_roll_logger.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "memtable/hash_linklist_rep.h"
#include "memtable/hash_skiplist_rep.h"
#include "monitoring/in_memory_stats_history.h"
#include "monitoring/iostats_context_imp.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/persistent_stats_history.h"
#include "monitoring/thread_status_updater.h"
#include "monitoring/thread_status_util.h"
#include "options/cf_options.h"
#include "options/options_helper.h"
#include "options/options_parser.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/statistics.h"
#include "rocksdb/stats_history.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_factory.h"
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
#include "table/get_context.h"
#include "table/merging_iterator.h"
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
#include "table/multiget_context.h"
#include "table/table_builder.h"
#include "table/two_level_iterator.h"
#include "test_util/sync_point.h"
#include "tools/sst_dump_tool_imp.h"
#include "util/autovector.h"
#include "util/build_version.h"
#include "util/cast_util.h"
#include "util/coding.h"
#include "util/compression.h"
#include "util/crc32c.h"
#include "util/mutexlock.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
namespace rocksdb {
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
const std::string kDefaultColumnFamilyName("default");
const std::string kPersistentStatsColumnFamilyName(
"___rocksdb_stats_history___");
void DumpRocksDBBuildVersion(Logger* log);
CompressionType GetCompressionFlush(
const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options) {
// Compressing memtable flushes might not help unless the sequential load
// optimization is used for leveled compaction. Otherwise the CPU and
// latency overhead is not offset by saving much space.
if (ioptions.compaction_style == kCompactionStyleUniversal) {
if (mutable_cf_options.compaction_options_universal
.compression_size_percent < 0) {
return mutable_cf_options.compression;
} else {
return kNoCompression;
}
} else if (!ioptions.compression_per_level.empty()) {
// For leveled compress when min_level_to_compress != 0.
return ioptions.compression_per_level[0];
} else {
return mutable_cf_options.compression;
}
}
namespace {
void DumpSupportInfo(Logger* logger) {
ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
for (auto& compression : OptionsHelper::compression_type_string_map) {
if (compression.second != kNoCompression &&
compression.second != kDisableCompressionOption) {
ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
CompressionTypeSupported(compression.second));
}
}
ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
crc32c::IsFastCrc32Supported().c_str());
}
} // namespace
DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
const bool seq_per_batch, const bool batch_per_txn)
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
: dbname_(dbname),
own_info_log_(options.info_log == nullptr),
initial_db_options_(SanitizeOptions(dbname, options)),
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
env_(initial_db_options_.env),
fs_(initial_db_options_.file_system),
immutable_db_options_(initial_db_options_),
mutable_db_options_(initial_db_options_),
stats_(immutable_db_options_.statistics.get()),
mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS,
immutable_db_options_.use_adaptive_mutex),
Support for single-primary, multi-secondary instances (#4899) Summary: This PR allows RocksDB to run in single-primary, multi-secondary process mode. The writer is a regular RocksDB (e.g. an `DBImpl`) instance playing the role of a primary. Multiple `DBImplSecondary` processes (secondaries) share the same set of SST files, MANIFEST, WAL files with the primary. Secondaries tail the MANIFEST of the primary and apply updates to their own in-memory state of the file system, e.g. `VersionStorageInfo`. This PR has several components: 1. (Originally in #4745). Add a `PathNotFound` subcode to `IOError` to denote the failure when a secondary tries to open a file which has been deleted by the primary. 2. (Similar to #4602). Add `FragmentBufferedReader` to handle partially-read, trailing record at the end of a log from where future read can continue. 3. (Originally in #4710 and #4820). Add implementation of the secondary, i.e. `DBImplSecondary`. 3.1 Tail the primary's MANIFEST during recovery. 3.2 Tail the primary's MANIFEST during normal processing by calling `ReadAndApply`. 3.3 Tailing WAL will be in a future PR. 4. Add an example in 'examples/multi_processes_example.cc' to demonstrate the usage of secondary RocksDB instance in a multi-process setting. Instructions to run the example can be found at the beginning of the source code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4899 Differential Revision: D14510945 Pulled By: riversand963 fbshipit-source-id: 4ac1c5693e6012ad23f7b4b42d3c374fecbe8886
2019-03-27 00:41:31 +01:00
default_cf_handle_(nullptr),
max_total_in_memory_state_(0),
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
file_options_, immutable_db_options_)),
seq_per_batch_(seq_per_batch),
batch_per_txn_(batch_per_txn),
Support for single-primary, multi-secondary instances (#4899) Summary: This PR allows RocksDB to run in single-primary, multi-secondary process mode. The writer is a regular RocksDB (e.g. an `DBImpl`) instance playing the role of a primary. Multiple `DBImplSecondary` processes (secondaries) share the same set of SST files, MANIFEST, WAL files with the primary. Secondaries tail the MANIFEST of the primary and apply updates to their own in-memory state of the file system, e.g. `VersionStorageInfo`. This PR has several components: 1. (Originally in #4745). Add a `PathNotFound` subcode to `IOError` to denote the failure when a secondary tries to open a file which has been deleted by the primary. 2. (Similar to #4602). Add `FragmentBufferedReader` to handle partially-read, trailing record at the end of a log from where future read can continue. 3. (Originally in #4710 and #4820). Add implementation of the secondary, i.e. `DBImplSecondary`. 3.1 Tail the primary's MANIFEST during recovery. 3.2 Tail the primary's MANIFEST during normal processing by calling `ReadAndApply`. 3.3 Tailing WAL will be in a future PR. 4. Add an example in 'examples/multi_processes_example.cc' to demonstrate the usage of secondary RocksDB instance in a multi-process setting. Instructions to run the example can be found at the beginning of the source code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4899 Differential Revision: D14510945 Pulled By: riversand963 fbshipit-source-id: 4ac1c5693e6012ad23f7b4b42d3c374fecbe8886
2019-03-27 00:41:31 +01:00
db_lock_(nullptr),
shutting_down_(false),
manual_compaction_paused_(false),
bg_cv_(&mutex_),
logfile_number_(0),
log_dir_synced_(false),
log_empty_(true),
persist_stats_cf_handle_(nullptr),
log_sync_cv_(&mutex_),
total_log_size_(0),
is_snapshot_supported_(true),
write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
write_thread_(immutable_db_options_),
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
nonmem_write_thread_(immutable_db_options_),
write_controller_(mutable_db_options_.delayed_write_rate),
last_batch_group_size_(0),
Rewritten system for scheduling background work Summary: When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue. The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction. Here are the performance results: Command: ./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333 Before the patch: fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s After the patch: fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got: fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s Test Plan: make check two stress tests: Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 max_background_flushes=0, to verify that this case also works correctly ./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
unscheduled_flushes_(0),
unscheduled_compactions_(0),
Introduce bottom-pri thread pool for large universal compactions Summary: When we had a single thread pool for compactions, a thread could be busy for a long time (minutes) executing a compaction involving the bottom level. In multi-instance setups, the entire thread pool could be consumed by such bottom-level compactions. Then, top-level compactions (e.g., a few L0 files) would be blocked for a long time ("head-of-line blocking"). Such top-level compactions are critical to prevent compaction stalls as they can quickly reduce number of L0 files / sorted runs. This diff introduces a bottom-priority queue for universal compactions including the bottom level. This alleviates the head-of-line blocking situation for fast, top-level compactions. - Added `Env::Priority::BOTTOM` thread pool. This feature is only enabled if user explicitly configures it to have a positive number of threads. - Changed `ThreadPoolImpl`'s default thread limit from one to zero. This change is invisible to users as we call `IncBackgroundThreadsIfNeeded` on the low-pri/high-pri pools during `DB::Open` with values of at least one. It is necessary, though, for bottom-pri to start with zero threads so the feature is disabled by default. - Separated `ManualCompaction` into two parts in `PrepickedCompaction`. `PrepickedCompaction` is used for any compaction that's picked outside of its execution thread, either manual or automatic. - Forward universal compactions involving last level to the bottom pool (worker thread's entry point is `BGWorkBottomCompaction`). - Track `bg_bottom_compaction_scheduled_` so we can wait for bottom-level compactions to finish. We don't count them against the background jobs limits. So users of this feature will get an extra compaction for free. Closes https://github.com/facebook/rocksdb/pull/2580 Differential Revision: D5422916 Pulled By: ajkr fbshipit-source-id: a74bd11f1ea4933df3739b16808bb21fcd512333
2017-08-04 00:36:28 +02:00
bg_bottom_compaction_scheduled_(0),
bg_compaction_scheduled_(0),
num_running_compactions_(0),
bg_flush_scheduled_(0),
num_running_flushes_(0),
bg_purge_scheduled_(0),
disable_delete_obsolete_files_(0),
pending_purge_obsolete_files_(0),
delete_obsolete_files_last_run_(env_->NowMicros()),
last_stats_dump_time_microsec_(0),
next_job_id_(1),
has_unpersisted_data_(false),
Skip deleted WALs during recovery Summary: This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic. Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction) This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2. Closes https://github.com/facebook/rocksdb/pull/3765 Differential Revision: D7747618 Pulled By: siying fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
unable_to_release_oldest_log_(false),
num_running_ingest_file_(0),
#ifndef ROCKSDB_LITE
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
wal_manager_(immutable_db_options_, file_options_, seq_per_batch),
#endif // ROCKSDB_LITE
event_logger_(immutable_db_options_.info_log.get()),
bg_work_paused_(0),
bg_compaction_paused_(0),
refitting_level_(false),
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
opened_successfully_(false),
two_write_queues_(options.two_write_queues),
manual_wal_flush_(options.manual_wal_flush),
// last_sequencee_ is always maintained by the main queue that also writes
// to the memtable. When two_write_queues_ is disabled last seq in
// memtable is the same as last seq published to the readers. When it is
// enabled but seq_per_batch_ is disabled, last seq in memtable still
// indicates last published seq since wal-only writes that go to the 2nd
// queue do not consume a sequence number. Otherwise writes performed by
// the 2nd queue could change what is visible to the readers. In this
// cases, last_seq_same_as_publish_seq_==false, the 2nd queue maintains a
// separate variable to indicate the last published sequence.
last_seq_same_as_publish_seq_(
!(seq_per_batch && options.two_write_queues)),
// Since seq_per_batch_ is currently set only by WritePreparedTxn which
// requires a custom gc for compaction, we use that to set use_custom_gc_
// as well.
use_custom_gc_(seq_per_batch),
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
shutdown_initiated_(false),
own_sfm_(options.sst_file_manager == nullptr),
preserve_deletes_(options.preserve_deletes),
closed_(false),
error_handler_(this, immutable_db_options_, &mutex_),
atomic_flush_install_cv_(&mutex_) {
// !batch_per_trx_ implies seq_per_batch_ because it is only unset for
// WriteUnprepared, which should use seq_per_batch_.
assert(batch_per_txn_ || seq_per_batch_);
env_->GetAbsolutePath(dbname, &db_absolute_path_);
// Reserve ten files or so for other uses and give the rest to TableCache.
// Give a large number for setting of "infinite" open files.
const int table_cache_size = (mutable_db_options_.max_open_files == -1)
? TableCache::kInfiniteCapacity
: mutable_db_options_.max_open_files - 10;
LRUCacheOptions co;
co.capacity = table_cache_size;
co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
co.metadata_charge_policy = kDontChargeCacheMetadata;
table_cache_ = NewLRUCache(co);
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
table_cache_.get(), write_buffer_manager_,
&write_controller_, &block_cache_tracer_));
support for concurrent adds to memtable Summary: This diff adds support for concurrent adds to the skiplist memtable implementations. Memory allocation is made thread-safe by the addition of a spinlock, with small per-core buffers to avoid contention. Concurrent memtable writes are made via an additional method and don't impose a performance overhead on the non-concurrent case, so parallelism can be selected on a per-batch basis. Write thread synchronization is an increasing bottleneck for higher levels of concurrency, so this diff adds --enable_write_thread_adaptive_yield (default off). This feature causes threads joining a write batch group to spin for a short time (default 100 usec) using sched_yield, rather than going to sleep on a mutex. If the timing of the yield calls indicates that another thread has actually run during the yield then spinning is avoided. This option improves performance for concurrent situations even without parallel adds, although it has the potential to increase CPU usage (and the heuristic adaptation is not yet mature). Parallel writes are not currently compatible with inplace updates, update callbacks, or delete filtering. Enable it with --allow_concurrent_memtable_write (and --enable_write_thread_adaptive_yield). Parallel memtable writes are performance neutral when there is no actual parallelism, and in my experiments (SSD server-class Linux and varying contention and key sizes for fillrandom) they are always a performance win when there is more than one thread. Statistics are updated earlier in the write path, dropping the number of DB mutex acquisitions from 2 to 1 for almost all cases. This diff was motivated and inspired by Yahoo's cLSM work. It is more conservative than cLSM: RocksDB's write batch group leader role is preserved (along with all of the existing flush and write throttling logic) and concurrent writers are blocked until all memtable insertions have completed and the sequence number has been advanced, to preserve linearizability. My test config is "db_bench -benchmarks=fillrandom -threads=$T -batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T -level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999 -disable_auto_compactions --max_write_buffer_number=8 -max_background_flushes=8 --disable_wal --write_buffer_size=160000000 --block_size=16384 --allow_concurrent_memtable_write" on a two-socket Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1 thread I get ~440Kops/sec. Peak performance for 1 socket (numactl -N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance across both sockets happens at 30 threads, and is ~900Kops/sec, although with fewer threads there is less performance loss when the system has background work. Test Plan: 1. concurrent stress tests for InlineSkipList and DynamicBloom 2. make clean; make check 3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench 4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench 5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench 6. make clean; OPT=-DROCKSDB_LITE make check 7. verify no perf regressions when disabled Reviewers: igor, sdong Reviewed By: sdong Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
column_family_memtables_.reset(
new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
DumpDBFileSummary(immutable_db_options_, dbname_);
immutable_db_options_.Dump(immutable_db_options_.info_log.get());
mutable_db_options_.Dump(immutable_db_options_.info_log.get());
DumpSupportInfo(immutable_db_options_.info_log.get());
Added support for differential snapshots Summary: The motivation for this PR is to add to RocksDB support for differential (incremental) snapshots, as snapshot of the DB changes between two points in time (one can think of it as diff between to sequence numbers, or the diff D which can be thought of as an SST file or just set of KVs that can be applied to sequence number S1 to get the database to the state at sequence number S2). This feature would be useful for various distributed storages layers built on top of RocksDB, as it should help reduce resources (time and network bandwidth) needed to recover and rebuilt DB instances as replicas in the context of distributed storages. From the API standpoint that would like client app requesting iterator between (start seqnum) and current DB state, and reading the "diff". This is a very draft PR for initial review in the discussion on the approach, i'm going to rework some parts and keep updating the PR. For now, what's done here according to initial discussions: Preserving deletes: - We want to be able to optionally preserve recent deletes for some defined period of time, so that if a delete came in recently and might need to be included in the next incremental snapshot it would't get dropped by a compaction. This is done by adding new param to Options (preserve deletes flag) and new variable to DB Impl where we keep track of the sequence number after which we don't want to drop tombstones, even if they are otherwise eligible for deletion. - I also added a new API call for clients to be able to advance this cutoff seqnum after which we drop deletes; i assume it's more flexible to let clients control this, since otherwise we'd need to keep some kind of timestamp < -- > seqnum mapping inside the DB, which sounds messy and painful to support. Clients could make use of it by periodically calling GetLatestSequenceNumber(), noting the timestamp, doing some calculation and figuring out by how much we need to advance the cutoff seqnum. - Compaction codepath in compaction_iterator.cc has been modified to avoid dropping tombstones with seqnum > cutoff seqnum. Iterator changes: - couple params added to ReadOptions, to optionally allow client to request internal keys instead of user keys (so that client can get the latest value of a key, be it delete marker or a put), as well as min timestamp and min seqnum. TableCache changes: - I modified table_cache code to be able to quickly exclude SST files from iterators heep if creation_time on the file is less then iter_start_ts as passed in ReadOptions. That would help a lot in some DB settings (like reading very recent data only or using FIFO compactions), but not so much for universal compaction with more or less long iterator time span. What's left: - Still looking at how to best plug that inside DBIter codepath. So far it seems that FindNextUserKeyInternal only parses values as UserKeys, and iter->key() call generally returns user key. Can we add new API to DBIter as internal_key(), and modify this internal method to optionally set saved_key_ to point to the full internal key? I don't need to store actual seqnum there, but I do need to store type. Closes https://github.com/facebook/rocksdb/pull/2999 Differential Revision: D6175602 Pulled By: mikhail-antonov fbshipit-source-id: c779a6696ee2d574d86c69cec866a3ae095aa900
2017-11-02 02:43:29 +01:00
// always open the DB with 0 here, which means if preserve_deletes_==true
// we won't drop any deletion markers until SetPreserveDeletesSequenceNumber()
// is called by client and this seqnum is advanced.
preserve_deletes_seqnum_.store(0);
}
Status DBImpl::Resume() {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
InstrumentedMutexLock db_mutex(&mutex_);
if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
// Nothing to do
return Status::OK();
}
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
if (error_handler_.IsRecoveryInProgress()) {
// Don't allow a mix of manual and automatic recovery
return Status::Busy();
}
mutex_.Unlock();
Status s = error_handler_.RecoverFromBGError(true);
mutex_.Lock();
return s;
}
// This function implements the guts of recovery from a background error. It
// is eventually called for both manual as well as automatic recovery. It does
// the following -
// 1. Wait for currently scheduled background flush/compaction to exit, in
// order to inadvertently causing an error and thinking recovery failed
// 2. Flush memtables if there's any data for all the CFs. This may result
// another error, which will be saved by error_handler_ and reported later
// as the recovery status
// 3. Find and delete any obsolete files
// 4. Schedule compactions if needed for all the CFs. This is needed as the
// flush in the prior step might have been a no-op for some CFs, which
// means a new super version wouldn't have been installed
Status DBImpl::ResumeImpl() {
mutex_.AssertHeld();
WaitForBackgroundWork();
Status bg_error = error_handler_.GetBGError();
Status s;
if (shutdown_initiated_) {
// Returning shutdown status to SFM during auto recovery will cause it
// to abort the recovery and allow the shutdown to progress
s = Status::ShutdownInProgress();
}
if (s.ok() && bg_error.severity() > Status::Severity::kHardError) {
ROCKS_LOG_INFO(
immutable_db_options_.info_log,
"DB resume requested but failed due to Fatal/Unrecoverable error");
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
s = bg_error;
}
// We cannot guarantee consistency of the WAL. So force flush Memtables of
// all the column families
if (s.ok()) {
FlushOptions flush_opts;
// We allow flush to stall write since we are trying to resume from error.
flush_opts.allow_write_stall = true;
if (immutable_db_options_.atomic_flush) {
autovector<ColumnFamilyData*> cfds;
SelectColumnFamiliesForAtomicFlush(&cfds);
mutex_.Unlock();
s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery);
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
cfd->Ref();
mutex_.Unlock();
s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery);
mutex_.Lock();
cfd->UnrefAndTryDelete();
if (!s.ok()) {
break;
}
}
}
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
if (!s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"DB resume requested but failed due to Flush failure [%s]",
s.ToString().c_str());
}
}
JobContext job_context(0);
FindObsoleteFiles(&job_context, true);
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
if (s.ok()) {
s = error_handler_.ClearBGError();
}
mutex_.Unlock();
job_context.manifest_file_number = 1;
if (job_context.HaveSomethingToDelete()) {
PurgeObsoleteFiles(job_context);
}
job_context.Clean();
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
if (s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
}
mutex_.Lock();
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
// Check for shutdown again before scheduling further compactions,
// since we released and re-acquired the lock above
if (shutdown_initiated_) {
s = Status::ShutdownInProgress();
}
if (s.ok()) {
for (auto cfd : *versions_->GetColumnFamilySet()) {
SchedulePendingCompaction(cfd);
}
MaybeScheduleFlushOrCompaction();
}
// Wake up any waiters - in this case, it could be the shutdown thread
bg_cv_.SignalAll();
// No need to check BGError again. If something happened, event listener would
// be notified and the operation causing it would have failed
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
return s;
}
void DBImpl::WaitForBackgroundWork() {
// Wait for background work to finish
while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
bg_flush_scheduled_) {
bg_cv_.Wait();
}
}
// Will lock the mutex_, will wait for completion if wait is true
void DBImpl::CancelAllBackgroundWork(bool wait) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Shutdown: canceling all background work");
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
if (thread_dump_stats_ != nullptr) {
thread_dump_stats_->cancel();
thread_dump_stats_.reset();
}
if (thread_persist_stats_ != nullptr) {
thread_persist_stats_->cancel();
thread_persist_stats_.reset();
}
InstrumentedMutexLock l(&mutex_);
if (!shutting_down_.load(std::memory_order_acquire) &&
has_unpersisted_data_.load(std::memory_order_relaxed) &&
!mutable_db_options_.avoid_flush_during_shutdown) {
if (immutable_db_options_.atomic_flush) {
autovector<ColumnFamilyData*> cfds;
SelectColumnFamiliesForAtomicFlush(&cfds);
mutex_.Unlock();
AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
cfd->Ref();
mutex_.Unlock();
FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
mutex_.Lock();
cfd->UnrefAndTryDelete();
}
}
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
}
Persist data during user initiated shutdown Summary: Move the manual memtable flush for databases containing data that has bypassed the WAL from DBImpl's destructor to CancleAllBackgroundWork(). CancelAllBackgroundWork() is a publicly exposed API which allows async operations performed by background threads to be disabled on a database. In effect, this places the database into a "shutdown" state in advance of calling the database object's destructor. No compactions or flushing of SST files can occur once a call to this API completes. When writes are issued to a database with WriteOptions::disableWAL set to true, DBImpl::has_unpersisted_data_ is set so that memtables can be flushed when the database object is destroyed. If CancelAllBackgroundWork() has been called prior to DBImpl's destructor, this flush operation is not possible and is skipped, causing unnecessary loss of data. Since CancelAllBackgroundWork() is already invoked by DBImpl's destructor in order to perform the thread join portion of its cleanup processing, moving the manual memtable flush to CancelAllBackgroundWork() ensures data is persisted regardless of client behavior. Test Plan: Write an amount of data that will not cause a memtable flush to a rocksdb database with all writes marked with WriteOptions::disableWAL. Properly "close" the database. Reopen database and verify that the data was persisted. Reviewers: IslamAbdelRahman, yiwu, yoshinorim, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D62277
2016-08-25 21:24:22 +02:00
shutting_down_.store(true, std::memory_order_release);
bg_cv_.SignalAll();
if (!wait) {
return;
}
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
WaitForBackgroundWork();
Persist data during user initiated shutdown Summary: Move the manual memtable flush for databases containing data that has bypassed the WAL from DBImpl's destructor to CancleAllBackgroundWork(). CancelAllBackgroundWork() is a publicly exposed API which allows async operations performed by background threads to be disabled on a database. In effect, this places the database into a "shutdown" state in advance of calling the database object's destructor. No compactions or flushing of SST files can occur once a call to this API completes. When writes are issued to a database with WriteOptions::disableWAL set to true, DBImpl::has_unpersisted_data_ is set so that memtables can be flushed when the database object is destroyed. If CancelAllBackgroundWork() has been called prior to DBImpl's destructor, this flush operation is not possible and is skipped, causing unnecessary loss of data. Since CancelAllBackgroundWork() is already invoked by DBImpl's destructor in order to perform the thread join portion of its cleanup processing, moving the manual memtable flush to CancelAllBackgroundWork() ensures data is persisted regardless of client behavior. Test Plan: Write an amount of data that will not cause a memtable flush to a rocksdb database with all writes marked with WriteOptions::disableWAL. Properly "close" the database. Reopen database and verify that the data was persisted. Reviewers: IslamAbdelRahman, yiwu, yoshinorim, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D62277
2016-08-25 21:24:22 +02:00
}
Status DBImpl::CloseHelper() {
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
// Guarantee that there is no background error recovery in progress before
// continuing with the shutdown
mutex_.Lock();
shutdown_initiated_ = true;
error_handler_.CancelErrorRecovery();
while (error_handler_.IsRecoveryInProgress()) {
bg_cv_.Wait();
}
mutex_.Unlock();
// CancelAllBackgroundWork called with false means we just set the shutdown
// marker. After this we do a variant of the waiting and unschedule work
// (to consider: moving all the waiting into CancelAllBackgroundWork(true))
CancelAllBackgroundWork(false);
Introduce bottom-pri thread pool for large universal compactions Summary: When we had a single thread pool for compactions, a thread could be busy for a long time (minutes) executing a compaction involving the bottom level. In multi-instance setups, the entire thread pool could be consumed by such bottom-level compactions. Then, top-level compactions (e.g., a few L0 files) would be blocked for a long time ("head-of-line blocking"). Such top-level compactions are critical to prevent compaction stalls as they can quickly reduce number of L0 files / sorted runs. This diff introduces a bottom-priority queue for universal compactions including the bottom level. This alleviates the head-of-line blocking situation for fast, top-level compactions. - Added `Env::Priority::BOTTOM` thread pool. This feature is only enabled if user explicitly configures it to have a positive number of threads. - Changed `ThreadPoolImpl`'s default thread limit from one to zero. This change is invisible to users as we call `IncBackgroundThreadsIfNeeded` on the low-pri/high-pri pools during `DB::Open` with values of at least one. It is necessary, though, for bottom-pri to start with zero threads so the feature is disabled by default. - Separated `ManualCompaction` into two parts in `PrepickedCompaction`. `PrepickedCompaction` is used for any compaction that's picked outside of its execution thread, either manual or automatic. - Forward universal compactions involving last level to the bottom pool (worker thread's entry point is `BGWorkBottomCompaction`). - Track `bg_bottom_compaction_scheduled_` so we can wait for bottom-level compactions to finish. We don't count them against the background jobs limits. So users of this feature will get an extra compaction for free. Closes https://github.com/facebook/rocksdb/pull/2580 Differential Revision: D5422916 Pulled By: ajkr fbshipit-source-id: a74bd11f1ea4933df3739b16808bb21fcd512333
2017-08-04 00:36:28 +02:00
int bottom_compactions_unscheduled =
env_->UnSchedule(this, Env::Priority::BOTTOM);
int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
Status ret;
mutex_.Lock();
Introduce bottom-pri thread pool for large universal compactions Summary: When we had a single thread pool for compactions, a thread could be busy for a long time (minutes) executing a compaction involving the bottom level. In multi-instance setups, the entire thread pool could be consumed by such bottom-level compactions. Then, top-level compactions (e.g., a few L0 files) would be blocked for a long time ("head-of-line blocking"). Such top-level compactions are critical to prevent compaction stalls as they can quickly reduce number of L0 files / sorted runs. This diff introduces a bottom-priority queue for universal compactions including the bottom level. This alleviates the head-of-line blocking situation for fast, top-level compactions. - Added `Env::Priority::BOTTOM` thread pool. This feature is only enabled if user explicitly configures it to have a positive number of threads. - Changed `ThreadPoolImpl`'s default thread limit from one to zero. This change is invisible to users as we call `IncBackgroundThreadsIfNeeded` on the low-pri/high-pri pools during `DB::Open` with values of at least one. It is necessary, though, for bottom-pri to start with zero threads so the feature is disabled by default. - Separated `ManualCompaction` into two parts in `PrepickedCompaction`. `PrepickedCompaction` is used for any compaction that's picked outside of its execution thread, either manual or automatic. - Forward universal compactions involving last level to the bottom pool (worker thread's entry point is `BGWorkBottomCompaction`). - Track `bg_bottom_compaction_scheduled_` so we can wait for bottom-level compactions to finish. We don't count them against the background jobs limits. So users of this feature will get an extra compaction for free. Closes https://github.com/facebook/rocksdb/pull/2580 Differential Revision: D5422916 Pulled By: ajkr fbshipit-source-id: a74bd11f1ea4933df3739b16808bb21fcd512333
2017-08-04 00:36:28 +02:00
bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled;
bg_compaction_scheduled_ -= compactions_unscheduled;
bg_flush_scheduled_ -= flushes_unscheduled;
// Wait for background work to finish
Introduce bottom-pri thread pool for large universal compactions Summary: When we had a single thread pool for compactions, a thread could be busy for a long time (minutes) executing a compaction involving the bottom level. In multi-instance setups, the entire thread pool could be consumed by such bottom-level compactions. Then, top-level compactions (e.g., a few L0 files) would be blocked for a long time ("head-of-line blocking"). Such top-level compactions are critical to prevent compaction stalls as they can quickly reduce number of L0 files / sorted runs. This diff introduces a bottom-priority queue for universal compactions including the bottom level. This alleviates the head-of-line blocking situation for fast, top-level compactions. - Added `Env::Priority::BOTTOM` thread pool. This feature is only enabled if user explicitly configures it to have a positive number of threads. - Changed `ThreadPoolImpl`'s default thread limit from one to zero. This change is invisible to users as we call `IncBackgroundThreadsIfNeeded` on the low-pri/high-pri pools during `DB::Open` with values of at least one. It is necessary, though, for bottom-pri to start with zero threads so the feature is disabled by default. - Separated `ManualCompaction` into two parts in `PrepickedCompaction`. `PrepickedCompaction` is used for any compaction that's picked outside of its execution thread, either manual or automatic. - Forward universal compactions involving last level to the bottom pool (worker thread's entry point is `BGWorkBottomCompaction`). - Track `bg_bottom_compaction_scheduled_` so we can wait for bottom-level compactions to finish. We don't count them against the background jobs limits. So users of this feature will get an extra compaction for free. Closes https://github.com/facebook/rocksdb/pull/2580 Differential Revision: D5422916 Pulled By: ajkr fbshipit-source-id: a74bd11f1ea4933df3739b16808bb21fcd512333
2017-08-04 00:36:28 +02:00
while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
bg_flush_scheduled_ || bg_purge_scheduled_ ||
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 22:36:19 +02:00
pending_purge_obsolete_files_ ||
error_handler_.IsRecoveryInProgress()) {
TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
bg_cv_.Wait();
}
Fix race condition causing double deletion of ssts Summary: Possible interleaved execution of background compaction thread calling `FindObsoleteFiles (no full scan) / PurgeObsoleteFiles` and user thread calling `FindObsoleteFiles (full scan) / PurgeObsoleteFiles` can lead to race condition on which RocksDB attempts to delete a file twice. The second attempt will fail and return `IO error`. This may occur to other files, but this PR targets sst. Also add a unit test to verify that this PR fixes the issue. The newly added unit test `obsolete_files_test` has a test case for this scenario, implemented in `ObsoleteFilesTest#RaceForObsoleteFileDeletion`. `TestSyncPoint`s are used to coordinate the interleaving the `user_thread` and background compaction thread. They execute as follows ``` timeline user_thread background_compaction thread t1 | FindObsoleteFiles(full_scan=false) t2 | FindObsoleteFiles(full_scan=true) t3 | PurgeObsoleteFiles t4 | PurgeObsoleteFiles V ``` When `user_thread` invokes `FindObsoleteFiles` with full scan, it collects ALL files in RocksDB directory, including the ones that background compaction thread have collected in its job context. Then `user_thread` will see an IO error when trying to delete these files in `PurgeObsoleteFiles` because background compaction thread has already deleted the file in `PurgeObsoleteFiles`. To fix this, we make RocksDB remember which (SST) files have been found by threads after calling `FindObsoleteFiles` (see `DBImpl#files_grabbed_for_purge_`). Therefore, when another thread calls `FindObsoleteFiles` with full scan, it will not collect such files. ajkr could you take a look and comment? Thanks! Closes https://github.com/facebook/rocksdb/pull/3638 Differential Revision: D7384372 Pulled By: riversand963 fbshipit-source-id: 01489516d60012e722ee65a80e1449e589ce26d3
2018-03-28 19:23:31 +02:00
TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
&files_grabbed_for_purge_);
EraseThreadStatusDbInfo();
flush_scheduler_.Clear();
Refactor trimming logic for immutable memtables (#5022) Summary: MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory. We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one. The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming. In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022 Differential Revision: D14394062 Pulled By: miasantreble fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 22:54:09 +02:00
trim_history_scheduler_.Clear();
Rewritten system for scheduling background work Summary: When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue. The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction. Here are the performance results: Command: ./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333 Before the patch: fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s After the patch: fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got: fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s Test Plan: make check two stress tests: Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 max_background_flushes=0, to verify that this case also works correctly ./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
while (!flush_queue_.empty()) {
const FlushRequest& flush_req = PopFirstFromFlushQueue();
for (const auto& iter : flush_req) {
iter.first->UnrefAndTryDelete();
Rewritten system for scheduling background work Summary: When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue. The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction. Here are the performance results: Command: ./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333 Before the patch: fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s After the patch: fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got: fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s Test Plan: make check two stress tests: Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 max_background_flushes=0, to verify that this case also works correctly ./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
}
}
while (!compaction_queue_.empty()) {
auto cfd = PopFirstFromCompactionQueue();
cfd->UnrefAndTryDelete();
Rewritten system for scheduling background work Summary: When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue. The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction. Here are the performance results: Command: ./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333 Before the patch: fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s After the patch: fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got: fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s Test Plan: make check two stress tests: Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 max_background_flushes=0, to verify that this case also works correctly ./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
}
if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
// we need to delete handle outside of lock because it does its own locking
mutex_.Unlock();
if (default_cf_handle_) {
delete default_cf_handle_;
default_cf_handle_ = nullptr;
}
if (persist_stats_cf_handle_) {
delete persist_stats_cf_handle_;
persist_stats_cf_handle_ = nullptr;
}
mutex_.Lock();
}
// Clean up obsolete files due to SuperVersion release.
// (1) Need to delete to obsolete files before closing because RepairDB()
// scans all existing files in the file system and builds manifest file.
// Keeping obsolete files confuses the repair process.
// (2) Need to check if we Open()/Recover() the DB successfully before
// deleting because if VersionSet recover fails (may be due to corrupted
// manifest file), it is not able to identify live files correctly. As a
// result, all "live" files can get deleted by accident. However, corrupted
// manifest is recoverable by RepairDB().
if (opened_successfully_) {
JobContext job_context(next_job_id_.fetch_add(1));
FindObsoleteFiles(&job_context, true);
mutex_.Unlock();
// manifest number starting from 2
job_context.manifest_file_number = 1;
if (job_context.HaveSomethingToDelete()) {
PurgeObsoleteFiles(job_context);
}
job_context.Clean();
mutex_.Lock();
}
for (auto l : logs_to_free_) {
delete l;
}
for (auto& log : logs_) {
uint64_t log_number = log.writer->get_log_number();
Status s = log.ClearWriter();
if (!s.ok()) {
ROCKS_LOG_WARN(
immutable_db_options_.info_log,
"Unable to Sync WAL file %s with error -- %s",
LogFileName(immutable_db_options_.wal_dir, log_number).c_str(),
s.ToString().c_str());
// Retain the first error
if (ret.ok()) {
ret = s;
}
}
}
logs_.clear();
// Table cache may have table handles holding blocks from the block cache.
// We need to release them before the block cache is destroyed. The block
// cache may be destroyed inside versions_.reset(), when column family data
// list is destroyed, so leaving handles in table cache after
// versions_.reset() may cause issues.
// Here we clean all unreferenced handles in table cache.
// Now we assume all user queries have finished, so only version set itself
// can possibly hold the blocks from block cache. After releasing unreferenced
// handles here, only handles held by version set left and inside
// versions_.reset(), we will release them. There, we need to make sure every
// time a handle is released, we erase it from the cache too. By doing that,
// we can guarantee that after versions_.reset(), table cache is empty
// so the cache can be safely destroyed.
table_cache_->EraseUnRefEntries();
for (auto& txn_entry : recovered_transactions_) {
delete txn_entry.second;
}
// versions need to be destroyed before table_cache since it can hold
// references to table_cache.
versions_.reset();
mutex_.Unlock();
if (db_lock_ != nullptr) {
env_->UnlockFile(db_lock_);
}
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
LogFlush(immutable_db_options_.info_log);
#ifndef ROCKSDB_LITE
// If the sst_file_manager was allocated by us during DB::Open(), ccall
// Close() on it before closing the info_log. Otherwise, background thread
// in SstFileManagerImpl might try to log something
if (immutable_db_options_.sst_file_manager && own_sfm_) {
auto sfm = static_cast<SstFileManagerImpl*>(
immutable_db_options_.sst_file_manager.get());
sfm->Close();
}
#endif // ROCKSDB_LITE
if (immutable_db_options_.info_log && own_info_log_) {
Status s = immutable_db_options_.info_log->Close();
if (ret.ok()) {
ret = s;
}
}
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
if (ret.IsAborted()) {
// Reserve IsAborted() error for those where users didn't release
// certain resource and they can release them and come back and
// retry. In this case, we wrap this exception to something else.
return Status::Incomplete(ret.ToString());
}
return ret;
}
Status DBImpl::CloseImpl() { return CloseHelper(); }
DBImpl::~DBImpl() {
if (!closed_) {
closed_ = true;
CloseHelper();
}
}
void DBImpl::MaybeIgnoreError(Status* s) const {
if (s->ok() || immutable_db_options_.paranoid_checks) {
// No change needed
} else {
ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
s->ToString().c_str());
*s = Status::OK();
}
}
const Status DBImpl::CreateArchivalDirectory() {
if (immutable_db_options_.wal_ttl_seconds > 0 ||
immutable_db_options_.wal_size_limit_mb > 0) {
std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir);
return env_->CreateDirIfMissing(archivalPath);
}
return Status::OK();
}
void DBImpl::PrintStatistics() {
auto dbstats = immutable_db_options_.statistics.get();
if (dbstats) {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
dbstats->ToString().c_str());
}
}
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
void DBImpl::StartTimedTasks() {
unsigned int stats_dump_period_sec = 0;
unsigned int stats_persist_period_sec = 0;
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
{
InstrumentedMutexLock l(&mutex_);
stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec;
if (stats_dump_period_sec > 0) {
if (!thread_dump_stats_) {
thread_dump_stats_.reset(new rocksdb::RepeatableThread(
[this]() { DBImpl::DumpStats(); }, "dump_st", env_,
static_cast<uint64_t>(stats_dump_period_sec) * kMicrosInSecond));
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
}
}
stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec;
if (stats_persist_period_sec > 0) {
if (!thread_persist_stats_) {
thread_persist_stats_.reset(new rocksdb::RepeatableThread(
[this]() { DBImpl::PersistStats(); }, "pst_st", env_,
static_cast<uint64_t>(stats_persist_period_sec) * kMicrosInSecond));
}
}
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
}
}
make internal stats independent of statistics Summary: also make it aware of column family output from db_bench ``` ** Compaction Stats [default] ** Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15 L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06 L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00 L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00 L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00 Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22 Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00 Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown ** DB Stats ** Uptime(secs): 202.1 total, 13.5 interval Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written Test Plan: ran it Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
// esitmate the total size of stats_history_
size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
size_t size_total =
sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
if (stats_history_.size() == 0) return size_total;
size_t size_per_slice =
sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
// non-empty map, stats_history_.begin() guaranteed to exist
std::map<std::string, uint64_t> sample_slice(stats_history_.begin()->second);
for (const auto& pairs : sample_slice) {
size_per_slice +=
pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
}
size_total = size_per_slice * stats_history_.size();
return size_total;
}
void DBImpl::PersistStats() {
TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
#ifndef ROCKSDB_LITE
if (shutdown_initiated_) {
return;
}
uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond;
Statistics* statistics = immutable_db_options_.statistics.get();
if (!statistics) {
return;
}
size_t stats_history_size_limit = 0;
{
InstrumentedMutexLock l(&mutex_);
stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
}
std::map<std::string, uint64_t> stats_map;
if (!statistics->getTickerMap(&stats_map)) {
return;
}
add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5
2019-07-15 20:39:18 +02:00
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"------- PERSISTING STATS -------");
if (immutable_db_options_.persist_stats_to_disk) {
WriteBatch batch;
if (stats_slice_initialized_) {
add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5
2019-07-15 20:39:18 +02:00
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Reading %" ROCKSDB_PRIszt " stats from statistics\n",
stats_slice_.size());
for (const auto& stat : stats_map) {
char key[100];
int length =
EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
// calculate the delta from last time
if (stats_slice_.find(stat.first) != stats_slice_.end()) {
uint64_t delta = stat.second - stats_slice_[stat.first];
batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)),
ToString(delta));
}
}
}
stats_slice_initialized_ = true;
std::swap(stats_slice_, stats_map);
WriteOptions wo;
wo.low_pri = true;
wo.no_slowdown = true;
wo.sync = false;
Status s = Write(wo, &batch);
if (!s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5
2019-07-15 20:39:18 +02:00
"Writing to persistent stats CF failed -- %s",
s.ToString().c_str());
add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5
2019-07-15 20:39:18 +02:00
} else {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
" to persistent stats CF succeeded",
stats_slice_.size(), now_seconds);
}
// TODO(Zhongyi): add purging for persisted data
} else {
InstrumentedMutexLock l(&stats_history_mutex_);
// calculate the delta from last time
if (stats_slice_initialized_) {
std::map<std::string, uint64_t> stats_delta;
for (const auto& stat : stats_map) {
if (stats_slice_.find(stat.first) != stats_slice_.end()) {
stats_delta[stat.first] = stat.second - stats_slice_[stat.first];
}
}
add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5
2019-07-15 20:39:18 +02:00
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
" to in-memory stats history",
stats_slice_.size(), now_seconds);
stats_history_[now_seconds] = stats_delta;
}
stats_slice_initialized_ = true;
std::swap(stats_slice_, stats_map);
TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");
// delete older stats snapshots to control memory consumption
add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5
2019-07-15 20:39:18 +02:00
size_t stats_history_size = EstimateInMemoryStatsHistorySize();
bool purge_needed = stats_history_size > stats_history_size_limit;
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
" bytes, slice count: %" ROCKSDB_PRIszt,
stats_history_size, stats_history_.size());
while (purge_needed && !stats_history_.empty()) {
stats_history_.erase(stats_history_.begin());
purge_needed =
EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
}
add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5
2019-07-15 20:39:18 +02:00
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
" bytes, slice count: %" ROCKSDB_PRIszt,
stats_history_size, stats_history_.size());
}
#endif // !ROCKSDB_LITE
}
bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time,
uint64_t* new_time,
std::map<std::string, uint64_t>* stats_map) {
assert(new_time);
assert(stats_map);
if (!new_time || !stats_map) return false;
// lock when search for start_time
{
InstrumentedMutexLock l(&stats_history_mutex_);
auto it = stats_history_.lower_bound(start_time);
if (it != stats_history_.end() && it->first < end_time) {
// make a copy for timestamp and stats_map
*new_time = it->first;
*stats_map = it->second;
return true;
} else {
return false;
}
}
}
Status DBImpl::GetStatsHistory(
uint64_t start_time, uint64_t end_time,
std::unique_ptr<StatsHistoryIterator>* stats_iterator) {
if (!stats_iterator) {
return Status::InvalidArgument("stats_iterator not preallocated.");
}
if (immutable_db_options_.persist_stats_to_disk) {
stats_iterator->reset(
new PersistentStatsHistoryIterator(start_time, end_time, this));
} else {
stats_iterator->reset(
new InMemoryStatsHistoryIterator(start_time, end_time, this));
}
return (*stats_iterator)->status();
}
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
void DBImpl::DumpStats() {
TEST_SYNC_POINT("DBImpl::DumpStats:1");
#ifndef ROCKSDB_LITE
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
const DBPropertyInfo* cf_property_info =
GetPropertyInfo(DB::Properties::kCFStats);
assert(cf_property_info != nullptr);
const DBPropertyInfo* db_property_info =
GetPropertyInfo(DB::Properties::kDBStats);
assert(db_property_info != nullptr);
std::string stats;
if (shutdown_initiated_) {
return;
}
{
InstrumentedMutexLock l(&mutex_);
default_cf_internal_stats_->GetStringProperty(
*db_property_info, DB::Properties::kDBStats, &stats);
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->initialized()) {
cfd->internal_stats()->GetStringProperty(
*cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats);
make internal stats independent of statistics Summary: also make it aware of column family output from db_bench ``` ** Compaction Stats [default] ** Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15 L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06 L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00 L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00 L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00 Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22 Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00 Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown ** DB Stats ** Uptime(secs): 202.1 total, 13.5 interval Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written Test Plan: ran it Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
}
}
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->initialized()) {
cfd->internal_stats()->GetStringProperty(
*cf_property_info, DB::Properties::kCFFileHistogram, &stats);
}
}
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
}
TEST_SYNC_POINT("DBImpl::DumpStats:2");
ROCKS_LOG_INFO(immutable_db_options_.info_log,
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
"------- DUMPING STATS -------");
ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
if (immutable_db_options_.dump_malloc_stats) {
stats.clear();
DumpMallocStats(&stats);
if (!stats.empty()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
"------- Malloc STATS -------");
ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
}
}
#endif // !ROCKSDB_LITE
make internal stats independent of statistics Summary: also make it aware of column family output from db_bench ``` ** Compaction Stats [default] ** Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15 L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06 L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00 L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00 L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00 Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22 Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00 Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown ** DB Stats ** Uptime(secs): 202.1 total, 13.5 interval Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written Test Plan: ran it Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
PrintStatistics();
}
Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
int max_entries_to_print,
std::string* out_str) {
auto* cfh =
static_cast_with_check<ColumnFamilyHandleImpl, ColumnFamilyHandle>(
column_family);
ColumnFamilyData* cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_);
Version* version = super_version->current;
Status s =
version->TablesRangeTombstoneSummary(max_entries_to_print, out_str);
CleanupSuperVersion(super_version);
return s;
}
void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
if (!job_context->logs_to_free.empty()) {
for (auto l : job_context->logs_to_free) {
AddToLogsToFreeQueue(l);
}
job_context->logs_to_free.clear();
SchedulePurge();
}
}
Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
assert(cfd);
Directory* ret_dir = cfd->GetDataDir(path_id);
if (ret_dir == nullptr) {
return directories_.GetDataDir(path_id);
}
return ret_dir;
}
Status DBImpl::SetOptions(
ColumnFamilyHandle* column_family,
const std::unordered_map<std::string, std::string>& options_map) {
#ifdef ROCKSDB_LITE
(void)column_family;
(void)options_map;
return Status::NotSupported("Not supported in ROCKSDB LITE");
#else
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
if (options_map.empty()) {
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"SetOptions() on column family [%s], empty input",
cfd->GetName().c_str());
return Status::InvalidArgument("empty input");
}
MutableCFOptions new_options;
Status s;
Status persist_options_status;
SuperVersionContext sv_context(/* create_superversion */ true);
{
auto db_options = GetDBOptions();
InstrumentedMutexLock l(&mutex_);
s = cfd->SetOptions(db_options, options_map);
if (s.ok()) {
new_options = *cfd->GetLatestMutableCFOptions();
// Append new version to recompute compaction score.
VersionEdit dummy_edit;
versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
directories_.GetDbDir());
// Trigger possible flush/compactions. This has to be before we persist
// options to file, otherwise there will be a deadlock with writer
// thread.
InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
persist_options_status = WriteOptionsFile(
false /*need_mutex_lock*/, true /*need_enter_write_thread*/);
bg_cv_.SignalAll();
}
}
sv_context.Clean();
ROCKS_LOG_INFO(
immutable_db_options_.info_log,
"SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
for (const auto& o : options_map) {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
o.second.c_str());
}
if (s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"[%s] SetOptions() succeeded", cfd->GetName().c_str());
new_options.Dump(immutable_db_options_.info_log.get());
if (!persist_options_status.ok()) {
s = persist_options_status;
}
} else {
ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
cfd->GetName().c_str());
}
LogFlush(immutable_db_options_.info_log);
return s;
#endif // ROCKSDB_LITE
}
Status DBImpl::SetDBOptions(
const std::unordered_map<std::string, std::string>& options_map) {
#ifdef ROCKSDB_LITE
(void)options_map;
return Status::NotSupported("Not supported in ROCKSDB LITE");
#else
if (options_map.empty()) {
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"SetDBOptions(), empty input.");
return Status::InvalidArgument("empty input");
}
MutableDBOptions new_options;
Status s;
Status persist_options_status;
bool wal_changed = false;
WriteContext write_context;
{
InstrumentedMutexLock l(&mutex_);
s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
&new_options);
if (new_options.bytes_per_sync == 0) {
new_options.bytes_per_sync = 1024 * 1024;
}
DBOptions new_db_options =
BuildDBOptions(immutable_db_options_, new_options);
if (s.ok()) {
s = ValidateOptions(new_db_options);
}
if (s.ok()) {
for (auto c : *versions_->GetColumnFamilySet()) {
if (!c->IsDropped()) {
auto cf_options = c->GetLatestCFOptions();
s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options);
if (!s.ok()) {
break;
}
}
}
}
if (s.ok()) {
if (new_options.max_background_compactions >
mutable_db_options_.max_background_compactions) {
env_->IncBackgroundThreadsIfNeeded(
new_options.max_background_compactions, Env::Priority::LOW);
MaybeScheduleFlushOrCompaction();
}
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
if (new_options.stats_dump_period_sec !=
mutable_db_options_.stats_dump_period_sec) {
if (thread_dump_stats_) {
mutex_.Unlock();
thread_dump_stats_->cancel();
mutex_.Lock();
}
if (new_options.stats_dump_period_sec > 0) {
thread_dump_stats_.reset(new rocksdb::RepeatableThread(
[this]() { DBImpl::DumpStats(); }, "dump_st", env_,
static_cast<uint64_t>(new_options.stats_dump_period_sec) *
kMicrosInSecond));
} else {
thread_dump_stats_.reset();
}
move dump stats to a separate thread (#4382) Summary: Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all. We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior. Tested with db_bench using `--stats_dump_period_sec=20` in command line: > LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- LOG content: > 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS ------- 2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606] ** DB Stats ** Uptime(secs): 20.0 total, 20.0 interval Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s Interval stall: 00:00:0.012 H:M:S, 0.1 percent ** Compaction Stats [default] ** Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382 Differential Revision: D9933051 Pulled By: miasantreble fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 07:52:58 +02:00
}
if (new_options.stats_persist_period_sec !=
mutable_db_options_.stats_persist_period_sec) {
if (thread_persist_stats_) {
mutex_.Unlock();
thread_persist_stats_->cancel();
mutex_.Lock();
}
if (new_options.stats_persist_period_sec > 0) {
thread_persist_stats_.reset(new rocksdb::RepeatableThread(
[this]() { DBImpl::PersistStats(); }, "pst_st", env_,
static_cast<uint64_t>(new_options.stats_persist_period_sec) *
kMicrosInSecond));
} else {
thread_persist_stats_.reset();
}
}
write_controller_.set_max_delayed_write_rate(
new_options.delayed_write_rate);
table_cache_.get()->SetCapacity(new_options.max_open_files == -1
? TableCache::kInfiniteCapacity
: new_options.max_open_files - 10);
wal_changed = mutable_db_options_.wal_bytes_per_sync !=
new_options.wal_bytes_per_sync;
mutable_db_options_ = new_options;
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
file_options_for_compaction_ = FileOptions(new_db_options);
file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
file_options_for_compaction_, immutable_db_options_);
versions_->ChangeFileOptions(mutable_db_options_);
//TODO(xiez): clarify why apply optimize for read to write options
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
file_options_for_compaction_, immutable_db_options_);
file_options_for_compaction_.compaction_readahead_size =
mutable_db_options_.compaction_readahead_size;
WriteThread::Writer w;
write_thread_.EnterUnbatched(&w, &mutex_);
if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
Status purge_wal_status = SwitchWAL(&write_context);
if (!purge_wal_status.ok()) {
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"Unable to purge WAL files in SetDBOptions() -- %s",
purge_wal_status.ToString().c_str());
}
}
persist_options_status = WriteOptionsFile(
false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
write_thread_.ExitUnbatched(&w);
}
}
ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
for (const auto& o : options_map) {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
o.second.c_str());
}
if (s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
new_options.Dump(immutable_db_options_.info_log.get());
if (!persist_options_status.ok()) {
if (immutable_db_options_.fail_if_options_file_error) {
s = Status::IOError(
"SetDBOptions() succeeded, but unable to persist options",
persist_options_status.ToString());
}
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"Unable to persist options in SetDBOptions() -- %s",
persist_options_status.ToString().c_str());
}
} else {
ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
}
LogFlush(immutable_db_options_.info_log);
return s;
#endif // ROCKSDB_LITE
}
// return the same level if it cannot be moved
int DBImpl::FindMinimumEmptyLevelFitting(
ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/,
int level) {
mutex_.AssertHeld();
const auto* vstorage = cfd->current()->storage_info();
int minimum_level = level;
for (int i = level - 1; i > 0; --i) {
// stop if level i is not empty
if (vstorage->NumLevelFiles(i) > 0) break;
// stop if level i is too small (cannot fit the level files)
if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
break;
}
minimum_level = i;
}
return minimum_level;
}
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
Status DBImpl::FlushWAL(bool sync) {
if (manual_wal_flush_) {
Status s;
{
// We need to lock log_write_mutex_ since logs_ might change concurrently
InstrumentedMutexLock wl(&log_write_mutex_);
log::Writer* cur_log_writer = logs_.back().writer;
s = cur_log_writer->WriteBuffer();
}
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
if (!s.ok()) {
ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
s.ToString().c_str());
// In case there is a fs error we should set it globally to prevent the
// future writes
WriteStatusCheck(s);
// whether sync or not, we should abort the rest of function upon error
return s;
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
}
if (!sync) {
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false");
return s;
}
}
if (!sync) {
return Status::OK();
}
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
// sync = true
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
return SyncWAL();
}
Status DBImpl::SyncWAL() {
autovector<log::Writer*, 1> logs_to_sync;
bool need_log_dir_sync;
uint64_t current_log_number;
{
InstrumentedMutexLock l(&mutex_);
assert(!logs_.empty());
// This SyncWAL() call only cares about logs up to this number.
current_log_number = logfile_number_;
while (logs_.front().number <= current_log_number &&
logs_.front().getting_synced) {
log_sync_cv_.Wait();
}
// First check that logs are safe to sync in background.
for (auto it = logs_.begin();
it != logs_.end() && it->number <= current_log_number; ++it) {
if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
return Status::NotSupported(
"SyncWAL() is not supported for this implementation of WAL file",
immutable_db_options_.allow_mmap_writes
? "try setting Options::allow_mmap_writes to false"
: Slice());
}
}
for (auto it = logs_.begin();
it != logs_.end() && it->number <= current_log_number; ++it) {
auto& log = *it;
assert(!log.getting_synced);
log.getting_synced = true;
logs_to_sync.push_back(log.writer);
}
Rewritten system for scheduling background work Summary: When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue. The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction. Here are the performance results: Command: ./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333 Before the patch: fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s After the patch: fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got: fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s Test Plan: make check two stress tests: Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 max_background_flushes=0, to verify that this case also works correctly ./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
need_log_dir_sync = !log_dir_synced_;
Rewritten system for scheduling background work Summary: When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue. The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction. Here are the performance results: Command: ./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333 Before the patch: fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s After the patch: fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got: fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s Test Plan: make check two stress tests: Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 max_background_flushes=0, to verify that this case also works correctly ./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
}
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
RecordTick(stats_, WAL_FILE_SYNCED);
Status status;
for (log::Writer* log : logs_to_sync) {
status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
if (!status.ok()) {
break;
}
}
if (status.ok() && need_log_dir_sync) {
status = directories_.GetWalDir()->Fsync();
}
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
{
InstrumentedMutexLock l(&mutex_);
MarkLogsSynced(current_log_number, need_log_dir_sync, status);
}
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
return status;
}
Expose DB methods to lock and unlock the WAL (#5146) Summary: Expose DB methods to lock and unlock the WAL. These methods are intended to use by MyRocks in order to obtain WAL coordinates in consistent way. Usage scenario is following: MySQL has performance_schema.log_status which provides information that enables a backup tool to copy the required log files without locking for the duration of copy. To populate this table MySQL does following: 1. Lock the binary log. Transactions are not allowed to commit now 2. Save the binary log coordinates 3. Walk through the storage engines and lock writes on each engine. For InnoDB, redo log is locked. For MyRocks, WAL should be locked. 4. Ask storage engines for their coordinates. InnoDB reports its current LSN and checkpoint LSN. MyRocks should report active WAL files names and sizes. 5. Release storage engine's locks 6. Unlock binary log Backup tool will then use this information to copy InnoDB, RocksDB and MySQL binary logs up to specified positions to end up with consistent DB state after restore. Currently, RocksDB allows to obtain the list of WAL files. Only missing bit is the method to lock the writes to WAL files. LockWAL method must flush the WAL in order for the reported size to be accurate (GetSortedWALFiles is using file system stat call to return the file size), also, since backup tool is going to copy the WAL, it is better to be flushed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5146 Differential Revision: D14815447 Pulled By: maysamyabandeh fbshipit-source-id: eec9535a6025229ed471119f19fe7b3d8ae888a3
2019-04-06 15:36:42 +02:00
Status DBImpl::LockWAL() {
log_write_mutex_.Lock();
auto cur_log_writer = logs_.back().writer;
auto status = cur_log_writer->WriteBuffer();
if (!status.ok()) {
ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
status.ToString().c_str());
// In case there is a fs error we should set it globally to prevent the
// future writes
WriteStatusCheck(status);
}
return status;
}
Status DBImpl::UnlockWAL() {
log_write_mutex_.Unlock();
return Status::OK();
}
void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
const Status& status) {
mutex_.AssertHeld();
if (synced_dir && logfile_number_ == up_to && status.ok()) {
log_dir_synced_ = true;
}
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
auto& log = *it;
assert(log.getting_synced);
if (status.ok() && logs_.size() > 1) {
logs_to_free_.push_back(log.ReleaseWriter());
// To modify logs_ both mutex_ and log_write_mutex_ must be held
InstrumentedMutexLock l(&log_write_mutex_);
it = logs_.erase(it);
} else {
log.getting_synced = false;
++it;
}
}
assert(!status.ok() || logs_.empty() || logs_[0].number > up_to ||
(logs_.size() == 1 && !logs_[0].getting_synced));
log_sync_cv_.SignalAll();
}
SequenceNumber DBImpl::GetLatestSequenceNumber() const {
return versions_->LastSequence();
}
void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
versions_->SetLastPublishedSequence(seq);
}
Added support for differential snapshots Summary: The motivation for this PR is to add to RocksDB support for differential (incremental) snapshots, as snapshot of the DB changes between two points in time (one can think of it as diff between to sequence numbers, or the diff D which can be thought of as an SST file or just set of KVs that can be applied to sequence number S1 to get the database to the state at sequence number S2). This feature would be useful for various distributed storages layers built on top of RocksDB, as it should help reduce resources (time and network bandwidth) needed to recover and rebuilt DB instances as replicas in the context of distributed storages. From the API standpoint that would like client app requesting iterator between (start seqnum) and current DB state, and reading the "diff". This is a very draft PR for initial review in the discussion on the approach, i'm going to rework some parts and keep updating the PR. For now, what's done here according to initial discussions: Preserving deletes: - We want to be able to optionally preserve recent deletes for some defined period of time, so that if a delete came in recently and might need to be included in the next incremental snapshot it would't get dropped by a compaction. This is done by adding new param to Options (preserve deletes flag) and new variable to DB Impl where we keep track of the sequence number after which we don't want to drop tombstones, even if they are otherwise eligible for deletion. - I also added a new API call for clients to be able to advance this cutoff seqnum after which we drop deletes; i assume it's more flexible to let clients control this, since otherwise we'd need to keep some kind of timestamp < -- > seqnum mapping inside the DB, which sounds messy and painful to support. Clients could make use of it by periodically calling GetLatestSequenceNumber(), noting the timestamp, doing some calculation and figuring out by how much we need to advance the cutoff seqnum. - Compaction codepath in compaction_iterator.cc has been modified to avoid dropping tombstones with seqnum > cutoff seqnum. Iterator changes: - couple params added to ReadOptions, to optionally allow client to request internal keys instead of user keys (so that client can get the latest value of a key, be it delete marker or a put), as well as min timestamp and min seqnum. TableCache changes: - I modified table_cache code to be able to quickly exclude SST files from iterators heep if creation_time on the file is less then iter_start_ts as passed in ReadOptions. That would help a lot in some DB settings (like reading very recent data only or using FIFO compactions), but not so much for universal compaction with more or less long iterator time span. What's left: - Still looking at how to best plug that inside DBIter codepath. So far it seems that FindNextUserKeyInternal only parses values as UserKeys, and iter->key() call generally returns user key. Can we add new API to DBIter as internal_key(), and modify this internal method to optionally set saved_key_ to point to the full internal key? I don't need to store actual seqnum there, but I do need to store type. Closes https://github.com/facebook/rocksdb/pull/2999 Differential Revision: D6175602 Pulled By: mikhail-antonov fbshipit-source-id: c779a6696ee2d574d86c69cec866a3ae095aa900
2017-11-02 02:43:29 +01:00
bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) {
if (seqnum > preserve_deletes_seqnum_.load()) {
preserve_deletes_seqnum_.store(seqnum);
return true;
} else {
return false;
}
}
InternalIterator* DBImpl::NewInternalIterator(
Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
ColumnFamilyHandle* column_family) {
ColumnFamilyData* cfd;
if (column_family == nullptr) {
cfd = default_cf_handle_->cfd();
} else {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
cfd = cfh->cfd();
}
mutex_.Lock();
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
mutex_.Unlock();
ReadOptions roptions;
return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg,
sequence);
}
void DBImpl::SchedulePurge() {
mutex_.AssertHeld();
assert(opened_successfully_);
// Purge operations are put into High priority queue
bg_purge_scheduled_++;
env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
}
void DBImpl::BackgroundCallPurge() {
mutex_.Lock();
while (!logs_to_free_queue_.empty()) {
assert(!logs_to_free_queue_.empty());
log::Writer* log_writer = *(logs_to_free_queue_.begin());
logs_to_free_queue_.pop_front();
mutex_.Unlock();
delete log_writer;
mutex_.Lock();
}
for (const auto& file : purge_files_) {
const PurgeFileInfo& purge_file = file.second;
const std::string& fname = purge_file.fname;
const std::string& dir_to_sync = purge_file.dir_to_sync;
FileType type = purge_file.type;
uint64_t number = purge_file.number;
int job_id = purge_file.job_id;
mutex_.Unlock();
DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
mutex_.Lock();
}
purge_files_.clear();
bg_purge_scheduled_--;
bg_cv_.SignalAll();
// IMPORTANT:there should be no code after calling SignalAll. This call may
// signal the DB destructor that it's OK to proceed with destruction. In
// that case, all DB variables will be dealloacated and referencing them
// will cause trouble.
mutex_.Unlock();
}
namespace {
struct IterState {
IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version,
bool _background_purge)
: db(_db),
mu(_mu),
super_version(_super_version),
background_purge(_background_purge) {}
DBImpl* db;
InstrumentedMutex* mu;
SuperVersion* super_version;
bool background_purge;
};
static void CleanupIteratorState(void* arg1, void* /*arg2*/) {
IterState* state = reinterpret_cast<IterState*>(arg1);
if (state->super_version->Unref()) {
// Job id == 0 means that this is not our background process, but rather
// user thread
JobContext job_context(0);
state->mu->Lock();
state->super_version->Cleanup();
state->db->FindObsoleteFiles(&job_context, false, true);
if (state->background_purge) {
state->db->ScheduleBgLogWriterClose(&job_context);
}
state->mu->Unlock();
delete state->super_version;
if (job_context.HaveSomethingToDelete()) {
if (state->background_purge) {
// PurgeObsoleteFiles here does not delete files. Instead, it adds the
// files to be deleted to a job queue, and deletes it in a separate
// background thread.
state->db->PurgeObsoleteFiles(job_context, true /* schedule only */);
state->mu->Lock();
state->db->SchedulePurge();
state->mu->Unlock();
} else {
state->db->PurgeObsoleteFiles(job_context);
}
}
job_context.Clean();
MemTableListVersion Summary: MemTableListVersion is to MemTableList what Version is to VersionSet. I took almost the same ideas to develop MemTableListVersion. The reason is to have copying std::list done in background, while flushing, rather than in foreground (MultiGet() and NewIterator()) under a mutex! Also, whenever we copied MemTableList, we copied also some MemTableList metadata (flush_requested_, commit_in_progress_, etc.), which was wasteful. This diff avoids std::list copy under a mutex in both MultiGet() and NewIterator(). I created a small database with some number of immutable memtables, and creating 100.000 iterators in a single-thread (!) decreased from {188739, 215703, 198028} to {154352, 164035, 159817}. A lot of the savings come from code under a mutex, so we should see much higher savings with multiple threads. Creating new iterator is very important to LogDevice team. I also think this diff will make SuperVersion obsolete for performance reasons. I will try it in the next diff. SuperVersion gave us huge savings on Get() code path, but I think that most of the savings came from copying MemTableList under a mutex. If we had MemTableListVersion, we would never need to copy the entire object (like we still do in NewIterator() and MultiGet()) Test Plan: `make check` works. I will also do `make valgrind_check` before commit Reviewers: dhruba, haobo, kailiu, sdong, emayanke, tnovak Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15255
2014-01-24 23:52:08 +01:00
}
delete state;
}
} // namespace
InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
ColumnFamilyData* cfd,
SuperVersion* super_version,
Arena* arena,
RangeDelAggregator* range_del_agg,
SequenceNumber sequence) {
InternalIterator* internal_iter;
assert(arena != nullptr);
assert(range_del_agg != nullptr);
// Need to create internal iterator from the arena.
MergeIteratorBuilder merge_iter_builder(
&cfd->internal_comparator(), arena,
!read_options.total_order_seek &&
super_version->mutable_cf_options.prefix_extractor != nullptr);
// Collect iterator for mutable mem
merge_iter_builder.AddIterator(
super_version->mem->NewIterator(read_options, arena));
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter;
Status s;
if (!read_options.ignore_range_deletions) {
range_del_iter.reset(
super_version->mem->NewRangeTombstoneIterator(read_options, sequence));
range_del_agg->AddTombstones(std::move(range_del_iter));
}
// Collect all needed child iterators for immutable memtables
if (s.ok()) {
super_version->imm->AddIterators(read_options, &merge_iter_builder);
if (!read_options.ignore_range_deletions) {
s = super_version->imm->AddRangeTombstoneIterators(read_options, arena,
range_del_agg);
}
}
TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
if (s.ok()) {
// Collect iterators for files in L0 - Ln
if (read_options.read_tier != kMemtableTier) {
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
super_version->current->AddIterators(read_options, file_options_,
&merge_iter_builder, range_del_agg);
}
internal_iter = merge_iter_builder.Finish();
IterState* cleanup =
new IterState(this, &mutex_, super_version,
read_options.background_purge_on_iterator_cleanup ||
immutable_db_options_.avoid_unnecessary_blocking_io);
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
return internal_iter;
} else {
CleanupSuperVersion(super_version);
}
return NewErrorInternalIterator<Slice>(s, arena);
}
ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
return default_cf_handle_;
}
ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
return persist_stats_cf_handle_;
}
Status DBImpl::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) {
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
GetImplOptions get_impl_options;
get_impl_options.column_family = column_family;
get_impl_options.value = value;
return GetImpl(read_options, key, get_impl_options);
}
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
GetImplOptions get_impl_options) {
assert(get_impl_options.value != nullptr ||
get_impl_options.merge_operands != nullptr);
PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
StopWatch sw(env_, stats_, DB_GET);
PERF_TIMER_GUARD(get_snapshot_time);
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
auto cfh =
reinterpret_cast<ColumnFamilyHandleImpl*>(get_impl_options.column_family);
auto cfd = cfh->cfd();
if (tracer_) {
// TODO: This mutex should be removed later, to improve performance when
// tracing is enabled.
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_) {
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
tracer_->Get(get_impl_options.column_family, key);
}
}
// Acquire SuperVersion
SuperVersion* sv = GetAndRefSuperVersion(cfd);
TEST_SYNC_POINT("DBImpl::GetImpl:1");
TEST_SYNC_POINT("DBImpl::GetImpl:2");
SequenceNumber snapshot;
if (read_options.snapshot != nullptr) {
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
if (get_impl_options.callback) {
// Already calculated based on read_options.snapshot
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
snapshot = get_impl_options.callback->max_visible_seq();
} else {
snapshot =
reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
}
} else {
// Note that the snapshot is assigned AFTER referencing the super
// version because otherwise a flush happening in between may compact away
// data for the snapshot, so the reader would see neither data that was be
// visible to the snapshot before compaction nor the newer data inserted
// afterwards.
snapshot = last_seq_same_as_publish_seq_
? versions_->LastSequence()
: versions_->LastPublishedSequence();
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
if (get_impl_options.callback) {
// The unprep_seqs are not published for write unprepared, so it could be
// that max_visible_seq is larger. Seek to the std::max of the two.
// However, we still want our callback to contain the actual snapshot so
// that it can do the correct visibility filtering.
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
get_impl_options.callback->Refresh(snapshot);
// Internally, WriteUnpreparedTxnReadCallback::Refresh would set
// max_visible_seq = max(max_visible_seq, snapshot)
//
// Currently, the commented out assert is broken by
// InvalidSnapshotReadCallback, but if write unprepared recovery followed
// the regular transaction flow, then this special read callback would not
// be needed.
//
// assert(callback->max_visible_seq() >= snapshot);
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
snapshot = get_impl_options.callback->max_visible_seq();
}
}
TEST_SYNC_POINT("DBImpl::GetImpl:3");
TEST_SYNC_POINT("DBImpl::GetImpl:4");
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
// Prepare to store a list of merge operations if merge occurs.
MergeContext merge_context;
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
SequenceNumber max_covering_tombstone_seq = 0;
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
Status s;
// First look in the memtable, then in the immutable memtable (if any).
// s is both in/out. When in, s could either be OK or MergeInProgress.
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
// merge_operands will contain the sequence of merges in the latter case.
Avoid user key copying for Get/Put/Write with user-timestamp (#5502) Summary: In previous https://github.com/facebook/rocksdb/issues/5079, we added user-specified timestamp to `DB::Get()` and `DB::Put()`. Limitation is that these two functions may cause extra memory allocation and key copy. The reason is that `WriteBatch` does not allocate extra memory for timestamps because it is not aware of timestamp size, and we did not provide an API to assign/update timestamp of each key within a `WriteBatch`. We address these issues in this PR by doing the following. 1. Add a `timestamp_size_` to `WriteBatch` so that `WriteBatch` can take timestamps into account when calling `WriteBatch::Put`, `WriteBatch::Delete`, etc. 2. Add APIs `WriteBatch::AssignTimestamp` and `WriteBatch::AssignTimestamps` so that application can assign/update timestamps for each key in a `WriteBatch`. 3. Avoid key copy in `GetImpl` by adding new constructor to `LookupKey`. Test plan (on devserver): ``` $make clean && COMPILE_WITH_ASAN=1 make -j32 all $./db_basic_test --gtest_filter=Timestamp/DBBasicTestWithTimestampWithParam.PutAndGet/* $make check ``` If the API extension looks good, I will add more unit tests. Some simple benchmark using db_bench. ``` $rm -rf /dev/shm/dbbench/* && TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillseq,readrandom -num=1000000 $rm -rf /dev/shm/dbbench/* && TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=1000000 -disable_wal=true ``` Master is at a78503bd6c80a3c4137df1962a972fe406b4d90b. ``` | | readrandom | fillrandom | | master | 15.53 MB/s | 25.97 MB/s | | PR5502 | 16.70 MB/s | 25.80 MB/s | ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5502 Differential Revision: D16340894 Pulled By: riversand963 fbshipit-source-id: 51132cf792be07d1efc3ac33f5768c4ee2608bb8
2019-07-26 00:23:46 +02:00
LookupKey lkey(key, snapshot, read_options.timestamp);
PERF_TIMER_STOP(get_snapshot_time);
bool skip_memtable = (read_options.read_tier == kPersistedTier &&
has_unpersisted_data_.load(std::memory_order_relaxed));
bool done = false;
if (!skip_memtable) {
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
// Get value associated with key
if (get_impl_options.get_value) {
if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s,
&merge_context, &max_covering_tombstone_seq,
read_options, get_impl_options.callback,
get_impl_options.is_blob_index)) {
done = true;
get_impl_options.value->PinSelf();
RecordTick(stats_, MEMTABLE_HIT);
} else if ((s.ok() || s.IsMergeInProgress()) &&
sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s,
&merge_context, &max_covering_tombstone_seq,
read_options, get_impl_options.callback,
get_impl_options.is_blob_index)) {
done = true;
get_impl_options.value->PinSelf();
RecordTick(stats_, MEMTABLE_HIT);
}
} else {
// Get Merge Operands associated with key, Merge Operands should not be
// merged and raw values should be returned to the user.
if (sv->mem->Get(lkey, nullptr, &s, &merge_context,
&max_covering_tombstone_seq, read_options, nullptr,
nullptr, false)) {
done = true;
RecordTick(stats_, MEMTABLE_HIT);
} else if ((s.ok() || s.IsMergeInProgress()) &&
sv->imm->GetMergeOperands(lkey, &s, &merge_context,
&max_covering_tombstone_seq,
read_options)) {
done = true;
RecordTick(stats_, MEMTABLE_HIT);
}
}
if (!done && !s.ok() && !s.IsMergeInProgress()) {
ReturnAndCleanupSuperVersion(cfd, sv);
return s;
}
}
if (!done) {
PERF_TIMER_GUARD(get_from_output_files_time);
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
sv->current->Get(
read_options, lkey, get_impl_options.value, &s, &merge_context,
&max_covering_tombstone_seq,
get_impl_options.get_value ? get_impl_options.value_found : nullptr,
nullptr, nullptr,
get_impl_options.get_value ? get_impl_options.callback : nullptr,
get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr,
get_impl_options.get_value);
RecordTick(stats_, MEMTABLE_MISS);
}
{
PERF_TIMER_GUARD(get_post_process_time);
ReturnAndCleanupSuperVersion(cfd, sv);
RecordTick(stats_, NUMBER_KEYS_READ);
size_t size = 0;
if (s.ok()) {
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
if (get_impl_options.get_value) {
size = get_impl_options.value->size();
} else {
// Return all merge operands for get_impl_options.key
*get_impl_options.number_of_operands =
static_cast<int>(merge_context.GetNumOperands());
if (*get_impl_options.number_of_operands >
get_impl_options.get_merge_operands_options
->expected_max_number_of_operands) {
s = Status::Incomplete(
Status::SubCode::KMergeOperandsInsufficientCapacity);
} else {
for (const Slice& sl : merge_context.GetOperands()) {
size += sl.size();
get_impl_options.merge_operands->PinSelf(sl);
get_impl_options.merge_operands++;
}
}
}
RecordTick(stats_, BYTES_READ, size);
PERF_COUNTER_ADD(get_read_bytes, size);
}
RecordInHistogram(stats_, BYTES_PER_READ, size);
}
return s;
}
std::vector<Status> DBImpl::MultiGet(
const ReadOptions& read_options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values) {
PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
StopWatch sw(env_, stats_, DB_MULTIGET);
PERF_TIMER_GUARD(get_snapshot_time);
2013-11-27 20:47:40 +01:00
SequenceNumber consistent_seqnum;
;
std::unordered_map<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
column_family.size());
for (auto cf : column_family) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cf);
auto cfd = cfh->cfd();
if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
multiget_cf_data.emplace(cfd->GetID(),
MultiGetColumnFamilyData(cfh, nullptr));
}
}
std::function<MultiGetColumnFamilyData*(
std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&)>
iter_deref_lambda =
[](std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&
cf_iter) { return &cf_iter->second; };
bool unref_only =
MultiCFSnapshot<std::unordered_map<uint32_t, MultiGetColumnFamilyData>>(
read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
&consistent_seqnum);
// Contain a list of merge operations if merge occurs.
MergeContext merge_context;
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
// Note: this always resizes the values array
size_t num_keys = keys.size();
std::vector<Status> stat_list(num_keys);
values->resize(num_keys);
// Keep track of bytes that we read for statistics-recording later
uint64_t bytes_read = 0;
PERF_TIMER_STOP(get_snapshot_time);
// For each of the given keys, apply the entire "get" process as follows:
// First look in the memtable, then in the immutable memtable (if any).
// s is both in/out. When in, s could either be OK or MergeInProgress.
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences. Summary: Here are the major changes to the Merge Interface. It has been expanded to handle cases where the MergeOperator is not associative. It does so by stacking up merge operations while scanning through the key history (i.e.: during Get() or Compaction), until a valid Put/Delete/end-of-history is encountered; it then applies all of the merge operations in the correct sequence starting with the base/sentinel value. I have also introduced an "AssociativeMerge" function which allows the user to take advantage of associative merge operations (such as in the case of counters). The implementation will always attempt to merge the operations/operands themselves together when they are encountered, and will resort to the "stacking" method if and only if the "associative-merge" fails. This implementation is conjectured to allow MergeOperator to handle the general case, while still providing the user with the ability to take advantage of certain efficiencies in their own merge-operator / data-structure. NOTE: This is a preliminary diff. This must still go through a lot of review, revision, and testing. Feedback welcome! Test Plan: -This is a preliminary diff. I have only just begun testing/debugging it. -I will be testing this with the existing MergeOperator use-cases and unit-tests (counters, string-append, and redis-lists) -I will be "desk-checking" and walking through the code with the help gdb. -I will find a way of stress-testing the new interface / implementation using db_bench, db_test, merge_test, and/or db_stress. -I will ensure that my tests cover all cases: Get-Memtable, Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0, Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found, end-of-history, end-of-file, etc. -A lot of feedback from the reviewers. Reviewers: haobo, dhruba, zshao, emayanke Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11499
2013-08-06 05:14:32 +02:00
// merge_operands will contain the sequence of merges in the latter case.
size_t num_found = 0;
for (size_t i = 0; i < num_keys; ++i) {
merge_context.Clear();
Status& s = stat_list[i];
std::string* value = &(*values)[i];
LookupKey lkey(keys[i], consistent_seqnum);
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
SequenceNumber max_covering_tombstone_seq = 0;
auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
assert(mgd_iter != multiget_cf_data.end());
auto mgd = mgd_iter->second;
auto super_version = mgd.super_version;
bool skip_memtable =
(read_options.read_tier == kPersistedTier &&
has_unpersisted_data_.load(std::memory_order_relaxed));
bool done = false;
if (!skip_memtable) {
if (super_version->mem->Get(lkey, value, &s, &merge_context,
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
&max_covering_tombstone_seq, read_options)) {
done = true;
RecordTick(stats_, MEMTABLE_HIT);
} else if (super_version->imm->Get(lkey, value, &s, &merge_context,
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
&max_covering_tombstone_seq,
read_options)) {
done = true;
RecordTick(stats_, MEMTABLE_HIT);
}
}
if (!done) {
PinnableSlice pinnable_val;
PERF_TIMER_GUARD(get_from_output_files_time);
super_version->current->Get(read_options, lkey, &pinnable_val, &s,
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
&merge_context, &max_covering_tombstone_seq);
value->assign(pinnable_val.data(), pinnable_val.size());
RecordTick(stats_, MEMTABLE_MISS);
}
if (s.ok()) {
bytes_read += value->size();
num_found++;
}
}
// Post processing (decrement reference counts and record statistics)
PERF_TIMER_GUARD(get_post_process_time);
autovector<SuperVersion*> superversions_to_delete;
for (auto mgd_iter : multiget_cf_data) {
auto mgd = mgd_iter.second;
if (!unref_only) {
ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
} else {
mgd.cfd->GetSuperVersion()->Unref();
}
}
RecordTick(stats_, NUMBER_MULTIGET_CALLS);
RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
PERF_TIMER_STOP(get_post_process_time);
return stat_list;
}
template <class T>
bool DBImpl::MultiCFSnapshot(
const ReadOptions& read_options, ReadCallback* callback,
std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
iter_deref_func,
T* cf_list, SequenceNumber* snapshot) {
PERF_TIMER_GUARD(get_snapshot_time);
bool last_try = false;
if (cf_list->size() == 1) {
// Fast path for a single column family. We can simply get the thread loca
// super version
auto cf_iter = cf_list->begin();
auto node = iter_deref_func(cf_iter);
node->super_version = GetAndRefSuperVersion(node->cfd);
if (read_options.snapshot != nullptr) {
// Note: In WritePrepared txns this is not necessary but not harmful
// either. Because prep_seq > snapshot => commit_seq > snapshot so if
// a snapshot is specified we should be fine with skipping seq numbers
// that are greater than that.
//
// In WriteUnprepared, we cannot set snapshot in the lookup key because we
// may skip uncommitted data that should be visible to the transaction for
// reading own writes.
*snapshot =
static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
if (callback) {
*snapshot = std::max(*snapshot, callback->max_visible_seq());
}
} else {
// Since we get and reference the super version before getting
// the snapshot number, without a mutex protection, it is possible
// that a memtable switch happened in the middle and not all the
// data for this snapshot is available. But it will contain all
// the data available in the super version we have, which is also
// a valid snapshot to read from.
// We shouldn't get snapshot before finding and referencing the super
// version because a flush happening in between may compact away data for
// the snapshot, but the snapshot is earlier than the data overwriting it,
// so users may see wrong results.
*snapshot = last_seq_same_as_publish_seq_
? versions_->LastSequence()
: versions_->LastPublishedSequence();
}
} else {
// If we end up with the same issue of memtable geting sealed during 2
// consecutive retries, it means the write rate is very high. In that case
// its probably ok to take the mutex on the 3rd try so we can succeed for
// sure
static const int num_retries = 3;
for (int i = 0; i < num_retries; ++i) {
last_try = (i == num_retries - 1);
bool retry = false;
if (i > 0) {
for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
++cf_iter) {
auto node = iter_deref_func(cf_iter);
SuperVersion* super_version = node->super_version;
ColumnFamilyData* cfd = node->cfd;
if (super_version != nullptr) {
ReturnAndCleanupSuperVersion(cfd, super_version);
}
node->super_version = nullptr;
}
}
if (read_options.snapshot == nullptr) {
if (last_try) {
TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
// We're close to max number of retries. For the last retry,
// acquire the lock so we're sure to succeed
mutex_.Lock();
}
*snapshot = last_seq_same_as_publish_seq_
? versions_->LastSequence()
: versions_->LastPublishedSequence();
} else {
*snapshot = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
->number_;
}
for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
++cf_iter) {
auto node = iter_deref_func(cf_iter);
if (!last_try) {
node->super_version = GetAndRefSuperVersion(node->cfd);
} else {
node->super_version = node->cfd->GetSuperVersion()->Ref();
}
TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
if (read_options.snapshot != nullptr || last_try) {
// If user passed a snapshot, then we don't care if a memtable is
// sealed or compaction happens because the snapshot would ensure
// that older key versions are kept around. If this is the last
// retry, then we have the lock so nothing bad can happen
continue;
}
// We could get the earliest sequence number for the whole list of
// memtables, which will include immutable memtables as well, but that
// might be tricky to maintain in case we decide, in future, to do
// memtable compaction.
if (!last_try) {
SequenceNumber seq =
node->super_version->mem->GetEarliestSequenceNumber();
if (seq > *snapshot) {
retry = true;
break;
}
}
}
if (!retry) {
if (last_try) {
mutex_.Unlock();
}
break;
}
}
}
// Keep track of bytes that we read for statistics-recording later
PERF_TIMER_STOP(get_snapshot_time);
return last_try;
}
void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
ColumnFamilyHandle** column_families, const Slice* keys,
PinnableSlice* values, Status* statuses,
const bool sorted_input) {
if (num_keys == 0) {
return;
}
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
sorted_keys.resize(num_keys);
for (size_t i = 0; i < num_keys; ++i) {
key_context.emplace_back(column_families[i], keys[i], &values[i],
&statuses[i]);
}
for (size_t i = 0; i < num_keys; ++i) {
sorted_keys[i] = &key_context[i];
}
PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
multiget_cf_data;
size_t cf_start = 0;
ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
for (size_t i = 0; i < num_keys; ++i) {
KeyContext* key_ctx = sorted_keys[i];
if (key_ctx->column_family != cf) {
multiget_cf_data.emplace_back(
MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr));
cf_start = i;
cf = key_ctx->column_family;
}
}
{
// multiget_cf_data.emplace_back(
// MultiGetColumnFamilyData(cf, cf_start, num_keys - cf_start, nullptr));
multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
}
std::function<MultiGetColumnFamilyData*(
autovector<MultiGetColumnFamilyData,
MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
iter_deref_lambda =
[](autovector<MultiGetColumnFamilyData,
MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
return &(*cf_iter);
};
SequenceNumber consistent_seqnum;
bool unref_only = MultiCFSnapshot<
autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
&consistent_seqnum);
for (auto cf_iter = multiget_cf_data.begin();
cf_iter != multiget_cf_data.end(); ++cf_iter) {
MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys,
cf_iter->super_version, consistent_seqnum, nullptr, nullptr);
if (!unref_only) {
ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version);
} else {
cf_iter->cfd->GetSuperVersion()->Unref();
}
}
}
namespace {
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
// Order keys by CF ID, followed by key contents
struct CompareKeyContext {
inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
ColumnFamilyHandleImpl* cfh =
static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
uint32_t cfd_id1 = cfh->cfd()->GetID();
const Comparator* comparator = cfh->cfd()->user_comparator();
cfh = static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
uint32_t cfd_id2 = cfh->cfd()->GetID();
if (cfd_id1 < cfd_id2) {
return true;
} else if (cfd_id1 > cfd_id2) {
return false;
}
// Both keys are from the same column family
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
if (cmp < 0) {
return true;
}
return false;
}
};
} // anonymous namespace
void DBImpl::PrepareMultiGetKeys(
size_t num_keys, bool sorted_input,
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
#ifndef NDEBUG
if (sorted_input) {
for (size_t index = 0; index < sorted_keys->size(); ++index) {
if (index > 0) {
KeyContext* lhs = (*sorted_keys)[index - 1];
KeyContext* rhs = (*sorted_keys)[index];
ColumnFamilyHandleImpl* cfh =
reinterpret_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
uint32_t cfd_id1 = cfh->cfd()->GetID();
const Comparator* comparator = cfh->cfd()->user_comparator();
cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
uint32_t cfd_id2 = cfh->cfd()->GetID();
assert(cfd_id1 <= cfd_id2);
if (cfd_id1 < cfd_id2) {
continue;
}
// Both keys are from the same column family
int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
assert(cmp <= 0);
}
index++;
}
}
#endif
if (!sorted_input) {
CompareKeyContext sort_comparator;
std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
sort_comparator);
}
}
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
void DBImpl::MultiGet(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const size_t num_keys,
const Slice* keys, PinnableSlice* values,
Status* statuses, const bool sorted_input) {
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
sorted_keys.resize(num_keys);
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
for (size_t i = 0; i < num_keys; ++i) {
key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]);
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
}
for (size_t i = 0; i < num_keys; ++i) {
sorted_keys[i] = &key_context[i];
}
PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
}
void DBImpl::MultiGetWithCallback(
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
const ReadOptions& read_options, ColumnFamilyHandle* column_family,
ReadCallback* callback,
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
std::function<MultiGetColumnFamilyData*(
std::array<MultiGetColumnFamilyData, 1>::iterator&)>
iter_deref_lambda =
[](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
return &(*cf_iter);
};
size_t num_keys = sorted_keys->size();
SequenceNumber consistent_seqnum;
bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
read_options, callback, iter_deref_lambda, &multiget_cf_data,
&consistent_seqnum);
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
#ifndef NDEBUG
assert(!unref_only);
#else
// Silence unused variable warning
(void)unref_only;
#endif // NDEBUG
if (callback && read_options.snapshot == nullptr) {
// The unprep_seqs are not published for write unprepared, so it could be
// that max_visible_seq is larger. Seek to the std::max of the two.
// However, we still want our callback to contain the actual snapshot so
// that it can do the correct visibility filtering.
callback->Refresh(consistent_seqnum);
// Internally, WriteUnpreparedTxnReadCallback::Refresh would set
// max_visible_seq = max(max_visible_seq, snapshot)
//
// Currently, the commented out assert is broken by
// InvalidSnapshotReadCallback, but if write unprepared recovery followed
// the regular transaction flow, then this special read callback would not
// be needed.
//
// assert(callback->max_visible_seq() >= snapshot);
consistent_seqnum = callback->max_visible_seq();
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
}
MultiGetImpl(read_options, 0, num_keys, sorted_keys,
multiget_cf_data[0].super_version, consistent_seqnum, nullptr,
nullptr);
ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
multiget_cf_data[0].super_version);
}
void DBImpl::MultiGetImpl(
const ReadOptions& read_options, size_t start_key, size_t num_keys,
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
SuperVersion* super_version, SequenceNumber snapshot,
ReadCallback* callback, bool* is_blob_index) {
PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
StopWatch sw(env_, stats_, DB_MULTIGET);
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
// For each of the given keys, apply the entire "get" process as follows:
// First look in the memtable, then in the immutable memtable (if any).
// s is both in/out. When in, s could either be OK or MergeInProgress.
// merge_operands will contain the sequence of merges in the latter case.
size_t keys_left = num_keys;
while (keys_left) {
size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
? MultiGetContext::MAX_BATCH_SIZE
: keys_left;
MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
batch_size, snapshot);
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
MultiGetRange range = ctx.GetMultiGetRange();
bool lookup_current = false;
keys_left -= batch_size;
for (auto mget_iter = range.begin(); mget_iter != range.end();
++mget_iter) {
MultiGet batching in memtable (#5818) Summary: RocksDB has a MultiGet() API that implements batched key lookup for higher performance (https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L468). Currently, batching is implemented in BlockBasedTableReader::MultiGet() for SST file lookups. One of the ways it improves performance is by pipelining bloom filter lookups (by prefetching required cachelines for all the keys in the batch, and then doing the probe) and thus hiding the cache miss latency. The same concept can be extended to the memtable as well. This PR involves implementing a pipelined bloom filter lookup in DynamicBloom, and implementing MemTable::MultiGet() that can leverage it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5818 Test Plan: Existing tests Performance Test: Ran the below command which fills up the memtable and makes sure there are no flushes and then call multiget. Ran it on master and on the new change and see atleast 1% performance improvement across all the test runs I did. Sometimes the improvement was upto 5%. TEST_TMPDIR=/data/users/$USER/benchmarks/feature/ numactl -C 10 ./db_bench -benchmarks="fillseq,multireadrandom" -num=600000 -compression_type="none" -level_compaction_dynamic_level_bytes -write_buffer_size=200000000 -target_file_size_base=200000000 -max_bytes_for_level_base=16777216 -reads=90000 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 -statistics -memtable_whole_key_filtering=true -memtable_bloom_size_ratio=10 Differential Revision: D17578869 Pulled By: vjnadimpalli fbshipit-source-id: 23dc651d9bf49db11d22375bf435708875a1f192
2019-10-10 18:37:38 +02:00
mget_iter->merge_context.Clear();
*mget_iter->s = Status::OK();
}
bool skip_memtable =
(read_options.read_tier == kPersistedTier &&
has_unpersisted_data_.load(std::memory_order_relaxed));
if (!skip_memtable) {
super_version->mem->MultiGet(read_options, &range, callback,
is_blob_index);
if (!range.empty()) {
super_version->imm->MultiGet(read_options, &range, callback,
is_blob_index);
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
}
MultiGet batching in memtable (#5818) Summary: RocksDB has a MultiGet() API that implements batched key lookup for higher performance (https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L468). Currently, batching is implemented in BlockBasedTableReader::MultiGet() for SST file lookups. One of the ways it improves performance is by pipelining bloom filter lookups (by prefetching required cachelines for all the keys in the batch, and then doing the probe) and thus hiding the cache miss latency. The same concept can be extended to the memtable as well. This PR involves implementing a pipelined bloom filter lookup in DynamicBloom, and implementing MemTable::MultiGet() that can leverage it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5818 Test Plan: Existing tests Performance Test: Ran the below command which fills up the memtable and makes sure there are no flushes and then call multiget. Ran it on master and on the new change and see atleast 1% performance improvement across all the test runs I did. Sometimes the improvement was upto 5%. TEST_TMPDIR=/data/users/$USER/benchmarks/feature/ numactl -C 10 ./db_bench -benchmarks="fillseq,multireadrandom" -num=600000 -compression_type="none" -level_compaction_dynamic_level_bytes -write_buffer_size=200000000 -target_file_size_base=200000000 -max_bytes_for_level_base=16777216 -reads=90000 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 -statistics -memtable_whole_key_filtering=true -memtable_bloom_size_ratio=10 Differential Revision: D17578869 Pulled By: vjnadimpalli fbshipit-source-id: 23dc651d9bf49db11d22375bf435708875a1f192
2019-10-10 18:37:38 +02:00
if (!range.empty()) {
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
lookup_current = true;
MultiGet batching in memtable (#5818) Summary: RocksDB has a MultiGet() API that implements batched key lookup for higher performance (https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L468). Currently, batching is implemented in BlockBasedTableReader::MultiGet() for SST file lookups. One of the ways it improves performance is by pipelining bloom filter lookups (by prefetching required cachelines for all the keys in the batch, and then doing the probe) and thus hiding the cache miss latency. The same concept can be extended to the memtable as well. This PR involves implementing a pipelined bloom filter lookup in DynamicBloom, and implementing MemTable::MultiGet() that can leverage it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5818 Test Plan: Existing tests Performance Test: Ran the below command which fills up the memtable and makes sure there are no flushes and then call multiget. Ran it on master and on the new change and see atleast 1% performance improvement across all the test runs I did. Sometimes the improvement was upto 5%. TEST_TMPDIR=/data/users/$USER/benchmarks/feature/ numactl -C 10 ./db_bench -benchmarks="fillseq,multireadrandom" -num=600000 -compression_type="none" -level_compaction_dynamic_level_bytes -write_buffer_size=200000000 -target_file_size_base=200000000 -max_bytes_for_level_base=16777216 -reads=90000 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 -statistics -memtable_whole_key_filtering=true -memtable_bloom_size_ratio=10 Differential Revision: D17578869 Pulled By: vjnadimpalli fbshipit-source-id: 23dc651d9bf49db11d22375bf435708875a1f192
2019-10-10 18:37:38 +02:00
uint64_t left = range.KeysLeft();
RecordTick(stats_, MEMTABLE_MISS, left);
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
}
}
if (lookup_current) {
PERF_TIMER_GUARD(get_from_output_files_time);
super_version->current->MultiGet(read_options, &range, callback,
is_blob_index);
}
}
// Post processing (decrement reference counts and record statistics)
PERF_TIMER_GUARD(get_post_process_time);
size_t num_found = 0;
uint64_t bytes_read = 0;
for (size_t i = start_key; i < start_key + num_keys; ++i) {
KeyContext* key = (*sorted_keys)[i];
if (key->s->ok()) {
bytes_read += key->value->size();
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
num_found++;
}
}
RecordTick(stats_, NUMBER_MULTIGET_CALLS);
RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
PERF_TIMER_STOP(get_post_process_time);
}
Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
const std::string& column_family,
ColumnFamilyHandle** handle) {
assert(handle != nullptr);
Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
if (s.ok()) {
s = WriteOptionsFile(true /*need_mutex_lock*/,
true /*need_enter_write_thread*/);
}
return s;
}
Status DBImpl::CreateColumnFamilies(
const ColumnFamilyOptions& cf_options,
const std::vector<std::string>& column_family_names,
std::vector<ColumnFamilyHandle*>* handles) {
assert(handles != nullptr);
handles->clear();
size_t num_cf = column_family_names.size();
Status s;
bool success_once = false;
for (size_t i = 0; i < num_cf; i++) {
ColumnFamilyHandle* handle;
s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
if (!s.ok()) {
break;
}
handles->push_back(handle);
success_once = true;
}
if (success_once) {
Status persist_options_status = WriteOptionsFile(
true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
if (s.ok() && !persist_options_status.ok()) {
s = persist_options_status;
}
}
return s;
}
Status DBImpl::CreateColumnFamilies(
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles) {
assert(handles != nullptr);
handles->clear();
size_t num_cf = column_families.size();
Status s;
bool success_once = false;
for (size_t i = 0; i < num_cf; i++) {
ColumnFamilyHandle* handle;
s = CreateColumnFamilyImpl(column_families[i].options,
column_families[i].name, &handle);
if (!s.ok()) {
break;
}
handles->push_back(handle);
success_once = true;
}
if (success_once) {
Status persist_options_status = WriteOptionsFile(
true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
if (s.ok() && !persist_options_status.ok()) {
s = persist_options_status;
}
}
return s;
}
Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
const std::string& column_family_name,
ColumnFamilyHandle** handle) {
Status s;
Status persist_options_status;
*handle = nullptr;
DBOptions db_options =
BuildDBOptions(immutable_db_options_, mutable_db_options_);
s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
if (s.ok()) {
for (auto& cf_path : cf_options.cf_paths) {
s = env_->CreateDirIfMissing(cf_path.path);
if (!s.ok()) {
break;
}
}
}
if (!s.ok()) {
return s;
}
SuperVersionContext sv_context(/* create_superversion */ true);
{
InstrumentedMutexLock l(&mutex_);
if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
nullptr) {
return Status::InvalidArgument("Column family already exists");
}
VersionEdit edit;
edit.AddColumnFamily(column_family_name);
uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
edit.SetColumnFamily(new_id);
edit.SetLogNumber(logfile_number_);
edit.SetComparatorName(cf_options.comparator->Name());
// LogAndApply will both write the creation in MANIFEST and create
// ColumnFamilyData object
2015-01-06 21:44:21 +01:00
{ // write thread
WriteThread::Writer w;
write_thread_.EnterUnbatched(&w, &mutex_);
2015-01-06 21:44:21 +01:00
// LogAndApply will both write the creation in MANIFEST and create
// ColumnFamilyData object
s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
&mutex_, directories_.GetDbDir(), false,
&cf_options);
write_thread_.ExitUnbatched(&w);
2015-01-06 21:44:21 +01:00
}
if (s.ok()) {
auto* cfd =
versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
assert(cfd != nullptr);
s = cfd->AddDirectories();
}
if (s.ok()) {
single_column_family_mode_ = false;
auto* cfd =
versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
assert(cfd != nullptr);
InstallSuperVersionAndScheduleWork(cfd, &sv_context,
*cfd->GetLatestMutableCFOptions());
if (!cfd->mem()->IsSnapshotSupported()) {
is_snapshot_supported_ = false;
}
cfd->set_initialized();
*handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Created column family [%s] (ID %u)",
column_family_name.c_str(), (unsigned)cfd->GetID());
} else {
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"Creating column family [%s] FAILED -- %s",
column_family_name.c_str(), s.ToString().c_str());
}
} // InstrumentedMutexLock l(&mutex_)
sv_context.Clean();
// this is outside the mutex
if (s.ok()) {
NewThreadStatusCfInfo(
reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
}
return s;
}
Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
assert(column_family != nullptr);
Status s = DropColumnFamilyImpl(column_family);
if (s.ok()) {
s = WriteOptionsFile(true /*need_mutex_lock*/,
true /*need_enter_write_thread*/);
}
return s;
}
Status DBImpl::DropColumnFamilies(
const std::vector<ColumnFamilyHandle*>& column_families) {
Status s;
bool success_once = false;
for (auto* handle : column_families) {
s = DropColumnFamilyImpl(handle);
if (!s.ok()) {
break;
}
success_once = true;
}
if (success_once) {
Status persist_options_status = WriteOptionsFile(
true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
if (s.ok() && !persist_options_status.ok()) {
s = persist_options_status;
}
}
return s;
}
Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
if (cfd->GetID() == 0) {
return Status::InvalidArgument("Can't drop default column family");
}
bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
VersionEdit edit;
edit.DropColumnFamily();
edit.SetColumnFamily(cfd->GetID());
Status s;
{
InstrumentedMutexLock l(&mutex_);
if (cfd->IsDropped()) {
s = Status::InvalidArgument("Column family already dropped!\n");
}
if (s.ok()) {
// we drop column family from a single write thread
WriteThread::Writer w;
write_thread_.EnterUnbatched(&w, &mutex_);
s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
&mutex_);
write_thread_.ExitUnbatched(&w);
}
if (s.ok()) {
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
mutable_cf_options->max_write_buffer_number;
}
if (!cf_support_snapshot) {
// Dropped Column Family doesn't support snapshot. Need to recalculate
// is_snapshot_supported_.
bool new_is_snapshot_supported = true;
for (auto c : *versions_->GetColumnFamilySet()) {
if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
new_is_snapshot_supported = false;
break;
}
}
is_snapshot_supported_ = new_is_snapshot_supported;
}
bg_cv_.SignalAll();
}
if (s.ok()) {
// Note that here we erase the associated cf_info of the to-be-dropped
// cfd before its ref-count goes to zero to avoid having to erase cf_info
// later inside db_mutex.
EraseThreadStatusCfInfo(cfd);
assert(cfd->IsDropped());
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Dropped column family with id %u\n", cfd->GetID());
} else {
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"Dropping column family with id %u FAILED -- %s\n",
cfd->GetID(), s.ToString().c_str());
}
return s;
}
bool DBImpl::KeyMayExist(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, bool* value_found) {
assert(value != nullptr);
if (value_found != nullptr) {
// falsify later if key-may-exist but can't fetch value
*value_found = true;
}
ReadOptions roptions = read_options;
roptions.read_tier = kBlockCacheTier; // read from block cache only
PinnableSlice pinnable_val;
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 23:22:34 +02:00
GetImplOptions get_impl_options;
get_impl_options.column_family = column_family;
get_impl_options.value = &pinnable_val;
get_impl_options.value_found = value_found;
auto s = GetImpl(roptions, key, get_impl_options);
value->assign(pinnable_val.data(), pinnable_val.size());
// If block_cache is enabled and the index block of the table didn't
// not present in block_cache, the return value will be Status::Incomplete.
// In this case, key may still exist in the table.
return s.ok() || s.IsIncomplete();
}
Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) {
if (read_options.managed) {
return NewErrorIterator(
Status::NotSupported("Managed iterator is not supported anymore."));
}
Iterator* result = nullptr;
if (read_options.read_tier == kPersistedTier) {
return NewErrorIterator(Status::NotSupported(
"ReadTier::kPersistedData is not yet supported in iterators."));
}
Added support for differential snapshots Summary: The motivation for this PR is to add to RocksDB support for differential (incremental) snapshots, as snapshot of the DB changes between two points in time (one can think of it as diff between to sequence numbers, or the diff D which can be thought of as an SST file or just set of KVs that can be applied to sequence number S1 to get the database to the state at sequence number S2). This feature would be useful for various distributed storages layers built on top of RocksDB, as it should help reduce resources (time and network bandwidth) needed to recover and rebuilt DB instances as replicas in the context of distributed storages. From the API standpoint that would like client app requesting iterator between (start seqnum) and current DB state, and reading the "diff". This is a very draft PR for initial review in the discussion on the approach, i'm going to rework some parts and keep updating the PR. For now, what's done here according to initial discussions: Preserving deletes: - We want to be able to optionally preserve recent deletes for some defined period of time, so that if a delete came in recently and might need to be included in the next incremental snapshot it would't get dropped by a compaction. This is done by adding new param to Options (preserve deletes flag) and new variable to DB Impl where we keep track of the sequence number after which we don't want to drop tombstones, even if they are otherwise eligible for deletion. - I also added a new API call for clients to be able to advance this cutoff seqnum after which we drop deletes; i assume it's more flexible to let clients control this, since otherwise we'd need to keep some kind of timestamp < -- > seqnum mapping inside the DB, which sounds messy and painful to support. Clients could make use of it by periodically calling GetLatestSequenceNumber(), noting the timestamp, doing some calculation and figuring out by how much we need to advance the cutoff seqnum. - Compaction codepath in compaction_iterator.cc has been modified to avoid dropping tombstones with seqnum > cutoff seqnum. Iterator changes: - couple params added to ReadOptions, to optionally allow client to request internal keys instead of user keys (so that client can get the latest value of a key, be it delete marker or a put), as well as min timestamp and min seqnum. TableCache changes: - I modified table_cache code to be able to quickly exclude SST files from iterators heep if creation_time on the file is less then iter_start_ts as passed in ReadOptions. That would help a lot in some DB settings (like reading very recent data only or using FIFO compactions), but not so much for universal compaction with more or less long iterator time span. What's left: - Still looking at how to best plug that inside DBIter codepath. So far it seems that FindNextUserKeyInternal only parses values as UserKeys, and iter->key() call generally returns user key. Can we add new API to DBIter as internal_key(), and modify this internal method to optionally set saved_key_ to point to the full internal key? I don't need to store actual seqnum there, but I do need to store type. Closes https://github.com/facebook/rocksdb/pull/2999 Differential Revision: D6175602 Pulled By: mikhail-antonov fbshipit-source-id: c779a6696ee2d574d86c69cec866a3ae095aa900
2017-11-02 02:43:29 +01:00
// if iterator wants internal keys, we can only proceed if
// we can guarantee the deletes haven't been processed yet
if (immutable_db_options_.preserve_deletes &&
read_options.iter_start_seqnum > 0 &&
read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) {
return NewErrorIterator(Status::InvalidArgument(
"Iterator requested internal keys which are too old and are not"
" guaranteed to be preserved, try larger iter_start_seqnum opt."));
}
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
ReadCallback* read_callback = nullptr; // No read callback provided.
if (read_options.tailing) {
#ifdef ROCKSDB_LITE
// not supported in lite version
result = nullptr;
#else
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
auto iter = new ForwardIterator(this, read_options, cfd, sv);
result = NewDBIterator(
env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
cfd->user_comparator(), iter, kMaxSequenceNumber,
sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback,
this, cfd);
#endif
} else {
// Note: no need to consider the special case of
// last_seq_same_as_publish_seq_==false since NewIterator is overridden in
// WritePreparedTxnDB
auto snapshot = read_options.snapshot != nullptr
? read_options.snapshot->GetSequenceNumber()
: versions_->LastSequence();
result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
}
return result;
}
ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
ColumnFamilyData* cfd,
SequenceNumber snapshot,
ReadCallback* read_callback,
bool allow_blob,
bool allow_refresh) {
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
// Try to generate a DB iterator tree in continuous memory area to be
// cache friendly. Here is an example of result:
// +-------------------------------+
// | |
// | ArenaWrappedDBIter |
// | + |
// | +---> Inner Iterator ------------+
// | | | |
// | | +-- -- -- -- -- -- -- --+ |
// | +--- | Arena | |
// | | | |
// | Allocated Memory: | |
// | | +-------------------+ |
// | | | DBIter | <---+
// | | + |
// | | | +-> iter_ ------------+
// | | | | |
// | | +-------------------+ |
// | | | MergingIterator | <---+
// | | + |
// | | | +->child iter1 ------------+
// | | | | | |
// | | +->child iter2 ----------+ |
// | | | | | | |
// | | | +->child iter3 --------+ | |
// | | | | | |
// | | +-------------------+ | | |
// | | | Iterator1 | <--------+
// | | +-------------------+ | |
// | | | Iterator2 | <------+
// | | +-------------------+ |
// | | | Iterator3 | <----+
// | | +-------------------+
// | | |
// +-------+-----------------------+
//
// ArenaWrappedDBIter inlines an arena area where all the iterators in
// the iterator tree are allocated in the order of being accessed when
// querying.
// Laying out the iterators in the order of being accessed makes it more
// likely that any iterator pointer is close to the iterator it points to so
// that they are likely to be in the same cache line and/or page.
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot,
sv->mutable_cf_options.max_sequential_skip_in_iterations,
sv->version_number, read_callback, this, cfd, allow_blob,
((read_options.snapshot != nullptr) ? false : allow_refresh));
InternalIterator* internal_iter =
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
db_iter->GetRangeDelAggregator(), snapshot);
db_iter->SetIterUnderDBIter(internal_iter);
return db_iter;
}
Status DBImpl::NewIterators(
const ReadOptions& read_options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) {
if (read_options.managed) {
return Status::NotSupported("Managed iterator is not supported anymore.");
}
if (read_options.read_tier == kPersistedTier) {
return Status::NotSupported(
"ReadTier::kPersistedData is not yet supported in iterators.");
}
ReadCallback* read_callback = nullptr; // No read callback provided.
iterators->clear();
iterators->reserve(column_families.size());
if (read_options.tailing) {
#ifdef ROCKSDB_LITE
return Status::InvalidArgument(
"Tailing iterator not supported in RocksDB lite");
#else
for (auto cfh : column_families) {
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
auto iter = new ForwardIterator(this, read_options, cfd, sv);
iterators->push_back(NewDBIterator(
env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
cfd->user_comparator(), iter, kMaxSequenceNumber,
sv->mutable_cf_options.max_sequential_skip_in_iterations,
read_callback, this, cfd));
}
#endif
} else {
// Note: no need to consider the special case of
// last_seq_same_as_publish_seq_==false since NewIterators is overridden in
// WritePreparedTxnDB
auto snapshot = read_options.snapshot != nullptr
? read_options.snapshot->GetSequenceNumber()
: versions_->LastSequence();
for (size_t i = 0; i < column_families.size(); ++i) {
auto* cfd =
reinterpret_cast<ColumnFamilyHandleImpl*>(column_families[i])->cfd();
iterators->push_back(
NewIteratorImpl(read_options, cfd, snapshot, read_callback));
}
}
return Status::OK();
}
const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
#ifndef ROCKSDB_LITE
const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
return GetSnapshotImpl(true);
}
#endif // ROCKSDB_LITE
SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
bool lock) {
int64_t unix_time = 0;
env_->GetCurrentTime(&unix_time); // Ignore error
SnapshotImpl* s = new SnapshotImpl;
if (lock) {
mutex_.Lock();
}
// returns null if the underlying memtable does not support snapshot.
if (!is_snapshot_supported_) {
if (lock) {
mutex_.Unlock();
}
delete s;
return nullptr;
}
auto snapshot_seq = last_seq_same_as_publish_seq_
? versions_->LastSequence()
: versions_->LastPublishedSequence();
SnapshotImpl* snapshot =
snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
if (lock) {
mutex_.Unlock();
}
return snapshot;
}
namespace {
typedef autovector<ColumnFamilyData*, 2> CfdList;
bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
for (const ColumnFamilyData* t : list) {
if (t == cfd) {
return true;
}
}
return false;
}
} // namespace
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
{
InstrumentedMutexLock l(&mutex_);
snapshots_.Delete(casted_s);
single-file bottom-level compaction when snapshot released Summary: When snapshots are held for a long time, files may reach the bottom level containing overwritten/deleted keys. We previously had no mechanism to trigger compaction on such files. This particularly impacted DBs that write to different parts of the keyspace over time, as such files would never be naturally compacted due to second-last level files moving down. This PR introduces a mechanism for bottommost files to be recompacted upon releasing all snapshots that prevent them from dropping their deleted/overwritten keys. - Changed `CompactionPicker` to compact files in `BottommostFilesMarkedForCompaction()`. These are the last choice when picking. Each file will be compacted alone and output to the same level in which it originated. The goal of this type of compaction is to rewrite the data excluding deleted/overwritten keys. - Changed `ReleaseSnapshot()` to recompute the bottom files marked for compaction when the oldest existing snapshot changes, and schedule a compaction if needed. We cache the value that oldest existing snapshot needs to exceed in order for another file to be marked in `bottommost_files_mark_threshold_`, which allows us to avoid recomputing marked files for most snapshot releases. - Changed `VersionStorageInfo` to track the list of bottommost files, which is recomputed every time the version changes by `UpdateBottommostFiles()`. The list of marked bottommost files is first computed in `ComputeBottommostFilesMarkedForCompaction()` when the version changes, but may also be recomputed when `ReleaseSnapshot()` is called. - Extracted core logic of `Compaction::IsBottommostLevel()` into `VersionStorageInfo::RangeMightExistAfterSortedRun()` since logic to check whether a file is bottommost is now necessary outside of compaction. Closes https://github.com/facebook/rocksdb/pull/3009 Differential Revision: D6062044 Pulled By: ajkr fbshipit-source-id: 123d201cf140715a7d5928e8b3cb4f9cd9f7ad21
2017-10-26 01:24:29 +02:00
uint64_t oldest_snapshot;
if (snapshots_.empty()) {
oldest_snapshot = last_seq_same_as_publish_seq_
? versions_->LastSequence()
: versions_->LastPublishedSequence();
single-file bottom-level compaction when snapshot released Summary: When snapshots are held for a long time, files may reach the bottom level containing overwritten/deleted keys. We previously had no mechanism to trigger compaction on such files. This particularly impacted DBs that write to different parts of the keyspace over time, as such files would never be naturally compacted due to second-last level files moving down. This PR introduces a mechanism for bottommost files to be recompacted upon releasing all snapshots that prevent them from dropping their deleted/overwritten keys. - Changed `CompactionPicker` to compact files in `BottommostFilesMarkedForCompaction()`. These are the last choice when picking. Each file will be compacted alone and output to the same level in which it originated. The goal of this type of compaction is to rewrite the data excluding deleted/overwritten keys. - Changed `ReleaseSnapshot()` to recompute the bottom files marked for compaction when the oldest existing snapshot changes, and schedule a compaction if needed. We cache the value that oldest existing snapshot needs to exceed in order for another file to be marked in `bottommost_files_mark_threshold_`, which allows us to avoid recomputing marked files for most snapshot releases. - Changed `VersionStorageInfo` to track the list of bottommost files, which is recomputed every time the version changes by `UpdateBottommostFiles()`. The list of marked bottommost files is first computed in `ComputeBottommostFilesMarkedForCompaction()` when the version changes, but may also be recomputed when `ReleaseSnapshot()` is called. - Extracted core logic of `Compaction::IsBottommostLevel()` into `VersionStorageInfo::RangeMightExistAfterSortedRun()` since logic to check whether a file is bottommost is now necessary outside of compaction. Closes https://github.com/facebook/rocksdb/pull/3009 Differential Revision: D6062044 Pulled By: ajkr fbshipit-source-id: 123d201cf140715a7d5928e8b3cb4f9cd9f7ad21
2017-10-26 01:24:29 +02:00
} else {
oldest_snapshot = snapshots_.oldest()->number_;
}
// Avoid to go through every column family by checking a global threshold
// first.
if (oldest_snapshot > bottommost_files_mark_threshold_) {
CfdList cf_scheduled;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
if (!cfd->current()
->storage_info()
->BottommostFilesMarkedForCompaction()
.empty()) {
SchedulePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction();
cf_scheduled.push_back(cfd);
}
}
// Calculate a new threshold, skipping those CFs where compactions are
// scheduled. We do not do the same pass as the previous loop because
// mutex might be unlocked during the loop, making the result inaccurate.
SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (CfdListContains(cf_scheduled, cfd)) {
continue;
}
new_bottommost_files_mark_threshold = std::min(
new_bottommost_files_mark_threshold,
cfd->current()->storage_info()->bottommost_files_mark_threshold());
single-file bottom-level compaction when snapshot released Summary: When snapshots are held for a long time, files may reach the bottom level containing overwritten/deleted keys. We previously had no mechanism to trigger compaction on such files. This particularly impacted DBs that write to different parts of the keyspace over time, as such files would never be naturally compacted due to second-last level files moving down. This PR introduces a mechanism for bottommost files to be recompacted upon releasing all snapshots that prevent them from dropping their deleted/overwritten keys. - Changed `CompactionPicker` to compact files in `BottommostFilesMarkedForCompaction()`. These are the last choice when picking. Each file will be compacted alone and output to the same level in which it originated. The goal of this type of compaction is to rewrite the data excluding deleted/overwritten keys. - Changed `ReleaseSnapshot()` to recompute the bottom files marked for compaction when the oldest existing snapshot changes, and schedule a compaction if needed. We cache the value that oldest existing snapshot needs to exceed in order for another file to be marked in `bottommost_files_mark_threshold_`, which allows us to avoid recomputing marked files for most snapshot releases. - Changed `VersionStorageInfo` to track the list of bottommost files, which is recomputed every time the version changes by `UpdateBottommostFiles()`. The list of marked bottommost files is first computed in `ComputeBottommostFilesMarkedForCompaction()` when the version changes, but may also be recomputed when `ReleaseSnapshot()` is called. - Extracted core logic of `Compaction::IsBottommostLevel()` into `VersionStorageInfo::RangeMightExistAfterSortedRun()` since logic to check whether a file is bottommost is now necessary outside of compaction. Closes https://github.com/facebook/rocksdb/pull/3009 Differential Revision: D6062044 Pulled By: ajkr fbshipit-source-id: 123d201cf140715a7d5928e8b3cb4f9cd9f7ad21
2017-10-26 01:24:29 +02:00
}
bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
single-file bottom-level compaction when snapshot released Summary: When snapshots are held for a long time, files may reach the bottom level containing overwritten/deleted keys. We previously had no mechanism to trigger compaction on such files. This particularly impacted DBs that write to different parts of the keyspace over time, as such files would never be naturally compacted due to second-last level files moving down. This PR introduces a mechanism for bottommost files to be recompacted upon releasing all snapshots that prevent them from dropping their deleted/overwritten keys. - Changed `CompactionPicker` to compact files in `BottommostFilesMarkedForCompaction()`. These are the last choice when picking. Each file will be compacted alone and output to the same level in which it originated. The goal of this type of compaction is to rewrite the data excluding deleted/overwritten keys. - Changed `ReleaseSnapshot()` to recompute the bottom files marked for compaction when the oldest existing snapshot changes, and schedule a compaction if needed. We cache the value that oldest existing snapshot needs to exceed in order for another file to be marked in `bottommost_files_mark_threshold_`, which allows us to avoid recomputing marked files for most snapshot releases. - Changed `VersionStorageInfo` to track the list of bottommost files, which is recomputed every time the version changes by `UpdateBottommostFiles()`. The list of marked bottommost files is first computed in `ComputeBottommostFilesMarkedForCompaction()` when the version changes, but may also be recomputed when `ReleaseSnapshot()` is called. - Extracted core logic of `Compaction::IsBottommostLevel()` into `VersionStorageInfo::RangeMightExistAfterSortedRun()` since logic to check whether a file is bottommost is now necessary outside of compaction. Closes https://github.com/facebook/rocksdb/pull/3009 Differential Revision: D6062044 Pulled By: ajkr fbshipit-source-id: 123d201cf140715a7d5928e8b3cb4f9cd9f7ad21
2017-10-26 01:24:29 +02:00
}
}
delete casted_s;
}
#ifndef ROCKSDB_LITE
Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
TablePropertiesCollection* props) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
// Increment the ref count
mutex_.Lock();
auto version = cfd->current();
version->Ref();
mutex_.Unlock();
auto s = version->GetPropertiesOfAllTables(props);
// Decrement the ref count
mutex_.Lock();
version->Unref();
mutex_.Unlock();
return s;
}
Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
const Range* range, std::size_t n,
TablePropertiesCollection* props) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
// Increment the ref count
mutex_.Lock();
auto version = cfd->current();
version->Ref();
mutex_.Unlock();
auto s = version->GetPropertiesOfTablesInRange(range, n, props);
// Decrement the ref count
mutex_.Lock();
version->Unref();
mutex_.Unlock();
return s;
}
#endif // ROCKSDB_LITE
const std::string& DBImpl::GetName() const { return dbname_; }
[RocksDB] BackupableDB Summary: In this diff I present you BackupableDB v1. You can easily use it to backup your DB and it will do incremental snapshots for you. Let's first describe how you would use BackupableDB. It's inheriting StackableDB interface so you can easily construct it with your DB object -- it will add a method RollTheSnapshot() to the DB object. When you call RollTheSnapshot(), current snapshot of the DB will be stored in the backup dir. To restore, you can just call RestoreDBFromBackup() on a BackupableDB (which is a static method) and it will restore all files from the backup dir. In the next version, it will even support automatic backuping every X minutes. There are multiple things you can configure: 1. backup_env and db_env can be different, which is awesome because then you can easily backup to HDFS or wherever you feel like. 2. sync - if true, it *guarantees* backup consistency on machine reboot 3. number of snapshots to keep - this will keep last N snapshots around if you want, for some reason, be able to restore from an earlier snapshot. All the backuping is done in incremental fashion - if we already have 00010.sst, we will not copy it again. *IMPORTANT* -- This is based on assumption that 00010.sst never changes - two files named 00010.sst from the same DB will always be exactly the same. Is this true? I always copy manifest, current and log files. 4. You can decide if you want to flush the memtables before you backup, or you're fine with backing up the log files -- either way, you get a complete and consistent view of the database at a time of backup. 5. More things you can find in BackupableDBOptions Here is the directory structure I use: backup_dir/CURRENT_SNAPSHOT - just 4 bytes holding the latest snapshot 0, 1, 2, ... - files containing serialized version of each snapshot - containing a list of files files/*.sst - sst files shared between snapshots - if one snapshot references 00010.sst and another one needs to backup it from the DB, it will just reference the same file files/ 0/, 1/, 2/, ... - snapshot directories containing private snapshot files - current, manifest and log files All the files are ref counted and deleted immediatelly when they get out of scope. Some other stuff in this diff: 1. Added GetEnv() method to the DB. Discussed with @haobo and we agreed that it seems right thing to do. 2. Fixed StackableDB interface. The way it was set up before, I was not able to implement BackupableDB. Test Plan: I have a unittest, but please don't look at this yet. I just hacked it up to help me with debugging. I will write a lot of good tests and update the diff. Also, `make asan_check` Reviewers: dhruba, haobo, emayanke Reviewed By: dhruba CC: leveldb, haobo Differential Revision: https://reviews.facebook.net/D14295
2013-12-09 23:06:52 +01:00
Env* DBImpl::GetEnv() const { return env_; }
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
FileSystem* DB::GetFileSystem() const {
static LegacyFileSystemWrapper fs_wrap(GetEnv());
return &fs_wrap;
}
FileSystem* DBImpl::GetFileSystem() const {
return immutable_db_options_.fs.get();
}
Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
InstrumentedMutexLock l(&mutex_);
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
cfh->cfd()->GetLatestCFOptions());
}
DBOptions DBImpl::GetDBOptions() const {
InstrumentedMutexLock l(&mutex_);
return BuildDBOptions(immutable_db_options_, mutable_db_options_);
}
bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
const Slice& property, std::string* value) {
const DBPropertyInfo* property_info = GetPropertyInfo(property);
value->clear();
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
if (property_info == nullptr) {
return false;
} else if (property_info->handle_int) {
uint64_t int_value;
bool ret_value =
GetIntPropertyInternal(cfd, *property_info, false, &int_value);
if (ret_value) {
*value = ToString(int_value);
}
return ret_value;
} else if (property_info->handle_string) {
InstrumentedMutexLock l(&mutex_);
return cfd->internal_stats()->GetStringProperty(*property_info, property,
value);
} else if (property_info->handle_string_dbimpl) {
std::string tmp_value;
bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value);
if (ret_value) {
*value = tmp_value;
}
return ret_value;
}
// Shouldn't reach here since exactly one of handle_string and handle_int
// should be non-nullptr.
assert(false);
return false;
}
bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family,
const Slice& property,
std::map<std::string, std::string>* value) {
const DBPropertyInfo* property_info = GetPropertyInfo(property);
value->clear();
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
if (property_info == nullptr) {
return false;
} else if (property_info->handle_map) {
InstrumentedMutexLock l(&mutex_);
return cfd->internal_stats()->GetMapProperty(*property_info, property,
value);
}
// If we reach this point it means that handle_map is not provided for the
// requested property
return false;
}
bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
const Slice& property, uint64_t* value) {
const DBPropertyInfo* property_info = GetPropertyInfo(property);
if (property_info == nullptr || property_info->handle_int == nullptr) {
return false;
}
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
return GetIntPropertyInternal(cfd, *property_info, false, value);
}
bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
const DBPropertyInfo& property_info,
bool is_locked, uint64_t* value) {
assert(property_info.handle_int != nullptr);
if (!property_info.need_out_of_mutex) {
if (is_locked) {
mutex_.AssertHeld();
return cfd->internal_stats()->GetIntProperty(property_info, value, this);
} else {
InstrumentedMutexLock l(&mutex_);
return cfd->internal_stats()->GetIntProperty(property_info, value, this);
}
} else {
SuperVersion* sv = nullptr;
if (!is_locked) {
sv = GetAndRefSuperVersion(cfd);
} else {
sv = cfd->GetSuperVersion();
}
bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
property_info, sv->current, value);
if (!is_locked) {
ReturnAndCleanupSuperVersion(cfd, sv);
}
return ret;
}
}
bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) {
assert(value != nullptr);
Statistics* statistics = immutable_db_options_.statistics.get();
if (!statistics) {
return false;
}
*value = statistics->ToString();
return true;
}
#ifndef ROCKSDB_LITE
Status DBImpl::ResetStats() {
InstrumentedMutexLock l(&mutex_);
for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (cfd->initialized()) {
cfd->internal_stats()->Clear();
}
}
return Status::OK();
}
#endif // ROCKSDB_LITE
bool DBImpl::GetAggregatedIntProperty(const Slice& property,
uint64_t* aggregated_value) {
const DBPropertyInfo* property_info = GetPropertyInfo(property);
if (property_info == nullptr || property_info->handle_int == nullptr) {
return false;
}
uint64_t sum = 0;
{
// Needs mutex to protect the list of column families.
InstrumentedMutexLock l(&mutex_);
uint64_t value;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->initialized()) {
continue;
}
if (GetIntPropertyInternal(cfd, *property_info, true, &value)) {
sum += value;
} else {
return false;
}
}
}
*aggregated_value = sum;
return true;
}
SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
// TODO(ljin): consider using GetReferencedSuperVersion() directly
return cfd->GetThreadLocalSuperVersion(&mutex_);
}
// REQUIRED: this function should only be called on the write thread or if the
// mutex is held.
SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
auto column_family_set = versions_->GetColumnFamilySet();
auto cfd = column_family_set->GetColumnFamily(column_family_id);
if (!cfd) {
return nullptr;
}
return GetAndRefSuperVersion(cfd);
}
void DBImpl::CleanupSuperVersion(SuperVersion* sv) {
// Release SuperVersion
if (sv->Unref()) {
{
InstrumentedMutexLock l(&mutex_);
sv->Cleanup();
}
delete sv;
RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
}
RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
}
void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
SuperVersion* sv) {
if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
CleanupSuperVersion(sv);
}
}
// REQUIRED: this function should only be called on the write thread.
void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
SuperVersion* sv) {
auto column_family_set = versions_->GetColumnFamilySet();
auto cfd = column_family_set->GetColumnFamily(column_family_id);
// If SuperVersion is held, and we successfully fetched a cfd using
// GetAndRefSuperVersion(), it must still exist.
assert(cfd != nullptr);
ReturnAndCleanupSuperVersion(cfd, sv);
}
// REQUIRED: this function should only be called on the write thread or if the
// mutex is held.
ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
if (!cf_memtables->Seek(column_family_id)) {
return nullptr;
}
return cf_memtables->GetColumnFamilyHandle();
}
// REQUIRED: mutex is NOT held.
std::unique_ptr<ColumnFamilyHandle> DBImpl::GetColumnFamilyHandleUnlocked(
uint32_t column_family_id) {
InstrumentedMutexLock l(&mutex_);
auto* cfd =
versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
if (cfd == nullptr) {
return nullptr;
}
return std::unique_ptr<ColumnFamilyHandleImpl>(
new ColumnFamilyHandleImpl(cfd, this, &mutex_));
}
void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
const Range& range,
uint64_t* const count,
uint64_t* const size) {
ColumnFamilyHandleImpl* cfh =
reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
ColumnFamilyData* cfd = cfh->cfd();
SuperVersion* sv = GetAndRefSuperVersion(cfd);
// Convert user_key into a corresponding internal key.
InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
MemTable::MemTableStats memStats =
sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
MemTable::MemTableStats immStats =
sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
*count = memStats.count + immStats.count;
*size = memStats.size + immStats.size;
ReturnAndCleanupSuperVersion(cfd, sv);
}
Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes) {
if (!options.include_memtabtles && !options.include_files) {
return Status::InvalidArgument("Invalid options");
}
Version* v;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* sv = GetAndRefSuperVersion(cfd);
v = sv->current;
for (int i = 0; i < n; i++) {
// Convert user_key into a corresponding internal key.
InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
sizes[i] = 0;
if (options.include_files) {
Create a BlockCacheLookupContext to enable fine-grained block cache tracing. (#5421) Summary: BlockCacheLookupContext only contains the caller for now. We will trace block accesses at five places: 1. BlockBasedTable::GetFilter. 2. BlockBasedTable::GetUncompressedDict. 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, and range deletion block.) 4. BlockBasedTable::Get. (To trace the referenced key and whether the referenced key exists in a fetched data block.) 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the referenced key exists in a fetched data block.) We create the context at: 1. BlockBasedTable::Get. (kUserGet) 2. BlockBasedTable::MultiGet. (kUserMGet) 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or external SST ingestion calls this function.) 4. BlockBasedTable::Open. (kPrefetch) 5. Index/Filter::CacheDependencies. (kPrefetch) 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or kUserApproximateSize). I loaded 1 million key-value pairs into the database and ran the readrandom benchmark with a single thread. I gave the block cache 10 GB to make sure all reads hit the block cache after warmup. The throughput is comparable. Throughput of this PR: 231334 ops/s. Throughput of the master branch: 238428 ops/s. Experiment setup: RocksDB: version 6.2 Date: Mon Jun 10 10:42:51 2019 CPU: 24 * Intel Core Processor (Skylake) CPUCache: 16384 KB Keys: 20 bytes each Values: 100 bytes each (100 bytes after compression) Entries: 1000000 Prefix: 20 bytes Keys per prefix: 0 RawSize: 114.4 MB (estimated) FileSize: 114.4 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: NoCompression Compression sampling rate: 0 Memtablerep: skip_list Perf Level: 1 Load command: ./db_bench --benchmarks="fillseq" --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 Run command: ./db_bench --benchmarks="readrandom,stats" --use_existing_db --threads=1 --duration=120 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 --duration=120 TODOs: 1. Create a caller for external SST file ingestion and differentiate the callers for iterator. 2. Integrate tracer to trace block cache accesses. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5421 Differential Revision: D15704258 Pulled By: HaoyuHuang fbshipit-source-id: 4aa8a55f8cb1576ffb367bfa3186a91d8f06d93a
2019-06-11 00:30:05 +02:00
sizes[i] += versions_->ApproximateSize(
options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
/*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
}
if (options.include_memtabtles) {
sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
}
}
ReturnAndCleanupSuperVersion(cfd, sv);
return Status::OK();
}
std::list<uint64_t>::iterator
DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
// We need to remember the iterator of our insert, because after the
// background job is done, we need to remove that element from
// pending_outputs_.
pending_outputs_.push_back(versions_->current_next_file_number());
auto pending_outputs_inserted_elem = pending_outputs_.end();
--pending_outputs_inserted_elem;
return pending_outputs_inserted_elem;
}
void DBImpl::ReleaseFileNumberFromPendingOutputs(
std::unique_ptr<std::list<uint64_t>::iterator>& v) {
if (v.get() != nullptr) {
pending_outputs_.erase(*v.get());
v.reset();
}
}
#ifndef ROCKSDB_LITE
Status DBImpl::GetUpdatesSince(
SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
const TransactionLogIterator::ReadOptions& read_options) {
RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
if (seq > versions_->LastSequence()) {
return Status::NotFound("Requested sequence not yet written in the db");
}
return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
}
Status DBImpl::DeleteFile(std::string name) {
uint64_t number;
FileType type;
WalFileType log_type;
if (!ParseFileName(name, &number, &type, &log_type) ||
(type != kTableFile && type != kLogFile)) {
ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
name.c_str());
return Status::InvalidArgument("Invalid file name");
}
Status status;
if (type == kLogFile) {
// Only allow deleting archived log files
if (log_type != kArchivedLogFile) {
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"DeleteFile %s failed - not archived log.\n",
name.c_str());
return Status::NotSupported("Delete only supported for archived logs");
}
status = wal_manager_.DeleteFile(name, number);
if (!status.ok()) {
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"DeleteFile %s failed -- %s.\n", name.c_str(),
status.ToString().c_str());
}
return status;
}
int level;
2014-04-08 00:38:53 +02:00
FileMetaData* metadata;
ColumnFamilyData* cfd;
VersionEdit edit;
JobContext job_context(next_job_id_.fetch_add(1), true);
{
InstrumentedMutexLock l(&mutex_);
status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
if (!status.ok()) {
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"DeleteFile %s failed. File not found\n", name.c_str());
2014-11-15 01:20:24 +01:00
job_context.Clean();
return Status::InvalidArgument("File not found");
}
assert(level < cfd->NumberLevels());
// If the file is being compacted no need to delete.
if (metadata->being_compacted) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"DeleteFile %s Skipped. File about to be compacted\n",
name.c_str());
2014-11-15 01:20:24 +01:00
job_context.Clean();
return Status::OK();
}
// Only the files in the last level can be deleted externally.
// This is to make sure that any deletion tombstones are not
// lost. Check that the level passed is the last level.
auto* vstoreage = cfd->current()->storage_info();
for (int i = level + 1; i < cfd->NumberLevels(); i++) {
if (vstoreage->NumLevelFiles(i) != 0) {
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"DeleteFile %s FAILED. File not in last level\n",
name.c_str());
2014-11-15 01:20:24 +01:00
job_context.Clean();
return Status::InvalidArgument("File not in last level");
}
}
// if level == 0, it has to be the oldest file
if (level == 0 &&
vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) {
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"DeleteFile %s failed ---"
" target file in level 0 must be the oldest.",
name.c_str());
2014-11-15 01:20:24 +01:00
job_context.Clean();
return Status::InvalidArgument("File in level 0, but not oldest");
}
edit.SetColumnFamily(cfd->GetID());
edit.DeleteFile(level, number);
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
&edit, &mutex_, directories_.GetDbDir());
if (status.ok()) {
InstallSuperVersionAndScheduleWork(cfd,
&job_context.superversion_contexts[0],
*cfd->GetLatestMutableCFOptions());
}
FindObsoleteFiles(&job_context, false);
} // lock released here
LogFlush(immutable_db_options_.info_log);
// remove files outside the db-lock
if (job_context.HaveSomethingToDelete()) {
// Call PurgeObsoleteFiles() without holding mutex.
PurgeObsoleteFiles(job_context);
}
job_context.Clean();
return status;
}
Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
const RangePtr* ranges, size_t n,
bool include_end) {
Status status;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
ColumnFamilyData* cfd = cfh->cfd();
VersionEdit edit;
std::set<FileMetaData*> deleted_files;
JobContext job_context(next_job_id_.fetch_add(1), true);
{
InstrumentedMutexLock l(&mutex_);
Version* input_version = cfd->current();
auto* vstorage = input_version->storage_info();
for (size_t r = 0; r < n; r++) {
auto begin = ranges[r].start, end = ranges[r].limit;
for (int i = 1; i < cfd->NumberLevels(); i++) {
if (vstorage->LevelFiles(i).empty() ||
!vstorage->OverlapInLevel(i, begin, end)) {
continue;
}
std::vector<FileMetaData*> level_files;
InternalKey begin_storage, end_storage, *begin_key, *end_key;
if (begin == nullptr) {
begin_key = nullptr;
} else {
begin_storage.SetMinPossibleForUserKey(*begin);
begin_key = &begin_storage;
}
if (end == nullptr) {
end_key = nullptr;
} else {
end_storage.SetMaxPossibleForUserKey(*end);
end_key = &end_storage;
}
vstorage->GetCleanInputsWithinInterval(
i, begin_key, end_key, &level_files, -1 /* hint_index */,
nullptr /* file_index */);
FileMetaData* level_file;
for (uint32_t j = 0; j < level_files.size(); j++) {
level_file = level_files[j];
if (level_file->being_compacted) {
continue;
}
if (deleted_files.find(level_file) != deleted_files.end()) {
continue;
}
if (!include_end && end != nullptr &&
cfd->user_comparator()->Compare(level_file->largest.user_key(),
*end) == 0) {
continue;
}
edit.SetColumnFamily(cfd->GetID());
edit.DeleteFile(i, level_file->fd.GetNumber());
deleted_files.insert(level_file);
level_file->being_compacted = true;
}
}
}
if (edit.GetDeletedFiles().empty()) {
job_context.Clean();
return Status::OK();
}
input_version->Ref();
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
&edit, &mutex_, directories_.GetDbDir());
if (status.ok()) {
InstallSuperVersionAndScheduleWork(cfd,
&job_context.superversion_contexts[0],
*cfd->GetLatestMutableCFOptions());
}
for (auto* deleted_file : deleted_files) {
deleted_file->being_compacted = false;
}
input_version->Unref();
FindObsoleteFiles(&job_context, false);
} // lock released here
LogFlush(immutable_db_options_.info_log);
// remove files outside the db-lock
if (job_context.HaveSomethingToDelete()) {
// Call PurgeObsoleteFiles() without holding mutex.
PurgeObsoleteFiles(job_context);
}
job_context.Clean();
return status;
}
void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
InstrumentedMutexLock l(&mutex_);
versions_->GetLiveFilesMetaData(metadata);
}
void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
ColumnFamilyMetaData* cf_meta) {
assert(column_family);
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
auto* sv = GetAndRefSuperVersion(cfd);
{
// Without mutex, Version::GetColumnFamilyMetaData will have data race with
// Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but
// this may cause regression. An alternative is to make
// FileMetaData::being_compacted atomic, but it will make FileMetaData
// non-copy-able. Another option is to separate these variables from
// original FileMetaData struct, and this requires re-organization of data
// structures. For now, we take the easy approach. If
// DB::GetColumnFamilyMetaData is not called frequently, the regression
// should not be big. We still need to keep an eye on it.
InstrumentedMutexLock l(&mutex_);
sv->current->GetColumnFamilyMetaData(cf_meta);
}
ReturnAndCleanupSuperVersion(cfd, sv);
}
#endif // ROCKSDB_LITE
Status DBImpl::CheckConsistency() {
mutex_.AssertHeld();
std::vector<LiveFileMetaData> metadata;
versions_->GetLiveFilesMetaData(&metadata);
TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
std::string corruption_messages;
for (const auto& md : metadata) {
// md.name has a leading "/".
std::string file_path = md.db_path + md.name;
uint64_t fsize = 0;
TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
Status s = env_->GetFileSize(file_path, &fsize);
if (!s.ok() &&
env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
s = Status::OK();
}
if (!s.ok()) {
corruption_messages +=
"Can't access " + md.name + ": " + s.ToString() + "\n";
} else if (fsize != md.size) {
corruption_messages += "Sst file size mismatch: " + file_path +
". Size recorded in manifest " +
ToString(md.size) + ", actual size " +
ToString(fsize) + "\n";
}
}
if (corruption_messages.size() == 0) {
return Status::OK();
} else {
return Status::Corruption(corruption_messages);
}
}
Status DBImpl::GetDbIdentity(std::string& identity) const {
identity.assign(db_id_);
return Status::OK();
}
Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
std::string idfilename = IdentityFileName(dbname_);
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
const FileOptions soptions;
std::unique_ptr<SequentialFileReader> id_file_reader;
Status s;
{
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
std::unique_ptr<FSSequentialFile> idfile;
s = fs_->NewSequentialFile(idfilename, soptions, &idfile, nullptr);
if (!s.ok()) {
return s;
}
id_file_reader.reset(
new SequentialFileReader(std::move(idfile), idfilename));
}
uint64_t file_size;
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
s = fs_->GetFileSize(idfilename, IOOptions(), &file_size, nullptr);
if (!s.ok()) {
return s;
}
char* buffer =
reinterpret_cast<char*>(alloca(static_cast<size_t>(file_size)));
Slice id;
s = id_file_reader->Read(static_cast<size_t>(file_size), &id, buffer);
if (!s.ok()) {
return s;
}
identity->assign(id.ToString());
// If last character is '\n' remove it from identity
if (identity->size() > 0 && identity->back() == '\n') {
identity->pop_back();
}
return s;
}
// Default implementation -- returns not supported status
Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
const std::string& /*column_family_name*/,
ColumnFamilyHandle** /*handle*/) {
return Status::NotSupported("");
}
Status DB::CreateColumnFamilies(
const ColumnFamilyOptions& /*cf_options*/,
const std::vector<std::string>& /*column_family_names*/,
std::vector<ColumnFamilyHandle*>* /*handles*/) {
return Status::NotSupported("");
}
Status DB::CreateColumnFamilies(
const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
std::vector<ColumnFamilyHandle*>* /*handles*/) {
return Status::NotSupported("");
}
Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) {
return Status::NotSupported("");
}
Status DB::DropColumnFamilies(
const std::vector<ColumnFamilyHandle*>& /*column_families*/) {
return Status::NotSupported("");
}
Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) {
delete column_family;
return Status::OK();
}
DB::~DB() {}
Status DBImpl::Close() {
if (!closed_) {
{
InstrumentedMutexLock l(&mutex_);
// If there is unreleased snapshot, fail the close call
if (!snapshots_.empty()) {
return Status::Aborted("Cannot close DB with unreleased snapshot.");
}
}
closed_ = true;
return CloseImpl();
}
return Status::OK();
}
Status DB::ListColumnFamilies(const DBOptions& db_options,
const std::string& name,
std::vector<std::string>* column_families) {
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
FileSystem* fs = db_options.file_system.get();
LegacyFileSystemWrapper legacy_fs(db_options.env);
if (!fs) {
fs = &legacy_fs;
}
return VersionSet::ListColumnFamilies(column_families, name, fs);
}
Snapshot::~Snapshot() {}
Status DestroyDB(const std::string& dbname, const Options& options,
const std::vector<ColumnFamilyDescriptor>& column_families) {
ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
Env* env = soptions.env;
std::vector<std::string> filenames;
bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions);
// Reset the logger because it holds a handle to the
// log file and prevents cleanup and directory removal
soptions.info_log.reset();
// Ignore error in case directory does not exist
env->GetChildren(dbname, &filenames);
FileLock* lock;
const std::string lockname = LockFileName(dbname);
Status result = env->LockFile(lockname, &lock);
if (result.ok()) {
uint64_t number;
FileType type;
InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname);
for (const auto& fname : filenames) {
if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) &&
type != kDBLockFile) { // Lock file will be deleted at end
Status del;
std::string path_to_delete = dbname + "/" + fname;
if (type == kMetaDatabase) {
del = DestroyDB(path_to_delete, options);
} else if (type == kTableFile || type == kLogFile) {
del = DeleteDBFile(&soptions, path_to_delete, dbname,
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
} else {
del = env->DeleteFile(path_to_delete);
}
if (result.ok() && !del.ok()) {
result = del;
}
}
}
std::vector<std::string> paths;
for (const auto& path : options.db_paths) {
paths.emplace_back(path.path);
}
for (const auto& cf : column_families) {
for (const auto& path : cf.options.cf_paths) {
paths.emplace_back(path.path);
}
}
// Remove duplicate paths.
// Note that we compare only the actual paths but not path ids.
// This reason is that same path can appear at different path_ids
// for different column families.
std::sort(paths.begin(), paths.end());
paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
for (const auto& path : paths) {
if (env->GetChildren(path, &filenames).ok()) {
for (const auto& fname : filenames) {
if (ParseFileName(fname, &number, &type) &&
type == kTableFile) { // Lock file will be deleted at end
std::string table_path = path + "/" + fname;
Status del = DeleteDBFile(&soptions, table_path, dbname,
/*force_bg=*/false, /*force_fg=*/false);
if (result.ok() && !del.ok()) {
result = del;
}
}
}
env->DeleteDir(path);
}
}
std::vector<std::string> walDirFiles;
std::string archivedir = ArchivalDirectory(dbname);
bool wal_dir_exists = false;
if (dbname != soptions.wal_dir) {
wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok();
archivedir = ArchivalDirectory(soptions.wal_dir);
}
// Archive dir may be inside wal dir or dbname and should be
// processed and removed before those otherwise we have issues
// removing them
std::vector<std::string> archiveFiles;
if (env->GetChildren(archivedir, &archiveFiles).ok()) {
// Delete archival files.
for (const auto& file : archiveFiles) {
if (ParseFileName(file, &number, &type) && type == kLogFile) {
Status del =
DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
if (result.ok() && !del.ok()) {
result = del;
}
}
}
env->DeleteDir(archivedir);
}
// Delete log files in the WAL dir
if (wal_dir_exists) {
for (const auto& file : walDirFiles) {
if (ParseFileName(file, &number, &type) && type == kLogFile) {
Status del =
DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
soptions.wal_dir, /*force_bg=*/false,
/*force_fg=*/!wal_in_db_path);
if (result.ok() && !del.ok()) {
result = del;
}
}
}
env->DeleteDir(soptions.wal_dir);
}
env->UnlockFile(lock); // Ignore error since state is already gone
env->DeleteFile(lockname);
env->DeleteDir(dbname); // Ignore error in case dir contains other files
}
return result;
}
Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
bool need_enter_write_thread) {
#ifndef ROCKSDB_LITE
WriteThread::Writer w;
if (need_mutex_lock) {
mutex_.Lock();
} else {
mutex_.AssertHeld();
}
if (need_enter_write_thread) {
write_thread_.EnterUnbatched(&w, &mutex_);
}
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// This part requires mutex to protect the column family options
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
cf_names.push_back(cfd->GetName());
cf_opts.push_back(cfd->GetLatestCFOptions());
}
// Unlock during expensive operations. New writes cannot get here
// because the single write thread ensures all new writes get queued.
DBOptions db_options =
BuildDBOptions(immutable_db_options_, mutable_db_options_);
mutex_.Unlock();
TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
std::string file_name =
TempOptionsFileName(GetName(), versions_->NewFileNumber());
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
GetFileSystem());
if (s.ok()) {
s = RenameTempFileToOptionsFile(file_name);
}
// restore lock
if (!need_mutex_lock) {
mutex_.Lock();
}
if (need_enter_write_thread) {
write_thread_.ExitUnbatched(&w);
}
if (!s.ok()) {
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"Unnable to persist options -- %s", s.ToString().c_str());
if (immutable_db_options_.fail_if_options_file_error) {
return Status::IOError("Unable to persist options.",
s.ToString().c_str());
}
}
#else
(void)need_mutex_lock;
(void)need_enter_write_thread;
#endif // !ROCKSDB_LITE
return Status::OK();
}
#ifndef ROCKSDB_LITE
namespace {
void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
const size_t num_files_to_keep,
const std::shared_ptr<Logger>& info_log,
Env* env) {
if (filenames.size() <= num_files_to_keep) {
return;
}
for (auto iter = std::next(filenames.begin(), num_files_to_keep);
iter != filenames.end(); ++iter) {
if (!env->DeleteFile(iter->second).ok()) {
ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
iter->second.c_str());
}
}
}
} // namespace
#endif // !ROCKSDB_LITE
Status DBImpl::DeleteObsoleteOptionsFiles() {
#ifndef ROCKSDB_LITE
std::vector<std::string> filenames;
// use ordered map to store keep the filenames sorted from the newest
// to the oldest.
std::map<uint64_t, std::string> options_filenames;
Status s;
s = GetEnv()->GetChildren(GetName(), &filenames);
if (!s.ok()) {
return s;
}
for (auto& filename : filenames) {
uint64_t file_number;
FileType type;
if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
options_filenames.insert(
{std::numeric_limits<uint64_t>::max() - file_number,
GetName() + "/" + filename});
}
}
// Keeps the latest 2 Options file
const size_t kNumOptionsFilesKept = 2;
DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
immutable_db_options_.info_log, GetEnv());
return Status::OK();
#else
return Status::OK();
#endif // !ROCKSDB_LITE
}
Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
#ifndef ROCKSDB_LITE
Status s;
uint64_t options_file_number = versions_->NewFileNumber();
std::string options_file_name =
OptionsFileName(GetName(), options_file_number);
// Retry if the file name happen to conflict with an existing one.
s = GetEnv()->RenameFile(file_name, options_file_name);
if (s.ok()) {
InstrumentedMutexLock l(&mutex_);
versions_->options_file_number_ = options_file_number;
}
if (0 == disable_delete_obsolete_files_) {
DeleteObsoleteOptionsFiles();
}
return s;
#else
(void)file_name;
return Status::OK();
#endif // !ROCKSDB_LITE
}
#ifdef ROCKSDB_USING_THREAD_STATUS
void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
if (immutable_db_options_.enable_thread_tracking) {
ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
cfd->ioptions()->env);
}
}
void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const {
if (immutable_db_options_.enable_thread_tracking) {
ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
}
}
void DBImpl::EraseThreadStatusDbInfo() const {
if (immutable_db_options_.enable_thread_tracking) {
ThreadStatusUtil::EraseDatabaseInfo(this);
}
}
#else
void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
void DBImpl::EraseThreadStatusDbInfo() const {}
#endif // ROCKSDB_USING_THREAD_STATUS
//
// A global method that can dump out the build version
void DumpRocksDBBuildVersion(Logger* log) {
2014-04-11 19:19:58 +02:00
#if !defined(IOS_CROSS_COMPILE)
// if we compile with Xcode, we don't run build_detect_version, so we don't
// generate util/build_version.cc
ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR,
ROCKSDB_MINOR, ROCKSDB_PATCH);
ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha);
ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date);
#else
(void)log; // ignore "-Wunused-parameter"
2014-04-11 19:19:58 +02:00
#endif
}
#ifndef ROCKSDB_LITE
SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
bool include_history) {
// Find the earliest sequence number that we know we can rely on reading
// from the memtable without needing to check sst files.
SequenceNumber earliest_seq =
sv->imm->GetEarliestSequenceNumber(include_history);
if (earliest_seq == kMaxSequenceNumber) {
earliest_seq = sv->mem->GetEarliestSequenceNumber();
}
assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
return earliest_seq;
}
#endif // ROCKSDB_LITE
#ifndef ROCKSDB_LITE
Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
bool cache_only,
SequenceNumber lower_bound_seq,
SequenceNumber* seq,
bool* found_record_for_key,
bool* is_blob_index) {
Status s;
MergeContext merge_context;
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
SequenceNumber max_covering_tombstone_seq = 0;
ReadOptions read_options;
SequenceNumber current_seq = versions_->LastSequence();
LookupKey lkey(key, current_seq);
*seq = kMaxSequenceNumber;
*found_record_for_key = false;
// Check if there is a record for this key in the latest memtable
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
seq, read_options, nullptr /*read_callback*/, is_blob_index);
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
// unexpected error reading memtable.
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"Unexpected status returned from MemTable::Get: %s\n",
s.ToString().c_str());
return s;
}
if (*seq != kMaxSequenceNumber) {
// Found a sequence number, no need to check immutable memtables
*found_record_for_key = true;
return Status::OK();
}
SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
if (lower_bound_in_mem != kMaxSequenceNumber &&
lower_bound_in_mem < lower_bound_seq) {
*found_record_for_key = false;
return Status::OK();
}
// Check if there is a record for this key in the immutable memtables
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
seq, read_options, nullptr /*read_callback*/, is_blob_index);
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
// unexpected error reading memtable.
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"Unexpected status returned from MemTableList::Get: %s\n",
s.ToString().c_str());
return s;
}
if (*seq != kMaxSequenceNumber) {
// Found a sequence number, no need to check memtable history
*found_record_for_key = true;
return Status::OK();
}
SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
if (lower_bound_in_imm != kMaxSequenceNumber &&
lower_bound_in_imm < lower_bound_seq) {
*found_record_for_key = false;
return Status::OK();
}
// Check if there is a record for this key in the immutable memtables
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context,
&max_covering_tombstone_seq, seq, read_options,
is_blob_index);
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
// unexpected error reading memtable.
ROCKS_LOG_ERROR(
immutable_db_options_.info_log,
"Unexpected status returned from MemTableList::GetFromHistory: %s\n",
s.ToString().c_str());
return s;
}
if (*seq != kMaxSequenceNumber) {
// Found a sequence number, no need to check SST files
*found_record_for_key = true;
return Status::OK();
}
// We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
// check here to skip the history if possible. But currently the caller
// already does that. Maybe we should move the logic here later.
// TODO(agiardullo): possible optimization: consider checking cached
// SST files if cache_only=true?
if (!cache_only) {
// Check tables
sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
Use only "local" range tombstones during Get (#4449) Summary: Previously, range tombstones were accumulated from every level, which was necessary if a range tombstone in a higher level covered a key in a lower level. However, RangeDelAggregator::AddTombstones's complexity is based on the number of tombstones that are currently stored in it, which is wasteful in the Get case, where we only need to know the highest sequence number of range tombstones that cover the key from higher levels, and compute the highest covering sequence number at the current level. This change introduces this optimization, and removes the use of RangeDelAggregator from the Get path. In the benchmark results, the following command was used to initialize the database: ``` ./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8 ``` ...and the following command was used to measure read throughput: ``` ./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32 ``` The filluniquerandom command was only run once, and the resulting database was used to measure read performance before and after the PR. Both binaries were compiled with `DEBUG_LEVEL=0`. Readrandom results before PR: ``` readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found) ``` Readrandom results after PR: ``` readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found) ``` So it's actually slower right now, but this PR paves the way for future optimizations (see #4493). ---- Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449 Differential Revision: D10370575 Pulled By: abhimadan fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 21:29:29 +02:00
&max_covering_tombstone_seq, nullptr /* value_found */,
found_record_for_key, seq, nullptr /*read_callback*/,
is_blob_index);
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
// unexpected error reading SST files
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"Unexpected status returned from Version::Get: %s\n",
s.ToString().c_str());
}
}
return s;
}
Status DBImpl::IngestExternalFile(
ColumnFamilyHandle* column_family,
const std::vector<std::string>& external_files,
const IngestExternalFileOptions& ingestion_options) {
IngestExternalFileArg arg;
arg.column_family = column_family;
arg.external_files = external_files;
arg.options = ingestion_options;
return IngestExternalFiles({arg});
}
Status DBImpl::IngestExternalFiles(
const std::vector<IngestExternalFileArg>& args) {
if (args.empty()) {
return Status::InvalidArgument("ingestion arg list is empty");
}
{
std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
for (const auto& arg : args) {
if (arg.column_family == nullptr) {
return Status::InvalidArgument("column family handle is null");
} else if (unique_cfhs.count(arg.column_family) > 0) {
return Status::InvalidArgument(
"ingestion args have duplicate column families");
}
unique_cfhs.insert(arg.column_family);
}
}
// Ingest multiple external SST files atomically.
size_t num_cfs = args.size();
for (size_t i = 0; i != num_cfs; ++i) {
if (args[i].external_files.empty()) {
char err_msg[128] = {0};
snprintf(err_msg, 128, "external_files[%zu] is empty", i);
return Status::InvalidArgument(err_msg);
}
}
for (const auto& arg : args) {
const IngestExternalFileOptions& ingest_opts = arg.options;
if (ingest_opts.ingest_behind &&
!immutable_db_options_.allow_ingest_behind) {
return Status::InvalidArgument(
"can't ingest_behind file in DB with allow_ingest_behind=false");
}
}
// TODO (yanqin) maybe handle the case in which column_families have
// duplicates
std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
size_t total = 0;
for (const auto& arg : args) {
total += arg.external_files.size();
}
uint64_t next_file_number = 0;
Status status = ReserveFileNumbersBeforeIngestion(
static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(), total,
pending_output_elem, &next_file_number);
if (!status.ok()) {
InstrumentedMutexLock l(&mutex_);
ReleaseFileNumberFromPendingOutputs(pending_output_elem);
return status;
}
std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
for (const auto& arg : args) {
auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
ingestion_jobs.emplace_back(
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
env_, versions_.get(), cfd, immutable_db_options_, file_options_,
&snapshots_, arg.options, &directories_, &event_logger_);
}
std::vector<std::pair<bool, Status>> exec_results;
for (size_t i = 0; i != num_cfs; ++i) {
exec_results.emplace_back(false, Status::OK());
}
// TODO(yanqin) maybe make jobs run in parallel
uint64_t start_file_number = next_file_number;
for (size_t i = 1; i != num_cfs; ++i) {
start_file_number += args[i - 1].external_files.size();
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_);
exec_results[i].second = ingestion_jobs[i].Prepare(
args[i].external_files, start_file_number, super_version);
exec_results[i].first = true;
CleanupSuperVersion(super_version);
}
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
{
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_);
exec_results[0].second = ingestion_jobs[0].Prepare(
args[0].external_files, next_file_number, super_version);
exec_results[0].first = true;
CleanupSuperVersion(super_version);
}
for (const auto& exec_result : exec_results) {
if (!exec_result.second.ok()) {
status = exec_result.second;
break;
}
}
if (!status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
if (exec_results[i].first) {
ingestion_jobs[i].Cleanup(status);
}
}
InstrumentedMutexLock l(&mutex_);
ReleaseFileNumberFromPendingOutputs(pending_output_elem);
return status;
}
std::vector<SuperVersionContext> sv_ctxs;
for (size_t i = 0; i != num_cfs; ++i) {
sv_ctxs.emplace_back(true /* create_superversion */);
}
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
TEST_SYNC_POINT("DBImpl::AddFile:Start");
{
InstrumentedMutexLock l(&mutex_);
TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
// Stop writes to the DB by entering both write threads
WriteThread::Writer w;
write_thread_.EnterUnbatched(&w, &mutex_);
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
WriteThread::Writer nonmem_w;
if (two_write_queues_) {
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
}
// When unordered_write is enabled, the keys are writing to memtable in an
// unordered way. If the ingestion job checks memtable key range before the
// key landing in memtable, the ingestion job may skip the necessary
// memtable flush.
// So wait here to ensure there is no pending write to memtable.
WaitForPendingWrites();
num_running_ingest_file_ += static_cast<int>(num_cfs);
TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
bool at_least_one_cf_need_flush = false;
std::vector<bool> need_flush(num_cfs, false);
for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
if (cfd->IsDropped()) {
// TODO (yanqin) investigate whether we should abort ingestion or
// proceed with other non-dropped column families.
status = Status::InvalidArgument(
"cannot ingest an external file into a dropped CF");
break;
}
bool tmp = false;
status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
need_flush[i] = tmp;
at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
if (!status.ok()) {
break;
}
}
TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
&at_least_one_cf_need_flush);
if (status.ok() && at_least_one_cf_need_flush) {
FlushOptions flush_opts;
flush_opts.allow_write_stall = true;
if (immutable_db_options_.atomic_flush) {
autovector<ColumnFamilyData*> cfds_to_flush;
SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
mutex_.Unlock();
status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
FlushReason::kExternalFileIngestion,
true /* writes_stopped */);
mutex_.Lock();
} else {
for (size_t i = 0; i != num_cfs; ++i) {
if (need_flush[i]) {
mutex_.Unlock();
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
->cfd();
status = FlushMemTable(cfd, flush_opts,
FlushReason::kExternalFileIngestion,
true /* writes_stopped */);
mutex_.Lock();
if (!status.ok()) {
break;
}
}
}
}
}
// Run ingestion jobs.
if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
status = ingestion_jobs[i].Run();
if (!status.ok()) {
break;
}
}
}
if (status.ok()) {
int consumed_seqno_count =
ingestion_jobs[0].ConsumedSequenceNumbersCount();
#ifndef NDEBUG
for (size_t i = 1; i != num_cfs; ++i) {
assert(!!consumed_seqno_count ==
!!ingestion_jobs[i].ConsumedSequenceNumbersCount());
consumed_seqno_count +=
ingestion_jobs[i].ConsumedSequenceNumbersCount();
}
#endif
if (consumed_seqno_count > 0) {
const SequenceNumber last_seqno = versions_->LastSequence();
versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
versions_->SetLastSequence(last_seqno + consumed_seqno_count);
}
autovector<ColumnFamilyData*> cfds_to_commit;
autovector<const MutableCFOptions*> mutable_cf_options_list;
autovector<autovector<VersionEdit*>> edit_lists;
uint32_t num_entries = 0;
for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
if (cfd->IsDropped()) {
continue;
}
cfds_to_commit.push_back(cfd);
mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
autovector<VersionEdit*> edit_list;
edit_list.push_back(ingestion_jobs[i].edit());
edit_lists.push_back(edit_list);
++num_entries;
}
// Mark the version edits as an atomic group if the number of version
// edits exceeds 1.
if (cfds_to_commit.size() > 1) {
for (auto& edits : edit_lists) {
assert(edits.size() == 1);
edits[0]->MarkAtomicGroup(--num_entries);
}
assert(0 == num_entries);
}
status =
versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
edit_lists, &mutex_, directories_.GetDbDir());
}
if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
if (!cfd->IsDropped()) {
InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
*cfd->GetLatestMutableCFOptions());
#ifndef NDEBUG
if (0 == i && num_cfs > 1) {
TEST_SYNC_POINT(
"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
TEST_SYNC_POINT(
"DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
}
#endif // !NDEBUG
}
}
}
// Resume writes to the DB
if (two_write_queues_) {
Optimize for serial commits in 2PC Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
2017-06-24 23:06:43 +02:00
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
}
write_thread_.ExitUnbatched(&w);
if (status.ok()) {
for (auto& job : ingestion_jobs) {
job.UpdateStats();
}
}
ReleaseFileNumberFromPendingOutputs(pending_output_elem);
num_running_ingest_file_ -= static_cast<int>(num_cfs);
if (0 == num_running_ingest_file_) {
bg_cv_.SignalAll();
}
TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
}
// mutex_ is unlocked here
// Cleanup
for (size_t i = 0; i != num_cfs; ++i) {
sv_ctxs[i].Clean();
// This may rollback jobs that have completed successfully. This is
// intended for atomicity.
ingestion_jobs[i].Cleanup(status);
}
if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
if (!cfd->IsDropped()) {
NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
}
}
}
return status;
}
Export Import sst files (#5495) Summary: Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135 This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469 "Add support for taking snapshot of a column family and creating column family from a given CF snapshot" We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality. (1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below. // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. // - SST files will be created as hard links when the directory specified // is in the same partition as the db directory, copied otherwise. // - export_dir should not already exist and will be created by this API. // - Always triggers a flush. virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, const std::string& export_dir, ExportImportFilesMetaData** metadata); Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by returning the list of file metadata. (2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below. // CreateColumnFamilyWithImport() will create a new column family with // column_family_name and import external SST files specified in metadata into // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in // an existing DB. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a // move, the call ensures that the specified files at external_file_path are // deleted on successful return and files are not modified on any error // return. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle); Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported. If incoming sequence number is higher than current local sequence number, local sequence number is updated to reflect this. Note, as the sst files is are being moved across Column Families, Column Family name in sst file will no longer match the actual column family on destination DB. The API does not modify Column Family name or id in the sst files being imported. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495 Differential Revision: D16018881 fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 21:22:21 +02:00
Status DBImpl::CreateColumnFamilyWithImport(
const ColumnFamilyOptions& options, const std::string& column_family_name,
const ImportColumnFamilyOptions& import_options,
const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
Export Import sst files (#5495) Summary: Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135 This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469 "Add support for taking snapshot of a column family and creating column family from a given CF snapshot" We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality. (1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below. // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. // - SST files will be created as hard links when the directory specified // is in the same partition as the db directory, copied otherwise. // - export_dir should not already exist and will be created by this API. // - Always triggers a flush. virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, const std::string& export_dir, ExportImportFilesMetaData** metadata); Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by returning the list of file metadata. (2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below. // CreateColumnFamilyWithImport() will create a new column family with // column_family_name and import external SST files specified in metadata into // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in // an existing DB. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a // move, the call ensures that the specified files at external_file_path are // deleted on successful return and files are not modified on any error // return. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle); Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported. If incoming sequence number is higher than current local sequence number, local sequence number is updated to reflect this. Note, as the sst files is are being moved across Column Families, Column Family name in sst file will no longer match the actual column family on destination DB. The API does not modify Column Family name or id in the sst files being imported. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495 Differential Revision: D16018881 fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 21:22:21 +02:00
assert(handle != nullptr);
assert(*handle == nullptr);
std::string cf_comparator_name = options.comparator->Name();
if (cf_comparator_name != metadata.db_comparator_name) {
return Status::InvalidArgument("Comparator name mismatch");
}
// Create column family.
auto status = CreateColumnFamily(options, column_family_name, handle);
if (!status.ok()) {
return status;
}
// Import sst files from metadata.
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(*handle);
auto cfd = cfh->cfd();
ImportColumnFamilyJob import_job(env_, versions_.get(), cfd,
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
immutable_db_options_, file_options_,
Export Import sst files (#5495) Summary: Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135 This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469 "Add support for taking snapshot of a column family and creating column family from a given CF snapshot" We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality. (1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below. // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. // - SST files will be created as hard links when the directory specified // is in the same partition as the db directory, copied otherwise. // - export_dir should not already exist and will be created by this API. // - Always triggers a flush. virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, const std::string& export_dir, ExportImportFilesMetaData** metadata); Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by returning the list of file metadata. (2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below. // CreateColumnFamilyWithImport() will create a new column family with // column_family_name and import external SST files specified in metadata into // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in // an existing DB. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a // move, the call ensures that the specified files at external_file_path are // deleted on successful return and files are not modified on any error // return. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle); Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported. If incoming sequence number is higher than current local sequence number, local sequence number is updated to reflect this. Note, as the sst files is are being moved across Column Families, Column Family name in sst file will no longer match the actual column family on destination DB. The API does not modify Column Family name or id in the sst files being imported. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495 Differential Revision: D16018881 fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 21:22:21 +02:00
import_options, metadata.files);
SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
VersionEdit dummy_edit;
uint64_t next_file_number = 0;
std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
Export Import sst files (#5495) Summary: Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135 This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469 "Add support for taking snapshot of a column family and creating column family from a given CF snapshot" We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality. (1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below. // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. // - SST files will be created as hard links when the directory specified // is in the same partition as the db directory, copied otherwise. // - export_dir should not already exist and will be created by this API. // - Always triggers a flush. virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, const std::string& export_dir, ExportImportFilesMetaData** metadata); Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by returning the list of file metadata. (2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below. // CreateColumnFamilyWithImport() will create a new column family with // column_family_name and import external SST files specified in metadata into // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in // an existing DB. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a // move, the call ensures that the specified files at external_file_path are // deleted on successful return and files are not modified on any error // return. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle); Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported. If incoming sequence number is higher than current local sequence number, local sequence number is updated to reflect this. Note, as the sst files is are being moved across Column Families, Column Family name in sst file will no longer match the actual column family on destination DB. The API does not modify Column Family name or id in the sst files being imported. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495 Differential Revision: D16018881 fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 21:22:21 +02:00
{
// Lock db mutex
InstrumentedMutexLock l(&mutex_);
if (error_handler_.IsDBStopped()) {
// Don't import files when there is a bg_error
status = error_handler_.GetBGError();
}
// Make sure that bg cleanup wont delete the files that we are importing
pending_output_elem.reset(new std::list<uint64_t>::iterator(
CaptureCurrentFileNumberInPendingOutputs()));
Export Import sst files (#5495) Summary: Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135 This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469 "Add support for taking snapshot of a column family and creating column family from a given CF snapshot" We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality. (1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below. // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. // - SST files will be created as hard links when the directory specified // is in the same partition as the db directory, copied otherwise. // - export_dir should not already exist and will be created by this API. // - Always triggers a flush. virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, const std::string& export_dir, ExportImportFilesMetaData** metadata); Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by returning the list of file metadata. (2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below. // CreateColumnFamilyWithImport() will create a new column family with // column_family_name and import external SST files specified in metadata into // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in // an existing DB. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a // move, the call ensures that the specified files at external_file_path are // deleted on successful return and files are not modified on any error // return. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle); Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported. If incoming sequence number is higher than current local sequence number, local sequence number is updated to reflect this. Note, as the sst files is are being moved across Column Families, Column Family name in sst file will no longer match the actual column family on destination DB. The API does not modify Column Family name or id in the sst files being imported. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495 Differential Revision: D16018881 fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 21:22:21 +02:00
if (status.ok()) {
// If crash happen after a hard link established, Recover function may
// reuse the file number that has already assigned to the internal file,
// and this will overwrite the external file. To protect the external
// file, we have to make sure the file number will never being reused.
next_file_number = versions_->FetchAddFileNumber(metadata.files.size());
Export Import sst files (#5495) Summary: Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135 This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469 "Add support for taking snapshot of a column family and creating column family from a given CF snapshot" We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality. (1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below. // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. // - SST files will be created as hard links when the directory specified // is in the same partition as the db directory, copied otherwise. // - export_dir should not already exist and will be created by this API. // - Always triggers a flush. virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, const std::string& export_dir, ExportImportFilesMetaData** metadata); Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by returning the list of file metadata. (2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below. // CreateColumnFamilyWithImport() will create a new column family with // column_family_name and import external SST files specified in metadata into // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in // an existing DB. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a // move, the call ensures that the specified files at external_file_path are // deleted on successful return and files are not modified on any error // return. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle); Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported. If incoming sequence number is higher than current local sequence number, local sequence number is updated to reflect this. Note, as the sst files is are being moved across Column Families, Column Family name in sst file will no longer match the actual column family on destination DB. The API does not modify Column Family name or id in the sst files being imported. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495 Differential Revision: D16018881 fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 21:22:21 +02:00
auto cf_options = cfd->GetLatestMutableCFOptions();
status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
directories_.GetDbDir());
if (status.ok()) {
InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
}
}
}
dummy_sv_ctx.Clean();
if (status.ok()) {
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
status = import_job.Prepare(next_file_number, sv);
CleanupSuperVersion(sv);
}
if (status.ok()) {
SuperVersionContext sv_context(true /*create_superversion*/);
{
// Lock db mutex
InstrumentedMutexLock l(&mutex_);
// Stop writes to the DB by entering both write threads
WriteThread::Writer w;
write_thread_.EnterUnbatched(&w, &mutex_);
WriteThread::Writer nonmem_w;
if (two_write_queues_) {
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
}
num_running_ingest_file_++;
assert(!cfd->IsDropped());
status = import_job.Run();
// Install job edit [Mutex will be unlocked here]
if (status.ok()) {
auto cf_options = cfd->GetLatestMutableCFOptions();
status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
&mutex_, directories_.GetDbDir());
if (status.ok()) {
InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
}
}
// Resume writes to the DB
if (two_write_queues_) {
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
}
write_thread_.ExitUnbatched(&w);
num_running_ingest_file_--;
if (num_running_ingest_file_ == 0) {
bg_cv_.SignalAll();
}
}
// mutex_ is unlocked here
sv_context.Clean();
}
{
InstrumentedMutexLock l(&mutex_);
ReleaseFileNumberFromPendingOutputs(pending_output_elem);
}
import_job.Cleanup(status);
if (!status.ok()) {
DropColumnFamily(*handle);
DestroyColumnFamilyHandle(*handle);
*handle = nullptr;
}
return status;
}
Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
Status s;
std::vector<ColumnFamilyData*> cfd_list;
{
InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->IsDropped() && cfd->initialized()) {
cfd->Ref();
cfd_list.push_back(cfd);
}
}
}
std::vector<SuperVersion*> sv_list;
for (auto cfd : cfd_list) {
sv_list.push_back(cfd->GetReferencedSuperVersion(&mutex_));
}
for (auto& sv : sv_list) {
VersionStorageInfo* vstorage = sv->current->storage_info();
ColumnFamilyData* cfd = sv->current->cfd();
Options opts;
{
InstrumentedMutexLock l(&mutex_);
opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
cfd->GetLatestCFOptions());
}
for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
j++) {
const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd;
std::string fname = TableFileName(cfd->ioptions()->cf_paths,
fd.GetNumber(), fd.GetPathId());
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
s = rocksdb::VerifySstFileChecksum(opts, file_options_, read_options,
fname);
}
}
if (!s.ok()) {
break;
}
}
{
InstrumentedMutexLock l(&mutex_);
for (auto sv : sv_list) {
if (sv && sv->Unref()) {
sv->Cleanup();
delete sv;
}
}
for (auto cfd : cfd_list) {
cfd->UnrefAndTryDelete();
}
}
return s;
}
void DBImpl::NotifyOnExternalFileIngested(
ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
if (immutable_db_options_.listeners.empty()) {
return;
}
for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
ExternalFileIngestionInfo info;
info.cf_name = cfd->GetName();
info.external_file_path = f.external_file_path;
info.internal_file_path = f.internal_file_path;
info.global_seqno = f.assigned_seqno;
info.table_properties = f.table_properties;
for (auto listener : immutable_db_options_.listeners) {
listener->OnExternalFileIngested(this, info);
}
}
}
void DBImpl::WaitForIngestFile() {
mutex_.AssertHeld();
while (num_running_ingest_file_ > 0) {
bg_cv_.Wait();
}
}
Status DBImpl::StartTrace(const TraceOptions& trace_options,
std::unique_ptr<TraceWriter>&& trace_writer) {
InstrumentedMutexLock lock(&trace_mutex_);
tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer)));
return Status::OK();
}
Status DBImpl::EndTrace() {
InstrumentedMutexLock lock(&trace_mutex_);
Status s;
if (tracer_ != nullptr) {
s = tracer_->Close();
tracer_.reset();
} else {
return Status::IOError("No trace file to close");
}
return s;
}
Status DBImpl::StartBlockCacheTrace(
const TraceOptions& trace_options,
std::unique_ptr<TraceWriter>&& trace_writer) {
return block_cache_tracer_.StartTrace(env_, trace_options,
std::move(trace_writer));
}
Status DBImpl::EndBlockCacheTrace() {
block_cache_tracer_.EndTrace();
return Status::OK();
}
Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) {
Status s;
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_) {
s = tracer_->IteratorSeek(cf_id, key);
}
}
return s;
}
Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id,
const Slice& key) {
Status s;
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_) {
s = tracer_->IteratorSeekForPrev(cf_id, key);
}
}
return s;
}
Status DBImpl::ReserveFileNumbersBeforeIngestion(
ColumnFamilyData* cfd, uint64_t num,
std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
uint64_t* next_file_number) {
Status s;
SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
assert(nullptr != next_file_number);
InstrumentedMutexLock l(&mutex_);
if (error_handler_.IsDBStopped()) {
// Do not ingest files when there is a bg_error
return error_handler_.GetBGError();
}
pending_output_elem.reset(new std::list<uint64_t>::iterator(
CaptureCurrentFileNumberInPendingOutputs()));
*next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
auto cf_options = cfd->GetLatestMutableCFOptions();
VersionEdit dummy_edit;
// If crash happen after a hard link established, Recover function may
// reuse the file number that has already assigned to the internal file,
// and this will overwrite the external file. To protect the external
// file, we have to make sure the file number will never being reused.
s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
directories_.GetDbDir());
if (s.ok()) {
InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
}
dummy_sv_ctx.Clean();
return s;
}
Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
if (mutable_db_options_.max_open_files == -1) {
uint64_t oldest_time = port::kMaxUint64;
for (auto cfd : *versions_->GetColumnFamilySet()) {
uint64_t ctime;
cfd->current()->GetCreationTimeOfOldestFile(&ctime);
if (ctime < oldest_time) {
oldest_time = ctime;
}
if (oldest_time == 0) {
break;
}
}
*creation_time = oldest_time;
return Status::OK();
} else {
return Status::NotSupported("This API only works if max_open_files = -1");
}
}
#endif // ROCKSDB_LITE
} // namespace rocksdb