2016-02-09 15:12:00 -08:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 16:03:42 -07:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 14:59:46 -07:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/memtable.h"
|
2013-07-23 14:42:27 -07:00
|
|
|
|
2014-03-12 16:40:14 -07:00
|
|
|
#include <algorithm>
|
2019-10-10 09:37:38 -07:00
|
|
|
#include <array>
|
2017-10-23 15:22:05 -07:00
|
|
|
#include <limits>
|
|
|
|
#include <memory>
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "db/dbformat.h"
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
#include "db/kv_checksum.h"
|
2013-12-02 18:34:05 -08:00
|
|
|
#include "db/merge_context.h"
|
2016-06-13 16:17:26 -07:00
|
|
|
#include "db/merge_helper.h"
|
2016-04-26 12:41:07 -07:00
|
|
|
#include "db/pinned_iterators_manager.h"
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
#include "db/range_tombstone_fragmenter.h"
|
2017-09-11 08:58:52 -07:00
|
|
|
#include "db/read_callback.h"
|
2021-09-29 04:01:57 -07:00
|
|
|
#include "logging/logging.h"
|
2019-05-30 17:39:43 -07:00
|
|
|
#include "memory/arena.h"
|
|
|
|
#include "memory/memory_usage.h"
|
2017-04-05 19:02:00 -07:00
|
|
|
#include "monitoring/perf_context_imp.h"
|
|
|
|
#include "monitoring/statistics.h"
|
2020-04-20 13:21:34 -07:00
|
|
|
#include "port/lang.h"
|
2016-11-15 20:05:36 -08:00
|
|
|
#include "port/port.h"
|
2013-08-23 08:38:13 -07:00
|
|
|
#include "rocksdb/comparator.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/iterator.h"
|
|
|
|
#include "rocksdb/merge_operator.h"
|
2014-01-30 17:18:17 -08:00
|
|
|
#include "rocksdb/slice_transform.h"
|
2022-01-27 14:53:39 -08:00
|
|
|
#include "rocksdb/types.h"
|
2016-06-20 18:01:03 -07:00
|
|
|
#include "rocksdb/write_buffer_manager.h"
|
2015-10-12 15:06:38 -07:00
|
|
|
#include "table/internal_iterator.h"
|
2016-11-03 18:40:23 -07:00
|
|
|
#include "table/iterator_wrapper.h"
|
2017-02-02 16:38:40 -08:00
|
|
|
#include "table/merging_iterator.h"
|
2016-11-15 20:05:36 -08:00
|
|
|
#include "util/autovector.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "util/coding.h"
|
2013-11-27 11:47:40 -08:00
|
|
|
#include "util/mutexlock.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2017-11-02 22:16:23 -07:00
|
|
|
ImmutableMemTableOptions::ImmutableMemTableOptions(
|
2021-05-05 13:59:21 -07:00
|
|
|
const ImmutableOptions& ioptions,
|
2017-11-02 22:16:23 -07:00
|
|
|
const MutableCFOptions& mutable_cf_options)
|
|
|
|
: arena_block_size(mutable_cf_options.arena_block_size),
|
2016-06-03 17:02:10 -07:00
|
|
|
memtable_prefix_bloom_bits(
|
|
|
|
static_cast<uint32_t>(
|
|
|
|
static_cast<double>(mutable_cf_options.write_buffer_size) *
|
|
|
|
mutable_cf_options.memtable_prefix_bloom_size_ratio) *
|
|
|
|
8u),
|
2016-07-26 18:05:30 -07:00
|
|
|
memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
|
2019-02-19 12:12:25 -08:00
|
|
|
memtable_whole_key_filtering(
|
|
|
|
mutable_cf_options.memtable_whole_key_filtering),
|
2016-06-03 17:02:10 -07:00
|
|
|
inplace_update_support(ioptions.inplace_update_support),
|
|
|
|
inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
|
|
|
|
inplace_callback(ioptions.inplace_callback),
|
|
|
|
max_successive_merges(mutable_cf_options.max_successive_merges),
|
2021-04-26 12:43:02 -07:00
|
|
|
statistics(ioptions.stats),
|
2021-04-22 20:42:50 -07:00
|
|
|
merge_operator(ioptions.merge_operator.get()),
|
2021-04-26 12:43:02 -07:00
|
|
|
info_log(ioptions.logger),
|
2020-09-29 23:16:12 -07:00
|
|
|
allow_data_in_errors(ioptions.allow_data_in_errors) {}
|
2014-09-08 18:46:52 -07:00
|
|
|
|
|
|
|
MemTable::MemTable(const InternalKeyComparator& cmp,
|
2021-05-05 13:59:21 -07:00
|
|
|
const ImmutableOptions& ioptions,
|
2014-12-02 12:09:20 -08:00
|
|
|
const MutableCFOptions& mutable_cf_options,
|
2016-06-20 18:01:03 -07:00
|
|
|
WriteBufferManager* write_buffer_manager,
|
2021-07-22 18:26:47 -07:00
|
|
|
SequenceNumber latest_seq, uint32_t column_family_id)
|
2011-03-18 22:37:00 +00:00
|
|
|
: comparator_(cmp),
|
2014-10-27 12:10:13 -07:00
|
|
|
moptions_(ioptions, mutable_cf_options),
|
2011-05-21 02:17:43 +00:00
|
|
|
refs_(0),
|
2014-10-27 12:10:13 -07:00
|
|
|
kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
|
2017-06-02 14:13:59 -07:00
|
|
|
mem_tracker_(write_buffer_manager),
|
2018-11-18 16:51:15 -08:00
|
|
|
arena_(moptions_.arena_block_size,
|
|
|
|
(write_buffer_manager != nullptr &&
|
|
|
|
(write_buffer_manager->enabled() ||
|
|
|
|
write_buffer_manager->cost_to_cache()))
|
|
|
|
? &mem_tracker_
|
|
|
|
: nullptr,
|
|
|
|
mutable_cf_options.memtable_huge_page_size),
|
2014-09-08 18:46:52 -07:00
|
|
|
table_(ioptions.memtable_factory->CreateMemTableRep(
|
2018-05-21 14:33:55 -07:00
|
|
|
comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
|
2021-04-26 12:43:02 -07:00
|
|
|
ioptions.logger, column_family_id)),
|
2016-10-27 10:07:28 -07:00
|
|
|
range_del_table_(SkipListFactory().CreateMemTableRep(
|
2021-04-26 12:43:02 -07:00
|
|
|
comparator_, &arena_, nullptr /* transform */, ioptions.logger,
|
2017-06-02 12:08:01 -07:00
|
|
|
column_family_id)),
|
2016-11-21 12:07:09 -08:00
|
|
|
is_range_del_table_empty_(true),
|
2015-06-12 18:04:30 -07:00
|
|
|
data_size_(0),
|
2014-04-22 17:17:33 -07:00
|
|
|
num_entries_(0),
|
2015-03-18 16:11:02 -07:00
|
|
|
num_deletes_(0),
|
2017-11-02 22:16:23 -07:00
|
|
|
write_buffer_size_(mutable_cf_options.write_buffer_size),
|
2012-10-19 14:00:53 -07:00
|
|
|
flush_in_progress_(false),
|
|
|
|
flush_completed_(false),
|
2012-11-28 16:42:36 -08:00
|
|
|
file_number_(0),
|
2013-06-11 14:23:58 -07:00
|
|
|
first_seqno_(0),
|
2017-03-21 10:59:57 -07:00
|
|
|
earliest_seqno_(latest_seq),
|
|
|
|
creation_seq_(latest_seq),
|
2013-07-16 11:56:46 -07:00
|
|
|
mem_next_logfile_number_(0),
|
2016-04-18 11:11:51 -07:00
|
|
|
min_prep_log_referenced_(0),
|
2015-03-18 16:11:02 -07:00
|
|
|
locks_(moptions_.inplace_update_support
|
|
|
|
? moptions_.inplace_update_num_locks
|
|
|
|
: 0),
|
2018-05-21 14:33:55 -07:00
|
|
|
prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
flush_state_(FLUSH_NOT_REQUESTED),
|
2021-03-15 04:32:24 -07:00
|
|
|
clock_(ioptions.clock),
|
2016-11-13 18:58:17 -08:00
|
|
|
insert_with_hint_prefix_extractor_(
|
2021-04-22 20:42:50 -07:00
|
|
|
ioptions.memtable_insert_with_hint_prefix_extractor.get()),
|
2018-10-15 19:59:20 -07:00
|
|
|
oldest_key_time_(std::numeric_limits<uint64_t>::max()),
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 13:54:09 -07:00
|
|
|
atomic_flush_seqno_(kMaxSequenceNumber),
|
|
|
|
approximate_memory_usage_(0) {
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
UpdateFlushState();
|
|
|
|
// something went wrong if we need to flush before inserting anything
|
|
|
|
assert(!ShouldScheduleFlush());
|
|
|
|
|
2019-02-19 12:12:25 -08:00
|
|
|
// use bloom_filter_ for both whole key and prefix bloom filter
|
|
|
|
if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
|
|
|
|
moptions_.memtable_prefix_bloom_bits > 0) {
|
|
|
|
bloom_filter_.reset(
|
2019-01-24 10:11:19 -08:00
|
|
|
new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
|
Faster new DynamicBloom implementation (for memtable) (#5762)
Summary:
Since DynamicBloom is now only used in-memory, we're free to
change it without schema compatibility issues. The new implementation
is drawn from (with manifest permission)
https://github.com/pdillinger/wormhashing/blob/303542a767437f56d8b66cea6ebecaac0e6a61e9/bloom_simulation_tests/foo.cc#L613
This has several speed advantages over the prior implementation:
* Uses fastrange instead of %
* Minimum logic to determine first (and all) probed memory addresses
* (Major) Two probes per 64-bit memory fetch/write.
* Very fast and effective (murmur-like) hash expansion/re-mixing. (At
least on recent CPUs, integer multiplication is very cheap.)
While a Bloom filter with 512-bit cache locality has about a 1.15x FP
rate penalty (e.g. 0.84% to 0.97%), further restricting to two probes
per 64 bits incurs an additional 1.12x FP rate penalty (e.g. 0.97% to
1.09%). Nevertheless, the unit tests show no "mediocre" FP rate samples,
unlike the old implementation with more erratic FP rates.
Especially for the memtable, we expect speed to outweigh somewhat higher
FP rates. For example, a negative table query would have to be 1000x
slower than a BF query to justify doubling BF query time to shave 10% off
FP rate (working assumption around 1% FP rate). While that seems likely
for SSTs, my data suggests a speed factor of roughly 50x for the memtable
(vs. BF; ~1.5% lower write throughput when enabling memtable Bloom
filter, after this change). Thus, it's probably not worth even 5% more
time in the Bloom filter to shave off 1/10th of the Bloom FP rate, or 0.1%
in absolute terms, and it's probably at least 20% slower to recoup that
much FP rate from this new implementation. Because of this, we do not see
a need for a 'locality' option that affects the MemTable Bloom filter
and have decoupled the MemTable Bloom filter from Options::bloom_locality.
Note that just 3% more memory to the Bloom filter (10.3 bits per key vs.
just 10) is able to make up for the ~12% FP rate drop in the new
implementation:
[] # Nearly "ideal" FP-wise but reasonably fast cache-local implementation
[~/wormhashing/bloom_simulation_tests] ./foo_gcc_IMPL_CACHE_WORM64_FROM32_any.out 10000000 6 10 $RANDOM 100000000
./foo_gcc_IMPL_CACHE_WORM64_FROM32_any.out time: 3.29372 sampled_fp_rate: 0.00985956 ...
[] # Close match to this new implementation
[~/wormhashing/bloom_simulation_tests] ./foo_gcc_IMPL_CACHE_MUL64_BLOCK_FROM32_any.out 10000000 6 10.3 $RANDOM 100000000
./foo_gcc_IMPL_CACHE_MUL64_BLOCK_FROM32_any.out time: 2.10072 sampled_fp_rate: 0.00985655 ...
[] # Old locality=1 implementation
[~/wormhashing/bloom_simulation_tests] ./foo_gcc_IMPL_CACHE_ROCKSDB_DYNAMIC_any.out 10000000 6 10 $RANDOM 100000000
./foo_gcc_IMPL_CACHE_ROCKSDB_DYNAMIC_any.out time: 3.95472 sampled_fp_rate: 0.00988943 ...
Also note the dramatic speed improvement vs. alternatives.
--
Performance unit test: DynamicBloomTest.concurrent_with_perf is updated
to report more precise timing data. (Measure running time of each
thread, not just longest running thread, etc.) Results averaged over
various sizes enabled with --enable_perf and 20 runs each; old dynamic
bloom refers to locality=1, the faster of the old:
old dynamic bloom, avg add latency = 65.6468
new dynamic bloom, avg add latency = 44.3809
old dynamic bloom, avg query latency = 50.6485
new dynamic bloom, avg query latency = 43.2186
old avg parallel add latency = 41.678
new avg parallel add latency = 24.5238
old avg parallel hit latency = 14.6322
new avg parallel hit latency = 12.3939
old avg parallel miss latency = 16.7289
new avg parallel miss latency = 12.2134
Tested on a dedicated 64-bit production machine at Facebook. Significant
improvement all around.
Despite now using std::atomic<uint64_t>, quick before-and-after test on
a 32-bit machine (Intel Atom N270, released 2008) shows no regression in
performance, in some cases modest improvement.
--
Performance integration test (synthetic): with DEBUG_LEVEL=0, used
TEST_TMPDIR=/dev/shm ./db_bench --benchmarks=fillrandom,readmissing,readrandom,stats --num=2000000
and optionally with -memtable_whole_key_filtering -memtable_bloom_size_ratio=0.01
300 runs each configuration.
Write throughput change by enabling memtable bloom:
Old locality=0: -3.06%
Old locality=1: -2.37%
New: -1.50%
conclusion -> seems to substantially close the gap
Readmissing throughput change by enabling memtable bloom:
Old locality=0: +34.47%
Old locality=1: +34.80%
New: +33.25%
conclusion -> maybe a small new penalty from FP rate
Readrandom throughput change by enabling memtable bloom:
Old locality=0: +31.54%
Old locality=1: +31.13%
New: +30.60%
conclusion -> maybe also from FP rate (after memtable flush)
--
Another conclusion we can draw from this new implementation is that the
existing 32-bit hash function is not inherently crippling the Bloom
filter speed or accuracy, below about 5 million keys. For speed, the
implementation is essentially the same whether starting with 32-bits or
64-bits of hash; it just determines whether the first multiplication
after fastrange is a pseudorandom expansion or needed re-mix. Note that
this multiplication can occur while memory is fetching.
For accuracy, in a standard configuration, you need about 5 million
keys before you have about a 1.1x FP penalty due to using a
32-bit hash vs. 64-bit:
[~/wormhashing/bloom_simulation_tests] ./foo_gcc_IMPL_CACHE_MUL64_BLOCK_FROM32_any.out $((5 * 1000 * 1000 * 10)) 6 10 $RANDOM 100000000
./foo_gcc_IMPL_CACHE_MUL64_BLOCK_FROM32_any.out time: 2.52069 sampled_fp_rate: 0.0118267 ...
[~/wormhashing/bloom_simulation_tests] ./foo_gcc_IMPL_CACHE_MUL64_BLOCK_any.out $((5 * 1000 * 1000 * 10)) 6 10 $RANDOM 100000000
./foo_gcc_IMPL_CACHE_MUL64_BLOCK_any.out time: 2.43871 sampled_fp_rate: 0.0109059
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5762
Differential Revision: D17214194
Pulled By: pdillinger
fbshipit-source-id: ad9da031772e985fd6b62a0e1db8e81892520595
2019-09-05 14:57:39 -07:00
|
|
|
6 /* hard coded 6 probes */,
|
2021-04-26 12:43:02 -07:00
|
|
|
moptions_.memtable_huge_page_size, ioptions.logger));
|
2013-11-27 14:27:02 -08:00
|
|
|
}
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2017-06-02 14:13:59 -07:00
|
|
|
MemTable::~MemTable() {
|
|
|
|
mem_tracker_.FreeMem();
|
|
|
|
assert(refs_ == 0);
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-07-23 14:42:27 -07:00
|
|
|
size_t MemTable::ApproximateMemoryUsage() {
|
2020-02-20 12:07:53 -08:00
|
|
|
autovector<size_t> usages = {
|
|
|
|
arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
|
|
|
|
range_del_table_->ApproximateMemoryUsage(),
|
|
|
|
ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
|
2016-11-15 20:05:36 -08:00
|
|
|
size_t total_usage = 0;
|
|
|
|
for (size_t usage : usages) {
|
|
|
|
// If usage + total_usage >= kMaxSizet, return kMaxSizet.
|
|
|
|
// the following variation is to avoid numeric overflow.
|
2022-05-05 13:08:21 -07:00
|
|
|
if (usage >= std::numeric_limits<size_t>::max() - total_usage) {
|
|
|
|
return std::numeric_limits<size_t>::max();
|
2016-11-15 20:05:36 -08:00
|
|
|
}
|
|
|
|
total_usage += usage;
|
Add a new mem-table representation based on cuckoo hash.
Summary:
= Major Changes =
* Add a new mem-table representation, HashCuckooRep, which is based cuckoo hash.
Cuckoo hash uses multiple hash functions. This allows each key to have multiple
possible locations in the mem-table.
- Put: When insert a key, it will try to find whether one of its possible
locations is vacant and store the key. If none of its possible
locations are available, then it will kick out a victim key and
store at that location. The kicked-out victim key will then be
stored at a vacant space of its possible locations or kick-out
another victim. In this diff, the kick-out path (known as
cuckoo-path) is found using BFS, which guarantees to be the shortest.
- Get: Simply tries all possible locations of a key --- this guarantees
worst-case constant time complexity.
- Time complexity: O(1) for Get, and average O(1) for Put if the
fullness of the mem-table is below 80%.
- Default using two hash functions, the number of hash functions used
by the cuckoo-hash may dynamically increase if it fails to find a
short-enough kick-out path.
- Currently, HashCuckooRep does not support iteration and snapshots,
as our current main purpose of this is to optimize point access.
= Minor Changes =
* Add IsSnapshotSupported() to DB to indicate whether the current DB
supports snapshots. If it returns false, then DB::GetSnapshot() will
always return nullptr.
Test Plan:
Run existing tests. Will develop a test specifically for cuckoo hash in
the next diff.
Reviewers: sdong, haobo
Reviewed By: sdong
CC: leveldb, dhruba, igor
Differential Revision: https://reviews.facebook.net/D16155
2014-04-29 17:13:46 -07:00
|
|
|
}
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 13:54:09 -07:00
|
|
|
approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
|
Add a new mem-table representation based on cuckoo hash.
Summary:
= Major Changes =
* Add a new mem-table representation, HashCuckooRep, which is based cuckoo hash.
Cuckoo hash uses multiple hash functions. This allows each key to have multiple
possible locations in the mem-table.
- Put: When insert a key, it will try to find whether one of its possible
locations is vacant and store the key. If none of its possible
locations are available, then it will kick out a victim key and
store at that location. The kicked-out victim key will then be
stored at a vacant space of its possible locations or kick-out
another victim. In this diff, the kick-out path (known as
cuckoo-path) is found using BFS, which guarantees to be the shortest.
- Get: Simply tries all possible locations of a key --- this guarantees
worst-case constant time complexity.
- Time complexity: O(1) for Get, and average O(1) for Put if the
fullness of the mem-table is below 80%.
- Default using two hash functions, the number of hash functions used
by the cuckoo-hash may dynamically increase if it fails to find a
short-enough kick-out path.
- Currently, HashCuckooRep does not support iteration and snapshots,
as our current main purpose of this is to optimize point access.
= Minor Changes =
* Add IsSnapshotSupported() to DB to indicate whether the current DB
supports snapshots. If it returns false, then DB::GetSnapshot() will
always return nullptr.
Test Plan:
Run existing tests. Will develop a test specifically for cuckoo hash in
the next diff.
Reviewers: sdong, haobo
Reviewed By: sdong
CC: leveldb, dhruba, igor
Differential Revision: https://reviews.facebook.net/D16155
2014-04-29 17:13:46 -07:00
|
|
|
// otherwise, return the actual usage
|
2016-11-15 20:05:36 -08:00
|
|
|
return total_usage;
|
2013-07-23 14:42:27 -07:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 13:54:09 -07:00
|
|
|
bool MemTable::ShouldFlushNow() {
|
2017-11-02 22:16:23 -07:00
|
|
|
size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
|
2014-03-12 16:40:14 -07:00
|
|
|
// In a lot of times, we cannot allocate arena blocks that exactly matches the
|
|
|
|
// buffer size. Thus we have to decide if we should over-allocate or
|
|
|
|
// under-allocate.
|
2015-08-31 23:11:12 -07:00
|
|
|
// This constant variable can be interpreted as: if we still have more than
|
2014-03-12 16:40:14 -07:00
|
|
|
// "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
|
|
|
|
// allocate one more block.
|
|
|
|
const double kAllowOverAllocationRatio = 0.6;
|
|
|
|
|
|
|
|
// If arena still have room for new block allocation, we can safely say it
|
|
|
|
// shouldn't flush.
|
2016-09-12 14:14:40 -07:00
|
|
|
auto allocated_memory = table_->ApproximateMemoryUsage() +
|
|
|
|
range_del_table_->ApproximateMemoryUsage() +
|
|
|
|
arena_.MemoryAllocatedBytes();
|
2014-03-12 16:40:14 -07:00
|
|
|
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 13:54:09 -07:00
|
|
|
approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
|
|
|
|
|
2014-03-14 14:01:45 -07:00
|
|
|
// if we can still allocate one more block without exceeding the
|
|
|
|
// over-allocation ratio, then we should not flush.
|
|
|
|
if (allocated_memory + kArenaBlockSize <
|
2017-11-02 22:16:23 -07:00
|
|
|
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
|
2014-03-12 16:40:14 -07:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-11-02 22:16:23 -07:00
|
|
|
// if user keeps adding entries that exceeds write_buffer_size, we need to
|
|
|
|
// flush earlier even though we still have much available memory left.
|
|
|
|
if (allocated_memory >
|
|
|
|
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
|
2014-03-12 16:40:14 -07:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// In this code path, Arena has already allocated its "last block", which
|
|
|
|
// means the total allocatedmemory size is either:
|
2014-03-14 14:01:45 -07:00
|
|
|
// (1) "moderately" over allocated the memory (no more than `0.6 * arena
|
2014-03-12 16:40:14 -07:00
|
|
|
// block size`. Or,
|
|
|
|
// (2) the allocated memory is less than write buffer size, but we'll stop
|
|
|
|
// here since if we allocate a new arena block, we'll over allocate too much
|
|
|
|
// more (half of the arena block size) memory.
|
|
|
|
//
|
|
|
|
// In either case, to avoid over-allocate, the last block will stop allocation
|
|
|
|
// when its usage reaches a certain ratio, which we carefully choose "0.75
|
|
|
|
// full" as the stop condition because it addresses the following issue with
|
|
|
|
// great simplicity: What if the next inserted entry's size is
|
|
|
|
// bigger than AllocatedAndUnused()?
|
|
|
|
//
|
|
|
|
// The answer is: if the entry size is also bigger than 0.25 *
|
|
|
|
// kArenaBlockSize, a dedicated block will be allocated for it; otherwise
|
|
|
|
// arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
|
|
|
|
// and regular block. In either case, we *overly* over-allocated.
|
|
|
|
//
|
|
|
|
// Therefore, setting the last block to be at most "0.75 full" avoids both
|
|
|
|
// cases.
|
|
|
|
//
|
|
|
|
// NOTE: the average percentage of waste space of this approach can be counted
|
|
|
|
// as: "arena block size * 0.25 / write buffer size". User who specify a small
|
|
|
|
// write buffer size and/or big arena block size may suffer.
|
|
|
|
return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
|
|
|
|
}
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
void MemTable::UpdateFlushState() {
|
|
|
|
auto state = flush_state_.load(std::memory_order_relaxed);
|
|
|
|
if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
|
|
|
|
// ignore CAS failure, because that means somebody else requested
|
|
|
|
// a flush
|
|
|
|
flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
|
|
|
|
std::memory_order_relaxed,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-10-23 15:22:05 -07:00
|
|
|
void MemTable::UpdateOldestKeyTime() {
|
|
|
|
uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
|
|
|
|
if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
|
|
|
|
int64_t current_time = 0;
|
2021-01-25 22:07:26 -08:00
|
|
|
auto s = clock_->GetCurrentTime(¤t_time);
|
2017-10-23 15:22:05 -07:00
|
|
|
if (s.ok()) {
|
|
|
|
assert(current_time >= 0);
|
|
|
|
// If fail, the timestamp is already set.
|
|
|
|
oldest_key_time_.compare_exchange_strong(
|
|
|
|
oldest_key_time, static_cast<uint64_t>(current_time),
|
|
|
|
std::memory_order_relaxed, std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-24 17:50:59 -08:00
|
|
|
int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
|
|
|
|
const char* prefix_len_key2) const {
|
|
|
|
// Internal keys are encoded as length-prefixed strings.
|
|
|
|
Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
|
|
|
|
Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
|
2018-01-31 18:45:49 -08:00
|
|
|
return comparator.CompareKeySeq(k1, k2);
|
2014-01-24 17:50:59 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
int MemTable::KeyComparator::operator()(const char* prefix_len_key,
|
2018-03-23 12:12:15 -07:00
|
|
|
const KeyComparator::DecodedType& key)
|
2011-03-18 22:37:00 +00:00
|
|
|
const {
|
|
|
|
// Internal keys are encoded as length-prefixed strings.
|
2014-01-24 17:50:59 -08:00
|
|
|
Slice a = GetLengthPrefixedSlice(prefix_len_key);
|
2018-01-31 18:45:49 -08:00
|
|
|
return comparator.CompareKeySeq(a, key);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2018-03-05 13:08:17 -08:00
|
|
|
void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) {
|
2018-01-18 14:45:48 -08:00
|
|
|
#ifndef ROCKSDB_LITE
|
2018-01-31 18:45:49 -08:00
|
|
|
throw std::runtime_error("concurrent insert not supported");
|
2018-01-18 14:45:48 -08:00
|
|
|
#else
|
2018-01-31 18:45:49 -08:00
|
|
|
abort();
|
2018-01-18 14:45:48 -08:00
|
|
|
#endif
|
2018-01-31 18:45:49 -08:00
|
|
|
}
|
2018-01-18 14:45:48 -08:00
|
|
|
|
2013-08-22 23:10:02 -07:00
|
|
|
Slice MemTableRep::UserKey(const char* key) const {
|
|
|
|
Slice slice = GetLengthPrefixedSlice(key);
|
|
|
|
return Slice(slice.data(), slice.size() - 8);
|
|
|
|
}
|
|
|
|
|
2014-04-04 15:37:28 -07:00
|
|
|
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
|
2014-12-02 12:09:20 -08:00
|
|
|
*buf = allocator_->Allocate(len);
|
2014-04-04 15:37:28 -07:00
|
|
|
return static_cast<KeyHandle>(*buf);
|
|
|
|
}
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Encode a suitable internal key target for "target" and return it.
|
|
|
|
// Uses *scratch as scratch space, and the returned pointer will point
|
|
|
|
// into this scratch space.
|
2013-11-20 19:49:27 -08:00
|
|
|
const char* EncodeKey(std::string* scratch, const Slice& target) {
|
2011-03-18 22:37:00 +00:00
|
|
|
scratch->clear();
|
2014-11-11 16:47:22 -05:00
|
|
|
PutVarint32(scratch, static_cast<uint32_t>(target.size()));
|
2011-03-18 22:37:00 +00:00
|
|
|
scratch->append(target.data(), target.size());
|
|
|
|
return scratch->data();
|
|
|
|
}
|
|
|
|
|
2015-10-12 15:06:38 -07:00
|
|
|
class MemTableIterator : public InternalIterator {
|
2011-03-18 22:37:00 +00:00
|
|
|
public:
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
2016-07-20 09:49:03 -07:00
|
|
|
MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
|
2016-09-12 14:14:40 -07:00
|
|
|
Arena* arena, bool use_range_del_table = false)
|
2014-03-28 14:38:10 -07:00
|
|
|
: bloom_(nullptr),
|
|
|
|
prefix_extractor_(mem.prefix_extractor_),
|
2016-09-27 18:20:57 -07:00
|
|
|
comparator_(mem.comparator_),
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-02 16:38:00 -07:00
|
|
|
valid_(false),
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
2016-07-20 09:49:03 -07:00
|
|
|
arena_mode_(arena != nullptr),
|
2017-11-02 22:16:23 -07:00
|
|
|
value_pinned_(
|
|
|
|
!mem.GetImmutableMemTableOptions()->inplace_update_support) {
|
2016-09-12 14:14:40 -07:00
|
|
|
if (use_range_del_table) {
|
|
|
|
iter_ = mem.range_del_table_->GetIterator(arena);
|
2020-01-28 14:42:21 -08:00
|
|
|
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
|
|
|
|
!read_options.auto_prefix_mode) {
|
|
|
|
// Auto prefix mode is not implemented in memtable yet.
|
2019-02-19 12:12:25 -08:00
|
|
|
bloom_ = mem.bloom_filter_.get();
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-02 16:38:00 -07:00
|
|
|
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
|
2013-11-03 16:32:46 -08:00
|
|
|
} else {
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-02 16:38:00 -07:00
|
|
|
iter_ = mem.table_->GetIterator(arena);
|
|
|
|
}
|
|
|
|
}
|
2019-09-11 18:07:12 -07:00
|
|
|
// No copying allowed
|
|
|
|
MemTableIterator(const MemTableIterator&) = delete;
|
|
|
|
void operator=(const MemTableIterator&) = delete;
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-02 16:38:00 -07:00
|
|
|
|
2019-02-14 13:52:47 -08:00
|
|
|
~MemTableIterator() override {
|
2016-04-26 12:41:07 -07:00
|
|
|
#ifndef NDEBUG
|
|
|
|
// Assert that the MemTableIterator is never deleted while
|
|
|
|
// Pinning is Enabled.
|
2018-02-15 16:43:23 -08:00
|
|
|
assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
|
2016-04-26 12:41:07 -07:00
|
|
|
#endif
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-02 16:38:00 -07:00
|
|
|
if (arena_mode_) {
|
|
|
|
iter_->~Iterator();
|
|
|
|
} else {
|
|
|
|
delete iter_;
|
2013-11-03 16:32:46 -08:00
|
|
|
}
|
|
|
|
}
|
2013-08-22 23:10:02 -07:00
|
|
|
|
2016-04-26 12:41:07 -07:00
|
|
|
#ifndef NDEBUG
|
2019-02-14 13:52:47 -08:00
|
|
|
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
|
2016-04-26 12:41:07 -07:00
|
|
|
pinned_iters_mgr_ = pinned_iters_mgr;
|
|
|
|
}
|
|
|
|
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
|
|
|
|
#endif
|
|
|
|
|
2019-02-14 13:52:47 -08:00
|
|
|
bool Valid() const override { return valid_; }
|
|
|
|
void Seek(const Slice& k) override {
|
2015-02-27 17:06:06 -08:00
|
|
|
PERF_TIMER_GUARD(seek_on_memtable_time);
|
|
|
|
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
|
2019-02-19 12:12:25 -08:00
|
|
|
if (bloom_) {
|
|
|
|
// iterator should only use prefix bloom filter
|
2020-12-01 14:05:19 -08:00
|
|
|
auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size();
|
|
|
|
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz));
|
|
|
|
if (prefix_extractor_->InDomain(user_k_without_ts) &&
|
|
|
|
!bloom_->MayContain(
|
|
|
|
prefix_extractor_->Transform(user_k_without_ts))) {
|
2015-10-07 11:23:20 -07:00
|
|
|
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
|
|
|
|
valid_ = false;
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
|
|
}
|
2013-11-27 14:27:02 -08:00
|
|
|
}
|
|
|
|
iter_->Seek(k, nullptr);
|
|
|
|
valid_ = iter_->Valid();
|
|
|
|
}
|
2019-02-14 13:52:47 -08:00
|
|
|
void SeekForPrev(const Slice& k) override {
|
2016-09-27 18:20:57 -07:00
|
|
|
PERF_TIMER_GUARD(seek_on_memtable_time);
|
|
|
|
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
|
2019-02-19 12:12:25 -08:00
|
|
|
if (bloom_) {
|
2020-12-01 14:05:19 -08:00
|
|
|
auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size();
|
|
|
|
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz));
|
|
|
|
if (prefix_extractor_->InDomain(user_k_without_ts) &&
|
|
|
|
!bloom_->MayContain(
|
|
|
|
prefix_extractor_->Transform(user_k_without_ts))) {
|
2016-09-27 18:20:57 -07:00
|
|
|
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
|
|
|
|
valid_ = false;
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
iter_->Seek(k, nullptr);
|
|
|
|
valid_ = iter_->Valid();
|
|
|
|
if (!Valid()) {
|
|
|
|
SeekToLast();
|
|
|
|
}
|
|
|
|
while (Valid() && comparator_.comparator.Compare(k, key()) < 0) {
|
|
|
|
Prev();
|
|
|
|
}
|
|
|
|
}
|
2019-02-14 13:52:47 -08:00
|
|
|
void SeekToFirst() override {
|
2013-11-27 14:27:02 -08:00
|
|
|
iter_->SeekToFirst();
|
|
|
|
valid_ = iter_->Valid();
|
|
|
|
}
|
2019-02-14 13:52:47 -08:00
|
|
|
void SeekToLast() override {
|
2013-11-27 14:27:02 -08:00
|
|
|
iter_->SeekToLast();
|
|
|
|
valid_ = iter_->Valid();
|
|
|
|
}
|
2019-02-14 13:52:47 -08:00
|
|
|
void Next() override {
|
2016-11-28 10:12:28 -08:00
|
|
|
PERF_COUNTER_ADD(next_on_memtable_count, 1);
|
2013-11-27 14:27:02 -08:00
|
|
|
assert(Valid());
|
|
|
|
iter_->Next();
|
2020-10-01 10:08:52 -07:00
|
|
|
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
|
2013-11-27 14:27:02 -08:00
|
|
|
valid_ = iter_->Valid();
|
|
|
|
}
|
2020-07-29 09:43:56 -07:00
|
|
|
bool NextAndGetResult(IterateResult* result) override {
|
|
|
|
Next();
|
|
|
|
bool is_valid = valid_;
|
|
|
|
if (is_valid) {
|
|
|
|
result->key = key();
|
2020-08-05 10:42:56 -07:00
|
|
|
result->bound_check_result = IterBoundCheck::kUnknown;
|
2020-07-29 09:43:56 -07:00
|
|
|
result->value_prepared = true;
|
|
|
|
}
|
|
|
|
return is_valid;
|
|
|
|
}
|
2019-02-14 13:52:47 -08:00
|
|
|
void Prev() override {
|
2016-11-28 10:12:28 -08:00
|
|
|
PERF_COUNTER_ADD(prev_on_memtable_count, 1);
|
2013-11-27 14:27:02 -08:00
|
|
|
assert(Valid());
|
|
|
|
iter_->Prev();
|
|
|
|
valid_ = iter_->Valid();
|
|
|
|
}
|
2019-02-14 13:52:47 -08:00
|
|
|
Slice key() const override {
|
2013-11-27 14:27:02 -08:00
|
|
|
assert(Valid());
|
2013-07-23 14:42:27 -07:00
|
|
|
return GetLengthPrefixedSlice(iter_->key());
|
|
|
|
}
|
2019-02-14 13:52:47 -08:00
|
|
|
Slice value() const override {
|
2013-11-27 14:27:02 -08:00
|
|
|
assert(Valid());
|
2013-07-23 14:42:27 -07:00
|
|
|
Slice key_slice = GetLengthPrefixedSlice(iter_->key());
|
2011-03-18 22:37:00 +00:00
|
|
|
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
|
|
|
|
}
|
|
|
|
|
2019-02-14 13:52:47 -08:00
|
|
|
Status status() const override { return Status::OK(); }
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2019-02-14 13:52:47 -08:00
|
|
|
bool IsKeyPinned() const override {
|
2015-12-16 12:08:30 -08:00
|
|
|
// memtable data is always pinned
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-02-14 13:52:47 -08:00
|
|
|
bool IsValuePinned() const override {
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
2016-07-20 09:49:03 -07:00
|
|
|
// memtable value is always pinned, except if we allow inplace update.
|
|
|
|
return value_pinned_;
|
|
|
|
}
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
2014-03-28 14:38:10 -07:00
|
|
|
DynamicBloom* bloom_;
|
|
|
|
const SliceTransform* const prefix_extractor_;
|
2016-09-27 18:20:57 -07:00
|
|
|
const MemTable::KeyComparator comparator_;
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-02 16:38:00 -07:00
|
|
|
MemTableRep::Iterator* iter_;
|
2013-11-27 14:27:02 -08:00
|
|
|
bool valid_;
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-02 16:38:00 -07:00
|
|
|
bool arena_mode_;
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
2016-07-20 09:49:03 -07:00
|
|
|
bool value_pinned_;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2015-10-12 15:06:38 -07:00
|
|
|
InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
|
|
|
|
Arena* arena) {
|
2014-09-04 17:40:41 -07:00
|
|
|
assert(arena != nullptr);
|
|
|
|
auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
|
2014-09-08 18:46:52 -07:00
|
|
|
return new (mem) MemTableIterator(*this, read_options, arena);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2018-11-28 15:26:56 -08:00
|
|
|
FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
|
|
|
|
const ReadOptions& read_options, SequenceNumber read_seq) {
|
2018-12-19 17:18:44 -08:00
|
|
|
if (read_options.ignore_range_deletions ||
|
|
|
|
is_range_del_table_empty_.load(std::memory_order_relaxed)) {
|
2016-11-21 12:07:09 -08:00
|
|
|
return nullptr;
|
2016-11-03 18:40:23 -07:00
|
|
|
}
|
2022-01-27 14:53:39 -08:00
|
|
|
return NewRangeTombstoneIteratorInternal(read_options, read_seq);
|
|
|
|
}
|
|
|
|
|
|
|
|
FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
|
|
|
|
const ReadOptions& read_options, SequenceNumber read_seq) {
|
2018-11-28 15:26:56 -08:00
|
|
|
auto* unfragmented_iter = new MemTableIterator(
|
|
|
|
*this, read_options, nullptr /* arena */, true /* use_range_del_table */);
|
|
|
|
auto fragmented_tombstone_list =
|
|
|
|
std::make_shared<FragmentedRangeTombstoneList>(
|
|
|
|
std::unique_ptr<InternalIterator>(unfragmented_iter),
|
|
|
|
comparator_.comparator);
|
|
|
|
|
|
|
|
auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
|
2018-12-11 11:44:24 -08:00
|
|
|
fragmented_tombstone_list, comparator_.comparator, read_seq);
|
2018-11-28 15:26:56 -08:00
|
|
|
return fragmented_iter;
|
2016-09-12 14:14:40 -07:00
|
|
|
}
|
|
|
|
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
port::RWMutex* MemTable::GetLock(const Slice& key) {
|
2020-09-28 11:33:31 -07:00
|
|
|
return &locks_[GetSliceRangedNPHash(key, locks_.size())];
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
}
|
|
|
|
|
2017-02-06 14:42:38 -08:00
|
|
|
MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
|
|
|
|
const Slice& end_ikey) {
|
2015-06-12 18:04:30 -07:00
|
|
|
uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
|
2016-09-12 14:14:40 -07:00
|
|
|
entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
|
2015-06-12 18:04:30 -07:00
|
|
|
if (entry_count == 0) {
|
2017-02-06 14:42:38 -08:00
|
|
|
return {0, 0};
|
2015-06-12 18:04:30 -07:00
|
|
|
}
|
|
|
|
uint64_t n = num_entries_.load(std::memory_order_relaxed);
|
|
|
|
if (n == 0) {
|
2017-02-06 14:42:38 -08:00
|
|
|
return {0, 0};
|
2015-06-12 18:04:30 -07:00
|
|
|
}
|
|
|
|
if (entry_count > n) {
|
2016-09-12 14:14:40 -07:00
|
|
|
// (range_del_)table_->ApproximateNumEntries() is just an estimate so it can
|
|
|
|
// be larger than actual entries we have. Cap it to entries we have to limit
|
|
|
|
// the inaccuracy.
|
2015-06-12 18:04:30 -07:00
|
|
|
entry_count = n;
|
|
|
|
}
|
|
|
|
uint64_t data_size = data_size_.load(std::memory_order_relaxed);
|
2017-02-06 14:42:38 -08:00
|
|
|
return {entry_count * (data_size / n), entry_count};
|
2015-06-12 18:04:30 -07:00
|
|
|
}
|
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
Status MemTable::VerifyEncodedEntry(Slice encoded,
|
2021-09-14 13:13:36 -07:00
|
|
|
const ProtectionInfoKVOS64& kv_prot_info) {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
uint32_t ikey_len = 0;
|
|
|
|
if (!GetVarint32(&encoded, &ikey_len)) {
|
|
|
|
return Status::Corruption("Unable to parse internal key length");
|
|
|
|
}
|
|
|
|
size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
|
|
|
|
if (ikey_len < 8 + ts_sz) {
|
|
|
|
return Status::Corruption("Internal key length too short");
|
|
|
|
}
|
|
|
|
if (ikey_len > encoded.size()) {
|
|
|
|
return Status::Corruption("Internal key length too long");
|
|
|
|
}
|
|
|
|
uint32_t value_len = 0;
|
2021-09-14 13:13:36 -07:00
|
|
|
const size_t user_key_len = ikey_len - 8;
|
|
|
|
Slice key(encoded.data(), user_key_len);
|
|
|
|
encoded.remove_prefix(user_key_len);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
|
|
|
|
uint64_t packed = DecodeFixed64(encoded.data());
|
|
|
|
ValueType value_type = kMaxValue;
|
|
|
|
SequenceNumber sequence_number = kMaxSequenceNumber;
|
|
|
|
UnPackSequenceAndType(packed, &sequence_number, &value_type);
|
|
|
|
encoded.remove_prefix(8);
|
|
|
|
|
|
|
|
if (!GetVarint32(&encoded, &value_len)) {
|
|
|
|
return Status::Corruption("Unable to parse value length");
|
|
|
|
}
|
|
|
|
if (value_len < encoded.size()) {
|
|
|
|
return Status::Corruption("Value length too short");
|
|
|
|
}
|
|
|
|
if (value_len > encoded.size()) {
|
|
|
|
return Status::Corruption("Value length too long");
|
|
|
|
}
|
|
|
|
Slice value(encoded.data(), value_len);
|
|
|
|
|
|
|
|
return kv_prot_info.StripS(sequence_number)
|
2021-09-14 13:13:36 -07:00
|
|
|
.StripKVO(key, value, value_type)
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
.GetStatus();
|
|
|
|
}
|
|
|
|
|
2020-11-23 16:27:46 -08:00
|
|
|
Status MemTable::Add(SequenceNumber s, ValueType type,
|
|
|
|
const Slice& key, /* user key */
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
const Slice& value,
|
2021-09-14 13:13:36 -07:00
|
|
|
const ProtectionInfoKVOS64* kv_prot_info,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
bool allow_concurrent,
|
2020-11-23 16:27:46 -08:00
|
|
|
MemTablePostProcessInfo* post_process_info, void** hint) {
|
2011-03-18 22:37:00 +00:00
|
|
|
// Format of an entry is concatenation of:
|
|
|
|
// key_size : varint32 of internal_key.size()
|
|
|
|
// key bytes : char[internal_key.size()]
|
|
|
|
// value_size : varint32 of value.size()
|
|
|
|
// value bytes : char[value.size()]
|
2014-11-11 16:47:22 -05:00
|
|
|
uint32_t key_size = static_cast<uint32_t>(key.size());
|
|
|
|
uint32_t val_size = static_cast<uint32_t>(value.size());
|
|
|
|
uint32_t internal_key_size = key_size + 8;
|
|
|
|
const uint32_t encoded_len = VarintLength(internal_key_size) +
|
|
|
|
internal_key_size + VarintLength(val_size) +
|
|
|
|
val_size;
|
2014-04-04 15:37:28 -07:00
|
|
|
char* buf = nullptr;
|
2016-09-12 14:14:40 -07:00
|
|
|
std::unique_ptr<MemTableRep>& table =
|
|
|
|
type == kTypeRangeDeletion ? range_del_table_ : table_;
|
|
|
|
KeyHandle handle = table->Allocate(encoded_len, &buf);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
char* p = EncodeVarint32(buf, internal_key_size);
|
|
|
|
memcpy(p, key.data(), key_size);
|
2016-11-13 18:58:17 -08:00
|
|
|
Slice key_slice(p, key_size);
|
2011-03-18 22:37:00 +00:00
|
|
|
p += key_size;
|
2015-05-29 14:36:35 -07:00
|
|
|
uint64_t packed = PackSequenceAndType(s, type);
|
|
|
|
EncodeFixed64(p, packed);
|
2011-03-18 22:37:00 +00:00
|
|
|
p += 8;
|
|
|
|
p = EncodeVarint32(p, val_size);
|
|
|
|
memcpy(p, value.data(), val_size);
|
2014-02-03 13:48:30 -08:00
|
|
|
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
Slice encoded(buf, encoded_len);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded);
|
|
|
|
Status status = VerifyEncodedEntry(encoded, *kv_prot_info);
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-05 23:07:28 -07:00
|
|
|
size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
|
2020-12-01 14:05:19 -08:00
|
|
|
Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz);
|
2019-06-05 23:07:28 -07:00
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
if (!allow_concurrent) {
|
2016-11-13 18:58:17 -08:00
|
|
|
// Extract prefix for insert with hint.
|
2016-11-22 14:06:54 -08:00
|
|
|
if (insert_with_hint_prefix_extractor_ != nullptr &&
|
|
|
|
insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
|
|
|
|
Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
|
2018-02-15 17:12:48 -08:00
|
|
|
bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
|
2018-04-07 21:55:42 -07:00
|
|
|
if (UNLIKELY(!res)) {
|
2020-11-23 16:27:46 -08:00
|
|
|
return Status::TryAgain("key+seq exists");
|
2018-01-31 18:45:49 -08:00
|
|
|
}
|
2016-11-22 14:06:54 -08:00
|
|
|
} else {
|
2018-02-15 17:12:48 -08:00
|
|
|
bool res = table->InsertKey(handle);
|
2018-04-07 21:55:42 -07:00
|
|
|
if (UNLIKELY(!res)) {
|
2020-11-23 16:27:46 -08:00
|
|
|
return Status::TryAgain("key+seq exists");
|
2018-01-31 18:45:49 -08:00
|
|
|
}
|
2016-11-13 18:58:17 -08:00
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
|
|
|
|
// this is a bit ugly, but is the way to avoid locked instructions
|
|
|
|
// when incrementing an atomic
|
|
|
|
num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
|
2015-06-12 18:04:30 -07:00
|
|
|
std::memory_order_relaxed);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
if (type == kTypeDeletion) {
|
|
|
|
num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
2013-02-28 14:09:30 -08:00
|
|
|
|
2019-04-12 17:03:08 -07:00
|
|
|
if (bloom_filter_ && prefix_extractor_ &&
|
2020-12-01 14:05:19 -08:00
|
|
|
prefix_extractor_->InDomain(key_without_ts)) {
|
|
|
|
bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts));
|
2019-02-19 12:12:25 -08:00
|
|
|
}
|
|
|
|
if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
|
2020-12-01 14:05:19 -08:00
|
|
|
bloom_filter_->Add(key_without_ts);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
}
|
2013-11-27 14:27:02 -08:00
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
// The first sequence number inserted into the memtable
|
2017-09-28 16:43:04 -07:00
|
|
|
assert(first_seqno_ == 0 || s >= first_seqno_);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
if (first_seqno_ == 0) {
|
|
|
|
first_seqno_.store(s, std::memory_order_relaxed);
|
2015-05-29 14:36:35 -07:00
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
if (earliest_seqno_ == kMaxSequenceNumber) {
|
|
|
|
earliest_seqno_.store(GetFirstSequenceNumber(),
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
assert(first_seqno_.load() >= earliest_seqno_.load());
|
|
|
|
}
|
2016-07-07 14:45:29 -07:00
|
|
|
assert(post_process_info == nullptr);
|
|
|
|
UpdateFlushState();
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
} else {
|
2019-09-12 16:53:31 -07:00
|
|
|
bool res = (hint == nullptr)
|
|
|
|
? table->InsertKeyConcurrently(handle)
|
|
|
|
: table->InsertKeyWithHintConcurrently(handle, hint);
|
2018-04-07 21:55:42 -07:00
|
|
|
if (UNLIKELY(!res)) {
|
2020-11-23 16:27:46 -08:00
|
|
|
return Status::TryAgain("key+seq exists");
|
2018-01-31 18:45:49 -08:00
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
|
2016-07-07 14:45:29 -07:00
|
|
|
assert(post_process_info != nullptr);
|
|
|
|
post_process_info->num_entries++;
|
|
|
|
post_process_info->data_size += encoded_len;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
if (type == kTypeDeletion) {
|
2016-07-07 14:45:29 -07:00
|
|
|
post_process_info->num_deletes++;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
}
|
|
|
|
|
2019-04-12 17:03:08 -07:00
|
|
|
if (bloom_filter_ && prefix_extractor_ &&
|
2020-12-01 14:05:19 -08:00
|
|
|
prefix_extractor_->InDomain(key_without_ts)) {
|
|
|
|
bloom_filter_->AddConcurrently(
|
|
|
|
prefix_extractor_->Transform(key_without_ts));
|
2019-02-19 12:12:25 -08:00
|
|
|
}
|
|
|
|
if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
|
2020-12-01 14:05:19 -08:00
|
|
|
bloom_filter_->AddConcurrently(key_without_ts);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// atomically update first_seqno_ and earliest_seqno_.
|
|
|
|
uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
|
|
|
|
while ((cur_seq_num == 0 || s < cur_seq_num) &&
|
|
|
|
!first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
|
|
|
|
}
|
|
|
|
uint64_t cur_earliest_seqno =
|
|
|
|
earliest_seqno_.load(std::memory_order_relaxed);
|
|
|
|
while (
|
|
|
|
(cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
|
|
|
|
!first_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
|
2015-05-29 14:36:35 -07:00
|
|
|
}
|
2013-02-28 14:09:30 -08:00
|
|
|
}
|
2018-12-19 17:18:44 -08:00
|
|
|
if (type == kTypeRangeDeletion) {
|
|
|
|
is_range_del_table_empty_.store(false, std::memory_order_relaxed);
|
2016-11-21 12:07:09 -08:00
|
|
|
}
|
2017-10-23 15:22:05 -07:00
|
|
|
UpdateOldestKeyTime();
|
2020-11-23 16:27:46 -08:00
|
|
|
return Status::OK();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2014-02-11 09:46:30 -08:00
|
|
|
// Callback from MemTable::Get()
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
struct Saver {
|
|
|
|
Status* status;
|
|
|
|
const LookupKey* key;
|
|
|
|
bool* found_final_value; // Is value set correctly? Used by KeyMayExist
|
|
|
|
bool* merge_in_progress;
|
2017-01-08 14:08:51 -08:00
|
|
|
std::string* value;
|
2015-05-29 14:36:35 -07:00
|
|
|
SequenceNumber seq;
|
2020-03-02 15:58:32 -08:00
|
|
|
std::string* timestamp;
|
2014-02-11 09:46:30 -08:00
|
|
|
const MergeOperator* merge_operator;
|
|
|
|
// the merge operations encountered;
|
|
|
|
MergeContext* merge_context;
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
SequenceNumber max_covering_tombstone_seq;
|
2014-02-11 09:46:30 -08:00
|
|
|
MemTable* mem;
|
|
|
|
Logger* logger;
|
|
|
|
Statistics* statistics;
|
|
|
|
bool inplace_update_support;
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 14:22:34 -07:00
|
|
|
bool do_merge;
|
2021-03-15 04:32:24 -07:00
|
|
|
SystemClock* clock;
|
2021-01-25 22:07:26 -08:00
|
|
|
|
2017-09-11 08:58:52 -07:00
|
|
|
ReadCallback* callback_;
|
2017-10-03 09:08:07 -07:00
|
|
|
bool* is_blob_index;
|
2020-09-29 23:16:12 -07:00
|
|
|
bool allow_data_in_errors;
|
2017-09-11 08:58:52 -07:00
|
|
|
bool CheckCallback(SequenceNumber _seq) {
|
|
|
|
if (callback_) {
|
2018-06-27 12:05:29 -07:00
|
|
|
return callback_->IsVisible(_seq);
|
2017-09-11 08:58:52 -07:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2014-02-11 09:46:30 -08:00
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
static bool SaveValue(void* arg, const char* entry) {
|
|
|
|
Saver* s = reinterpret_cast<Saver*>(arg);
|
2018-02-15 16:43:23 -08:00
|
|
|
assert(s != nullptr);
|
2014-02-11 09:46:30 -08:00
|
|
|
MergeContext* merge_context = s->merge_context;
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
|
2014-02-11 09:46:30 -08:00
|
|
|
const MergeOperator* merge_operator = s->merge_operator;
|
|
|
|
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
assert(merge_context != nullptr);
|
2014-02-11 09:46:30 -08:00
|
|
|
|
|
|
|
// entry format is:
|
|
|
|
// klength varint32
|
|
|
|
// userkey char[klength-8]
|
|
|
|
// tag uint64
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 14:22:34 -07:00
|
|
|
// vlength varint32f
|
2014-02-11 09:46:30 -08:00
|
|
|
// value char[vlength]
|
|
|
|
// Check that it belongs to same user key. We do not check the
|
|
|
|
// sequence number since the Seek() call above should have skipped
|
|
|
|
// all entries with overly large sequence numbers.
|
2020-06-26 11:12:06 -07:00
|
|
|
uint32_t key_length = 0;
|
2014-02-11 09:46:30 -08:00
|
|
|
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
2020-06-26 11:12:06 -07:00
|
|
|
assert(key_length >= 8);
|
2019-06-05 23:07:28 -07:00
|
|
|
Slice user_key_slice = Slice(key_ptr, key_length - 8);
|
2020-03-02 15:58:32 -08:00
|
|
|
const Comparator* user_comparator =
|
|
|
|
s->mem->GetInternalKeyComparator().user_comparator();
|
|
|
|
size_t ts_sz = user_comparator->timestamp_size();
|
2021-03-10 11:13:55 -08:00
|
|
|
if (user_comparator->EqualWithoutTimestamp(user_key_slice,
|
|
|
|
s->key->user_key())) {
|
2014-02-11 09:46:30 -08:00
|
|
|
// Correct user key
|
|
|
|
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
2015-05-29 14:36:35 -07:00
|
|
|
ValueType type;
|
2017-09-11 08:58:52 -07:00
|
|
|
SequenceNumber seq;
|
|
|
|
UnPackSequenceAndType(tag, &seq, &type);
|
|
|
|
// If the value is not in the snapshot, skip it
|
|
|
|
if (!s->CheckCallback(seq)) {
|
|
|
|
return true; // to continue to the next seq
|
|
|
|
}
|
|
|
|
|
|
|
|
s->seq = seq;
|
2015-05-29 14:36:35 -07:00
|
|
|
|
2017-10-03 09:08:07 -07:00
|
|
|
if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
max_covering_tombstone_seq > seq) {
|
2016-11-03 18:40:23 -07:00
|
|
|
type = kTypeRangeDeletion;
|
|
|
|
}
|
2015-05-29 14:36:35 -07:00
|
|
|
switch (type) {
|
2017-10-03 09:08:07 -07:00
|
|
|
case kTypeBlobIndex:
|
|
|
|
if (s->is_blob_index == nullptr) {
|
|
|
|
ROCKS_LOG_ERROR(s->logger, "Encounter unexpected blob index.");
|
|
|
|
*(s->status) = Status::NotSupported(
|
|
|
|
"Encounter unsupported blob value. Please open DB with "
|
2020-02-20 12:07:53 -08:00
|
|
|
"ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
|
2017-10-03 09:08:07 -07:00
|
|
|
} else if (*(s->merge_in_progress)) {
|
|
|
|
*(s->status) =
|
|
|
|
Status::NotSupported("Blob DB does not support merge operator.");
|
|
|
|
}
|
|
|
|
if (!s->status->ok()) {
|
|
|
|
*(s->found_final_value) = true;
|
|
|
|
return false;
|
|
|
|
}
|
2018-10-15 19:59:20 -07:00
|
|
|
FALLTHROUGH_INTENDED;
|
2014-02-11 09:46:30 -08:00
|
|
|
case kTypeValue: {
|
|
|
|
if (s->inplace_update_support) {
|
|
|
|
s->mem->GetLock(s->key->user_key())->ReadLock();
|
|
|
|
}
|
|
|
|
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
|
|
*(s->status) = Status::OK();
|
2017-01-08 14:08:51 -08:00
|
|
|
if (*(s->merge_in_progress)) {
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 14:22:34 -07:00
|
|
|
if (s->do_merge) {
|
|
|
|
if (s->value != nullptr) {
|
|
|
|
*(s->status) = MergeHelper::TimedFullMerge(
|
|
|
|
merge_operator, s->key->user_key(), &v,
|
|
|
|
merge_context->GetOperands(), s->value, s->logger,
|
2021-01-25 22:07:26 -08:00
|
|
|
s->statistics, s->clock, nullptr /* result_operand */, true);
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 14:22:34 -07:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Preserve the value with the goal of returning it as part of
|
|
|
|
// raw merge operands to the user
|
|
|
|
merge_context->PushOperand(
|
|
|
|
v, s->inplace_update_support == false /* operand_pinned */);
|
2017-09-12 11:41:13 -07:00
|
|
|
}
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 14:22:34 -07:00
|
|
|
} else if (!s->do_merge) {
|
|
|
|
// Preserve the value with the goal of returning it as part of
|
|
|
|
// raw merge operands to the user
|
|
|
|
merge_context->PushOperand(
|
|
|
|
v, s->inplace_update_support == false /* operand_pinned */);
|
2017-01-08 14:08:51 -08:00
|
|
|
} else if (s->value != nullptr) {
|
|
|
|
s->value->assign(v.data(), v.size());
|
2014-02-11 09:46:30 -08:00
|
|
|
}
|
|
|
|
if (s->inplace_update_support) {
|
2014-06-16 15:41:46 -07:00
|
|
|
s->mem->GetLock(s->key->user_key())->ReadUnlock();
|
2014-02-11 09:46:30 -08:00
|
|
|
}
|
|
|
|
*(s->found_final_value) = true;
|
2017-10-03 09:08:07 -07:00
|
|
|
if (s->is_blob_index != nullptr) {
|
|
|
|
*(s->is_blob_index) = (type == kTypeBlobIndex);
|
|
|
|
}
|
2020-03-02 15:58:32 -08:00
|
|
|
|
|
|
|
if (ts_sz > 0 && s->timestamp != nullptr) {
|
|
|
|
Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
|
|
|
|
s->timestamp->assign(ts.data(), ts.size());
|
|
|
|
}
|
2014-02-11 09:46:30 -08:00
|
|
|
return false;
|
|
|
|
}
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 11:42:56 -07:00
|
|
|
case kTypeDeletion:
|
2020-05-28 10:37:57 -07:00
|
|
|
case kTypeDeletionWithTimestamp:
|
2016-11-03 18:40:23 -07:00
|
|
|
case kTypeSingleDeletion:
|
|
|
|
case kTypeRangeDeletion: {
|
2014-02-11 09:46:30 -08:00
|
|
|
if (*(s->merge_in_progress)) {
|
2017-09-12 11:41:13 -07:00
|
|
|
if (s->value != nullptr) {
|
|
|
|
*(s->status) = MergeHelper::TimedFullMerge(
|
|
|
|
merge_operator, s->key->user_key(), nullptr,
|
|
|
|
merge_context->GetOperands(), s->value, s->logger,
|
2021-01-25 22:07:26 -08:00
|
|
|
s->statistics, s->clock, nullptr /* result_operand */, true);
|
2017-09-12 11:41:13 -07:00
|
|
|
}
|
2014-02-11 09:46:30 -08:00
|
|
|
} else {
|
|
|
|
*(s->status) = Status::NotFound();
|
|
|
|
}
|
|
|
|
*(s->found_final_value) = true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
case kTypeMerge: {
|
2014-07-30 17:24:36 -07:00
|
|
|
if (!merge_operator) {
|
|
|
|
*(s->status) = Status::InvalidArgument(
|
|
|
|
"merge_operator is not properly initialized.");
|
|
|
|
// Normally we continue the loop (return true) when we see a merge
|
2014-07-30 17:25:11 -07:00
|
|
|
// operand. But in case of an error, we should stop the loop
|
2014-07-30 17:24:36 -07:00
|
|
|
// immediately and pretend we have found the value to stop further
|
|
|
|
// seek. Otherwise, the later call will override this error status.
|
|
|
|
*(s->found_final_value) = true;
|
|
|
|
return false;
|
|
|
|
}
|
2014-02-11 09:46:30 -08:00
|
|
|
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
|
|
*(s->merge_in_progress) = true;
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
2016-07-20 09:49:03 -07:00
|
|
|
merge_context->PushOperand(
|
|
|
|
v, s->inplace_update_support == false /* operand_pinned */);
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 14:22:34 -07:00
|
|
|
if (s->do_merge && merge_operator->ShouldMerge(
|
|
|
|
merge_context->GetOperandsDirectionBackward())) {
|
2017-09-28 15:49:05 -07:00
|
|
|
*(s->status) = MergeHelper::TimedFullMerge(
|
|
|
|
merge_operator, s->key->user_key(), nullptr,
|
|
|
|
merge_context->GetOperands(), s->value, s->logger, s->statistics,
|
2021-01-25 22:07:26 -08:00
|
|
|
s->clock, nullptr /* result_operand */, true);
|
2017-09-28 15:49:05 -07:00
|
|
|
*(s->found_final_value) = true;
|
|
|
|
return false;
|
|
|
|
}
|
2014-02-11 09:46:30 -08:00
|
|
|
return true;
|
|
|
|
}
|
2020-07-13 20:25:37 -07:00
|
|
|
default: {
|
2020-09-29 23:16:12 -07:00
|
|
|
std::string msg("Corrupted value not expected.");
|
|
|
|
if (s->allow_data_in_errors) {
|
|
|
|
msg.append("Unrecognized value type: " +
|
|
|
|
std::to_string(static_cast<int>(type)) + ". ");
|
|
|
|
msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) +
|
|
|
|
". ");
|
|
|
|
msg.append("seq: " + std::to_string(seq) + ".");
|
|
|
|
}
|
2020-07-13 20:25:37 -07:00
|
|
|
*(s->status) = Status::Corruption(msg.c_str());
|
2020-09-29 23:16:12 -07:00
|
|
|
return false;
|
2020-07-13 20:25:37 -07:00
|
|
|
}
|
2014-02-11 09:46:30 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// s->state could be Corrupt, merge or notfound
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-03-02 15:58:32 -08:00
|
|
|
bool MemTable::Get(const LookupKey& key, std::string* value,
|
|
|
|
std::string* timestamp, Status* s,
|
2016-11-03 18:40:23 -07:00
|
|
|
MergeContext* merge_context,
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
SequenceNumber* max_covering_tombstone_seq,
|
|
|
|
SequenceNumber* seq, const ReadOptions& read_opts,
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 14:22:34 -07:00
|
|
|
ReadCallback* callback, bool* is_blob_index, bool do_merge) {
|
2014-08-26 17:19:03 -07:00
|
|
|
// The sequence number is updated synchronously in version_set.h
|
2014-09-05 12:01:01 -07:00
|
|
|
if (IsEmpty()) {
|
2014-08-26 17:19:03 -07:00
|
|
|
// Avoiding recording stats for speed.
|
|
|
|
return false;
|
|
|
|
}
|
2014-08-22 15:28:58 -07:00
|
|
|
PERF_TIMER_GUARD(get_from_memtable_time);
|
2013-11-18 11:32:54 -08:00
|
|
|
|
2018-11-28 15:26:56 -08:00
|
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
|
|
|
|
NewRangeTombstoneIterator(read_opts,
|
|
|
|
GetInternalKeySeqno(key.internal_key())));
|
|
|
|
if (range_del_iter != nullptr) {
|
|
|
|
*max_covering_tombstone_seq =
|
|
|
|
std::max(*max_covering_tombstone_seq,
|
|
|
|
range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key()));
|
|
|
|
}
|
2017-08-16 19:00:46 -07:00
|
|
|
|
2014-02-11 09:46:30 -08:00
|
|
|
bool found_final_value = false;
|
|
|
|
bool merge_in_progress = s->IsMergeInProgress();
|
2019-02-19 12:12:25 -08:00
|
|
|
bool may_contain = true;
|
2019-06-05 23:07:28 -07:00
|
|
|
size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
|
2020-12-01 14:05:19 -08:00
|
|
|
Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz);
|
2019-02-19 12:12:25 -08:00
|
|
|
if (bloom_filter_) {
|
|
|
|
// when both memtable_whole_key_filtering and prefix_extractor_ are set,
|
|
|
|
// only do whole key filtering for Get() to save CPU
|
|
|
|
if (moptions_.memtable_whole_key_filtering) {
|
2020-12-01 14:05:19 -08:00
|
|
|
may_contain = bloom_filter_->MayContain(user_key_without_ts);
|
2019-02-19 12:12:25 -08:00
|
|
|
} else {
|
|
|
|
assert(prefix_extractor_);
|
2020-12-01 14:05:19 -08:00
|
|
|
may_contain = !prefix_extractor_->InDomain(user_key_without_ts) ||
|
|
|
|
bloom_filter_->MayContain(
|
|
|
|
prefix_extractor_->Transform(user_key_without_ts));
|
2019-02-19 12:12:25 -08:00
|
|
|
}
|
|
|
|
}
|
2019-10-10 09:37:38 -07:00
|
|
|
|
2019-02-19 12:12:25 -08:00
|
|
|
if (bloom_filter_ && !may_contain) {
|
2013-11-27 14:27:02 -08:00
|
|
|
// iter is null if prefix bloom says the key does not exist
|
2015-10-07 11:23:20 -07:00
|
|
|
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
|
2015-05-29 14:36:35 -07:00
|
|
|
*seq = kMaxSequenceNumber;
|
2013-11-27 14:27:02 -08:00
|
|
|
} else {
|
2019-02-19 12:12:25 -08:00
|
|
|
if (bloom_filter_) {
|
2015-10-07 11:23:20 -07:00
|
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
|
|
}
|
2019-10-10 09:37:38 -07:00
|
|
|
GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
|
2020-03-02 15:58:32 -08:00
|
|
|
is_blob_index, value, timestamp, s, merge_context, seq,
|
2019-10-10 09:37:38 -07:00
|
|
|
&found_final_value, &merge_in_progress);
|
2011-06-22 02:36:45 +00:00
|
|
|
}
|
2013-03-21 15:59:47 -07:00
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
2013-08-05 20:14:32 -07:00
|
|
|
// No change to value, since we have not yet found a Put/Delete
|
2013-11-18 11:32:54 -08:00
|
|
|
if (!found_final_value && merge_in_progress) {
|
Simplify querying of merge results
Summary:
While working on supporting mixing merge operators with
single deletes ( https://reviews.facebook.net/D43179 ),
I realized that returning and dealing with merge results
can be made simpler. Submitting this as a separate diff
because it is not directly related to single deletes.
Before, callers of merge helper had to retrieve the merge
result in one of two ways depending on whether the merge
was successful or not (success = result of merge was single
kTypeValue). For successful merges, the caller could query
the resulting key/value pair and for unsuccessful merges,
the result could be retrieved in the form of two deques of
keys and values. However, with single deletes, a successful merge
does not return a single key/value pair (if merge
operands are merged with a single delete, we have to generate
a value and keep the original single delete around to make
sure that we are not accidentially producing a key overwrite).
In addition, the two existing call sites of the merge
helper were taking the same actions independently from whether
the merge was successful or not, so this patch simplifies that.
Test Plan: make clean all check
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43353
2015-08-17 17:34:38 -07:00
|
|
|
*s = Status::MergeInProgress();
|
2013-03-21 15:59:47 -07:00
|
|
|
}
|
2014-04-08 10:58:07 -07:00
|
|
|
PERF_COUNTER_ADD(get_from_memtable_count, 1);
|
2013-11-18 11:32:54 -08:00
|
|
|
return found_final_value;
|
2011-06-22 02:36:45 +00:00
|
|
|
}
|
|
|
|
|
2019-10-10 09:37:38 -07:00
|
|
|
void MemTable::GetFromTable(const LookupKey& key,
|
|
|
|
SequenceNumber max_covering_tombstone_seq,
|
|
|
|
bool do_merge, ReadCallback* callback,
|
2020-03-02 15:58:32 -08:00
|
|
|
bool* is_blob_index, std::string* value,
|
|
|
|
std::string* timestamp, Status* s,
|
2019-10-10 09:37:38 -07:00
|
|
|
MergeContext* merge_context, SequenceNumber* seq,
|
|
|
|
bool* found_final_value, bool* merge_in_progress) {
|
|
|
|
Saver saver;
|
|
|
|
saver.status = s;
|
|
|
|
saver.found_final_value = found_final_value;
|
|
|
|
saver.merge_in_progress = merge_in_progress;
|
|
|
|
saver.key = &key;
|
|
|
|
saver.value = value;
|
2020-03-02 15:58:32 -08:00
|
|
|
saver.timestamp = timestamp;
|
2019-10-10 09:37:38 -07:00
|
|
|
saver.seq = kMaxSequenceNumber;
|
|
|
|
saver.mem = this;
|
|
|
|
saver.merge_context = merge_context;
|
|
|
|
saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
|
|
|
|
saver.merge_operator = moptions_.merge_operator;
|
|
|
|
saver.logger = moptions_.info_log;
|
|
|
|
saver.inplace_update_support = moptions_.inplace_update_support;
|
|
|
|
saver.statistics = moptions_.statistics;
|
2021-01-25 22:07:26 -08:00
|
|
|
saver.clock = clock_;
|
2019-10-10 09:37:38 -07:00
|
|
|
saver.callback_ = callback;
|
|
|
|
saver.is_blob_index = is_blob_index;
|
|
|
|
saver.do_merge = do_merge;
|
2020-09-29 23:16:12 -07:00
|
|
|
saver.allow_data_in_errors = moptions_.allow_data_in_errors;
|
2019-10-10 09:37:38 -07:00
|
|
|
table_->Get(key, &saver, SaveValue);
|
|
|
|
*seq = saver.seq;
|
|
|
|
}
|
|
|
|
|
|
|
|
void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
2020-12-14 13:47:17 -08:00
|
|
|
ReadCallback* callback) {
|
2019-10-10 09:37:38 -07:00
|
|
|
// The sequence number is updated synchronously in version_set.h
|
|
|
|
if (IsEmpty()) {
|
|
|
|
// Avoiding recording stats for speed.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
PERF_TIMER_GUARD(get_from_memtable_time);
|
|
|
|
|
2022-01-27 14:53:39 -08:00
|
|
|
// For now, memtable Bloom filter is effectively disabled if there are any
|
|
|
|
// range tombstones. This is the simplest way to ensure range tombstones are
|
|
|
|
// handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
|
|
|
|
bool no_range_del = read_options.ignore_range_deletions ||
|
|
|
|
is_range_del_table_empty_.load(std::memory_order_relaxed);
|
2019-10-10 09:37:38 -07:00
|
|
|
MultiGetRange temp_range(*range, range->begin(), range->end());
|
2022-01-27 14:53:39 -08:00
|
|
|
if (bloom_filter_ && no_range_del) {
|
|
|
|
bool whole_key =
|
|
|
|
!prefix_extractor_ || moptions_.memtable_whole_key_filtering;
|
|
|
|
std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
|
|
|
|
std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
|
|
|
|
std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
|
2019-10-10 09:37:38 -07:00
|
|
|
int num_keys = 0;
|
|
|
|
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
|
2022-01-27 14:53:39 -08:00
|
|
|
if (whole_key) {
|
|
|
|
bloom_keys[num_keys] = iter->ukey_without_ts;
|
|
|
|
range_indexes[num_keys++] = iter.index();
|
2020-11-03 09:44:21 -08:00
|
|
|
} else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
|
2022-01-27 14:53:39 -08:00
|
|
|
bloom_keys[num_keys] =
|
|
|
|
prefix_extractor_->Transform(iter->ukey_without_ts);
|
|
|
|
range_indexes[num_keys++] = iter.index();
|
|
|
|
} else {
|
|
|
|
// TODO: consider not counting these as Bloom hits to more closely
|
|
|
|
// match bloom_sst_hit_count
|
2019-10-10 09:37:38 -07:00
|
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
|
|
}
|
2022-01-27 14:53:39 -08:00
|
|
|
}
|
|
|
|
bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
|
|
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
|
|
if (!may_match[i]) {
|
|
|
|
temp_range.SkipIndex(range_indexes[i]);
|
2019-10-10 09:37:38 -07:00
|
|
|
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
|
|
|
|
} else {
|
|
|
|
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
|
|
|
|
bool found_final_value{false};
|
|
|
|
bool merge_in_progress = iter->s->IsMergeInProgress();
|
2022-01-27 14:53:39 -08:00
|
|
|
if (!no_range_del) {
|
|
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
|
|
|
|
NewRangeTombstoneIteratorInternal(
|
|
|
|
read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
|
2019-10-10 09:37:38 -07:00
|
|
|
iter->max_covering_tombstone_seq = std::max(
|
|
|
|
iter->max_covering_tombstone_seq,
|
|
|
|
range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
|
|
|
|
}
|
2022-01-27 14:53:39 -08:00
|
|
|
SequenceNumber dummy_seq;
|
2019-10-10 09:37:38 -07:00
|
|
|
GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
|
2020-12-14 13:47:17 -08:00
|
|
|
callback, &iter->is_blob_index, iter->value->GetSelf(),
|
2022-01-27 14:53:39 -08:00
|
|
|
iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq,
|
2020-12-14 13:47:17 -08:00
|
|
|
&found_final_value, &merge_in_progress);
|
2019-10-10 09:37:38 -07:00
|
|
|
|
|
|
|
if (!found_final_value && merge_in_progress) {
|
|
|
|
*(iter->s) = Status::MergeInProgress();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (found_final_value) {
|
|
|
|
iter->value->PinSelf();
|
2020-05-27 13:03:08 -07:00
|
|
|
range->AddValueSize(iter->value->size());
|
2019-10-10 09:37:38 -07:00
|
|
|
range->MarkKeyDone(iter);
|
|
|
|
RecordTick(moptions_.statistics, MEMTABLE_HIT);
|
2020-05-27 13:03:08 -07:00
|
|
|
if (range->GetValueSize() > read_options.value_size_soft_limit) {
|
|
|
|
// Set all remaining keys in range to Abort
|
|
|
|
for (auto range_iter = range->begin(); range_iter != range->end();
|
|
|
|
++range_iter) {
|
|
|
|
range->MarkKeyDone(range_iter);
|
|
|
|
*(range_iter->s) = Status::Aborted();
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2019-10-10 09:37:38 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
PERF_COUNTER_ADD(get_from_memtable_count, 1);
|
|
|
|
}
|
|
|
|
|
2020-11-23 16:27:46 -08:00
|
|
|
Status MemTable::Update(SequenceNumber seq, const Slice& key,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
const Slice& value,
|
2021-09-14 13:13:36 -07:00
|
|
|
const ProtectionInfoKVOS64* kv_prot_info) {
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
LookupKey lkey(key, seq);
|
2013-11-27 14:27:02 -08:00
|
|
|
Slice mem_key = lkey.memtable_key();
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
|
2014-01-15 18:17:58 -08:00
|
|
|
std::unique_ptr<MemTableRep::Iterator> iter(
|
2014-06-24 17:52:30 -07:00
|
|
|
table_->GetDynamicPrefixIterator());
|
2014-01-24 17:50:59 -08:00
|
|
|
iter->Seek(lkey.internal_key(), mem_key.data());
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
|
|
|
|
if (iter->Valid()) {
|
|
|
|
// entry format is:
|
2014-01-14 07:55:16 -08:00
|
|
|
// key_length varint32
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
// userkey char[klength-8]
|
|
|
|
// tag uint64
|
|
|
|
// vlength varint32
|
|
|
|
// value char[vlength]
|
|
|
|
// Check that it belongs to same user key. We do not check the
|
|
|
|
// sequence number since the Seek() call above should have skipped
|
|
|
|
// all entries with overly large sequence numbers.
|
|
|
|
const char* entry = iter->key();
|
2014-01-17 12:22:39 -08:00
|
|
|
uint32_t key_length = 0;
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
2015-09-08 15:30:49 -07:00
|
|
|
if (comparator_.comparator.user_comparator()->Equal(
|
|
|
|
Slice(key_ptr, key_length - 8), lkey.user_key())) {
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
// Correct user key
|
|
|
|
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
2015-05-29 14:36:35 -07:00
|
|
|
ValueType type;
|
2018-01-31 18:45:49 -08:00
|
|
|
SequenceNumber existing_seq;
|
|
|
|
UnPackSequenceAndType(tag, &existing_seq, &type);
|
|
|
|
assert(existing_seq != seq);
|
2016-12-17 00:05:05 -08:00
|
|
|
if (type == kTypeValue) {
|
|
|
|
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
|
|
|
|
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
|
|
|
|
uint32_t new_size = static_cast<uint32_t>(value.size());
|
|
|
|
|
|
|
|
// Update value, if new value size <= previous value size
|
2017-04-18 12:00:36 -07:00
|
|
|
if (new_size <= prev_size) {
|
|
|
|
char* p =
|
|
|
|
EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size);
|
2016-12-17 00:05:05 -08:00
|
|
|
WriteLock wl(GetLock(lkey.user_key()));
|
|
|
|
memcpy(p, value.data(), value.size());
|
|
|
|
assert((unsigned)((p + value.size()) - entry) ==
|
|
|
|
(unsigned)(VarintLength(key_length) + key_length +
|
|
|
|
VarintLength(value.size()) + value.size()));
|
2017-10-10 21:16:32 -07:00
|
|
|
RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
if (kv_prot_info != nullptr) {
|
2021-09-14 13:13:36 -07:00
|
|
|
ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
// `seq` is swallowed and `existing_seq` prevails.
|
|
|
|
updated_kv_prot_info.UpdateS(seq, existing_seq);
|
|
|
|
Slice encoded(entry, p + value.size() - entry);
|
|
|
|
return VerifyEncodedEntry(encoded, updated_kv_prot_info);
|
|
|
|
}
|
2020-11-23 16:27:46 -08:00
|
|
|
return Status::OK();
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-11-23 16:27:46 -08:00
|
|
|
// The latest value is not `kTypeValue` or key doesn't exist
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
return Add(seq, kTypeValue, key, value, kv_prot_info);
|
2014-01-14 07:55:16 -08:00
|
|
|
}
|
|
|
|
|
2020-11-23 16:27:46 -08:00
|
|
|
Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
const Slice& delta,
|
2021-09-14 13:13:36 -07:00
|
|
|
const ProtectionInfoKVOS64* kv_prot_info) {
|
2014-01-14 07:55:16 -08:00
|
|
|
LookupKey lkey(key, seq);
|
|
|
|
Slice memkey = lkey.memtable_key();
|
|
|
|
|
2014-04-22 21:14:25 -07:00
|
|
|
std::unique_ptr<MemTableRep::Iterator> iter(
|
2014-06-24 17:52:30 -07:00
|
|
|
table_->GetDynamicPrefixIterator());
|
2014-01-24 17:50:59 -08:00
|
|
|
iter->Seek(lkey.internal_key(), memkey.data());
|
2014-01-14 07:55:16 -08:00
|
|
|
|
|
|
|
if (iter->Valid()) {
|
|
|
|
// entry format is:
|
|
|
|
// key_length varint32
|
|
|
|
// userkey char[klength-8]
|
|
|
|
// tag uint64
|
|
|
|
// vlength varint32
|
|
|
|
// value char[vlength]
|
|
|
|
// Check that it belongs to same user key. We do not check the
|
|
|
|
// sequence number since the Seek() call above should have skipped
|
|
|
|
// all entries with overly large sequence numbers.
|
|
|
|
const char* entry = iter->key();
|
2014-01-17 12:22:39 -08:00
|
|
|
uint32_t key_length = 0;
|
2014-01-14 07:55:16 -08:00
|
|
|
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
2015-09-08 15:30:49 -07:00
|
|
|
if (comparator_.comparator.user_comparator()->Equal(
|
|
|
|
Slice(key_ptr, key_length - 8), lkey.user_key())) {
|
2014-01-14 07:55:16 -08:00
|
|
|
// Correct user key
|
|
|
|
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
2015-05-29 14:36:35 -07:00
|
|
|
ValueType type;
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
uint64_t existing_seq;
|
|
|
|
UnPackSequenceAndType(tag, &existing_seq, &type);
|
2015-05-29 14:36:35 -07:00
|
|
|
switch (type) {
|
2014-01-14 07:55:16 -08:00
|
|
|
case kTypeValue: {
|
|
|
|
Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
|
2014-11-11 16:47:22 -05:00
|
|
|
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
|
Allow callback to change size of existing value. Change return type of the callback function to an enum status to handle 3 cases.
Summary:
This diff fixes 2 hacks:
* The callback function can modify the existing value inplace, if the merged value fits within the existing buffer size. But currently the existing buffer size is not being modified. Now the callback recieves a int* allowing the size to be modified. Since size is encoded as a varint in the internal key for memtable. It might happen that the entire value might have be copied to the new location if the new size varint is smaller than the existing size varint.
* The callback function has 3 functionalities
1. Modify existing buffer inplace, and update size correspondingly. Now to indicate that, Returns 1.
2. Generate a new buffer indicating merged value. Returns 2.
3. Fails to do either of above, based on whatever application logic. Returns 0.
Test Plan: Just make all for now. I'm adding another unit test to test each scenario.
Reviewers: dhruba, haobo
Reviewed By: haobo
CC: leveldb, sdong, kailiu, xinyaohu, sumeet, danguo
Differential Revision: https://reviews.facebook.net/D15195
2014-01-16 15:11:19 -08:00
|
|
|
|
|
|
|
char* prev_buffer = const_cast<char*>(prev_value.data());
|
2014-11-11 16:47:22 -05:00
|
|
|
uint32_t new_prev_size = prev_size;
|
2014-01-14 07:55:16 -08:00
|
|
|
|
|
|
|
std::string str_value;
|
Allow callback to change size of existing value. Change return type of the callback function to an enum status to handle 3 cases.
Summary:
This diff fixes 2 hacks:
* The callback function can modify the existing value inplace, if the merged value fits within the existing buffer size. But currently the existing buffer size is not being modified. Now the callback recieves a int* allowing the size to be modified. Since size is encoded as a varint in the internal key for memtable. It might happen that the entire value might have be copied to the new location if the new size varint is smaller than the existing size varint.
* The callback function has 3 functionalities
1. Modify existing buffer inplace, and update size correspondingly. Now to indicate that, Returns 1.
2. Generate a new buffer indicating merged value. Returns 2.
3. Fails to do either of above, based on whatever application logic. Returns 0.
Test Plan: Just make all for now. I'm adding another unit test to test each scenario.
Reviewers: dhruba, haobo
Reviewed By: haobo
CC: leveldb, sdong, kailiu, xinyaohu, sumeet, danguo
Differential Revision: https://reviews.facebook.net/D15195
2014-01-16 15:11:19 -08:00
|
|
|
WriteLock wl(GetLock(lkey.user_key()));
|
2014-09-08 18:46:52 -07:00
|
|
|
auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
|
|
|
|
delta, &str_value);
|
Allow callback to change size of existing value. Change return type of the callback function to an enum status to handle 3 cases.
Summary:
This diff fixes 2 hacks:
* The callback function can modify the existing value inplace, if the merged value fits within the existing buffer size. But currently the existing buffer size is not being modified. Now the callback recieves a int* allowing the size to be modified. Since size is encoded as a varint in the internal key for memtable. It might happen that the entire value might have be copied to the new location if the new size varint is smaller than the existing size varint.
* The callback function has 3 functionalities
1. Modify existing buffer inplace, and update size correspondingly. Now to indicate that, Returns 1.
2. Generate a new buffer indicating merged value. Returns 2.
3. Fails to do either of above, based on whatever application logic. Returns 0.
Test Plan: Just make all for now. I'm adding another unit test to test each scenario.
Reviewers: dhruba, haobo
Reviewed By: haobo
CC: leveldb, sdong, kailiu, xinyaohu, sumeet, danguo
Differential Revision: https://reviews.facebook.net/D15195
2014-01-16 15:11:19 -08:00
|
|
|
if (status == UpdateStatus::UPDATED_INPLACE) {
|
2014-01-14 07:55:16 -08:00
|
|
|
// Value already updated by callback.
|
Allow callback to change size of existing value. Change return type of the callback function to an enum status to handle 3 cases.
Summary:
This diff fixes 2 hacks:
* The callback function can modify the existing value inplace, if the merged value fits within the existing buffer size. But currently the existing buffer size is not being modified. Now the callback recieves a int* allowing the size to be modified. Since size is encoded as a varint in the internal key for memtable. It might happen that the entire value might have be copied to the new location if the new size varint is smaller than the existing size varint.
* The callback function has 3 functionalities
1. Modify existing buffer inplace, and update size correspondingly. Now to indicate that, Returns 1.
2. Generate a new buffer indicating merged value. Returns 2.
3. Fails to do either of above, based on whatever application logic. Returns 0.
Test Plan: Just make all for now. I'm adding another unit test to test each scenario.
Reviewers: dhruba, haobo
Reviewed By: haobo
CC: leveldb, sdong, kailiu, xinyaohu, sumeet, danguo
Differential Revision: https://reviews.facebook.net/D15195
2014-01-16 15:11:19 -08:00
|
|
|
assert(new_prev_size <= prev_size);
|
|
|
|
if (new_prev_size < prev_size) {
|
|
|
|
// overwrite the new prev_size
|
|
|
|
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
|
|
|
|
new_prev_size);
|
|
|
|
if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
|
|
|
|
// shift the value buffer as well.
|
|
|
|
memcpy(p, prev_buffer, new_prev_size);
|
|
|
|
}
|
|
|
|
}
|
2014-10-27 12:10:13 -07:00
|
|
|
RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
UpdateFlushState();
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
if (kv_prot_info != nullptr) {
|
2021-09-14 13:13:36 -07:00
|
|
|
ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
// `seq` is swallowed and `existing_seq` prevails.
|
|
|
|
updated_kv_prot_info.UpdateS(seq, existing_seq);
|
|
|
|
updated_kv_prot_info.UpdateV(delta,
|
|
|
|
Slice(prev_buffer, new_prev_size));
|
|
|
|
Slice encoded(entry, prev_buffer + new_prev_size - entry);
|
|
|
|
return VerifyEncodedEntry(encoded, updated_kv_prot_info);
|
|
|
|
}
|
2020-11-23 16:27:46 -08:00
|
|
|
return Status::OK();
|
Allow callback to change size of existing value. Change return type of the callback function to an enum status to handle 3 cases.
Summary:
This diff fixes 2 hacks:
* The callback function can modify the existing value inplace, if the merged value fits within the existing buffer size. But currently the existing buffer size is not being modified. Now the callback recieves a int* allowing the size to be modified. Since size is encoded as a varint in the internal key for memtable. It might happen that the entire value might have be copied to the new location if the new size varint is smaller than the existing size varint.
* The callback function has 3 functionalities
1. Modify existing buffer inplace, and update size correspondingly. Now to indicate that, Returns 1.
2. Generate a new buffer indicating merged value. Returns 2.
3. Fails to do either of above, based on whatever application logic. Returns 0.
Test Plan: Just make all for now. I'm adding another unit test to test each scenario.
Reviewers: dhruba, haobo
Reviewed By: haobo
CC: leveldb, sdong, kailiu, xinyaohu, sumeet, danguo
Differential Revision: https://reviews.facebook.net/D15195
2014-01-16 15:11:19 -08:00
|
|
|
} else if (status == UpdateStatus::UPDATED) {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
Status s;
|
|
|
|
if (kv_prot_info != nullptr) {
|
2021-09-14 13:13:36 -07:00
|
|
|
ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 12:17:17 -08:00
|
|
|
updated_kv_prot_info.UpdateV(delta, str_value);
|
|
|
|
s = Add(seq, kTypeValue, key, Slice(str_value),
|
|
|
|
&updated_kv_prot_info);
|
|
|
|
} else {
|
|
|
|
s = Add(seq, kTypeValue, key, Slice(str_value),
|
|
|
|
nullptr /* kv_prot_info */);
|
|
|
|
}
|
2014-10-27 12:10:13 -07:00
|
|
|
RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
UpdateFlushState();
|
2020-11-23 16:27:46 -08:00
|
|
|
return s;
|
Allow callback to change size of existing value. Change return type of the callback function to an enum status to handle 3 cases.
Summary:
This diff fixes 2 hacks:
* The callback function can modify the existing value inplace, if the merged value fits within the existing buffer size. But currently the existing buffer size is not being modified. Now the callback recieves a int* allowing the size to be modified. Since size is encoded as a varint in the internal key for memtable. It might happen that the entire value might have be copied to the new location if the new size varint is smaller than the existing size varint.
* The callback function has 3 functionalities
1. Modify existing buffer inplace, and update size correspondingly. Now to indicate that, Returns 1.
2. Generate a new buffer indicating merged value. Returns 2.
3. Fails to do either of above, based on whatever application logic. Returns 0.
Test Plan: Just make all for now. I'm adding another unit test to test each scenario.
Reviewers: dhruba, haobo
Reviewed By: haobo
CC: leveldb, sdong, kailiu, xinyaohu, sumeet, danguo
Differential Revision: https://reviews.facebook.net/D15195
2014-01-16 15:11:19 -08:00
|
|
|
} else if (status == UpdateStatus::UPDATE_FAILED) {
|
2020-11-23 16:27:46 -08:00
|
|
|
// `UPDATE_FAILED` is named incorrectly. It indicates no update
|
|
|
|
// happened. It does not indicate a failure happened.
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 16:59:07 -07:00
|
|
|
UpdateFlushState();
|
2020-11-23 16:27:46 -08:00
|
|
|
return Status::OK();
|
2014-01-14 07:55:16 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-11-23 16:27:46 -08:00
|
|
|
// The latest value is not `kTypeValue` or key doesn't exist
|
|
|
|
return Status::NotFound();
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 14:12:47 -07:00
|
|
|
}
|
2014-01-10 17:33:56 -08:00
|
|
|
|
|
|
|
size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
|
|
|
|
Slice memkey = key.memtable_key();
|
|
|
|
|
|
|
|
// A total ordered iterator is costly for some memtablerep (prefix aware
|
|
|
|
// reps). By passing in the user key, we allow efficient iterator creation.
|
|
|
|
// The iterator only needs to be ordered within the same user key.
|
2014-01-15 18:17:58 -08:00
|
|
|
std::unique_ptr<MemTableRep::Iterator> iter(
|
2014-06-24 17:52:30 -07:00
|
|
|
table_->GetDynamicPrefixIterator());
|
2014-01-24 17:50:59 -08:00
|
|
|
iter->Seek(key.internal_key(), memkey.data());
|
2014-01-10 17:33:56 -08:00
|
|
|
|
|
|
|
size_t num_successive_merges = 0;
|
|
|
|
|
|
|
|
for (; iter->Valid(); iter->Next()) {
|
|
|
|
const char* entry = iter->key();
|
2014-01-17 12:22:39 -08:00
|
|
|
uint32_t key_length = 0;
|
2014-01-10 17:33:56 -08:00
|
|
|
const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
2015-09-08 15:30:49 -07:00
|
|
|
if (!comparator_.comparator.user_comparator()->Equal(
|
|
|
|
Slice(iter_key_ptr, key_length - 8), key.user_key())) {
|
2014-01-10 17:33:56 -08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
|
2015-05-29 14:36:35 -07:00
|
|
|
ValueType type;
|
|
|
|
uint64_t unused;
|
|
|
|
UnPackSequenceAndType(tag, &unused, &type);
|
|
|
|
if (type != kTypeMerge) {
|
2014-01-10 17:33:56 -08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
++num_successive_merges;
|
|
|
|
}
|
|
|
|
|
|
|
|
return num_successive_merges;
|
|
|
|
}
|
|
|
|
|
2014-02-11 09:46:30 -08:00
|
|
|
void MemTableRep::Get(const LookupKey& k, void* callback_args,
|
|
|
|
bool (*callback_func)(void* arg, const char* entry)) {
|
2014-06-24 17:52:30 -07:00
|
|
|
auto iter = GetDynamicPrefixIterator();
|
2014-02-11 09:46:30 -08:00
|
|
|
for (iter->Seek(k.internal_key(), k.memtable_key().data());
|
|
|
|
iter->Valid() && callback_func(callback_args, iter->key());
|
|
|
|
iter->Next()) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-18 11:11:51 -07:00
|
|
|
void MemTable::RefLogContainingPrepSection(uint64_t log) {
|
|
|
|
assert(log > 0);
|
|
|
|
auto cur = min_prep_log_referenced_.load();
|
|
|
|
while ((log < cur || cur == 0) &&
|
|
|
|
!min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
|
|
|
|
cur = min_prep_log_referenced_.load();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t MemTable::GetMinLogContainingPrepSection() {
|
|
|
|
return min_prep_log_referenced_.load();
|
|
|
|
}
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|