2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-16 01:03:42 +02:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 23:59:46 +02:00
|
|
|
//
|
2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2013-10-05 07:32:05 +02:00
|
|
|
#pragma once
|
2021-12-01 07:31:41 +01:00
|
|
|
#include <array>
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
#include <vector>
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 21:17:17 +01:00
|
|
|
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 22:54:09 +02:00
|
|
|
#include "db/flush_scheduler.h"
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 21:17:17 +01:00
|
|
|
#include "db/kv_checksum.h"
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 22:54:09 +02:00
|
|
|
#include "db/trim_history_scheduler.h"
|
2016-02-05 19:44:13 +01:00
|
|
|
#include "db/write_thread.h"
|
2013-08-23 17:38:13 +02:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/options.h"
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 22:54:09 +02:00
|
|
|
#include "rocksdb/types.h"
|
|
|
|
#include "rocksdb/write_batch.h"
|
2015-11-06 16:29:10 +01:00
|
|
|
#include "util/autovector.h"
|
2021-12-01 07:31:41 +01:00
|
|
|
#include "util/cast_util.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2011-07-15 02:20:57 +02:00
|
|
|
class MemTable;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
class FlushScheduler;
|
|
|
|
class ColumnFamilyData;
|
2011-07-15 02:20:57 +02:00
|
|
|
|
2014-01-28 20:05:04 +01:00
|
|
|
class ColumnFamilyMemTables {
|
|
|
|
public:
|
2014-02-06 01:02:48 +01:00
|
|
|
virtual ~ColumnFamilyMemTables() {}
|
|
|
|
virtual bool Seek(uint32_t column_family_id) = 0;
|
|
|
|
// returns true if the update to memtable should be ignored
|
|
|
|
// (useful when recovering from log whose updates have already
|
|
|
|
// been processed)
|
|
|
|
virtual uint64_t GetLogNumber() const = 0;
|
|
|
|
virtual MemTable* GetMemTable() const = 0;
|
2014-02-11 02:04:44 +01:00
|
|
|
virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
virtual ColumnFamilyData* current() { return nullptr; }
|
2014-02-06 01:02:48 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
|
|
|
|
public:
|
2014-11-18 19:20:10 +01:00
|
|
|
explicit ColumnFamilyMemTablesDefault(MemTable* mem)
|
|
|
|
: ok_(false), mem_(mem) {}
|
2014-02-06 01:02:48 +01:00
|
|
|
|
|
|
|
bool Seek(uint32_t column_family_id) override {
|
|
|
|
ok_ = (column_family_id == 0);
|
|
|
|
return ok_;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t GetLogNumber() const override { return 0; }
|
|
|
|
|
|
|
|
MemTable* GetMemTable() const override {
|
|
|
|
assert(ok_);
|
|
|
|
return mem_;
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
|
2014-02-06 01:02:48 +01:00
|
|
|
|
|
|
|
private:
|
|
|
|
bool ok_;
|
|
|
|
MemTable* mem_;
|
2014-01-28 20:05:04 +01:00
|
|
|
};
|
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 21:17:17 +01:00
|
|
|
// Per-batch key-value integrity protection state, held by a WriteBatch
// when it was constructed with protection enabled.
struct WriteBatch::ProtectionInfo {
  // `WriteBatch` usually doesn't contain a huge number of keys so protecting
  // with a fixed, non-configurable eight bytes per key may work well enough.
  //
  // One entry per record in the batch. Per the KVOC64 name, each entry
  // presumably covers key, value, optype, and column family ID with a
  // 64-bit value — confirm against db/kv_checksum.h.
  autovector<ProtectionInfoKVOC64> entries_;

  // Width, in bytes, of the protection value stored per key (fixed at 8).
  size_t GetBytesPerKey() const { return 8; }
};
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// WriteBatchInternal provides static methods for manipulating a
|
|
|
|
// WriteBatch that we don't want in the public WriteBatch interface.
|
|
|
|
class WriteBatchInternal {
|
|
|
|
public:
|
2016-03-30 19:35:22 +02:00
|
|
|
|
|
|
|
  // Size, in bytes, of the fixed prefix of every serialized WriteBatch.
  // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
  static const size_t kHeader = 12;
|
|
|
|
|
2014-04-22 20:27:33 +02:00
|
|
|
  // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*

  // Adds a Put(key, value) entry to `batch` for the given column family.
  static Status Put(WriteBatch* batch, uint32_t column_family_id,
                    const Slice& key, const Slice& value);

  // Same as above, with key and value supplied as scattered parts.
  static Status Put(WriteBatch* batch, uint32_t column_family_id,
                    const SliceParts& key, const SliceParts& value);

  // Adds a Delete entry for a key supplied as scattered parts.
  static Status Delete(WriteBatch* batch, uint32_t column_family_id,
                       const SliceParts& key);

  // Adds a Delete entry for the given key.
  static Status Delete(WriteBatch* batch, uint32_t column_family_id,
                       const Slice& key);

  // Adds a SingleDelete entry for a key supplied as scattered parts.
  static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
                             const SliceParts& key);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 20:42:56 +02:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
|
|
|
|
const Slice& key);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 20:42:56 +02:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const Slice& begin_key, const Slice& end_key);
|
2016-08-16 17:16:04 +02:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const SliceParts& begin_key,
|
|
|
|
const SliceParts& end_key);
|
2016-08-16 17:16:04 +02:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status Merge(WriteBatch* batch, uint32_t column_family_id,
|
|
|
|
const Slice& key, const Slice& value);
|
2014-04-22 20:27:33 +02:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status Merge(WriteBatch* batch, uint32_t column_family_id,
|
|
|
|
const SliceParts& key, const SliceParts& value);
|
2015-05-28 01:59:22 +02:00
|
|
|
|
2017-10-03 18:08:07 +02:00
|
|
|
static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
|
|
|
|
const Slice& key, const Slice& value);
|
|
|
|
|
2017-11-11 20:23:43 +01:00
|
|
|
static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
|
2018-06-29 03:46:39 +02:00
|
|
|
const bool write_after_commit = true,
|
|
|
|
const bool unprepared_batch = false);
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 08:35:51 +02:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status MarkRollback(WriteBatch* batch, const Slice& xid);
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 08:35:51 +02:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status MarkCommit(WriteBatch* batch, const Slice& xid);
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 08:35:51 +02:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status InsertNoop(WriteBatch* batch);
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 08:35:51 +02:00
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
// Return the number of entries in the batch.
|
2019-09-09 20:22:28 +02:00
|
|
|
static uint32_t Count(const WriteBatch* batch);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
// Set the count for the number of entries in the batch.
|
2019-09-09 20:22:28 +02:00
|
|
|
static void SetCount(WriteBatch* batch, uint32_t n);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2018-03-08 19:18:34 +01:00
|
|
|
// Return the sequence number for the start of this batch.
|
2011-03-18 23:37:00 +01:00
|
|
|
static SequenceNumber Sequence(const WriteBatch* batch);
|
|
|
|
|
2018-03-08 19:18:34 +01:00
|
|
|
// Store the specified number as the sequence number for the start of
|
2011-03-18 23:37:00 +01:00
|
|
|
// this batch.
|
|
|
|
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
|
|
|
|
|
2015-07-11 05:15:45 +02:00
|
|
|
// Returns the offset of the first entry in the batch.
|
|
|
|
// This offset is only valid if the batch is not empty.
|
|
|
|
static size_t GetFirstOffset(WriteBatch* batch);
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
static Slice Contents(const WriteBatch* batch) {
|
|
|
|
return Slice(batch->rep_);
|
|
|
|
}
|
|
|
|
|
|
|
|
static size_t ByteSize(const WriteBatch* batch) {
|
|
|
|
return batch->rep_.size();
|
|
|
|
}
|
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status SetContents(WriteBatch* batch, const Slice& contents);
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2018-02-09 23:50:09 +01:00
|
|
|
static Status CheckSlicePartsLength(const SliceParts& key,
|
|
|
|
const SliceParts& value);
|
|
|
|
|
2015-11-06 16:29:10 +01:00
|
|
|
// Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive.
|
|
|
|
//
|
|
|
|
// If ignore_missing_column_families == true. WriteBatch
|
|
|
|
// referencing non-existing column family will be ignored.
|
|
|
|
// If ignore_missing_column_families == false, processing of the
|
|
|
|
// batches will be stopped if a reference is found to a non-existing
|
|
|
|
// column family and InvalidArgument() will be returned. The writes
|
|
|
|
// in batches may be only partially applied at that point.
|
2014-09-02 22:29:05 +02:00
|
|
|
//
|
|
|
|
// If log_number is non-zero, the memtable will be updated only if
|
2015-11-06 16:29:10 +01:00
|
|
|
// memtables->GetLogNumber() >= log_number.
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-15 01:59:07 +02:00
|
|
|
//
|
|
|
|
// If flush_scheduler is non-null, it will be invoked if the memtable
|
|
|
|
// should be flushed.
|
|
|
|
//
|
|
|
|
// Under concurrent use, the caller is responsible for making sure that
|
|
|
|
// the memtables object itself is thread-local.
|
2018-06-29 03:46:39 +02:00
|
|
|
static Status InsertInto(
|
|
|
|
WriteThread::WriteGroup& write_group, SequenceNumber sequence,
|
|
|
|
ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 22:54:09 +02:00
|
|
|
TrimHistoryScheduler* trim_history_scheduler,
|
2018-06-29 03:46:39 +02:00
|
|
|
bool ignore_missing_column_families = false, uint64_t log_number = 0,
|
|
|
|
DB* db = nullptr, bool concurrent_memtable_writes = false,
|
|
|
|
bool seq_per_batch = false, bool batch_per_txn = true);
|
2015-11-06 16:29:10 +01:00
|
|
|
|
|
|
|
// Convenience form of InsertInto when you have only one batch
|
2018-03-08 19:18:34 +01:00
|
|
|
// next_seq returns the seq after last sequence number used in MemTable insert
|
2018-06-29 03:46:39 +02:00
|
|
|
static Status InsertInto(
|
|
|
|
const WriteBatch* batch, ColumnFamilyMemTables* memtables,
|
|
|
|
FlushScheduler* flush_scheduler,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 22:54:09 +02:00
|
|
|
TrimHistoryScheduler* trim_history_scheduler,
|
2018-06-29 03:46:39 +02:00
|
|
|
bool ignore_missing_column_families = false, uint64_t log_number = 0,
|
|
|
|
DB* db = nullptr, bool concurrent_memtable_writes = false,
|
|
|
|
SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
|
|
|
|
bool seq_per_batch = false, bool batch_per_txn = true);
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
2016-05-04 23:02:27 +02:00
|
|
|
|
2017-05-31 19:45:47 +02:00
|
|
|
static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
|
2016-04-18 20:11:51 +02:00
|
|
|
ColumnFamilyMemTables* memtables,
|
|
|
|
FlushScheduler* flush_scheduler,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 22:54:09 +02:00
|
|
|
TrimHistoryScheduler* trim_history_scheduler,
|
2016-04-18 20:11:51 +02:00
|
|
|
bool ignore_missing_column_families = false,
|
|
|
|
uint64_t log_number = 0, DB* db = nullptr,
|
2017-10-06 23:18:30 +02:00
|
|
|
bool concurrent_memtable_writes = false,
|
2018-06-29 03:46:39 +02:00
|
|
|
bool seq_per_batch = false, size_t batch_cnt = 0,
|
2019-09-13 01:53:31 +02:00
|
|
|
bool batch_per_txn = true,
|
|
|
|
bool hint_per_batch = false);
|
2014-01-28 20:05:04 +01:00
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
static Status Append(WriteBatch* dst, const WriteBatch* src,
|
|
|
|
const bool WAL_only = false);
|
2015-11-06 16:29:10 +01:00
|
|
|
|
|
|
|
// Returns the byte size of appending a WriteBatch with ByteSize
|
|
|
|
// leftByteSize and a WriteBatch with ByteSize rightByteSize
|
|
|
|
static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
|
2017-11-02 01:23:52 +01:00
|
|
|
|
2019-07-31 22:36:22 +02:00
|
|
|
// Iterate over [begin, end) range of a write batch
|
|
|
|
static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
|
|
|
|
size_t begin, size_t end);
|
|
|
|
|
2017-11-02 01:23:52 +01:00
|
|
|
// This write batch includes the latest state that should be persisted. Such
|
|
|
|
// state meant to be used only during recovery.
|
2021-07-30 21:06:47 +02:00
|
|
|
static void SetAsLatestPersistentState(WriteBatch* b);
|
2017-11-02 01:23:52 +01:00
|
|
|
static bool IsLatestPersistentState(const WriteBatch* b);
|
2011-03-18 23:37:00 +01:00
|
|
|
};
|
|
|
|
|
2017-04-11 00:38:34 +02:00
|
|
|
// LocalSavePoint is similar to a scope guard
|
|
|
|
class LocalSavePoint {
|
|
|
|
public:
|
|
|
|
explicit LocalSavePoint(WriteBatch* batch)
|
|
|
|
: batch_(batch),
|
|
|
|
savepoint_(batch->GetDataSize(), batch->Count(),
|
|
|
|
batch->content_flags_.load(std::memory_order_relaxed))
|
|
|
|
#ifndef NDEBUG
|
|
|
|
,
|
|
|
|
committed_(false)
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
~LocalSavePoint() { assert(committed_); }
|
|
|
|
#endif
|
|
|
|
Status commit() {
|
|
|
|
#ifndef NDEBUG
|
|
|
|
committed_ = true;
|
|
|
|
#endif
|
|
|
|
if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
|
|
|
|
batch_->rep_.resize(savepoint_.size);
|
|
|
|
WriteBatchInternal::SetCount(batch_, savepoint_.count);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 21:17:17 +01:00
|
|
|
if (batch_->prot_info_ != nullptr) {
|
|
|
|
batch_->prot_info_->entries_.resize(savepoint_.count);
|
|
|
|
}
|
2017-04-11 00:38:34 +02:00
|
|
|
batch_->content_flags_.store(savepoint_.content_flags,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
return Status::MemoryLimit();
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
WriteBatch* batch_;
|
|
|
|
SavePoint savepoint_;
|
|
|
|
#ifndef NDEBUG
|
|
|
|
bool committed_;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
2021-12-01 07:31:41 +01:00
|
|
|
template <typename Derived, typename Checker>
|
|
|
|
class TimestampAssignerBase : public WriteBatch::Handler {
|
|
|
|
public:
|
|
|
|
explicit TimestampAssignerBase(WriteBatch::ProtectionInfo* prot_info,
|
|
|
|
Checker&& checker)
|
|
|
|
: prot_info_(prot_info), checker_(std::move(checker)) {}
|
|
|
|
|
|
|
|
~TimestampAssignerBase() override {}
|
|
|
|
|
|
|
|
Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
|
|
|
|
return AssignTimestamp(cf, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DeleteCF(uint32_t cf, const Slice& key) override {
|
|
|
|
return AssignTimestamp(cf, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
|
|
|
|
return AssignTimestamp(cf, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
|
|
|
|
const Slice&) override {
|
|
|
|
return AssignTimestamp(cf, begin_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
|
|
|
|
return AssignTimestamp(cf, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override {
|
|
|
|
return AssignTimestamp(cf, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkBeginPrepare(bool) override { return Status::OK(); }
|
|
|
|
|
|
|
|
Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
|
|
|
|
|
|
|
|
Status MarkCommit(const Slice&) override { return Status::OK(); }
|
|
|
|
|
|
|
|
Status MarkRollback(const Slice&) override { return Status::OK(); }
|
|
|
|
|
|
|
|
protected:
|
|
|
|
Status AssignTimestamp(uint32_t cf, const Slice& key) {
|
|
|
|
Status s = static_cast_with_check<Derived>(this)->AssignTimestampImpl(
|
|
|
|
cf, key, idx_);
|
|
|
|
++idx_;
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status CheckTimestampSize(uint32_t cf, size_t& ts_sz) {
|
|
|
|
return checker_(cf, ts_sz);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status UpdateTimestampIfNeeded(size_t ts_sz, const Slice& key,
|
|
|
|
const Slice& ts) {
|
|
|
|
if (ts_sz > 0) {
|
|
|
|
assert(ts_sz == ts.size());
|
|
|
|
UpdateProtectionInformationIfNeeded(key, ts);
|
|
|
|
UpdateTimestamp(key, ts);
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
void UpdateProtectionInformationIfNeeded(const Slice& key, const Slice& ts) {
|
|
|
|
if (prot_info_ != nullptr) {
|
|
|
|
const size_t ts_sz = ts.size();
|
|
|
|
SliceParts old_key(&key, 1);
|
|
|
|
Slice key_no_ts(key.data(), key.size() - ts_sz);
|
|
|
|
std::array<Slice, 2> new_key_cmpts{{key_no_ts, ts}};
|
|
|
|
SliceParts new_key(new_key_cmpts.data(), 2);
|
|
|
|
prot_info_->entries_[idx_].UpdateK(old_key, new_key);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void UpdateTimestamp(const Slice& key, const Slice& ts) {
|
|
|
|
const size_t ts_sz = ts.size();
|
|
|
|
char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
|
|
|
|
assert(ptr);
|
|
|
|
memcpy(ptr, ts.data(), ts_sz);
|
|
|
|
}
|
|
|
|
|
|
|
|
  // No copy or move: an assigner tracks a mutable per-entry index (idx_)
  // and a pointer into the owning batch's protection info, so duplicating
  // one mid-iteration would be meaningless.
  TimestampAssignerBase(const TimestampAssignerBase&) = delete;
  TimestampAssignerBase(TimestampAssignerBase&&) = delete;
  TimestampAssignerBase& operator=(const TimestampAssignerBase&) = delete;
  TimestampAssignerBase& operator=(TimestampAssignerBase&&) = delete;
|
|
|
|
|
|
|
|
  // Per-entry integrity protection info of the owning WriteBatch; may be
  // nullptr, in which case protection info is never updated.
  WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
  // User-supplied callable invoked as checker_(cf, ts_sz); validates (and
  // may adjust) the timestamp size for a column family.
  const Checker checker_{};
  // Index of the next batch entry this handler will visit.
  size_t idx_ = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <typename Checker>
|
|
|
|
class SimpleListTimestampAssigner
|
|
|
|
: public TimestampAssignerBase<SimpleListTimestampAssigner<Checker>,
|
|
|
|
Checker> {
|
|
|
|
public:
|
|
|
|
explicit SimpleListTimestampAssigner(WriteBatch::ProtectionInfo* prot_info,
|
|
|
|
Checker checker,
|
|
|
|
const std::vector<Slice>& timestamps)
|
|
|
|
: TimestampAssignerBase<SimpleListTimestampAssigner<Checker>, Checker>(
|
|
|
|
prot_info, std::move(checker)),
|
|
|
|
timestamps_(timestamps) {}
|
|
|
|
|
|
|
|
~SimpleListTimestampAssigner() override {}
|
|
|
|
|
|
|
|
private:
|
|
|
|
friend class TimestampAssignerBase<SimpleListTimestampAssigner<Checker>,
|
|
|
|
Checker>;
|
|
|
|
|
|
|
|
Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t idx) {
|
|
|
|
if (idx >= timestamps_.size()) {
|
|
|
|
return Status::InvalidArgument("Need more timestamps for the assignment");
|
|
|
|
}
|
|
|
|
const Slice& ts = timestamps_[idx];
|
|
|
|
size_t ts_sz = ts.size();
|
|
|
|
const Status s = this->CheckTimestampSize(cf, ts_sz);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return this->UpdateTimestampIfNeeded(ts_sz, key, ts);
|
|
|
|
}
|
|
|
|
|
|
|
|
const std::vector<Slice>& timestamps_;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <typename Checker>
|
|
|
|
class TimestampAssigner
|
|
|
|
: public TimestampAssignerBase<TimestampAssigner<Checker>, Checker> {
|
|
|
|
public:
|
|
|
|
explicit TimestampAssigner(WriteBatch::ProtectionInfo* prot_info,
|
|
|
|
Checker checker, const Slice& ts)
|
|
|
|
: TimestampAssignerBase<TimestampAssigner<Checker>, Checker>(
|
|
|
|
prot_info, std::move(checker)),
|
|
|
|
timestamp_(ts) {
|
|
|
|
assert(!timestamp_.empty());
|
|
|
|
}
|
|
|
|
~TimestampAssigner() override {}
|
|
|
|
|
|
|
|
private:
|
|
|
|
friend class TimestampAssignerBase<TimestampAssigner<Checker>, Checker>;
|
|
|
|
|
|
|
|
Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t /*idx*/) {
|
|
|
|
if (timestamp_.empty()) {
|
|
|
|
return Status::InvalidArgument("Timestamp is empty");
|
|
|
|
}
|
|
|
|
size_t ts_sz = timestamp_.size();
|
|
|
|
const Status s = this->CheckTimestampSize(cf, ts_sz);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return this->UpdateTimestampIfNeeded(ts_sz, key, timestamp_);
|
|
|
|
}
|
|
|
|
|
|
|
|
const Slice timestamp_;
|
|
|
|
};
|
|
|
|
|
|
|
|
template <typename Checker>
|
|
|
|
Status WriteBatch::AssignTimestamp(const Slice& ts, Checker checker) {
|
|
|
|
TimestampAssigner<Checker> ts_assigner(prot_info_.get(), checker, ts);
|
|
|
|
return Iterate(&ts_assigner);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename Checker>
|
|
|
|
Status WriteBatch::AssignTimestamps(const std::vector<Slice>& ts_list,
|
|
|
|
Checker checker) {
|
|
|
|
SimpleListTimestampAssigner<Checker> ts_assigner(prot_info_.get(), checker,
|
|
|
|
ts_list);
|
|
|
|
return Iterate(&ts_assigner);
|
|
|
|
}
|
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|