2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-16 01:03:42 +02:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2014-10-28 19:54:33 +01:00
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/flush_job.h"
|
|
|
|
|
2019-06-06 22:52:39 +02:00
|
|
|
#include <cinttypes>
|
2015-09-02 22:58:22 +02:00
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
#include <algorithm>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/builder.h"
|
|
|
|
#include "db/db_iter.h"
|
|
|
|
#include "db/dbformat.h"
|
2015-05-28 22:37:47 +02:00
|
|
|
#include "db/event_helpers.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "db/log_reader.h"
|
|
|
|
#include "db/log_writer.h"
|
2018-12-17 22:12:22 +01:00
|
|
|
#include "db/memtable.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "db/memtable_list.h"
|
|
|
|
#include "db/merge_context.h"
|
2018-12-17 22:12:22 +01:00
|
|
|
#include "db/range_tombstone_fragmenter.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "db/version_set.h"
|
2019-05-30 05:44:08 +02:00
|
|
|
#include "file/file_util.h"
|
|
|
|
#include "file/filename.h"
|
2019-06-01 02:19:43 +02:00
|
|
|
#include "logging/event_logger.h"
|
|
|
|
#include "logging/log_buffer.h"
|
|
|
|
#include "logging/logging.h"
|
2017-04-06 04:02:00 +02:00
|
|
|
#include "monitoring/iostats_context_imp.h"
|
|
|
|
#include "monitoring/perf_context_imp.h"
|
|
|
|
#include "monitoring/thread_status_util.h"
|
2015-09-02 22:58:22 +02:00
|
|
|
#include "port/port.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/table.h"
|
2017-02-03 01:38:40 +01:00
|
|
|
#include "table/merging_iterator.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "table/table_builder.h"
|
|
|
|
#include "table/two_level_iterator.h"
|
2019-05-31 02:39:43 +02:00
|
|
|
#include "test_util/sync_point.h"
|
2014-10-28 19:54:33 +01:00
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/mutexlock.h"
|
|
|
|
#include "util/stop_watch.h"
|
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2014-10-28 19:54:33 +01:00
|
|
|
|
2018-02-09 21:09:55 +01:00
|
|
|
// Translates a FlushReason into a static, human-readable label.
// Any value without a dedicated case below is reported as "Invalid".
const char* GetFlushReasonString (FlushReason flush_reason) {
  const char* reason_label = "Invalid";
  switch (flush_reason) {
    case FlushReason::kOthers:
      reason_label = "Other Reasons";
      break;
    case FlushReason::kGetLiveFiles:
      reason_label = "Get Live Files";
      break;
    case FlushReason::kShutDown:
      reason_label = "Shut down";
      break;
    case FlushReason::kExternalFileIngestion:
      reason_label = "External File Ingestion";
      break;
    case FlushReason::kManualCompaction:
      reason_label = "Manual Compaction";
      break;
    case FlushReason::kWriteBufferManager:
      reason_label = "Write Buffer Manager";
      break;
    case FlushReason::kWriteBufferFull:
      reason_label = "Write Buffer Full";
      break;
    case FlushReason::kTest:
      reason_label = "Test";
      break;
    case FlushReason::kDeleteFiles:
      reason_label = "Delete Files";
      break;
    case FlushReason::kAutoCompaction:
      reason_label = "Auto Compaction";
      break;
    case FlushReason::kManualFlush:
      reason_label = "Manual Flush";
      break;
    case FlushReason::kErrorRecovery:
      reason_label = "Error Recovery";
      break;
    case FlushReason::kWalFull:
      reason_label = "WAL Full";
      break;
    default:
      // Keep the "Invalid" label set above.
      break;
  }
  return reason_label;
}
|
|
|
|
|
2021-03-18 04:43:22 +01:00
|
|
|
// Builds a flush job for column family `cfd`.
//
// Each argument initializes the member of the same name (with a trailing
// underscore). `existing_snapshots` and `full_history_ts_low` are taken by
// value and moved into their members. `edit_` and `base_` start as nullptr and
// `pick_memtable_called` as false; PickMemTable() sets them later.
//
// The constructor body reports the start of the flush through
// ReportStartedFlush() and then hits the "FlushJob::FlushJob()" sync point.
FlushJob::FlushJob(
    const std::string& dbname, ColumnFamilyData* cfd,
    const ImmutableDBOptions& db_options,
    const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
    const FileOptions& file_options, VersionSet* versions,
    InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
    std::vector<SequenceNumber> existing_snapshots,
    SequenceNumber earliest_write_conflict_snapshot,
    SnapshotChecker* snapshot_checker, JobContext* job_context,
    LogBuffer* log_buffer, FSDirectory* db_directory,
    FSDirectory* output_file_directory, CompressionType output_compression,
    Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
    const bool sync_output_directory, const bool write_manifest,
    Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
    const std::string& db_id, const std::string& db_session_id,
    std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback)
    : dbname_(dbname),
      db_id_(db_id),
      db_session_id_(db_session_id),
      cfd_(cfd),
      db_options_(db_options),
      mutable_cf_options_(mutable_cf_options),
      max_memtable_id_(max_memtable_id),
      file_options_(file_options),
      versions_(versions),
      db_mutex_(db_mutex),
      shutting_down_(shutting_down),
      // Taken by value above, so the vector can be moved instead of copied.
      existing_snapshots_(std::move(existing_snapshots)),
      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
      snapshot_checker_(snapshot_checker),
      job_context_(job_context),
      log_buffer_(log_buffer),
      db_directory_(db_directory),
      output_file_directory_(output_file_directory),
      output_compression_(output_compression),
      stats_(stats),
      event_logger_(event_logger),
      measure_io_stats_(measure_io_stats),
      sync_output_directory_(sync_output_directory),
      write_manifest_(write_manifest),
      // Filled in by PickMemTable().
      edit_(nullptr),
      base_(nullptr),
      pick_memtable_called(false),
      thread_pri_(thread_pri),
      io_tracer_(io_tracer),
      // Reads the db_options_ member rather than the constructor argument.
      // NOTE(review): this relies on db_options_ being declared before clock_
      // in the class -- confirm against the header.
      clock_(db_options_.clock),
      full_history_ts_low_(std::move(full_history_ts_low)),
      blob_callback_(blob_callback) {
  // Update the thread status to indicate flush.
  ReportStartedFlush();
  TEST_SYNC_POINT("FlushJob::FlushJob()");
}
|
|
|
|
|
|
|
|
// Destructor: allows io_status_ to be destroyed without having been checked
// and resets the calling thread's status.
FlushJob::~FlushJob() {
  // io_status_ may never have been inspected by the caller; mark that as
  // acceptable before it is destroyed.
  io_status_.PermitUncheckedError();
  // Presumably undoes the thread-status updates made in ReportStartedFlush()
  // -- confirm against ThreadStatusUtil.
  ThreadStatusUtil::ResetThreadStatus();
}
|
2014-10-28 19:54:33 +01:00
|
|
|
|
2015-05-16 08:22:22 +02:00
|
|
|
void FlushJob::ReportStartedFlush() {
|
2016-01-26 01:26:53 +01:00
|
|
|
ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env,
|
2016-09-17 00:09:14 +02:00
|
|
|
db_options_.enable_thread_tracking);
|
2015-05-16 08:22:22 +02:00
|
|
|
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
|
|
|
|
ThreadStatusUtil::SetThreadOperationProperty(
|
|
|
|
ThreadStatus::COMPACTION_JOB_ID,
|
|
|
|
job_context_->job_id);
|
|
|
|
IOSTATS_RESET(bytes_written);
|
|
|
|
}
|
|
|
|
|
|
|
|
void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
|
|
|
|
uint64_t input_size = 0;
|
|
|
|
for (auto* mem : mems) {
|
|
|
|
input_size += mem->ApproximateMemoryUsage();
|
|
|
|
}
|
|
|
|
ThreadStatusUtil::IncreaseThreadOperationProperty(
|
|
|
|
ThreadStatus::FLUSH_BYTES_MEMTABLES,
|
|
|
|
input_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
void FlushJob::RecordFlushIOStats() {
|
2016-04-25 21:01:01 +02:00
|
|
|
RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
|
|
|
|
ThreadStatusUtil::IncreaseThreadOperationProperty(
|
2015-05-16 08:22:22 +02:00
|
|
|
ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
|
2016-04-25 21:01:01 +02:00
|
|
|
IOSTATS_RESET(bytes_written);
|
2015-05-16 08:22:22 +02:00
|
|
|
}
|
2016-07-20 00:12:46 +02:00
|
|
|
void FlushJob::PickMemTable() {
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
assert(!pick_memtable_called);
|
|
|
|
pick_memtable_called = true;
|
|
|
|
// Save the contents of the earliest memtable as a new Table
|
2018-10-16 04:59:20 +02:00
|
|
|
cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_);
|
2016-07-20 00:12:46 +02:00
|
|
|
if (mems_.empty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ReportFlushInputSize(mems_);
|
|
|
|
|
|
|
|
// entries mems are (implicitly) sorted in ascending order by their created
|
|
|
|
// time. We will use the first memtable's `edit` to keep the meta info for
|
|
|
|
// this flush.
|
|
|
|
MemTable* m = mems_[0];
|
|
|
|
edit_ = m->GetEdits();
|
|
|
|
edit_->SetPrevLogNumber(0);
|
|
|
|
// SetLogNumber(log_num) indicates logs with number smaller than log_num
|
|
|
|
// will no longer be picked up for recovery.
|
|
|
|
edit_->SetLogNumber(mems_.back()->GetNextLogNumber());
|
|
|
|
edit_->SetColumnFamily(cfd_->GetID());
|
|
|
|
|
|
|
|
// path 0 for level 0 file.
|
|
|
|
meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
|
|
|
|
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for user that need to reduce the number of L0 SST file created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` perform particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reasons why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
// If mempurge feature is activated, keep track of any potential
|
|
|
|
// memtables coming from a previous mempurge operation.
|
|
|
|
// Used for mempurge policy.
|
2021-08-11 03:07:48 +02:00
|
|
|
if (db_options_.experimental_mempurge_threshold > 0.0) {
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for user that need to reduce the number of L0 SST file created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` perform particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reasons why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
contains_mempurge_outcome_ = false;
|
|
|
|
for (MemTable* mt : mems_) {
|
|
|
|
if (cfd_->imm()->IsMemPurgeOutput(mt->GetID())) {
|
|
|
|
contains_mempurge_outcome_ = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-20 00:12:46 +02:00
|
|
|
base_ = cfd_->current();
|
|
|
|
base_->Ref(); // it is likely that we do not need this reference
|
|
|
|
}
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch records the min log number to keep to the manifest while flushing SST files, in order to ignore them and any WAL older than them during recovery. This is to avoid scenarios where we have a gap between the WAL files that are fed to the recovery procedure. The gap could happen through, for example, out-of-order WAL deletion. Such a gap could cause problems in 2PC recovery, where the prepared and commit entries are placed into two separate WALs and a gap in the WALs could result in not processing the WAL with the commit entry, hence breaking the 2PC recovery logic.
Before the commit, for the 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), and record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until the next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we lose with this new approach is earlier log deletion between two flushes, which is not guaranteed to happen anyway because the obsolete file clean-up function is only executed after flush or compaction.)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
Status FlushJob::Run(LogsWithPrepTracker* prep_tracker,
|
|
|
|
FileMetaData* file_meta) {
|
2018-06-28 21:23:57 +02:00
|
|
|
TEST_SYNC_POINT("FlushJob::Start");
|
2016-07-20 00:12:46 +02:00
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
assert(pick_memtable_called);
|
2015-03-13 18:45:40 +01:00
|
|
|
AutoThreadOperationStageUpdater stage_run(
|
|
|
|
ThreadStatus::STAGE_FLUSH_RUN);
|
2016-07-20 00:12:46 +02:00
|
|
|
if (mems_.empty()) {
|
2017-03-16 03:22:52 +01:00
|
|
|
ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
|
|
|
|
cfd_->GetName().c_str());
|
2016-07-20 00:12:46 +02:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2016-04-14 22:56:29 +02:00
|
|
|
// I/O measurement variables
|
|
|
|
PerfLevel prev_perf_level = PerfLevel::kEnableTime;
|
|
|
|
uint64_t prev_write_nanos = 0;
|
|
|
|
uint64_t prev_fsync_nanos = 0;
|
|
|
|
uint64_t prev_range_sync_nanos = 0;
|
|
|
|
uint64_t prev_prepare_write_nanos = 0;
|
2019-01-30 01:23:21 +01:00
|
|
|
uint64_t prev_cpu_write_nanos = 0;
|
|
|
|
uint64_t prev_cpu_read_nanos = 0;
|
2016-04-14 22:56:29 +02:00
|
|
|
if (measure_io_stats_) {
|
|
|
|
prev_perf_level = GetPerfLevel();
|
|
|
|
SetPerfLevel(PerfLevel::kEnableTime);
|
|
|
|
prev_write_nanos = IOSTATS(write_nanos);
|
|
|
|
prev_fsync_nanos = IOSTATS(fsync_nanos);
|
|
|
|
prev_range_sync_nanos = IOSTATS(range_sync_nanos);
|
|
|
|
prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
|
2019-01-30 01:23:21 +01:00
|
|
|
prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
|
|
|
|
prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
|
2016-04-14 22:56:29 +02:00
|
|
|
}
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling on the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
Status mempurge_s = Status::NotFound("No MemPurge.");
|
2021-08-11 03:07:48 +02:00
|
|
|
if ((db_options_.experimental_mempurge_threshold > 0.0) &&
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process no longer holds on to the db_mutex, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
(cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) &&
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
(!mems_.empty()) && MemPurgeDecider()) {
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process no longer holds on to the db_mutex, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
mempurge_s = MemPurge();
|
|
|
|
if (!mempurge_s.ok()) {
|
2021-07-12 19:41:28 +02:00
|
|
|
// Mempurge is typically aborted when the new_mem output memtable
|
|
|
|
// is filled at more than XX % capacity (currently: 60%).
|
|
|
|
if (mempurge_s.IsAborted()) {
|
|
|
|
ROCKS_LOG_INFO(db_options_.info_log, "Mempurge process aborted: %s\n",
|
|
|
|
mempurge_s.ToString().c_str());
|
|
|
|
} else {
|
|
|
|
// However the mempurge process can also fail for
|
|
|
|
// other reasons (eg: new_mem->Add() fails).
|
|
|
|
ROCKS_LOG_WARN(db_options_.info_log, "Mempurge process failed: %s\n",
|
|
|
|
mempurge_s.ToString().c_str());
|
|
|
|
}
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process no longer holds on to the db_mutex, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
Status s;
|
|
|
|
if (mempurge_s.ok()) {
|
|
|
|
base_->Unref();
|
|
|
|
s = Status::OK();
|
|
|
|
} else {
|
|
|
|
// This will release and re-acquire the mutex.
|
|
|
|
s = WriteLevel0Table();
|
|
|
|
}
|
2014-10-28 19:54:33 +01:00
|
|
|
|
2019-05-20 19:37:37 +02:00
|
|
|
if (s.ok() && cfd_->IsDropped()) {
|
|
|
|
s = Status::ColumnFamilyDropped("Column family dropped during compaction");
|
|
|
|
}
|
|
|
|
if ((s.ok() || s.IsColumnFamilyDropped()) &&
|
|
|
|
shutting_down_->load(std::memory_order_acquire)) {
|
|
|
|
s = Status::ShutdownInProgress("Database shutdown");
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
2016-07-20 00:12:46 +02:00
|
|
|
cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
|
2018-10-16 04:59:20 +02:00
|
|
|
} else if (write_manifest_) {
|
LogAndApply() should fail if the column family has been dropped
Summary:
This patch finally fixes the ColumnFamilyTest.ReadDroppedColumnFamily test. The test has been failing very sporadically and it was hard to repro. However, I managed to write a new test that reproduces the failure deterministically.
Here's what happens:
1. We start the flush for the column family
2. We check if the column family was dropped here: https://github.com/facebook/rocksdb/blob/a3fc49bfddcdb1ff29409aacd06c04df56c7a1d7/db/flush_job.cc#L149
3. This check goes through, ends up in InstallMemtableFlushResults() and it goes into LogAndApply()
4. At about this time, we start dropping the column family. Dropping the column family process gets to LogAndApply() at about the same time as LogAndApply() from flush process
5. Drop column family goes through LogAndApply() first, marking the column family as dropped.
6. Flush process gets woken up and gets a chance to write to the MANIFEST. However, this is where it gets stuck: https://github.com/facebook/rocksdb/blob/a3fc49bfddcdb1ff29409aacd06c04df56c7a1d7/db/version_set.cc#L1975
7. We see that the column family was dropped, so there is no need to write to the MANIFEST. We return OK.
8. Flush gets OK back from LogAndApply() and it deletes the memtable, thinking that the data is now safely persisted to sst file.
The fix is pretty simple. Instead of OK, we return ShutdownInProgress. This is not really true, but we have been using this status code to also mean "this operation was canceled because the column family has been dropped".
The fix is only one LOC. All other code is related to tests. I added a new test that reproduces the failure. I also moved SleepingBackgroundTask to util/testutil.h (because I needed it in column_family_test for my new test). There are plenty of other places where we reimplement SleepingBackgroundTask, but I'll address that in a separate commit.
Test Plan:
1. new test
2. make check
3. Make sure the ColumnFamilyTest.ReadDroppedColumnFamily doesn't fail on Travis: https://travis-ci.org/facebook/rocksdb/jobs/79952386
Reviewers: yhchiang, anthony, IslamAbdelRahman, kradhakrishnan, rven, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D46773
2015-09-15 20:28:44 +02:00
|
|
|
TEST_SYNC_POINT("FlushJob::InstallResults");
|
2014-10-28 19:54:33 +01:00
|
|
|
// Replace immutable memtable with the generated Table
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus tmp_io_s;
|
2018-10-06 00:37:45 +02:00
|
|
|
s = cfd_->imm()->TryInstallMemtableFlushResults(
|
Skip deleted WALs during recovery
Summary:
This patch records the min log number to keep in the manifest while flushing SST files, in order to ignore them and any WAL older than them during recovery. This is to avoid scenarios where there is a gap between the WAL files fed to the recovery procedure. The gap could happen through, for example, out-of-order WAL deletion. Such a gap could cause problems in 2PC recovery, where the prepared and commit entries are placed in two separate WALs and a gap in the WALs could result in not processing the WAL with the commit entry, hence breaking the 2PC recovery logic.
Before the commit, for the 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), and record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we lose with this new approach is earlier log deletion between two flushes, which is not guaranteed to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
|
2016-07-20 00:12:46 +02:00
|
|
|
meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
|
2021-07-16 02:48:17 +02:00
|
|
|
log_buffer_, &committed_flush_jobs_info_, &tmp_io_s,
|
|
|
|
!(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted),
|
|
|
|
but 'false' if mempurge successful: no new min log number
|
|
|
|
or new level 0 file path to write to manifest. */);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
if (!tmp_io_s.ok()) {
|
|
|
|
io_status_ = tmp_io_s;
|
|
|
|
}
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
|
|
|
|
2015-06-12 00:22:22 +02:00
|
|
|
if (s.ok() && file_meta != nullptr) {
|
2016-07-20 00:12:46 +02:00
|
|
|
*file_meta = meta_;
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
}
|
2015-05-16 08:22:22 +02:00
|
|
|
RecordFlushIOStats();
|
2015-02-17 19:13:52 +01:00
|
|
|
|
2020-01-06 19:15:00 +01:00
|
|
|
// When measure_io_stats_ is true, the default 512 bytes is not enough.
|
|
|
|
auto stream = event_logger_->LogToBuffer(log_buffer_, 1024);
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
stream << "job" << job_context_->job_id << "event"
|
|
|
|
<< "flush_finished";
|
2017-12-14 19:18:01 +01:00
|
|
|
stream << "output_compression"
|
|
|
|
<< CompressionTypeToString(output_compression_);
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
stream << "lsm_state";
|
|
|
|
stream.StartArray();
|
|
|
|
auto vstorage = cfd_->current()->storage_info();
|
|
|
|
for (int level = 0; level < vstorage->num_levels(); ++level) {
|
|
|
|
stream << vstorage->NumLevelFiles(level);
|
|
|
|
}
|
|
|
|
stream.EndArray();
|
2020-09-15 06:10:09 +02:00
|
|
|
|
|
|
|
const auto& blob_files = vstorage->GetBlobFiles();
|
|
|
|
if (!blob_files.empty()) {
|
|
|
|
stream << "blob_file_head" << blob_files.begin()->first;
|
|
|
|
stream << "blob_file_tail" << blob_files.rbegin()->first;
|
|
|
|
}
|
|
|
|
|
2016-06-01 20:11:33 +02:00
|
|
|
stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
|
2016-04-14 22:56:29 +02:00
|
|
|
if (measure_io_stats_) {
|
|
|
|
if (prev_perf_level != PerfLevel::kEnableTime) {
|
|
|
|
SetPerfLevel(prev_perf_level);
|
|
|
|
}
|
|
|
|
stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
|
|
|
|
stream << "file_range_sync_nanos"
|
|
|
|
<< (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
|
|
|
|
stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
|
|
|
|
stream << "file_prepare_write_nanos"
|
|
|
|
<< (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
|
2019-01-30 01:23:21 +01:00
|
|
|
stream << "file_cpu_write_nanos"
|
|
|
|
<< (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos);
|
|
|
|
stream << "file_cpu_read_nanos"
|
|
|
|
<< (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
|
2016-04-14 22:56:29 +02:00
|
|
|
}
|
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2017-01-20 08:03:45 +01:00
|
|
|
// Abandons this flush job: no flush work is performed here, the only action
// is a call to Unref() on base_.
//
// Preconditions (both asserted): the caller holds db_mutex_, and base_ is
// non-null.
//
// NOTE(review): presumably the Unref() balances a reference on base_ that was
// taken when the job was set up (not visible in this part of the file) --
// confirm against the code that assigns base_.
void FlushJob::Cancel() {
  db_mutex_->AssertHeld();
  assert(base_ != nullptr);

  auto& base = *base_;
  base.Unref();
}
|
|
|
|
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process no longer holds on to the db_mutex, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
Status FlushJob::MemPurge() {
|
|
|
|
Status s;
|
|
|
|
db_mutex_->AssertHeld();
|
|
|
|
db_mutex_->Unlock();
|
|
|
|
assert(!mems_.empty());
|
|
|
|
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
// Measure purging time.
|
|
|
|
const uint64_t start_micros = clock_->NowMicros();
|
|
|
|
const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000;
|
|
|
|
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process no longer holds on to the db_mutex, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
MemTable* new_mem = nullptr;
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
// For performance/log investigation purposes:
|
|
|
|
// look at how much useful payload we harvest in the new_mem.
|
|
|
|
// This value is then printed to the DB log.
|
|
|
|
double new_mem_capacity = 0.0;
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
|
|
|
|
// Create two iterators, one for the memtable data (contains
|
|
|
|
// info from puts + deletes), and one for the memtable
|
|
|
|
// Range Tombstones (from DeleteRanges).
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
Arena arena;
|
|
|
|
std::vector<InternalIterator*> memtables;
|
|
|
|
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
|
|
|
|
range_del_iters;
|
|
|
|
for (MemTable* m : mems_) {
|
|
|
|
memtables.push_back(m->NewIterator(ro, &arena));
|
|
|
|
auto* range_del_iter = m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
|
|
|
|
if (range_del_iter != nullptr) {
|
|
|
|
range_del_iters.emplace_back(range_del_iter);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(!memtables.empty());
|
2021-07-16 02:48:17 +02:00
|
|
|
SequenceNumber first_seqno = kMaxSequenceNumber;
|
|
|
|
SequenceNumber earliest_seqno = kMaxSequenceNumber;
|
|
|
|
// Pick first and earliest seqno as min of all first_seqno
|
|
|
|
// and earliest_seqno of the mempurged memtables.
|
|
|
|
for (const auto& mem : mems_) {
|
|
|
|
first_seqno = mem->GetFirstSequenceNumber() < first_seqno
|
|
|
|
? mem->GetFirstSequenceNumber()
|
|
|
|
: first_seqno;
|
|
|
|
earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno
|
|
|
|
? mem->GetEarliestSequenceNumber()
|
|
|
|
: earliest_seqno;
|
|
|
|
}
|
|
|
|
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
ScopedArenaIterator iter(
|
|
|
|
NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(),
|
|
|
|
static_cast<int>(memtables.size()), &arena));
|
|
|
|
|
|
|
|
auto* ioptions = cfd_->ioptions();
|
|
|
|
|
|
|
|
// Place iterator at the First (meaning most recent) key node.
|
|
|
|
iter->SeekToFirst();
|
|
|
|
|
|
|
|
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
|
|
|
|
new CompactionRangeDelAggregator(&(cfd_->internal_comparator()),
|
|
|
|
existing_snapshots_));
|
|
|
|
for (auto& rd_iter : range_del_iters) {
|
|
|
|
range_del_agg->AddTombstones(std::move(rd_iter));
|
|
|
|
}
|
|
|
|
|
|
|
|
// If there is valid data in the memtable,
|
|
|
|
// or at least range tombstones, copy over the info
|
|
|
|
// to the new memtable.
|
|
|
|
if (iter->Valid() || !range_del_agg->IsEmpty()) {
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in the number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
// MaxSize is the size of a memtable.
|
|
|
|
size_t maxSize = mutable_cf_options_.write_buffer_size;
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
std::unique_ptr<CompactionFilter> compaction_filter;
|
|
|
|
if (ioptions->compaction_filter_factory != nullptr &&
|
|
|
|
ioptions->compaction_filter_factory->ShouldFilterTableFileCreation(
|
|
|
|
TableFileCreationReason::kFlush)) {
|
|
|
|
CompactionFilter::Context ctx;
|
|
|
|
ctx.is_full_compaction = false;
|
|
|
|
ctx.is_manual_compaction = false;
|
|
|
|
ctx.column_family_id = cfd_->GetID();
|
|
|
|
ctx.reason = TableFileCreationReason::kFlush;
|
|
|
|
compaction_filter =
|
|
|
|
ioptions->compaction_filter_factory->CreateCompactionFilter(ctx);
|
|
|
|
if (compaction_filter != nullptr &&
|
|
|
|
!compaction_filter->IgnoreSnapshots()) {
|
|
|
|
s = Status::NotSupported(
|
|
|
|
"CompactionFilter::IgnoreSnapshots() = false is not supported "
|
|
|
|
"anymore.");
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-07-16 02:48:17 +02:00
|
|
|
new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()),
|
|
|
|
mutable_cf_options_, cfd_->write_buffer_mgr(),
|
2021-07-23 03:26:47 +02:00
|
|
|
earliest_seqno, cfd_->GetID());
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
assert(new_mem != nullptr);
|
|
|
|
|
|
|
|
Env* env = db_options_.env;
|
|
|
|
assert(env);
|
|
|
|
MergeHelper merge(
|
|
|
|
env, (cfd_->internal_comparator()).user_comparator(),
|
|
|
|
(ioptions->merge_operator).get(), compaction_filter.get(),
|
|
|
|
ioptions->logger, true /* internal key corruption is not ok */,
|
|
|
|
existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
|
|
|
|
snapshot_checker_);
|
|
|
|
CompactionIterator c_iter(
|
|
|
|
iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge,
|
|
|
|
kMaxSequenceNumber, &existing_snapshots_,
|
|
|
|
earliest_write_conflict_snapshot_, snapshot_checker_, env,
|
|
|
|
ShouldReportDetailedTime(env, ioptions->stats),
|
|
|
|
true /* internal key corruption is not ok */, range_del_agg.get(),
|
|
|
|
nullptr, ioptions->allow_data_in_errors,
|
|
|
|
/*compaction=*/nullptr, compaction_filter.get(),
|
|
|
|
/*shutting_down=*/nullptr,
|
|
|
|
/*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr,
|
|
|
|
/*manual_compaction_canceled=*/nullptr, ioptions->info_log,
|
|
|
|
&(cfd_->GetFullHistoryTsLow()));
|
|
|
|
|
|
|
|
// Set earliest sequence number in the new memtable
|
|
|
|
// to be equal to the earliest sequence number of the
|
|
|
|
// memtable being flushed (See later if there is a need
|
|
|
|
// to update this number!).
|
|
|
|
new_mem->SetEarliestSequenceNumber(earliest_seqno);
|
|
|
|
// Likewise for first seq number.
|
|
|
|
new_mem->SetFirstSequenceNumber(first_seqno);
|
|
|
|
SequenceNumber new_first_seqno = kMaxSequenceNumber;
|
|
|
|
|
|
|
|
c_iter.SeekToFirst();
|
|
|
|
|
|
|
|
// Key transfer
|
|
|
|
for (; c_iter.Valid(); c_iter.Next()) {
|
|
|
|
const ParsedInternalKey ikey = c_iter.ikey();
|
|
|
|
const Slice value = c_iter.value();
|
|
|
|
new_first_seqno =
|
|
|
|
ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno;
|
|
|
|
|
|
|
|
// Should we update "OldestKeyTime" ???? -> timestamp appear
|
|
|
|
// to still be an "experimental" feature.
|
|
|
|
s = new_mem->Add(
|
|
|
|
ikey.sequence, ikey.type, ikey.user_key, value,
|
|
|
|
nullptr, // KV protection info set as nullptr since it
|
|
|
|
// should only be useful for the first add to
|
|
|
|
// the original memtable.
|
|
|
|
false, // : allow concurrent_memtable_writes_
|
|
|
|
// Not seen as necessary for now.
|
|
|
|
nullptr, // get_post_process_info(m) must be nullptr
|
|
|
|
// when concurrent_memtable_writes is switched off.
|
|
|
|
nullptr); // hint, only used when concurrent_memtable_writes_
|
|
|
|
// is switched on.
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If new_mem has size greater than maxSize,
|
|
|
|
// then rollback to regular flush operation,
|
|
|
|
// and destroy new_mem.
|
|
|
|
if (new_mem->ApproximateMemoryUsage() > maxSize) {
|
|
|
|
s = Status::Aborted("Mempurge filled more than one memtable.");
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in the number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
new_mem_capacity = 1.0;
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check status and propagate
|
|
|
|
// potential error status from c_iter
|
|
|
|
if (!s.ok()) {
|
|
|
|
c_iter.status().PermitUncheckedError();
|
|
|
|
} else if (!c_iter.status().ok()) {
|
|
|
|
s = c_iter.status();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Range tombstone transfer.
|
|
|
|
if (s.ok()) {
|
|
|
|
auto range_del_it = range_del_agg->NewIterator();
|
|
|
|
for (range_del_it->SeekToFirst(); range_del_it->Valid();
|
|
|
|
range_del_it->Next()) {
|
|
|
|
auto tombstone = range_del_it->Tombstone();
|
|
|
|
new_first_seqno =
|
|
|
|
tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno;
|
|
|
|
s = new_mem->Add(
|
|
|
|
tombstone.seq_, // Sequence number
|
|
|
|
kTypeRangeDeletion, // KV type
|
|
|
|
tombstone.start_key_, // Key is start key.
|
|
|
|
tombstone.end_key_, // Value is end key.
|
|
|
|
nullptr, // KV protection info set as nullptr since it
|
|
|
|
// should only be useful for the first add to
|
|
|
|
// the original memtable.
|
|
|
|
false, // : allow concurrent_memtable_writes_
|
|
|
|
// Not seen as necessary for now.
|
|
|
|
nullptr, // get_post_process_info(m) must be nullptr
|
|
|
|
// when concurrent_memtable_writes is switched off.
|
|
|
|
nullptr); // hint, only used when concurrent_memtable_writes_
|
|
|
|
// is switched on.
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If new_mem has size greater than maxSize,
|
|
|
|
// then rollback to regular flush operation,
|
|
|
|
// and destroy new_mem.
|
|
|
|
if (new_mem->ApproximateMemoryUsage() > maxSize) {
|
|
|
|
s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in the number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
new_mem_capacity = 1.0;
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If everything happened smoothly and new_mem contains valid data,
|
|
|
|
// decide if it is flushed to storage or kept in the imm()
|
|
|
|
// memtable list (memory).
|
|
|
|
if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) {
|
|
|
|
// Rectify the first sequence number, which (unlike the earliest seq
|
|
|
|
// number) needs to be present in the new memtable.
|
|
|
|
new_mem->SetFirstSequenceNumber(new_first_seqno);
|
|
|
|
|
|
|
|
// The new_mem is added to the list of immutable memtables
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in the number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
// only if it filled at less than 100% capacity and isn't flagged
|
|
|
|
// as in need of being flushed.
|
|
|
|
if (new_mem->ApproximateMemoryUsage() < maxSize &&
|
|
|
|
!(new_mem->ShouldFlushNow())) {
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
db_mutex_->Lock();
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in the number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
uint64_t new_mem_id = mems_[0]->GetID();
|
|
|
|
// Copy lowest memtable ID
|
|
|
|
// House keeping work.
|
|
|
|
for (MemTable* mt : mems_) {
|
|
|
|
new_mem_id = mt->GetID() < new_mem_id ? mt->GetID() : new_mem_id;
|
|
|
|
// Note: if m is not a previous mempurge output memtable,
|
|
|
|
// nothing happens.
|
|
|
|
cfd_->imm()->RemoveMemPurgeOutputID(mt->GetID());
|
|
|
|
}
|
|
|
|
new_mem->SetID(new_mem_id);
|
|
|
|
cfd_->imm()->AddMemPurgeOutputID(new_mem_id);
|
2021-08-03 05:25:39 +02:00
|
|
|
// This addition will not trigger another flush, because
|
|
|
|
// we do not call SchedulePendingFlush().
|
|
|
|
cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free);
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not hold on to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
new_mem->Ref();
|
|
|
|
db_mutex_->Unlock();
|
|
|
|
} else {
|
|
|
|
s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in the number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
new_mem_capacity = 1.0;
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not hold on to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
if (new_mem) {
|
|
|
|
job_context_->memtables_to_free.push_back(new_mem);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// In this case, the newly allocated new_mem is empty.
|
|
|
|
assert(new_mem != nullptr);
|
|
|
|
job_context_->memtables_to_free.push_back(new_mem);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reacquire the mutex for WriteLevel0 function.
|
|
|
|
db_mutex_->Lock();
|
|
|
|
|
|
|
|
// If mempurge successful, don't write input tables to level0,
|
|
|
|
// but write any full output table to level0.
|
|
|
|
if (s.ok()) {
|
|
|
|
TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful");
|
|
|
|
} else {
|
|
|
|
TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful");
|
|
|
|
}
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in the number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
const uint64_t micros = clock_->NowMicros() - start_micros;
|
|
|
|
const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros;
|
|
|
|
ROCKS_LOG_INFO(db_options_.info_log,
|
|
|
|
"[%s] [JOB %d] Mempurge lasted %" PRIu64
|
|
|
|
" microseconds, and %" PRIu64
|
|
|
|
" cpu "
|
|
|
|
"microseconds. Status is %s ok. Perc capacity: %f\n",
|
|
|
|
cfd_->GetName().c_str(), job_context_->job_id, micros,
|
|
|
|
cpu_micros, s.ok() ? "" : "not", new_mem_capacity);
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not hold on to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in the number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
bool FlushJob::MemPurgeDecider() {
|
2021-08-11 03:07:48 +02:00
|
|
|
double threshold = db_options_.experimental_mempurge_threshold;
|
|
|
|
// Never trigger mempurge if threshold is not a strictly positive value.
|
|
|
|
if (!(threshold > 0.0)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (threshold > (1.0 * mems_.size())) {
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for user that need to reduce the number of L0 SST file created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` perform particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reasons why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
return true;
|
|
|
|
}
|
2021-08-11 03:07:48 +02:00
|
|
|
// Payload and useful_payload (in bytes).
|
|
|
|
// The useful payload ratio of a given MemTable
|
|
|
|
// is estimated to be useful_payload/payload.
|
|
|
|
uint64_t payload = 0, useful_payload = 0;
|
|
|
|
// If estimated_useful_payload is > threshold,
|
|
|
|
// then flush to storage, else MemPurge.
|
|
|
|
double estimated_useful_payload = 0.0;
|
|
|
|
// Cochran formula for determining sample size.
|
|
|
|
// 95% confidence interval, 7% precision.
|
|
|
|
// n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0
|
|
|
|
double n0 = 196.0;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
|
|
|
|
// Iterate over each memtable of the set.
|
|
|
|
for (MemTable* mt : mems_) {
|
|
|
|
// If the memtable is the output of a previous mempurge,
|
|
|
|
// its approximate useful payload ratio is already calculated.
|
|
|
|
if (cfd_->imm()->IsMemPurgeOutput(mt->GetID())) {
|
|
|
|
// We make the assumption that this memtable is already
|
|
|
|
// free of garbage (garbage underestimation).
|
|
|
|
estimated_useful_payload += mt->ApproximateMemoryUsage();
|
|
|
|
} else {
|
|
|
|
// Else sample from the table.
|
|
|
|
uint64_t nentries = mt->num_entries();
|
|
|
|
// Corrected Cochran formula for small populations
|
|
|
|
// (converges to n0 for large populations).
|
|
|
|
uint64_t target_sample_size =
|
|
|
|
static_cast<uint64_t>(ceil(n0 / (1.0 + (n0 / nentries))));
|
|
|
|
std::unordered_set<const char*> sentries = {};
|
|
|
|
// Populate sample entries set.
|
|
|
|
mt->UniqueRandomSample(target_sample_size, &sentries);
|
|
|
|
|
|
|
|
// Estimate the garbage ratio by comparing if
|
|
|
|
// each sample corresponds to a valid entry.
|
|
|
|
for (const char* ss : sentries) {
|
|
|
|
ParsedInternalKey res;
|
|
|
|
Slice entry_slice = GetLengthPrefixedSlice(ss);
|
|
|
|
Status parse_s =
|
|
|
|
ParseInternalKey(entry_slice, &res, true /*log_err_key*/);
|
|
|
|
if (!parse_s.ok()) {
|
|
|
|
ROCKS_LOG_WARN(db_options_.info_log,
|
|
|
|
"Memtable Decider: ParseInternalKey did not parse "
|
|
|
|
"entry_slice %s"
|
|
|
|
"successfully.",
|
|
|
|
entry_slice.data());
|
|
|
|
}
|
|
|
|
LookupKey lkey(res.user_key, kMaxSequenceNumber);
|
|
|
|
std::string vget;
|
|
|
|
Status s;
|
|
|
|
MergeContext merge_context;
|
|
|
|
SequenceNumber max_covering_tombstone_seq = 0, sqno = 0;
|
|
|
|
|
|
|
|
// Pick the oldest existing snapshot that is more recent
|
|
|
|
// than the sequence number of the sampled entry.
|
|
|
|
SequenceNumber min_seqno_snapshot = kMaxSequenceNumber;
|
|
|
|
SnapshotImpl min_snapshot;
|
|
|
|
for (SequenceNumber seq_num : existing_snapshots_) {
|
|
|
|
if (seq_num > res.sequence && seq_num < min_seqno_snapshot) {
|
|
|
|
min_seqno_snapshot = seq_num;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
min_snapshot.number_ = min_seqno_snapshot;
|
|
|
|
ro.snapshot =
|
|
|
|
min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr;
|
|
|
|
|
|
|
|
// Estimate if the sample entry is valid or not.
|
|
|
|
bool gres = mt->Get(lkey, &vget, nullptr, &s, &merge_context,
|
|
|
|
&max_covering_tombstone_seq, &sqno, ro);
|
|
|
|
if (!gres) {
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
db_options_.info_log,
|
|
|
|
"Memtable Get returned false when Get(sampled entry). "
|
|
|
|
"Yet each sample entry should exist somewhere in the memtable, "
|
|
|
|
"unrelated to whether it has been deleted or not.");
|
|
|
|
}
|
|
|
|
payload += entry_slice.size();
|
|
|
|
|
|
|
|
// TODO(bjlemaire): evaluate typeMerge.
|
|
|
|
// This is where the sampled entry is estimated to be
|
|
|
|
// garbage or not. Note that this is a garbage *estimation*
|
|
|
|
// because we do not include certain items such as
|
|
|
|
// CompactionFitlers triggered at flush, or if the same delete
|
|
|
|
// has been inserted twice or more in the memtable.
|
|
|
|
if (res.type == kTypeValue && gres && s.ok() && sqno == res.sequence) {
|
|
|
|
useful_payload += entry_slice.size();
|
|
|
|
} else if (((res.type == kTypeDeletion) ||
|
|
|
|
(res.type == kTypeSingleDeletion)) &&
|
|
|
|
s.IsNotFound() && gres) {
|
|
|
|
useful_payload += entry_slice.size();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (payload > 0) {
|
|
|
|
// We used the estimated useful payload ratio
|
|
|
|
// to evaluate how much of the total memtable is useful bytes.
|
|
|
|
estimated_useful_payload +=
|
|
|
|
(mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload);
|
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
db_options_.info_log,
|
|
|
|
"Mempurge sampling - found garbage ratio from sampling: %f.\n",
|
|
|
|
(payload - useful_payload) * 1.0 / payload);
|
|
|
|
} else {
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
db_options_.info_log,
|
|
|
|
"Mempurge kSampling policy: null payload measured, and collected "
|
|
|
|
"sample size is %zu\n.",
|
|
|
|
sentries.size());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// We convert the total number of useful paylaod bytes
|
|
|
|
// into the proportion of memtable necessary to store all these bytes.
|
|
|
|
// We compare this proportion with the threshold value.
|
|
|
|
return (estimated_useful_payload / mutable_cf_options_.write_buffer_size) <
|
|
|
|
threshold;
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for user that need to reduce the number of L0 SST file created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` perform particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reasons why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
}
|
|
|
|
|
2016-07-20 00:12:46 +02:00
|
|
|
Status FlushJob::WriteLevel0Table() {
|
2015-03-13 18:45:40 +01:00
|
|
|
AutoThreadOperationStageUpdater stage_updater(
|
|
|
|
ThreadStatus::STAGE_FLUSH_WRITE_L0);
|
2014-10-28 19:54:33 +01:00
|
|
|
db_mutex_->AssertHeld();
|
2021-01-26 07:07:26 +01:00
|
|
|
const uint64_t start_micros = clock_->NowMicros();
|
|
|
|
const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000;
|
2014-10-28 19:54:33 +01:00
|
|
|
Status s;
|
2020-09-15 06:10:09 +02:00
|
|
|
|
|
|
|
std::vector<BlobFileAddition> blob_file_additions;
|
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
{
|
2017-11-10 18:25:26 +01:00
|
|
|
auto write_hint = cfd_->CalculateSSTWriteHint(0);
|
2014-10-28 19:54:33 +01:00
|
|
|
db_mutex_->Unlock();
|
|
|
|
if (log_buffer_) {
|
|
|
|
log_buffer_->FlushBufferToLog();
|
|
|
|
}
|
2016-11-01 04:35:54 +01:00
|
|
|
// memtables and range_del_iters store internal iterators over each data
|
|
|
|
// memtable and its associated range deletion memtable, respectively, at
|
|
|
|
// corresponding indexes.
|
2015-10-13 00:06:38 +02:00
|
|
|
std::vector<InternalIterator*> memtables;
|
2018-12-17 22:12:22 +01:00
|
|
|
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
|
|
|
|
range_del_iters;
|
2014-10-28 19:54:33 +01:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
Arena arena;
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
uint64_t total_num_entries = 0, total_num_deletes = 0;
|
2019-02-16 01:30:23 +01:00
|
|
|
uint64_t total_data_size = 0;
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
size_t total_memory_usage = 0;
|
2016-07-20 00:12:46 +02:00
|
|
|
for (MemTable* m : mems_) {
|
2017-03-16 03:22:52 +01:00
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
db_options_.info_log,
|
2015-02-12 18:54:48 +01:00
|
|
|
"[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
|
|
|
|
cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
|
2014-10-28 19:54:33 +01:00
|
|
|
memtables.push_back(m->NewIterator(ro, &arena));
|
2018-11-29 00:26:56 +01:00
|
|
|
auto* range_del_iter =
|
2018-12-17 22:12:22 +01:00
|
|
|
m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
|
2016-11-21 21:07:09 +01:00
|
|
|
if (range_del_iter != nullptr) {
|
2018-12-17 22:12:22 +01:00
|
|
|
range_del_iters.emplace_back(range_del_iter);
|
2016-11-21 21:07:09 +01:00
|
|
|
}
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
total_num_entries += m->num_entries();
|
|
|
|
total_num_deletes += m->num_deletes();
|
2019-02-16 01:30:23 +01:00
|
|
|
total_data_size += m->get_data_size();
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
total_memory_usage += m->ApproximateMemoryUsage();
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
|
2019-02-16 01:30:23 +01:00
|
|
|
event_logger_->Log() << "job" << job_context_->job_id << "event"
|
|
|
|
<< "flush_started"
|
|
|
|
<< "num_memtables" << mems_.size() << "num_entries"
|
|
|
|
<< total_num_entries << "num_deletes"
|
|
|
|
<< total_num_deletes << "total_data_size"
|
|
|
|
<< total_data_size << "memory_usage"
|
|
|
|
<< total_memory_usage << "flush_reason"
|
|
|
|
<< GetFlushReasonString(cfd_->GetFlushReason());
|
Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.
Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D45429
2015-08-25 02:04:18 +02:00
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
{
|
2014-11-11 22:47:22 +01:00
|
|
|
ScopedArenaIterator iter(
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not hold on to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
|
2014-11-11 22:47:22 +01:00
|
|
|
static_cast<int>(memtables.size()), &arena));
|
2017-03-16 03:22:52 +01:00
|
|
|
ROCKS_LOG_INFO(db_options_.info_log,
|
|
|
|
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
|
|
|
|
cfd_->GetName().c_str(), job_context_->job_id,
|
|
|
|
meta_.fd.GetNumber());
|
2014-10-28 19:54:33 +01:00
|
|
|
|
2015-04-15 01:53:19 +02:00
|
|
|
TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
|
|
|
|
&output_compression_);
|
2017-07-05 21:02:00 +02:00
|
|
|
int64_t _current_time = 0;
|
2021-01-26 07:07:26 +01:00
|
|
|
auto status = clock_->GetCurrentTime(&_current_time);
|
2017-12-07 05:43:52 +01:00
|
|
|
// Safe to proceed even if GetCurrentTime fails. So, log and proceed.
|
|
|
|
if (!status.ok()) {
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
db_options_.info_log,
|
|
|
|
"Failed to get current time to populate creation_time property. "
|
|
|
|
"Status: %s",
|
|
|
|
status.ToString().c_str());
|
|
|
|
}
|
2017-06-28 02:02:20 +02:00
|
|
|
const uint64_t current_time = static_cast<uint64_t>(_current_time);
|
|
|
|
|
2017-10-24 00:22:05 +02:00
|
|
|
uint64_t oldest_key_time =
|
|
|
|
mems_.front()->ApproximateOldestKeyTime();
|
|
|
|
|
2019-11-23 01:01:21 +01:00
|
|
|
// It's not clear whether oldest_key_time is always available. In case
|
|
|
|
// it is not available, use current_time.
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also remove accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
2020-08-11 21:39:49 +02:00
|
|
|
uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time);
|
|
|
|
|
|
|
|
TEST_SYNC_POINT_CALLBACK(
|
|
|
|
"FlushJob::WriteLevel0Table:oldest_ancester_time",
|
|
|
|
&oldest_ancester_time);
|
|
|
|
meta_.oldest_ancester_time = oldest_ancester_time;
|
|
|
|
|
2019-11-27 06:38:38 +01:00
|
|
|
meta_.file_creation_time = current_time;
|
2019-11-23 01:01:21 +01:00
|
|
|
|
2020-03-31 03:57:28 +02:00
|
|
|
uint64_t creation_time = (cfd_->ioptions()->compaction_style ==
|
|
|
|
CompactionStyle::kCompactionStyleFIFO)
|
|
|
|
? current_time
|
|
|
|
: meta_.oldest_ancester_time;
|
|
|
|
|
2021-05-21 01:06:12 +02:00
|
|
|
uint64_t num_input_entries = 0;
|
2021-06-18 13:56:43 +02:00
|
|
|
uint64_t memtable_payload_bytes = 0;
|
|
|
|
uint64_t memtable_garbage_bytes = 0;
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus io_s;
|
2020-11-13 03:43:30 +01:00
|
|
|
const std::string* const full_history_ts_low =
|
|
|
|
(full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_;
|
2021-04-29 15:59:53 +02:00
|
|
|
TableBuilderOptions tboptions(
|
|
|
|
*cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(),
|
|
|
|
cfd_->int_tbl_prop_collector_factories(), output_compression_,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
these two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 22:49:24 +02:00
|
|
|
mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(),
|
|
|
|
0 /* level */, false /* is_bottommost */,
|
|
|
|
TableFileCreationReason::kFlush, creation_time, oldest_key_time,
|
2021-06-10 20:01:44 +02:00
|
|
|
current_time, db_id_, db_session_id_, 0 /* target_file_size */,
|
|
|
|
meta_.fd.GetNumber());
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 19:42:39 +02:00
|
|
|
s = BuildTable(
|
2021-04-29 15:59:53 +02:00
|
|
|
dbname_, versions_, db_options_, tboptions, file_options_,
|
|
|
|
cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
|
|
|
|
&blob_file_additions, existing_snapshots_,
|
2017-10-06 19:26:38 +02:00
|
|
|
earliest_write_conflict_snapshot_, snapshot_checker_,
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 19:42:39 +02:00
|
|
|
mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
these two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 22:49:24 +02:00
|
|
|
&io_s, io_tracer_, event_logger_, job_context_->job_id, Env::IO_HIGH,
|
2021-05-21 01:06:12 +02:00
|
|
|
&table_properties_, write_hint, full_history_ts_low, blob_callback_,
|
2021-06-18 13:56:43 +02:00
|
|
|
&num_input_entries, &memtable_payload_bytes, &memtable_garbage_bytes);
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
if (!io_s.ok()) {
|
|
|
|
io_status_ = io_s;
|
|
|
|
}
|
2021-05-21 01:06:12 +02:00
|
|
|
if (num_input_entries != total_num_entries && s.ok()) {
|
|
|
|
std::string msg = "Expected " + ToString(total_num_entries) +
|
|
|
|
" entries in memtables, but read " +
|
|
|
|
ToString(num_input_entries);
|
|
|
|
ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
|
|
|
|
cfd_->GetName().c_str(), job_context_->job_id,
|
|
|
|
msg.c_str());
|
|
|
|
if (db_options_.flush_verify_memtable_count) {
|
|
|
|
s = Status::Corruption(msg);
|
|
|
|
}
|
|
|
|
}
|
2021-06-18 13:56:43 +02:00
|
|
|
if (tboptions.reason == TableFileCreationReason::kFlush) {
|
Memtable "MemPurge" prototype (#8454)
Summary:
Implement an experimental feature called "MemPurge", which consists in purging "garbage" bytes out of a memtable and reuse the memtable struct instead of making it immutable and eventually flushing its content to storage.
The prototype is by default deactivated and is not intended for use. It is intended for correctness and validation testing. At the moment, the "MemPurge" feature can be switched on by using the `options.experimental_allow_mempurge` flag. For this early stage, when the allow_mempurge flag is set to `true`, all the flush operations will be rerouted to perform a MemPurge. This is a temporary design decision that will give us the time to explore meaningful heuristics to use MemPurge at the right time for relevant workloads . Moreover, the current MemPurge operation only supports `Puts`, `Deletes`, `DeleteRange` operations, and handles `Iterators` as well as `CompactionFilter`s that are invoked at flush time .
Three unit tests are added to `db_flush_test.cc` to test whether MemPurge works correctly (and to check that the previously mentioned operations are fully supported and thoroughly tested).
One noticeable design decision is the timing of the MemPurge operation in the memtable workflow: for this prototype, the mempurge happens when the memtable is switched (and usually made immutable). This is an inefficient process because it implies that the entirety of the MemPurge operation happens while holding the db_mutex. Future commits will make the MemPurge operation a background task (akin to the regular flush operation) and aim at drastically enhancing the performance of this operation. The MemPurge is also not fully "WAL-compatible" yet, but when the WAL is full, or when the regular MemPurge operation fails (or when the purged memtable still needs to be flushed), a regular flush operation takes place. Later commits will also correct these behaviors.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8454
Reviewed By: anand1976
Differential Revision: D29433971
Pulled By: bjlemaire
fbshipit-source-id: 6af48213554e35048a7e03816955100a80a26dc5
2021-07-02 14:22:03 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::FlushJob:Flush");
|
2021-06-18 13:56:43 +02:00
|
|
|
RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
|
|
|
|
memtable_payload_bytes);
|
|
|
|
RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
|
|
|
|
memtable_garbage_bytes);
|
|
|
|
}
|
2014-10-28 19:54:33 +01:00
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
}
|
2017-03-16 03:22:52 +01:00
|
|
|
ROCKS_LOG_INFO(db_options_.info_log,
|
|
|
|
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
|
|
|
|
" bytes %s"
|
|
|
|
"%s",
|
|
|
|
cfd_->GetName().c_str(), job_context_->job_id,
|
|
|
|
meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
|
|
|
|
s.ToString().c_str(),
|
|
|
|
meta_.marked_for_compaction ? " (needs compaction)" : "");
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-13 00:53:55 +02:00
|
|
|
|
2018-10-16 04:59:20 +02:00
|
|
|
if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
|
2020-03-03 01:14:00 +01:00
|
|
|
s = output_file_directory_->Fsync(IOOptions(), nullptr);
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
2019-10-16 19:39:00 +02:00
|
|
|
TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
|
2014-10-28 19:54:33 +01:00
|
|
|
db_mutex_->Lock();
|
|
|
|
}
|
2016-07-20 00:12:46 +02:00
|
|
|
base_->Unref();
|
2014-10-28 19:54:33 +01:00
|
|
|
|
|
|
|
// Note that if file_size is zero, the file has been deleted and
|
|
|
|
// should not be added to the manifest.
|
2020-09-15 06:10:09 +02:00
|
|
|
const bool has_output = meta_.fd.GetFileSize() > 0;
|
|
|
|
|
|
|
|
if (s.ok() && has_output) {
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling to the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failing when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process resembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled to more than 60% capacity (arbitrary heuristic), the mempurge process is aborted and a regular flush process takes place; otherwise the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 02:16:00 +02:00
|
|
|
TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated");
|
2014-10-28 19:54:33 +01:00
|
|
|
// if we have more than 1 background thread, then we cannot
|
|
|
|
// insert files directly into higher levels because some other
|
|
|
|
// threads could be concurrently producing compacted files for
|
|
|
|
// that key range.
|
2015-07-17 21:02:52 +02:00
|
|
|
// Add file to L0
|
2016-07-20 00:12:46 +02:00
|
|
|
edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
|
|
|
|
meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
|
2018-07-28 01:00:26 +02:00
|
|
|
meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
|
2019-11-23 01:01:21 +01:00
|
|
|
meta_.marked_for_compaction, meta_.oldest_blob_file_number,
|
2020-02-11 00:42:46 +01:00
|
|
|
meta_.oldest_ancester_time, meta_.file_creation_time,
|
|
|
|
meta_.file_checksum, meta_.file_checksum_func_name);
|
2020-09-15 06:10:09 +02:00
|
|
|
|
|
|
|
edit_->SetBlobFileAdditions(std::move(blob_file_additions));
|
2014-10-28 19:54:33 +01:00
|
|
|
}
|
2019-10-16 19:39:00 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Piggyback FlushJobInfo on the first flushed memtable.
|
|
|
|
mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
|
|
|
|
#endif // !ROCKSDB_LITE
|
2014-10-28 19:54:33 +01:00
|
|
|
|
2016-04-25 21:01:01 +02:00
|
|
|
// Note that here we treat flush as level 0 compaction in internal stats
|
2018-04-11 19:47:54 +02:00
|
|
|
InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
const uint64_t micros = clock_->NowMicros() - start_micros;
|
|
|
|
const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros;
|
|
|
|
stats.micros = micros;
|
|
|
|
stats.cpu_micros = cpu_micros;
|
|
|
|
|
|
|
|
ROCKS_LOG_INFO(db_options_.info_log,
|
|
|
|
"[%s] [JOB %d] Flush lasted %" PRIu64
|
|
|
|
" microseconds, and %" PRIu64 " cpu microseconds.\n",
|
|
|
|
cfd_->GetName().c_str(), job_context_->job_id, micros,
|
|
|
|
cpu_micros);
|
2020-09-15 06:10:09 +02:00
|
|
|
|
|
|
|
if (has_output) {
|
|
|
|
stats.bytes_written = meta_.fd.GetFileSize();
|
2020-10-26 21:50:03 +01:00
|
|
|
stats.num_output_files = 1;
|
|
|
|
}
|
2020-09-15 06:10:09 +02:00
|
|
|
|
2020-10-26 21:50:03 +01:00
|
|
|
const auto& blobs = edit_->GetBlobFileAdditions();
|
|
|
|
for (const auto& blob : blobs) {
|
2021-03-02 18:46:10 +01:00
|
|
|
stats.bytes_written_blob += blob.GetTotalBlobBytes();
|
2020-09-15 06:10:09 +02:00
|
|
|
}
|
|
|
|
|
2021-03-02 18:46:10 +01:00
|
|
|
stats.num_output_files_blob = static_cast<int>(blobs.size());
|
2020-10-26 21:50:03 +01:00
|
|
|
|
2021-08-11 03:07:48 +02:00
|
|
|
if ((db_options_.experimental_mempurge_threshold > 0.0) && s.ok()) {
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
// The db_mutex is held at this point.
|
|
|
|
for (MemTable* mt : mems_) {
|
|
|
|
// Note: if mt is not a previous mempurge output memtable,
|
|
|
|
// nothing happens here.
|
|
|
|
cfd_->imm()->RemoveMemPurgeOutputID(mt->GetID());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-28 19:14:19 +01:00
|
|
|
RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
|
2019-03-20 01:24:09 +01:00
|
|
|
cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
|
2021-03-02 18:46:10 +01:00
|
|
|
cfd_->internal_stats()->AddCFStats(
|
|
|
|
InternalStats::BYTES_FLUSHED,
|
|
|
|
stats.bytes_written + stats.bytes_written_blob);
|
2016-04-25 21:01:01 +02:00
|
|
|
RecordFlushIOStats();
|
Add simple heuristics for experimental mempurge. (#8583)
Summary:
Add `experimental_mempurge_policy` option flag and introduce two new `MemPurge` (Memtable Garbage Collection) policies: 'ALWAYS' and 'ALTERNATE'. Default value: ALTERNATE.
`ALWAYS`: every flush will first go through a `MemPurge` process. If the output is too big to fit into a single memtable, then the mempurge is aborted and a regular flush process carries on. `ALWAYS` is designed for users that need to reduce the number of L0 SST files created to a strict minimum, and can afford a small dent in performance (possibly hits to CPU usage, read efficiency, and maximum burst write throughput).
`ALTERNATE`: a flush is transformed into a `MemPurge` except if one of the memtables being flushed is the product of a previous `MemPurge`. `ALTERNATE` is a good tradeoff between reduction in number of L0 SST files created and performance. `ALTERNATE` performs particularly well for completely random garbage ratios, or garbage ratios anywhere in (0%,50%], and even higher when there is a wild variability in garbage ratios.
This PR also includes support for `experimental_mempurge_policy` in `db_bench`.
Testing was done locally by replacing all the `MemPurge` policies of the unit tests with `ALTERNATE`, as well as local testing with `db_crashtest.py` `whitebox` and `blackbox`. Overall, if an `ALWAYS` mempurge policy passes the tests, there is no reason why an `ALTERNATE` policy would fail, and therefore the mempurge policy was set to `ALWAYS` for all mempurge unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8583
Reviewed By: pdillinger
Differential Revision: D29888050
Pulled By: bjlemaire
fbshipit-source-id: e2cf26646d66679f6f5fb29842624615610759c1
2021-07-26 20:55:27 +02:00
|
|
|
|
2014-10-28 19:54:33 +01:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2019-10-16 19:39:00 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
|
|
|
|
db_mutex_->AssertHeld();
|
2019-11-01 19:44:59 +01:00
|
|
|
std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
|
2019-10-16 19:39:00 +02:00
|
|
|
info->cf_id = cfd_->GetID();
|
|
|
|
info->cf_name = cfd_->GetName();
|
2019-10-24 23:42:43 +02:00
|
|
|
|
|
|
|
const uint64_t file_number = meta_.fd.GetNumber();
|
|
|
|
info->file_path =
|
|
|
|
MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
|
|
|
|
info->file_number = file_number;
|
|
|
|
info->oldest_blob_file_number = meta_.oldest_blob_file_number;
|
2019-10-16 19:39:00 +02:00
|
|
|
info->thread_id = db_options_.env->GetThreadID();
|
|
|
|
info->job_id = job_context_->job_id;
|
|
|
|
info->smallest_seqno = meta_.fd.smallest_seqno;
|
|
|
|
info->largest_seqno = meta_.fd.largest_seqno;
|
|
|
|
info->table_properties = table_properties_;
|
|
|
|
info->flush_reason = cfd_->GetFlushReason();
|
|
|
|
return info;
|
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|