2016-02-09 15:12:00 -08:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 16:03:42 -07:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2016-01-28 18:35:01 -08:00
|
|
|
|
2019-05-29 20:44:08 -07:00
|
|
|
#include "file/sst_file_manager_impl.h"
|
2016-01-28 18:35:01 -08:00
|
|
|
|
2019-06-06 13:52:39 -07:00
|
|
|
#include <cinttypes>
|
2016-01-28 18:35:01 -08:00
|
|
|
#include <vector>
|
|
|
|
|
2019-05-31 11:52:59 -07:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2021-09-29 04:01:57 -07:00
|
|
|
#include "logging/logging.h"
|
2016-01-28 18:35:01 -08:00
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/env.h"
|
2016-12-21 17:35:00 -08:00
|
|
|
#include "rocksdb/sst_file_manager.h"
|
2019-05-30 11:21:38 -07:00
|
|
|
#include "test_util/sync_point.h"
|
2019-05-30 17:39:43 -07:00
|
|
|
#include "util/mutexlock.h"
|
2016-01-28 18:35:01 -08:00
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2016-01-28 18:35:01 -08:00
|
|
|
|
2016-12-21 17:35:00 -08:00
|
|
|
#ifndef ROCKSDB_LITE
|
2021-01-25 22:07:26 -08:00
|
|
|
SstFileManagerImpl::SstFileManagerImpl(
|
|
|
|
const std::shared_ptr<SystemClock>& clock,
|
|
|
|
const std::shared_ptr<FileSystem>& fs,
|
|
|
|
const std::shared_ptr<Logger>& logger, int64_t rate_bytes_per_sec,
|
|
|
|
double max_trash_db_ratio, uint64_t bytes_max_delete_chunk)
|
|
|
|
: clock_(clock),
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
fs_(fs),
|
2016-01-28 18:35:01 -08:00
|
|
|
logger_(logger),
|
|
|
|
total_files_size_(0),
|
2018-03-06 16:13:05 -08:00
|
|
|
compaction_buffer_size_(0),
|
|
|
|
cur_compactions_reserved_size_(0),
|
2016-02-18 11:25:19 -08:00
|
|
|
max_allowed_space_(0),
|
2021-03-15 04:32:24 -07:00
|
|
|
delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec,
|
|
|
|
logger.get(), this, max_trash_db_ratio,
|
|
|
|
bytes_max_delete_chunk),
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
cv_(&mu_),
|
|
|
|
closing_(false),
|
|
|
|
bg_thread_(nullptr),
|
|
|
|
reserved_disk_buffer_(0),
|
|
|
|
free_space_trigger_(0),
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
cur_instance_(nullptr) {}
|
2016-01-28 18:35:01 -08:00
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accommodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only after certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurrence of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occurred
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in read-write mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recovery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
// Runs Close() before any member is destroyed, then marks bg_err_ via
// PermitUncheckedError() so it may be destroyed without having been examined.
SstFileManagerImpl::~SstFileManagerImpl() {
  Close();

  bg_err_.PermitUncheckedError();
}
|
|
|
|
|
|
|
|
void SstFileManagerImpl::Close() {
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
{
|
|
|
|
MutexLock l(&mu_);
|
2018-09-17 13:08:13 -07:00
|
|
|
if (closing_) {
|
|
|
|
return;
|
|
|
|
}
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
closing_ = true;
|
|
|
|
cv_.SignalAll();
|
|
|
|
}
|
|
|
|
if (bg_thread_) {
|
|
|
|
bg_thread_->join();
|
|
|
|
}
|
|
|
|
}
|
2016-01-28 18:35:01 -08:00
|
|
|
|
2021-03-17 20:43:22 -07:00
|
|
|
Status SstFileManagerImpl::OnAddFile(const std::string& file_path) {
|
2016-01-28 18:35:01 -08:00
|
|
|
uint64_t file_size;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
|
2016-01-28 18:35:01 -08:00
|
|
|
if (s.ok()) {
|
|
|
|
MutexLock l(&mu_);
|
2021-03-17 20:43:22 -07:00
|
|
|
OnAddFileImpl(file_path, file_size);
|
2016-01-28 18:35:01 -08:00
|
|
|
}
|
2021-03-17 20:43:22 -07:00
|
|
|
TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile",
|
|
|
|
const_cast<std::string*>(&file_path));
|
2016-01-28 18:35:01 -08:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-02-04 13:38:00 -08:00
|
|
|
// Starts tracking file_path with a caller-supplied size (no file system
// query is made).
//
// file_path - path of the file being added.
// file_size - size to record for the file.
//
// Always returns Status::OK().
Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
                                     uint64_t file_size) {
  {
    MutexLock l(&mu_);
    OnAddFileImpl(file_path, file_size);
  }
  // The test callback is invoked after mu_ has been released, matching the
  // single-argument OnAddFile overload and OnDeleteFile. Running arbitrary
  // callback code under the lock would deadlock if the callback re-entered
  // this manager.
  TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile",
                           const_cast<std::string*>(&file_path));
  return Status::OK();
}
|
|
|
|
|
2016-01-28 18:35:01 -08:00
|
|
|
// Stops tracking file_path. The bookkeeping runs under mu_; the test
// sync-point callback (given a pointer to file_path) runs after the lock has
// been dropped. Always returns Status::OK().
Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
  {
    MutexLock guard(&mu_);
    OnDeleteFileImpl(file_path);
  }

  TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile",
                           const_cast<std::string*>(&file_path));

  return Status::OK();
}
|
|
|
|
|
2018-03-06 16:13:05 -08:00
|
|
|
// Subtracts the combined size of all input files of compaction `c` from
// cur_compactions_reserved_size_. The whole operation runs under mu_.
void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) {
  MutexLock guard(&mu_);

  // Sum the file sizes over every input level of the compaction.
  uint64_t input_bytes = 0;
  for (size_t level = 0; level < c->num_input_levels(); ++level) {
    for (size_t idx = 0; idx < c->num_input_files(level); ++idx) {
      input_bytes += c->input(level, idx)->fd.GetFileSize();
    }
  }

  cur_compactions_reserved_size_ -= input_bytes;
}
|
|
|
|
|
2016-01-28 18:35:01 -08:00
|
|
|
// Transfers tracking from old_path to new_path, carrying over the size
// recorded for old_path.
//
// old_path  - path the file was tracked under.
// new_path  - path the file is tracked under afterwards.
// file_size - optional out-parameter; when non-null it receives the size that
//             was recorded for old_path.
//
// Always returns Status::OK().
Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
                                      const std::string& new_path,
                                      uint64_t* file_size) {
  {
    MutexLock guard(&mu_);

    // Looked up with operator[], exactly as before; presumably this creates a
    // zero-sized entry when old_path is untracked (standard map semantics -
    // verify against the declaration of tracked_files_).
    const uint64_t recorded_size = tracked_files_[old_path];
    if (file_size != nullptr) {
      *file_size = recorded_size;
    }

    // Register the new path first, then drop the old one.
    OnAddFileImpl(new_path, recorded_size);
    OnDeleteFileImpl(old_path);
  }

  TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
  return Status::OK();
}
|
|
|
|
|
2016-02-17 15:20:23 -08:00
|
|
|
// Stores the new space limit in max_allowed_space_ while holding mu_.
void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
  MutexLock guard(&mu_);

  max_allowed_space_ = max_allowed_space;
}
|
|
|
|
|
2018-03-06 16:13:05 -08:00
|
|
|
void SstFileManagerImpl::SetCompactionBufferSize(
|
|
|
|
uint64_t compaction_buffer_size) {
|
|
|
|
MutexLock l(&mu_);
|
|
|
|
compaction_buffer_size_ = compaction_buffer_size;
|
|
|
|
}
|
|
|
|
|
2016-02-17 15:20:23 -08:00
|
|
|
bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
|
|
|
|
MutexLock l(&mu_);
|
|
|
|
if (max_allowed_space_ <= 0) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return total_files_size_ >= max_allowed_space_;
|
|
|
|
}
|
|
|
|
|
2018-03-06 16:13:05 -08:00
|
|
|
bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
|
|
|
|
MutexLock l(&mu_);
|
|
|
|
if (max_allowed_space_ <= 0) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return total_files_size_ + cur_compactions_reserved_size_ >=
|
|
|
|
max_allowed_space_;
|
|
|
|
}
|
|
|
|
|
2018-04-02 19:53:19 -07:00
|
|
|
bool SstFileManagerImpl::EnoughRoomForCompaction(
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accommodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only after certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurrence of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occurred
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in read-write mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recovery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
|
2021-01-06 14:14:01 -08:00
|
|
|
const Status& bg_error) {
|
2018-03-06 16:13:05 -08:00
|
|
|
MutexLock l(&mu_);
|
|
|
|
uint64_t size_added_by_compaction = 0;
|
|
|
|
// First check if we even have the space to do the compaction
|
2018-04-02 19:53:19 -07:00
|
|
|
for (size_t i = 0; i < inputs.size(); i++) {
|
|
|
|
for (size_t j = 0; j < inputs[i].size(); j++) {
|
|
|
|
FileMetaData* filemeta = inputs[i][j];
|
2018-03-06 16:13:05 -08:00
|
|
|
size_added_by_compaction += filemeta->fd.GetFileSize();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accommodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only after certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurrence of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occurred
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in read-write mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recovery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
// Update cur_compactions_reserved_size_ so concurrent compaction
|
|
|
|
// don't max out space
|
|
|
|
size_t needed_headroom =
|
|
|
|
cur_compactions_reserved_size_ + size_added_by_compaction +
|
|
|
|
compaction_buffer_size_;
|
2018-03-06 16:13:05 -08:00
|
|
|
if (max_allowed_space_ != 0 &&
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accommodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only after certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurrence of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occurred
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in read-write mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recovery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
(needed_headroom + total_files_size_ > max_allowed_space_)) {
|
2018-03-06 16:13:05 -08:00
|
|
|
return false;
|
|
|
|
}
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
|
|
|
|
// Implement more aggressive checks only if this DB instance has already
|
|
|
|
// seen a NoSpace() error. This is in order to contain a single potentially
|
|
|
|
// misbehaving DB instance and prevent it from slowing down compactions of
|
|
|
|
// other DB instances
|
2021-01-06 14:14:01 -08:00
|
|
|
if (bg_error.IsNoSpace() && CheckFreeSpace()) {
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
auto fn =
|
|
|
|
TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(),
|
|
|
|
inputs[0][0]->fd.GetPathId());
|
|
|
|
uint64_t free_space = 0;
|
2020-08-20 19:16:56 -07:00
|
|
|
Status s = fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr);
|
|
|
|
s.PermitUncheckedError(); // TODO: Check the status
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
// needed_headroom is based on current size reserved by compactions,
|
|
|
|
// minus any files created by running compactions as they would count
|
|
|
|
// against the reserved size. If user didn't specify any compaction
|
|
|
|
// buffer, add reserved_disk_buffer_ that's calculated by default so the
|
|
|
|
// compaction doesn't end up leaving nothing for logs and flush SSTs
|
|
|
|
if (compaction_buffer_size_ == 0) {
|
|
|
|
needed_headroom += reserved_disk_buffer_;
|
|
|
|
}
|
|
|
|
if (free_space < needed_headroom + size_added_by_compaction) {
|
|
|
|
// We hit the condition of not enough disk space
|
2019-04-04 12:05:42 -07:00
|
|
|
ROCKS_LOG_ERROR(logger_,
|
|
|
|
"free space [%" PRIu64
|
|
|
|
" bytes] is less than "
|
|
|
|
"needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
|
|
|
|
free_space, needed_headroom);
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-06 16:13:05 -08:00
|
|
|
cur_compactions_reserved_size_ += size_added_by_compaction;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
// Take a snapshot of cur_compactions_reserved_size_ for when we encounter
|
|
|
|
// a NoSpace error.
|
|
|
|
free_space_trigger_ = cur_compactions_reserved_size_;
|
2018-03-06 16:13:05 -08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
|
|
|
|
MutexLock l(&mu_);
|
|
|
|
return cur_compactions_reserved_size_;
|
|
|
|
}
|
|
|
|
|
2016-01-28 18:35:01 -08:00
|
|
|
uint64_t SstFileManagerImpl::GetTotalSize() {
|
|
|
|
MutexLock l(&mu_);
|
|
|
|
return total_files_size_;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unordered_map<std::string, uint64_t>
|
|
|
|
SstFileManagerImpl::GetTrackedFiles() {
|
|
|
|
MutexLock l(&mu_);
|
|
|
|
return tracked_files_;
|
|
|
|
}
|
|
|
|
|
|
|
|
int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
|
|
|
|
return delete_scheduler_.GetRateBytesPerSecond();
|
|
|
|
}
|
|
|
|
|
2017-03-16 12:06:04 -07:00
|
|
|
void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
|
|
|
|
return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
|
|
|
|
}
|
|
|
|
|
2017-11-17 11:56:41 -08:00
|
|
|
double SstFileManagerImpl::GetMaxTrashDBRatio() {
|
|
|
|
return delete_scheduler_.GetMaxTrashDBRatio();
|
|
|
|
}
|
|
|
|
|
|
|
|
void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
|
|
|
|
return delete_scheduler_.SetMaxTrashDBRatio(r);
|
|
|
|
}
|
|
|
|
|
2018-08-04 17:50:01 -07:00
|
|
|
uint64_t SstFileManagerImpl::GetTotalTrashSize() {
|
|
|
|
return delete_scheduler_.GetTotalTrashSize();
|
|
|
|
}
|
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
|
|
|
|
const std::string& path) {
|
|
|
|
MutexLock l(&mu_);
|
|
|
|
|
|
|
|
reserved_disk_buffer_ += size;
|
|
|
|
if (path_.empty()) {
|
|
|
|
path_ = path;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SstFileManagerImpl::ClearError() {
|
|
|
|
while (true) {
|
|
|
|
MutexLock l(&mu_);
|
|
|
|
|
|
|
|
if (closing_) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-05-28 12:16:22 -07:00
|
|
|
uint64_t free_space = 0;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
Status s = fs_->GetFreeSpace(path_, IOOptions(), &free_space, nullptr);
|
2019-05-24 18:35:11 -07:00
|
|
|
free_space = max_allowed_space_ > 0
|
|
|
|
? std::min(max_allowed_space_, free_space)
|
|
|
|
: free_space;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
if (s.ok()) {
|
|
|
|
// In case of multi-DB instances, some of them may have experienced a
|
|
|
|
// soft error and some a hard error. In the SstFileManagerImpl, a hard
|
|
|
|
// error will basically override previously reported soft errors. Once
|
|
|
|
// we clear the hard error, we don't keep track of previous errors for
|
|
|
|
// now
|
|
|
|
if (bg_err_.severity() == Status::Severity::kHardError) {
|
|
|
|
if (free_space < reserved_disk_buffer_) {
|
2019-04-04 12:05:42 -07:00
|
|
|
ROCKS_LOG_ERROR(logger_,
|
|
|
|
"free space [%" PRIu64
|
|
|
|
" bytes] is less than "
|
|
|
|
"required disk buffer [%" PRIu64 " bytes]\n",
|
|
|
|
free_space, reserved_disk_buffer_);
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
|
|
|
|
s = Status::NoSpace();
|
|
|
|
}
|
|
|
|
} else if (bg_err_.severity() == Status::Severity::kSoftError) {
|
|
|
|
if (free_space < free_space_trigger_) {
|
2019-04-04 12:05:42 -07:00
|
|
|
ROCKS_LOG_WARN(logger_,
|
|
|
|
"free space [%" PRIu64
|
|
|
|
" bytes] is less than "
|
|
|
|
"free space for compaction trigger [%" PRIu64
|
|
|
|
" bytes]\n",
|
|
|
|
free_space, free_space_trigger_);
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
|
|
|
|
s = Status::NoSpace();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Someone could have called CancelErrorRecovery() and the list could have
|
|
|
|
// become empty, so check again here
|
|
|
|
if (s.ok() && !error_handler_list_.empty()) {
|
|
|
|
auto error_handler = error_handler_list_.front();
|
|
|
|
// Since we will release the mutex, set cur_instance_ to signal to the
|
|
|
|
// shutdown thread, if it calls CancelErrorRecovery() in the meantime,
|
|
|
|
// to indicate that this DB instance is busy. The DB instance is
|
|
|
|
// guaranteed to not be deleted before RecoverFromBGError() returns,
|
|
|
|
// since the ErrorHandler::recovery_in_prog_ flag would be true
|
|
|
|
cur_instance_ = error_handler;
|
|
|
|
mu_.Unlock();
|
|
|
|
s = error_handler->RecoverFromBGError();
|
2020-02-03 18:15:12 -08:00
|
|
|
TEST_SYNC_POINT("SstFileManagerImpl::ErrorCleared");
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accommodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only after certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurrence of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occurred
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recovery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
mu_.Lock();
|
|
|
|
// The DB instance might have been deleted while we were
|
|
|
|
// waiting for the mutex, so check cur_instance_ to make sure its
|
|
|
|
// still non-null
|
|
|
|
if (cur_instance_) {
|
|
|
|
// Check for error again, since the instance may have recovered but
|
|
|
|
// immediately got another error. If that's the case, and the new
|
|
|
|
// error is also a NoSpace() non-fatal error, leave the instance in
|
|
|
|
// the list
|
|
|
|
Status err = cur_instance_->GetBGError();
|
2021-07-20 18:08:55 -07:00
|
|
|
if (s.ok() && err.subcode() == IOStatus::SubCode::kNoSpace &&
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only after certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurrence of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occurred
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recovery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
err.severity() < Status::Severity::kFatalError) {
|
|
|
|
s = err;
|
|
|
|
}
|
|
|
|
cur_instance_ = nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok() || s.IsShutdownInProgress() ||
|
|
|
|
(!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
|
|
|
|
// If shutdown is in progress, abandon this handler instance
|
|
|
|
// and continue with the others
|
|
|
|
error_handler_list_.pop_front();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!error_handler_list_.empty()) {
|
|
|
|
// If there are more instances to be recovered, reschedule after 5
|
|
|
|
// seconds
|
2021-01-25 22:07:26 -08:00
|
|
|
int64_t wait_until = clock_->NowMicros() + 5000000;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only after certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurrence of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occurred
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recovery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
cv_.TimedWait(wait_until);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check again for error_handler_list_ empty, as a DB instance shutdown
|
|
|
|
// could have removed it from the queue while we were in timed wait
|
|
|
|
if (error_handler_list_.empty()) {
|
|
|
|
ROCKS_LOG_INFO(logger_, "Clearing error\n");
|
|
|
|
bg_err_ = Status::OK();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Records `bg_error` as the manager-wide background error and queues
// `handler` for recovery. When `handler` is the first entry in the queue, a
// fresh background thread running ClearError() is started.
//
// Severity handling:
//   - kSoftError: stored only if no background error is currently recorded.
//   - kHardError: always replaces the recorded background error.
//   - anything else: trips an assert.
//
// A handler that is already queued is not queued a second time.
void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
                                            Status bg_error) {
  MutexLock l(&mu_);

  const auto severity = bg_error.severity();
  if (severity == Status::Severity::kHardError) {
    bg_err_ = bg_error;
  } else if (severity == Status::Severity::kSoftError) {
    if (bg_err_.ok()) {
      // A non-OK bg_err_ puts the manager in degraded mode: every pending
      // compaction is assumed to fail the same way. The condition is lifted
      // once free space reaches the currently reserved compaction size, at
      // which point EnoughRoomForCompaction stops checking available disk
      // space.
      bg_err_ = bg_error;
    }
  } else {
    assert(false);
  }

  if (!error_handler_list_.empty()) {
    // Recovery is already under way; only add this DB instance if it is not
    // in the queue yet.
    for (const auto& queued : error_handler_list_) {
      if (queued == handler) {
        return;
      }
    }
    error_handler_list_.push_back(handler);
    return;
  }

  // First instance of this error: kick off a thread to poll for, and recover
  // from, the condition.
  error_handler_list_.push_back(handler);
  // The lock is dropped before joining. That is fine because the queue is now
  // non-empty, so no other caller of this function can reach this section.
  mu_.Unlock();
  if (bg_thread_) {
    bg_thread_->join();
  }
  // The previous thread has exited by now, so start a new one.
  bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this));
  // Re-acquire so the MutexLock guard releases a mutex that is actually held.
  mu_.Lock();
}
|
|
|
|
|
|
|
|
// Withdraws `handler` from error recovery.
//
// Returns true only when the handler was found in the pending queue and
// removed from it. Returns false when the handler is the instance currently
// being recovered (cur_instance_ is cleared in that case) or when the handler
// is not queued at all.
bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
  MutexLock l(&mu_);

  if (cur_instance_ == handler) {
    // The recovery thread is busy with this instance right now. Clear the
    // pointer so that thread does not touch the instance again.
    cur_instance_ = nullptr;
    return false;
  }

  // Locate the first queue entry that refers to this handler.
  auto pos = error_handler_list_.begin();
  while (pos != error_handler_list_.end() && *pos != handler) {
    ++pos;
  }
  if (pos == error_handler_list_.end()) {
    return false;
  }
  error_handler_list_.erase(pos);
  return true;
}
|
|
|
|
|
2018-04-26 13:51:39 -07:00
|
|
|
// Hands `file_path` to the delete scheduler and returns the scheduler's
// status. `path_to_sync` and `force_bg` are forwarded unchanged.
//
// The sync point callback receives a mutable pointer to `file_path`
// (constness is cast away for that purpose).
Status SstFileManagerImpl::ScheduleFileDeletion(
    const std::string& file_path, const std::string& path_to_sync,
    const bool force_bg) {
  TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::ScheduleFileDeletion",
                           const_cast<std::string*>(&file_path));
  Status deletion_status =
      delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg);
  return deletion_status;
}
|
|
|
|
|
|
|
|
void SstFileManagerImpl::WaitForEmptyTrash() {
|
|
|
|
delete_scheduler_.WaitForEmptyTrash();
|
|
|
|
}
|
|
|
|
|
|
|
|
void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
|
2021-03-17 20:43:22 -07:00
|
|
|
uint64_t file_size) {
|
2016-01-28 18:35:01 -08:00
|
|
|
auto tracked_file = tracked_files_.find(file_path);
|
|
|
|
if (tracked_file != tracked_files_.end()) {
|
|
|
|
// File was added before, we will just update the size
|
|
|
|
total_files_size_ -= tracked_file->second;
|
|
|
|
total_files_size_ += file_size;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 13:36:19 -07:00
|
|
|
cur_compactions_reserved_size_ -= file_size;
|
2016-01-28 18:35:01 -08:00
|
|
|
} else {
|
|
|
|
total_files_size_ += file_size;
|
|
|
|
}
|
|
|
|
tracked_files_[file_path] = file_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stops tracking `file_path` and removes its recorded size from
// total_files_size_. Does nothing if the file is not tracked.
void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) {
  const auto entry = tracked_files_.find(file_path);
  if (entry != tracked_files_.end()) {
    total_files_size_ -= entry->second;
    tracked_files_.erase(entry);
  }
}
|
|
|
|
|
|
|
|
SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log,
|
|
|
|
std::string trash_dir,
|
|
|
|
int64_t rate_bytes_per_sec,
|
2017-11-17 11:56:41 -08:00
|
|
|
bool delete_existing_trash, Status* status,
|
2018-03-22 15:42:44 -07:00
|
|
|
double max_trash_db_ratio,
|
|
|
|
uint64_t bytes_max_delete_chunk) {
|
2021-01-06 10:48:24 -08:00
|
|
|
const auto& fs = env->GetFileSystem();
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec,
|
|
|
|
delete_existing_trash, status, max_trash_db_ratio,
|
|
|
|
bytes_max_delete_chunk);
|
|
|
|
}
|
|
|
|
|
|
|
|
SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<FileSystem> fs,
|
|
|
|
std::shared_ptr<Logger> info_log,
|
|
|
|
const std::string& trash_dir,
|
|
|
|
int64_t rate_bytes_per_sec,
|
|
|
|
bool delete_existing_trash, Status* status,
|
|
|
|
double max_trash_db_ratio,
|
|
|
|
uint64_t bytes_max_delete_chunk) {
|
2021-01-25 22:07:26 -08:00
|
|
|
const auto& clock = env->GetSystemClock();
|
2016-01-28 18:35:01 -08:00
|
|
|
SstFileManagerImpl* res =
|
2021-01-25 22:07:26 -08:00
|
|
|
new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec,
|
2018-03-22 15:42:44 -07:00
|
|
|
max_trash_db_ratio, bytes_max_delete_chunk);
|
2016-01-28 18:35:01 -08:00
|
|
|
|
2017-10-27 13:25:54 -07:00
|
|
|
// trash_dir is deprecated and not needed anymore, but if user passed it
|
|
|
|
// we will still remove files in it.
|
2020-07-28 22:58:28 -07:00
|
|
|
Status s = Status::OK();
|
2017-10-27 13:25:54 -07:00
|
|
|
if (delete_existing_trash && trash_dir != "") {
|
|
|
|
std::vector<std::string> files_in_trash;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr);
|
2017-10-27 13:25:54 -07:00
|
|
|
if (s.ok()) {
|
|
|
|
for (const std::string& trash_file : files_in_trash) {
|
|
|
|
std::string path_in_trash = trash_dir + "/" + trash_file;
|
|
|
|
res->OnAddFile(path_in_trash);
|
2018-04-26 13:51:39 -07:00
|
|
|
Status file_delete =
|
|
|
|
res->ScheduleFileDeletion(path_in_trash, trash_dir);
|
2017-10-27 13:25:54 -07:00
|
|
|
if (s.ok() && !file_delete.ok()) {
|
|
|
|
s = file_delete;
|
2016-01-28 18:35:01 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status) {
|
|
|
|
*status = s;
|
2020-07-28 22:58:28 -07:00
|
|
|
} else {
|
|
|
|
// No one passed us a Status, so they must not care about the error...
|
|
|
|
s.PermitUncheckedError();
|
2016-01-28 18:35:01 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2016-12-21 17:35:00 -08:00
|
|
|
#else
|
|
|
|
|
2018-04-12 17:55:14 -07:00
|
|
|
// ROCKSDB_LITE stub: no manager is created. Reports NotSupported through
// `status` when the caller supplied one, and always returns nullptr. All
// other arguments are ignored.
SstFileManager* NewSstFileManager(Env* /*env*/,
                                  std::shared_ptr<Logger> /*info_log*/,
                                  std::string /*trash_dir*/,
                                  int64_t /*rate_bytes_per_sec*/,
                                  bool /*delete_existing_trash*/,
                                  Status* status, double /*max_trash_db_ratio*/,
                                  uint64_t /*bytes_max_delete_chunk*/) {
  if (status != nullptr) {
    *status =
        Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE");
  }
  return nullptr;
}
|
|
|
|
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|