2019-09-16 19:31:27 +02:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "file/writable_file_writer.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <mutex>
|
|
|
|
|
2020-02-11 00:42:46 +01:00
|
|
|
#include "db/version_edit.h"
|
2019-09-16 19:31:27 +02:00
|
|
|
#include "monitoring/histogram.h"
|
|
|
|
#include "monitoring/iostats_context_imp.h"
|
|
|
|
#include "port/port.h"
|
2021-01-26 07:07:26 +01:00
|
|
|
#include "rocksdb/system_clock.h"
|
2019-09-16 19:31:27 +02:00
|
|
|
#include "test_util/sync_point.h"
|
2021-02-11 07:18:33 +01:00
|
|
|
#include "util/crc32c.h"
|
2019-09-16 19:31:27 +02:00
|
|
|
#include "util/random.h"
|
|
|
|
#include "util/rate_limiter.h"
|
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2021-01-29 07:08:46 +01:00
|
|
|
Status WritableFileWriter::Create(const std::shared_ptr<FileSystem>& fs,
|
|
|
|
const std::string& fname,
|
|
|
|
const FileOptions& file_opts,
|
|
|
|
std::unique_ptr<WritableFileWriter>* writer,
|
|
|
|
IODebugContext* dbg) {
|
|
|
|
std::unique_ptr<FSWritableFile> file;
|
|
|
|
Status s = fs->NewWritableFile(fname, file_opts, &file, dbg);
|
|
|
|
if (s.ok()) {
|
|
|
|
writer->reset(new WritableFileWriter(std::move(file), fname, file_opts));
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::Append(const Slice& data) {
|
2019-09-16 19:31:27 +02:00
|
|
|
const char* src = data.data();
|
|
|
|
size_t left = data.size();
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus s;
|
2019-09-16 19:31:27 +02:00
|
|
|
pending_sync_ = true;
|
|
|
|
|
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Append:0",
|
|
|
|
rocksdb_kill_odds * REDUCE_ODDS2);
|
|
|
|
|
2020-05-14 23:56:10 +02:00
|
|
|
// Calculate the checksum of appended data
|
2020-05-21 17:10:43 +02:00
|
|
|
UpdateFileChecksum(data);
|
2020-05-14 23:56:10 +02:00
|
|
|
|
2019-09-16 19:31:27 +02:00
|
|
|
{
|
|
|
|
IOSTATS_TIMER_GUARD(prepare_write_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
|
|
|
writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left,
|
|
|
|
IOOptions(), nullptr);
|
2019-09-16 19:31:27 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// See whether we need to enlarge the buffer to avoid the flush
|
|
|
|
if (buf_.Capacity() - buf_.CurrentSize() < left) {
|
|
|
|
for (size_t cap = buf_.Capacity();
|
|
|
|
cap < max_buffer_size_; // There is still room to increase
|
|
|
|
cap *= 2) {
|
|
|
|
// See whether the next available size is large enough.
|
|
|
|
// Buffer will never be increased to more than max_buffer_size_.
|
|
|
|
size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
|
|
|
|
if (desired_capacity - buf_.CurrentSize() >= left ||
|
|
|
|
(use_direct_io() && desired_capacity == max_buffer_size_)) {
|
|
|
|
buf_.AllocateNewBuffer(desired_capacity, true);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Flush only when buffered I/O
|
|
|
|
if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
|
|
|
|
if (buf_.CurrentSize() > 0) {
|
|
|
|
s = Flush();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(buf_.CurrentSize() == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// We never write directly to disk with direct I/O on.
|
|
|
|
// or we simply use it for its original purpose to accumulate many small
|
|
|
|
// chunks
|
|
|
|
if (use_direct_io() || (buf_.Capacity() >= left)) {
|
|
|
|
while (left > 0) {
|
|
|
|
size_t appended = buf_.Append(src, left);
|
|
|
|
left -= appended;
|
|
|
|
src += appended;
|
|
|
|
|
|
|
|
if (left > 0) {
|
|
|
|
s = Flush();
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Writing directly to file bypassing the buffer
|
|
|
|
assert(buf_.CurrentSize() == 0);
|
|
|
|
s = WriteBuffered(src, left);
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds);
|
|
|
|
if (s.ok()) {
|
|
|
|
filesize_ += data.size();
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::Pad(const size_t pad_bytes) {
|
2019-09-16 19:31:27 +02:00
|
|
|
assert(pad_bytes < kDefaultPageSize);
|
|
|
|
size_t left = pad_bytes;
|
|
|
|
size_t cap = buf_.Capacity() - buf_.CurrentSize();
|
|
|
|
|
|
|
|
// Assume pad_bytes is small compared to buf_ capacity. So we always
|
|
|
|
// use buf_ rather than write directly to file in certain cases like
|
|
|
|
// Append() does.
|
|
|
|
while (left) {
|
|
|
|
size_t append_bytes = std::min(cap, left);
|
|
|
|
buf_.PadWith(append_bytes, 0);
|
|
|
|
left -= append_bytes;
|
|
|
|
if (left > 0) {
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus s = Flush();
|
2019-09-16 19:31:27 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cap = buf_.Capacity() - buf_.CurrentSize();
|
|
|
|
}
|
|
|
|
pending_sync_ = true;
|
|
|
|
filesize_ += pad_bytes;
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
return IOStatus::OK();
|
2019-09-16 19:31:27 +02:00
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::Close() {
|
2019-09-16 19:31:27 +02:00
|
|
|
// Do not quit immediately on failure the file MUST be closed
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus s;
|
2019-09-16 19:31:27 +02:00
|
|
|
|
|
|
|
// Possible to close it twice now as we MUST close
|
|
|
|
// in __dtor, simply flushing is not enough
|
|
|
|
// Windows when pre-allocating does not fill with zeros
|
|
|
|
// also with unbuffered access we also set the end of data.
|
2020-09-08 19:49:01 +02:00
|
|
|
if (writable_file_.get() == nullptr) {
|
2019-09-16 19:31:27 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
s = Flush(); // flush cache to OS
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus interim;
|
2019-09-16 19:31:27 +02:00
|
|
|
// In direct I/O mode we write whole pages so
|
|
|
|
// we need to let the file know where data ends.
|
|
|
|
if (use_direct_io()) {
|
2020-07-08 03:19:32 +02:00
|
|
|
{
|
|
|
|
#ifndef ROCKSDB_LITE
|
2020-07-22 17:53:21 +02:00
|
|
|
FileOperationInfo::StartTimePoint start_ts;
|
2020-07-08 03:19:32 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
start_ts = FileOperationInfo::StartNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr);
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
auto finish_ts = FileOperationInfo::FinishNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
NotifyOnFileTruncateFinish(start_ts, finish_ts, s);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2019-09-16 19:31:27 +02:00
|
|
|
if (interim.ok()) {
|
2020-07-08 03:19:32 +02:00
|
|
|
{
|
|
|
|
#ifndef ROCKSDB_LITE
|
2020-07-22 17:53:21 +02:00
|
|
|
FileOperationInfo::StartTimePoint start_ts;
|
2020-07-08 03:19:32 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
start_ts = FileOperationInfo::StartNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
interim = writable_file_->Fsync(IOOptions(), nullptr);
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
auto finish_ts = FileOperationInfo::FinishNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
NotifyOnFileSyncFinish(start_ts, finish_ts, s,
|
|
|
|
FileOperationType::kFsync);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2019-09-16 19:31:27 +02:00
|
|
|
}
|
|
|
|
if (!interim.ok() && s.ok()) {
|
|
|
|
s = interim;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds);
|
2020-07-08 03:19:32 +02:00
|
|
|
{
|
|
|
|
#ifndef ROCKSDB_LITE
|
2020-07-22 17:53:21 +02:00
|
|
|
FileOperationInfo::StartTimePoint start_ts;
|
2020-07-08 03:19:32 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
start_ts = FileOperationInfo::StartNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
interim = writable_file_->Close(IOOptions(), nullptr);
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
auto finish_ts = FileOperationInfo::FinishNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
NotifyOnFileCloseFinish(start_ts, finish_ts, s);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2019-09-16 19:31:27 +02:00
|
|
|
if (!interim.ok() && s.ok()) {
|
|
|
|
s = interim;
|
|
|
|
}
|
|
|
|
|
|
|
|
writable_file_.reset();
|
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds);
|
|
|
|
|
2020-03-30 00:57:02 +02:00
|
|
|
if (s.ok() && checksum_generator_ != nullptr && !checksum_finalized_) {
|
|
|
|
checksum_generator_->Finalize();
|
|
|
|
checksum_finalized_ = true;
|
|
|
|
}
|
|
|
|
|
2019-09-16 19:31:27 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// write out the cached data to the OS cache or storage if direct I/O
|
|
|
|
// enabled
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::Flush() {
|
|
|
|
IOStatus s;
|
2019-09-16 19:31:27 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
|
|
|
|
rocksdb_kill_odds * REDUCE_ODDS2);
|
|
|
|
|
|
|
|
if (buf_.CurrentSize() > 0) {
|
|
|
|
if (use_direct_io()) {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (pending_sync_) {
|
|
|
|
s = WriteDirect();
|
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
} else {
|
|
|
|
s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-08 03:19:32 +02:00
|
|
|
{
|
|
|
|
#ifndef ROCKSDB_LITE
|
2020-07-22 17:53:21 +02:00
|
|
|
FileOperationInfo::StartTimePoint start_ts;
|
2020-07-08 03:19:32 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
start_ts = FileOperationInfo::StartNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
s = writable_file_->Flush(IOOptions(), nullptr);
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
auto finish_ts = std::chrono::steady_clock::now();
|
2020-07-08 03:19:32 +02:00
|
|
|
NotifyOnFileFlushFinish(start_ts, finish_ts, s);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2019-09-16 19:31:27 +02:00
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// sync OS cache to disk for every bytes_per_sync_
|
|
|
|
// TODO: give log file and sst file different options (log
|
|
|
|
// files could be potentially cached in OS for their whole
|
|
|
|
// life time, thus we might not want to flush at all).
|
|
|
|
|
|
|
|
// We try to avoid sync to the last 1MB of data. For two reasons:
|
|
|
|
// (1) avoid rewrite the same page that is modified later.
|
|
|
|
// (2) for older version of OS, write can block while writing out
|
|
|
|
// the page.
|
|
|
|
// Xfs does neighbor page flushing outside of the specified ranges. We
|
|
|
|
// need to make sure sync range is far from the write offset.
|
|
|
|
if (!use_direct_io() && bytes_per_sync_) {
|
|
|
|
const uint64_t kBytesNotSyncRange =
|
|
|
|
1024 * 1024; // recent 1MB is not synced.
|
|
|
|
const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
|
|
|
|
if (filesize_ > kBytesNotSyncRange) {
|
|
|
|
uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
|
|
|
|
offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
|
|
|
|
assert(offset_sync_to >= last_sync_size_);
|
|
|
|
if (offset_sync_to > 0 &&
|
|
|
|
offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
|
|
|
|
s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
|
|
|
|
last_sync_size_ = offset_sync_to;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-03-30 00:57:02 +02:00
|
|
|
std::string WritableFileWriter::GetFileChecksum() {
|
|
|
|
if (checksum_generator_ != nullptr) {
|
2020-05-21 17:10:43 +02:00
|
|
|
assert(checksum_finalized_);
|
2020-03-30 00:57:02 +02:00
|
|
|
return checksum_generator_->GetChecksum();
|
|
|
|
} else {
|
|
|
|
return kUnknownFileChecksum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-02-11 00:42:46 +01:00
|
|
|
const char* WritableFileWriter::GetFileChecksumFuncName() const {
|
2020-03-30 00:57:02 +02:00
|
|
|
if (checksum_generator_ != nullptr) {
|
|
|
|
return checksum_generator_->Name();
|
2020-02-11 00:42:46 +01:00
|
|
|
} else {
|
2020-06-08 06:54:54 +02:00
|
|
|
return kUnknownFileChecksumFuncName;
|
2020-02-11 00:42:46 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::Sync(bool use_fsync) {
|
|
|
|
IOStatus s = Flush();
|
2019-09-16 19:31:27 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds);
|
|
|
|
if (!use_direct_io() && pending_sync_) {
|
|
|
|
s = SyncInternal(use_fsync);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds);
|
|
|
|
pending_sync_ = false;
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
return IOStatus::OK();
|
2019-09-16 19:31:27 +02:00
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
|
2019-09-16 19:31:27 +02:00
|
|
|
if (!writable_file_->IsSyncThreadSafe()) {
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
return IOStatus::NotSupported(
|
2019-09-16 19:31:27 +02:00
|
|
|
"Can't WritableFileWriter::SyncWithoutFlush() because "
|
|
|
|
"WritableFile::IsSyncThreadSafe() is false");
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus s = SyncInternal(use_fsync);
|
2019-09-16 19:31:27 +02:00
|
|
|
TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::SyncInternal(bool use_fsync) {
|
|
|
|
IOStatus s;
|
2019-09-16 19:31:27 +02:00
|
|
|
IOSTATS_TIMER_GUARD(fsync_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
|
|
|
|
auto prev_perf_level = GetPerfLevel();
|
2021-01-26 07:07:26 +01:00
|
|
|
IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
|
2020-07-08 03:19:32 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
2020-07-22 17:53:21 +02:00
|
|
|
FileOperationInfo::StartTimePoint start_ts;
|
2020-07-08 03:19:32 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
start_ts = FileOperationInfo::StartNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
}
|
|
|
|
#endif
|
2019-09-16 19:31:27 +02:00
|
|
|
if (use_fsync) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
|
|
|
s = writable_file_->Fsync(IOOptions(), nullptr);
|
2019-09-16 19:31:27 +02:00
|
|
|
} else {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
|
|
|
s = writable_file_->Sync(IOOptions(), nullptr);
|
2019-09-16 19:31:27 +02:00
|
|
|
}
|
2020-07-08 03:19:32 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
auto finish_ts = std::chrono::steady_clock::now();
|
2020-07-08 03:19:32 +02:00
|
|
|
NotifyOnFileSyncFinish(
|
|
|
|
start_ts, finish_ts, s,
|
|
|
|
use_fsync ? FileOperationType::kFsync : FileOperationType::kSync);
|
|
|
|
}
|
|
|
|
#endif
|
2019-09-16 19:31:27 +02:00
|
|
|
SetPerfLevel(prev_perf_level);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
|
2019-09-16 19:31:27 +02:00
|
|
|
IOSTATS_TIMER_GUARD(range_sync_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
|
2020-07-08 03:19:32 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
2020-07-22 17:53:21 +02:00
|
|
|
FileOperationInfo::StartTimePoint start_ts;
|
2020-07-08 03:19:32 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
start_ts = FileOperationInfo::StartNow();
|
2020-07-08 03:19:32 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
IOStatus s = writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr);
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
auto finish_ts = std::chrono::steady_clock::now();
|
2020-07-08 03:19:32 +02:00
|
|
|
NotifyOnFileRangeSyncFinish(offset, nbytes, start_ts, finish_ts, s);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
return s;
|
2019-09-16 19:31:27 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// This method writes to disk the specified data and makes use of the rate
|
|
|
|
// limiter if available
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) {
|
|
|
|
IOStatus s;
|
2019-09-16 19:31:27 +02:00
|
|
|
assert(!use_direct_io());
|
|
|
|
const char* src = data;
|
|
|
|
size_t left = size;
|
2021-02-11 07:18:33 +01:00
|
|
|
DataVerificationInfo v_info;
|
|
|
|
char checksum_buf[sizeof(uint32_t)];
|
2019-09-16 19:31:27 +02:00
|
|
|
|
|
|
|
while (left > 0) {
|
|
|
|
size_t allowed;
|
|
|
|
if (rate_limiter_ != nullptr) {
|
|
|
|
allowed = rate_limiter_->RequestToken(
|
|
|
|
left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_,
|
|
|
|
RateLimiter::OpType::kWrite);
|
|
|
|
} else {
|
|
|
|
allowed = left;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
IOSTATS_TIMER_GUARD(write_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
2020-07-22 17:53:21 +02:00
|
|
|
FileOperationInfo::StartTimePoint start_ts;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 23:47:08 +01:00
|
|
|
uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr);
|
2019-09-16 19:31:27 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
start_ts = FileOperationInfo::StartNow();
|
2019-09-16 19:31:27 +02:00
|
|
|
old_size = next_write_offset_;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
auto prev_perf_level = GetPerfLevel();
|
2021-02-11 07:18:33 +01:00
|
|
|
|
2021-01-26 07:07:26 +01:00
|
|
|
IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
|
2021-02-11 07:18:33 +01:00
|
|
|
if (perform_data_verification_) {
|
|
|
|
Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf);
|
|
|
|
v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
|
|
|
|
s = writable_file_->Append(Slice(src, allowed), IOOptions(), v_info,
|
|
|
|
nullptr);
|
|
|
|
} else {
|
|
|
|
s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr);
|
|
|
|
}
|
2019-09-16 19:31:27 +02:00
|
|
|
SetPerfLevel(prev_perf_level);
|
|
|
|
}
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
auto finish_ts = std::chrono::steady_clock::now();
|
2019-09-16 19:31:27 +02:00
|
|
|
NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_written, allowed);
|
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds);
|
|
|
|
|
|
|
|
left -= allowed;
|
|
|
|
src += allowed;
|
|
|
|
}
|
|
|
|
buf_.Size(0);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-05-21 17:10:43 +02:00
|
|
|
void WritableFileWriter::UpdateFileChecksum(const Slice& data) {
|
2020-03-30 00:57:02 +02:00
|
|
|
if (checksum_generator_ != nullptr) {
|
|
|
|
checksum_generator_->Update(data.data(), data.size());
|
2020-02-11 00:42:46 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-11 07:18:33 +01:00
|
|
|
// Currently, crc32c checksum is used to calculate the checksum value of the
|
|
|
|
// content in the input buffer for handoff. In the future, the checksum might be
|
|
|
|
// calculated from the existing crc32c checksums of the in WAl and Manifest
|
|
|
|
// records, or even SST file blocks.
|
|
|
|
// TODO: effectively use the existing checksum of the data being writing to
|
|
|
|
// generate the crc32c checksum instead of a raw calculation.
|
|
|
|
void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data,
|
|
|
|
size_t size,
|
|
|
|
char* buf) {
|
|
|
|
uint32_t v_crc32c = crc32c::Extend(0, data, size);
|
|
|
|
EncodeFixed32(buf, v_crc32c);
|
|
|
|
}
|
|
|
|
|
2019-09-16 19:31:27 +02:00
|
|
|
// This flushes the accumulated data in the buffer. We pad data with zeros if
|
|
|
|
// necessary to the whole page.
|
|
|
|
// However, during automatic flushes padding would not be necessary.
|
|
|
|
// We always use RateLimiter if available. We move (Refit) any buffer bytes
|
|
|
|
// that are left over the
|
|
|
|
// whole number of pages to be written again on the next flush because we can
|
|
|
|
// only write on aligned
|
|
|
|
// offsets.
|
|
|
|
#ifndef ROCKSDB_LITE
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus WritableFileWriter::WriteDirect() {
|
2019-09-16 19:31:27 +02:00
|
|
|
assert(use_direct_io());
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-28 00:03:05 +01:00
|
|
|
IOStatus s;
|
2019-09-16 19:31:27 +02:00
|
|
|
const size_t alignment = buf_.Alignment();
|
|
|
|
assert((next_write_offset_ % alignment) == 0);
|
|
|
|
|
|
|
|
// Calculate whole page final file advance if all writes succeed
|
|
|
|
size_t file_advance = TruncateToPageBoundary(alignment, buf_.CurrentSize());
|
|
|
|
|
|
|
|
// Calculate the leftover tail, we write it here padded with zeros BUT we
|
|
|
|
// will write
|
|
|
|
// it again in the future either on Close() OR when the current whole page
|
|
|
|
// fills out
|
|
|
|
size_t leftover_tail = buf_.CurrentSize() - file_advance;
|
|
|
|
|
|
|
|
// Round up and pad
|
|
|
|
buf_.PadToAlignmentWith(0);
|
|
|
|
|
|
|
|
const char* src = buf_.BufferStart();
|
|
|
|
uint64_t write_offset = next_write_offset_;
|
|
|
|
size_t left = buf_.CurrentSize();
|
2021-02-11 07:18:33 +01:00
|
|
|
DataVerificationInfo v_info;
|
|
|
|
char checksum_buf[sizeof(uint32_t)];
|
2019-09-16 19:31:27 +02:00
|
|
|
|
|
|
|
while (left > 0) {
|
|
|
|
// Check how much is allowed
|
|
|
|
size_t size;
|
|
|
|
if (rate_limiter_ != nullptr) {
|
|
|
|
size = rate_limiter_->RequestToken(left, buf_.Alignment(),
|
|
|
|
writable_file_->GetIOPriority(),
|
|
|
|
stats_, RateLimiter::OpType::kWrite);
|
|
|
|
} else {
|
|
|
|
size = left;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
IOSTATS_TIMER_GUARD(write_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
|
2020-07-22 17:53:21 +02:00
|
|
|
FileOperationInfo::StartTimePoint start_ts;
|
2019-09-16 19:31:27 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
start_ts = FileOperationInfo::StartNow();
|
2019-09-16 19:31:27 +02:00
|
|
|
}
|
|
|
|
// direct writes must be positional
|
2021-02-11 07:18:33 +01:00
|
|
|
if (perform_data_verification_) {
|
|
|
|
Crc32cHandoffChecksumCalculation(src, size, checksum_buf);
|
|
|
|
v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
|
|
|
|
s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
|
|
|
|
IOOptions(), v_info, nullptr);
|
|
|
|
} else {
|
|
|
|
s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
|
|
|
|
IOOptions(), nullptr);
|
|
|
|
}
|
|
|
|
|
2019-09-16 19:31:27 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2020-07-22 17:53:21 +02:00
|
|
|
auto finish_ts = std::chrono::steady_clock::now();
|
2019-09-16 19:31:27 +02:00
|
|
|
NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
buf_.Size(file_advance + leftover_tail);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_written, size);
|
|
|
|
left -= size;
|
|
|
|
src += size;
|
|
|
|
write_offset += size;
|
|
|
|
assert((next_write_offset_ % alignment) == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
// Move the tail to the beginning of the buffer
|
|
|
|
// This never happens during normal Append but rather during
|
|
|
|
// explicit call to Flush()/Sync() or Close()
|
|
|
|
buf_.RefitTail(file_advance, leftover_tail);
|
|
|
|
// This is where we start writing next time which may or not be
|
|
|
|
// the actual file size on disk. They match if the buffer size
|
|
|
|
// is a multiple of whole pages otherwise filesize_ is leftover_tail
|
|
|
|
// behind
|
|
|
|
next_write_offset_ += file_advance;
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_LITE
|
2020-02-20 21:07:53 +01:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|