More robust checking of IO uring completion data (#8894)

Summary:
Potential bugs in the IO uring implementation can cause bad data to be returned in the completion queue. Add checks in the PosixRandomAccessFile::MultiRead completion handling code to catch such errors and fail the entire MultiRead. Also log a diagnostic message and print a stack trace.
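
For illustration only, here is a minimal, self-contained sketch of the same defensive pattern the patch applies in PosixRandomAccessFile::MultiRead: remember every pointer passed to io_uring_sqe_set_data(), and when reaping completions treat any cqe whose user data is unknown or already consumed as corruption that fails the whole batch. This is not RocksDB code; it assumes liburing is installed, and the file path, read sizes, and the PendingRead type are made up for the example.

#include <fcntl.h>
#include <liburing.h>
#include <sys/uio.h>
#include <unistd.h>

#include <cstdio>
#include <unordered_set>
#include <vector>

// Each submitted read owns its buffer and iovec; its address doubles as the
// io_uring user data, similar to WrappedReadRequest in the patch.
struct PendingRead {
  off_t offset = 0;
  std::vector<char> buf;
  struct iovec iov{};
};

int main() {
  int fd = open("/etc/hostname", O_RDONLY);  // illustrative input file
  if (fd < 0) {
    perror("open");
    return 1;
  }
  struct io_uring ring;
  if (io_uring_queue_init(8, &ring, 0) != 0) {
    perror("io_uring_queue_init");
    return 1;
  }

  // Submit two reads and remember exactly which pointers were handed to the
  // kernel via io_uring_sqe_set_data().
  std::vector<PendingRead> reads(2);
  std::unordered_set<PendingRead*> in_flight;
  for (size_t i = 0; i < reads.size(); ++i) {
    reads[i].offset = static_cast<off_t>(i * 16);
    reads[i].buf.resize(16);
    reads[i].iov.iov_base = reads[i].buf.data();
    reads[i].iov.iov_len = reads[i].buf.size();
    struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
    io_uring_prep_readv(sqe, fd, &reads[i].iov, 1, reads[i].offset);
    io_uring_sqe_set_data(sqe, &reads[i]);
    in_flight.insert(&reads[i]);
  }
  io_uring_submit(&ring);

  // Reap completions. A cqe whose user data was never submitted, or was
  // already consumed, indicates corruption, so the whole batch is failed
  // rather than trusting the stray pointer.
  bool batch_ok = true;
  for (size_t done = 0; done < reads.size(); ++done) {
    struct io_uring_cqe* cqe = nullptr;
    if (io_uring_wait_cqe(&ring, &cqe) != 0) {
      batch_ok = false;
      break;
    }
    auto* req = static_cast<PendingRead*>(io_uring_cqe_get_data(cqe));
    auto it = in_flight.find(req);
    if (it == in_flight.end()) {
      fprintf(stderr, "Bad cqe data from io_uring - %p\n",
              static_cast<void*>(req));
      batch_ok = false;
    } else {
      in_flight.erase(it);
      printf("read at offset %lld returned %d\n",
             static_cast<long long>(req->offset), cqe->res);
    }
    io_uring_cqe_seen(&ring, cqe);
  }
  printf("batch %s\n", batch_ok ? "ok" : "failed");

  io_uring_queue_exit(&ring);
  close(fd);
  return batch_ok ? 0 : 1;
}

Build with something like g++ -std=c++17 sketch.cc -luring; the unordered_set makes both unknown and duplicated completion pointers detectable, which is the same reason the patch uses one.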

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8894

Reviewed By: siying, pdillinger

Differential Revision: D30826982

Pulled By: anand1976

fbshipit-source-id: af91815ac760e095d6cc0466cf8bd5c10167fd15
anand76 2021-09-15 12:43:35 -07:00 committed by Facebook GitHub Bot
parent 8df334342e
commit 7743f033b1
3 changed files with 28 additions and 5 deletions


@@ -10,6 +10,7 @@
 * Fix a bug on POSIX in which failure to create a lock file (e.g. out of space) can prevent future LockFile attempts in the same process on the same file from succeeding.
 * Fix a bug that backup_rate_limiter and restore_rate_limiter in BackupEngine could not limit read rates.
 * Fix WAL log data corruption when using DBOptions.manual_wal_flush(true) and WriteOptions.sync(true) together. The sync WAL should work with locked log_write_mutex_.
+* Add checks for validity of the IO uring completion queue entries, and fail the BlockBasedTableReader MultiGet sub-batch if there's an invalid completion
 ### New Features
 * RemoteCompaction's interface now includes `db_name`, `db_id`, `session_id`, which could help the user uniquely identify compaction job between db instances and sessions.

env/io_posix.cc

@@ -31,6 +31,7 @@
 #endif
 #include "monitoring/iostats_context_imp.h"
 #include "port/port.h"
+#include "port/stack_trace.h"
 #include "rocksdb/slice.h"
 #include "test_util/sync_point.h"
 #include "util/autovector.h"
@@ -644,6 +645,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
   autovector<WrappedReadRequest, 32> req_wraps;
   autovector<WrappedReadRequest*, 4> incomplete_rq_list;
+  std::unordered_set<WrappedReadRequest*> wrap_cache;
   for (size_t i = 0; i < num_reqs; i++) {
     req_wraps.emplace_back(&reqs[i]);
@@ -676,6 +678,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
           sqe, fd_, &rep_to_submit->iov, 1,
           rep_to_submit->req->offset + rep_to_submit->finished_len);
       io_uring_sqe_set_data(sqe, rep_to_submit);
+      wrap_cache.emplace(rep_to_submit);
     }
     incomplete_rq_list.clear();
@@ -724,6 +727,22 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
       }
       req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
+      // Reset cqe data to catch any stray reuse of it
+      static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+      // Check that we got a valid unique cqe data
+      auto wrap_check = wrap_cache.find(req_wrap);
+      if (wrap_check == wrap_cache.end()) {
+        fprintf(stderr,
+                "PosixRandomAccessFile::MultiRead: "
+                "Bad cqe data from IO uring - %p\n",
+                req_wrap);
+        port::PrintStack();
+        ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
+                                ToString((uint64_t)req_wrap));
+        continue;
+      }
+      wrap_cache.erase(wrap_check);
       FSReadRequest* req = req_wrap->req;
       if (cqe->res < 0) {
         req->result = Slice(req->scratch, 0);
@@ -769,6 +788,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
       }
       io_uring_cqe_seen(iu, cqe);
     }
+    wrap_cache.clear();
   }
   return ios;
 #else


@@ -1728,14 +1728,16 @@ void BlockBasedTable::RetrieveMultipleBlocks(
   {
     IOOptions opts;
     IOStatus s = file->PrepareIOOptions(options, opts);
-    if (s.IsTimedOut()) {
+    if (s.ok()) {
+      s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(),
+                          &direct_io_buf);
+    }
+    if (!s.ok()) {
+      // Discard all the results in this batch if there is any time out
+      // or overall MultiRead error
       for (FSReadRequest& req : read_reqs) {
         req.status = s;
       }
-    } else {
-      // How to handle this status code?
-      file->MultiRead(opts, &read_reqs[0], read_reqs.size(), &direct_io_buf)
-          .PermitUncheckedError();
     }
   }