Fix stress test failure "Corruption: checksum mismatch" or "Iterator Diverged" with async_io enabled (#10032)
Summary: In case of non sequential reads with `async_io`, `FilePRefetchBuffer::TryReadFromCacheAsync` can be called for previous blocks with `offset < bufs_[curr_].offset_` which wasn't handled correctly resulting wrong data being returned from buffer. Since `FilePRefetchBuffer::PrefetchAsync` can be called for any data block, it sets `prev_len_` to 0 indicating `FilePRefetchBuffer::TryReadFromCacheAsync` to go for the prefetching even though offset < bufs_[curr_].offset_ This is because async prefetching is always done in second buffer (to avoid mutex) even though curr_ is empty leading to offset < bufs_[curr_].offset_ in some cases. If prev_len_ is non zero then `TryReadFromCacheAsync` returns false if `offset < bufs_[curr_].offset_ && prev_len != 0` indicating reads are not sequential and previous call wasn't PrefetchAsync. - This PR also simplifies `FilePRefetchBuffer::TryReadFromCacheAsync` as it was getting complicated covering different scenarios based on `async_io` enabled/disabled. If `for_compaction` is set true, it now calls `FilePRefetchBufferTryReadFromCache` following synchronous flow as before. Its decided in BlockFetcher.cc Pull Request resolved: https://github.com/facebook/rocksdb/pull/10032 Test Plan: 1. export CRASH_TEST_EXT_ARGS=" --async_io=1" make crash_test -j completed successfully locally 2. make crash_test -j completed successfully locally 3. Reran CircleCi mini crashtest job 4 - 5 times. 4. Updated prefetch_test for more coverage. Reviewed By: anand1976 Differential Revision: D36579858 Pulled By: akankshamahajan15 fbshipit-source-id: 0c428d62b45e12e082a83acf533a5e37a584bedf
This commit is contained in:
parent
bea5831bff
commit
a479c2c2b2
@ -447,18 +447,23 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
|
|||||||
bool FilePrefetchBuffer::TryReadFromCacheAsync(
|
bool FilePrefetchBuffer::TryReadFromCacheAsync(
|
||||||
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
|
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
|
||||||
size_t n, Slice* result, Status* status,
|
size_t n, Slice* result, Status* status,
|
||||||
Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */
|
Env::IOPriority rate_limiter_priority) {
|
||||||
) {
|
assert(async_io_);
|
||||||
|
|
||||||
if (track_min_offset_ && offset < min_offset_read_) {
|
if (track_min_offset_ && offset < min_offset_read_) {
|
||||||
min_offset_read_ = static_cast<size_t>(offset);
|
min_offset_read_ = static_cast<size_t>(offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!enable_) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// In case of async_io_, offset can be less than bufs_[curr_].offset_ because
|
// In case of async_io_, offset can be less than bufs_[curr_].offset_ because
|
||||||
// of reads not sequential and PrefetchAsync can be called for any block and
|
// of reads not sequential and PrefetchAsync can be called for any block and
|
||||||
// RocksDB will call TryReadFromCacheAsync after PrefetchAsync to Poll for
|
// RocksDB will call TryReadFromCacheAsync after PrefetchAsync to Poll for
|
||||||
// requested bytes. IsEligibleForPrefetch API will return false in case reads
|
// requested bytes.
|
||||||
// are not sequential and Non sequential reads will be handled there.
|
if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_ &&
|
||||||
if (!enable_ || (offset < bufs_[curr_].offset_ && async_io_ == false)) {
|
prev_len_ != 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -476,31 +481,19 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
|
|||||||
Status s;
|
Status s;
|
||||||
assert(reader != nullptr);
|
assert(reader != nullptr);
|
||||||
assert(max_readahead_size_ >= readahead_size_);
|
assert(max_readahead_size_ >= readahead_size_);
|
||||||
if (for_compaction) {
|
|
||||||
s = Prefetch(opts, reader, offset, std::max(n, readahead_size_),
|
if (implicit_auto_readahead_) {
|
||||||
rate_limiter_priority);
|
if (!IsEligibleForPrefetch(offset, n)) {
|
||||||
} else {
|
// Ignore status as Prefetch is not called.
|
||||||
if (implicit_auto_readahead_) {
|
s.PermitUncheckedError();
|
||||||
if (!IsEligibleForPrefetch(offset, n)) {
|
return false;
|
||||||
// Ignore status as Prefetch is not called.
|
|
||||||
s.PermitUncheckedError();
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// async prefetching is enabled if it's implicit_auto_readahead_ or
|
|
||||||
// explicit readahead_size_ is passed along with ReadOptions.async_io =
|
|
||||||
// true.
|
|
||||||
if (async_io_) {
|
|
||||||
// Prefetch n + readahead_size_/2 synchronously as remaining
|
|
||||||
// readahead_size_/2 will be prefetched asynchronously.
|
|
||||||
s = PrefetchAsyncInternal(opts, reader, offset, n,
|
|
||||||
readahead_size_ / 2, rate_limiter_priority,
|
|
||||||
copy_to_third_buffer);
|
|
||||||
} else {
|
|
||||||
s = Prefetch(opts, reader, offset, n + readahead_size_,
|
|
||||||
rate_limiter_priority);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prefetch n + readahead_size_/2 synchronously as remaining
|
||||||
|
// readahead_size_/2 will be prefetched asynchronously.
|
||||||
|
s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2,
|
||||||
|
rate_limiter_priority, copy_to_third_buffer);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
if (status) {
|
if (status) {
|
||||||
*status = s;
|
*status = s;
|
||||||
@ -574,7 +567,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
|
|||||||
// Index of second buffer.
|
// Index of second buffer.
|
||||||
uint32_t second = curr_ ^ 1;
|
uint32_t second = curr_ ^ 1;
|
||||||
|
|
||||||
// Since PrefetchAsync can be called on non sequqential reads. So offset can
|
// Since PrefetchAsync can be called on non sequential reads. So offset can
|
||||||
// be less than buffers' offset. In that case it clears the buffer and
|
// be less than buffers' offset. In that case it clears the buffer and
|
||||||
// prefetch that block.
|
// prefetch that block.
|
||||||
if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_) {
|
if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_) {
|
||||||
|
@ -89,6 +89,7 @@ class FilePrefetchBuffer {
|
|||||||
// while curr_ is being consumed. If data is overlapping in two buffers,
|
// while curr_ is being consumed. If data is overlapping in two buffers,
|
||||||
// data is copied to third buffer to return continuous buffer.
|
// data is copied to third buffer to return continuous buffer.
|
||||||
bufs_.resize(3);
|
bufs_.resize(3);
|
||||||
|
(void)async_io_;
|
||||||
}
|
}
|
||||||
|
|
||||||
~FilePrefetchBuffer() {
|
~FilePrefetchBuffer() {
|
||||||
@ -170,8 +171,7 @@ class FilePrefetchBuffer {
|
|||||||
bool TryReadFromCacheAsync(const IOOptions& opts,
|
bool TryReadFromCacheAsync(const IOOptions& opts,
|
||||||
RandomAccessFileReader* reader, uint64_t offset,
|
RandomAccessFileReader* reader, uint64_t offset,
|
||||||
size_t n, Slice* result, Status* status,
|
size_t n, Slice* result, Status* status,
|
||||||
Env::IOPriority rate_limiter_priority,
|
Env::IOPriority rate_limiter_priority);
|
||||||
bool for_compaction);
|
|
||||||
|
|
||||||
// The minimum `offset` ever passed to TryReadFromCache(). This will nly be
|
// The minimum `offset` ever passed to TryReadFromCache(). This will nly be
|
||||||
// tracked if track_min_offset = true.
|
// tracked if track_min_offset = true.
|
||||||
|
@ -1252,7 +1252,6 @@ TEST_P(PrefetchTest2, DecreaseReadAheadIfInCache) {
|
|||||||
} else {
|
} else {
|
||||||
ASSERT_GT(async_read_bytes.count, 0);
|
ASSERT_GT(async_read_bytes.count, 0);
|
||||||
ASSERT_GT(get_perf_context()->number_async_seek, 0);
|
ASSERT_GT(get_perf_context()->number_async_seek, 0);
|
||||||
//"ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1349,12 +1348,15 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) {
|
|||||||
// Read the keys.
|
// Read the keys.
|
||||||
{
|
{
|
||||||
ASSERT_OK(options.statistics->Reset());
|
ASSERT_OK(options.statistics->Reset());
|
||||||
|
get_perf_context()->Reset();
|
||||||
|
|
||||||
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
||||||
int num_keys = 0;
|
int num_keys = 0;
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||||
ASSERT_OK(iter->status());
|
ASSERT_OK(iter->status());
|
||||||
num_keys++;
|
num_keys++;
|
||||||
}
|
}
|
||||||
|
|
||||||
ASSERT_EQ(num_keys, total_keys);
|
ASSERT_EQ(num_keys, total_keys);
|
||||||
ASSERT_GT(buff_prefetch_count, 0);
|
ASSERT_GT(buff_prefetch_count, 0);
|
||||||
|
|
||||||
@ -1375,6 +1377,55 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) {
|
|||||||
}
|
}
|
||||||
ASSERT_GT(prefetched_bytes_discarded.count, 0);
|
ASSERT_GT(prefetched_bytes_discarded.count, 0);
|
||||||
}
|
}
|
||||||
|
ASSERT_EQ(get_perf_context()->number_async_seek, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Read the keys using seek.
|
||||||
|
{
|
||||||
|
ASSERT_OK(options.statistics->Reset());
|
||||||
|
get_perf_context()->Reset();
|
||||||
|
|
||||||
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
||||||
|
int num_keys = 0;
|
||||||
|
iter->Seek(BuildKey(450));
|
||||||
|
while (iter->Valid()) {
|
||||||
|
ASSERT_OK(iter->status());
|
||||||
|
num_keys++;
|
||||||
|
iter->Next();
|
||||||
|
}
|
||||||
|
ASSERT_OK(iter->status());
|
||||||
|
|
||||||
|
iter->Seek(BuildKey(450));
|
||||||
|
while (iter->Valid()) {
|
||||||
|
ASSERT_OK(iter->status());
|
||||||
|
num_keys++;
|
||||||
|
iter->Prev();
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_EQ(num_keys, total_keys + 1);
|
||||||
|
ASSERT_GT(buff_prefetch_count, 0);
|
||||||
|
|
||||||
|
// Check stats to make sure async prefetch is done.
|
||||||
|
{
|
||||||
|
HistogramData async_read_bytes;
|
||||||
|
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
||||||
|
HistogramData prefetched_bytes_discarded;
|
||||||
|
options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
|
||||||
|
&prefetched_bytes_discarded);
|
||||||
|
|
||||||
|
// Not all platforms support iouring. In that case, ReadAsync in posix
|
||||||
|
// won't submit async requests.
|
||||||
|
if (read_async_called) {
|
||||||
|
ASSERT_GT(async_read_bytes.count, 0);
|
||||||
|
ASSERT_GT(get_perf_context()->number_async_seek, 0);
|
||||||
|
} else {
|
||||||
|
ASSERT_EQ(async_read_bytes.count, 0);
|
||||||
|
ASSERT_EQ(get_perf_context()->number_async_seek, 0);
|
||||||
|
}
|
||||||
|
ASSERT_GT(prefetched_bytes_discarded.count, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SyncPoint::GetInstance()->DisableProcessing();
|
SyncPoint::GetInstance()->DisableProcessing();
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
#include "table/block_fetcher.h"
|
#include "table/block_fetcher.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@ -72,10 +73,10 @@ inline bool BlockFetcher::TryGetFromPrefetchBuffer() {
|
|||||||
IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
|
IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
|
||||||
if (io_s.ok()) {
|
if (io_s.ok()) {
|
||||||
bool read_from_prefetch_buffer = false;
|
bool read_from_prefetch_buffer = false;
|
||||||
if (read_options_.async_io) {
|
if (read_options_.async_io && !for_compaction_) {
|
||||||
read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCacheAsync(
|
read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCacheAsync(
|
||||||
opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
|
opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
|
||||||
&io_s, read_options_.rate_limiter_priority, for_compaction_);
|
&io_s, read_options_.rate_limiter_priority);
|
||||||
} else {
|
} else {
|
||||||
read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache(
|
read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache(
|
||||||
opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
|
opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
|
||||||
@ -349,20 +350,20 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() {
|
|||||||
#endif // NDEBUG
|
#endif // NDEBUG
|
||||||
return IOStatus::OK();
|
return IOStatus::OK();
|
||||||
} else if (!TryGetCompressedBlockFromPersistentCache()) {
|
} else if (!TryGetCompressedBlockFromPersistentCache()) {
|
||||||
if (prefetch_buffer_ != nullptr) {
|
assert(prefetch_buffer_ != nullptr);
|
||||||
|
if (!for_compaction_) {
|
||||||
IOOptions opts;
|
IOOptions opts;
|
||||||
IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
|
IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
|
||||||
|
if (!io_s.ok()) {
|
||||||
|
return io_s;
|
||||||
|
}
|
||||||
|
io_s = status_to_io_status(prefetch_buffer_->PrefetchAsync(
|
||||||
|
opts, file_, handle_.offset(), block_size_with_trailer_,
|
||||||
|
read_options_.rate_limiter_priority, &slice_));
|
||||||
|
if (io_s.IsTryAgain()) {
|
||||||
|
return io_s;
|
||||||
|
}
|
||||||
if (io_s.ok()) {
|
if (io_s.ok()) {
|
||||||
io_s = status_to_io_status(prefetch_buffer_->PrefetchAsync(
|
|
||||||
opts, file_, handle_.offset(), block_size_with_trailer_,
|
|
||||||
read_options_.rate_limiter_priority, &slice_));
|
|
||||||
if (io_s.IsTryAgain()) {
|
|
||||||
return io_s;
|
|
||||||
}
|
|
||||||
if (!io_s.ok()) {
|
|
||||||
// Fallback to sequential reading of data blocks.
|
|
||||||
return ReadBlockContents();
|
|
||||||
}
|
|
||||||
// Data Block is already in prefetch.
|
// Data Block is already in prefetch.
|
||||||
got_from_prefetch_buffer_ = true;
|
got_from_prefetch_buffer_ = true;
|
||||||
ProcessTrailerIfPresent();
|
ProcessTrailerIfPresent();
|
||||||
@ -388,8 +389,12 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() {
|
|||||||
GetBlockContents();
|
GetBlockContents();
|
||||||
}
|
}
|
||||||
InsertUncompressedBlockToPersistentCacheIfNeeded();
|
InsertUncompressedBlockToPersistentCacheIfNeeded();
|
||||||
|
return io_status_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Fallback to sequential reading of data blocks in case of io_s returns
|
||||||
|
// error or for_compaction_is true.
|
||||||
|
return ReadBlockContents();
|
||||||
}
|
}
|
||||||
return io_status_;
|
return io_status_;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user