Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO. We see a **4.5X performance improvement** in the seekrandom benchmark doing long range scans with direct reads, on flash.

**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO and storing it in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts at 8KB and is exponentially increased on each additional sequential IO, up to a max of 256KB. This helps in cutting down the number of IOs needed to complete the range scan.

**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take `file_reader`, `readahead_size` and `max_readahead_size` as constructor arguments and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (the underlying store for `FilePrefetchBuffer`) now takes a few additional arguments in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read, from the device, partial chunks of data that are already available in the buffer.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being kept up to date.

**Constraints:**
- Similar to #3282, this is currently enabled only when ReadOptions.readahead_size = 0 (the default value).
- Since the prefetched data is stored in a temporary buffer allocated on the heap, this could increase memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and this feature takes care not to interfere with them.

**Benchmarks:**
I used the same benchmark as in #3282.

Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```

Do a long range scan: seekrandom with a large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```

```
Before:
seekrandom : 37939.906 micros/op  26 ops/sec;  29.2 MB/s (1636 of 1999 found)

With this change:
seekrandom :  8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement, taken as an average of 3 runs.

Closes https://github.com/facebook/rocksdb/pull/3884

Differential Revision: D8082143

Pulled By: sagar0

fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
parent 524c6e6b72
commit 7103559f49
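To make the readahead schedule described in the summary concrete, here is a minimal standalone sketch (illustrative only, not part of this PR or of RocksDB): it models a scan that starts prefetching after the first two reads of a table file, doubles the prefetch size on each further sequential IO from 8 KB up to the 256 KB cap, and compares the resulting IO count with plain block-at-a-time reads. The 4 KB block size and 8 MB scan length are assumptions picked purely for illustration.

```cpp
// Standalone sketch (not RocksDB source): models the readahead schedule
// described above and prints how many device IOs a range scan would need
// with and without it.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t kBlockSize = 4 * 1024;         // assumed data block size
  const uint64_t kInitReadahead = 8 * 1024;     // from the description
  const uint64_t kMaxReadahead = 256 * 1024;    // from the description
  const uint64_t kScanBytes = 8 * 1024 * 1024;  // assumed range-scan length

  uint64_t ios_without = kScanBytes / kBlockSize;

  uint64_t ios_with = 0, covered = 0, readahead = kInitReadahead;
  int reads_seen = 0;
  while (covered < kScanBytes) {
    ++ios_with;
    ++reads_seen;
    if (reads_seen <= 2) {
      covered += kBlockSize;  // first two IOs: readahead not yet triggered
    } else {
      covered += readahead;   // one IO now prefetches `readahead` bytes
      readahead = std::min(kMaxReadahead, readahead * 2);
    }
  }
  std::cout << "IOs without readahead: " << ios_without << "\n"
            << "IOs with readahead:    " << ios_with << "\n";
  return 0;
}
```

Under these assumptions the prefetching path needs roughly 40 device IOs instead of about 2000, which is the mechanism behind the seekrandom numbers quoted above.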
```diff
@@ -6,6 +6,7 @@
 
 ### New Features
 * Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used.
+* Improve the performance of iterators doing long range scans by using readahead, when using direct IO.
 
 ### Bug Fixes
 * fix deadlock with enable_pipelined_write=true and max_successive_merges > 0
```
```diff
@@ -1004,6 +1004,7 @@ void BlockBasedTable::SetupForCompaction() {
     default:
       assert(false);
   }
+  rep_->for_compaction = true;
 }
 
 std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
@@ -1532,14 +1533,16 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
 BlockIter* BlockBasedTable::NewDataBlockIterator(
     Rep* rep, const ReadOptions& ro, const Slice& index_value,
     BlockIter* input_iter, bool is_index, bool key_includes_seq,
-    GetContext* get_context) {
+    GetContext* get_context,
+    FilePrefetchBuffer* prefetch_buffer) {
   BlockHandle handle;
   Slice input = index_value;
   // We intentionally allow extra stuff in index_value so that we
   // can add more features in the future.
   Status s = handle.DecodeFrom(&input);
   return NewDataBlockIterator(rep, ro, handle, input_iter, is_index,
-                              key_includes_seq, get_context, s);
+                              key_includes_seq, get_context, s,
+                              prefetch_buffer);
 }
 
 // Convert an index iterator value (i.e., an encoded BlockHandle)
@@ -1549,7 +1552,7 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
 BlockIter* BlockBasedTable::NewDataBlockIterator(
     Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
     BlockIter* input_iter, bool is_index, bool key_includes_seq,
-    GetContext* get_context, Status s) {
+    GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) {
   PERF_TIMER_GUARD(new_table_block_iter_nanos);
 
   const bool no_io = (ro.read_tier == kBlockCacheTier);
@@ -1560,7 +1563,7 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
     if (rep->compression_dict_block) {
       compression_dict = rep->compression_dict_block->data;
     }
-    s = MaybeLoadDataBlockToCache(nullptr /*prefetch_buffer*/, rep, ro, handle,
+    s = MaybeLoadDataBlockToCache(prefetch_buffer, rep, ro, handle,
                                   compression_dict, &block, is_index,
                                   get_context);
   }
@@ -1583,8 +1586,8 @@ BlockIter* BlockBasedTable::NewDataBlockIterator(
       StopWatch sw(rep->ioptions.env, rep->ioptions.statistics,
                    READ_BLOCK_GET_MICROS);
       s = ReadBlockFromFile(
-          rep->file.get(), nullptr /* prefetch_buffer */, rep->footer, ro,
-          handle, &block_value, rep->ioptions, rep->blocks_maybe_compressed,
+          rep->file.get(), prefetch_buffer, rep->footer, ro, handle,
+          &block_value, rep->ioptions, rep->blocks_maybe_compressed,
           compression_dict, rep->persistent_cache_options,
           is_index ? kDisableGlobalSequenceNumber : rep->global_seqno,
           rep->table_options.read_amp_bytes_per_bit);
@@ -1975,31 +1978,39 @@ void BlockBasedTableIterator::InitDataBlock() {
     auto* rep = table_->get_rep();
 
     // Automatically prefetch additional data when a range scan (iterator) does
-    // more than 2 sequential IOs. This is enabled only when
+    // more than 2 sequential IOs. This is enabled only for user reads and when
     // ReadOptions.readahead_size is 0.
-    if (read_options_.readahead_size == 0) {
-      if (num_file_reads_ < 2) {
-        num_file_reads_++;
-      } else if (data_block_handle.offset() +
-                     static_cast<size_t>(data_block_handle.size()) +
-                     kBlockTrailerSize >
-                 readahead_limit_) {
-        num_file_reads_++;
-        // Do not readahead more than kMaxReadaheadSize.
-        readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_);
-        table_->get_rep()->file->Prefetch(data_block_handle.offset(),
-                                          readahead_size_);
-        readahead_limit_ = static_cast<size_t>(data_block_handle.offset()
-                                               + readahead_size_);
-        // Keep exponentially increasing readahead size until kMaxReadaheadSize.
-        readahead_size_ *= 2;
+    if (!rep->for_compaction && read_options_.readahead_size == 0) {
+      num_file_reads_++;
+      if (num_file_reads_ > 2) {
+        if (!rep->file->use_direct_io() &&
+            (data_block_handle.offset() +
+                 static_cast<size_t>(data_block_handle.size()) +
+                 kBlockTrailerSize >
+             readahead_limit_)) {
+          // Buffered I/O
+          // Discarding the return status of Prefetch calls intentionally, as we
+          // can fallback to reading from disk if Prefetch fails.
+          rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
+          readahead_limit_ =
+              static_cast<size_t>(data_block_handle.offset() + readahead_size_);
+          // Keep exponentially increasing readahead size until
+          // kMaxReadaheadSize.
+          readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ * 2);
+        } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
+          // Direct I/O
+          // Let FilePrefetchBuffer take care of the readahead.
+          prefetch_buffer_.reset(new FilePrefetchBuffer(
+              rep->file.get(), kInitReadaheadSize, kMaxReadaheadSize));
+        }
       }
     }
 
     BlockBasedTable::NewDataBlockIterator(rep, read_options_, data_block_handle,
                                           &data_block_iter_, is_index_,
                                           key_includes_seq_,
-                                          /* get_context */ nullptr, s);
+                                          /* get_context */ nullptr, s,
+                                          prefetch_buffer_.get());
    block_iter_points_to_real_block_ = true;
   }
 }
```
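One detail of the buffered-I/O branch in the hunk above is worth calling out: a new `Prefetch()` hint is issued only when the block being read extends past `readahead_limit_`, i.e. past the range covered by the previous hint, and the hint size then doubles up to the cap. The self-contained sketch below models just that gating (the `num_file_reads_ > 2` warm-up and the direct-IO branch are omitted); `ReadaheadState`, `OnBlockRead`, and the constants are illustrative stand-ins, not RocksDB code.

```cpp
// Standalone sketch: issue a prefetch hint only when a block read passes the
// range covered by the previous hint, doubling the hint size up to a cap.
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct ReadaheadState {
  uint64_t readahead_size = 8 * 1024;        // assumed initial readahead
  uint64_t max_readahead_size = 256 * 1024;  // assumed cap
  uint64_t readahead_limit = 0;              // end of the last hinted range

  // Returns true if a Prefetch() hint would be issued for this block read.
  bool OnBlockRead(uint64_t offset, uint64_t size) {
    if (offset + size <= readahead_limit) {
      return false;  // still covered by the previous hint
    }
    readahead_limit = offset + readahead_size;
    readahead_size = std::min(max_readahead_size, readahead_size * 2);
    return true;
  }
};

int main() {
  ReadaheadState state;
  int hints = 0;
  // Simulate a sequential scan of 4 KB blocks over 1 MB of file data.
  for (uint64_t off = 0; off < 1024 * 1024; off += 4 * 1024) {
    if (state.OnBlockRead(off, 4 * 1024)) {
      ++hints;
      std::printf("prefetch hint at offset %llu\n",
                  static_cast<unsigned long long>(off));
    }
  }
  std::printf("%d hints for 256 block reads\n", hints);
  return 0;
}
```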
```diff
@@ -213,19 +213,16 @@ class BlockBasedTable : public TableReader {
   Rep* get_rep() { return rep_; }
 
   // input_iter: if it is not null, update this one and return it as Iterator
-  static BlockIter* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
-                                         const Slice& index_value,
-                                         BlockIter* input_iter = nullptr,
-                                         bool is_index = false,
-                                         bool key_includes_seq = true,
-                                         GetContext* get_context = nullptr);
-  static BlockIter* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
-                                         const BlockHandle& block_hanlde,
-                                         BlockIter* input_iter = nullptr,
-                                         bool is_index = false,
-                                         bool key_includes_seq = true,
-                                         GetContext* get_context = nullptr,
-                                         Status s = Status());
+  static BlockIter* NewDataBlockIterator(
+      Rep* rep, const ReadOptions& ro, const Slice& index_value,
+      BlockIter* input_iter = nullptr, bool is_index = false,
+      bool key_includes_seq = true, GetContext* get_context = nullptr,
+      FilePrefetchBuffer* prefetch_buffer = nullptr);
+  static BlockIter* NewDataBlockIterator(
+      Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde,
+      BlockIter* input_iter = nullptr, bool is_index = false,
+      bool key_includes_seq = true, GetContext* get_context = nullptr,
+      Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr);
 
   class PartitionedIndexIteratorState;
 
@@ -505,6 +502,8 @@ struct BlockBasedTable::Rep {
   bool blocks_maybe_compressed = true;
 
   bool closed = false;
+
+  bool for_compaction = false;
 };
 
 class BlockBasedTableIterator : public InternalIterator {
@@ -632,6 +631,7 @@ class BlockBasedTableIterator : public InternalIterator {
   size_t readahead_size_ = kInitReadaheadSize;
   size_t readahead_limit_ = 0;
   int num_file_reads_ = 0;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
 };
 
 }  // namespace rocksdb
```
```diff
@@ -96,12 +96,19 @@ public:
     alignment_ = alignment;
   }
 
-  // Allocates a new buffer and sets bufstart_ to the aligned first byte
-  void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false) {
+  // Allocates a new buffer and sets bufstart_ to the aligned first byte.
+  // requested_capacity: requested new buffer capacity. This capacity will be
+  // rounded up based on alignment.
+  // copy_data: Copy data from old buffer to new buffer.
+  // copy_offset: Copy data from this offset in old buffer.
+  // copy_len: Number of bytes to copy.
+  void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false,
+                         uint64_t copy_offset = 0, size_t copy_len = 0) {
     assert(alignment_ > 0);
     assert((alignment_ & (alignment_ - 1)) == 0);
 
-    if (copy_data && requested_capacity < cursize_) {
+    copy_len = copy_len > 0 ? copy_len : cursize_;
+    if (copy_data && requested_capacity < copy_len) {
       // If we are downsizing to a capacity that is smaller than the current
       // data in the buffer. Ignore the request.
       return;
@@ -114,7 +121,8 @@ public:
         ~static_cast<uintptr_t>(alignment_ - 1));
 
     if (copy_data) {
-      memcpy(new_bufstart, bufstart_, cursize_);
+      memcpy(new_bufstart, bufstart_ + copy_offset, copy_len);
+      cursize_ = copy_len;
     } else {
       cursize_ = 0;
     }
```
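The point of the new `copy_offset`/`copy_len` arguments above is that a caller can grow the buffer while carrying an aligned tail chunk of the existing data to the front of the new allocation, so those bytes do not have to be read from the device again. Below is a simplified model of that behavior using a toy `std::vector`-backed buffer; `ToyBuffer` is a hypothetical stand-in, not the real `AlignedBuffer`.

```cpp
// Simplified model (not the real AlignedBuffer): AllocateNewBuffer can carry a
// chunk [copy_offset, copy_offset + copy_len) from the old buffer to the start
// of the new one, and cursize reflects only the carried bytes.
#include <cassert>
#include <cstring>
#include <vector>

struct ToyBuffer {
  std::vector<char> data;
  size_t cursize = 0;

  void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false,
                         size_t copy_offset = 0, size_t copy_len = 0) {
    copy_len = copy_len > 0 ? copy_len : cursize;
    if (copy_data && requested_capacity < copy_len) {
      return;  // downsizing below the bytes to keep: ignore, as in the original
    }
    std::vector<char> fresh(requested_capacity);
    if (copy_data) {
      std::memcpy(fresh.data(), data.data() + copy_offset, copy_len);
      cursize = copy_len;
    } else {
      cursize = 0;
    }
    data.swap(fresh);
  }
};

int main() {
  ToyBuffer buf;
  buf.AllocateNewBuffer(8);
  std::memcpy(buf.data.data(), "ABCDEFGH", 8);
  buf.cursize = 8;
  // Grow the buffer, keeping only the 4-byte tail "EFGH" at the front.
  buf.AllocateNewBuffer(16, /*copy_data=*/true, /*copy_offset=*/4,
                        /*copy_len=*/4);
  assert(buf.cursize == 4 && std::memcmp(buf.data.data(), "EFGH", 4) == 0);
  return 0;
}
```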
```diff
@@ -647,24 +647,85 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
   uint64_t roundup_len = roundup_end - rounddown_offset;
   assert(roundup_len >= alignment);
   assert(roundup_len % alignment == 0);
-  buffer_.Alignment(alignment);
-  buffer_.AllocateNewBuffer(static_cast<size_t>(roundup_len));
+
+  // Check if requested bytes are in the existing buffer_.
+  // If all bytes exist -- return.
+  // If only a few bytes exist -- reuse them & read only what is really needed.
+  //     This is typically the case of incremental reading of data.
+  // If no bytes exist in buffer -- full pread.
+
+  Status s;
+  uint64_t chunk_offset_in_buffer = 0;
+  uint64_t chunk_len = 0;
+  bool copy_data_to_new_buffer = false;
+  if (buffer_len_ > 0 && offset >= buffer_offset_ &&
+      offset <= buffer_offset_ + buffer_len_) {
+    if (offset + n <= buffer_offset_ + buffer_len_) {
+      // All requested bytes are already in the buffer. So no need to Read
+      // again.
+      return s;
+    } else {
+      // Only a few requested bytes are in the buffer. memmove those chunk of
+      // bytes to the beginning, and memcpy them back into the new buffer if a
+      // new buffer is created.
+      chunk_offset_in_buffer = Rounddown(offset - buffer_offset_, alignment);
+      chunk_len = buffer_len_ - chunk_offset_in_buffer;
+      assert(chunk_offset_in_buffer % alignment == 0);
+      assert(chunk_len % alignment == 0);
+      copy_data_to_new_buffer = true;
+    }
+  }
+
+  // Create a new buffer only if current capacity is not sufficient, and memcopy
+  // bytes from old buffer if needed (i.e., if chunk_len is greater than 0).
+  if (buffer_.Capacity() < roundup_len) {
+    buffer_.Alignment(alignment);
+    buffer_.AllocateNewBuffer(static_cast<size_t>(roundup_len),
+                              copy_data_to_new_buffer, chunk_offset_in_buffer,
+                              chunk_len);
+  } else if (chunk_len > 0) {
+    // New buffer not needed. But memmove bytes from tail to the beginning since
+    // chunk_len is greater than 0.
+    buffer_.RefitTail(chunk_offset_in_buffer, chunk_len);
+  }
 
   Slice result;
-  Status s = reader->Read(rounddown_offset, static_cast<size_t>(roundup_len),
-                          &result, buffer_.BufferStart());
+  s = reader->Read(rounddown_offset + chunk_len,
+                   static_cast<size_t>(roundup_len - chunk_len), &result,
+                   buffer_.BufferStart() + chunk_len);
   if (s.ok()) {
     buffer_offset_ = rounddown_offset;
-    buffer_len_ = result.size();
+    buffer_len_ = chunk_len + result.size();
+    buffer_.Size(buffer_len_);
   }
   return s;
 }
 
 bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
-                                          Slice* result) const {
-  if (offset < buffer_offset_ || offset + n > buffer_offset_ + buffer_len_) {
+                                          Slice* result) {
+  if (offset < buffer_offset_) {
     return false;
   }
+
+  // If the buffer contains only a few of the requested bytes:
+  //   If readahead is enabled: prefetch the remaining bytes + readadhead bytes
+  //     and satisfy the request.
+  //   If readahead is not enabled: return false.
+  if (offset + n > buffer_offset_ + buffer_len_) {
+    if (readahead_size_ > 0) {
+      assert(file_reader_ != nullptr);
+      assert(max_readahead_size_ >= readahead_size_);
+
+      Status s = Prefetch(file_reader_, offset, n + readahead_size_);
+      if (!s.ok()) {
+        return false;
+      }
+      readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+    } else {
+      return false;
+    }
+  }
+
   uint64_t offset_in_buffer = offset - buffer_offset_;
   *result = Slice(buffer_.BufferStart() + offset_in_buffer, n);
   return true;
```
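The chunk-reuse logic added to `FilePrefetchBuffer::Prefetch` above comes down to a bit of alignment arithmetic: find the aligned tail of the current buffer that overlaps the new request, keep it, and read only the remaining bytes from the device. The standalone sketch below reproduces that arithmetic with made-up offsets and a 4 KB alignment; it is a model of the calculation, not RocksDB code.

```cpp
// Standalone sketch: given the bytes already buffered and a new aligned
// request, compute which aligned tail chunk can be reused and how much must
// actually be read.
#include <cassert>
#include <cstdint>
#include <cstdio>

uint64_t Rounddown(uint64_t x, uint64_t align) { return x - (x % align); }

int main() {
  const uint64_t alignment = 4096;    // direct-IO alignment (assumed)
  const uint64_t buffer_offset = 0;   // start of what the buffer holds
  const uint64_t buffer_len = 16384;  // 4 aligned pages already in memory
  const uint64_t offset = 12288;      // new request starts inside the buffer
  const uint64_t n = 32768;           // ...but extends past its end

  // Reusable tail: the aligned chunk of the buffer that overlaps the request.
  uint64_t chunk_offset_in_buffer = Rounddown(offset - buffer_offset, alignment);
  uint64_t chunk_len = buffer_len - chunk_offset_in_buffer;
  assert(chunk_offset_in_buffer % alignment == 0);
  assert(chunk_len % alignment == 0);

  // Full aligned span the request needs, and the part still missing.
  uint64_t rounddown_offset = Rounddown(offset, alignment);
  uint64_t roundup_len =
      Rounddown(offset + n + alignment - 1, alignment) - rounddown_offset;
  uint64_t bytes_to_read = roundup_len - chunk_len;

  std::printf("reuse %llu bytes, read %llu bytes starting at offset %llu\n",
              static_cast<unsigned long long>(chunk_len),
              static_cast<unsigned long long>(bytes_to_read),
              static_cast<unsigned long long>(rounddown_offset + chunk_len));
  return 0;
}
```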
```diff
@@ -207,16 +207,29 @@ class WritableFileWriter {
   Status SyncInternal(bool use_fsync);
 };
 
+// FilePrefetchBuffer can automatically do the readahead if file_reader,
+// readahead_size, and max_readahead_size are passed in.
+// max_readahead_size should be greater than or equal to readahead_size.
+// readahead_size will be doubled on every IO, until max_readahead_size.
 class FilePrefetchBuffer {
  public:
-  FilePrefetchBuffer() : buffer_offset_(0), buffer_len_(0) {}
+  FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr,
+                     size_t readadhead_size = 0, size_t max_readahead_size = 0)
+      : buffer_offset_(0),
+        buffer_len_(0),
+        file_reader_(file_reader),
+        readahead_size_(readadhead_size),
+        max_readahead_size_(max_readahead_size) {}
   Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n);
-  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result) const;
+  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result);
 
  private:
   AlignedBuffer buffer_;
   uint64_t buffer_offset_;
   size_t buffer_len_;
+  RandomAccessFileReader* file_reader_;
+  size_t readahead_size_;
+  size_t max_readahead_size_;
 };
 
 extern Status NewWritableFile(Env* env, const std::string& fname,
```