From df23e80e5f3bc64a6802be58971066ccd35923d3 Mon Sep 17 00:00:00 2001
From: Sagar Vemuri
Date: Thu, 25 Jan 2018 21:34:35 -0800
Subject: [PATCH] Improve performance of long range scans with readahead

Summary:
This change improves the performance of iterators doing long range scans (e.g. big/full table scans in MyRocks) by using readahead and prefetching additional data on each disk IO. This prefetching is automatically enabled once more than 2 IOs to the same table file have been observed during iteration. The readahead size starts at 8 KB and is doubled on each additional sequential IO, up to a maximum of 256 KB. This helps cut down the number of IOs needed to complete the range scan.

Constraints:
- The prefetched data is stored by the OS in the page cache, so this currently works only for non-direct-I/O use cases, i.e. applications which go through the page cache. (Direct I/O support will be added in a later PR.)
- This is currently enabled only when ReadOptions.readahead_size is 0 (the default value).

Thanks to siying for the original idea and implementation.

**Benchmarks:**
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```

Do a long range scan: seekrandom with a large number of nexts:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```

The page cache was cleared before each experiment with the command:
```
sudo sh -c "echo 3 > /proc/sys/vm/drop_caches"
```

```
Before:
seekrandom : 34020.945 micros/op 29 ops/sec; 32.5 MB/s (1636 of 1999 found)

With this change:
seekrandom : 8726.912 micros/op 114 ops/sec; 126.8 MB/s (5702 of 6999 found)
```
~3.9X performance improvement.

Also verified with strace and gdb that the readahead size is increasing as expected.
```
strace -e readahead -f -T -t -p
```

Closes https://github.com/facebook/rocksdb/pull/3282

Differential Revision: D6586477

Pulled By: sagar0

fbshipit-source-id: 8a118a0ed4594fbb7f5b1cafb242d7a4033cb58c
---
 HISTORY.md                        |  1 +
 table/block_based_table_reader.cc | 25 +++++++++++++++++++++++++
 table/block_based_table_reader.h  |  8 ++++++++
 3 files changed, 34 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index e0ad2ca2a..c4e1bf5f0 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -9,6 +9,7 @@
 * Add a new histogram stat called rocksdb.db.flush.micros for memtable flush.
 * Add "--use_txn" option to use transactional API in db_stress.
 * Disable onboard cache for compaction output in Windows platform.
+* Improve the performance of iterators doing long range scans by using readahead.
 
 ### Bug Fixes
 * Fix a stack-use-after-scope bug in ForwardIterator.
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index 6b47fcb30..f20b6cff0 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -1594,6 +1594,9 @@ BlockBasedTable::BlockEntryIteratorState::BlockEntryIteratorState(
       is_index_(is_index),
       block_map_(block_map) {}
 
+const size_t BlockBasedTable::BlockEntryIteratorState::kMaxReadaheadSize =
+    256 * 1024;
+
 InternalIterator*
 BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator(
     const Slice& index_value) {
@@ -1618,6 +1621,28 @@ BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator(
           &rep->internal_comparator, nullptr, true, rep->ioptions.statistics);
     }
   }
+
+  // Automatically prefetch additional data when a range scan (iterator) does
+  // more than 2 sequential IOs. This is enabled only when
+  // ReadOptions.readahead_size is 0.
+  if (read_options_.readahead_size == 0) {
+    if (num_file_reads_ < 2) {
+      num_file_reads_++;
+    } else if (handle.offset() + static_cast<size_t>(handle.size()) +
+                   kBlockTrailerSize >
+               readahead_limit_) {
+      num_file_reads_++;
+      // Do not readahead more than kMaxReadaheadSize.
+      readahead_size_ =
+          std::min(BlockBasedTable::BlockEntryIteratorState::kMaxReadaheadSize,
+                   readahead_size_);
+      table_->rep_->file->Prefetch(handle.offset(), readahead_size_);
+      readahead_limit_ = handle.offset() + readahead_size_;
+      // Keep exponentially increasing readahead size until kMaxReadaheadSize.
+      readahead_size_ *= 2;
+    }
+  }
+
   return NewDataBlockIterator(rep, read_options_, handle,
                               /* input_iter */ nullptr, is_index_,
                               /* get_context */ nullptr, s);
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 4b81b24ec..d3e481ff9 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -376,6 +376,14 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
   bool is_index_;
   std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
   port::RWMutex cleaner_mu;
+
+  static const size_t kInitReadaheadSize = 8 * 1024;
+  // Found that 256 KB readahead size provides the best performance, based on
+  // experiments.
+  static const size_t kMaxReadaheadSize;
+  size_t readahead_size_ = kInitReadaheadSize;
+  size_t readahead_limit_ = 0;
+  int num_file_reads_ = 0;
 };
 
 // CachableEntry represents the entries that *may* be fetched from block cache.
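
For illustration only, below is a minimal standalone sketch of the heuristic described in the Summary. It is not RocksDB code: `ReadaheadState`, `MaybePrefetch`, `PrefetchFn`, and the logging callback are invented stand-ins for the new `BlockEntryIteratorState` members and the `table_->rep_->file->Prefetch()` call in the diff, and the 5-byte block trailer size is an assumption about the on-disk block format.
```
// Standalone sketch of the auto-readahead heuristic; all names here are
// invented for illustration and are not part of RocksDB.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <functional>

namespace {

constexpr size_t kInitReadaheadSize = 8 * 1024;   // start at 8 KB
constexpr size_t kMaxReadaheadSize = 256 * 1024;  // cap at 256 KB
constexpr size_t kBlockTrailerSize = 5;  // assumed: 1-byte type + 4-byte checksum

// Per-iterator state, mirroring readahead_size_, readahead_limit_ and
// num_file_reads_ added to BlockEntryIteratorState.
struct ReadaheadState {
  size_t readahead_size = kInitReadaheadSize;
  size_t readahead_limit = 0;  // file offset covered by previous prefetches
  int num_file_reads = 0;
};

// Callback standing in for the file reader's Prefetch(offset, n) call.
using PrefetchFn = std::function<void(uint64_t offset, size_t n)>;

// Called before reading the data block at [offset, offset + size + trailer).
void MaybePrefetch(uint64_t offset, size_t size, ReadaheadState* st,
                   const PrefetchFn& prefetch) {
  if (st->num_file_reads < 2) {
    // First two reads: no readahead, so short scans pay no extra cost.
    st->num_file_reads++;
  } else if (offset + size + kBlockTrailerSize > st->readahead_limit) {
    // The block extends past the already-prefetched range: issue a new
    // prefetch and double the window, capped at kMaxReadaheadSize.
    st->num_file_reads++;
    st->readahead_size = std::min(kMaxReadaheadSize, st->readahead_size);
    prefetch(offset, st->readahead_size);
    st->readahead_limit = offset + st->readahead_size;
    st->readahead_size *= 2;
  }
}

}  // namespace

int main() {
  ReadaheadState st;
  auto log_prefetch = [](uint64_t offset, size_t n) {
    std::printf("Prefetch(offset=%llu, n=%zu KB)\n",
                static_cast<unsigned long long>(offset), n / 1024);
  };

  // Simulate a sequential scan over 4 KB data blocks in a 2 MB file.
  const uint64_t kFileSize = 2ULL * 1024 * 1024;
  for (uint64_t offset = 0; offset < kFileSize; offset += 4096) {
    MaybePrefetch(offset, 4096, &st, log_prefetch);
  }
  return 0;
}
```
Running the sketch over a simulated sequential scan shows the prefetch window growing 8 KB, 16 KB, ... up to 256 KB and then staying capped, which is the growth pattern the strace verification above was checking for.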