babe56ddba
Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
334 lines
12 KiB
C++
334 lines
12 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "table/block_fetcher.h"
|
|
|
|
#include <cinttypes>
|
|
#include <string>
|
|
|
|
#include "logging/logging.h"
|
|
#include "memory/memory_allocator.h"
|
|
#include "monitoring/perf_context_imp.h"
|
|
#include "rocksdb/compression_type.h"
|
|
#include "rocksdb/env.h"
|
|
#include "table/block_based/block.h"
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
#include "table/block_based/block_type.h"
|
|
#include "table/block_based/reader_common.h"
|
|
#include "table/format.h"
|
|
#include "table/persistent_cache_helper.h"
|
|
#include "util/compression.h"
|
|
#include "util/stop_watch.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
inline void BlockFetcher::ProcessTrailerIfPresent() {
|
|
if (footer_.GetBlockTrailerSize() > 0) {
|
|
assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize);
|
|
if (read_options_.verify_checksums) {
|
|
io_status_ = status_to_io_status(VerifyBlockChecksum(
|
|
footer_.checksum_type(), slice_.data(), block_size_,
|
|
file_->file_name(), handle_.offset()));
|
|
}
|
|
compression_type_ =
|
|
BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_);
|
|
} else {
|
|
// E.g. plain table or cuckoo table
|
|
compression_type_ = kNoCompression;
|
|
}
|
|
}
|
|
|
|
inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() {
|
|
if (cache_options_.persistent_cache &&
|
|
!cache_options_.persistent_cache->IsCompressed()) {
|
|
Status status = PersistentCacheHelper::LookupUncompressedPage(
|
|
cache_options_, handle_, contents_);
|
|
if (status.ok()) {
|
|
// uncompressed page is found for the block handle
|
|
return true;
|
|
} else {
|
|
// uncompressed page is not found
|
|
if (ioptions_.logger && !status.IsNotFound()) {
|
|
assert(!status.ok());
|
|
ROCKS_LOG_INFO(ioptions_.logger,
|
|
"Error reading from persistent cache. %s",
|
|
status.ToString().c_str());
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
inline bool BlockFetcher::TryGetFromPrefetchBuffer() {
|
|
if (prefetch_buffer_ != nullptr) {
|
|
IOOptions opts;
|
|
IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
|
|
if (io_s.ok() &&
|
|
prefetch_buffer_->TryReadFromCache(
|
|
opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
|
|
&io_s, read_options_.rate_limiter_priority, for_compaction_)) {
|
|
ProcessTrailerIfPresent();
|
|
if (!io_status_.ok()) {
|
|
return true;
|
|
}
|
|
got_from_prefetch_buffer_ = true;
|
|
used_buf_ = const_cast<char*>(slice_.data());
|
|
} else if (!io_s.ok()) {
|
|
io_status_ = io_s;
|
|
return true;
|
|
}
|
|
}
|
|
return got_from_prefetch_buffer_;
|
|
}
|
|
|
|
inline bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() {
|
|
if (cache_options_.persistent_cache &&
|
|
cache_options_.persistent_cache->IsCompressed()) {
|
|
// lookup uncompressed cache mode p-cache
|
|
std::unique_ptr<char[]> raw_data;
|
|
io_status_ = status_to_io_status(PersistentCacheHelper::LookupRawPage(
|
|
cache_options_, handle_, &raw_data, block_size_with_trailer_));
|
|
if (io_status_.ok()) {
|
|
heap_buf_ = CacheAllocationPtr(raw_data.release());
|
|
used_buf_ = heap_buf_.get();
|
|
slice_ = Slice(heap_buf_.get(), block_size_);
|
|
ProcessTrailerIfPresent();
|
|
return true;
|
|
} else if (!io_status_.IsNotFound() && ioptions_.logger) {
|
|
assert(!io_status_.ok());
|
|
ROCKS_LOG_INFO(ioptions_.logger,
|
|
"Error reading from persistent cache. %s",
|
|
io_status_.ToString().c_str());
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
inline void BlockFetcher::PrepareBufferForBlockFromFile() {
|
|
// cache miss read from device
|
|
if ((do_uncompress_ || ioptions_.allow_mmap_reads) &&
|
|
block_size_with_trailer_ < kDefaultStackBufferSize) {
|
|
// If we've got a small enough hunk of data, read it in to the
|
|
// trivially allocated stack buffer instead of needing a full malloc()
|
|
//
|
|
// `GetBlockContents()` cannot return this data as its lifetime is tied to
|
|
// this `BlockFetcher`'s lifetime. That is fine because this is only used
|
|
// in cases where we do not expect the `GetBlockContents()` result to be the
|
|
// same buffer we are assigning here. If we guess incorrectly, there will be
|
|
// a heap allocation and memcpy in `GetBlockContents()` to obtain the final
|
|
// result. Considering we are eliding a heap allocation here by using the
|
|
// stack buffer, the cost of guessing incorrectly here is one extra memcpy.
|
|
//
|
|
// When `do_uncompress_` is true, we expect the uncompression step will
|
|
// allocate heap memory for the final result. However this expectation will
|
|
// be wrong if the block turns out to already be uncompressed, which we
|
|
// won't know for sure until after reading it.
|
|
//
|
|
// When `ioptions_.allow_mmap_reads` is true, we do not expect the file
|
|
// reader to use the scratch buffer at all, but instead return a pointer
|
|
// into the mapped memory. This expectation will be wrong when using a
|
|
// file reader that does not implement mmap reads properly.
|
|
used_buf_ = &stack_buf_[0];
|
|
} else if (maybe_compressed_ && !do_uncompress_) {
|
|
compressed_buf_ = AllocateBlock(block_size_with_trailer_,
|
|
memory_allocator_compressed_);
|
|
used_buf_ = compressed_buf_.get();
|
|
} else {
|
|
heap_buf_ =
|
|
AllocateBlock(block_size_with_trailer_, memory_allocator_);
|
|
used_buf_ = heap_buf_.get();
|
|
}
|
|
}
|
|
|
|
inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() {
|
|
if (io_status_.ok() && read_options_.fill_cache &&
|
|
cache_options_.persistent_cache &&
|
|
cache_options_.persistent_cache->IsCompressed()) {
|
|
// insert to raw cache
|
|
PersistentCacheHelper::InsertRawPage(cache_options_, handle_, used_buf_,
|
|
block_size_with_trailer_);
|
|
}
|
|
}
|
|
|
|
inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() {
|
|
if (io_status_.ok() && !got_from_prefetch_buffer_ &&
|
|
read_options_.fill_cache && cache_options_.persistent_cache &&
|
|
!cache_options_.persistent_cache->IsCompressed()) {
|
|
// insert to uncompressed cache
|
|
PersistentCacheHelper::InsertUncompressedPage(cache_options_, handle_,
|
|
*contents_);
|
|
}
|
|
}
|
|
|
|
inline void BlockFetcher::CopyBufferToHeapBuf() {
|
|
assert(used_buf_ != heap_buf_.get());
|
|
heap_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_);
|
|
memcpy(heap_buf_.get(), used_buf_, block_size_with_trailer_);
|
|
#ifndef NDEBUG
|
|
num_heap_buf_memcpy_++;
|
|
#endif
|
|
}
|
|
|
|
inline void BlockFetcher::CopyBufferToCompressedBuf() {
|
|
assert(used_buf_ != compressed_buf_.get());
|
|
compressed_buf_ = AllocateBlock(block_size_with_trailer_,
|
|
memory_allocator_compressed_);
|
|
memcpy(compressed_buf_.get(), used_buf_, block_size_with_trailer_);
|
|
#ifndef NDEBUG
|
|
num_compressed_buf_memcpy_++;
|
|
#endif
|
|
}
|
|
|
|
// Entering this method means the block is not compressed or do not need to be
|
|
// uncompressed. The block can be in one of the following buffers:
|
|
// 1. prefetch buffer if prefetch is enabled and the block is prefetched before
|
|
// 2. stack_buf_ if block size is smaller than the stack_buf_ size and block
|
|
// is not compressed
|
|
// 3. heap_buf_ if the block is not compressed
|
|
// 4. compressed_buf_ if the block is compressed
|
|
// 5. direct_io_buf_ if direct IO is enabled
|
|
// After this method, if the block is compressed, it should be in
|
|
// compressed_buf_, otherwise should be in heap_buf_.
|
|
inline void BlockFetcher::GetBlockContents() {
|
|
if (slice_.data() != used_buf_) {
|
|
// the slice content is not the buffer provided
|
|
*contents_ = BlockContents(Slice(slice_.data(), block_size_));
|
|
} else {
|
|
// page can be either uncompressed or compressed, the buffer either stack
|
|
// or heap provided. Refer to https://github.com/facebook/rocksdb/pull/4096
|
|
if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) {
|
|
CopyBufferToHeapBuf();
|
|
} else if (used_buf_ == compressed_buf_.get()) {
|
|
if (compression_type_ == kNoCompression &&
|
|
memory_allocator_ != memory_allocator_compressed_) {
|
|
CopyBufferToHeapBuf();
|
|
} else {
|
|
heap_buf_ = std::move(compressed_buf_);
|
|
}
|
|
} else if (direct_io_buf_.get() != nullptr) {
|
|
if (compression_type_ == kNoCompression) {
|
|
CopyBufferToHeapBuf();
|
|
} else {
|
|
CopyBufferToCompressedBuf();
|
|
heap_buf_ = std::move(compressed_buf_);
|
|
}
|
|
}
|
|
*contents_ = BlockContents(std::move(heap_buf_), block_size_);
|
|
}
|
|
#ifndef NDEBUG
|
|
contents_->is_raw_block = true;
|
|
#endif
|
|
}
|
|
|
|
IOStatus BlockFetcher::ReadBlockContents() {
|
|
if (TryGetUncompressBlockFromPersistentCache()) {
|
|
compression_type_ = kNoCompression;
|
|
#ifndef NDEBUG
|
|
contents_->is_raw_block = true;
|
|
#endif // NDEBUG
|
|
return IOStatus::OK();
|
|
}
|
|
if (TryGetFromPrefetchBuffer()) {
|
|
if (!io_status_.ok()) {
|
|
return io_status_;
|
|
}
|
|
} else if (!TryGetCompressedBlockFromPersistentCache()) {
|
|
IOOptions opts;
|
|
io_status_ = file_->PrepareIOOptions(read_options_, opts);
|
|
// Actual file read
|
|
if (io_status_.ok()) {
|
|
if (file_->use_direct_io()) {
|
|
PERF_TIMER_GUARD(block_read_time);
|
|
io_status_ = file_->Read(
|
|
opts, handle_.offset(), block_size_with_trailer_, &slice_, nullptr,
|
|
&direct_io_buf_, read_options_.rate_limiter_priority);
|
|
PERF_COUNTER_ADD(block_read_count, 1);
|
|
used_buf_ = const_cast<char*>(slice_.data());
|
|
} else {
|
|
PrepareBufferForBlockFromFile();
|
|
PERF_TIMER_GUARD(block_read_time);
|
|
io_status_ = file_->Read(opts, handle_.offset(),
|
|
block_size_with_trailer_, &slice_, used_buf_,
|
|
nullptr, read_options_.rate_limiter_priority);
|
|
PERF_COUNTER_ADD(block_read_count, 1);
|
|
#ifndef NDEBUG
|
|
if (slice_.data() == &stack_buf_[0]) {
|
|
num_stack_buf_memcpy_++;
|
|
} else if (slice_.data() == heap_buf_.get()) {
|
|
num_heap_buf_memcpy_++;
|
|
} else if (slice_.data() == compressed_buf_.get()) {
|
|
num_compressed_buf_memcpy_++;
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
// TODO: introduce dedicated perf counter for range tombstones
|
|
switch (block_type_) {
|
|
case BlockType::kFilter:
|
|
PERF_COUNTER_ADD(filter_block_read_count, 1);
|
|
break;
|
|
|
|
case BlockType::kCompressionDictionary:
|
|
PERF_COUNTER_ADD(compression_dict_block_read_count, 1);
|
|
break;
|
|
|
|
case BlockType::kIndex:
|
|
PERF_COUNTER_ADD(index_block_read_count, 1);
|
|
break;
|
|
|
|
// Nothing to do here as we don't have counters for the other types.
|
|
default:
|
|
break;
|
|
}
|
|
|
|
PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_);
|
|
if (!io_status_.ok()) {
|
|
return io_status_;
|
|
}
|
|
|
|
if (slice_.size() != block_size_with_trailer_) {
|
|
return IOStatus::Corruption("truncated block read from " +
|
|
file_->file_name() + " offset " +
|
|
ToString(handle_.offset()) + ", expected " +
|
|
ToString(block_size_with_trailer_) +
|
|
" bytes, got " + ToString(slice_.size()));
|
|
}
|
|
|
|
ProcessTrailerIfPresent();
|
|
if (io_status_.ok()) {
|
|
InsertCompressedBlockToPersistentCacheIfNeeded();
|
|
} else {
|
|
return io_status_;
|
|
}
|
|
}
|
|
|
|
if (do_uncompress_ && compression_type_ != kNoCompression) {
|
|
PERF_TIMER_GUARD(block_decompress_time);
|
|
// compressed page, uncompress, update cache
|
|
UncompressionContext context(compression_type_);
|
|
UncompressionInfo info(context, uncompression_dict_, compression_type_);
|
|
io_status_ = status_to_io_status(UncompressBlockContents(
|
|
info, slice_.data(), block_size_, contents_, footer_.format_version(),
|
|
ioptions_, memory_allocator_));
|
|
#ifndef NDEBUG
|
|
num_heap_buf_memcpy_++;
|
|
#endif
|
|
compression_type_ = kNoCompression;
|
|
} else {
|
|
GetBlockContents();
|
|
}
|
|
|
|
InsertUncompressedBlockToPersistentCacheIfNeeded();
|
|
|
|
return io_status_;
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|