diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index 7b55bd397..098d6beb7 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -88,7 +88,7 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, Slice result; s = reader->Read(rounddown_offset + chunk_len, static_cast(roundup_len - chunk_len), &result, - buffer_.BufferStart() + chunk_len, for_compaction); + buffer_.BufferStart() + chunk_len, nullptr, for_compaction); if (s.ok()) { buffer_offset_ = rounddown_offset; buffer_.Size(static_cast(chunk_len) + result.size()); diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 46892360f..ba1fa9050 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -21,7 +21,10 @@ namespace ROCKSDB_NAMESPACE { Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, - char* scratch, bool for_compaction) const { + char* scratch, + std::unique_ptr* internal_buf, + bool for_compaction) const { + (void) internal_buf; Status s; uint64_t elapsed = 0; { @@ -77,8 +80,13 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, } size_t res_len = 0; if (s.ok() && offset_advance < buf.CurrentSize()) { - res_len = buf.Read(scratch, offset_advance, - std::min(buf.CurrentSize() - offset_advance, n)); + res_len = std::min(buf.CurrentSize() - offset_advance, n); + if (internal_buf == nullptr) { + buf.Read(scratch, offset_advance, res_len); + } else { + scratch = buf.BufferStart(); + internal_buf->reset(buf.Release()); + } } *result = Slice(scratch, res_len); #endif // !ROCKSDB_LITE diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index 35027bf45..c0ca371c2 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -102,7 +102,18 @@ class RandomAccessFileReader { RandomAccessFileReader(const RandomAccessFileReader&) = delete; RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; + // In non-direct IO mode, + // 1. if using mmap, result is stored in a buffer other than scratch; + // 2. if not using mmap, result is stored in the buffer starting from scratch. + // + // In direct IO mode, an internal aligned buffer is allocated. + // 1. If internal_buf is null, then results are copied to the buffer + // starting from scratch; + // 2. Otherwise, scratch is not used and can be null, the internal_buf owns + // the internally allocated buffer on return, and the result refers to a + // region in internal_buf. Status Read(uint64_t offset, size_t n, Slice* result, char* scratch, + std::unique_ptr* internal_buf, bool for_compaction = false) const; Status MultiRead(FSReadRequest* reqs, size_t num_reqs) const; diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 9ddee72cc..928e8ea81 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -216,7 +216,7 @@ Status BlockFetcher::ReadBlockContents() { PERF_TIMER_GUARD(block_read_time); // Actual file read status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, - &slice_, used_buf_, for_compaction_); + &slice_, used_buf_, nullptr, for_compaction_); } PERF_COUNTER_ADD(block_read_count, 1); diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index 005ce717d..bcbff5c5b 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -114,7 +114,7 @@ class CuckooBuilderTest : public testing::Test { for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { Slice read_slice; ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice, - nullptr)); + nullptr, nullptr)); size_t key_idx = std::find(expected_locations.begin(), expected_locations.end(), i) - expected_locations.begin(); diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index c39f3f879..930204fb2 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -137,7 +137,8 @@ CuckooTableReader::CuckooTableReader( cuckoo_block_size_ = *reinterpret_cast( cuckoo_block_size->second.data()); cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; - status_ = file_->Read(0, static_cast(file_size), &file_data_, nullptr); + status_ = file_->Read(0, static_cast(file_size), &file_data_, nullptr, + nullptr); } Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, diff --git a/table/format.cc b/table/format.cc index ee3766eb8..5bfd88ebb 100644 --- a/table/format.cc +++ b/table/format.cc @@ -292,7 +292,8 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, file->file_name()); } - char footer_space[Footer::kMaxEncodedLength]; + std::string footer_buf; + std::unique_ptr internal_buf; Slice footer_input; size_t read_offset = (file_size > Footer::kMaxEncodedLength) @@ -302,8 +303,14 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, if (prefetch_buffer == nullptr || !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength, &footer_input)) { - s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, - footer_space); + if (file->use_direct_io()) { + s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, + nullptr, &internal_buf); + } else { + footer_buf.reserve(Footer::kMaxEncodedLength); + s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, + &footer_buf[0], nullptr); + } if (!s.ok()) return s; } diff --git a/table/mock_table.cc b/table/mock_table.cc index 9ef44628a..ca085a198 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -114,7 +114,7 @@ uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const { uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { char buf[4]; Slice result; - file->Read(0, 4, &result, buf); + file->Read(0, 4, &result, buf, nullptr); assert(result.size() == 4); return DecodeFixed32(buf); } diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index d82b969ba..fa3fae1dc 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -208,7 +208,7 @@ bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len, } Slice read_result; Status s = file_info_->file->Read(file_offset, size_to_read, &read_result, - new_buffer->buf.get()); + new_buffer->buf.get(), nullptr); if (!s.ok()) { status_ = s; return false; diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 55756d9c1..6d560477d 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -288,7 +288,8 @@ void PlainTableReader::FillBloom(const std::vector& prefix_hashes) { Status PlainTableReader::MmapDataIfNeeded() { if (file_info_.is_mmap_mode) { // Get mmapped memory. - return file_info_.file->Read(0, static_cast(file_size_), &file_info_.file_data, nullptr); + return file_info_.file->Read(0, static_cast(file_size_), + &file_info_.file_data, nullptr, nullptr); } return Status::OK(); } diff --git a/table/table_test.cc b/table/table_test.cc index 7162a8a9a..72592bc79 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1251,7 +1251,8 @@ class FileChecksumTestHelper { std::string tmp_checksum; bool first_read = true; Status s; - s = file_reader_->Read(offset, 2048, &result, scratch.get(), false); + s = file_reader_->Read(offset, 2048, &result, scratch.get(), nullptr, + false); if (!s.ok()) { return s; } @@ -1264,7 +1265,8 @@ class FileChecksumTestHelper { result.size()); } offset += static_cast(result.size()); - s = file_reader_->Read(offset, 2048, &result, scratch.get(), false); + s = file_reader_->Read(offset, 2048, &result, scratch.get(), nullptr, + false); if (!s.ok()) { return s; } diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h index c7b0728c9..5cd6e8f3c 100644 --- a/util/aligned_buffer.h +++ b/util/aligned_buffer.h @@ -116,6 +116,13 @@ public: cursize_ = 0; } + char* Release() { + cursize_ = 0; + capacity_ = 0; + bufstart_ = nullptr; + return buf_.release(); + } + void Alignment(size_t alignment) { assert(alignment > 0); assert((alignment & (alignment - 1)) == 0); diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 63ca7da86..3f093e305 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1482,15 +1482,22 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number, const uint64_t record_size = sizeof(uint32_t) + key.size() + size; // Allocate the buffer. This is safe in C++11 - std::string buffer_str(static_cast(record_size), static_cast(0)); - char* buffer = &buffer_str[0]; + std::string buf; + std::unique_ptr internal_buf; // A partial blob record contain checksum, key and value. Slice blob_record; { StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); - s = reader->Read(record_offset, static_cast(record_size), &blob_record, buffer); + if (reader->use_direct_io()) { + s = reader->Read(record_offset, static_cast(record_size), + &blob_record, nullptr, &internal_buf); + } else { + buf.reserve(static_cast(record_size)); + s = reader->Read(record_offset, static_cast(record_size), + &blob_record, &buf[0], nullptr); + } RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size()); } diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc index 58f26128f..26f9180ab 100644 --- a/utilities/blob_db/blob_dump_tool.cc +++ b/utilities/blob_db/blob_dump_tool.cc @@ -101,7 +101,7 @@ Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) { } buffer_.reset(new char[buffer_size_]); } - Status s = reader_->Read(offset, size, result, buffer_.get()); + Status s = reader_->Read(offset, size, result, buffer_.get(), nullptr); if (!s.ok()) { return s; } diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index f32e29529..f9707e1f7 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -138,9 +138,17 @@ Status BlobFile::ReadFooter(BlobLogFooter* bf) { assert(ra_file_reader_); Slice result; - char scratch[BlobLogFooter::kSize + 10]; - Status s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kSize, &result, - scratch); + std::string buf; + std::unique_ptr internal_buf; + Status s; + if (ra_file_reader_->use_direct_io()) { + s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kSize, &result, + nullptr, &internal_buf); + } else { + buf.reserve(BlobLogFooter::kSize + 10); + s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kSize, &result, + &buf[0], nullptr); + } if (!s.ok()) return s; if (result.size() != BlobLogFooter::kSize) { // should not happen @@ -254,9 +262,17 @@ Status BlobFile::ReadMetadata(Env* env, const EnvOptions& env_options) { PathName())); // Read file header. - char header_buf[BlobLogHeader::kSize]; + std::string header_buf; + std::unique_ptr internal_buf; Slice header_slice; - s = file_reader->Read(0, BlobLogHeader::kSize, &header_slice, header_buf); + if (file_reader->use_direct_io()) { + s = file_reader->Read(0, BlobLogHeader::kSize, &header_slice, nullptr, + &internal_buf); + } else { + header_buf.reserve(BlobLogHeader::kSize); + s = file_reader->Read(0, BlobLogHeader::kSize, &header_slice, + &header_buf[0], nullptr); + } if (!s.ok()) { ROCKS_LOG_ERROR(info_log_, "Failed to read header of blob file %" PRIu64 @@ -287,10 +303,16 @@ Status BlobFile::ReadMetadata(Env* env, const EnvOptions& env_options) { assert(!footer_valid_); return Status::OK(); } - char footer_buf[BlobLogFooter::kSize]; + std::string footer_buf; Slice footer_slice; - s = file_reader->Read(file_size - BlobLogFooter::kSize, BlobLogFooter::kSize, - &footer_slice, footer_buf); + if (file_reader->use_direct_io()) { + s = file_reader->Read(file_size - BlobLogFooter::kSize, BlobLogFooter::kSize, + &footer_slice, nullptr, &internal_buf); + } else { + footer_buf.reserve(BlobLogFooter::kSize); + s = file_reader->Read(file_size - BlobLogFooter::kSize, BlobLogFooter::kSize, + &footer_slice, &footer_buf[0], nullptr); + } if (!s.ok()) { ROCKS_LOG_ERROR(info_log_, "Failed to read footer of blob file %" PRIu64 diff --git a/utilities/blob_db/blob_log_reader.cc b/utilities/blob_db/blob_log_reader.cc index 1a4b5ac81..913e9282d 100644 --- a/utilities/blob_db/blob_log_reader.cc +++ b/utilities/blob_db/blob_log_reader.cc @@ -26,7 +26,8 @@ Reader::Reader(std::unique_ptr&& file_reader, Env* env, Status Reader::ReadSlice(uint64_t size, Slice* slice, char* buf) { StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); - Status s = file_->Read(next_byte_, static_cast(size), slice, buf); + Status s = + file_->Read(next_byte_, static_cast(size), slice, buf, nullptr); next_byte_ += size; if (!s.ok()) { return s; diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc index 87ae603c5..cde84bf71 100644 --- a/utilities/persistent_cache/block_cache_tier_file.cc +++ b/utilities/persistent_cache/block_cache_tier_file.cc @@ -235,7 +235,7 @@ bool RandomAccessCacheFile::Read(const LBA& lba, Slice* key, Slice* val, } Slice result; - Status s = freader_->Read(lba.off_, lba.size_, &result, scratch); + Status s = freader_->Read(lba.off_, lba.size_, &result, scratch, nullptr); if (!s.ok()) { Error(log_, "Error reading from file %s. %s", Path().c_str(), s.ToString().c_str()); diff --git a/utilities/trace/file_trace_reader_writer.cc b/utilities/trace/file_trace_reader_writer.cc index 7160f7a4c..e9220579c 100644 --- a/utilities/trace/file_trace_reader_writer.cc +++ b/utilities/trace/file_trace_reader_writer.cc @@ -33,7 +33,8 @@ Status FileTraceReader::Close() { Status FileTraceReader::Read(std::string* data) { assert(file_reader_ != nullptr); - Status s = file_reader_->Read(offset_, kTraceMetadataSize, &result_, buffer_); + Status s = file_reader_->Read(offset_, kTraceMetadataSize, &result_, buffer_, + nullptr); if (!s.ok()) { return s; } @@ -57,7 +58,7 @@ Status FileTraceReader::Read(std::string* data) { unsigned int to_read = bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read; while (to_read > 0) { - s = file_reader_->Read(offset_, to_read, &result_, buffer_); + s = file_reader_->Read(offset_, to_read, &result_, buffer_, nullptr); if (!s.ok()) { return s; }