From 36300fbbe3a7fde402152e4a57bbf5cda614d53c Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov
Date: Mon, 1 Feb 2016 13:14:37 -0800
Subject: [PATCH] Enable per-request buffer allocation in RandomAccessFile

This change impacts only non-buffered I/O on Windows.

Currently there is one buffer per RandomAccessFile instance, protected by a
lock. We maintain the buffer because non-buffered I/O requires an aligned
buffer to work. XPerf traces show that we accumulate considerable wait time
while waiting for that lock.

This change allows setting the random access buffer size to zero, which
indicates per-request allocation. We expect the allocation cost to be much
smaller than the I/O cost plus the lock wait time, because the heap tends to
re-use page-aligned allocations, especially with Jemalloc.

This change does not affect buffer use as a read-ahead buffer for compaction
purposes.
---
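Usage sketch (not part of the diff below): how a user would opt into the new
per-request allocation. It assumes this RocksDB version still exposes
DBOptions::allow_os_buffer as the switch for non-buffered I/O on Windows;
the database path is only an example.

  #include <cassert>

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"

  int main() {
    rocksdb::Options options;  // Options aggregates the DBOptions fields
    options.create_if_missing = true;
    // Non-buffered I/O is the only mode affected by this patch.
    options.allow_os_buffer = false;
    // 0 means: do not keep a per-instance aligned buffer; allocate one per
    // read request instead and skip the instance-buffer lock.
    options.random_access_max_buffer_size = 0;

    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "testdb", &db);
    assert(s.ok());
    // Random reads that miss the readahead path should now go through the
    // one-shot, page-aligned buffer instead of the shared instance buffer.
    delete db;
    return 0;
  }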
 include/rocksdb/options.h |   3 +
 port/win/env_win.cc       | 121 ++++++++++++++++++++++++--------------
 2 files changed, 80 insertions(+), 44 deletions(-)

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 9eee07ac0..a26ed7d81 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1121,6 +1121,9 @@ struct DBOptions {
   // This option is currently honored only on Windows
   //
   // Default: 1 Mb
+  //
+  // Special value: 0 - means do not maintain per instance buffer. Allocate
+  // per request buffer and avoid locking.
   size_t random_access_max_buffer_size;
 
   // This is the maximum buffer size that is used by WritableFileWriter.
diff --git a/port/win/env_win.cc b/port/win/env_win.cc
index 977c80b88..50059a98f 100644
--- a/port/win/env_win.cc
+++ b/port/win/env_win.cc
@@ -766,6 +766,18 @@ class WinRandomAccessFile : public RandomAccessFile {
     return read;
   }
 
+  void CalculateReadParameters(uint64_t offset, size_t bytes_requested,
+                               size_t& actual_bytes_toread,
+                               uint64_t& first_page_start) const {
+
+    const size_t alignment = buffer_.Alignment();
+
+    first_page_start = TruncateToPageBoundary(alignment, offset);
+    const uint64_t last_page_start =
+        TruncateToPageBoundary(alignment, offset + bytes_requested - 1);
+    actual_bytes_toread = (last_page_start - first_page_start) + alignment;
+  }
+
  public:
   WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
                       const EnvOptions& options)
@@ -797,66 +809,87 @@ class WinRandomAccessFile : public RandomAccessFile {
 
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const override {
+
     Status s;
     SSIZE_T r = -1;
     size_t left = n;
     char* dest = scratch;
 
+    if (n == 0) {
+      *result = Slice(scratch, 0);
+      return s;
+    }
+
     // When in unbuffered mode we need to do the following changes:
     // - use our own aligned buffer
     // - always read at the offset of that is a multiple of alignment
     if (!use_os_buffer_) {
-      std::unique_lock<std::mutex> lock(buffer_mut_);
-      // Let's see if at least some of the requested data is already
-      // in the buffer
-      if (offset >= buffered_start_ &&
+      uint64_t first_page_start = 0;
+      size_t actual_bytes_toread = 0;
+      size_t bytes_requested = left;
+
+      if (!read_ahead_ && random_access_max_buffer_size_ == 0) {
+        CalculateReadParameters(offset, bytes_requested, actual_bytes_toread,
+                                first_page_start);
+
+        assert(actual_bytes_toread > 0);
+
+        r = ReadIntoOneShotBuffer(offset, first_page_start,
+                                  actual_bytes_toread, left, dest);
+      } else {
+
+        std::unique_lock<std::mutex> lock(buffer_mut_);
+
+        // Let's see if at least some of the requested data is already
+        // in the buffer
+        if (offset >= buffered_start_ &&
           offset < (buffered_start_ + buffer_.CurrentSize())) {
-        size_t buffer_offset = offset - buffered_start_;
-        r = buffer_.Read(dest, buffer_offset, left);
-        assert(r >= 0);
+          size_t buffer_offset = offset - buffered_start_;
+          r = buffer_.Read(dest, buffer_offset, left);
+          assert(r >= 0);
 
-        left -= size_t(r);
-        offset += r;
-        dest += r;
-      }
-
-      // Still some left or none was buffered
-      if (left > 0) {
-        // Figure out the start/end offset for reading and amount to read
-        const size_t alignment = buffer_.Alignment();
-        const size_t first_page_start =
-            TruncateToPageBoundary(alignment, offset);
-
-        size_t bytes_requested = left;
-        if (read_ahead_ && bytes_requested < compaction_readahead_size_) {
-          bytes_requested = compaction_readahead_size_;
+          left -= size_t(r);
+          offset += r;
+          dest += r;
         }
 
-        const size_t last_page_start =
-            TruncateToPageBoundary(alignment, offset + bytes_requested - 1);
-        const size_t actual_bytes_toread =
-            (last_page_start - first_page_start) + alignment;
+        // Still some left or none was buffered
+        if (left > 0) {
+          // Figure out the start/end offset for reading and amount to read
+          bytes_requested = left;
 
-        if (buffer_.Capacity() < actual_bytes_toread) {
-          // If we are in read-ahead mode or the requested size
-          // exceeds max buffer size then use one-shot
-          // big buffer otherwise reallocate main buffer
-          if (read_ahead_ ||
-              (actual_bytes_toread > random_access_max_buffer_size_)) {
-            // Unlock the mutex since we are not using instance buffer
-            lock.unlock();
-            r = ReadIntoOneShotBuffer(offset, first_page_start,
-                                      actual_bytes_toread, left, dest);
-          } else {
-            buffer_.AllocateNewBuffer(actual_bytes_toread);
-            r = ReadIntoInstanceBuffer(offset, first_page_start,
-                                       actual_bytes_toread, left, dest);
+          if (read_ahead_ && bytes_requested < compaction_readahead_size_) {
+            bytes_requested = compaction_readahead_size_;
+          }
+
+          CalculateReadParameters(offset, bytes_requested, actual_bytes_toread,
+                                  first_page_start);
+
+          assert(actual_bytes_toread > 0);
+
+          if (buffer_.Capacity() < actual_bytes_toread) {
+            // If we are in read-ahead mode or the requested size
+            // exceeds max buffer size then use one-shot
+            // big buffer otherwise reallocate main buffer
+            if (read_ahead_ ||
+                (actual_bytes_toread > random_access_max_buffer_size_)) {
+              // Unlock the mutex since we are not using instance buffer
+              lock.unlock();
+              r = ReadIntoOneShotBuffer(offset, first_page_start,
+                                        actual_bytes_toread, left, dest);
+            }
+            else {
+              buffer_.AllocateNewBuffer(actual_bytes_toread);
+              r = ReadIntoInstanceBuffer(offset, first_page_start,
+                                         actual_bytes_toread, left, dest);
+            }
+          }
+          else {
+            buffer_.Clear();
+            r = ReadIntoInstanceBuffer(offset, first_page_start,
+                                       actual_bytes_toread, left, dest);
           }
-        } else {
-          buffer_.Clear();
-          r = ReadIntoInstanceBuffer(offset, first_page_start,
-                                     actual_bytes_toread, left, dest);
         }
       }
     } else {