Fix multiple issues with WinMmapFile for sequential writing (#1108)

Make preallocation in line with other writable files.
  Make sure that we map no more than the pre-allocated size.
This commit is contained in:
Dmitri Smirnov 2016-04-29 16:43:13 -07:00 committed by Yi Wu
parent 1d2e4ef747
commit 5407b7c6d9

View File

@ -262,8 +262,11 @@ class WinMmapFile : public WritableFile {
// page size or SSD page size // page size or SSD page size
const size_t const size_t
allocation_granularity_; // View must start at such a granularity allocation_granularity_; // View must start at such a granularity
size_t mapping_size_; // We want file mapping to be of a specific size
// because then the file is expandable size_t reserved_size_; // Preallocated size
size_t mapping_size_; // The max size of the mapping object
// we want to guess the final file size to minimize the remapping
size_t view_size_; // How much memory to map into a view at a time size_t view_size_; // How much memory to map into a view at a time
char* mapped_begin_; // Must begin at the file offset that is aligned with char* mapped_begin_; // Must begin at the file offset that is aligned with
@ -283,15 +286,6 @@ class WinMmapFile : public WritableFile {
return ftruncate(filename_, hFile_, toSize); return ftruncate(filename_, hFile_, toSize);
} }
// Can only truncate or reserve to a sector size aligned if
// used on files that are opened with Unbuffered I/O
// Normally it does not present a problem since in memory mapped files
// we do not disable buffering
Status ReserveFileSpace(uint64_t toSize) {
IOSTATS_TIMER_GUARD(allocate_nanos);
return fallocate(filename_, hFile_, toSize);
}
Status UnmapCurrentRegion() { Status UnmapCurrentRegion() {
Status status; Status status;
@ -301,82 +295,57 @@ class WinMmapFile : public WritableFile {
"Failed to unmap file view: " + filename_, GetLastError()); "Failed to unmap file view: " + filename_, GetLastError());
} }
// UnmapView automatically sends data to disk but not the metadata
// which is good and provides some equivalent of fdatasync() on Linux
// therefore, we donot need separate flag for metadata
pending_sync_ = false;
mapped_begin_ = nullptr;
mapped_end_ = nullptr;
dst_ = nullptr;
last_sync_ = nullptr;
// Move on to the next portion of the file // Move on to the next portion of the file
file_offset_ += view_size_; file_offset_ += view_size_;
// Increase the amount we map the next time, but capped at 1MB // UnmapView automatically sends data to disk but not the metadata
view_size_ *= 2; // which is good and provides some equivalent of fdatasync() on Linux
view_size_ = std::min(view_size_, c_OneMB); // therefore, we donot need separate flag for metadata
mapped_begin_ = nullptr;
mapped_end_ = nullptr;
dst_ = nullptr;
last_sync_ = nullptr;
pending_sync_ = false;
} }
return status; return status;
} }
Status MapNewRegion() { Status MapNewRegion() {
Status status; Status status;
assert(mapped_begin_ == nullptr); assert(mapped_begin_ == nullptr);
size_t minMappingSize = file_offset_ + view_size_; size_t minDiskSize = file_offset_ + view_size_;
// Check if we need to create a new mapping since we want to write beyond if (minDiskSize > reserved_size_) {
// the current one status = Allocate(file_offset_, view_size_);
// If the mapping view is now too short if (!status.ok()) {
// CreateFileMapping will extend the size of the file automatically if the return status;
// mapping size is greater than
// the current length of the file, which reserves the space and makes
// writing faster, except, windows can not map an empty file.
// Thus the first time around we must actually extend the file ourselves
if (hMap_ == NULL || minMappingSize > mapping_size_) {
if (NULL == hMap_) {
// Creating mapping for the first time so reserve the space on disk
status = ReserveFileSpace(minMappingSize);
if (!status.ok()) {
return status;
}
} }
}
if (hMap_) { // Need to remap
if (hMap_ == NULL || reserved_size_ > mapping_size_) {
if (hMap_ != NULL) {
// Unmap the previous one // Unmap the previous one
BOOL ret = ::CloseHandle(hMap_); BOOL ret = ::CloseHandle(hMap_);
assert(ret); assert(ret);
hMap_ = NULL; hMap_ = NULL;
} }
// Calculate the new mapping size which will hopefully reserve space for
// several consecutive sliding views
// Query preallocation block size if set
size_t preallocationBlockSize = 0;
size_t lastAllocatedBlockSize = 0; // Not used
GetPreallocationStatus(&preallocationBlockSize, &lastAllocatedBlockSize);
if (preallocationBlockSize) {
preallocationBlockSize =
Roundup(preallocationBlockSize, allocation_granularity_);
} else {
preallocationBlockSize = 2 * view_size_;
}
mapping_size_ += preallocationBlockSize;
ULARGE_INTEGER mappingSize; ULARGE_INTEGER mappingSize;
mappingSize.QuadPart = mapping_size_; mappingSize.QuadPart = reserved_size_;
hMap_ = CreateFileMappingA( hMap_ = CreateFileMappingA(
hFile_, hFile_,
NULL, // Security attributes NULL, // Security attributes
PAGE_READWRITE, // There is not a write only mode for mapping PAGE_READWRITE, // There is not a write only mode for mapping
mappingSize.HighPart, // Enable mapping the whole file but the actual mappingSize.HighPart, // Enable mapping the whole file but the actual
// amount mapped is determined by MapViewOfFile // amount mapped is determined by MapViewOfFile
mappingSize.LowPart, mappingSize.LowPart,
NULL); // Mapping name NULL); // Mapping name
@ -385,6 +354,8 @@ class WinMmapFile : public WritableFile {
"WindowsMmapFile failed to create file mapping for: " + filename_, "WindowsMmapFile failed to create file mapping for: " + filename_,
GetLastError()); GetLastError());
} }
mapping_size_ = reserved_size_;
} }
ULARGE_INTEGER offset; ULARGE_INTEGER offset;
@ -416,6 +387,7 @@ class WinMmapFile : public WritableFile {
hMap_(NULL), hMap_(NULL),
page_size_(page_size), page_size_(page_size),
allocation_granularity_(allocation_granularity), allocation_granularity_(allocation_granularity),
reserved_size_(0),
mapping_size_(0), mapping_size_(0),
view_size_(0), view_size_(0),
mapped_begin_(nullptr), mapped_begin_(nullptr),
@ -435,25 +407,10 @@ class WinMmapFile : public WritableFile {
// Only for memory mapped writes // Only for memory mapped writes
assert(options.use_mmap_writes); assert(options.use_mmap_writes);
// Make sure buffering is not disabled. It is ignored for mapping
// purposes but also imposes restriction on moving file position
// it is not a problem so much with reserving space since it is probably a
// factor
// of allocation_granularity but we also want to truncate the file in
// Close() at
// arbitrary position so we do not have to feel this with zeros.
assert(options.use_os_buffer);
// View size must be both the multiple of allocation_granularity AND the // View size must be both the multiple of allocation_granularity AND the
// page size // page size and the granularity is usually a multiple of a page size.
if ((allocation_granularity_ % page_size_) == 0) { const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
view_size_ = 2 * allocation_granularity; view_size_ = Roundup(viewSize, allocation_granularity_);
} else if ((page_size_ % allocation_granularity_) == 0) {
view_size_ = 2 * page_size_;
} else {
// we can multiply them together
assert(false);
}
} }
~WinMmapFile() { ~WinMmapFile() {
@ -479,14 +436,20 @@ class WinMmapFile : public WritableFile {
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
} else {
size_t n = std::min(left, avail);
memcpy(dst_, src, n);
dst_ += n;
src += n;
left -= n;
pending_sync_ = true;
} }
}
size_t n = std::min(left, avail); // Now make sure that the last partial page is padded with zeros if needed
memcpy(dst_, src, n); size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
dst_ += n; if (bytesToPad > 0) {
src += n; memset(dst_, 0, bytesToPad);
left -= n;
pending_sync_ = true;
} }
return Status::OK(); return Status::OK();
@ -508,7 +471,13 @@ class WinMmapFile : public WritableFile {
// which we use does not write zeros and it is good. // which we use does not write zeros and it is good.
uint64_t targetSize = GetFileSize(); uint64_t targetSize = GetFileSize();
s = UnmapCurrentRegion(); if (mapped_begin_ != nullptr) {
// Sync before unmapping to make sure everything
// is on disk and there is not a lazy writing
// so we are deterministic with the tests
Sync();
s = UnmapCurrentRegion();
}
if (NULL != hMap_) { if (NULL != hMap_) {
BOOL ret = ::CloseHandle(hMap_); BOOL ret = ::CloseHandle(hMap_);
@ -521,15 +490,18 @@ class WinMmapFile : public WritableFile {
hMap_ = NULL; hMap_ = NULL;
} }
TruncateFile(targetSize); if (hFile_ != NULL) {
BOOL ret = ::CloseHandle(hFile_); TruncateFile(targetSize);
hFile_ = NULL;
if (!ret && s.ok()) { BOOL ret = ::CloseHandle(hFile_);
auto lastError = GetLastError(); hFile_ = NULL;
s = IOErrorFromWindowsError(
"Failed to close file map handle: " + filename_, lastError); if (!ret && s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError(
"Failed to close file map handle: " + filename_, lastError);
}
} }
return s; return s;
@ -542,7 +514,7 @@ class WinMmapFile : public WritableFile {
Status s; Status s;
// Some writes occurred since last sync // Some writes occurred since last sync
if (pending_sync_) { if (dst_ > last_sync_) {
assert(mapped_begin_); assert(mapped_begin_);
assert(dst_); assert(dst_);
assert(dst_ > mapped_begin_); assert(dst_ > mapped_begin_);
@ -552,16 +524,15 @@ class WinMmapFile : public WritableFile {
TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
size_t page_end = size_t page_end =
TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
last_sync_ = dst_;
// Flush only the amount of that is a multiple of pages // Flush only the amount of that is a multiple of pages
if (!::FlushViewOfFile(mapped_begin_ + page_begin, if (!::FlushViewOfFile(mapped_begin_ + page_begin,
(page_end - page_begin) + page_size_)) { (page_end - page_begin) + page_size_)) {
s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
GetLastError()); GetLastError());
} else {
last_sync_ = dst_;
} }
pending_sync_ = false;
} }
return s; return s;
@ -571,19 +542,15 @@ class WinMmapFile : public WritableFile {
* Flush data as well as metadata to stable storage. * Flush data as well as metadata to stable storage.
*/ */
virtual Status Fsync() override { virtual Status Fsync() override {
Status s; Status s = Sync();
// Flush metadata if pending
const bool pending = pending_sync_;
s = Sync();
// Flush metadata // Flush metadata
if (s.ok() && pending) { if (s.ok() && pending_sync_) {
if (!::FlushFileBuffers(hFile_)) { if (!::FlushFileBuffers(hFile_)) {
s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
GetLastError()); GetLastError());
} }
pending_sync_ = false;
} }
return s; return s;
@ -604,7 +571,24 @@ class WinMmapFile : public WritableFile {
} }
virtual Status Allocate(uint64_t offset, uint64_t len) override { virtual Status Allocate(uint64_t offset, uint64_t len) override {
return Status::OK(); Status status;
TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
// Make sure that we reserve an aligned amount of space
// since the reservation block size is driven outside so we want
// to check if we are ok with reservation here
size_t spaceToReserve = Roundup(offset + len, view_size_);
// Nothing to do
if (spaceToReserve <= reserved_size_) {
return status;
}
IOSTATS_TIMER_GUARD(allocate_nanos);
status = fallocate(filename_, hFile_, spaceToReserve);
if (status.ok()) {
reserved_size_ = spaceToReserve;
}
return status;
} }
}; };