Fix multiple issues with WinMmapFile fo sequential writing (#1108)

make preallocation inline with other writable files
  make sure that we map no more than pre-allocated size.
This commit is contained in:
Dmitri Smirnov 2016-04-29 16:43:13 -07:00 committed by Yi Wu
parent 1d2e4ef747
commit 5407b7c6d9

View File

@ -262,8 +262,11 @@ class WinMmapFile : public WritableFile {
// page size or SSD page size
const size_t
allocation_granularity_; // View must start at such a granularity
size_t mapping_size_; // We want file mapping to be of a specific size
// because then the file is expandable
size_t reserved_size_; // Preallocated size
size_t mapping_size_; // The max size of the mapping object
// we want to guess the final file size to minimize the remapping
size_t view_size_; // How much memory to map into a view at a time
char* mapped_begin_; // Must begin at the file offset that is aligned with
@ -283,15 +286,6 @@ class WinMmapFile : public WritableFile {
return ftruncate(filename_, hFile_, toSize);
}
// Can only truncate or reserve to a sector size aligned if
// used on files that are opened with Unbuffered I/O
// Normally it does not present a problem since in memory mapped files
// we do not disable buffering
Status ReserveFileSpace(uint64_t toSize) {
IOSTATS_TIMER_GUARD(allocate_nanos);
return fallocate(filename_, hFile_, toSize);
}
Status UnmapCurrentRegion() {
Status status;
@ -301,82 +295,57 @@ class WinMmapFile : public WritableFile {
"Failed to unmap file view: " + filename_, GetLastError());
}
// UnmapView automatically sends data to disk but not the metadata
// which is good and provides some equivalent of fdatasync() on Linux
// therefore, we donot need separate flag for metadata
pending_sync_ = false;
mapped_begin_ = nullptr;
mapped_end_ = nullptr;
dst_ = nullptr;
last_sync_ = nullptr;
// Move on to the next portion of the file
file_offset_ += view_size_;
// Increase the amount we map the next time, but capped at 1MB
view_size_ *= 2;
view_size_ = std::min(view_size_, c_OneMB);
// UnmapView automatically sends data to disk but not the metadata
// which is good and provides some equivalent of fdatasync() on Linux
// therefore, we donot need separate flag for metadata
mapped_begin_ = nullptr;
mapped_end_ = nullptr;
dst_ = nullptr;
last_sync_ = nullptr;
pending_sync_ = false;
}
return status;
}
Status MapNewRegion() {
Status status;
assert(mapped_begin_ == nullptr);
size_t minMappingSize = file_offset_ + view_size_;
size_t minDiskSize = file_offset_ + view_size_;
// Check if we need to create a new mapping since we want to write beyond
// the current one
// If the mapping view is now too short
// CreateFileMapping will extend the size of the file automatically if the
// mapping size is greater than
// the current length of the file, which reserves the space and makes
// writing faster, except, windows can not map an empty file.
// Thus the first time around we must actually extend the file ourselves
if (hMap_ == NULL || minMappingSize > mapping_size_) {
if (NULL == hMap_) {
// Creating mapping for the first time so reserve the space on disk
status = ReserveFileSpace(minMappingSize);
if (!status.ok()) {
return status;
}
if (minDiskSize > reserved_size_) {
status = Allocate(file_offset_, view_size_);
if (!status.ok()) {
return status;
}
}
if (hMap_) {
// Need to remap
if (hMap_ == NULL || reserved_size_ > mapping_size_) {
if (hMap_ != NULL) {
// Unmap the previous one
BOOL ret = ::CloseHandle(hMap_);
assert(ret);
hMap_ = NULL;
}
// Calculate the new mapping size which will hopefully reserve space for
// several consecutive sliding views
// Query preallocation block size if set
size_t preallocationBlockSize = 0;
size_t lastAllocatedBlockSize = 0; // Not used
GetPreallocationStatus(&preallocationBlockSize, &lastAllocatedBlockSize);
if (preallocationBlockSize) {
preallocationBlockSize =
Roundup(preallocationBlockSize, allocation_granularity_);
} else {
preallocationBlockSize = 2 * view_size_;
}
mapping_size_ += preallocationBlockSize;
ULARGE_INTEGER mappingSize;
mappingSize.QuadPart = mapping_size_;
mappingSize.QuadPart = reserved_size_;
hMap_ = CreateFileMappingA(
hFile_,
NULL, // Security attributes
PAGE_READWRITE, // There is not a write only mode for mapping
mappingSize.HighPart, // Enable mapping the whole file but the actual
// amount mapped is determined by MapViewOfFile
// amount mapped is determined by MapViewOfFile
mappingSize.LowPart,
NULL); // Mapping name
@ -385,6 +354,8 @@ class WinMmapFile : public WritableFile {
"WindowsMmapFile failed to create file mapping for: " + filename_,
GetLastError());
}
mapping_size_ = reserved_size_;
}
ULARGE_INTEGER offset;
@ -416,6 +387,7 @@ class WinMmapFile : public WritableFile {
hMap_(NULL),
page_size_(page_size),
allocation_granularity_(allocation_granularity),
reserved_size_(0),
mapping_size_(0),
view_size_(0),
mapped_begin_(nullptr),
@ -435,25 +407,10 @@ class WinMmapFile : public WritableFile {
// Only for memory mapped writes
assert(options.use_mmap_writes);
// Make sure buffering is not disabled. It is ignored for mapping
// purposes but also imposes restriction on moving file position
// it is not a problem so much with reserving space since it is probably a
// factor
// of allocation_granularity but we also want to truncate the file in
// Close() at
// arbitrary position so we do not have to feel this with zeros.
assert(options.use_os_buffer);
// View size must be both the multiple of allocation_granularity AND the
// page size
if ((allocation_granularity_ % page_size_) == 0) {
view_size_ = 2 * allocation_granularity;
} else if ((page_size_ % allocation_granularity_) == 0) {
view_size_ = 2 * page_size_;
} else {
// we can multiply them together
assert(false);
}
// page size and the granularity is usually a multiple of a page size.
const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
view_size_ = Roundup(viewSize, allocation_granularity_);
}
~WinMmapFile() {
@ -479,14 +436,20 @@ class WinMmapFile : public WritableFile {
if (!s.ok()) {
return s;
}
} else {
size_t n = std::min(left, avail);
memcpy(dst_, src, n);
dst_ += n;
src += n;
left -= n;
pending_sync_ = true;
}
}
size_t n = std::min(left, avail);
memcpy(dst_, src, n);
dst_ += n;
src += n;
left -= n;
pending_sync_ = true;
// Now make sure that the last partial page is padded with zeros if needed
size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
if (bytesToPad > 0) {
memset(dst_, 0, bytesToPad);
}
return Status::OK();
@ -508,7 +471,13 @@ class WinMmapFile : public WritableFile {
// which we use does not write zeros and it is good.
uint64_t targetSize = GetFileSize();
s = UnmapCurrentRegion();
if (mapped_begin_ != nullptr) {
// Sync before unmapping to make sure everything
// is on disk and there is not a lazy writing
// so we are deterministic with the tests
Sync();
s = UnmapCurrentRegion();
}
if (NULL != hMap_) {
BOOL ret = ::CloseHandle(hMap_);
@ -521,15 +490,18 @@ class WinMmapFile : public WritableFile {
hMap_ = NULL;
}
TruncateFile(targetSize);
if (hFile_ != NULL) {
BOOL ret = ::CloseHandle(hFile_);
hFile_ = NULL;
TruncateFile(targetSize);
if (!ret && s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError(
"Failed to close file map handle: " + filename_, lastError);
BOOL ret = ::CloseHandle(hFile_);
hFile_ = NULL;
if (!ret && s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError(
"Failed to close file map handle: " + filename_, lastError);
}
}
return s;
@ -542,7 +514,7 @@ class WinMmapFile : public WritableFile {
Status s;
// Some writes occurred since last sync
if (pending_sync_) {
if (dst_ > last_sync_) {
assert(mapped_begin_);
assert(dst_);
assert(dst_ > mapped_begin_);
@ -552,16 +524,15 @@ class WinMmapFile : public WritableFile {
TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
size_t page_end =
TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
last_sync_ = dst_;
// Flush only the amount of that is a multiple of pages
if (!::FlushViewOfFile(mapped_begin_ + page_begin,
(page_end - page_begin) + page_size_)) {
(page_end - page_begin) + page_size_)) {
s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
GetLastError());
} else {
last_sync_ = dst_;
}
pending_sync_ = false;
}
return s;
@ -571,19 +542,15 @@ class WinMmapFile : public WritableFile {
* Flush data as well as metadata to stable storage.
*/
virtual Status Fsync() override {
Status s;
// Flush metadata if pending
const bool pending = pending_sync_;
s = Sync();
Status s = Sync();
// Flush metadata
if (s.ok() && pending) {
if (s.ok() && pending_sync_) {
if (!::FlushFileBuffers(hFile_)) {
s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
GetLastError());
}
pending_sync_ = false;
}
return s;
@ -604,7 +571,24 @@ class WinMmapFile : public WritableFile {
}
virtual Status Allocate(uint64_t offset, uint64_t len) override {
return Status::OK();
Status status;
TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
// Make sure that we reserve an aligned amount of space
// since the reservation block size is driven outside so we want
// to check if we are ok with reservation here
size_t spaceToReserve = Roundup(offset + len, view_size_);
// Nothing to do
if (spaceToReserve <= reserved_size_) {
return status;
}
IOSTATS_TIMER_GUARD(allocate_nanos);
status = fallocate(filename_, hFile_, spaceToReserve);
if (status.ok()) {
reserved_size_ = spaceToReserve;
}
return status;
}
};