rocksdb/port/win/env_win.cc

//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <algorithm>
#include <climits>
#include <ctime>
#include <deque>
#include <thread>

#include <errno.h>
#include <process.h>
#include <io.h>
#include <direct.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "rocksdb/env.h"
#include "rocksdb/slice.h"

#include "port/port.h"
#include "port/dirent.h"
#include "port/win/win_logger.h"

#include "util/random.h"
#include "util/iostats_context_imp.h"
#include "util/rate_limiter.h"
#include "util/sync_point.h"

#include "util/thread_status_updater.h"
#include "util/thread_status_util.h"

#include <Rpc.h>  // For UUID generation
#include <Windows.h>

namespace rocksdb {

// Translates a Windows error code (such as the value of GetLastError()) into
// the message text supplied by the system.
// If the system cannot format a message for the code, a generic string that
// contains the numeric code is returned instead.
std::string GetWindowsErrSz(DWORD err) {
  // Must start out null: FormatMessageA() leaves it untouched on failure.
  LPSTR lpMsgBuf = nullptr;

  // With FORMAT_MESSAGE_ALLOCATE_BUFFER the buffer argument is really the
  // address of a pointer that receives the allocated message, hence the cast.
  const DWORD length = FormatMessageA(
      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
          FORMAT_MESSAGE_IGNORE_INSERTS,
      NULL, err,
      0,  // Default language
      reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);

  std::string Err;
  if (length != 0 && lpMsgBuf != nullptr) {
    Err.assign(lpMsgBuf, length);
  } else {
    Err = "Unknown Windows error: " + std::to_string(err);
  }

  if (lpMsgBuf != nullptr) {
    LocalFree(lpMsgBuf);
  }
  return Err;
}

namespace {

// One mebibyte (2^20 bytes); used in this file as an upper bound when growing
// views and buffers.
const size_t c_OneMB = static_cast<size_t>(1) << 20;

// Allocates a new ThreadStatusUpdater with `new` and returns the raw pointer.
// Nothing in this function releases it, so the caller is responsible for the
// object's lifetime.
ThreadStatusUpdater* CreateThreadStatusUpdater() {
  return new ThreadStatusUpdater();
}

// Placeholder for fadvise on this port: every argument is ignored and the
// call always reports success by returning 0.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
  return 0;  // simply do nothing.
}

// Builds an IOError status whose first part is `context` and whose second
// part is the system message text for the Windows error code `err`.
inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) {
  const std::string message = GetWindowsErrSz(err);
  return Status::IOError(context, message);
}

// Same as IOErrorFromWindowsError(), using the calling thread's current
// GetLastError() value as the error code.
inline Status IOErrorFromLastWindowsError(const std::string& context) {
  const DWORD last_error = GetLastError();
  return IOErrorFromWindowsError(context, last_error);
}

// Builds an IOError status from `context` and the strerror() text that
// corresponds to the errno-style code `err_number`.
inline Status IOError(const std::string& context, int err_number) {
  const char* description = strerror(err_number);
  return Status::IOError(context, description);
}

// TODO(sdong): temp logging. Need to help debugging. Remove it when
// the feature is proved to be stable.
// Writes a single line to stdout containing the two ids. "%Iu" is the
// Microsoft CRT conversion for a size_t argument.
inline void PrintThreadInfo(size_t thread_id, size_t terminatingId) {
  fprintf(stdout, "Bg thread %Iu terminates %Iu\n", thread_id, terminatingId);
}

// Returns the ID of the current process, as reported by the CRT's _getpid().
inline int current_process_id() { return _getpid(); }

// Scoped ownership of a HANDLE: a unique_ptr whose deleter passes the handle
// to ::CloseHandle() (the result of the close is ignored).
const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
using UniqueCloseHandlePtr = std::unique_ptr<void, decltype(CloseHandleFunc)>;

// Positional write, named after the POSIX call it stands in for.
//
// Writes up to numBytes bytes from src to hFile starting at the absolute file
// position `offset`, which is passed to WriteFile() through an OVERLAPPED
// structure. Unlike POSIX pwrite(), WriteFile() does not restore the file
// pointer afterwards, so the pointer ends up just past the written range.
// That is fine for writes because they are (should be) sequential, and since
// every read/write in this file names its offset explicitly, callers should
// not rely on the current file position.
//
// Returns the number of bytes written, or -1 on failure (GetLastError() holds
// the reason). The count may be smaller than numBytes: WriteFile() accepts
// only a 32-bit length, so a single call is capped at 1 GiB rather than being
// silently truncated modulo 2^32. The cap is a power of two so that a capped
// request keeps any power-of-two sector alignment required by unbuffered I/O.
SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes,
               uint64_t offset) {
  const size_t kMaxTransfer = static_cast<size_t>(1) << 30;

  OVERLAPPED overlapped = {0};
  ULARGE_INTEGER offsetUnion;
  offsetUnion.QuadPart = offset;

  overlapped.Offset = offsetUnion.LowPart;
  overlapped.OffsetHigh = offsetUnion.HighPart;

  const DWORD bytesToWrite =
      static_cast<DWORD>(std::min(numBytes, kMaxTransfer));
  DWORD bytesWritten = 0;

  if (FALSE ==
      WriteFile(hFile, src, bytesToWrite, &bytesWritten, &overlapped)) {
    return -1;
  }

  return static_cast<SSIZE_T>(bytesWritten);
}

// Positional read, named after the POSIX call it stands in for.
//
// Reads up to numBytes bytes from hFile into src, starting at the absolute
// file position `offset` (passed to ReadFile() through an OVERLAPPED
// structure). As with pwrite() in this file, the file pointer is left just
// past the range that was read instead of being restored.
//
// Returns the number of bytes read, 0 when the offset is at or beyond the end
// of the file, or -1 on failure (GetLastError() holds the reason). The count
// may be smaller than numBytes: ReadFile() accepts only a 32-bit length, so a
// single call is capped at 1 GiB (a power of two, which keeps sector
// alignment for unbuffered handles) rather than being silently truncated.
SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset) {
  const size_t kMaxTransfer = static_cast<size_t>(1) << 30;

  OVERLAPPED overlapped = {0};
  ULARGE_INTEGER offsetUnion;
  offsetUnion.QuadPart = offset;

  overlapped.Offset = offsetUnion.LowPart;
  overlapped.OffsetHigh = offsetUnion.HighPart;

  const DWORD bytesToRead =
      static_cast<DWORD>(std::min(numBytes, kMaxTransfer));
  DWORD bytesRead = 0;

  if (FALSE == ReadFile(hFile, src, bytesToRead, &bytesRead, &overlapped)) {
    // Reading at or past the end of the file is not an error for a
    // pread-style interface: report it as "zero bytes read".
    if (GetLastError() == ERROR_HANDLE_EOF) {
      return 0;
    }
    return -1;
  }

  return static_cast<SSIZE_T>(bytesRead);
}

// Flushes the buffers of hFile via FlushFileBuffers().
// Returns 0 on success and -1 on failure. errno is deliberately not set: this
// helper is only used within this file on Windows handles, and callers
// consult GetLastError() rather than a translated errno.
inline int fsync(HANDLE hFile) {
  return FlushFileBuffers(hFile) ? 0 : -1;
}

// Rounds s down to a multiple of page_size. The masking only yields a
// multiple when page_size is a power of two; the assert checks the result.
inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
  const size_t remainder = s & (page_size - 1);
  const size_t truncated = s - remainder;
  assert((truncated % page_size) == 0);
  return truncated;
}

// Rounds x up to the nearest multiple of y (x itself if already a multiple).
inline size_t Roundup(size_t x, size_t y) {
  const size_t blocks = (x + y - 1) / y;
  return blocks * y;
}

// Pre-allocates disk space for hFile up to to_size bytes by setting the
// file's allocation size with SetFileInformationByHandle().
// This is a fast pre-allocation. However, it does not change the file end
// position unless the file is truncated, and the pre-allocated space is not
// considered filled with zeros.
// `filename` is used only in the error message. Returns OK on success and an
// IOError carrying the Windows error text otherwise.
inline Status fallocate(const std::string& filename, HANDLE hFile,
                        uint64_t to_size) {
  FILE_ALLOCATION_INFO info;
  info.AllocationSize.QuadPart = to_size;

  const BOOL succeeded = SetFileInformationByHandle(
      hFile, FileAllocationInfo, &info, sizeof(FILE_ALLOCATION_INFO));
  if (succeeded) {
    return Status();
  }

  // Capture the error code before building the message string.
  const auto errorCode = GetLastError();
  return IOErrorFromWindowsError("Failed to pre-allocate space: " + filename,
                                 errorCode);
}

// Sets the end-of-file position of hFile to toSize bytes using
// SetFileInformationByHandle(). `filename` is used only in the error message.
// Returns OK on success and an IOError carrying the Windows error text
// otherwise.
inline Status ftruncate(const std::string& filename, HANDLE hFile,
                        uint64_t toSize) {
  FILE_END_OF_FILE_INFO info;
  info.EndOfFile.QuadPart = toSize;

  const BOOL succeeded = SetFileInformationByHandle(
      hFile, FileEndOfFileInfo, &info, sizeof(FILE_END_OF_FILE_INFO));
  if (succeeded) {
    return Status();
  }

  // Capture the error code before building the message string.
  const auto errorCode = GetLastError();
  return IOErrorFromWindowsError("Failed to Set end of file: " + filename,
                                 errorCode);
}

// Random-access file backed by a read-only memory mapping.
//
// The object takes ownership of the file handle, the mapping handle and the
// mapped view it is constructed with, and releases all three in its
// destructor. Reads hand out pointers directly into the mapped view; nothing
// is copied into the caller's scratch buffer.
class WinMmapReadableFile : public RandomAccessFile {
  const std::string fileName_;
  HANDLE hFile_;
  HANDLE hMap_;

  const void* mapped_region_;
  const size_t length_;

 public:
  // mapped_region_[0,length-1] contains the mmapped contents of the file.
  WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap,
                      const void* mapped_region, size_t length)
      : fileName_(fileName),
        hFile_(hFile),
        hMap_(hMap),
        mapped_region_(mapped_region),
        length_(length) {}

  // Releases the view first, then the mapping object, then the file handle.
  // Failures are only detected through assert() in debug builds.
  ~WinMmapReadableFile() {
    BOOL ret = ::UnmapViewOfFile(mapped_region_);
    assert(ret);

    ret = ::CloseHandle(hMap_);
    assert(ret);

    ret = ::CloseHandle(hFile_);
    assert(ret);
  }

  // Points *result at n bytes of the mapping starting at `offset`.
  // `scratch` is not used. If the requested range does not lie entirely
  // inside the mapping, *result is set to an empty slice and an IOError
  // (EINVAL) is returned; there are no partial reads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const override {
    Status s;

    // Written so that it cannot wrap around: "offset + n > length_" would
    // overflow for very large arguments and let an out-of-range read through.
    if (offset > length_ || n > length_ - offset) {
      *result = Slice();
      s = IOError(fileName_, EINVAL);
    } else {
      *result =
          Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
    }
    return s;
  }

  // No-op: always returns OK.
  virtual Status InvalidateCache(size_t offset, size_t length) override {
    return Status::OK();
  }
};

// Writable file implemented on top of a sliding memory-mapped view.
//
// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file.  This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes.
//
// Only one view is mapped at a time. When it fills up, it is unmapped and the
// next view is mapped right after it; the file mapping object is recreated
// with a larger size whenever the next view would not fit. Close() truncates
// the file to the number of bytes actually appended.
class WinMmapFile : public WritableFile {
 private:
  const std::string filename_;
  HANDLE hFile_;
  HANDLE hMap_;

  const size_t page_size_;  // We flush the mapping view in page_size
                            // increments. We may decide if this is a memory
                            // page size or SSD page size
  const size_t
      allocation_granularity_;  // View must start at such a granularity
  size_t mapping_size_;         // We want file mapping to be of a specific size
                                // because then the file is expandable
  size_t view_size_;            // How much memory to map into a view at a time

  char* mapped_begin_;  // Must begin at the file offset that is aligned with
                        // allocation_granularity_
  char* mapped_end_;
  char* dst_;  // Where to write next  (in range [mapped_begin_,mapped_end_])
  char* last_sync_;  // Where have we synced up to

  uint64_t file_offset_;  // Offset of mapped_begin_ in file

  // Do we have unsynced writes?
  bool pending_sync_;

  // Sets the end of the file to toSize.
  // Can only truncate or reserve to a sector size aligned if
  // used on files that are opened with Unbuffered I/O
  Status TruncateFile(uint64_t toSize) {
    return ftruncate(filename_, hFile_, toSize);
  }

  // Pre-allocates disk space up to toSize.
  // Can only truncate or reserve to a sector size aligned if
  // used on files that are opened with Unbuffered I/O
  // Normally it does not present a problem since in memory mapped files
  // we do not disable buffering
  Status ReserveFileSpace(uint64_t toSize) {
    IOSTATS_TIMER_GUARD(allocate_nanos);
    return fallocate(filename_, hFile_, toSize);
  }

  // Unmaps the current view (if there is one), advances file_offset_ past it
  // and doubles the size used for the next view, capped at 1MB. The
  // bookkeeping is updated even if the unmap call itself reports a failure.
  Status UnmapCurrentRegion() {
    Status status;

    if (mapped_begin_ != nullptr) {
      if (!::UnmapViewOfFile(mapped_begin_)) {
        status = IOErrorFromWindowsError(
            "Failed to unmap file view: " + filename_, GetLastError());
      }

      // UnmapView automatically sends data to disk but not the metadata
      // which is good and provides some equivalent of fdatasync() on Linux
      // therefore, we do not need a separate flag for metadata
      pending_sync_ = false;
      mapped_begin_ = nullptr;
      mapped_end_ = nullptr;
      dst_ = nullptr;
      last_sync_ = nullptr;

      // Move on to the next portion of the file
      file_offset_ += view_size_;

      // Increase the amount we map the next time, but capped at 1MB
      view_size_ *= 2;
      view_size_ = std::min(view_size_, c_OneMB);
    }

    return status;
  }

  // Maps a view of view_size_ bytes at file_offset_, first (re)creating the
  // file mapping object if it does not exist yet or is too small to contain
  // that view. Must only be called when no view is currently mapped.
  Status MapNewRegion() {
    Status status;

    assert(mapped_begin_ == nullptr);

    // Kept as a 64-bit value: file_offset_ is 64 bits wide even where size_t
    // is not.
    const uint64_t minMappingSize = file_offset_ + view_size_;

    // Check if we need to create a new mapping since we want to write beyond
    // the current one
    // If the mapping view is now too short
    // CreateFileMapping will extend the size of the file automatically if the
    // mapping size is greater than
    // the current length of the file, which reserves the space and makes
    // writing faster, except, windows can not map an empty file.
    // Thus the first time around we must actually extend the file ourselves
    if (hMap_ == NULL || minMappingSize > mapping_size_) {
      if (NULL == hMap_) {
        // Creating mapping for the first time so reserve the space on disk
        status = ReserveFileSpace(minMappingSize);
        if (!status.ok()) {
          return status;
        }
      }

      if (hMap_) {
        // Close the previous mapping object
        BOOL ret = ::CloseHandle(hMap_);
        assert(ret);
        hMap_ = NULL;
      }

      // Calculate the new mapping size which will hopefully reserve space for
      // several consecutive sliding views
      // Query preallocation block size if set
      size_t preallocationBlockSize = 0;
      size_t lastAllocatedBlockSize = 0;  // Not used
      GetPreallocationStatus(&preallocationBlockSize, &lastAllocatedBlockSize);

      if (preallocationBlockSize) {
        preallocationBlockSize =
            Roundup(preallocationBlockSize, allocation_granularity_);
      } else {
        preallocationBlockSize = 2 * view_size_;
      }

      mapping_size_ += preallocationBlockSize;

      ULARGE_INTEGER mappingSize;
      mappingSize.QuadPart = mapping_size_;

      hMap_ = CreateFileMappingA(
          hFile_,
          NULL,                  // Security attributes
          PAGE_READWRITE,        // There is not a write only mode for mapping
          mappingSize.HighPart,  // Enable mapping the whole file but the actual
                                 // amount mapped is determined by MapViewOfFile
          mappingSize.LowPart,
          NULL);  // Mapping name

      if (NULL == hMap_) {
        return IOErrorFromWindowsError(
            "WindowsMmapFile failed to create file mapping for: " + filename_,
            GetLastError());
      }
    }

    ULARGE_INTEGER offset;
    offset.QuadPart = file_offset_;

    // View must begin at the granularity aligned offset
    mapped_begin_ = reinterpret_cast<char*>(
        MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
                        view_size_, NULL));

    if (!mapped_begin_) {
      status = IOErrorFromWindowsError(
          "WindowsMmapFile failed to map file view: " + filename_,
          GetLastError());
    } else {
      mapped_end_ = mapped_begin_ + view_size_;
      dst_ = mapped_begin_;
      last_sync_ = mapped_begin_;
      pending_sync_ = false;
    }
    return status;
  }

 public:
  // Takes ownership of hFile. page_size and allocation_granularity must both
  // be non-zero powers of two; the initial view size is twice the larger of
  // them. No mapping is created here; that happens on the first Append().
  WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
              size_t allocation_granularity, const EnvOptions& options)
      : filename_(fname),
        hFile_(hFile),
        hMap_(NULL),
        page_size_(page_size),
        allocation_granularity_(allocation_granularity),
        mapping_size_(0),
        view_size_(0),
        mapped_begin_(nullptr),
        mapped_end_(nullptr),
        dst_(nullptr),
        last_sync_(nullptr),
        file_offset_(0),
        pending_sync_(false) {
    // Allocation granularity must be obtained from GetSystemInfo() and must be
    // a power of two.
    assert(allocation_granularity > 0);
    assert((allocation_granularity & (allocation_granularity - 1)) == 0);

    assert(page_size > 0);
    assert((page_size & (page_size - 1)) == 0);

    // Only for memory mapped writes
    assert(options.use_mmap_writes);

    // Make sure buffering is not disabled. It is ignored for mapping
    // purposes but also imposes restriction on moving file position
    // it is not a problem so much with reserving space since it is probably a
    // factor
    // of allocation_granularity but we also want to truncate the file in
    // Close() at
    // arbitrary position so we do not have to fill this with zeros.
    assert(options.use_os_buffer);

    // View size must be both the multiple of allocation_granularity AND the
    // page size
    if ((allocation_granularity_ % page_size_) == 0) {
      view_size_ = 2 * allocation_granularity;
    } else if ((page_size_ % allocation_granularity_) == 0) {
      view_size_ = 2 * page_size_;
    } else {
      // we can multiply them together
      assert(false);
    }
  }

  // Closes the file if Close() has not been called yet.
  ~WinMmapFile() {
    if (hFile_) {
      this->Close();
    }
  }

  // Copies `data` into the mapped view, moving on to a new view each time the
  // current one is full. Returns the first mapping error encountered; bytes
  // copied before the error remain in the file.
  virtual Status Append(const Slice& data) override {
    const char* src = data.data();
    size_t left = data.size();

    while (left > 0) {
      assert(mapped_begin_ <= dst_);
      size_t avail = mapped_end_ - dst_;

      if (avail == 0) {
        Status s = UnmapCurrentRegion();
        if (s.ok()) {
          s = MapNewRegion();
        }

        if (!s.ok()) {
          return s;
        }

        // A fresh view is mapped; refresh the space available so that this
        // iteration copies data instead of doing a zero-length pass.
        avail = mapped_end_ - dst_;
      }

      size_t n = std::min(left, avail);
      memcpy(dst_, src, n);
      IOSTATS_ADD(bytes_written, n);
      dst_ += n;
      src += n;
      left -= n;
      pending_sync_ = true;
    }

    return Status::OK();
  }

  // Unmaps the view, closes the mapping object, truncates the file to the
  // number of bytes appended and closes the file handle. Every step is
  // attempted even if an earlier one failed; the first failure is returned.
  virtual Status Close() override {
    Status s;

    assert(NULL != hFile_);

    // We truncate to the precise size so no
    // uninitialized data at the end. SetEndOfFile
    // which we use does not write zeros and it is good.
    // The size must be captured before unmapping, which resets dst_.
    uint64_t targetSize = GetFileSize();

    s = UnmapCurrentRegion();

    if (NULL != hMap_) {
      BOOL ret = ::CloseHandle(hMap_);
      if (!ret && s.ok()) {
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError(
            "Failed to Close mapping for file: " + filename_, lastError);
      }

      hMap_ = NULL;
    }

    // A failed truncate leaves preallocated space at the end of the file, so
    // it must be reported rather than dropped.
    Status truncateStatus = TruncateFile(targetSize);
    if (!truncateStatus.ok() && s.ok()) {
      s = truncateStatus;
    }

    BOOL ret = ::CloseHandle(hFile_);
    hFile_ = NULL;

    if (!ret && s.ok()) {
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError(
          "Failed to close file map handle: " + filename_, lastError);
    }

    return s;
  }

  // Nothing to do: appended data is written straight into the mapped view.
  virtual Status Flush() override { return Status::OK(); }

  // Flush only data
  // Flushes the pages of the current view that were written since the last
  // sync. Does nothing if there are no pending writes.
  virtual Status Sync() override {
    Status s;

    // Some writes occurred since last sync
    if (pending_sync_) {
      assert(mapped_begin_);
      assert(dst_);
      assert(dst_ > mapped_begin_);
      // dst_ == mapped_end_ is legitimate: Append() may fill the view exactly
      // and only remaps on the next call.
      assert(dst_ <= mapped_end_);

      size_t page_begin =
          TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
      size_t page_end =
          TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
      last_sync_ = dst_;

      // Flush only the amount of that is a multiple of pages
      if (!::FlushViewOfFile(mapped_begin_ + page_begin,
                             (page_end - page_begin) + page_size_)) {
        s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
                                    GetLastError());
      }

      pending_sync_ = false;
    }

    return s;
  }

  /**
  * Flush data as well as metadata to stable storage.
  */
  virtual Status Fsync() override {
    Status s;

    // Remember whether anything was pending: Sync() clears the flag
    const bool pending = pending_sync_;

    s = Sync();

    // Flush metadata
    if (s.ok() && pending) {
      if (!::FlushFileBuffers(hFile_)) {
        s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
                                    GetLastError());
      }
    }

    return s;
  }

  /**
  * Get the size of valid data in the file. This will not match the
  * size that is returned from the filesystem because we use mmap
  * to extend file by map_size every time.
  */
  virtual uint64_t GetFileSize() override {
    size_t used = dst_ - mapped_begin_;
    return file_offset_ + used;
  }

  // No-op: always returns OK.
  virtual Status InvalidateCache(size_t offset, size_t length) override {
    return Status::OK();
  }

  // No-op: always returns OK. Space is reserved by MapNewRegion() instead.
  virtual Status Allocate(off_t offset, off_t len) override {
    return Status::OK();
  }
};

// A heap buffer whose usable region begins at an address that is a multiple
// of a power-of-two alignment. It exists for unbuffered I/O, although it can
// serve as a general-purpose buffer too.
// The class tracks a capacity and a current size; none of the accessors
// perform bounds checks beyond the asserts shown.
class AlignedBuffer {
  const size_t alignment_;
  std::unique_ptr<char[]> buf_;  // Raw allocation; may start unaligned
  size_t capacity_;              // Usable bytes starting at bufstart_
  size_t cursize_;               // Bytes currently considered filled
  char* bufstart_;               // First aligned byte inside buf_

 public:
  // `alignment` must be a non-zero power of two. No memory is allocated until
  // AllocateNewBuffer() is called.
  explicit AlignedBuffer(size_t alignment)
      : alignment_(alignment), capacity_(0), cursize_(0), bufstart_(nullptr) {
    assert(alignment > 0);
    assert((alignment & (alignment - 1)) == 0);
  }

  size_t GetAlignment() const { return alignment_; }

  size_t GetCapacity() const { return capacity_; }

  size_t GetCurrentSize() const { return cursize_; }

  const char* GetBufferStart() const { return bufstart_; }

  // Marks the buffer as empty; the allocation is kept.
  void Clear() { cursize_ = 0; }

  // Replaces the current allocation with a new, empty one. The capacity is
  // requestedCapacity rounded up to a multiple of the alignment; previous
  // contents are discarded.
  void AllocateNewBuffer(size_t requestedCapacity) {
    const size_t rounded = Roundup(requestedCapacity, alignment_);

    // Over-allocate by one alignment unit so an aligned start always fits.
    buf_.reset(new char[rounded + alignment_]);

    const uintptr_t mask = static_cast<uintptr_t>(alignment_ - 1);
    const uintptr_t raw = reinterpret_cast<uintptr_t>(buf_.get());
    bufstart_ = reinterpret_cast<char*>((raw + mask) & ~mask);

    capacity_ = rounded;
    cursize_ = 0;
  }

  // Used for write
  // Copies as much of [src, src + append_size) as fits into the free space
  // and returns the number of bytes appended
  size_t Append(const char* src, size_t append_size) {
    const size_t room = capacity_ - cursize_;
    const size_t count = (append_size < room) ? append_size : room;

    if (count == 0) {
      return 0;
    }

    memcpy(bufstart_ + cursize_, src, count);
    cursize_ += count;
    return count;
  }

  // Copies up to read_size bytes, starting `offset` bytes into the filled
  // part of the buffer, to dest. Returns the number of bytes copied.
  // `offset` must be smaller than the current size.
  size_t Read(char* dest, size_t offset, size_t read_size) const {
    assert(offset < cursize_);

    const size_t available = cursize_ - offset;
    const size_t count = (read_size < available) ? read_size : available;

    if (count > 0) {
      memcpy(dest, bufstart_ + offset, count);
    }
    return count;
  }

  /// Pad to alignment
  /// Fills the buffer with the byte `padding` from the current size up to the
  /// next multiple of the alignment and grows the current size accordingly.
  void PadToAlignmentWith(int padding) {
    const size_t padded_size = Roundup(cursize_, alignment_);
    if (padded_size == cursize_) {
      return;
    }

    const size_t gap = padded_size - cursize_;
    assert((gap + cursize_) <= capacity_);
    memset(bufstart_ + cursize_, padding, gap);
    cursize_ += gap;
  }

  // After a partial flush move the tail to the beginning of the buffer.
  // The current size becomes tail_size.
  void RefitTail(size_t tail_offset, size_t tail_size) {
    if (tail_size != 0) {
      // The ranges may overlap, hence memmove rather than memcpy.
      memmove(bufstart_, bufstart_ + tail_offset, tail_size);
    }
    cursize_ = tail_size;
  }

  // Returns place to start writing
  char* GetDestination() { return bufstart_ + cursize_; }

  // Overrides the current size, e.g. after data was written directly to
  // GetDestination().
  void SetSize(size_t cursize) { cursize_ = cursize; }
};

// Sequential reader built on a C stdio FILE*.
// The object takes ownership of the stream and closes it on destruction.
class WinSequentialFile : public SequentialFile {
 private:
  const std::string filename_;
  FILE* file_;
  int fd_;
  bool use_os_buffer_;

 public:
  // `f` must be a valid open stream; its descriptor and the use_os_buffer
  // option are recorded alongside it.
  WinSequentialFile(const std::string& fname, FILE* f,
                    const EnvOptions& options)
      : filename_(fname),
        file_(f),
        fd_(fileno(f)),
        use_os_buffer_(options.use_os_buffer) {}

  virtual ~WinSequentialFile() {
    assert(file_ != nullptr);
    fclose(file_);
  }

  // Reads up to n bytes into scratch and points *result at what was read.
  // A short read caused by end-of-file is not an error (status stays OK and
  // the EOF indicator is cleared so that later reads can pick up data
  // appended in the meantime); any other short read yields an IOError.
  virtual Status Read(size_t n, Slice* result, char* scratch) override {
    Status s;
    size_t r = 0;

    // Retry only when the read was interrupted before transferring anything.
    // The stream error indicator is sticky, so looping on ferror() alone
    // would never terminate once a persistent I/O error has occurred.
    for (;;) {
      r = fread(scratch, 1, n, file_);
      if (r != 0 || !ferror(file_) || errno != EINTR) {
        break;
      }
      clearerr(file_);
    }

    IOSTATS_ADD(bytes_read, r);

    *result = Slice(scratch, r);

    if (r < n) {
      if (feof(file_)) {
        // We leave status as ok if we hit the end of the file
        // We also clear the error so that the reads can continue
        // if a new data is written to the file
        clearerr(file_);
      } else {
        // A partial read with an error: return a non-ok status
        s = Status::IOError(filename_, strerror(errno));
      }
    }

    return s;
  }

  // Advances the read position by n bytes relative to the current position.
  // Returns an IOError built from errno if seeking fails.
  virtual Status Skip(uint64_t n) override {
    // fseek() takes a long, which is only 32 bits wide on Windows. Seek in
    // steps that fit instead of letting a large n be silently truncated.
    // The loop body runs at least once so that n == 0 still calls fseek(),
    // as before.
    const uint64_t kMaxStep = static_cast<uint64_t>(LONG_MAX);
    do {
      const long step = static_cast<long>(std::min(n, kMaxStep));
      if (fseek(file_, step, SEEK_CUR)) {
        return IOError(filename_, errno);
      }
      n -= static_cast<uint64_t>(step);
    } while (n > 0);

    return Status::OK();
  }

  // No-op: always returns OK.
  virtual Status InvalidateCache(size_t offset, size_t length) override {
    return Status::OK();
  }
};

// pread() based random-access
//
// The object owns hFile and closes it on destruction. When OS buffering is
// disabled (options.use_os_buffer == false) reads go through an internal
// aligned buffer that caches the most recently read aligned range; access to
// that buffer is serialized with a mutex. With OS buffering enabled, reads go
// straight into the caller's scratch memory.
class WinRandomAccessFile : public RandomAccessFile {
  const std::string filename_;
  HANDLE hFile_;
  const bool use_os_buffer_;
  mutable std::mutex buffer_mut_;  // Guards buffer_ and buffered_start_
  mutable AlignedBuffer buffer_;
  mutable uint64_t
      buffered_start_;  // file offset set that is currently buffered

 public:
  // `alignment` is the alignment used for the internal buffer and for the
  // offsets/sizes of unbuffered reads. Memory-mapped reads must not be
  // requested for this class.
  WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
                      const EnvOptions& options)
      : filename_(fname),
        hFile_(hFile),
        use_os_buffer_(options.use_os_buffer),
        buffer_(alignment),
        buffered_start_(0) {
    assert(!options.use_mmap_reads);

    // Unbuffered access, use internal buffer for reads
    if (!use_os_buffer_) {
      // Random read, no need in a big buffer
      // We read things in database blocks which are likely to be similar to
      // the alignment we use.
      buffer_.AllocateNewBuffer(alignment * 2);
    }
  }

  virtual ~WinRandomAccessFile() {
    if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
      ::CloseHandle(hFile_);
    }
  }

  // Reads up to n bytes starting at `offset` into scratch and points *result
  // at the bytes obtained (possibly fewer than n). If the last pread() call
  // fails, *result is empty and an IOError built from GetLastError() is
  // returned.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const override {
    Status s;
    SSIZE_T r = -1;    // Result of the most recent read step; < 0 means error
    size_t left = n;   // Bytes still to be delivered to the caller
    char* dest = scratch;

    // When in unbuffered mode we need to do the following changes:
    // - use our own aligned buffer
    // - always read at the offset of that is a multiple of alignment
    if (!use_os_buffer_) {
      std::lock_guard<std::mutex> lg(buffer_mut_);

      // Let's see if at least some of the requested data is already
      // in the buffer
      if (offset >= buffered_start_ &&
          offset < (buffered_start_ + buffer_.GetCurrentSize())) {
        size_t buffer_offset = offset - buffered_start_;
        r = buffer_.Read(dest, buffer_offset, left);
        assert(r >= 0);

        // Advance past the part that was served from the buffer
        left -= size_t(r);
        offset += r;
        dest += r;
      }

      // Still some left or none was buffered
      if (left > 0) {
        // Figure out the start/end offset for reading and amount to read
        const size_t alignment = buffer_.GetAlignment();
        const size_t start_page_start =
            TruncateToPageBoundary(alignment, offset);
        const size_t end_page_start =
            TruncateToPageBoundary(alignment, offset + left - 1);
        // Whole aligned units covering [offset, offset + left)
        const size_t actual_bytes_toread =
            (end_page_start - start_page_start) + alignment;

        // Grow the buffer if the aligned range does not fit; either way the
        // previously buffered contents are dropped
        if (buffer_.GetCapacity() < actual_bytes_toread) {
          buffer_.AllocateNewBuffer(actual_bytes_toread);
        } else {
          buffer_.Clear();
        }

        SSIZE_T read = 0;
        {
          IOSTATS_TIMER_GUARD(read_nanos);
          read = pread(hFile_, buffer_.GetDestination(), actual_bytes_toread,
                       start_page_start);
        }

        if (read > 0) {
          // Record what the buffer now holds
          buffer_.SetSize(read);
          buffered_start_ = start_page_start;

          // Let's figure out how much we read from the users standpoint
          if ((buffered_start_ + uint64_t(read)) > offset) {
            size_t buffer_offset = offset - buffered_start_;
            r = buffer_.Read(dest, buffer_offset, left);
          } else {
            // Everything that was read lies before the requested offset
            r = 0;
          }
          left -= r;
        } else {
          // Nothing read or an error: pass the pread() result through
          r = read;
        }
      }

    } else {
      // OS-buffered mode: read directly into the caller's memory
      r = pread(hFile_, scratch, left, offset);
      if (r > 0) {
        left -= r;
      }
    }

    IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
    *result = Slice(scratch, (r < 0) ? 0 : n - left);

    if (r < 0) {
      s = IOErrorFromLastWindowsError(filename_);
    }
    return s;
  }

  // Access-pattern hints are ignored.
  virtual void Hint(AccessPattern pattern) override {}

  // No-op: always returns OK.
  virtual Status InvalidateCache(size_t offset, size_t length) override {
    return Status::OK();
  }
};

// This is a sequential write class. It has been mimicked (as others) after
// the original Posix class. We add support for unbuffered I/O on windows as
// well
// we utilize the original buffer as an alignment buffer to write directly to
// file with no buffering.
// No buffering requires that the provided buffer is aligned to the physical
// sector size (SSD page size) and
// that all SetFilePointer() operations to occur with such an alignment.
// We thus always write in sector/page size increments to the drive and leave
// the tail for the next write OR for Close() at which point we pad with zeros.
// No padding is required for
// buffered access.
class WinWritableFile : public WritableFile {
 private:
  const std::string filename_;
  HANDLE hFile_;
  AlignedBuffer buffer_;

  uint64_t filesize_;      // How much data is actually written disk
  uint64_t reservedsize_;  // how far we have reserved space

  bool pending_sync_;

  RateLimiter* rate_limiter_;

  const bool use_os_buffer_;  // Used to indicate unbuffered access, the file
                              // must be opened as unbuffered if false

 public:
  // Wraps the already opened handle hFile. `alignment` is the alignment of
  // the internal buffer and `capacity` the size requested for its initial
  // allocation; the rate limiter and the use_os_buffer flag are taken from
  // `options`. Memory-mapped writes must not be requested for this class.
  WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
                  size_t capacity, const EnvOptions& options)
      : filename_(fname),
        hFile_(hFile),
        buffer_(alignment),
        filesize_(0),
        reservedsize_(0),
        pending_sync_(false),
        rate_limiter_(options.rate_limiter),
        use_os_buffer_(options.use_os_buffer) {
    assert(!options.use_mmap_writes);

    buffer_.AllocateNewBuffer(capacity);
  }

  // Closes the file if it is still open. Close() resets hFile_ to
  // INVALID_HANDLE_VALUE, so an explicitly closed file is not closed twice.
  // The status returned by Close() is dropped here.
  ~WinWritableFile() {
    if (NULL != hFile_ && INVALID_HANDLE_VALUE != hFile_) {
      WinWritableFile::Close();
    }
  }

  // Appends `data` to the file, going through the internal buffer whenever
  // possible.
  // - OS-buffered mode: if the data does not fit into the free part of the
  //   buffer, the buffer is flushed and, while below 1MB, reallocated at
  //   twice its capacity. Data that still exceeds the capacity is written
  //   directly to the file.
  // - Unbuffered mode: data always goes through the internal buffer, which is
  //   flushed (and possibly doubled) every time it fills up.
  // Returns the first error reported by a flush or write.
  virtual Status Append(const Slice& data) override {
    const char* src = data.data();

    assert(data.size() < INT_MAX);

    size_t left = data.size();
    Status s;
    pending_sync_ = true;

    // This would call Alloc() if we are out of blocks
    PrepareWrite(GetFileSize(), left);

    // Flush only when I/O is buffered
    if (use_os_buffer_ &&
        (buffer_.GetCapacity() - buffer_.GetCurrentSize()) < left) {
      if (buffer_.GetCurrentSize() > 0) {
        s = Flush();
        if (!s.ok()) {
          return s;
        }
      }

      // The buffer is empty at this point, so reallocating loses nothing
      if (buffer_.GetCapacity() < c_OneMB) {
        size_t desiredCapacity = buffer_.GetCapacity() * 2;
        desiredCapacity = std::min(desiredCapacity, c_OneMB);
        buffer_.AllocateNewBuffer(desiredCapacity);
      }
    }

    // We always use the internal buffer for the unbuffered I/O
    // or we simply use it for its original purpose to accumulate many small
    // chunks
    if (!use_os_buffer_ || (buffer_.GetCapacity() >= left)) {
      while (left > 0) {
        size_t appended = buffer_.Append(src, left);
        left -= appended;
        src += appended;

        // The buffer is full but there is more data: make room
        if (left > 0) {
          s = Flush();
          if (!s.ok()) {
            break;
          }

          size_t cursize = buffer_.GetCurrentSize();
          size_t capacity = buffer_.GetCapacity();

          // We double the buffer here because
          // Flush calls do not keep up with the incoming bytes
          // This is the only place when buffer is changed with unbuffered I/O
          // (only done when the flush left the buffer empty, since
          // reallocating discards the contents)
          if (cursize == 0 && capacity < c_OneMB) {
            size_t desiredCapacity = capacity * 2;
            desiredCapacity = std::min(desiredCapacity, c_OneMB);
            buffer_.AllocateNewBuffer(desiredCapacity);
          }
        }
      }
    } else {
      // Writing directly to file bypassing what is in the buffer
      assert(buffer_.GetCurrentSize() == 0);
      // Use rate limiter for normal I/O very large request if available
      s = WriteBuffered(src, left);
    }

    return s;
  }

  // Writes out any data still held in the internal buffer, sets the end of
  // the file to the number of bytes appended, flushes the file buffers if
  // anything was written here, and closes the handle.
  // The handle is closed and hFile_ reset to INVALID_HANDLE_VALUE even when
  // an earlier step failed; the first failure is the one returned.
  virtual Status Close() override {
    Status s;

    // If there is any data in the cache not written we need to deal with it
    const size_t cursize = buffer_.GetCurrentSize();
    // Computed up front, before the writes below change filesize_
    const uint64_t final_size = filesize_ + cursize;

    if (cursize > 0) {
      // If OS buffering is on, we just flush the remainder; otherwise the
      // remainder goes through the unbuffered write path
      if (!use_os_buffer_) {
        s = WriteUnbuffered();
      } else {
        s = WriteBuffered(buffer_.GetBufferStart(), cursize);
      }
    }

    if (s.ok()) {
      s = ftruncate(filename_, hFile_, final_size);
    }

    // Sync data if buffer was flushed
    if (s.ok() && (cursize > 0) && fsync(hFile_) < 0) {
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_,
                                  lastError);
    }

    if (FALSE == ::CloseHandle(hFile_)) {
      if (s.ok()) {
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_,
                                    lastError);
      }
    }

    hFile_ = INVALID_HANDLE_VALUE;
    return s;
  }

  // Hands the contents of the internal buffer to the file (the OS cache in
  // buffered mode). Does nothing when the buffer is empty. In OS-buffered
  // mode the buffer is emptied only if the write succeeded; in unbuffered
  // mode the buffer bookkeeping is left to WriteUnbuffered().
  virtual Status Flush() override {
    if (buffer_.GetCurrentSize() == 0) {
      return Status();
    }

    if (!use_os_buffer_) {
      return WriteUnbuffered();
    }

    Status written =
        WriteBuffered(buffer_.GetBufferStart(), buffer_.GetCurrentSize());
    if (written.ok()) {
      buffer_.SetSize(0);
    }
    return written;
  }

  // Flushes the internal buffer and then, if writes are pending, asks the OS
  // to flush the file buffers. pending_sync_ is cleared unless that flush
  // fails.
  virtual Status Sync() override {
    Status result = Flush();
    if (result.ok()) {
      // Calls flush buffers (only when something is pending)
      const bool sync_failed = pending_sync_ && (fsync(hFile_) < 0);
      if (sync_failed) {
        auto errorCode = GetLastError();
        result = IOErrorFromWindowsError(
            "fsync failed at Sync() for: " + filename_, errorCode);
      } else {
        pending_sync_ = false;
      }
    }
    return result;
  }

  // Fsync() is implemented as a plain forward to Sync().
  virtual Status Fsync() override {
    return Sync();
  }

  virtual uint64_t GetFileSize() override {
    return filesize_ + buffer_.GetCurrentSize();
  }

  // Reserves space for the range ending at offset + len, rounded up to the
  // buffer alignment. Does nothing when at least that much is already
  // recorded in reservedsize_; otherwise calls fallocate() and remembers the
  // new reservation on success.
  virtual Status Allocate(off_t offset, off_t len) override {
    TEST_KILL_RANDOM(rocksdb_kill_odds);

    // The reservation block size is chosen by the caller, so align it here
    // before comparing against what is already reserved
    const size_t wanted = Roundup(offset + len, buffer_.GetAlignment());
    if (wanted <= reservedsize_) {
      // Nothing to do
      return Status();
    }

    IOSTATS_TIMER_GUARD(allocate_nanos);
    Status status = fallocate(filename_, hFile_, wanted);
    if (status.ok()) {
      reservedsize_ = wanted;
    }
    return status;
  }

 private:
  // Writes `size` bytes starting at `data` through the OS-buffered handle,
  // one RequestToken()-sized chunk at a time so the rate limiter (if any) is
  // honoured. Stops at the first WriteFile() failure. Bytes that did reach
  // the file are added to the I/O stats and to filesize_ either way.
  Status WriteBuffered(const char* data, size_t size) {
    assert(use_os_buffer_);

    Status s;
    size_t total_written = 0;

    for (size_t remaining = size; remaining > 0;) {
      const size_t chunk = RequestToken(remaining, false);

      DWORD written = 0;
      if (!WriteFile(hFile_, data + total_written, chunk, &written, NULL)) {
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError(
            "Failed to write buffered via rate_limiter: " + filename_,
            lastError);
        break;
      }

      // WriteFile may write less than requested; advance by what it reports
      total_written += written;
      remaining -= written;
    }

    IOSTATS_ADD(bytes_written, total_written);
    filesize_ += total_written;

    return s;
  }

  // Flushes the accumulated buffer through the unbuffered handle.
  // The buffer is zero-padded up to a whole number of alignment units and
  // written at offset filesize_, RequestToken()-sized chunks at a time.
  // On success the bytes beyond the last whole unit (the "tail") are moved to
  // the front of the buffer so they are written again by the next flush, and
  // filesize_ advances only by the whole units. On a pwrite() failure the
  // buffer size is restored to its pre-padding value, filesize_ is left
  // untouched and the error is returned.
  Status WriteUnbuffered() {
    Status s;

    assert(!use_os_buffer_);
    size_t alignment = buffer_.GetAlignment();
    assert((filesize_ % alignment) == 0);

    // Calculate whole page final file advance if all writes succeed
    size_t file_advance =
        TruncateToPageBoundary(alignment, buffer_.GetCurrentSize());

    // Calculate the leftover tail, we write it here padded with zeros BUT we
    // will write it again in the future either on Close() OR when the current
    // whole page fills out
    size_t leftover_tail = buffer_.GetCurrentSize() - file_advance;

    // Round up and pad
    buffer_.PadToAlignmentWith(0);

    size_t left = buffer_.GetCurrentSize();
    uint64_t file_offset = filesize_;
    size_t actually_written = 0;

    while (left > 0) {
      // Request how much is allowed. If this is less than one alignment we may
      // be blocking a lot on every write because we can not write less than
      // one alignment (page) unit thus check the configuration.
      size_t bytes_allowed = RequestToken(left, true);
      SSIZE_T ret = pwrite(hFile_, buffer_.GetBufferStart() + actually_written,
                           bytes_allowed, file_offset);

      // Error break
      if (ret < 0) {
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError(
            "Failed to pwrite for unbuffered: " + filename_, lastError);
        // Undo the padding so the buffer holds exactly the caller's data
        buffer_.SetSize(file_advance + leftover_tail);
        break;
      }
      actually_written += ret;
      file_offset += ret;
      left -= ret;
    }

    IOSTATS_ADD(bytes_written, actually_written);

    if (s.ok()) {
      // Move the tail to the beginning of the buffer
      // This never happens during normal Append but rather during
      // explicit call to Flush()/Sync() or Close()
      buffer_.RefitTail(file_advance, leftover_tail);
      // This is where we start writing next time which may or not be
      // the actual file size on disk. They match if the buffer size
      // is a multiple of whole pages otherwise filesize_ is leftover_tail
      // behind
      filesize_ += file_advance;
    }
    return s;
  }

  // Decides how many of `bytes` may be written right now.
  // Without a rate limiter (or with io_priority_ at/above Env::IO_TOTAL) the
  // request is returned unchanged. Otherwise it is capped at one burst, and
  // when `align` is set it is truncated to whole alignment units but never
  // below one unit. The resulting amount is requested from the limiter and
  // returned.
  size_t RequestToken(size_t bytes, bool align) const {
    const bool limited = rate_limiter_ && io_priority_ < Env::IO_TOTAL;
    if (!limited) {
      return bytes;
    }

    const size_t burst =
        static_cast<size_t>(rate_limiter_->GetSingleBurstBytes());
    bytes = std::min(bytes, burst);

    if (align) {
      // Unbuffered writes cannot be smaller than one page, so this may ask
      // for more than a single burst and block in the limiter
      const size_t unit = buffer_.GetAlignment();
      bytes = std::max(unit, TruncateToPageBoundary(unit, bytes));
    }

    rate_limiter_->Request(bytes, io_priority_);
    return bytes;
  }
};

// Directory implementation whose Fsync() performs no work and always
// reports success.
class WinDirectory : public Directory {
 public:
  WinDirectory() = default;

  virtual Status Fsync() override {
    return Status::OK();
  }
};

// FileLock that owns a file handle for its whole lifetime; the handle is
// closed when the lock object is destroyed.
class WinFileLock : public FileLock {
 public:
  // `hFile` must be a valid handle (neither NULL nor INVALID_HANDLE_VALUE).
  explicit WinFileLock(HANDLE hFile) : hFile_(hFile) {
    assert(hFile != NULL);
    assert(hFile != INVALID_HANDLE_VALUE);
  }

  ~WinFileLock() {
    BOOL ret = ::CloseHandle(hFile_);
    assert(ret);
    // `ret` is only consumed by the assert; reference it explicitly so
    // builds with NDEBUG do not warn about an unused variable
    (void)ret;
  }

 private:
  HANDLE hFile_;
};

namespace {

// Checks the outcome of a threading call. A zero error code is a no-op; any
// other value prints "pthread <label>: <message>" to stderr and aborts.
// The text comes from the error code's own category (result.message()), so
// codes that are not plain errno values are still described correctly.
void WinthreadCall(const char* label, std::error_code result) {
  if (result) {
    fprintf(stderr, "pthread %s: %s\n", label, result.message().c_str());
    abort();
  }
}
}  // namespace

class WinEnv : public Env {
 public:
  WinEnv();

  // Joins every thread recorded in threads_to_join_, then every pool's
  // workers, and only afterwards deletes thread_status_updater_.
  virtual ~WinEnv() {
    for (auto it = threads_to_join_.begin(); it != threads_to_join_.end();
         ++it) {
      it->join();
    }
    threads_to_join_.clear();

    for (auto& pool : thread_pools_) {
      pool.JoinAllThreads();
    }

    // All threads must be joined before the deletion of
    // thread_status_updater_.
    delete thread_status_updater_;
  }

  // Removes the file `fname` with _unlink(). Returns OK on success, otherwise
  // an error built from errno.
  virtual Status DeleteFile(const std::string& fname) override {
    if (_unlink(fname.c_str()) != 0) {
      // Capture errno before building the message: the string concatenation
      // may allocate and is not guaranteed to leave errno untouched
      const auto code = errno;
      return IOError("Failed to delete: " + fname, code);
    }

    return Status::OK();
  }

  // Stores the value of std::time() into *unix_time.
  // Returns NotSupported (leaving *unix_time untouched) when std::time()
  // reports failure.
  Status GetCurrentTime(int64_t* unix_time) override {
    const time_t now = std::time(nullptr);
    if (now != (time_t)(-1)) {
      *unix_time = now;
      return Status::OK();
    }

    return Status::NotSupported("Failed to get time");
  }

  // Opens the existing file `fname` for reading and wraps it in a
  // WinSequentialFile stored in *result.
  // *result is reset first and stays empty on failure. The Win32 handle is
  // converted to a CRT descriptor and then to a FILE*; whichever resource is
  // current at the point of failure is released before returning the error.
  virtual Status NewSequentialFile(const std::string& fname,
                                   std::unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options) override {
    result->reset();

    // Corruption test needs to rename and delete files of these kind
    // while they are still open with another handle. For that reason we
    // allow share_write and delete(allows rename).
    HANDLE hFile = INVALID_HANDLE_VALUE;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      hFile = CreateFileA(
          fname.c_str(), GENERIC_READ,
          FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
          OPEN_EXISTING,  // Original fopen mode is "rb"
          FILE_ATTRIBUTE_NORMAL, NULL);
    }

    if (INVALID_HANDLE_VALUE == hFile) {
      auto lastError = GetLastError();
      // Separator added so the file name does not run into the message text
      return IOErrorFromWindowsError(
          "Failed to open NewSequentialFile: " + fname, lastError);
    }

    int fd = _open_osfhandle(reinterpret_cast<intptr_t>(hFile), 0);
    if (fd == -1) {
      auto code = errno;
      // The handle is still ours at this point
      CloseHandle(hFile);
      return IOError("Failed to _open_osfhandle for NewSequentialFile: " + fname,
                     code);
    }

    FILE* file = _fdopen(fd, "rb");
    if (file == nullptr) {
      auto code = errno;
      // Release through the descriptor that now wraps the handle
      _close(fd);
      return IOError("Failed to fdopen NewSequentialFile: " + fname, code);
    }

    result->reset(new WinSequentialFile(fname, file, options));
    return Status::OK();
  }

  // Opens the existing file `fname` for random-access reads and stores the
  // reader in *result (*result is reset first and stays empty on failure).
  // With options.use_mmap_reads on a platform with 8-byte (or wider)
  // pointers the file is memory-mapped and a WinMmapReadableFile is
  // returned; empty files are rejected in that mode. Otherwise a
  // WinRandomAccessFile over the handle is returned.
  // The opened handles are held by guards until ownership passes to the
  // returned object, so every early return releases them.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     std::unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& options) override {
    result->reset();
    Status s;

    // Open the file for read-only random access
    // Random access is to disable read-ahead as the system reads too much data
    DWORD fileFlags = FILE_ATTRIBUTE_READONLY;

    // Bypass OS buffering only when neither OS buffering nor mmap was asked
    // for; otherwise hint random access
    if (!options.use_os_buffer && !options.use_mmap_reads) {
      fileFlags |= FILE_FLAG_NO_BUFFERING;
    } else {
      fileFlags |= FILE_FLAG_RANDOM_ACCESS;
    }

    // Shared access is necessary for corruption test to pass
    // almost all tests would work with a possible exception of fault_injection
    HANDLE hFile = 0;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      hFile =
          CreateFileA(fname.c_str(), GENERIC_READ,
                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
                      NULL, OPEN_EXISTING, fileFlags, NULL);
    }

    if (INVALID_HANDLE_VALUE == hFile) {
      auto lastError = GetLastError();
      return IOErrorFromWindowsError(
          "NewRandomAccessFile failed to Create/Open: " + fname, lastError);
    }

    // From here on the guard owns the handle until release() is called
    UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);

    // CAUTION! This will map the entire file into the process address space
    if (options.use_mmap_reads && sizeof(void*) >= 8) {
      // Use mmap when virtual address-space is plentiful.
      uint64_t fileSize;

      s = GetFileSize(fname, &fileSize);

      // If the size query failed, `s` is returned at the end and fileGuard
      // releases the handle
      if (s.ok()) {
        // Will not map empty files
        if (fileSize == 0) {
          return IOError(
              "NewRandomAccessFile failed to map empty file: " + fname, EINVAL);
        }

        HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_READONLY,
                                         0,  // Whole file at its present length
                                         0,
                                         NULL);  // Mapping name

        if (!hMap) {
          auto lastError = GetLastError();
          return IOErrorFromWindowsError(
              "Failed to create file mapping for NewRandomAccessFile: " + fname,
              lastError);
        }

        UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc);

        const void* mapped_region =
            MapViewOfFileEx(hMap, FILE_MAP_READ,
                            0,  // High DWORD of access start
                            0,  // Low DWORD
                            fileSize,
                            NULL);  // Let the OS choose the mapping

        if (!mapped_region) {
          auto lastError = GetLastError();
          return IOErrorFromWindowsError(
              "Failed to MapViewOfFile for NewRandomAccessFile: " + fname,
              lastError);
        }

        result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region,
                                              fileSize));

        // The reader received both handles; stop the guards from closing them
        mapGuard.release();
        fileGuard.release();
      }
    } else {
      result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options));
      // The reader received the handle; stop the guard from closing it
      fileGuard.release();
    }
    return s;
  }

  // Creates (or truncates) `fname` and stores a writer for it in *result.
  // *result is reset first and stays empty if the file cannot be created.
  // With use_mmap_writes a WinMmapFile is returned, otherwise a
  // WinWritableFile with a 64KB buffer. OS buffering is bypassed only when
  // both use_os_buffer and use_mmap_writes are off.
  virtual Status NewWritableFile(const std::string& fname,
                                 std::unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) override {
    const size_t c_BufferCapacity = 64 * 1024;

    result->reset();

    EnvOptions local_options(options);
    const bool mmap_writes = local_options.use_mmap_writes;

    const DWORD fileFlags = (!local_options.use_os_buffer && !mmap_writes)
                                ? FILE_FLAG_NO_BUFFERING
                                : FILE_ATTRIBUTE_NORMAL;

    // Desired access. We want to write only here, but a file that is going
    // to be memory mapped has no write-only mode, so it has to be created
    // Read/Write. However, MapViewOfFile specifies only Write only
    DWORD desired_access = GENERIC_WRITE;
    DWORD shared_mode = FILE_SHARE_READ;

    if (mmap_writes) {
      desired_access |= GENERIC_READ;
    } else {
      // Adding this solely for tests to pass (fault_injection_test,
      // wal_manager_test).
      shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE);
    }

    HANDLE hFile = 0;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      hFile = CreateFileA(
          fname.c_str(),
          desired_access,  // Access desired
          shared_mode,
          NULL,           // Security attributes
          CREATE_ALWAYS,  // Posix env says O_CREAT | O_RDWR | O_TRUNC
          fileFlags,      // Flags
          NULL);          // Template File
    }

    if (INVALID_HANDLE_VALUE == hFile) {
      auto lastError = GetLastError();
      return IOErrorFromWindowsError(
          "Failed to create a NewWriteableFile: " + fname, lastError);
    }

    if (mmap_writes) {
      // We usually do not use mmmapping on SSD and thus we pass memory
      // page_size
      result->reset(new WinMmapFile(fname, hFile, page_size_,
                                    allocation_granularity_, local_options));
    } else {
      // Here we want the buffer allocation to be aligned by the SSD page size
      // and to be a multiple of it
      result->reset(new WinWritableFile(fname, hFile, page_size_,
                                        c_BufferCapacity, local_options));
    }
    return Status();
  }

  // Stores a WinDirectory in *result when `name` is an existing directory.
  // *result is reset first and stays empty when the directory is missing, in
  // which case an error built from ENOENT is returned.
  virtual Status NewDirectory(const std::string& name,
                              std::unique_ptr<Directory>* result) override {
    Status s;
    // Must be nullptr on failure
    result->reset();
    // Must fail if directory does not exist
    if (!DirExists(name)) {
      // ENOENT ("no such file or directory") is the errno that matches this
      // condition; EEXIST would describe the opposite situation
      s = IOError("Directory does not exist: " + name, ENOENT);
    } else {
      IOSTATS_TIMER_GUARD(open_nanos);
      result->reset(new WinDirectory);
    }
    return s;
  }

  // Returns OK when _access() can see `fname`, NotFound otherwise.
  virtual Status FileExists(const std::string& fname) override {
    // Mode 0 is the existence-only check (F_OK)
    const int F_OK_ = 0;
    if (_access(fname.c_str(), F_OK_) == 0) {
      return Status::OK();
    }
    return Status::NotFound();
  }

  // Replaces *result with the entry names that readdir() yields for `dir`.
  // When the directory cannot be opened an error built from errno is
  // returned and *result ends up empty.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) override {
    std::vector<std::string> names;
    Status status;

    auto dir_closer = [](DIR* d) { closedir(d); };
    std::unique_ptr<DIR, decltype(dir_closer)> handle(opendir(dir.c_str()),
                                                      dir_closer);

    if (handle) {
      // Reuse the caller's capacity as a sizing hint
      if (result->capacity() > 0) {
        names.reserve(result->capacity());
      }

      for (struct dirent* entry = readdir(handle.get()); entry;
           entry = readdir(handle.get())) {
        names.push_back(entry->d_name);
      }
    } else {
      status = IOError(dir, errno);
    }

    // Hand over the collected names (or an empty list on failure)
    names.swap(*result);

    return status;
  }

  // Creates the directory `name` with _mkdir(); any failure is reported as an
  // error built from errno.
  virtual Status CreateDir(const std::string& name) override {
    if (_mkdir(name.c_str()) == 0) {
      return Status();
    }

    auto code = errno;
    return IOError("Failed to create dir: " + name, code);
  }

  // Succeeds when `name` already is a directory or can be created.
  // If _mkdir() fails with EEXIST the name is taken by something that is not
  // a directory, which gets its own message; other failures are reported
  // from errno.
  virtual Status CreateDirIfMissing(const std::string& name) override {
    if (DirExists(name)) {
      return Status();
    }

    if (_mkdir(name.c_str()) == 0) {
      return Status();
    }

    const auto code = errno;
    if (code == EEXIST) {
      return Status::IOError("`" + name + "' exists but is not a directory");
    }
    return IOError("Failed to create dir: " + name, code);
  }

  // Removes the directory `name` with _rmdir(); any failure is reported as an
  // error built from errno.
  virtual Status DeleteDir(const std::string& name) override {
    if (_rmdir(name.c_str()) == 0) {
      return Status();
    }

    auto code = errno;
    return IOError("Failed to remove dir: " + name, code);
  }

  // Looks up the attributes of `fname` and stores its 64-bit size in *size.
  // On failure *size is left untouched and the Windows error is returned.
  virtual Status GetFileSize(const std::string& fname,
                             uint64_t* size) override {
    WIN32_FILE_ATTRIBUTE_DATA attrs;
    if (!GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) {
      auto lastError = GetLastError();
      return IOErrorFromWindowsError("Can not get size for: " + fname,
                                     lastError);
    }

    // Combine the two 32-bit halves into one 64-bit value
    ULARGE_INTEGER combined;
    combined.HighPart = attrs.nFileSizeHigh;
    combined.LowPart = attrs.nFileSizeLow;
    *size = combined.QuadPart;

    return Status();
  }

  static inline uint64_t FileTimeToUnixTime(const FILETIME& ftTime) {
    const uint64_t c_FileTimePerSecond = 10000000U;
    // UNIX epoch starts on 1970-01-01T00:00:00Z
    // Windows FILETIME starts on 1601-01-01T00:00:00Z
    // Therefore, we need to subtract the below number of seconds from
    // the seconds that we obtain from FILETIME with an obvious loss of
    // precision
    const uint64_t c_SecondBeforeUnixEpoch = 11644473600U;

    ULARGE_INTEGER li;
    li.HighPart = ftTime.dwHighDateTime;
    li.LowPart = ftTime.dwLowDateTime;

    uint64_t result =
        (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch;
    return result;
  }

  // Stores the last-write time of `fname`, converted by FileTimeToUnixTime(),
  // in *file_mtime. On failure *file_mtime is set to 0 and the Windows error
  // is returned.
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* file_mtime) override {
    WIN32_FILE_ATTRIBUTE_DATA attrs;
    if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) {
      *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime);
      return Status();
    }

    auto lastError = GetLastError();
    Status s = IOErrorFromWindowsError(
        "Can not get file modification time for: " + fname, lastError);
    *file_mtime = 0;
    return s;
  }

  // Renames `src` to `target`, replacing `target` if it already exists.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) override {
    // rename() is not capable of replacing the existing file as on Linux
    // so use OS API directly
    if (MoveFileExA(src.c_str(), target.c_str(), MOVEFILE_REPLACE_EXISTING)) {
      return Status();
    }

    // Grab the error code before doing any other work
    const DWORD lastError = GetLastError();

    std::string text("Failed to rename: ");
    text += src;
    text += " to: ";
    text += target;

    return IOErrorFromWindowsError(text, lastError);
  }

  // Creates `target` as a hard link to the existing file `src`.
  virtual Status LinkFile(const std::string& src,
                          const std::string& target) override {
    // CreateHardLinkA takes the new link name first, the existing file second
    if (CreateHardLinkA(target.c_str(), src.c_str(), NULL)) {
      return Status();
    }

    // Grab the error code before doing any other work
    const DWORD lastError = GetLastError();

    std::string text("Failed to link: ");
    text += src;
    text += " to: ";
    text += target;

    return IOErrorFromWindowsError(text, lastError);
  }

  // Acquires the lock file `lockFname` by creating it with a share mode of 0
  // (no sharing). On success *lock receives a new WinFileLock owning the
  // handle; on failure *lock is NULL and the Windows error is returned.
  virtual Status LockFile(const std::string& lockFname,
                          FileLock** lock) override {
    assert(lock != nullptr);

    *lock = NULL;

    // No-sharing, this is a LOCK file
    const DWORD ExclusiveAccessON = 0;

    // Obtain exclusive access to the LOCK file
    // Previously, instead of NORMAL attr we set DELETE on close and that worked
    // well except with fault_injection test that insists on deleting it.
    HANDLE hFile = 0;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      hFile = CreateFileA(lockFname.c_str(), (GENERIC_READ | GENERIC_WRITE),
                          ExclusiveAccessON, NULL, CREATE_ALWAYS,
                          FILE_ATTRIBUTE_NORMAL, NULL);
    }

    if (INVALID_HANDLE_VALUE != hFile) {
      *lock = new WinFileLock(hFile);
      return Status();
    }

    auto lastError = GetLastError();
    return IOErrorFromWindowsError("Failed to create lock file: " + lockFname,
                                   lastError);
  }

  // Releases a lock by deleting the lock object. Always returns OK.
  virtual Status UnlockFile(FileLock* lock) override {
    assert(lock != nullptr);

    delete lock;

    return Status();
  }

  // Thread scheduling and thread management overrides. Only the declarations
  // appear here; no bodies are provided at this point in the class.

  // Priority defaults to LOW and the tag to nullptr.
  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW,
                        void* tag = nullptr) override;

  virtual int UnSchedule(void* arg, Priority pri) override;

  virtual void StartThread(void (*function)(void* arg), void* arg) override;

  virtual void WaitForJoin() override;

  // Priority defaults to LOW.
  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;

  // Builds "<base>\testrocksdb-<pid>" and stores it in *result.
  // <base> is TEST_TMPDIR when set and non-empty, else TMP when set and
  // non-empty, else "c:\tmp". Both the base and the final directory are
  // passed to CreateDir(); its status is ignored and OK is always returned.
  virtual Status GetTestDirectory(std::string* result) override {
    const char* env = getenv("TEST_TMPDIR");
    if (!(env && env[0] != '\0')) {
      // Fall back to the general temp directory variable
      env = getenv("TMP");
    }

    std::string output = (env && env[0] != '\0') ? env : "c:\\tmp";
    CreateDir(output);

    output.append("\\testrocksdb-");
    output.append(std::to_string(_getpid()));

    CreateDir(output);

    output.swap(*result);

    return Status::OK();
  }

  // Forwards the request to thread_status_updater_, which must be non-null.
  virtual Status GetThreadList(
      std::vector<ThreadStatus>* thread_list) override {
    assert(thread_status_updater_);
    Status s = thread_status_updater_->GetThreadList(thread_list);
    return s;
  }

  // Returns GetCurrentThreadId() for the calling thread, widened to 64 bits.
  static uint64_t gettid() {
    return static_cast<uint64_t>(GetCurrentThreadId());
  }

  // Env interface wrapper around the static gettid().
  virtual uint64_t GetThreadID() const override {
    return gettid();
  }

  // Creates (or truncates) the log file `fname` and stores a WinLogger
  // writing to it in *result.
  // *result is reset first and stays empty on failure. The file's creation,
  // access and write times are set to the current time right after opening.
  // The Win32 handle is converted to a CRT descriptor and then to a FILE*;
  // whichever resource is current at the point of failure is released.
  virtual Status NewLogger(const std::string& fname,
                           std::shared_ptr<Logger>* result) override {
    Status s;

    result->reset();

    HANDLE hFile = 0;
    {
      IOSTATS_TIMER_GUARD(open_nanos);
      hFile = CreateFileA(
          fname.c_str(), GENERIC_WRITE,
          FILE_SHARE_READ | FILE_SHARE_DELETE,  // In RocksDb log files are
                                                // renamed and deleted before
                                                // they are closed. This enables
                                                // doing so.
          NULL,
          CREATE_ALWAYS,  // Original fopen mode is "w"
          FILE_ATTRIBUTE_NORMAL, NULL);
    }

    if (INVALID_HANDLE_VALUE == hFile) {
      auto lastError = GetLastError();
      // Separator added so the file name does not run into the message text
      s = IOErrorFromWindowsError("Failed to open LogFile: " + fname,
                                  lastError);
    } else {
      {
        // With log files we want to set the true creation time as of now
        // because the system for some reason caches the attributes of the
        // previous file that just been renamed from this name so
        // auto_roll_logger_test fails
        FILETIME ft;
        GetSystemTimeAsFileTime(&ft);
        // Set creation, last access and last write time to the same value
        SetFileTime(hFile, &ft, &ft, &ft);
      }

      int fd = _open_osfhandle(reinterpret_cast<intptr_t>(hFile), 0);
      if (fd == -1) {
        auto code = errno;
        // The handle is still ours at this point
        CloseHandle(hFile);
        s = IOError("Failed to _open_osfhandle: " + fname, code);
      } else {
        FILE* file = _fdopen(fd, "w");
        if (file == nullptr) {
          auto code = errno;
          // Release through the descriptor that now wraps the handle
          _close(fd);
          s = IOError("Failed to fdopen: " + fname, code);
        } else {
          result->reset(new WinLogger(&WinEnv::gettid, this, file));
        }
      }
    }
    return s;
  }

  // Returns the performance counter converted to microseconds.
  // The counter is split into whole seconds and a sub-second remainder
  // before scaling. Multiplying the raw counter by 1,000,000 first overflows
  // 64 bits once the counter gets large (after roughly ten days at a 10 MHz
  // counter frequency); the split form yields the same value without that
  // overflow.
  virtual uint64_t NowMicros() override {
    // On Windows 8 and Windows 2012 Server
    // GetSystemTimePreciseAsFileTime(&current_time) can be used
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);

    const uint64_t ticks = static_cast<uint64_t>(li.QuadPart);
    const uint64_t freq = static_cast<uint64_t>(perf_counter_frequency_);
    const uint64_t per_second = static_cast<uint64_t>(std::micro::den);

    // floor(ticks * per_second / freq), computed piecewise
    const uint64_t whole = (ticks / freq) * per_second;
    const uint64_t fraction = ((ticks % freq) * per_second) / freq;
    return whole + fraction;
  }

  // Returns the performance counter converted to nanoseconds.
  // The counter is split into whole seconds and a sub-second remainder
  // before scaling. Multiplying the raw counter by 1,000,000,000 first
  // overflows 64 bits quickly (after roughly fifteen minutes at a 10 MHz
  // counter frequency); the split form yields the same value without that
  // overflow.
  virtual uint64_t NowNanos() override {
    // all std::chrono clocks on windows have the same resolution that is only
    // good enough for microseconds but not nanoseconds
    // On Windows 8 and Windows 2012 Server
    // GetSystemTimePreciseAsFileTime(&current_time) can be used
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);

    const uint64_t ticks = static_cast<uint64_t>(li.QuadPart);
    const uint64_t freq = static_cast<uint64_t>(perf_counter_frequency_);
    const uint64_t per_second = static_cast<uint64_t>(std::nano::den);

    // floor(ticks * per_second / freq), computed piecewise
    const uint64_t whole = (ticks / freq) * per_second;
    const uint64_t fraction = ((ticks % freq) * per_second) / freq;
    return whole + fraction;
  }

  // Puts the calling thread to sleep for `micros` microseconds.
  virtual void SleepForMicroseconds(int micros) override {
    const std::chrono::microseconds pause(micros);
    std::this_thread::sleep_for(pause);
  }

  // Fills `name` (a buffer of `len` characters) with the computer name from
  // GetComputerNameA() and writes a terminating 0 at the reported length.
  virtual Status GetHostName(char* name, uint64_t len) override {
    DWORD nSize = static_cast<DWORD>(len);

    if (::GetComputerNameA(name, &nSize)) {
      // nSize now holds the length reported by the API
      name[nSize] = 0;
      return Status();
    }

    auto lastError = GetLastError();
    return IOErrorFromWindowsError("GetHostName", lastError);
  }

  // Stores the value of time() in *unix_time. When time() reports failure
  // *unix_time is set to 0 and an error built from errno is returned.
  virtual Status GetCurrTime(int64_t* unix_time) {
    const time_t now = time(nullptr);
    if (now != (time_t)-1) {
      *unix_time = (int64_t)now;
      return Status();
    }

    *unix_time = 0;
    return IOError("GetCurrTime", errno);
  }

  // Stores `db_path` in *output_path when it already looks absolute, i.e. it
  // starts with a slash or backslash, or starts with a non-dot character
  // followed by a colon and a slash or backslash ("c:\..." / "c:/...").
  // Otherwise stores the current working directory in *output_path.
  virtual Status GetAbsolutePath(const std::string& db_path,
                                 std::string* output_path) override {
    const bool rooted =
        !db_path.empty() && (db_path[0] == '/' || db_path[0] == '\\');
    const bool drive_qualified =
        db_path.size() > 2 && db_path[0] != '.' && db_path[1] == ':' &&
        (db_path[2] == '\\' || db_path[2] == '/');

    if (rooted || drive_qualified) {
      *output_path = db_path;
      return Status::OK();
    }

    std::string cwd(_MAX_PATH, '\0');
    if (_getcwd(&cwd[0], _MAX_PATH) == nullptr) {
      return Status::IOError("Failed to get current working directory",
                             strerror(errno));
    }

    // Trim the buffer down to the actual length of the C string
    cwd.resize(strlen(cwd.data()));

    cwd.swap(*output_path);
    return Status::OK();
  }

  // Passes the requested worker count `num` to the pool that serves
  // priority `pri`.
  virtual void SetBackgroundThreads(int num, Priority pri) override {
    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
    auto& pool = thread_pools_[pri];
    pool.SetBackgroundThreads(num);
  }

  // Forwards to the IncBackgroundThreadsIfNeeded() of the pool that serves
  // priority `pri`.
  virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
    auto& pool = thread_pools_[pri];
    pool.IncBackgroundThreadsIfNeeded(num);
  }

  virtual std::string TimeToString(uint64_t secondsSince1970) override {
    std::string result;

    const time_t seconds = secondsSince1970;
    const int maxsize = 64;

    struct tm t;
    errno_t ret = localtime_s(&t, &seconds);

    if (ret) {
      result = std::to_string(seconds);
    } else {
      result.resize(maxsize);
      char* p = &result[0];

      int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ",
                         t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
                         t.tm_min, t.tm_sec);
      assert(len > 0);

      result.resize(len);
    }

    return result;
  }

  // Returns a copy of `env_options` adjusted for log writes: mmap writes
  // off, OS buffering on, bytes_per_sync taken from
  // db_options.wal_bytes_per_sync, and fallocate_with_keep_size on.
  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
                                 const DBOptions& db_options) const override {
    EnvOptions tuned = env_options;
    tuned.use_mmap_writes = false;
    // Unbuffered io flushes only whole pages, so the last records would not
    // be guaranteed to be flushed; keep OS buffering on.
    tuned.use_os_buffer = true;
    tuned.bytes_per_sync = db_options.wal_bytes_per_sync;
    // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
    // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
    // test and make this false
    tuned.fallocate_with_keep_size = true;
    return tuned;
  }

  // Returns a copy of `env_options` adjusted for manifest writes: mmap
  // writes off, OS buffering on, fallocate_with_keep_size on.
  EnvOptions OptimizeForManifestWrite(
      const EnvOptions& env_options) const override {
    EnvOptions tuned = env_options;
    tuned.fallocate_with_keep_size = true;
    tuned.use_os_buffer = true;
    tuned.use_mmap_writes = false;
    return tuned;
  }

 private:
  // True only when attributes for `dname` can be read and they carry the
  // directory flag.
  virtual bool DirExists(const std::string& dname) {
    WIN32_FILE_ATTRIBUTE_DATA attrs;
    if (!GetFileAttributesExA(dname.c_str(), GetFileExInfoStandard, &attrs)) {
      return false;
    }
    return (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
  }

  // Always reports false, whatever the path.
  bool SupportsFastAllocate(const std::string& /* path */) {
    return false;
  }

  class ThreadPool {
   public:
    // Builds an idle pool: a limit of one thread, no worker threads, an
    // empty queue, no exit request, no low-io-priority flag and no host Env
    // (see SetHostEnv()).
    ThreadPool()
        : total_threads_limit_(1),
          bgthreads_(0),
          queue_(),
          queue_len_(0U),
          exit_all_threads_(false),
          low_io_priority_(false),
          env_(nullptr) {}

    // The pool must have no worker threads left by the time it is destroyed
    // (JoinAllThreads() empties the list).
    ~ThreadPool() {
      assert(bgthreads_.empty());
    }

    // Requests all workers to exit, waits for each of them and empties the
    // worker list. Must not be called after an exit was already requested.
    void JoinAllThreads() {
      {
        // Raise the exit flag and wake everyone while holding the mutex
        std::lock_guard<std::mutex> lock(mu_);
        assert(!exit_all_threads_);
        exit_all_threads_ = true;
        bgsignal_.notify_all();
      }

      for (auto it = bgthreads_.begin(); it != bgthreads_.end(); ++it) {
        it->join();
      }

      // The destructor asserts that this list is empty
      bgthreads_.clear();
    }

    // Records the Env this pool belongs to.
    void SetHostEnv(Env* env) {
      env_ = env;
    }

    // True when more worker threads exist than total_threads_limit_ allows,
    // i.e. at least one of them has to terminate.
    bool HasExcessiveThread() const {
      const size_t running = bgthreads_.size();
      return running > total_threads_limit_;
    }

    // True iff the pool is over its limit and `thread_id` is the index of the
    // most recently added worker. Surplus workers are terminated one at a
    // time, newest first, even when several have to go.
    bool IsLastExcessiveThread(size_t thread_id) const {
      if (!HasExcessiveThread()) {
        return false;
      }
      return thread_id == bgthreads_.size() - 1;
    }

    // True when `thread_id` lies at or beyond the current thread limit, i.e.
    // the worker is one of those that should terminate.
    bool IsExcessiveThread(size_t thread_id) const {
      return !(thread_id < total_threads_limit_);
    }

    // Returns the priority assigned to this pool, so that a member thread
    // can find out its own priority.
    Env::Priority GetThreadPriority() {
      return priority_;
    }

    // Assigns the priority reported by GetThreadPriority().
    void SetThreadPriority(Env::Priority priority) {
      priority_ = priority;
    }

    // Main loop of the worker with index `thread_id`.
    // Each iteration takes the mutex and waits until one of these holds:
    // an exit was requested, this worker is the newest surplus worker, or
    // the queue is non-empty and this worker is within the thread limit.
    // It then either leaves the loop (exit / surplus) or pops the front item
    // and runs it with the mutex released.
    void BGThread(size_t thread_id) {
      while (true) {
        // Wait until there is an item that is ready to run
        std::unique_lock<std::mutex> uniqueLock(mu_);

        // Stop waiting if the thread needs to do work or needs to terminate.
        while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
               (queue_.empty() || IsExcessiveThread(thread_id))) {
          bgsignal_.wait(uniqueLock);
        }

        if (exit_all_threads_) {
          // mechanism to let BG threads exit safely
          uniqueLock.unlock();
          break;
        }

        if (IsLastExcessiveThread(thread_id)) {
          // Current thread is the last generated one and is excessive.
          // We always terminate excessive thread in the reverse order of
          // generation time.
          std::thread& terminating_thread = bgthreads_.back();
          auto tid = terminating_thread.get_id();
          // Ensure that that this thread is ours
          assert(tid == std::this_thread::get_id());
          // Detach before removing the entry: nobody will join this thread
          terminating_thread.detach();
          bgthreads_.pop_back();

          if (HasExcessiveThread()) {
            // There is still at least more excessive thread to terminate.
            WakeUpAllThreads();
          }

          uniqueLock.unlock();

          PrintThreadInfo(thread_id, gettid());
          break;
        }

        // Take the oldest queued item and publish the new queue length
        void (*function)(void*) = queue_.front().function;
        void* arg = queue_.front().arg;
        queue_.pop_front();
        queue_len_.store(queue_.size(), std::memory_order_relaxed);

        // Run the job without holding the mutex
        uniqueLock.unlock();
        (*function)(arg);
      }
    }

    // Bundle of arguments handed to a newly created worker thread.
    struct BGThreadMetadata {
      ThreadPool* thread_pool_;  // Pool the worker belongs to.
      size_t thread_id_;         // Index of the worker within the pool.

      explicit BGThreadMetadata(ThreadPool* pool, size_t id)
          : thread_pool_(pool), thread_id_(id) {}
    };

    static void* BGThreadWrapper(void* arg) {
      std::unique_ptr<BGThreadMetadata> meta(
          reinterpret_cast<BGThreadMetadata*>(arg));

      size_t thread_id = meta->thread_id_;
      ThreadPool* tp = meta->thread_pool_;

#if ROCKSDB_USING_THREAD_STATUS
      // for thread-status
      ThreadStatusUtil::RegisterThread(
          tp->env_, (tp->GetThreadPriority() == Env::Priority::HIGH
                         ? ThreadStatus::HIGH_PRIORITY
                         : ThreadStatus::LOW_PRIORITY));
#endif
      tp->BGThread(thread_id);
#if ROCKSDB_USING_THREAD_STATUS
      ThreadStatusUtil::UnregisterThread();
#endif
      return nullptr;
    }

    // Wakes every thread currently blocked on bgsignal_ so each one can
    // re-evaluate its wait condition.
    void WakeUpAllThreads() {
      bgsignal_.notify_all();
    }

    // Updates the pool's thread limit under mu_.
    //   num          - requested limit; values below 1 are raised to 1.
    //   allow_reduce - when false the limit can only grow.
    // Does nothing once the pool has been told to shut down.
    void SetBackgroundThreadsInternal(size_t num, bool allow_reduce) {
      std::lock_guard<std::mutex> guard(mu_);

      if (!exit_all_threads_) {
        const bool wants_more = num > total_threads_limit_;
        const bool wants_fewer = allow_reduce && num < total_threads_limit_;

        if (wants_more || wants_fewer) {
          total_threads_limit_ = std::max(size_t(1), num);
          // Let waiting threads notice the new limit (excess ones will
          // terminate), then spawn any threads that are now missing.
          WakeUpAllThreads();
          StartBGThreads();
        }
        assert(total_threads_limit_ > 0);
      }
    }

    // Raises the pool's thread limit to `num` if it is currently lower; the
    // limit is never reduced by this call.
    void IncBackgroundThreadsIfNeeded(int num) {
      // SetBackgroundThreadsInternal() takes a size_t. Passing a negative int
      // straight through would wrap to a huge unsigned value and request an
      // enormous number of threads, so negative requests are treated as 0
      // (which never exceeds the current limit and is therefore a no-op).
      const size_t requested = num > 0 ? static_cast<size_t>(num) : size_t(0);
      SetBackgroundThreadsInternal(requested, false);
    }

    // Sets the pool's thread limit to `num`, growing or shrinking as needed.
    // Requests below 1 result in a limit of 1.
    void SetBackgroundThreads(int num) {
      // SetBackgroundThreadsInternal() takes a size_t. Passing a negative int
      // straight through would wrap to a huge unsigned value and request an
      // enormous number of threads, so negative requests are treated as 0
      // (which the callee then raises to its minimum of 1).
      const size_t requested = num > 0 ? static_cast<size_t>(num) : size_t(0);
      SetBackgroundThreadsInternal(requested, true);
    }

    void StartBGThreads() {
      // Start background thread if necessary
      while (bgthreads_.size() < total_threads_limit_) {
        std::thread p_t(&ThreadPool::BGThreadWrapper,
                        new BGThreadMetadata(this, bgthreads_.size()));
        bgthreads_.push_back(std::move(p_t));
      }
    }

    void Schedule(void (*function)(void* arg1), void* arg, void* tag) {
      std::lock_guard<std::mutex> lg(mu_);

      if (exit_all_threads_) {
        return;
      }

      StartBGThreads();

      // Add to priority queue
      queue_.push_back(BGItem());
      queue_.back().function = function;
      queue_.back().arg = arg;
      queue_.back().tag = tag;
      queue_len_.store(queue_.size(), std::memory_order_relaxed);

      if (!HasExcessiveThread()) {
        // Wake up at least one waiting thread.
        bgsignal_.notify_one();
      } else {
        // Need to wake up all threads to make sure the one woken
        // up is not the one to terminate.
        WakeUpAllThreads();
      }
    }

    int UnSchedule(void* arg) {
      int count = 0;

      std::lock_guard<std::mutex> lg(mu_);

      // Remove from priority queue
      BGQueue::iterator it = queue_.begin();
      while (it != queue_.end()) {
        if (arg == (*it).tag) {
          it = queue_.erase(it);
          count++;
        } else {
          ++it;
        }
      }

      queue_len_.store(queue_.size(), std::memory_order_relaxed);

      return count;
    }

    // Returns the last published queue length. Reads the atomic counter with
    // relaxed ordering and does not take mu_.
    unsigned int GetQueueLen() const {
      const size_t pending = queue_len_.load(std::memory_order_relaxed);
      return static_cast<unsigned int>(pending);
    }

   private:
    // One queued unit of work; Schedule() creates exactly one per call.
    struct BGItem {
      // Opaque argument handed to `function` when the item is executed.
      void* arg;
      // Callback a worker thread invokes as function(arg).
      void (*function)(void*);
      // Caller-supplied key; UnSchedule() removes the items whose tag matches.
      void* tag;
    };

    // Pending work: Schedule() appends at the back, workers pop from the
    // front.
    typedef std::deque<BGItem> BGQueue;

    // Locked by Schedule(), UnSchedule() and SetBackgroundThreadsInternal()
    // while they read or modify the members below.
    std::mutex mu_;
    // Worker threads wait on this; it is notified when work is queued or when
    // the thread limit changes.
    std::condition_variable bgsignal_;
    // Target number of threads in the pool; kept at 1 or more.
    size_t total_threads_limit_;
    // Threads started by StartBGThreads(); a thread's id is its index here.
    std::vector<std::thread> bgthreads_;
    // The queue of items waiting to be executed.
    BGQueue queue_;
    std::atomic_size_t queue_len_;  // Queue length. Used for stats reporting
    // When set, workers leave their loop and Schedule() and
    // SetBackgroundThreadsInternal() return without doing anything.
    bool exit_all_threads_;
    // NOTE(review): not referenced in this part of the file - presumably
    // requests lowered I/O priority for the pool's threads; confirm.
    bool low_io_priority_;
    // NOTE(review): presumably the value behind SetThreadPriority() /
    // GetThreadPriority(); confirm against their definitions.
    Env::Priority priority_;
    // Env passed to ThreadStatusUtil::RegisterThread() for each worker;
    // presumably assigned by SetHostEnv() - confirm.
    Env* env_;
  };

  // NOTE(review): initialised to false by the constructor; its use is not
  // visible in this part of the file - presumably records whether the disk
  // has already been probed for mmap support; confirm.
  bool checkedDiskForMmap_;
  bool forceMmapOff;  // do we override Env options?
  // System page size, taken from SYSTEM_INFO::dwPageSize in the constructor.
  size_t page_size_;
  // Taken from SYSTEM_INFO::dwAllocationGranularity in the constructor.
  size_t allocation_granularity_;
  // Value reported by QueryPerformanceFrequency() in the constructor.
  uint64_t perf_counter_frequency_;
  // One pool per Env::Priority value, indexed by the priority.
  std::vector<ThreadPool> thread_pools_;
  // Held by StartThread() while it appends to threads_to_join_.
  mutable std::mutex mu_;
  // Threads created by StartThread(); WaitForJoin() joins and clears them.
  std::vector<std::thread> threads_to_join_;
};

// Builds the Windows Env: queries the system page size, allocation
// granularity and performance-counter frequency, creates one thread pool per
// Env::Priority and installs the thread-status updater.
WinEnv::WinEnv()
    : checkedDiskForMmap_(false),
      forceMmapOff(false),
      // 4 KiB placeholder; replaced by the real value from GetSystemInfo()
      // below.
      page_size_(4 * 1024),
      allocation_granularity_(page_size_),
      perf_counter_frequency_(0),
      thread_pools_(Priority::TOTAL) {
  SYSTEM_INFO sinfo;
  GetSystemInfo(&sinfo);

  page_size_ = sinfo.dwPageSize;
  allocation_granularity_ = sinfo.dwAllocationGranularity;

  {
    LARGE_INTEGER qpf;
    BOOL ret = QueryPerformanceFrequency(&qpf);
    // Success is documented as "nonzero", not specifically TRUE.
    assert(ret != FALSE);
    static_cast<void>(ret);  // Only read by the assert above.
    perf_counter_frequency_ = qpf.QuadPart;
  }

  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
    thread_pools_[pool_id].SetThreadPriority(
        static_cast<Env::Priority>(pool_id));
    // This allows later initializing the thread-local-env of each thread.
    thread_pools_[pool_id].SetHostEnv(this);
  }

  // Protected member of the base class
  thread_status_updater_ = CreateThreadStatusUpdater();
}

// Queues function(arg) on the thread pool that serves priority `pri`.
// `tag` is forwarded to the pool together with the work item.
void WinEnv::Schedule(void (*function)(void*), void* arg, Priority pri,
                      void* tag) {
  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
  auto& target_pool = thread_pools_[pri];
  target_pool.Schedule(function, arg, tag);
}

// Asks the pool serving priority `pri` to drop its queued items identified by
// `arg` and returns the number that pool reports as removed.
int WinEnv::UnSchedule(void* arg, Priority pri) {
  // Same range check as Schedule() and GetThreadPoolQueueLen(): `pri` is used
  // as an index into thread_pools_.
  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
  return thread_pools_[pri].UnSchedule(arg);
}

// Returns the queue length reported by the pool that serves priority `pri`.
unsigned int WinEnv::GetThreadPoolQueueLen(Priority pri) const {
  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
  const auto& target_pool = thread_pools_[pri];
  return target_pool.GetQueueLen();
}

namespace {
// Heap-allocated hand-off from WinEnv::StartThread() to StartThreadWrapper():
// the function the new thread must run and the argument to call it with.
struct StartThreadState {
  void (*user_function)(void*);  // Function to execute on the new thread.
  void* arg;                     // Passed unchanged to user_function.
};
}

// Entry point of threads created by WinEnv::StartThread(). Takes ownership of
// the StartThreadState passed in `arg`, invokes the user's function with the
// user's argument and frees the state afterwards. Always returns nullptr.
static void* StartThreadWrapper(void* arg) {
  const std::unique_ptr<StartThreadState> owned_state(
      static_cast<StartThreadState*>(arg));
  void (*const entry_point)(void*) = owned_state->user_function;
  entry_point(owned_state->arg);
  return nullptr;
}

void WinEnv::StartThread(void (*function)(void* arg), void* arg) {
  StartThreadState* state = new StartThreadState;
  state->user_function = function;
  state->arg = arg;
  try {
    std::thread th(&StartThreadWrapper, state);

    std::lock_guard<std::mutex> lg(mu_);
    threads_to_join_.push_back(std::move(th));

  } catch (const std::system_error& ex) {
    WinthreadCall("start thread", ex.code());
  }
}

// Blocks until every thread recorded by StartThread() has finished, then
// forgets them.
void WinEnv::WaitForJoin() {
  for (auto it = threads_to_join_.begin(); it != threads_to_join_.end();
       ++it) {
    it->join();
  }

  threads_to_join_.clear();
}

}  // namespace

std::string Env::GenerateUniqueId() {
  std::string result;

  UUID uuid;
  UuidCreateSequential(&uuid);

  RPC_CSTR rpc_str;
  auto status = UuidToStringA(&uuid, &rpc_str);
  assert(status == RPC_S_OK);

  result = reinterpret_cast<char*>(rpc_str);

  status = RpcStringFreeA(&rpc_str);
  assert(status == RPC_S_OK);

  return result;
}

// We choose to create this on the heap, using std::call_once, for the
// following reasons:
// 1) The currently available MS compiler does not implement the atomic
//    (thread-safe) C++11 initialization of function-local statics.
// 2) We choose not to destroy the env, because joining the threads from the
//    system loader, which destroys the statics (same as from DllMain),
//    creates a system loader deadlock. This way any remaining threads are
//    terminated OK.
namespace {
// Ensures the default Env is constructed exactly once.
std::once_flag winenv_once_flag;
// The process-wide default Env; null until Env::Default() first runs. It is
// never deleted.
Env* envptr = nullptr;
}  // namespace

// Returns the process-wide default Env, creating the WinEnv on the heap the
// first time it is requested. The instance is never destroyed.
Env* Env::Default() {
  const auto create_default_env = []() { envptr = new WinEnv(); };
  std::call_once(winenv_once_flag, create_default_env);
  return envptr;
}

}  // namespace rocksdb