rocksdb/util/file_reader_writer.cc

//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "util/file_reader_writer.h"

#include <algorithm>
#include "port/port.h"
#include "util/iostats_context_imp.h"
#include "util/random.h"
#include "util/rate_limiter.h"
#include "util/sync_point.h"

namespace rocksdb {
Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
  Status s = file_->Read(n, result, scratch);
  IOSTATS_ADD(bytes_read, result->size());
  return s;
}

Status SequentialFileReader::Skip(uint64_t n) { return file_->Skip(n); }

Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
                                    char* scratch) const {
  IOSTATS_TIMER_GUARD(read_nanos);
  Status s = file_->Read(offset, n, result, scratch);
  IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
  return s;
}

Status WritableFileWriter::Append(const Slice& data) {
  const char* src = data.data();
  size_t left = data.size();
  Status s;
  pending_sync_ = true;
  pending_fsync_ = true;

  TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);

  writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);
  // if there is no space in the cache, then flush
  if (cursize_ + left > capacity_) {
    s = Flush();
    if (!s.ok()) {
      return s;
    }
    // Increase the buffer size, but capped at 1MB
    if (capacity_ < (1 << 20)) {
      capacity_ *= 2;
      buf_.reset(new char[capacity_]);
    }
    assert(cursize_ == 0);
  }

  // if the write fits into the cache, then write to cache
  // otherwise do a write() syscall to write to OS buffers.
  if (cursize_ + left <= capacity_) {
    memcpy(buf_.get() + cursize_, src, left);
    cursize_ += left;
  } else {
    while (left != 0) {
      size_t size = RequestToken(left);
      {
        IOSTATS_TIMER_GUARD(write_nanos);
        s = writable_file_->Append(Slice(src, size));
        if (!s.ok()) {
          return s;
        }
      }
      IOSTATS_ADD(bytes_written, size);
      TEST_KILL_RANDOM(rocksdb_kill_odds);

      left -= size;
      src += size;
    }
  }
  TEST_KILL_RANDOM(rocksdb_kill_odds);
  filesize_ += data.size();
  return Status::OK();
}

Status WritableFileWriter::Close() {
  Status s;
  s = Flush();  // flush cache to OS
  if (!s.ok()) {
    return s;
  }

  TEST_KILL_RANDOM(rocksdb_kill_odds);
  return writable_file_->Close();
}

// write out the cached data to the OS cache
Status WritableFileWriter::Flush() {
  TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
  size_t left = cursize_;
  char* src = buf_.get();
  while (left != 0) {
    size_t size = RequestToken(left);
    {
      IOSTATS_TIMER_GUARD(write_nanos);
      Status s = writable_file_->Append(Slice(src, size));
      if (!s.ok()) {
        return s;
      }
    }
    IOSTATS_ADD(bytes_written, size);
    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
    left -= size;
    src += size;
  }
  cursize_ = 0;

  writable_file_->Flush();

  // sync OS cache to disk for every bytes_per_sync_
  // TODO: give log file and sst file different options (log
  // files could be potentially cached in OS for their whole
  // life time, thus we might not want to flush at all).

  // We try to avoid sync to the last 1MB of data. For two reasons:
  // (1) avoid rewrite the same page that is modified later.
  // (2) for older version of OS, write can block while writing out
  //     the page.
  // Xfs does neighbor page flushing outside of the specified ranges. We
  // need to make sure sync range is far from the write offset.
  if (bytes_per_sync_) {
    uint64_t kBytesNotSyncRange = 1024 * 1024;  // recent 1MB is not synced.
    uint64_t kBytesAlignWhenSync = 4 * 1024;    // Align 4KB.
    if (filesize_ > kBytesNotSyncRange) {
      uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
      offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
      assert(offset_sync_to >= last_sync_size_);
      if (offset_sync_to > 0 &&
          offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
        RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
        last_sync_size_ = offset_sync_to;
      }
    }
  }

  return Status::OK();
}

Status WritableFileWriter::Sync(bool use_fsync) {
  Status s = Flush();
  if (!s.ok()) {
    return s;
  }
  TEST_KILL_RANDOM(rocksdb_kill_odds);
  if (pending_sync_) {
    if (use_fsync) {
      s = writable_file_->Fsync();
    } else {
      s = writable_file_->Sync();
    }
    if (!s.ok()) {
      return s;
    }
  }
  TEST_KILL_RANDOM(rocksdb_kill_odds);
  pending_sync_ = false;
  if (use_fsync) {
    pending_fsync_ = false;
  }
  return Status::OK();
}

Status WritableFileWriter::RangeSync(off_t offset, off_t nbytes) {
  IOSTATS_TIMER_GUARD(range_sync_nanos);
  return writable_file_->RangeSync(offset, nbytes);
}

size_t WritableFileWriter::RequestToken(size_t bytes) {
  Env::IOPriority io_priority;
  if (rate_limiter_&&(io_priority = writable_file_->GetIOPriority()) <
      Env::IO_TOTAL) {
    bytes = std::min(bytes,
                     static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
    rate_limiter_->Request(bytes, io_priority);
  }
  return bytes;
}

Status RandomRWFileAccessor::Write(uint64_t offset, const Slice& data) {
  Status s;
  pending_sync_ = true;
  pending_fsync_ = true;

  {
    IOSTATS_TIMER_GUARD(write_nanos);
    s = random_rw_file_->Write(offset, data);
    if (!s.ok()) {
      return s;
    }
  }
  IOSTATS_ADD(bytes_written, data.size());

  return s;
}

Status RandomRWFileAccessor::Read(uint64_t offset, size_t n, Slice* result,
                                  char* scratch) const {
  Status s;
  {
    IOSTATS_TIMER_GUARD(read_nanos);
    s = random_rw_file_->Read(offset, n, result, scratch);
    if (!s.ok()) {
      return s;
    }
  }
  IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
  return s;
}

Status RandomRWFileAccessor::Close() { return random_rw_file_->Close(); }

Status RandomRWFileAccessor::Sync(bool use_fsync) {
  Status s;
  if (pending_sync_) {
    if (use_fsync) {
      s = random_rw_file_->Fsync();
    } else {
      s = random_rw_file_->Sync();
    }
    if (!s.ok()) {
      return s;
    }
  }
  if (use_fsync) {
    pending_fsync_ = false;
  }
  pending_sync_ = false;

  return s;
}
}  // namespace rocksdb
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future. Test Plan: Run all existing unit tests. Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D42321 2015-07-18 01:16:11 +02:00			`// Copyright (c) 2013, Facebook, Inc. All rights reserved.`
			`// This source code is licensed under the BSD-style license found in the`
			`// LICENSE file in the root directory of this source tree. An additional grant`
			`// of patent rights can be found in the PATENTS file in the same directory.`
			`//`
			`// Copyright (c) 2011 The LevelDB Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style license that can be`
			`// found in the LICENSE file. See the AUTHORS file for names of contributors.`

			`#include "util/file_reader_writer.h"`

			`#include <algorithm>`
			`#include "port/port.h"`
			`#include "util/iostats_context_imp.h"`
			`#include "util/random.h"`
			`#include "util/rate_limiter.h"`
			`#include "util/sync_point.h"`

			`namespace rocksdb {`
			`Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {`
			`Status s = file_->Read(n, result, scratch);`
			`IOSTATS_ADD(bytes_read, result->size());`
			`return s;`
			`}`

			`Status SequentialFileReader::Skip(uint64_t n) { return file_->Skip(n); }`

			`Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,`
			`char* scratch) const {`
			`IOSTATS_TIMER_GUARD(read_nanos);`
			`Status s = file_->Read(offset, n, result, scratch);`
			`IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());`
			`return s;`
			`}`

			`Status WritableFileWriter::Append(const Slice& data) {`
			`const char* src = data.data();`
			`size_t left = data.size();`
			`Status s;`
			`pending_sync_ = true;`
			`pending_fsync_ = true;`

			`TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);`

			`writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);`
			`// if there is no space in the cache, then flush`
			`if (cursize_ + left > capacity_) {`
			`s = Flush();`
			`if (!s.ok()) {`
			`return s;`
			`}`
			`// Increase the buffer size, but capped at 1MB`
			`if (capacity_ < (1 << 20)) {`
			`capacity_ *= 2;`
			`buf_.reset(new char[capacity_]);`
			`}`
			`assert(cursize_ == 0);`
			`}`

			`// if the write fits into the cache, then write to cache`
			`// otherwise do a write() syscall to write to OS buffers.`
			`if (cursize_ + left <= capacity_) {`
			`memcpy(buf_.get() + cursize_, src, left);`
			`cursize_ += left;`
			`} else {`
			`while (left != 0) {`
			`size_t size = RequestToken(left);`
			`{`
			`IOSTATS_TIMER_GUARD(write_nanos);`
			`s = writable_file_->Append(Slice(src, size));`
			`if (!s.ok()) {`
			`return s;`
			`}`
			`}`
			`IOSTATS_ADD(bytes_written, size);`
			`TEST_KILL_RANDOM(rocksdb_kill_odds);`

			`left -= size;`
			`src += size;`
			`}`
			`}`
			`TEST_KILL_RANDOM(rocksdb_kill_odds);`
			`filesize_ += data.size();`
			`return Status::OK();`
			`}`

			`Status WritableFileWriter::Close() {`
			`Status s;`
			`s = Flush(); // flush cache to OS`
			`if (!s.ok()) {`
			`return s;`
			`}`

			`TEST_KILL_RANDOM(rocksdb_kill_odds);`
			`return writable_file_->Close();`
			`}`

			`// write out the cached data to the OS cache`
			`Status WritableFileWriter::Flush() {`
			`TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);`
			`size_t left = cursize_;`
			`char* src = buf_.get();`
			`while (left != 0) {`
			`size_t size = RequestToken(left);`
			`{`
			`IOSTATS_TIMER_GUARD(write_nanos);`
			`Status s = writable_file_->Append(Slice(src, size));`
			`if (!s.ok()) {`
			`return s;`
			`}`
			`}`
			`IOSTATS_ADD(bytes_written, size);`
			`TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);`
			`left -= size;`
			`src += size;`
			`}`
			`cursize_ = 0;`

			`writable_file_->Flush();`

			`// sync OS cache to disk for every bytes_per_sync_`
			`// TODO: give log file and sst file different options (log`
			`// files could be potentially cached in OS for their whole`
			`// life time, thus we might not want to flush at all).`
RangeSync not to sync last 1MB of the file Summary: From other ones' investigation: "sync_file_range() behavior highly depends on kernel version and filesystem. xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)." Test Plan: make all check Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony Reviewed By: anthony Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin Differential Revision: https://reviews.facebook.net/D15807 2015-07-20 23:46:15 +02:00
			`// We try to avoid sync to the last 1MB of data. For two reasons:`
			`// (1) avoid rewrite the same page that is modified later.`
			`// (2) for older version of OS, write can block while writing out`
			`// the page.`
			`// Xfs does neighbor page flushing outside of the specified ranges. We`
			`// need to make sure sync range is far from the write offset.`
			`if (bytes_per_sync_) {`
			`uint64_t kBytesNotSyncRange = 1024 * 1024; // recent 1MB is not synced.`
			`uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.`
			`if (filesize_ > kBytesNotSyncRange) {`
			`uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;`
			`offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;`
			`assert(offset_sync_to >= last_sync_size_);`
			`if (offset_sync_to > 0 &&`
			`offset_sync_to - last_sync_size_ >= bytes_per_sync_) {`
			`RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);`
			`last_sync_size_ = offset_sync_to;`
			`}`
			`}`
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future. Test Plan: Run all existing unit tests. Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D42321 2015-07-18 01:16:11 +02:00			`}`

			`return Status::OK();`
			`}`

			`Status WritableFileWriter::Sync(bool use_fsync) {`
			`Status s = Flush();`
			`if (!s.ok()) {`
			`return s;`
			`}`
			`TEST_KILL_RANDOM(rocksdb_kill_odds);`
			`if (pending_sync_) {`
			`if (use_fsync) {`
			`s = writable_file_->Fsync();`
			`} else {`
			`s = writable_file_->Sync();`
			`}`
			`if (!s.ok()) {`
			`return s;`
			`}`
			`}`
			`TEST_KILL_RANDOM(rocksdb_kill_odds);`
			`pending_sync_ = false;`
			`if (use_fsync) {`
			`pending_fsync_ = false;`
			`}`
			`return Status::OK();`
			`}`

			`Status WritableFileWriter::RangeSync(off_t offset, off_t nbytes) {`
			`IOSTATS_TIMER_GUARD(range_sync_nanos);`
			`return writable_file_->RangeSync(offset, nbytes);`
			`}`

			`size_t WritableFileWriter::RequestToken(size_t bytes) {`
			`Env::IOPriority io_priority;`
			`if (rate_limiter_&&(io_priority = writable_file_->GetIOPriority()) <`
			`Env::IO_TOTAL) {`
			`bytes = std::min(bytes,`
			`static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));`
			`rate_limiter_->Request(bytes, io_priority);`
			`}`
			`return bytes;`
			`}`

			`Status RandomRWFileAccessor::Write(uint64_t offset, const Slice& data) {`
			`Status s;`
			`pending_sync_ = true;`
			`pending_fsync_ = true;`

			`{`
			`IOSTATS_TIMER_GUARD(write_nanos);`
			`s = random_rw_file_->Write(offset, data);`
			`if (!s.ok()) {`
			`return s;`
			`}`
			`}`
			`IOSTATS_ADD(bytes_written, data.size());`

			`return s;`
			`}`

			`Status RandomRWFileAccessor::Read(uint64_t offset, size_t n, Slice* result,`
			`char* scratch) const {`
			`Status s;`
			`{`
			`IOSTATS_TIMER_GUARD(read_nanos);`
			`s = random_rw_file_->Read(offset, n, result, scratch);`
			`if (!s.ok()) {`
			`return s;`
			`}`
			`}`
			`IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());`
			`return s;`
			`}`

			`Status RandomRWFileAccessor::Close() { return random_rw_file_->Close(); }`

			`Status RandomRWFileAccessor::Sync(bool use_fsync) {`
			`Status s;`
			`if (pending_sync_) {`
			`if (use_fsync) {`
			`s = random_rw_file_->Fsync();`
			`} else {`
			`s = random_rw_file_->Sync();`
			`}`
			`if (!s.ok()) {`
			`return s;`
			`}`
			`}`
			`if (use_fsync) {`
			`pending_fsync_ = false;`
			`}`
			`pending_sync_ = false;`

			`return s;`
			`}`
			`} // namespace rocksdb`