Improve SimulatedHybridFileSystem (#9301)

Summary:
Several improvements to SimulatedHybridFileSystem:
(1) Add a mode in which I/Os to all files simulate HDD latency. This can be enabled in db_bench using -simulate_hdd.
(2) Latency calculation is slightly more accurate.
(3) Allow simulating more than one HDD spindle.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9301

Test Plan: Run db_bench and observe the results are reasonable.

Reviewed By: jay-zhuang

Differential Revision: D33141662

fbshipit-source-id: b736e58c4ba910d06899cc9ccec79b628275f4fa
This commit is contained in:
sdong 2021-12-29 11:13:49 -08:00 committed by Facebook GitHub Bot
parent 1c39b7952b
commit a931bacf5d
4 changed files with 58 additions and 29 deletions

View File

@ -7,6 +7,7 @@
## 6.28.0 (2021-12-17) ## 6.28.0 (2021-12-17)
### New Features ### New Features
* Introduced 'CommitWithTimestamp' as a new tag. Currently, there is no API for user to trigger a write with this tag to the WAL. This is part of the efforts to support write-committed transactions with user-defined timestamps. * Introduced 'CommitWithTimestamp' as a new tag. Currently, there is no API for user to trigger a write with this tag to the WAL. This is part of the efforts to support write-committed transactions with user-defined timestamps.
* Introduce SimulatedHybridFileSystem, which can help simulate HDD latency in db_bench. Tiered Storage latency simulation can be enabled using -simulate_hybrid_fs_file (note that it doesn't work if db_bench is interrupted in the middle). -simulate_hdd can also be used to simulate all files on HDD.
### Bug Fixes ### Bug Fixes
* Fixed a bug in rocksdb automatic implicit prefetching which got broken because of new feature adaptive_readahead and internal prefetching got disabled when iterator moves from one file to next. * Fixed a bug in rocksdb automatic implicit prefetching which got broken because of new feature adaptive_readahead and internal prefetching got disabled when iterator moves from one file to next.

View File

@ -1154,6 +1154,10 @@ DEFINE_string(simulate_hybrid_fs_file, "",
"File for Store Metadata for Simulate hybrid FS. Empty means " "File for Store Metadata for Simulate hybrid FS. Empty means "
"disable the feature. Now, if it is set, " "disable the feature. Now, if it is set, "
"bottommost_temperature is set to kWarm."); "bottommost_temperature is set to kWarm.");
DEFINE_int32(simulate_hybrid_hdd_multipliers, 1,
"In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs "
"are simulated.");
DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD.");
static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard; static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
@ -8135,12 +8139,15 @@ int db_bench_tool(int argc, char** argv) {
fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str()); fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
exit(1); exit(1);
} }
} else if (FLAGS_simulate_hybrid_fs_file != "") { } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
//**TODO: Make the simulate fs something that can be loaded //**TODO: Make the simulate fs something that can be loaded
// from the ObjectRegistry... // from the ObjectRegistry...
static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env = static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>( NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
FileSystem::Default(), FLAGS_simulate_hybrid_fs_file)); FileSystem::Default(), FLAGS_simulate_hybrid_fs_file,
/*throughput_multiplier=*/
int{FLAGS_simulate_hybrid_hdd_multipliers},
/*is_full_fs_warm=*/FLAGS_simulate_hdd));
FLAGS_env = composite_env.get(); FLAGS_env = composite_env.get();
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE

View File

@ -3,6 +3,7 @@
// COPYING file in the root directory) and Apache 2.0 License // COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory). // (found in the LICENSE.Apache file in the root directory).
#include "util/stop_watch.h"
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include "tools/simulated_hybrid_file_system.h" #include "tools/simulated_hybrid_file_system.h"
@ -15,7 +16,6 @@
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
const int kLatencyAddedPerRequestUs = 15000;
const int64_t kUsPerSec = 1000000; const int64_t kUsPerSec = 1000000;
const int64_t kDummyBytesPerUs = 1024; const int64_t kDummyBytesPerUs = 1024;
@ -43,14 +43,17 @@ void RateLimiterRequest(RateLimiter* rater_limiter, int64_t amount) {
// warm // warm
SimulatedHybridFileSystem::SimulatedHybridFileSystem( SimulatedHybridFileSystem::SimulatedHybridFileSystem(
const std::shared_ptr<FileSystem>& base, const std::shared_ptr<FileSystem>& base,
const std::string& metadata_file_name) const std::string& metadata_file_name, int throughput_multiplier,
bool is_full_fs_warm)
: FileSystemWrapper(base), : FileSystemWrapper(base),
// Limit to 100 requests per second. // Limit to 100 requests per second.
rate_limiter_(NewGenericRateLimiter( rate_limiter_(NewGenericRateLimiter(
kDummyBytesPerUs * kUsPerSec /* rate_bytes_per_sec */, int64_t{throughput_multiplier} * kDummyBytesPerUs *
kUsPerSec /* rate_bytes_per_sec */,
1000 /* refill_period_us */)), 1000 /* refill_period_us */)),
metadata_file_name_(metadata_file_name), metadata_file_name_(metadata_file_name),
name_("SimulatedHybridFileSystem: " + std::string(target()->Name())) { name_("SimulatedHybridFileSystem: " + std::string(target()->Name())),
is_full_fs_warm_(is_full_fs_warm) {
IOStatus s = base->FileExists(metadata_file_name, IOOptions(), nullptr); IOStatus s = base->FileExists(metadata_file_name, IOOptions(), nullptr);
if (s.IsNotFound()) { if (s.IsNotFound()) {
return; return;
@ -77,6 +80,9 @@ SimulatedHybridFileSystem::SimulatedHybridFileSystem(
// SimulatedHybridFileSystem::SimulatedHybridFileSystem() for format of the // SimulatedHybridFileSystem::SimulatedHybridFileSystem() for format of the
// file. // file.
SimulatedHybridFileSystem::~SimulatedHybridFileSystem() { SimulatedHybridFileSystem::~SimulatedHybridFileSystem() {
if (metadata_file_name_.empty()) {
return;
}
std::string metadata; std::string metadata;
for (const auto& f : warm_file_set_) { for (const auto& f : warm_file_set_) {
metadata += f; metadata += f;
@ -93,13 +99,15 @@ IOStatus SimulatedHybridFileSystem::NewRandomAccessFile(
const std::string& fname, const FileOptions& file_opts, const std::string& fname, const FileOptions& file_opts,
std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) { std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
Temperature temperature = Temperature::kUnknown; Temperature temperature = Temperature::kUnknown;
{ if (is_full_fs_warm_) {
temperature = Temperature::kWarm;
} else {
const std::lock_guard<std::mutex> lock(mutex_); const std::lock_guard<std::mutex> lock(mutex_);
if (warm_file_set_.find(fname) != warm_file_set_.end()) { if (warm_file_set_.find(fname) != warm_file_set_.end()) {
temperature = Temperature::kWarm; temperature = Temperature::kWarm;
} }
assert(temperature == file_opts.temperature);
} }
assert(temperature == file_opts.temperature);
IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
result->reset( result->reset(
new SimulatedHybridRaf(std::move(*result), rate_limiter_, temperature)); new SimulatedHybridRaf(std::move(*result), rate_limiter_, temperature));
@ -115,7 +123,7 @@ IOStatus SimulatedHybridFileSystem::NewWritableFile(
} }
IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
if (file_opts.temperature == Temperature::kWarm) { if (file_opts.temperature == Temperature::kWarm || is_full_fs_warm_) {
result->reset(new SimulatedWritableFile(std::move(*result), rate_limiter_)); result->reset(new SimulatedWritableFile(std::move(*result), rate_limiter_));
} }
return s; return s;
@ -135,8 +143,7 @@ IOStatus SimulatedHybridRaf::Read(uint64_t offset, size_t n,
const IOOptions& options, Slice* result, const IOOptions& options, Slice* result,
char* scratch, IODebugContext* dbg) const { char* scratch, IODebugContext* dbg) const {
if (temperature_ == Temperature::kWarm) { if (temperature_ == Temperature::kWarm) {
Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs); SimulateIOWait(n);
RequestRateLimit(n);
} }
return target()->Read(offset, n, options, result, scratch, dbg); return target()->Read(offset, n, options, result, scratch, dbg);
} }
@ -146,10 +153,8 @@ IOStatus SimulatedHybridRaf::MultiRead(FSReadRequest* reqs, size_t num_reqs,
IODebugContext* dbg) { IODebugContext* dbg) {
if (temperature_ == Temperature::kWarm) { if (temperature_ == Temperature::kWarm) {
for (size_t i = 0; i < num_reqs; i++) { for (size_t i = 0; i < num_reqs; i++) {
RequestRateLimit(reqs[i].len); SimulateIOWait(reqs[i].len);
} }
Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs *
static_cast<int>(num_reqs));
} }
return target()->MultiRead(reqs, num_reqs, options, dbg); return target()->MultiRead(reqs, num_reqs, options, dbg);
} }
@ -158,24 +163,34 @@ IOStatus SimulatedHybridRaf::Prefetch(uint64_t offset, size_t n,
const IOOptions& options, const IOOptions& options,
IODebugContext* dbg) { IODebugContext* dbg) {
if (temperature_ == Temperature::kWarm) { if (temperature_ == Temperature::kWarm) {
RequestRateLimit(n); SimulateIOWait(n);
Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs);
} }
return target()->Prefetch(offset, n, options, dbg); return target()->Prefetch(offset, n, options, dbg);
} }
void SimulatedHybridRaf::RequestRateLimit(int64_t bytes) const { void SimulatedHybridRaf::SimulateIOWait(int64_t bytes) const {
RateLimiterRequest(rate_limiter_.get(), CalculateServeTimeUs(bytes)); int serve_time = CalculateServeTimeUs(bytes);
{
StopWatchNano stop_watch(Env::Default()->GetSystemClock().get(),
/*auto_start=*/true);
RateLimiterRequest(rate_limiter_.get(), serve_time);
int time_passed_us = static_cast<int>(stop_watch.ElapsedNanos() / 1000);
if (time_passed_us < serve_time) {
Env::Default()->SleepForMicroseconds(serve_time - time_passed_us);
}
}
} }
void SimulatedWritableFile::RequestRateLimit(int64_t bytes) const { void SimulatedWritableFile::SimulateIOWait(int64_t bytes) const {
RateLimiterRequest(rate_limiter_.get(), CalculateServeTimeUs(bytes)); int serve_time = CalculateServeTimeUs(bytes);
Env::Default()->SleepForMicroseconds(serve_time);
RateLimiterRequest(rate_limiter_.get(), serve_time);
} }
IOStatus SimulatedWritableFile::Append(const Slice& data, const IOOptions& ioo, IOStatus SimulatedWritableFile::Append(const Slice& data, const IOOptions& ioo,
IODebugContext* idc) { IODebugContext* idc) {
if (use_direct_io()) { if (use_direct_io()) {
RequestRateLimit(data.size()); SimulateIOWait(data.size());
} else { } else {
unsynced_bytes += data.size(); unsynced_bytes += data.size();
} }
@ -186,7 +201,7 @@ IOStatus SimulatedWritableFile::Append(
const Slice& data, const IOOptions& options, const Slice& data, const IOOptions& options,
const DataVerificationInfo& verification_info, IODebugContext* dbg) { const DataVerificationInfo& verification_info, IODebugContext* dbg) {
if (use_direct_io()) { if (use_direct_io()) {
RequestRateLimit(data.size()); SimulateIOWait(data.size());
} else { } else {
unsynced_bytes += data.size(); unsynced_bytes += data.size();
} }
@ -198,7 +213,7 @@ IOStatus SimulatedWritableFile::PositionedAppend(const Slice& data,
const IOOptions& options, const IOOptions& options,
IODebugContext* dbg) { IODebugContext* dbg) {
if (use_direct_io()) { if (use_direct_io()) {
RequestRateLimit(data.size()); SimulateIOWait(data.size());
} else { } else {
// This might be overcalculated, but it's probably OK. // This might be overcalculated, but it's probably OK.
unsynced_bytes += data.size(); unsynced_bytes += data.size();
@ -209,7 +224,7 @@ IOStatus SimulatedWritableFile::PositionedAppend(
const Slice& data, uint64_t offset, const IOOptions& options, const Slice& data, uint64_t offset, const IOOptions& options,
const DataVerificationInfo& verification_info, IODebugContext* dbg) { const DataVerificationInfo& verification_info, IODebugContext* dbg) {
if (use_direct_io()) { if (use_direct_io()) {
RequestRateLimit(data.size()); SimulateIOWait(data.size());
} else { } else {
// This might be overcalculated, but it's probably OK. // This might be overcalculated, but it's probably OK.
unsynced_bytes += data.size(); unsynced_bytes += data.size();
@ -221,7 +236,7 @@ IOStatus SimulatedWritableFile::PositionedAppend(
IOStatus SimulatedWritableFile::Sync(const IOOptions& options, IOStatus SimulatedWritableFile::Sync(const IOOptions& options,
IODebugContext* dbg) { IODebugContext* dbg) {
if (unsynced_bytes > 0) { if (unsynced_bytes > 0) {
RequestRateLimit(unsynced_bytes); SimulateIOWait(unsynced_bytes);
unsynced_bytes = 0; unsynced_bytes = 0;
} }
return target()->Sync(options, dbg); return target()->Sync(options, dbg);

View File

@ -28,8 +28,13 @@ class SimulatedHybridFileSystem : public FileSystemWrapper {
// metadata_file_name stores metadata of the files, so that it can be // metadata_file_name stores metadata of the files, so that it can be
// loaded after process restarts. If the file doesn't exist, create // loaded after process restarts. If the file doesn't exist, create
// one. The file is written when the class is destroyed. // one. The file is written when the class is destroyed.
explicit SimulatedHybridFileSystem(const std::shared_ptr<FileSystem>& base, // throughput_multiplier: multiplier of throughput. For example, 1 is to
const std::string& metadata_file_name); // simulate single disk spindle. 4 is to simulate 4 disk spindles.
// is_full_fs_warm: if true, all files are included in slow I/O
// simulation.
SimulatedHybridFileSystem(const std::shared_ptr<FileSystem>& base,
const std::string& metadata_file_name,
int throughput_multiplier, bool is_full_fs_warm);
~SimulatedHybridFileSystem() override; ~SimulatedHybridFileSystem() override;
@ -55,6 +60,7 @@ class SimulatedHybridFileSystem : public FileSystemWrapper {
std::unordered_set<std::string> warm_file_set_; std::unordered_set<std::string> warm_file_set_;
std::string metadata_file_name_; std::string metadata_file_name_;
std::string name_; std::string name_;
bool is_full_fs_warm_;
}; };
// Simulated random access file that can control IOPs and latency to simulate // Simulated random access file that can control IOPs and latency to simulate
@ -84,7 +90,7 @@ class SimulatedHybridRaf : public FSRandomAccessFileOwnerWrapper {
std::shared_ptr<RateLimiter> rate_limiter_; std::shared_ptr<RateLimiter> rate_limiter_;
Temperature temperature_; Temperature temperature_;
void RequestRateLimit(int64_t num_requests) const; void SimulateIOWait(int64_t num_requests) const;
}; };
class SimulatedWritableFile : public FSWritableFileWrapper { class SimulatedWritableFile : public FSWritableFileWrapper {
@ -113,7 +119,7 @@ class SimulatedWritableFile : public FSWritableFileWrapper {
std::shared_ptr<RateLimiter> rate_limiter_; std::shared_ptr<RateLimiter> rate_limiter_;
size_t unsynced_bytes = 0; size_t unsynced_bytes = 0;
void RequestRateLimit(int64_t num_requests) const; void SimulateIOWait(int64_t num_requests) const;
}; };
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE