rocksdb/trace_replay/io_tracer.h
mrambacher 3dff28cf9b Use SystemClock* instead of std::shared_ptr<SystemClock> in lower level routines (#8033)
Summary:
For performance reasons, the lower-level routines were changed to use a SystemClock* instead of a std::shared_ptr<SystemClock>.  Using the shared_ptr causes some performance degradation on certain hardware classes.

For most of the system, there is no risk of the pointer being deleted/invalid because the shared_ptr will be stored elsewhere.  For example, the ImmutableDBOptions stores the Env which has a std::shared_ptr<SystemClock> in it.  The SystemClock* within the ImmutableDBOptions is essentially a "short cut" to gain access to this constant resource.

There were a few classes (PeriodicWorkScheduler?) where the "short cut" property did not hold.  In those cases, the shared pointer was preserved.

Using db_bench readrandom perf_level=3 on my EC2 box, this change performed as well or better than 6.17:

6.17: readrandom   :      28.046 micros/op 854902 ops/sec;   61.3 MB/s (355999 of 355999 found)
6.18: readrandom   :      32.615 micros/op 735306 ops/sec;   52.7 MB/s (290999 of 290999 found)
PR: readrandom   :      27.500 micros/op 871909 ops/sec;   62.5 MB/s (367999 of 367999 found)

(Note that the 6.18 timings were measured before the SystemClock change was reverted.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8033

Reviewed By: pdillinger

Differential Revision: D27014563

Pulled By: mrambacher

fbshipit-source-id: ad0459eba03182e454391b5926bf5cdd45657b67
2021-03-15 04:34:11 -07:00

192 lines
6.4 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <atomic>
#include <fstream>
#include "monitoring/instrumented_mutex.h"
#include "rocksdb/options.h"
#include "trace_replay/trace_replay.h"
namespace ROCKSDB_NAMESPACE {
// Forward declarations of types that the declarations below refer to through
// raw pointers (SystemClock) or std::unique_ptr (TraceReader, TraceWriter).
class SystemClock;
class TraceReader;
class TraceWriter;
/* To log a new piece of data in the trace record for specific operations, do
the following:
1. Add the new data to IOTraceOp (say kIONewData = 3).
2. Log it in IOTraceWriter::WriteIOOp, and read it in
IOTraceReader::ReadIOOp and
IOTraceRecordParser::PrintHumanReadableIOTraceRecord in the switch case.
3. In the FileSystemTracer APIs that should log this data, update
io_op_data |= (1 << IOTraceOp::kIONewData).
*/
enum IOTraceOp : char {
  // Each enumerator's value is a bit position inside
  // IOTraceRecord.io_op_data, so the values are spelled out explicitly.
  kIOFileSize = 0,  // bit 0
  kIOLen = 1,       // bit 1
  kIOOffset = 2,    // bit 2
};
// One traced IO operation: the fields required for every access plus the
// optional fields that are added depending on the IO operation.
struct IOTraceRecord {
  // ---- Required fields for all accesses ----
  uint64_t access_timestamp = 0;
  TraceType trace_type = TraceType::kTraceMax;
  // Each bit in io_op_data tells which corresponding piece of info from
  // IOTraceOp is added to the trace. For example, if the bit at position 1 is
  // set, then IOTraceOp::kIOLen (length) is logged in the record.
  uint64_t io_op_data = 0;
  std::string file_operation;
  uint64_t latency = 0;
  std::string io_status;
  // Holds the file name rather than the full path.
  std::string file_name;

  // ---- Fields added to the record based on the IO operation ----
  uint64_t len = 0;
  uint64_t offset = 0;
  uint64_t file_size = 0;

  // Leaves every member at its in-class default shown above.
  IOTraceRecord() = default;

  // Sets the required fields and, optionally, file_size. len and offset keep
  // their default of 0.
  IOTraceRecord(const uint64_t& _access_timestamp, const TraceType& _trace_type,
                const uint64_t& _io_op_data, const std::string& _file_operation,
                const uint64_t& _latency, const std::string& _io_status,
                const std::string& _file_name, const uint64_t& _file_size = 0)
      : access_timestamp(_access_timestamp),
        trace_type(_trace_type),
        io_op_data(_io_op_data),
        file_operation(_file_operation),
        latency(_latency),
        io_status(_io_status),
        file_name(_file_name),
        file_size(_file_size) {}

  // Sets the required fields together with len and offset. file_size keeps
  // its default of 0. The required fields are filled in by delegating to the
  // constructor above, relying on its defaulted _file_size argument.
  IOTraceRecord(const uint64_t& _access_timestamp, const TraceType& _trace_type,
                const uint64_t& _io_op_data, const std::string& _file_operation,
                const uint64_t& _latency, const std::string& _io_status,
                const std::string& _file_name, const uint64_t& _len,
                const uint64_t& _offset)
      : IOTraceRecord(_access_timestamp, _trace_type, _io_op_data,
                      _file_operation, _latency, _io_status, _file_name) {
    len = _len;
    offset = _offset;
  }
};
// Header information of an IO trace. A pointer to this struct is what
// IOTraceReader::ReadHeader takes as its argument.
//
// Every member has a zero default so that a default-constructed header never
// holds indeterminate values (for instance when it is inspected after a read
// that did not fill in every field). This matches IOTraceRecord, whose scalar
// members are zero-initialized as well. Aggregate initialization such as
// IOTraceHeader{t, major, minor} keeps working from C++14 on.
struct IOTraceHeader {
  uint64_t start_time = 0;
  uint32_t rocksdb_major_version = 0;
  uint32_t rocksdb_minor_version = 0;
};
// IOTraceWriter writes IO operation as a single trace. Each trace will have a
// timestamp and type, followed by the trace payload.
class IOTraceWriter {
 public:
  // Takes the clock as a raw pointer, the trace options by const reference,
  // and the underlying trace writer as an rvalue std::unique_ptr.
  IOTraceWriter(SystemClock* clock, const TraceOptions& trace_options,
                std::unique_ptr<TraceWriter>&& trace_writer);
  ~IOTraceWriter() = default;

  // Neither copyable nor movable.
  IOTraceWriter(const IOTraceWriter&) = delete;
  IOTraceWriter(IOTraceWriter&&) = delete;
  IOTraceWriter& operator=(const IOTraceWriter&) = delete;
  IOTraceWriter& operator=(IOTraceWriter&&) = delete;

  // Writes one IO operation record to the trace.
  Status WriteIOOp(const IOTraceRecord& record);

  // Writes the trace header. Meant to be called at the beginning, typically
  // when a trace is initiated; the header carries metadata such as a magic
  // number and the RocksDB version.
  Status WriteHeader();

 private:
  // Raw pointer; the defaulted destructor does not delete it.
  SystemClock* clock_;
  TraceOptions trace_options_;
  // Underlying writer, held through std::unique_ptr.
  std::unique_ptr<TraceWriter> trace_writer_;
};
// IOTraceReader helps read the trace file generated by IOTraceWriter.
class IOTraceReader {
 public:
  // Takes the underlying trace reader as an rvalue std::unique_ptr.
  explicit IOTraceReader(std::unique_ptr<TraceReader>&& reader);
  ~IOTraceReader() = default;

  // Neither copyable nor movable.
  IOTraceReader(const IOTraceReader&) = delete;
  IOTraceReader(IOTraceReader&&) = delete;
  IOTraceReader& operator=(const IOTraceReader&) = delete;
  IOTraceReader& operator=(IOTraceReader&&) = delete;

  // Reads the trace header; `header` is a pointer to the IOTraceHeader to
  // work with (presumably the output -- confirm in the implementation).
  Status ReadHeader(IOTraceHeader* header);

  // Reads one IO operation; `record` is a pointer to the IOTraceRecord to
  // work with (presumably the output -- confirm in the implementation).
  Status ReadIOOp(IOTraceRecord* record);

 private:
  // Underlying reader, held through std::unique_ptr.
  std::unique_ptr<TraceReader> trace_reader_;
};
// An IO tracer. It uses IOTraceWriter to write the access record to the
// trace file.
class IOTracer {
 public:
  IOTracer();
  ~IOTracer();

  // Neither copyable nor movable.
  IOTracer(const IOTracer&) = delete;
  IOTracer(IOTracer&&) = delete;
  IOTracer& operator=(const IOTracer&) = delete;
  IOTracer& operator=(IOTracer&&) = delete;

  // TSAN_SUPPRESSION (no_sanitize("thread")) exists because of
  // tracing_enabled. writer_ is protected by the mutex, so even if the user
  // calls Start/EndIOTrace and tracing_enabled has not been updated in the
  // meantime, WriteIOOp still checks writer_ under the mutex and ignores the
  // operation when writer_ is null. It is therefore acceptable for
  // tracing_enabled to show a stale value.
#if defined(__clang__)
#if defined(__has_feature) && __has_feature(thread_sanitizer)
#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread")))
#endif  // __has_feature(thread_sanitizer)
#else   // __clang__
#if defined(__SANITIZE_THREAD__)
#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread")))
#endif  // __SANITIZE_THREAD__
#endif  // __clang__
  // Without thread sanitizer the macro expands to nothing.
#ifndef TSAN_SUPPRESSION
#define TSAN_SUPPRESSION
#endif  // TSAN_SUPPRESSION

  // Starts writing IO operations to the trace_writer.
  TSAN_SUPPRESSION Status
  StartIOTrace(SystemClock* clock, const TraceOptions& trace_options,
               std::unique_ptr<TraceWriter>&& trace_writer);

  // Stops writing IO operations to the trace_writer.
  TSAN_SUPPRESSION void EndIOTrace();

  // Returns the tracing_enabled flag as-is, with no synchronization (see the
  // note on TSAN_SUPPRESSION above).
  TSAN_SUPPRESSION bool is_tracing_enabled() const { return tracing_enabled; }

  // Hands one IO record to the tracer.
  void WriteIOOp(const IOTraceRecord& record);

 private:
  TraceOptions trace_options_;
  // Mutex protecting writer_.
  InstrumentedMutex trace_writer_mutex_;
  std::atomic<IOTraceWriter*> writer_;
  // tracing_enabled spares is_tracing_enabled() the costly check of whether
  // the atomic variable 'writer_' is nullptr. is_tracing_enabled() is invoked
  // many times by the FileSystem classes.
  bool tracing_enabled;
};
} // namespace ROCKSDB_NAMESPACE