3dff28cf9b
Summary: For performance purposes, the lower level routines were changed to use a SystemClock* instead of a std::shared_ptr<SystemClock>. The shared ptr has some performance degradation on certain hardware classes. For most of the system, there is no risk of the pointer being deleted/invalid because the shared_ptr will be stored elsewhere. For example, the ImmutableDBOptions stores the Env which has a std::shared_ptr<SystemClock> in it. The SystemClock* within the ImmutableDBOptions is essentially a "short cut" to gain access to this constant resource. There were a few classes (PeriodicWorkScheduler?) where the "short cut" property did not hold. In those cases, the shared pointer was preserved. Using db_bench readrandom perf_level=3 on my EC2 box, this change performed as well or better than 6.17: 6.17: readrandom : 28.046 micros/op 854902 ops/sec; 61.3 MB/s (355999 of 355999 found) 6.18: readrandom : 32.615 micros/op 735306 ops/sec; 52.7 MB/s (290999 of 290999 found) PR: readrandom : 27.500 micros/op 871909 ops/sec; 62.5 MB/s (367999 of 367999 found) (Note that the times for 6.18 are prior to revert of the SystemClock). Pull Request resolved: https://github.com/facebook/rocksdb/pull/8033 Reviewed By: pdillinger Differential Revision: D27014563 Pulled By: mrambacher fbshipit-source-id: ad0459eba03182e454391b5926bf5cdd45657b67
192 lines
6.4 KiB
C++
192 lines
6.4 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#include <atomic>
|
|
#include <fstream>
|
|
|
|
#include "monitoring/instrumented_mutex.h"
|
|
#include "rocksdb/options.h"
|
|
#include "trace_replay/trace_replay.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
class SystemClock;
|
|
class TraceReader;
|
|
class TraceWriter;
|
|
|
|
/* To log a new piece of data in the trace record for a given operation:
   1. Add a new enumerator to IOTraceOp (for example, kIONewData = 3).
   2. Write it in IOTraceWriter::WriteIOOp, and read it back in
      IOTraceReader::ReadIOOp and in the switch statement of
      IOTraceRecordParser::PrintHumanReadableIOTraceRecord.
   3. In the FileSystemTracer APIs that should log this data, set the
      corresponding bit: io_op_data |= (1 << IOTraceOp::kIONewData).
*/
|
|
// Optional pieces of information that a trace record may carry.
// An enumerator's numeric value is the index of the bit in
// IOTraceRecord.io_op_data that flags the presence of that piece of data,
// i.e. the flag for X is (1 << X).
enum IOTraceOp : char {
  kIOFileSize = 0,  // bit 0
  kIOLen = 1,       // bit 1
  kIOOffset = 2,    // bit 2
};
|
|
|
|
struct IOTraceRecord {
|
|
// Required fields for all accesses.
|
|
uint64_t access_timestamp = 0;
|
|
TraceType trace_type = TraceType::kTraceMax;
|
|
// Each bit in io_op_data stores which corresponding info from IOTraceOp will
|
|
// be added in the trace. Foreg, if bit at position 1 is set then
|
|
// IOTraceOp::kIOLen (length) will be logged in the record.
|
|
uint64_t io_op_data = 0;
|
|
std::string file_operation;
|
|
uint64_t latency = 0;
|
|
std::string io_status;
|
|
// Stores file name instead of full path.
|
|
std::string file_name;
|
|
// Fields added to record based on IO operation.
|
|
uint64_t len = 0;
|
|
uint64_t offset = 0;
|
|
uint64_t file_size = 0;
|
|
|
|
IOTraceRecord() {}
|
|
|
|
IOTraceRecord(const uint64_t& _access_timestamp, const TraceType& _trace_type,
|
|
const uint64_t& _io_op_data, const std::string& _file_operation,
|
|
const uint64_t& _latency, const std::string& _io_status,
|
|
const std::string& _file_name, const uint64_t& _file_size = 0)
|
|
: access_timestamp(_access_timestamp),
|
|
trace_type(_trace_type),
|
|
io_op_data(_io_op_data),
|
|
file_operation(_file_operation),
|
|
latency(_latency),
|
|
io_status(_io_status),
|
|
file_name(_file_name),
|
|
file_size(_file_size) {}
|
|
|
|
IOTraceRecord(const uint64_t& _access_timestamp, const TraceType& _trace_type,
|
|
const uint64_t& _io_op_data, const std::string& _file_operation,
|
|
const uint64_t& _latency, const std::string& _io_status,
|
|
const std::string& _file_name, const uint64_t& _len,
|
|
const uint64_t& _offset)
|
|
: access_timestamp(_access_timestamp),
|
|
trace_type(_trace_type),
|
|
io_op_data(_io_op_data),
|
|
file_operation(_file_operation),
|
|
latency(_latency),
|
|
io_status(_io_status),
|
|
file_name(_file_name),
|
|
len(_len),
|
|
offset(_offset) {}
|
|
};
|
|
|
|
// Metadata stored in the header of an IO trace file.
//
// Every member has a default initializer so that a default-constructed
// header (for example one declared and then handed to
// IOTraceReader::ReadHeader as an out-parameter) never exposes indeterminate
// values, matching how IOTraceRecord initializes its fields.
struct IOTraceHeader {
  // Start time recorded in the header.
  // NOTE(review): unit/epoch are not visible in this file - confirm with the
  // writer implementation.
  uint64_t start_time = 0;
  // RocksDB major version recorded in the header.
  uint32_t rocksdb_major_version = 0;
  // RocksDB minor version recorded in the header.
  uint32_t rocksdb_minor_version = 0;
};
|
|
|
|
// IOTraceWriter writes IO operation as a single trace. Each trace will have a
|
|
// timestamp and type, followed by the trace payload.
|
|
class IOTraceWriter {
|
|
public:
|
|
IOTraceWriter(SystemClock* clock, const TraceOptions& trace_options,
|
|
std::unique_ptr<TraceWriter>&& trace_writer);
|
|
~IOTraceWriter() = default;
|
|
// No copy and move.
|
|
IOTraceWriter(const IOTraceWriter&) = delete;
|
|
IOTraceWriter& operator=(const IOTraceWriter&) = delete;
|
|
IOTraceWriter(IOTraceWriter&&) = delete;
|
|
IOTraceWriter& operator=(IOTraceWriter&&) = delete;
|
|
|
|
Status WriteIOOp(const IOTraceRecord& record);
|
|
|
|
// Write a trace header at the beginning, typically on initiating a trace,
|
|
// with some metadata like a magic number and RocksDB version.
|
|
Status WriteHeader();
|
|
|
|
private:
|
|
SystemClock* clock_;
|
|
TraceOptions trace_options_;
|
|
std::unique_ptr<TraceWriter> trace_writer_;
|
|
};
|
|
|
|
// IOTraceReader helps read the trace file generated by IOTraceWriter.
|
|
class IOTraceReader {
|
|
public:
|
|
explicit IOTraceReader(std::unique_ptr<TraceReader>&& reader);
|
|
~IOTraceReader() = default;
|
|
// No copy and move.
|
|
IOTraceReader(const IOTraceReader&) = delete;
|
|
IOTraceReader& operator=(const IOTraceReader&) = delete;
|
|
IOTraceReader(IOTraceReader&&) = delete;
|
|
IOTraceReader& operator=(IOTraceReader&&) = delete;
|
|
|
|
Status ReadHeader(IOTraceHeader* header);
|
|
|
|
Status ReadIOOp(IOTraceRecord* record);
|
|
|
|
private:
|
|
std::unique_ptr<TraceReader> trace_reader_;
|
|
};
|
|
|
|
// An IO tracer. It uses IOTraceWriter to write the access record to the
|
|
// trace file.
|
|
class IOTracer {
|
|
public:
|
|
IOTracer();
|
|
~IOTracer();
|
|
// No copy and move.
|
|
IOTracer(const IOTracer&) = delete;
|
|
IOTracer& operator=(const IOTracer&) = delete;
|
|
IOTracer(IOTracer&&) = delete;
|
|
IOTracer& operator=(IOTracer&&) = delete;
|
|
|
|
// no_sanitize is added for tracing_enabled. writer_ is protected under mutex
|
|
// so even if user call Start/EndIOTrace and tracing_enabled is not updated in
|
|
// the meanwhile, WriteIOOp will anyways check the writer_ protected under
|
|
// mutex and ignore the operation if writer_is null. So its ok if
|
|
// tracing_enabled shows non updated value.
|
|
|
|
#if defined(__clang__)
|
|
#if defined(__has_feature) && __has_feature(thread_sanitizer)
|
|
#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread")))
|
|
#endif // __has_feature(thread_sanitizer)
|
|
#else // __clang__
|
|
#ifdef __SANITIZE_THREAD__
|
|
#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread")))
|
|
#endif // __SANITIZE_THREAD__
|
|
#endif // __clang__
|
|
|
|
#ifndef TSAN_SUPPRESSION
|
|
#define TSAN_SUPPRESSION
|
|
#endif // TSAN_SUPPRESSION
|
|
|
|
// Start writing IO operations to the trace_writer.
|
|
TSAN_SUPPRESSION Status
|
|
StartIOTrace(SystemClock* clock, const TraceOptions& trace_options,
|
|
std::unique_ptr<TraceWriter>&& trace_writer);
|
|
|
|
// Stop writing IO operations to the trace_writer.
|
|
TSAN_SUPPRESSION void EndIOTrace();
|
|
|
|
TSAN_SUPPRESSION bool is_tracing_enabled() const { return tracing_enabled; }
|
|
|
|
void WriteIOOp(const IOTraceRecord& record);
|
|
|
|
private:
|
|
TraceOptions trace_options_;
|
|
// A mutex protects the writer_.
|
|
InstrumentedMutex trace_writer_mutex_;
|
|
std::atomic<IOTraceWriter*> writer_;
|
|
// bool tracing_enabled is added to avoid costly operation of checking atomic
|
|
// variable 'writer_' is nullptr or not in is_tracing_enabled().
|
|
// is_tracing_enabled() is invoked multiple times by FileSystem classes.
|
|
bool tracing_enabled;
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|