3dff28cf9b
Summary: For performance purposes, the lower level routines were changed to use a SystemClock* instead of a std::shared_ptr<SystemClock>. The shared ptr has some performance degradation on certain hardware classes. For most of the system, there is no risk of the pointer being deleted/invalid because the shared_ptr will be stored elsewhere. For example, the ImmutableDBOptions stores the Env which has a std::shared_ptr<SystemClock> in it. The SystemClock* within the ImmutableDBOptions is essentially a "short cut" to gain access to this constant resource. There were a few classes (PeriodicWorkScheduler?) where the "short cut" property did not hold. In those cases, the shared pointer was preserved. Using db_bench readrandom perf_level=3 on my EC2 box, this change performed as well or better than 6.17: 6.17: readrandom : 28.046 micros/op 854902 ops/sec; 61.3 MB/s (355999 of 355999 found) 6.18: readrandom : 32.615 micros/op 735306 ops/sec; 52.7 MB/s (290999 of 290999 found) PR: readrandom : 27.500 micros/op 871909 ops/sec; 62.5 MB/s (367999 of 367999 found) (Note that the times for 6.18 are prior to revert of the SystemClock). Pull Request resolved: https://github.com/facebook/rocksdb/pull/8033 Reviewed By: pdillinger Differential Revision: D27014563 Pulled By: mrambacher fbshipit-source-id: ad0459eba03182e454391b5926bf5cdd45657b67
296 lines
11 KiB
C++
296 lines
11 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#include <atomic>
#include <fstream>
#include <utility>

#include "monitoring/instrumented_mutex.h"
#include "rocksdb/options.h"
#include "rocksdb/trace_reader_writer.h"
#include "table/table_reader_caller.h"
#include "trace_replay/trace_replay.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
class Env;
|
|
class SystemClock;
|
|
|
|
extern const uint64_t kMicrosInSecond;
|
|
extern const uint64_t kSecondInMinute;
|
|
extern const uint64_t kSecondInHour;
|
|
|
|
struct BlockCacheTraceRecord;
|
|
|
|
class BlockCacheTraceHelper {
|
|
public:
|
|
static bool IsGetOrMultiGetOnDataBlock(TraceType block_type,
|
|
TableReaderCaller caller);
|
|
static bool IsGetOrMultiGet(TableReaderCaller caller);
|
|
static bool IsUserAccess(TableReaderCaller caller);
|
|
// Row key is a concatenation of the access's fd_number and the referenced
|
|
// user key.
|
|
static std::string ComputeRowKey(const BlockCacheTraceRecord& access);
|
|
// The first four bytes of the referenced key in a Get request is the table
|
|
// id.
|
|
static uint64_t GetTableId(const BlockCacheTraceRecord& access);
|
|
// The sequence number of a get request is the last part of the referenced
|
|
// key.
|
|
static uint64_t GetSequenceNumber(const BlockCacheTraceRecord& access);
|
|
// Block offset in a file is the last varint64 in the block key.
|
|
static uint64_t GetBlockOffsetInFile(const BlockCacheTraceRecord& access);
|
|
|
|
static const std::string kUnknownColumnFamilyName;
|
|
static const uint64_t kReservedGetId;
|
|
};
|
|
|
|
// Lookup context for tracing block cache accesses.
|
|
// We trace block accesses at five places:
|
|
// 1. BlockBasedTable::GetFilter
|
|
// 2. BlockBasedTable::GetUncompressedDict.
|
|
// 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index,
|
|
// and range deletion block.)
|
|
// 4. BlockBasedTable::Get. (To trace the referenced key and whether the
|
|
// referenced key exists in a fetched data block.)
|
|
// 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the
|
|
// referenced key exists in a fetched data block.)
|
|
// The context is created at:
|
|
// 1. BlockBasedTable::Get. (kUserGet)
|
|
// 2. BlockBasedTable::MultiGet. (kUserMGet)
|
|
// 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or
|
|
// external SST ingestion calls this function.)
|
|
// 4. BlockBasedTable::Open. (kPrefetch)
|
|
// 5. Index/Filter::CacheDependencies. (kPrefetch)
|
|
// 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or
|
|
// kUserApproximateSize).
|
|
struct BlockCacheLookupContext {
|
|
BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {}
|
|
BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id,
|
|
bool _get_from_user_specified_snapshot)
|
|
: caller(_caller),
|
|
get_id(_get_id),
|
|
get_from_user_specified_snapshot(_get_from_user_specified_snapshot) {}
|
|
const TableReaderCaller caller;
|
|
// These are populated when we perform lookup/insert on block cache. The block
|
|
// cache tracer uses these inforation when logging the block access at
|
|
// BlockBasedTable::GET and BlockBasedTable::MultiGet.
|
|
bool is_cache_hit = false;
|
|
bool no_insert = false;
|
|
TraceType block_type = TraceType::kTraceMax;
|
|
uint64_t block_size = 0;
|
|
std::string block_key;
|
|
uint64_t num_keys_in_block = 0;
|
|
// The unique id associated with Get and MultiGet. This enables us to track
|
|
// how many blocks a Get/MultiGet request accesses. We can also measure the
|
|
// impact of row cache vs block cache.
|
|
uint64_t get_id = 0;
|
|
std::string referenced_key;
|
|
bool get_from_user_specified_snapshot = false;
|
|
|
|
void FillLookupContext(bool _is_cache_hit, bool _no_insert,
|
|
TraceType _block_type, uint64_t _block_size,
|
|
const std::string& _block_key,
|
|
uint64_t _num_keys_in_block) {
|
|
is_cache_hit = _is_cache_hit;
|
|
no_insert = _no_insert;
|
|
block_type = _block_type;
|
|
block_size = _block_size;
|
|
block_key = _block_key;
|
|
num_keys_in_block = _num_keys_in_block;
|
|
}
|
|
};
|
|
|
|
// Two-valued flag with a fixed one-byte (char) underlying type. Used for the
// yes/no fields of BlockCacheTraceRecord.
enum Boolean : char {
  kFalse = 0,
  kTrue = 1,
};
|
|
|
|
struct BlockCacheTraceRecord {
|
|
// Required fields for all accesses.
|
|
uint64_t access_timestamp = 0;
|
|
std::string block_key;
|
|
TraceType block_type = TraceType::kTraceMax;
|
|
uint64_t block_size = 0;
|
|
uint64_t cf_id = 0;
|
|
std::string cf_name;
|
|
uint32_t level = 0;
|
|
uint64_t sst_fd_number = 0;
|
|
TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller;
|
|
Boolean is_cache_hit = Boolean::kFalse;
|
|
Boolean no_insert = Boolean::kFalse;
|
|
// Required field for Get and MultiGet
|
|
uint64_t get_id = BlockCacheTraceHelper::kReservedGetId;
|
|
Boolean get_from_user_specified_snapshot = Boolean::kFalse;
|
|
std::string referenced_key;
|
|
// Required fields for data block and user Get/Multi-Get only.
|
|
uint64_t referenced_data_size = 0;
|
|
uint64_t num_keys_in_block = 0;
|
|
Boolean referenced_key_exist_in_block = Boolean::kFalse;
|
|
|
|
BlockCacheTraceRecord() {}
|
|
|
|
BlockCacheTraceRecord(
|
|
uint64_t _access_timestamp, std::string _block_key, TraceType _block_type,
|
|
uint64_t _block_size, uint64_t _cf_id, std::string _cf_name,
|
|
uint32_t _level, uint64_t _sst_fd_number, TableReaderCaller _caller,
|
|
bool _is_cache_hit, bool _no_insert,
|
|
uint64_t _get_id = BlockCacheTraceHelper::kReservedGetId,
|
|
bool _get_from_user_specified_snapshot = false,
|
|
std::string _referenced_key = "", uint64_t _referenced_data_size = 0,
|
|
uint64_t _num_keys_in_block = 0,
|
|
bool _referenced_key_exist_in_block = false)
|
|
: access_timestamp(_access_timestamp),
|
|
block_key(_block_key),
|
|
block_type(_block_type),
|
|
block_size(_block_size),
|
|
cf_id(_cf_id),
|
|
cf_name(_cf_name),
|
|
level(_level),
|
|
sst_fd_number(_sst_fd_number),
|
|
caller(_caller),
|
|
is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse),
|
|
no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse),
|
|
get_id(_get_id),
|
|
get_from_user_specified_snapshot(_get_from_user_specified_snapshot
|
|
? Boolean::kTrue
|
|
: Boolean::kFalse),
|
|
referenced_key(_referenced_key),
|
|
referenced_data_size(_referenced_data_size),
|
|
num_keys_in_block(_num_keys_in_block),
|
|
referenced_key_exist_in_block(
|
|
_referenced_key_exist_in_block ? Boolean::kTrue : Boolean::kFalse) {
|
|
}
|
|
};
|
|
|
|
// Metadata stored at the start of a block cache trace.
// BlockCacheTraceReader::ReadHeader takes a pointer to this struct.
//
// Every member is zero-initialized so that a default-constructed header
// never holds indeterminate values, e.g. when it is inspected after a failed
// read. This matches the other structs in this file, which all use default
// member initializers.
struct BlockCacheTraceHeader {
  uint64_t start_time = 0;
  uint32_t rocksdb_major_version = 0;
  uint32_t rocksdb_minor_version = 0;
};
|
|
|
|
// BlockCacheTraceWriter captures all RocksDB block cache accesses using a
|
|
// user-provided TraceWriter. Every RocksDB operation is written as a single
|
|
// trace. Each trace will have a timestamp and type, followed by the trace
|
|
// payload.
|
|
class BlockCacheTraceWriter {
|
|
public:
|
|
BlockCacheTraceWriter(SystemClock* clock, const TraceOptions& trace_options,
|
|
std::unique_ptr<TraceWriter>&& trace_writer);
|
|
~BlockCacheTraceWriter() = default;
|
|
// No copy and move.
|
|
BlockCacheTraceWriter(const BlockCacheTraceWriter&) = delete;
|
|
BlockCacheTraceWriter& operator=(const BlockCacheTraceWriter&) = delete;
|
|
BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete;
|
|
BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete;
|
|
|
|
// Pass Slice references to avoid copy.
|
|
Status WriteBlockAccess(const BlockCacheTraceRecord& record,
|
|
const Slice& block_key, const Slice& cf_name,
|
|
const Slice& referenced_key);
|
|
|
|
// Write a trace header at the beginning, typically on initiating a trace,
|
|
// with some metadata like a magic number and RocksDB version.
|
|
Status WriteHeader();
|
|
|
|
private:
|
|
SystemClock* clock_;
|
|
TraceOptions trace_options_;
|
|
std::unique_ptr<TraceWriter> trace_writer_;
|
|
};
|
|
|
|
// Write a trace record in human readable format, see
|
|
// https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
|
|
// for details.
|
|
class BlockCacheHumanReadableTraceWriter {
|
|
public:
|
|
~BlockCacheHumanReadableTraceWriter();
|
|
|
|
Status NewWritableFile(const std::string& human_readable_trace_file_path,
|
|
ROCKSDB_NAMESPACE::Env* env);
|
|
|
|
Status WriteHumanReadableTraceRecord(const BlockCacheTraceRecord& access,
|
|
uint64_t block_id, uint64_t get_key_id);
|
|
|
|
private:
|
|
char trace_record_buffer_[1024 * 1024];
|
|
std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>
|
|
human_readable_trace_file_writer_;
|
|
};
|
|
|
|
// BlockCacheTraceReader helps read the trace file generated by
|
|
// BlockCacheTraceWriter using a user provided TraceReader.
|
|
class BlockCacheTraceReader {
|
|
public:
|
|
BlockCacheTraceReader(std::unique_ptr<TraceReader>&& reader);
|
|
virtual ~BlockCacheTraceReader() = default;
|
|
// No copy and move.
|
|
BlockCacheTraceReader(const BlockCacheTraceReader&) = delete;
|
|
BlockCacheTraceReader& operator=(const BlockCacheTraceReader&) = delete;
|
|
BlockCacheTraceReader(BlockCacheTraceReader&&) = delete;
|
|
BlockCacheTraceReader& operator=(BlockCacheTraceReader&&) = delete;
|
|
|
|
Status ReadHeader(BlockCacheTraceHeader* header);
|
|
|
|
Status ReadAccess(BlockCacheTraceRecord* record);
|
|
|
|
private:
|
|
std::unique_ptr<TraceReader> trace_reader_;
|
|
};
|
|
|
|
// Read a trace record in human readable format, see
|
|
// https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
|
|
// for detailed.
|
|
class BlockCacheHumanReadableTraceReader : public BlockCacheTraceReader {
|
|
public:
|
|
BlockCacheHumanReadableTraceReader(const std::string& trace_file_path);
|
|
|
|
~BlockCacheHumanReadableTraceReader();
|
|
|
|
Status ReadHeader(BlockCacheTraceHeader* header);
|
|
|
|
Status ReadAccess(BlockCacheTraceRecord* record);
|
|
|
|
private:
|
|
std::ifstream human_readable_trace_reader_;
|
|
};
|
|
|
|
// A block cache tracer. It downsamples the accesses according to
|
|
// trace_options and uses BlockCacheTraceWriter to write the access record to
|
|
// the trace file.
|
|
class BlockCacheTracer {
|
|
public:
|
|
BlockCacheTracer();
|
|
~BlockCacheTracer();
|
|
// No copy and move.
|
|
BlockCacheTracer(const BlockCacheTracer&) = delete;
|
|
BlockCacheTracer& operator=(const BlockCacheTracer&) = delete;
|
|
BlockCacheTracer(BlockCacheTracer&&) = delete;
|
|
BlockCacheTracer& operator=(BlockCacheTracer&&) = delete;
|
|
|
|
// Start writing block cache accesses to the trace_writer.
|
|
Status StartTrace(SystemClock* clock, const TraceOptions& trace_options,
|
|
std::unique_ptr<TraceWriter>&& trace_writer);
|
|
|
|
// Stop writing block cache accesses to the trace_writer.
|
|
void EndTrace();
|
|
|
|
bool is_tracing_enabled() const {
|
|
return writer_.load(std::memory_order_relaxed);
|
|
}
|
|
|
|
Status WriteBlockAccess(const BlockCacheTraceRecord& record,
|
|
const Slice& block_key, const Slice& cf_name,
|
|
const Slice& referenced_key);
|
|
|
|
// GetId cycles from 1 to port::kMaxUint64.
|
|
uint64_t NextGetId();
|
|
|
|
private:
|
|
TraceOptions trace_options_;
|
|
// A mutex protects the writer_.
|
|
InstrumentedMutex trace_writer_mutex_;
|
|
std::atomic<BlockCacheTraceWriter*> writer_;
|
|
std::atomic<uint64_t> get_id_counter_;
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|