Blob DB: Add statistics

Summary:
Adding a list of blob db counters.

Also remove WaStats() which doesn't expose the stats and can be substitute by (BLOB_DB_BYTES_WRITTEN / BLOB_DB_BLOB_FILE_BYTES_WRITTEN).
Closes https://github.com/facebook/rocksdb/pull/3193

Differential Revision: D6394216

Pulled By: yiwu-arbug

fbshipit-source-id: 017508c8ff3fcd7ea7403c64d0f9834b24816803
This commit is contained in:
Yi Wu 2017-11-28 11:42:28 -08:00 committed by Facebook Github Bot
parent 3cf562be31
commit 78279350aa
13 changed files with 365 additions and 163 deletions

View File

@ -226,6 +226,73 @@ enum Tickers : uint32_t {
// Number of internal keys skipped by Iterator // Number of internal keys skipped by Iterator
NUMBER_ITER_SKIP, NUMBER_ITER_SKIP,
// BlobDB specific stats
// # of Put/PutTTL/PutUntil to BlobDB.
BLOB_DB_NUM_PUT,
// # of Write to BlobDB.
BLOB_DB_NUM_WRITE,
// # of Get to BlobDB.
BLOB_DB_NUM_GET,
// # of MultiGet to BlobDB.
BLOB_DB_NUM_MULTIGET,
// # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator.
BLOB_DB_NUM_SEEK,
// # of Next to BlobDB iterator.
BLOB_DB_NUM_NEXT,
// # of Prev to BlobDB iterator.
BLOB_DB_NUM_PREV,
// # of keys written to BlobDB.
BLOB_DB_NUM_KEYS_WRITTEN,
// # of keys read from BlobDB.
BLOB_DB_NUM_KEYS_READ,
// # of bytes (key + value) written to BlobDB.
BLOB_DB_BYTES_WRITTEN,
// # of bytes (keys + value) read from BlobDB.
BLOB_DB_BYTES_READ,
// # of keys written by BlobDB as non-TTL inlined value.
BLOB_DB_WRITE_INLINED,
// # of keys written by BlobDB as TTL inlined value.
BLOB_DB_WRITE_INLINED_TTL,
// # of keys written by BlobDB as non-TTL blob value.
BLOB_DB_WRITE_BLOB,
// # of keys written by BlobDB as TTL blob value.
BLOB_DB_WRITE_BLOB_TTL,
// # of bytes written to blob file.
BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
// # of bytes read from blob file.
BLOB_DB_BLOB_FILE_BYTES_READ,
// # of times a blob files being synced.
BLOB_DB_BLOB_FILE_SYNCED,
// # of blob index evicted from base DB by BlobDB compaction filter because
// of expiration.
BLOB_DB_BLOB_INDEX_EXPIRED,
// # of blob files being garbage collected.
BLOB_DB_GC_NUM_FILES,
// # of blob files generated by garbage collection.
BLOB_DB_GC_NUM_NEW_FILES,
// # of BlobDB garbage collection failures.
BLOB_DB_GC_FAILURES,
// # of keys drop by BlobDB garbage collection because they had been
// overwritten.
BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
// # of keys drop by BlobDB garbage collection because of expiration.
BLOB_DB_GC_NUM_KEYS_EXPIRED,
// # of keys relocated to new blob file by garbage collection.
BLOB_DB_GC_NUM_KEYS_RELOCATED,
// # of bytes drop by BlobDB garbage collection because they had been
// overwritten.
BLOB_DB_GC_BYTES_OVERWRITTEN,
// # of bytes drop by BlobDB garbage collection because of expiration.
BLOB_DB_GC_BYTES_EXPIRED,
// # of bytes relocated to new blob file by garbage collection.
BLOB_DB_GC_BYTES_RELOCATED,
// # of blob files evicted because of BlobDB is full.
BLOB_DB_FIFO_NUM_FILES_EVICTED,
// # of keys in the blob files evicted because of BlobDB is full.
BLOB_DB_FIFO_NUM_KEYS_EVICTED,
// # of bytes in the blob files evicted because of BlobDB is full.
BLOB_DB_FIFO_BYTES_EVICTED,
TICKER_ENUM_MAX TICKER_ENUM_MAX
}; };
@ -332,6 +399,37 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"}, {READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"},
{NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"}, {NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"},
{NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"}, {NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"},
{BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"},
{BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"},
{BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"},
{BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"},
{BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"},
{BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"},
{BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"},
{BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"},
{BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"},
{BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"},
{BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"},
{BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"},
{BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"},
{BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"},
{BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"},
{BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
{BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file,bytes.read"},
{BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"},
{BLOB_DB_BLOB_INDEX_EXPIRED, "rocksdb.blobdb.blob.index.expired"},
{BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"},
{BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"},
{BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"},
{BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"},
{BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"},
{BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"},
{BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"},
{BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"},
{BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"},
{BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
{BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
{BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
}; };
/** /**
@ -383,6 +481,36 @@ enum Histograms : uint32_t {
// requests. // requests.
READ_NUM_MERGE_OPERANDS, READ_NUM_MERGE_OPERANDS,
// BlobDB specific stats
// Size of keys written to BlobDB.
BLOB_DB_KEY_SIZE,
// Size of values written to BlobDB.
BLOB_DB_VALUE_SIZE,
// BlobDB Put/PutWithTTL/PutUntil/Write latency.
BLOB_DB_WRITE_MICROS,
// BlobDB Get lagency.
BLOB_DB_GET_MICROS,
// BlobDB MultiGet latency.
BLOB_DB_MULTIGET_MICROS,
// BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency.
BLOB_DB_SEEK_MICROS,
// BlobDB Next latency.
BLOB_DB_NEXT_MICROS,
// BlobDB Prev latency.
BLOB_DB_PREV_MICROS,
// Blob file write latency.
BLOB_DB_BLOB_FILE_WRITE_MICROS,
// Blob file read latency.
BLOB_DB_BLOB_FILE_READ_MICROS,
// Blob file sync latency.
BLOB_DB_BLOB_FILE_SYNC_MICROS,
// BlobDB garbage collection time.
BLOB_DB_GC_MICROS,
// BlobDB compression time.
BLOB_DB_COMPRESSION_MICROS,
// BlobDB decompression time.
BLOB_DB_DECOMPRESSION_MICROS,
HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match
}; };
@ -418,6 +546,20 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"}, {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"},
{DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"}, {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"},
{READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"}, {READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"},
{BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"},
{BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"},
{BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"},
{BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"},
{BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"},
{BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"},
{BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"},
{BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"},
{BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"},
{BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"},
{BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"},
{BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"},
{BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"},
{BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"},
}; };
struct HistogramData { struct HistogramData {

View File

@ -5,6 +5,7 @@
#pragma once #pragma once
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include "monitoring/statistics.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "utilities/blob_db/blob_index.h" #include "utilities/blob_db/blob_index.h"
@ -15,8 +16,12 @@ namespace blob_db {
// CompactionFilter to delete expired blob index from base DB. // CompactionFilter to delete expired blob index from base DB.
class BlobIndexCompactionFilter : public CompactionFilter { class BlobIndexCompactionFilter : public CompactionFilter {
public: public:
explicit BlobIndexCompactionFilter(uint64_t current_time) BlobIndexCompactionFilter(uint64_t current_time, Statistics* statistics)
: current_time_(current_time) {} : current_time_(current_time), statistics_(statistics) {}
virtual ~BlobIndexCompactionFilter() {
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED, expired_count_);
}
virtual const char* Name() const override { virtual const char* Name() const override {
return "BlobIndexCompactionFilter"; return "BlobIndexCompactionFilter";
@ -40,6 +45,7 @@ class BlobIndexCompactionFilter : public CompactionFilter {
} }
if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) { if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
// Expired // Expired
expired_count_++;
return Decision::kRemove; return Decision::kRemove;
} }
return Decision::kKeep; return Decision::kKeep;
@ -47,11 +53,16 @@ class BlobIndexCompactionFilter : public CompactionFilter {
private: private:
const uint64_t current_time_; const uint64_t current_time_;
Statistics* statistics_;
// It is safe to not using std::atomic since the compaction filter, created
// from a compaction filter factroy, will not be called from multiple threads.
mutable uint64_t expired_count_ = 0;
}; };
class BlobIndexCompactionFilterFactory : public CompactionFilterFactory { class BlobIndexCompactionFilterFactory : public CompactionFilterFactory {
public: public:
explicit BlobIndexCompactionFilterFactory(Env* env) : env_(env) {} BlobIndexCompactionFilterFactory(Env* env, Statistics* statistics)
: env_(env), statistics_(statistics) {}
virtual const char* Name() const override { virtual const char* Name() const override {
return "BlobIndexCompactionFilterFactory"; return "BlobIndexCompactionFilterFactory";
@ -65,12 +76,13 @@ class BlobIndexCompactionFilterFactory : public CompactionFilterFactory {
return nullptr; return nullptr;
} }
assert(current_time >= 0); assert(current_time >= 0);
return std::unique_ptr<CompactionFilter>( return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
new BlobIndexCompactionFilter(static_cast<uint64_t>(current_time))); static_cast<uint64_t>(current_time), statistics_));
} }
private: private:
Env* env_; Env* env_;
Statistics* statistics_;
}; };
} // namespace blob_db } // namespace blob_db

View File

@ -70,7 +70,8 @@ Status BlobDB::OpenAndLoad(const Options& options,
} }
changed_options->compaction_filter_factory.reset( changed_options->compaction_filter_factory.reset(
new BlobIndexCompactionFilterFactory(options.env)); new BlobIndexCompactionFilterFactory(options.env,
options.statistics.get()));
changed_options->listeners.emplace_back(fblistener); changed_options->listeners.emplace_back(fblistener);
if (bdb_options.enable_garbage_collection) { if (bdb_options.enable_garbage_collection) {
changed_options->listeners.emplace_back(ce_listener); changed_options->listeners.emplace_back(ce_listener);
@ -163,7 +164,8 @@ Status BlobDB::Open(const DBOptions& db_options_input,
return Status::NotSupported("Blob DB doesn't support compaction filter."); return Status::NotSupported("Blob DB doesn't support compaction filter.");
} }
cf_options.compaction_filter_factory.reset( cf_options.compaction_filter_factory.reset(
new BlobIndexCompactionFilterFactory(db_options.env)); new BlobIndexCompactionFilterFactory(db_options.env,
db_options.statistics.get()));
ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options); ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options);
// we need to open blob db first so that recovery can happen // we need to open blob db first so that recovery can happen

View File

@ -14,6 +14,7 @@
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "monitoring/instrumented_mutex.h" #include "monitoring/instrumented_mutex.h"
#include "monitoring/statistics.h"
#include "rocksdb/convenience.h" #include "rocksdb/convenience.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
@ -30,6 +31,7 @@
#include "util/logging.h" #include "util/logging.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/random.h" #include "util/random.h"
#include "util/stop_watch.h"
#include "util/sync_point.h" #include "util/sync_point.h"
#include "util/timer_queue.h" #include "util/timer_queue.h"
#include "utilities/blob_db/blob_db_iterator.h" #include "utilities/blob_db/blob_db_iterator.h"
@ -106,16 +108,13 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
bdb_options_(blob_db_options), bdb_options_(blob_db_options),
db_options_(db_options), db_options_(db_options),
env_options_(db_options), env_options_(db_options),
statistics_(db_options_.statistics.get()),
dir_change_(false), dir_change_(false),
next_file_number_(1), next_file_number_(1),
epoch_of_(0), epoch_of_(0),
shutdown_(false), shutdown_(false),
current_epoch_(0), current_epoch_(0),
open_file_count_(0), open_file_count_(0),
last_period_write_(0),
last_period_ampl_(0),
total_periods_write_(0),
total_periods_ampl_(0),
total_blob_space_(0), total_blob_space_(0),
open_p1_done_(false), open_p1_done_(false),
debug_level_(0), debug_level_(0),
@ -163,16 +162,13 @@ BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options)
bdb_options_(blob_db_options), bdb_options_(blob_db_options),
db_options_(db->GetOptions()), db_options_(db->GetOptions()),
env_options_(db_->GetOptions()), env_options_(db_->GetOptions()),
statistics_(db_options_.statistics.get()),
dir_change_(false), dir_change_(false),
next_file_number_(1), next_file_number_(1),
epoch_of_(0), epoch_of_(0),
shutdown_(false), shutdown_(false),
current_epoch_(0), current_epoch_(0),
open_file_count_(0), open_file_count_(0),
last_period_write_(0),
last_period_ampl_(0),
total_periods_write_(0),
total_periods_ampl_(0),
total_blob_space_(0), total_blob_space_(0),
oldest_file_evicted_(false) { oldest_file_evicted_(false) {
if (!bdb_options_.blob_dir.empty()) if (!bdb_options_.blob_dir.empty())
@ -227,8 +223,6 @@ void BlobDBImpl::StartBackgroundTasks() {
std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1)); std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1));
tqueue_.add(kSanityCheckPeriodMillisecs, tqueue_.add(kSanityCheckPeriodMillisecs,
std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1)); std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1));
tqueue_.add(kWriteAmplificationStatsPeriodMillisecs,
std::bind(&BlobDBImpl::WaStats, this, std::placeholders::_1));
tqueue_.add(kFSyncFilesPeriodMillisecs, tqueue_.add(kFSyncFilesPeriodMillisecs,
std::bind(&BlobDBImpl::FsyncFiles, this, std::placeholders::_1)); std::bind(&BlobDBImpl::FsyncFiles, this, std::placeholders::_1));
tqueue_.add( tqueue_.add(
@ -490,8 +484,8 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
} }
bfile->log_writer_ = std::make_shared<Writer>( bfile->log_writer_ = std::make_shared<Writer>(
std::move(fwriter), bfile->file_number_, bdb_options_.bytes_per_sync, std::move(fwriter), env_, statistics_, bfile->file_number_,
db_options_.use_fsync, boffset); bdb_options_.bytes_per_sync, db_options_.use_fsync, boffset);
bfile->log_writer_->last_elem_type_ = et; bfile->log_writer_->last_elem_type_ = et;
return s; return s;
@ -745,7 +739,8 @@ class BlobDBImpl::BlobInserter : public WriteBatch::Handler {
}; };
Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) { Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
StopWatch write_sw(env_, statistics_, BLOB_DB_WRITE_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_WRITE);
uint32_t default_cf_id = uint32_t default_cf_id =
reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID(); reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
// TODO(yiwu): In case there are multiple writers the latest sequence would // TODO(yiwu): In case there are multiple writers the latest sequence would
@ -856,6 +851,8 @@ Status BlobDBImpl::PutWithTTL(const WriteOptions& options,
Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key, Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
const Slice& value, uint64_t expiration) { const Slice& value, uint64_t expiration) {
StopWatch write_sw(env_, statistics_, BLOB_DB_WRITE_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_PUT);
TEST_SYNC_POINT("BlobDBImpl::PutUntil:Start"); TEST_SYNC_POINT("BlobDBImpl::PutUntil:Start");
Status s; Status s;
WriteBatch batch; WriteBatch batch;
@ -888,11 +885,13 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
if (expiration == kNoExpiration) { if (expiration == kNoExpiration) {
// Put as normal value // Put as normal value
s = batch->Put(key, value); s = batch->Put(key, value);
RecordTick(statistics_, BLOB_DB_WRITE_INLINED);
} else { } else {
// Inlined with TTL // Inlined with TTL
BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value); BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value);
s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
index_entry); index_entry);
RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
} }
} else { } else {
std::shared_ptr<BlobFile> bfile = (expiration != kNoExpiration) std::shared_ptr<BlobFile> bfile = (expiration != kNoExpiration)
@ -911,6 +910,11 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
s = AppendBlob(bfile, headerbuf, key, value_compressed, expiration, s = AppendBlob(bfile, headerbuf, key, value_compressed, expiration,
&index_entry); &index_entry);
if (expiration == kNoExpiration) {
RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
} else {
RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
}
if (s.ok()) { if (s.ok()) {
bfile->ExtendSequenceRange(sequence); bfile->ExtendSequenceRange(sequence);
@ -932,6 +936,11 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
} }
} }
RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN);
RecordTick(statistics_, BLOB_DB_BYTES_WRITTEN, key.size() + value.size());
MeasureTime(statistics_, BLOB_DB_KEY_SIZE, key.size());
MeasureTime(statistics_, BLOB_DB_VALUE_SIZE, value.size());
return s; return s;
} }
@ -940,6 +949,7 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
if (bdb_options_.compression == kNoCompression) { if (bdb_options_.compression == kNoCompression) {
return raw; return raw;
} }
StopWatch compression_sw(env_, statistics_, BLOB_DB_COMPRESSION_MICROS);
CompressionType ct = bdb_options_.compression; CompressionType ct = bdb_options_.compression;
CompressionOptions compression_opts; CompressionOptions compression_opts;
CompressBlock(raw, compression_opts, &ct, kBlockBasedTableVersionFormat, CompressBlock(raw, compression_opts, &ct, kBlockBasedTableVersionFormat,
@ -991,6 +1001,11 @@ bool BlobDBImpl::EvictOldestBlobFile() {
oldest_file->MarkObsolete(oldest_file->GetSequenceRange().second); oldest_file->MarkObsolete(oldest_file->GetSequenceRange().second);
obsolete_files_.push_back(oldest_file); obsolete_files_.push_back(oldest_file);
oldest_file_evicted_.store(true); oldest_file_evicted_.store(true);
RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED);
RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED,
oldest_file->BlobCount());
RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED,
oldest_file->GetFileSize());
return true; return true;
} }
@ -1048,7 +1063,6 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
bfile->blob_count_++; bfile->blob_count_++;
bfile->file_size_ += size_put; bfile->file_size_ += size_put;
last_period_write_ += size_put;
total_blob_space_ += size_put; total_blob_space_ += size_put;
if (expiration == kNoExpiration) { if (expiration == kNoExpiration) {
@ -1066,6 +1080,8 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
std::vector<Status> BlobDBImpl::MultiGet( std::vector<Status> BlobDBImpl::MultiGet(
const ReadOptions& read_options, const ReadOptions& read_options,
const std::vector<Slice>& keys, std::vector<std::string>* values) { const std::vector<Slice>& keys, std::vector<std::string>* values) {
StopWatch multiget_sw(env_, statistics_, BLOB_DB_MULTIGET_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_MULTIGET);
// Get a snapshot to avoid blob file get deleted between we // Get a snapshot to avoid blob file get deleted between we
// fetch and index entry and reading from the file. // fetch and index entry and reading from the file.
ReadOptions ro(read_options); ReadOptions ro(read_options);
@ -1169,7 +1185,12 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
char* buffer = &(*valueptr)[0]; char* buffer = &(*valueptr)[0];
Slice blob_value; Slice blob_value;
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value, buffer); {
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value,
buffer);
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_value.size());
}
if (!s.ok() || blob_value.size() != blob_index.size()) { if (!s.ok() || blob_value.size() != blob_index.size()) {
if (debug_level_ >= 2) { if (debug_level_ >= 2) {
ROCKS_LOG_ERROR(db_options_.info_log, ROCKS_LOG_ERROR(db_options_.info_log,
@ -1218,10 +1239,14 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
if (bfile->compression() != kNoCompression) { if (bfile->compression() != kNoCompression) {
BlockContents contents; BlockContents contents;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily()); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
{
StopWatch decompression_sw(env_, statistics_,
BLOB_DB_DECOMPRESSION_MICROS);
s = UncompressBlockContentsForCompressionType( s = UncompressBlockContentsForCompressionType(
blob_value.data(), blob_value.size(), &contents, blob_value.data(), blob_value.size(), &contents,
kBlockBasedTableVersionFormat, Slice(), bfile->compression(), kBlockBasedTableVersionFormat, Slice(), bfile->compression(),
*(cfh->cfd()->ioptions())); *(cfh->cfd()->ioptions()));
}
*(value->GetSelf()) = contents.data.ToString(); *(value->GetSelf()) = contents.data.ToString();
} }
@ -1233,6 +1258,14 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
Status BlobDBImpl::Get(const ReadOptions& read_options, Status BlobDBImpl::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) { PinnableSlice* value) {
StopWatch get_sw(env_, statistics_, BLOB_DB_GET_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_GET);
return GetImpl(read_options, column_family, key, value);
}
Status BlobDBImpl::GetImpl(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) {
if (column_family != DefaultColumnFamily()) { if (column_family != DefaultColumnFamily()) {
return Status::NotSupported( return Status::NotSupported(
"Blob DB doesn't support non-default column family."); "Blob DB doesn't support non-default column family.");
@ -1258,6 +1291,8 @@ Status BlobDBImpl::Get(const ReadOptions& read_options,
if (snapshot_created) { if (snapshot_created) {
db_->ReleaseSnapshot(ro.snapshot); db_->ReleaseSnapshot(ro.snapshot);
} }
RecordTick(statistics_, BLOB_DB_NUM_KEYS_READ);
RecordTick(statistics_, BLOB_DB_BYTES_READ, value->size());
return s; return s;
} }
@ -1546,35 +1581,6 @@ std::pair<bool, int64_t> BlobDBImpl::ReclaimOpenFiles(bool aborted) {
return std::make_pair(true, -1); return std::make_pair(true, -1);
} }
// TODO(yiwu): correct the stats and expose it.
std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
if (aborted) return std::make_pair(false, -1);
WriteLock wl(&mutex_);
if (all_periods_write_.size() >= kWriteAmplificationStatsPeriods) {
total_periods_write_ -= (*all_periods_write_.begin());
total_periods_ampl_ = (*all_periods_ampl_.begin());
all_periods_write_.pop_front();
all_periods_ampl_.pop_front();
}
uint64_t val1 = last_period_write_.load();
uint64_t val2 = last_period_ampl_.load();
all_periods_write_.push_back(val1);
all_periods_ampl_.push_back(val2);
last_period_write_ = 0;
last_period_ampl_ = 0;
total_periods_write_ += val1;
total_periods_ampl_ += val2;
return std::make_pair(true, -1);
}
// Write callback for garbage collection to check if key has been updated // Write callback for garbage collection to check if key has been updated
// since last read. Similar to how OptimisticTransaction works. See inline // since last read. Similar to how OptimisticTransaction works. See inline
// comment in GCFileAndUpdateLSM(). // comment in GCFileAndUpdateLSM().
@ -1635,6 +1641,7 @@ class BlobDBImpl::GarbageCollectionWriteCallback : public WriteCallback {
// DELETED in the LSM // DELETED in the LSM
Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr, Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
GCStats* gc_stats) { GCStats* gc_stats) {
StopWatch gc_sw(env_, statistics_, BLOB_DB_GC_MICROS);
uint64_t now = EpochNow(); uint64_t now = EpochNow();
std::shared_ptr<Reader> reader = std::shared_ptr<Reader> reader =
@ -1727,6 +1734,8 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
if (get_status.IsNotFound() || !is_blob_index) { if (get_status.IsNotFound() || !is_blob_index) {
// Either the key is deleted or updated with a newer version whish is // Either the key is deleted or updated with a newer version whish is
// inlined in LSM. // inlined in LSM.
gc_stats->num_keys_overwritten++;
gc_stats->bytes_overwritten += record.record_size();
continue; continue;
} }
@ -1742,6 +1751,8 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
blob_index.file_number() != bfptr->BlobFileNumber() || blob_index.file_number() != bfptr->BlobFileNumber() ||
blob_index.offset() != blob_offset) { blob_index.offset() != blob_offset) {
// Key has been overwritten. Drop the blob record. // Key has been overwritten. Drop the blob record.
gc_stats->num_keys_overwritten++;
gc_stats->bytes_overwritten += record.record_size();
continue; continue;
} }
@ -1751,8 +1762,8 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
// TODO(yiwu): Blob indexes will be remove by BlobIndexCompactionFilter. // TODO(yiwu): Blob indexes will be remove by BlobIndexCompactionFilter.
// We can just drop the blob record. // We can just drop the blob record.
if (no_relocation_ttl || (has_ttl && now >= record.expiration)) { if (no_relocation_ttl || (has_ttl && now >= record.expiration)) {
gc_stats->num_deletes++; gc_stats->num_keys_expired++;
gc_stats->deleted_size += record.value_size; gc_stats->bytes_expired += record.record_size();
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete"); TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete");
WriteBatch delete_batch; WriteBatch delete_batch;
Status delete_status = delete_batch.Delete(record.key); Status delete_status = delete_batch.Delete(record.key);
@ -1760,12 +1771,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
delete_status = db_impl_->WriteWithCallback(WriteOptions(), delete_status = db_impl_->WriteWithCallback(WriteOptions(),
&delete_batch, &callback); &delete_batch, &callback);
} }
if (delete_status.ok()) { if (!delete_status.ok() && !delete_status.IsBusy()) {
gc_stats->delete_succeeded++;
} else if (delete_status.IsBusy()) {
// The key is overwritten in the meanwhile. Drop the blob record.
gc_stats->overwritten_while_delete++;
} else {
// We hit an error. // We hit an error.
s = delete_status; s = delete_status;
ROCKS_LOG_ERROR(db_options_.info_log, ROCKS_LOG_ERROR(db_options_.info_log,
@ -1788,7 +1794,6 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
std::string reason("GC of "); std::string reason("GC of ");
reason += bfptr->PathName(); reason += bfptr->PathName();
newfile = NewBlobFile(reason); newfile = NewBlobFile(reason);
gc_stats->newfile = newfile;
new_writer = CheckOrCreateWriterLocked(newfile); new_writer = CheckOrCreateWriterLocked(newfile);
newfile->header_ = std::move(header); newfile->header_ = std::move(header);
@ -1810,9 +1815,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile)); blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile));
} }
gc_stats->num_relocate++;
std::string new_index_entry; std::string new_index_entry;
uint64_t new_blob_offset = 0; uint64_t new_blob_offset = 0;
uint64_t new_key_offset = 0; uint64_t new_key_offset = 0;
// write the blob to the blob log. // write the blob to the blob log.
@ -1838,10 +1841,12 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
if (rewrite_status.ok()) { if (rewrite_status.ok()) {
newfile->ExtendSequenceRange( newfile->ExtendSequenceRange(
WriteBatchInternal::Sequence(&rewrite_batch)); WriteBatchInternal::Sequence(&rewrite_batch));
gc_stats->relocate_succeeded++; gc_stats->num_keys_relocated++;
gc_stats->bytes_relocated += record.record_size();
} else if (rewrite_status.IsBusy()) { } else if (rewrite_status.IsBusy()) {
// The key is overwritten in the meanwhile. Drop the blob record. // The key is overwritten in the meanwhile. Drop the blob record.
gc_stats->overwritten_while_relocate++; gc_stats->num_keys_overwritten++;
gc_stats->bytes_overwritten += record.record_size();
} else { } else {
// We hit an error. // We hit an error.
s = rewrite_status; s = rewrite_status;
@ -1864,17 +1869,34 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
ROCKS_LOG_INFO( ROCKS_LOG_INFO(
db_options_.info_log, db_options_.info_log,
"%s blob file %" PRIu64 "%s blob file %" PRIu64 ". Total blob records: %" PRIu64
". Total blob records: %" PRIu64 ", Deletes: %" PRIu64 "/%" PRIu64 ", Expired: %" PRIu64 " keys/%" PRIu64 " bytes, Overwritten: %" PRIu64
" succeeded, Relocates: %" PRIu64 "/%" PRIu64 " succeeded.", " keys/%" PRIu64 " bytes.",
s.ok() ? "Successfully garbage collected" : "Failed to garbage collect", s.ok() ? "Successfully garbage collected" : "Failed to garbage collect",
bfptr->BlobFileNumber(), gc_stats->blob_count, gc_stats->delete_succeeded, bfptr->BlobFileNumber(), gc_stats->blob_count, gc_stats->num_keys_expired,
gc_stats->num_deletes, gc_stats->relocate_succeeded, gc_stats->bytes_expired, gc_stats->num_keys_overwritten,
gc_stats->num_relocate); gc_stats->bytes_overwritten, gc_stats->num_keys_relocated,
gc_stats->bytes_relocated);
RecordTick(statistics_, BLOB_DB_GC_NUM_FILES);
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
gc_stats->num_keys_overwritten);
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_EXPIRED,
gc_stats->num_keys_expired);
RecordTick(statistics_, BLOB_DB_GC_BYTES_OVERWRITTEN,
gc_stats->bytes_overwritten);
RecordTick(statistics_, BLOB_DB_GC_BYTES_EXPIRED, gc_stats->bytes_expired);
if (newfile != nullptr) { if (newfile != nullptr) {
total_blob_space_ += newfile->file_size_; total_blob_space_ += newfile->file_size_;
ROCKS_LOG_INFO(db_options_.info_log, "New blob file %" PRIu64 ".", ROCKS_LOG_INFO(db_options_.info_log, "New blob file %" PRIu64 ".",
newfile->BlobFileNumber()); newfile->BlobFileNumber());
RecordTick(statistics_, BLOB_DB_GC_NUM_NEW_FILES);
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_RELOCATED,
gc_stats->num_keys_relocated);
RecordTick(statistics_, BLOB_DB_GC_BYTES_RELOCATED,
gc_stats->bytes_relocated);
}
if (!s.ok()) {
RecordTick(statistics_, BLOB_DB_GC_FAILURES);
} }
return s; return s;
} }
@ -2120,8 +2142,10 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
if (bfile->gc_once_after_open_.load()) { if (bfile->gc_once_after_open_.load()) {
WriteLock lockbfile_w(&bfile->mutex_); WriteLock lockbfile_w(&bfile->mutex_);
bfile->deleted_size_ = gc_stats.deleted_size; bfile->deleted_size_ =
bfile->deleted_count_ = gc_stats.num_deletes; gc_stats.bytes_overwritten + gc_stats.bytes_expired;
bfile->deleted_count_ =
gc_stats.num_keys_overwritten + gc_stats.num_keys_expired;
bfile->gc_once_after_open_ = false; bfile->gc_once_after_open_ = false;
} }
} }
@ -2144,7 +2168,7 @@ Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
auto* iter = db_impl_->NewIteratorImpl( auto* iter = db_impl_->NewIteratorImpl(
read_options, cfd, snapshot->GetSequenceNumber(), read_options, cfd, snapshot->GetSequenceNumber(),
nullptr /*read_callback*/, true /*allow_blob*/); nullptr /*read_callback*/, true /*allow_blob*/);
return new BlobDBIterator(own_snapshot, iter, this); return new BlobDBIterator(own_snapshot, iter, this, env_, statistics_);
} }
Status DestroyBlobDB(const std::string& dbname, const Options& options, Status DestroyBlobDB(const std::string& dbname, const Options& options,

View File

@ -24,6 +24,7 @@
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/listener.h" #include "rocksdb/listener.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/wal_filter.h" #include "rocksdb/wal_filter.h"
#include "util/mpsc.h" #include "util/mpsc.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
@ -135,16 +136,12 @@ struct blobf_compare_ttl {
struct GCStats { struct GCStats {
uint64_t blob_count = 0; uint64_t blob_count = 0;
uint64_t num_deletes = 0; uint64_t num_keys_overwritten = 0;
uint64_t deleted_size = 0; uint64_t num_keys_expired = 0;
uint64_t retry_delete = 0; uint64_t num_keys_relocated = 0;
uint64_t delete_succeeded = 0; uint64_t bytes_overwritten = 0;
uint64_t overwritten_while_delete = 0; uint64_t bytes_expired = 0;
uint64_t num_relocate = 0; uint64_t bytes_relocated = 0;
uint64_t retry_relocate = 0;
uint64_t relocate_succeeded = 0;
uint64_t overwritten_while_relocate = 0;
std::shared_ptr<BlobFile> newfile = nullptr;
}; };
/** /**
@ -178,10 +175,6 @@ class BlobDBImpl : public BlobDB {
// how many periods of stats do we keep. // how many periods of stats do we keep.
static constexpr uint32_t kWriteAmplificationStatsPeriods = 24; static constexpr uint32_t kWriteAmplificationStatsPeriods = 24;
// what is the length of any period
static constexpr uint32_t kWriteAmplificationStatsPeriodMillisecs =
3600 * 1000;
// we will garbage collect blob files in // we will garbage collect blob files in
// which entire files have expired. However if the // which entire files have expired. However if the
// ttl_range of files is very large say a day, we // ttl_range of files is very large say a day, we
@ -292,6 +285,10 @@ class BlobDBImpl : public BlobDB {
// Return true if a snapshot is created. // Return true if a snapshot is created.
bool SetSnapshotIfNeeded(ReadOptions* read_options); bool SetSnapshotIfNeeded(ReadOptions* read_options);
Status GetImpl(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value);
Status GetBlobValue(const Slice& key, const Slice& index_entry, Status GetBlobValue(const Slice& key, const Slice& index_entry,
PinnableSlice* value); PinnableSlice* value);
@ -364,9 +361,6 @@ class BlobDBImpl : public BlobDB {
// efficiency // efficiency
std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted); std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
// periodically print write amplification statistics
std::pair<bool, int64_t> WaStats(bool aborted);
// background task to do book-keeping of deleted keys // background task to do book-keeping of deleted keys
std::pair<bool, int64_t> EvictDeletions(bool aborted); std::pair<bool, int64_t> EvictDeletions(bool aborted);
@ -444,6 +438,9 @@ class BlobDBImpl : public BlobDB {
DBOptions db_options_; DBOptions db_options_;
EnvOptions env_options_; EnvOptions env_options_;
// Raw pointer of statistic. db_options_ has a shared_ptr to hold ownership.
Statistics* statistics_;
// name of the database directory // name of the database directory
std::string dbname_; std::string dbname_;
@ -519,18 +516,6 @@ class BlobDBImpl : public BlobDB {
// counter is used to monitor and close excess RA files. // counter is used to monitor and close excess RA files.
std::atomic<uint32_t> open_file_count_; std::atomic<uint32_t> open_file_count_;
// should hold mutex to modify
// STATISTICS for WA of Blob Files due to GC
// collect by default 24 hourly periods
std::list<uint64_t> all_periods_write_;
std::list<uint64_t> all_periods_ampl_;
std::atomic<uint64_t> last_period_write_;
std::atomic<uint64_t> last_period_ampl_;
uint64_t total_periods_write_;
uint64_t total_periods_ampl_;
// total size of all blob files at a given time // total size of all blob files at a given time
std::atomic<uint64_t> total_blob_space_; std::atomic<uint64_t> total_blob_space_;
std::list<std::shared_ptr<BlobFile>> obsolete_files_; std::list<std::shared_ptr<BlobFile>> obsolete_files_;

View File

@ -6,7 +6,9 @@
#pragma once #pragma once
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include "monitoring/statistics.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "util/stop_watch.h"
#include "utilities/blob_db/blob_db_impl.h" #include "utilities/blob_db/blob_db_impl.h"
namespace rocksdb { namespace rocksdb {
@ -17,8 +19,12 @@ using rocksdb::ManagedSnapshot;
class BlobDBIterator : public Iterator { class BlobDBIterator : public Iterator {
public: public:
BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter, BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter,
BlobDBImpl* blob_db) BlobDBImpl* blob_db, Env* env, Statistics* statistics)
: snapshot_(snapshot), iter_(iter), blob_db_(blob_db) {} : snapshot_(snapshot),
iter_(iter),
blob_db_(blob_db),
env_(env),
statistics_(statistics) {}
virtual ~BlobDBIterator() = default; virtual ~BlobDBIterator() = default;
@ -37,33 +43,45 @@ class BlobDBIterator : public Iterator {
} }
void SeekToFirst() override { void SeekToFirst() override {
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
iter_->SeekToFirst(); iter_->SeekToFirst();
UpdateBlobValue(); UpdateBlobValue();
} }
void SeekToLast() override { void SeekToLast() override {
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
iter_->SeekToLast(); iter_->SeekToLast();
UpdateBlobValue(); UpdateBlobValue();
} }
void Seek(const Slice& target) override { void Seek(const Slice& target) override {
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
iter_->Seek(target); iter_->Seek(target);
UpdateBlobValue(); UpdateBlobValue();
} }
void SeekForPrev(const Slice& target) override { void SeekForPrev(const Slice& target) override {
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
iter_->SeekForPrev(target); iter_->SeekForPrev(target);
UpdateBlobValue(); UpdateBlobValue();
} }
void Next() override { void Next() override {
assert(Valid()); assert(Valid());
StopWatch next_sw(env_, statistics_, BLOB_DB_NEXT_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_NEXT);
iter_->Next(); iter_->Next();
UpdateBlobValue(); UpdateBlobValue();
} }
void Prev() override { void Prev() override {
assert(Valid()); assert(Valid());
StopWatch prev_sw(env_, statistics_, BLOB_DB_PREV_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_PREV);
iter_->Prev(); iter_->Prev();
UpdateBlobValue(); UpdateBlobValue();
} }
@ -96,6 +114,8 @@ class BlobDBIterator : public Iterator {
std::unique_ptr<ManagedSnapshot> snapshot_; std::unique_ptr<ManagedSnapshot> snapshot_;
std::unique_ptr<ArenaWrappedDBIter> iter_; std::unique_ptr<ArenaWrappedDBIter> iter_;
BlobDBImpl* blob_db_; BlobDBImpl* blob_db_;
Env* env_;
Statistics* statistics_;
Status status_; Status status_;
PinnableSlice value_; PinnableSlice value_;
}; };

View File

@ -260,8 +260,8 @@ TEST_F(BlobDBTest, PutWithTTL) {
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_relocate); ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data); VerifyDB(data);
} }
@ -290,8 +290,8 @@ TEST_F(BlobDBTest, PutUntil) {
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_relocate); ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data); VerifyDB(data);
} }
@ -323,8 +323,8 @@ TEST_F(BlobDBTest, TTLExtrator_NoTTL) {
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(0, gc_stats.num_deletes); ASSERT_EQ(0, gc_stats.num_keys_expired);
ASSERT_EQ(100, gc_stats.num_relocate); ASSERT_EQ(100, gc_stats.num_keys_relocated);
VerifyDB(data); VerifyDB(data);
} }
@ -370,8 +370,8 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractTTL) {
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
auto &data = static_cast<TestTTLExtractor *>(ttl_extractor_.get())->data; auto &data = static_cast<TestTTLExtractor *>(ttl_extractor_.get())->data;
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_relocate); ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data); VerifyDB(data);
} }
@ -418,8 +418,8 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractExpiration) {
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
auto &data = static_cast<TestTTLExtractor *>(ttl_extractor_.get())->data; auto &data = static_cast<TestTTLExtractor *>(ttl_extractor_.get())->data;
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_relocate); ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data); VerifyDB(data);
} }
@ -475,8 +475,8 @@ TEST_F(BlobDBTest, TTLExtractor_ChangeValue) {
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_relocate); ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data); VerifyDB(data);
} }
@ -675,8 +675,8 @@ TEST_F(BlobDBTest, GCAfterOverwriteKeys) {
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(200, gc_stats.blob_count); ASSERT_EQ(200, gc_stats.blob_count);
ASSERT_EQ(0, gc_stats.num_deletes); ASSERT_EQ(0, gc_stats.num_keys_expired);
ASSERT_EQ(200 - new_keys, gc_stats.num_relocate); ASSERT_EQ(200 - new_keys, gc_stats.num_keys_relocated);
VerifyDB(data); VerifyDB(data);
} }
@ -704,10 +704,9 @@ TEST_F(BlobDBTest, GCRelocateKeyWhileOverwriting) {
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(1, gc_stats.blob_count); ASSERT_EQ(1, gc_stats.blob_count);
ASSERT_EQ(0, gc_stats.num_deletes); ASSERT_EQ(0, gc_stats.num_keys_expired);
ASSERT_EQ(1, gc_stats.num_relocate); ASSERT_EQ(1, gc_stats.num_keys_overwritten);
ASSERT_EQ(0, gc_stats.relocate_succeeded); ASSERT_EQ(0, gc_stats.num_keys_relocated);
ASSERT_EQ(1, gc_stats.overwritten_while_relocate);
writer.join(); writer.join();
VerifyDB({{"foo", "v2"}}); VerifyDB({{"foo", "v2"}});
} }
@ -741,10 +740,8 @@ TEST_F(BlobDBTest, GCExpiredKeyWhileOverwriting) {
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(1, gc_stats.blob_count); ASSERT_EQ(1, gc_stats.blob_count);
ASSERT_EQ(1, gc_stats.num_deletes); ASSERT_EQ(1, gc_stats.num_keys_expired);
ASSERT_EQ(0, gc_stats.delete_succeeded); ASSERT_EQ(0, gc_stats.num_keys_relocated);
ASSERT_EQ(1, gc_stats.overwritten_while_delete);
ASSERT_EQ(0, gc_stats.num_relocate);
writer.join(); writer.join();
VerifyDB({{"foo", "v2"}}); VerifyDB({{"foo", "v2"}});
} }
@ -838,8 +835,7 @@ TEST_F(BlobDBTest, ReadWhileGC) {
GCStats gc_stats; GCStats gc_stats;
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats)); ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
ASSERT_EQ(1, gc_stats.blob_count); ASSERT_EQ(1, gc_stats.blob_count);
ASSERT_EQ(1, gc_stats.num_relocate); ASSERT_EQ(1, gc_stats.num_keys_relocated);
ASSERT_EQ(1, gc_stats.relocate_succeeded);
blob_db_impl()->TEST_DeleteObsoleteFiles(); blob_db_impl()->TEST_DeleteObsoleteFiles();
// The file shouln't be deleted // The file shouln't be deleted
blob_files = blob_db_impl()->TEST_GetBlobFiles(); blob_files = blob_db_impl()->TEST_GetBlobFiles();
@ -904,11 +900,11 @@ TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
ASSERT_TRUE(bfile->Obsolete()); ASSERT_TRUE(bfile->Obsolete());
ASSERT_EQ(1, gc_stats.blob_count); ASSERT_EQ(1, gc_stats.blob_count);
if (delete_key) { if (delete_key) {
ASSERT_EQ(0, gc_stats.num_relocate); ASSERT_EQ(0, gc_stats.num_keys_relocated);
ASSERT_EQ(bfile->GetSequenceRange().second + 1, ASSERT_EQ(bfile->GetSequenceRange().second + 1,
bfile->GetObsoleteSequence()); bfile->GetObsoleteSequence());
} else { } else {
ASSERT_EQ(1, gc_stats.num_relocate); ASSERT_EQ(1, gc_stats.num_keys_relocated);
ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), ASSERT_EQ(blob_db_->GetLatestSequenceNumber(),
bfile->GetObsoleteSequence()); bfile->GetObsoleteSequence());
} }

View File

@ -100,8 +100,8 @@ std::shared_ptr<Reader> BlobFile::OpenSequentialReader(
std::unique_ptr<SequentialFileReader> sfile_reader; std::unique_ptr<SequentialFileReader> sfile_reader;
sfile_reader.reset(new SequentialFileReader(std::move(sfile))); sfile_reader.reset(new SequentialFileReader(std::move(sfile)));
std::shared_ptr<Reader> log_reader = std::shared_ptr<Reader> log_reader = std::make_shared<Reader>(
std::make_shared<Reader>(db_options.info_log, std::move(sfile_reader)); std::move(sfile_reader), db_options.env, db_options.statistics.get());
return log_reader; return log_reader;
} }

View File

@ -111,6 +111,8 @@ struct BlobLogRecord {
std::string key_buf; std::string key_buf;
std::string value_buf; std::string value_buf;
uint64_t record_size() const { return kHeaderSize + key_size + value_size; }
void EncodeHeaderTo(std::string* dst); void EncodeHeaderTo(std::string* dst);
Status DecodeHeaderFrom(Slice src); Status DecodeHeaderFrom(Slice src);

View File

@ -9,22 +9,30 @@
#include <algorithm> #include <algorithm>
#include "monitoring/statistics.h"
#include "util/file_reader_writer.h" #include "util/file_reader_writer.h"
#include "util/stop_watch.h"
namespace rocksdb { namespace rocksdb {
namespace blob_db { namespace blob_db {
Reader::Reader(std::shared_ptr<Logger> info_log, Reader::Reader(unique_ptr<SequentialFileReader>&& file_reader, Env* env,
unique_ptr<SequentialFileReader>&& _file) Statistics* statistics)
: info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) {} : file_(std::move(file_reader)),
env_(env),
statistics_(statistics),
buffer_(),
next_byte_(0) {}
Status Reader::ReadSlice(uint64_t size, Slice* slice, std::string* buf) { Status Reader::ReadSlice(uint64_t size, Slice* slice, std::string* buf) {
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
buf->reserve(size); buf->reserve(size);
Status s = file_->Read(size, slice, &(*buf)[0]); Status s = file_->Read(size, slice, &(*buf)[0]);
next_byte_ += size; next_byte_ += size;
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size());
if (slice->size() != size) { if (slice->size() != size) {
return Status::Corruption("EOF reached while reading record"); return Status::Corruption("EOF reached while reading record");
} }

View File

@ -10,7 +10,9 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "rocksdb/env.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "utilities/blob_db/blob_log_format.h" #include "utilities/blob_db/blob_log_format.h"
@ -37,17 +39,8 @@ class Reader {
// Create a reader that will return log records from "*file". // Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use. // "*file" must remain live while this Reader is in use.
// Reader(std::unique_ptr<SequentialFileReader>&& file_reader, Env* env,
// If "reporter" is non-nullptr, it is notified whenever some data is Statistics* statistics);
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
//
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
Reader(std::shared_ptr<Logger> info_log,
std::unique_ptr<SequentialFileReader>&& file);
~Reader() = default; ~Reader() = default;
@ -68,17 +61,14 @@ class Reader {
Status ReadSlice(uint64_t size, Slice* slice, std::string* buf); Status ReadSlice(uint64_t size, Slice* slice, std::string* buf);
SequentialFileReader* file() { return file_.get(); }
void ResetNextByte() { next_byte_ = 0; } void ResetNextByte() { next_byte_ = 0; }
uint64_t GetNextByte() const { return next_byte_; } uint64_t GetNextByte() const { return next_byte_; }
const SequentialFileReader* file_reader() const { return file_.get(); }
private: private:
std::shared_ptr<Logger> info_log_;
const std::unique_ptr<SequentialFileReader> file_; const std::unique_ptr<SequentialFileReader> file_;
Env* env_;
Statistics* statistics_;
std::string backing_store_; std::string backing_store_;
Slice buffer_; Slice buffer_;

View File

@ -8,17 +8,23 @@
#include <cstdint> #include <cstdint>
#include <string> #include <string>
#include "monitoring/statistics.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/file_reader_writer.h" #include "util/file_reader_writer.h"
#include "util/stop_watch.h"
#include "utilities/blob_db/blob_log_format.h" #include "utilities/blob_db/blob_log_format.h"
namespace rocksdb { namespace rocksdb {
namespace blob_db { namespace blob_db {
Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number, Writer::Writer(unique_ptr<WritableFileWriter>&& dest, Env* env,
uint64_t bpsync, bool use_fs, uint64_t boffset) Statistics* statistics, uint64_t log_number, uint64_t bpsync,
bool use_fs, uint64_t boffset)
: dest_(std::move(dest)), : dest_(std::move(dest)),
env_(env),
statistics_(statistics),
log_number_(log_number), log_number_(log_number),
block_offset_(boffset), block_offset_(boffset),
bytes_per_sync_(bpsync), bytes_per_sync_(bpsync),
@ -26,7 +32,11 @@ Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
use_fsync_(use_fs), use_fsync_(use_fs),
last_elem_type_(kEtNone) {} last_elem_type_(kEtNone) {}
void Writer::Sync() { dest_->Sync(use_fsync_); } void Writer::Sync() {
StopWatch sync_sw(env_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS);
dest_->Sync(use_fsync_);
RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED);
}
Status Writer::WriteHeader(BlobLogHeader& header) { Status Writer::WriteHeader(BlobLogHeader& header) {
assert(block_offset_ == 0); assert(block_offset_ == 0);
@ -40,6 +50,8 @@ Status Writer::WriteHeader(BlobLogHeader& header) {
s = dest_->Flush(); s = dest_->Flush();
} }
last_elem_type_ = kEtFileHdr; last_elem_type_ = kEtFileHdr;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogHeader::kSize);
return s; return s;
} }
@ -58,6 +70,8 @@ Status Writer::AppendFooter(BlobLogFooter& footer) {
} }
last_elem_type_ = kEtFileFooter; last_elem_type_ = kEtFileFooter;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogFooter::kSize);
return s; return s;
} }
@ -98,6 +112,7 @@ void Writer::ConstructBlobHeader(std::string* buf, const Slice& key,
Status Writer::EmitPhysicalRecord(const std::string& headerbuf, Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
const Slice& key, const Slice& val, const Slice& key, const Slice& val,
uint64_t* key_offset, uint64_t* blob_offset) { uint64_t* key_offset, uint64_t* blob_offset) {
StopWatch write_sw(env_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS);
Status s = dest_->Append(Slice(headerbuf)); Status s = dest_->Append(Slice(headerbuf));
if (s.ok()) { if (s.ok()) {
s = dest_->Append(key); s = dest_->Append(key);
@ -113,6 +128,8 @@ Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
*blob_offset = *key_offset + key.size(); *blob_offset = *key_offset + key.size();
block_offset_ = *blob_offset + val.size(); block_offset_ = *blob_offset + val.size();
last_elem_type_ = kEtRecord; last_elem_type_ = kEtRecord;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogRecord::kHeaderSize + key.size() + val.size());
return s; return s;
} }

View File

@ -10,7 +10,9 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "rocksdb/env.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/types.h" #include "rocksdb/types.h"
#include "utilities/blob_db/blob_log_format.h" #include "utilities/blob_db/blob_log_format.h"
@ -34,9 +36,9 @@ class Writer {
// Create a writer that will append data to "*dest". // Create a writer that will append data to "*dest".
// "*dest" must be initially empty. // "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use. // "*dest" must remain live while this Writer is in use.
explicit Writer(std::unique_ptr<WritableFileWriter>&& dest, Writer(std::unique_ptr<WritableFileWriter>&& dest, Env* env,
uint64_t log_number, uint64_t bpsync, bool use_fsync, Statistics* statistics, uint64_t log_number, uint64_t bpsync,
uint64_t boffset = 0); bool use_fsync, uint64_t boffset = 0);
~Writer() = default; ~Writer() = default;
@ -75,6 +77,8 @@ class Writer {
private: private:
std::unique_ptr<WritableFileWriter> dest_; std::unique_ptr<WritableFileWriter> dest_;
Env* env_;
Statistics* statistics_;
uint64_t log_number_; uint64_t log_number_;
uint64_t block_offset_; // Current offset in block uint64_t block_offset_; // Current offset in block
uint64_t bytes_per_sync_; uint64_t bytes_per_sync_;