rocksdb/utilities/blob_db/blob_db_impl.cc
Yi Wu 2c45ada4c4 Blob DB garbage collection should keep keys with newer version
Summary:
Fix the bug where blob db garbage collection removes keys with a newer version. It shouldn't delete the key from the base db when the sequence number in the base db is not equal to the one in the blob log.
Closes https://github.com/facebook/rocksdb/pull/2678

Differential Revision: D5549752

Pulled By: yiwu-arbug

fbshipit-source-id: abb8649260963b5c389748023970fd746279d227
2017-08-03 13:12:12 -07:00

2236 lines
72 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE
#include "utilities/blob_db/blob_db_impl.h"
#include <algorithm>
#include <cinttypes>
#include <iomanip>
#include <limits>
#include <memory>
#include "db/db_impl.h"
#include "db/write_batch_internal.h"
#include "monitoring/instrumented_mutex.h"
#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/utilities/transaction.h"
#include "table/block.h"
#include "table/block_based_table_builder.h"
#include "table/block_builder.h"
#include "table/meta_blocks.h"
#include "util/cast_util.h"
#include "util/crc32c.h"
#include "util/file_reader_writer.h"
#include "util/filename.h"
#include "util/logging.h"
#include "util/random.h"
#include "util/timer_queue.h"
#include "utilities/transactions/optimistic_transaction_db_impl.h"
#include "utilities/transactions/optimistic_transaction_impl.h"
namespace {
// Format version passed to CompressBlock() when blob values are compressed
// (see BlobDBImpl::GetCompressedSlice).
int kBlockBasedTableVersionFormat = 2;
// Widens *ttl_range, if needed, so that it also covers ttl.
void extendTTL(rocksdb::blob_db::ttlrange_t* ttl_range, uint32_t ttl) {
  if (ttl < ttl_range->first) {
    ttl_range->first = ttl;
  }
  if (ttl_range->second < ttl) {
    ttl_range->second = ttl;
  }
}
// Widens *ts_range, if needed, so that it also covers the timestamp ts.
void extendTimestamps(rocksdb::blob_db::tsrange_t* ts_range, uint64_t ts) {
  if (ts < ts_range->first) {
    ts_range->first = ts;
  }
  if (ts_range->second < ts) {
    ts_range->second = ts;
  }
}
// Widens *sn_range, if needed, so that it also covers the sequence number sn.
void extendSN(rocksdb::blob_db::snrange_t* sn_range,
              rocksdb::SequenceNumber sn) {
  if (sn < sn_range->first) {
    sn_range->first = sn;
  }
  if (sn_range->second < sn) {
    sn_range->second = sn;
  }
}
} // end namespace
namespace rocksdb {
namespace blob_db {
// A BlobHandle is what gets stored in the LSM in place of the blob itself:
// it records the blob file number, the offset and size of the blob inside
// that file, and the compression type.
class BlobHandle {
 public:
  // A freshly constructed handle has every numeric field set to the
  // uint64_t maximum, which EncodeTo() treats as "not set".
  BlobHandle()
      : file_number_(std::numeric_limits<uint64_t>::max()),
        offset_(std::numeric_limits<uint64_t>::max()),
        size_(std::numeric_limits<uint64_t>::max()),
        compression_(kNoCompression) {}

  // Number of the blob file that holds the blob.
  uint64_t filenumber() const { return file_number_; }
  void set_filenumber(uint64_t file_number) { file_number_ = file_number; }

  // Offset of the blob within the file.
  uint64_t offset() const { return offset_; }
  void set_offset(uint64_t new_offset) { offset_ = new_offset; }

  // Size of the stored blob.
  uint64_t size() const { return size_; }
  void set_size(uint64_t new_size) { size_ = new_size; }

  // Compression type recorded for the blob.
  CompressionType compression() const { return compression_; }
  void set_compression(CompressionType type) { compression_ = type; }

  // Appends the serialized handle to *dst.
  void EncodeTo(std::string* dst) const;

  // Parses a handle from the front of *input.
  Status DecodeFrom(Slice* input);

  // Resets every field to its default-constructed value.
  void clear();

 private:
  uint64_t file_number_;
  uint64_t offset_;
  uint64_t size_;
  CompressionType compression_;
};
void BlobHandle::EncodeTo(std::string* dst) const {
// Sanity check that all fields have been set
assert(offset_ != std::numeric_limits<uint64_t>::max());
assert(size_ != std::numeric_limits<uint64_t>::max());
assert(file_number_ != std::numeric_limits<uint64_t>::max());
dst->reserve(30);
PutVarint64(dst, file_number_);
PutVarint64(dst, offset_);
PutVarint64(dst, size_);
dst->push_back(static_cast<unsigned char>(compression_));
}
// Puts the handle back into its default-constructed ("unset") state.
void BlobHandle::clear() {
  file_number_ = offset_ = size_ = std::numeric_limits<uint64_t>::max();
  compression_ = kNoCompression;
}
// Parses a handle from the front of *input.
//
// Expects three varint64 values (file number, offset, size) followed by at
// least one byte holding the compression type. The varints are consumed from
// *input; the compression byte is read but not consumed.
//
// Returns Status::OK() on success. If any varint is malformed, or no byte is
// left for the compression type, the handle is cleared and
// Status::Corruption is returned.
Status BlobHandle::DecodeFrom(Slice* input) {
  const bool parsed = GetVarint64(input, &file_number_) &&
                      GetVarint64(input, &offset_) &&
                      GetVarint64(input, &size_) &&
                      // Guard the data()[0] read below: a truncated entry may
                      // end right after the last varint.
                      !input->empty();
  if (!parsed) {
    clear();
    return Status::Corruption("bad blob handle");
  }
  compression_ = static_cast<CompressionType>(input->data()[0]);
  return Status::OK();
}
// Random generator seeded from the wall-clock time at static-initialization
// time; SelectBlobFile() draws from it to pick among the open blob files.
Random blob_rgen(static_cast<uint32_t>(time(nullptr)));
// Forwards the flush-begin notification to the owning BlobDBImpl, if one has
// been attached.
void BlobDBFlushBeginListener::OnFlushBegin(DB* db, const FlushJobInfo& info) {
  if (!impl_) {
    return;
  }
  impl_->OnFlushBeginHandler(db, info);
}
// WAL filter hook. Every argument is ignored and the caller is always told to
// continue processing the record.
WalFilter::WalProcessingOption BlobReconcileWalFilter::LogRecordFound(
    unsigned long long /*log_number*/, const std::string& /*log_file_name*/,
    const WriteBatch& /*batch*/, WriteBatch* /*new_batch*/,
    bool* /*batch_changed*/) {
  const WalFilter::WalProcessingOption verdict =
      WalFilter::WalProcessingOption::kContinueProcessing;
  return verdict;
}
// Strict ordering of blob files: ascending by the start of their TTL range,
// and for equal starts descending by blob file number.
bool blobf_compare_ttl::operator()(const std::shared_ptr<BlobFile>& lhs,
                                   const std::shared_ptr<BlobFile>& rhs) const {
  const auto& lhs_start = lhs->ttl_range_.first;
  const auto& rhs_start = rhs->ttl_range_.first;
  if (lhs_start != rhs_start) {
    return lhs_start < rhs_start;
  }
  return lhs->BlobFileNumber() > rhs->BlobFileNumber();
}
// Compaction callback.
//
// When an existing (not new) entry of type kValue is reported, its value is
// decoded as a BlobHandle and, if decoding succeeds, the blob's file number,
// key size, offset, size and sequence number are queued on
// impl_->override_vals_q_. All other events are only logged (at
// debug_level_ >= 3).
//
// The sequence number is 64 bits wide, so it is logged with PRIu64 rather
// than %d.
void EvictAllVersionsCompactionListener::InternalListener::OnCompaction(
    int level, const Slice& key,
    CompactionEventListener::CompactionListenerValueType value_type,
    const Slice& existing_value, const SequenceNumber& sn, bool is_new) {
  const bool verbose = impl_->debug_level_ >= 3;
  const bool existing_plain_value =
      !is_new &&
      value_type ==
          CompactionEventListener::CompactionListenerValueType::kValue;

  if (!existing_plain_value) {
    if (verbose) {
      ROCKS_LOG_INFO(impl_->db_options_.info_log,
                     "CALLBACK NEW KEY: %s SN: %" PRIu64 " NEW: %d",
                     key.ToString().c_str(), static_cast<uint64_t>(sn),
                     static_cast<int>(is_new));
    }
    return;
  }

  BlobHandle handle;
  Slice lsmval(existing_value);
  Status s = handle.DecodeFrom(&lsmval);
  if (!s.ok()) {
    // Not a decodable blob handle; nothing to record.
    return;
  }

  if (verbose) {
    ROCKS_LOG_INFO(impl_->db_options_.info_log,
                   "CALLBACK COMPACTED OUT KEY: %s SN: %" PRIu64
                   " NEW: %d FN: %" PRIu64 " OFFSET: %" PRIu64
                   " SIZE: %" PRIu64,
                   key.ToString().c_str(), static_cast<uint64_t>(sn),
                   static_cast<int>(is_new), handle.filenumber(),
                   handle.offset(), handle.size());
  }
  impl_->override_vals_q_.enqueue({handle.filenumber(), key.size(),
                                   handle.offset(), handle.size(), sn});
}
// Constructs a BlobDBImpl that is not yet attached to a base DB: the BlobDB
// base is built with nullptr and db_impl_ starts out null. env_ comes from
// db_options.env and ttl_extractor_ from blob_db_options. All counters start
// at zero, next_file_number_ starts at 1 and debug_level_ at 0.
BlobDBImpl::BlobDBImpl(const std::string& dbname,
const BlobDBOptions& blob_db_options,
const DBOptions& db_options)
: BlobDB(nullptr),
db_impl_(nullptr),
env_(db_options.env),
ttl_extractor_(blob_db_options.ttl_extractor.get()),
wo_set_(false),
bdb_options_(blob_db_options),
db_options_(db_options),
env_options_(db_options),
dir_change_(false),
next_file_number_(1),
epoch_of_(0),
shutdown_(false),
current_epoch_(0),
open_file_count_(0),
last_period_write_(0),
last_period_ampl_(0),
total_periods_write_(0),
total_periods_ampl_(0),
total_blob_space_(0),
open_p1_done_(false),
debug_level_(0) {
// With path_relative set, the blob directory lives under dbname; otherwise
// blob_dir is used exactly as given.
blob_dir_ = (bdb_options_.path_relative)
? dbname + "/" + bdb_options_.blob_dir
: bdb_options_.blob_dir;
}
// Attaches this BlobDBImpl to its base DB. Must be called after OpenPhase1()
// and only once (both are asserted).
//
// Creates the blob directory if it is missing, opens it, and starts the
// background tasks unless they are disabled in the options. A failure to
// create the directory is only logged; the returned status is the result of
// opening the directory.
Status BlobDBImpl::LinkToBaseDB(DB* db) {
  assert(db_ == nullptr);
  assert(open_p1_done_);

  db_ = db;
  // The base DB may itself be a stackable DB, so resolve the root DB.
  db_impl_ = static_cast_with_check<DBImpl, DB>(db_->GetRootDB());
  env_ = db_->GetEnv();
  opt_db_.reset(new OptimisticTransactionDBImpl(db, false));

  const Status create_status = env_->CreateDirIfMissing(blob_dir_);
  if (!create_status.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to create blob directory: %s status: '%s'",
                   blob_dir_.c_str(), create_status.ToString().c_str());
  }

  Status open_status = env_->NewDirectory(blob_dir_, &dir_ent_);
  if (!open_status.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to open blob directory: %s status: '%s'",
                   blob_dir_.c_str(), open_status.ToString().c_str());
  }

  const bool run_background_tasks = !bdb_options_.disable_background_tasks;
  if (run_background_tasks) {
    StartBackgroundTasks();
  }
  return open_status;
}
// Returns a copy of the blob DB options this instance was created with.
BlobDBOptions BlobDBImpl::GetBlobDBOptions() const {
  return bdb_options_;
}
// Constructs a BlobDBImpl directly on top of an already opened base DB.
// db_options_ and env_options_ are both derived from the base DB's options.
//
// NOTE(review): unlike the (dbname, blob_db_options, db_options) constructor,
// this initializer list does not mention env_, ttl_extractor_, open_p1_done_
// or debug_level_. Confirm that the header gives them in-class initializers;
// otherwise they are read uninitialized (e.g. debug_level_ in the logging
// guards).
BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options)
: BlobDB(db),
db_impl_(static_cast_with_check<DBImpl, DB>(db)),
opt_db_(new OptimisticTransactionDBImpl(db, false)),
wo_set_(false),
bdb_options_(blob_db_options),
db_options_(db->GetOptions()),
env_options_(db_->GetOptions()),
dir_change_(false),
next_file_number_(1),
epoch_of_(0),
shutdown_(false),
current_epoch_(0),
open_file_count_(0),
last_period_write_(0),
last_period_ampl_(0),
total_periods_write_(0),
total_periods_ampl_(0),
total_blob_space_(0) {
// blob_dir_ is only assigned when a blob directory was configured; with
// path_relative set it is placed under the base DB's name.
if (!bdb_options_.blob_dir.empty())
blob_dir_ = (bdb_options_.path_relative)
? db_->GetName() + "/" + bdb_options_.blob_dir
: bdb_options_.blob_dir;
}
// Destruction only raises the shutdown flag via Shutdown(); the call that
// would cancel background work is left disabled.
BlobDBImpl::~BlobDBImpl() {
  // CancelAllBackgroundWork(db_, true);

  Shutdown();
}
// First phase of opening, run before a base DB is linked (asserted).
//
// Returns NotSupported when no blob directory is configured. If the blob
// directory cannot be opened, that is logged and treated as success (there is
// simply nothing to load). Otherwise returns the result of OpenAllFiles().
// In the latter two cases open_p1_done_ is set before returning.
Status BlobDBImpl::OpenPhase1() {
  assert(db_ == nullptr);
  if (blob_dir_.empty()) {
    return Status::NotSupported("No blob directory in options");
  }

  std::unique_ptr<Directory> probe_dir;
  const Status dir_status = env_->NewDirectory(blob_dir_, &probe_dir);

  Status result;  // default-constructed Status is OK
  if (dir_status.ok()) {
    result = OpenAllFiles();
  } else {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to open blob directory: %s status: '%s'",
                   blob_dir_.c_str(), dir_status.ToString().c_str());
  }
  open_p1_done_ = true;
  return result;
}
// Registers every periodic maintenance task on the timer queue. Each task is
// a member function of this object bound together with `this`; the single
// argument supplied by the queue is forwarded unchanged.
void BlobDBImpl::StartBackgroundTasks() {
  using std::placeholders::_1;

  tqueue_.add(kReclaimOpenFilesPeriodMillisecs,
              std::bind(&BlobDBImpl::ReclaimOpenFiles, this, _1));
  tqueue_.add(kGCCheckPeriodMillisecs,
              std::bind(&BlobDBImpl::RunGC, this, _1));
  tqueue_.add(kDeleteCheckPeriodMillisecs,
              std::bind(&BlobDBImpl::EvictDeletions, this, _1));
  tqueue_.add(kDeleteCheckPeriodMillisecs,
              std::bind(&BlobDBImpl::EvictCompacted, this, _1));
  tqueue_.add(kDeleteObsoletedFilesPeriodMillisecs,
              std::bind(&BlobDBImpl::DeleteObsFiles, this, _1));
  tqueue_.add(kSanityCheckPeriodMillisecs,
              std::bind(&BlobDBImpl::SanityCheck, this, _1));
  tqueue_.add(kWriteAmplificationStatsPeriodMillisecs,
              std::bind(&BlobDBImpl::WaStats, this, _1));
  tqueue_.add(kFSyncFilesPeriodMillisecs,
              std::bind(&BlobDBImpl::FsyncFiles, this, _1));
  tqueue_.add(kCheckSeqFilesPeriodMillisecs,
              std::bind(&BlobDBImpl::CheckSeqFiles, this, _1));
}
// Raises the shutdown flag that other members (e.g. OnFlushBeginHandler)
// check before doing work.
void BlobDBImpl::Shutdown() {
  shutdown_.store(true);
}
// Reacts to the start of a flush by fsyncing the blob files. Does nothing
// once shutdown has been requested, or when the callback arrives before a
// base DB has been linked.
void BlobDBImpl::OnFlushBeginHandler(DB* db, const FlushJobInfo& info) {
  const bool shutting_down = shutdown_.load();
  if (shutting_down || !db_) {
    return;
  }
  FsyncFiles(false);
}
// Lists the blob directory and adds a (file number, file name) pair to
// *file_nums for every entry that parses as a blob file. Other entries are
// logged and skipped. Returns the status of the directory listing.
Status BlobDBImpl::GetAllLogFiles(
    std::set<std::pair<uint64_t, std::string>>* file_nums) {
  std::vector<std::string> children;
  Status status = env_->GetChildren(blob_dir_, &children);
  if (status.ok()) {
    for (const std::string& name : children) {
      uint64_t number;
      FileType type;
      const bool parsed = ParseFileName(name, &number, &type);
      if (!parsed || type != kBlobFile) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Skipping file in blob directory %s parse: %d type: %d",
                       name.c_str(), parsed, ((parsed) ? type : -1));
        continue;
      }
      file_nums->insert(std::make_pair(number, name));
    }
  }
  return status;
}
// Scans the blob directory and rebuilds the in-memory table of blob files.
// Runs with mutex_ held for writing.
//
// For every blob file found:
//   * its size and header are read; files whose size or header cannot be read
//     are logged and skipped;
//   * if a footer can be read, the file's metadata is taken from the footer;
//   * otherwise the file is scanned record by record to recompute the blob
//     count and the TTL / timestamp / sequence-number ranges. Files without
//     any record are skipped. A TTL file whose range has already expired is
//     closed with a footer; a TTL file that is still live is added to
//     open_blob_files_.
// Every file that was not skipped is registered in blob_files_.
//
// Returns the status of listing the directory; per-file problems are logged
// and do not fail the call.
Status BlobDBImpl::OpenAllFiles() {
  WriteLock wl(&mutex_);

  std::set<std::pair<uint64_t, std::string>> file_nums;
  Status status = GetAllLogFiles(&file_nums);
  if (!status.ok()) {
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to collect files from blob dir: %s status: '%s'",
                    blob_dir_.c_str(), status.ToString().c_str());
    return status;
  }

  // The set is ordered by file number: the smallest is at begin() and the
  // largest at rbegin(). end() must never be dereferenced. An empty directory
  // is reported with the all-ones value (what "-1" converts to as uint64_t).
  const uint64_t kNoFileNumber = std::numeric_limits<uint64_t>::max();
  const uint64_t min_file_number =
      file_nums.empty() ? kNoFileNumber : file_nums.begin()->first;
  const uint64_t max_file_number =
      file_nums.empty() ? kNoFileNumber : file_nums.rbegin()->first;
  ROCKS_LOG_INFO(db_options_.info_log,
                 "BlobDir files path: %s count: %d min: %" PRIu64
                 " max: %" PRIu64,
                 blob_dir_.c_str(), static_cast<int>(file_nums.size()),
                 min_file_number, max_file_number);

  if (!file_nums.empty()) {
    next_file_number_.store(max_file_number + 1);
  }

  for (const auto& f_iter : file_nums) {
    std::string bfpath = BlobFileName(blob_dir_, f_iter.first);
    uint64_t size_bytes;
    Status s1 = env_->GetFileSize(bfpath, &size_bytes);
    if (!s1.ok()) {
      ROCKS_LOG_WARN(
          db_options_.info_log,
          "Unable to get size of %s. File skipped from open status: '%s'",
          bfpath.c_str(), s1.ToString().c_str());
      continue;
    }

    if (debug_level_ >= 1) {
      ROCKS_LOG_INFO(db_options_.info_log, "Blob File open: %s size: %" PRIu64,
                     bfpath.c_str(), size_bytes);
    }

    std::shared_ptr<BlobFile> bfptr =
        std::make_shared<BlobFile>(this, blob_dir_, f_iter.first);
    bfptr->SetFileSize(size_bytes);

    // since this file already existed, we will try to reconcile
    // deleted count with LSM
    bfptr->gc_once_after_open_ = true;

    // read header
    std::shared_ptr<Reader> reader;
    reader = bfptr->OpenSequentialReader(env_, db_options_, env_options_);
    s1 = reader->ReadHeader(&bfptr->header_);
    if (!s1.ok()) {
      ROCKS_LOG_ERROR(db_options_.info_log,
                      "Failure to read header for blob-file %s "
                      "status: '%s' size: %" PRIu64,
                      bfpath.c_str(), s1.ToString().c_str(), size_bytes);
      continue;
    }
    bfptr->header_valid_ = true;

    // NOTE(review): the reader is opened through BlobDBImpl (which counts a
    // fresh open in open_file_count_) but closed directly on the file, which
    // does not touch that counter - confirm this is intended.
    std::shared_ptr<RandomAccessFileReader> ra_reader =
        GetOrOpenRandomAccessReader(bfptr, env_, env_options_);

    BlobLogFooter bf;
    s1 = bfptr->ReadFooter(&bf);
    bfptr->CloseRandomAccessLocked();

    if (s1.ok()) {
      s1 = bfptr->SetFromFooterLocked(bf);
      if (!s1.ok()) {
        ROCKS_LOG_ERROR(db_options_.info_log,
                        "Header Footer mismatch for blob-file %s "
                        "status: '%s' size: %" PRIu64,
                        bfpath.c_str(), s1.ToString().c_str(), size_bytes);
        continue;
      }
    } else {
      ROCKS_LOG_INFO(db_options_.info_log,
                     "File found incomplete (w/o footer) %s", bfpath.c_str());

      // sequentially iterate over the file and read all the records
      ttlrange_t ttl_range(std::numeric_limits<uint32_t>::max(),
                           std::numeric_limits<uint32_t>::min());
      // Timestamps are 64 bits wide (see extendTimestamps), so the empty
      // range must be seeded with 64-bit limits; a 32-bit seed would cap the
      // minimum at 2^32-1.
      tsrange_t ts_range(std::numeric_limits<uint64_t>::max(),
                         std::numeric_limits<uint64_t>::min());
      snrange_t sn_range(std::numeric_limits<SequenceNumber>::max(),
                         std::numeric_limits<SequenceNumber>::min());

      uint64_t blob_count = 0;
      BlobLogRecord record;
      Reader::ReadLevel shallow = Reader::kReadHdrKeyFooter;

      uint64_t record_start = reader->GetNextByte();
      // TODO(arahut) - when we detect corruption, we should truncate
      while (reader->ReadRecord(&record, shallow).ok()) {
        ++blob_count;
        if (bfptr->HasTTL()) {
          extendTTL(&ttl_range, record.GetTTL());
        }
        if (bfptr->HasTimestamp()) {
          extendTimestamps(&ts_range, record.GetTimeVal());
        }
        extendSN(&sn_range, record.GetSN());
        record_start = reader->GetNextByte();
      }

      if (record_start != bfptr->GetFileSize()) {
        ROCKS_LOG_ERROR(db_options_.info_log,
                        "Blob file is corrupted or crashed during write %s"
                        " good_size: %" PRIu64 " file_size: %" PRIu64,
                        bfpath.c_str(), record_start, bfptr->GetFileSize());
      }

      if (!blob_count) {
        ROCKS_LOG_INFO(db_options_.info_log, "BlobCount = 0 in file %s",
                       bfpath.c_str());
        continue;
      }

      bfptr->SetBlobCount(blob_count);
      bfptr->SetSNRange(sn_range);

      if (bfptr->HasTimestamp()) {
        bfptr->set_time_range(ts_range);
      }

      // Sequence numbers are 64-bit; print them with PRIu64.
      ROCKS_LOG_INFO(db_options_.info_log,
                     "Blob File: %s blob_count: %" PRIu64
                     " size_bytes: %" PRIu64 " sn_range: (%" PRIu64
                     ", %" PRIu64 ") ts: %d ttl: %d",
                     bfpath.c_str(), blob_count, size_bytes,
                     static_cast<uint64_t>(sn_range.first),
                     static_cast<uint64_t>(sn_range.second),
                     static_cast<int>(bfptr->HasTimestamp()),
                     static_cast<int>(bfptr->HasTTL()));

      if (bfptr->HasTTL()) {
        ttl_range.second = std::max(
            ttl_range.second,
            ttl_range.first +
                static_cast<uint32_t>(bdb_options_.ttl_range_secs));
        bfptr->set_ttl_range(ttl_range);

        std::time_t epoch_now = std::chrono::system_clock::to_time_t(
            std::chrono::system_clock::now());
        if (ttl_range.second < epoch_now) {
          // The whole TTL range is in the past: seal the file with a footer.
          Status fstatus = CreateWriterLocked(bfptr);
          if (fstatus.ok()) fstatus = bfptr->WriteFooterAndCloseLocked();
          if (!fstatus.ok()) {
            ROCKS_LOG_ERROR(
                db_options_.info_log,
                "Failed to close Blob File: %s status: '%s'. Skipped",
                bfpath.c_str(), fstatus.ToString().c_str());
            continue;
          } else {
            ROCKS_LOG_ERROR(db_options_.info_log,
                            "Blob File Closed: %s now: %" PRId64
                            " ttl_range: (%u, %u)",
                            bfpath.c_str(), static_cast<int64_t>(epoch_now),
                            static_cast<unsigned int>(ttl_range.first),
                            static_cast<unsigned int>(ttl_range.second));
          }
        } else {
          open_blob_files_.insert(bfptr);
        }
      }
    }

    blob_files_.insert(std::make_pair(f_iter.first, bfptr));
  }

  return status;
}
// Closes the random-access reader of bfile and accounts for it by
// decrementing the count of open files.
void BlobDBImpl::CloseRandomAccessLocked(
    const std::shared_ptr<BlobFile>& bfile) {
  bfile->CloseRandomAccessLocked();
  --open_file_count_;
}
// Returns the random-access reader of bfile, opening it if necessary. The
// count of open files is incremented only when the file reports that the
// reader was freshly opened by this call.
std::shared_ptr<RandomAccessFileReader> BlobDBImpl::GetOrOpenRandomAccessReader(
    const std::shared_ptr<BlobFile>& bfile, Env* env,
    const EnvOptions& env_options) {
  bool newly_opened = false;
  std::shared_ptr<RandomAccessFileReader> reader =
      bfile->GetOrOpenRandomAccessReader(env, env_options, &newly_opened);
  if (newly_opened) {
    ++open_file_count_;
  }
  return reader;
}
std::shared_ptr<BlobFile> BlobDBImpl::NewBlobFile(const std::string& reason) {
uint64_t file_num = next_file_number_++;
auto bfile = std::make_shared<BlobFile>(this, blob_dir_, file_num);
ROCKS_LOG_DEBUG(db_options_.info_log, "New blob file created: %s reason='%s'",
bfile->PathName().c_str(), reason.c_str());
LogFlush(db_options_.info_log);
return bfile;
}
// Opens (or reopens) bfile for appending and installs a Writer on it.
//
// The writer is positioned at the file's current size, and its
// last_elem_type_ is derived from that size: exactly the header size means
// only the file header has been written, a larger size means the last element
// was a record footer, and zero means an empty file. Any other size (non-zero
// but smaller than the header) is reported as Corruption.
//
// Returns the status of reopening the file, or Corruption as described above.
Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
  std::string fpath(bfile->PathName());
  std::unique_ptr<WritableFile> wfile;

  // We are having issue that we write duplicate blob to blob file and the bug
  // is related to writable file buffer. Force no buffer until we fix the bug.
  EnvOptions env_options = env_options_;
  env_options.writable_file_max_buffer_size = 0;

  Status s = env_->ReopenWritableFile(fpath, &wfile, env_options);
  if (!s.ok()) {
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to open blob file for write: %s status: '%s'"
                    " exists: '%s'",
                    fpath.c_str(), s.ToString().c_str(),
                    env_->FileExists(fpath).ToString().c_str());
    return s;
  }

  std::unique_ptr<WritableFileWriter> fwriter;
  fwriter.reset(new WritableFileWriter(std::move(wfile), env_options));

  // boffset is 64 bits wide, so it is logged with PRIu64 rather than %d.
  uint64_t boffset = bfile->GetFileSize();
  if (debug_level_ >= 2 && boffset) {
    ROCKS_LOG_DEBUG(db_options_.info_log,
                    "Open blob file: %s with offset: %" PRIu64, fpath.c_str(),
                    boffset);
  }

  Writer::ElemType et = Writer::kEtNone;
  if (bfile->file_size_ == BlobLogHeader::kHeaderSize) {
    et = Writer::kEtFileHdr;
  } else if (bfile->file_size_ > BlobLogHeader::kHeaderSize) {
    et = Writer::kEtFooter;
  } else if (bfile->file_size_) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Open blob file: %s with wrong size: %" PRIu64,
                   fpath.c_str(), boffset);
    return Status::Corruption("Invalid blob file size");
  }

  bfile->log_writer_ = std::make_shared<Writer>(
      std::move(fwriter), bfile->file_number_, bdb_options_.bytes_per_sync,
      db_options_.use_fsync, boffset);
  bfile->log_writer_->last_elem_type_ = et;

  return s;
}
// Looks through open_blob_files_ for a file that can take a blob with the
// given expiration. Returns nullptr when there is no open file or none
// qualifies.
// NOTE(review): the lookup relies on the ordering of open_blob_files_,
// presumably blobf_compare_ttl (TTL range start, then file number) - confirm
// against the container's declaration.
std::shared_ptr<BlobFile> BlobDBImpl::FindBlobFileLocked(
uint32_t expiration) const {
if (open_blob_files_.empty()) return nullptr;
// Probe object that only carries the TTL range start we are searching for.
std::shared_ptr<BlobFile> tmp = std::make_shared<BlobFile>();
tmp->ttl_range_ = std::make_pair(expiration, 0);
auto citr = open_blob_files_.equal_range(tmp);
if (citr.first == open_blob_files_.end()) {
// Every open file orders before the probe: only the last file can qualify,
// and only if its TTL range still reaches the expiration.
assert(citr.second == open_blob_files_.end());
std::shared_ptr<BlobFile> check = *(open_blob_files_.rbegin());
return (check->ttl_range_.second < expiration) ? nullptr : check;
}
// A non-empty equal range means a file equivalent to the probe exists.
if (citr.first != citr.second) return *(citr.first);
// Otherwise step back to the file just before the insertion point (if any)
// and accept it only when expiration lies inside its TTL range.
auto finditr = citr.second;
if (finditr != open_blob_files_.begin()) --finditr;
bool b2 = (*finditr)->ttl_range_.second < expiration;
bool b1 = (*finditr)->ttl_range_.first > expiration;
return (b1 || b2) ? nullptr : (*finditr);
}
std::shared_ptr<Writer> BlobDBImpl::CheckOrCreateWriterLocked(
const std::shared_ptr<BlobFile>& bfile) {
std::shared_ptr<Writer> writer = bfile->GetWriter();
if (writer) return writer;
Status s = CreateWriterLocked(bfile);
if (!s.ok()) return nullptr;
writer = bfile->GetWriter();
return writer;
}
// Remembers the first WriteOptions ever passed in; later calls are no-ops.
//
// wo_set_ is the "already captured" flag. It is published (release store)
// only AFTER write_options_ has been assigned, so a thread that observes the
// flag with an acquire load never sees it set while the options are still
// unwritten. The slow path is serialized by mutex_.
void BlobDBImpl::UpdateWriteOptions(const WriteOptions& options) {
  if (wo_set_.load(std::memory_order_acquire)) {
    return;
  }

  WriteLock wl(&mutex_);
  // Re-check under the lock: another thread may have won the race.
  if (!wo_set_.load(std::memory_order_relaxed)) {
    write_options_ = options;
    wo_set_.store(true, std::memory_order_release);
  }
}
// Picks the blob file that a non-TTL blob should be appended to.
//
// Once open_simple_files_ holds num_concurrent_simple_blobs files, one of
// them is chosen using a value drawn from blob_rgen. Until then a new blob
// file is created, its header is written, and it is registered in
// blob_files_ and open_simple_files_.
//
// Returns nullptr when the writer cannot be created or the header cannot be
// written.
std::shared_ptr<BlobFile> BlobDBImpl::SelectBlobFile() {
uint32_t val = blob_rgen.Next();
{
// Fast path under the read lock: enough files are already open.
ReadLock rl(&mutex_);
if (open_simple_files_.size() == bdb_options_.num_concurrent_simple_blobs)
return open_simple_files_[val % bdb_options_.num_concurrent_simple_blobs];
}
std::shared_ptr<BlobFile> bfile = NewBlobFile("SelectBlobFile");
assert(bfile);
// file not visible, hence no lock
std::shared_ptr<Writer> writer = CheckOrCreateWriterLocked(bfile);
if (!writer) {
ROCKS_LOG_ERROR(db_options_.info_log,
"Failed to get writer from blob file: %s",
bfile->PathName().c_str());
return nullptr;
}
bfile->file_size_ = BlobLogHeader::kHeaderSize;
bfile->header_.compression_ = bdb_options_.compression;
bfile->header_valid_ = true;
// CHECK again
// NOTE(review): when this re-check succeeds, the bfile created above is
// returned to nobody and never registered - confirm what happens to the
// file that was opened for it.
WriteLock wl(&mutex_);
if (open_simple_files_.size() == bdb_options_.num_concurrent_simple_blobs) {
return open_simple_files_[val % bdb_options_.num_concurrent_simple_blobs];
}
Status s = writer->WriteHeader(bfile->header_);
if (!s.ok()) {
ROCKS_LOG_ERROR(db_options_.info_log,
"Failed to write header to new blob file: %s"
" status: '%s'",
bfile->PathName().c_str(), s.ToString().c_str());
return nullptr;
}
// Publish the new file: flag the directory change and make the file visible
// to other threads.
dir_change_.store(true);
blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile));
open_simple_files_.push_back(bfile);
return bfile;
}
// Picks the blob file that a blob with the given expiration should be
// appended to.
//
// An already open file found by FindBlobFileLocked() is reused. Otherwise a
// new file is created whose TTL range is the ttl_range_secs-wide bucket that
// contains `expiration`; its header is written and it is registered in
// blob_files_ and open_blob_files_.
//
// Returns nullptr when the writer cannot be created or the header cannot be
// written.
std::shared_ptr<BlobFile> BlobDBImpl::SelectBlobFileTTL(uint32_t expiration) {
uint64_t epoch_read = 0;
std::shared_ptr<BlobFile> bfile;
{
// Look for a usable open file and remember the epoch seen at that moment.
ReadLock rl(&mutex_);
bfile = FindBlobFileLocked(expiration);
epoch_read = epoch_of_.load();
}
if (bfile) {
assert(!bfile->Immutable());
return bfile;
}
// Round the expiration down to a multiple of ttl_range_secs to get the
// bucket [exp_low, exp_high) for the new file.
uint32_t exp_low =
(expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs;
uint32_t exp_high = exp_low + bdb_options_.ttl_range_secs;
ttlrange_t ttl_guess = std::make_pair(exp_low, exp_high);
bfile = NewBlobFile("SelectBlobFileTTL");
assert(bfile);
ROCKS_LOG_INFO(db_options_.info_log, "New blob file TTL range: %s %d %d",
bfile->PathName().c_str(), exp_low, exp_high);
LogFlush(db_options_.info_log);
// we don't need to take lock as no other thread is seeing bfile yet
std::shared_ptr<Writer> writer = CheckOrCreateWriterLocked(bfile);
if (!writer) {
ROCKS_LOG_ERROR(db_options_.info_log,
"Failed to get writer from blob file with TTL: %s",
bfile->PathName().c_str());
return nullptr;
}
bfile->header_.set_ttl_guess(ttl_guess);
bfile->header_.compression_ = bdb_options_.compression;
bfile->header_valid_ = true;
bfile->file_size_ = BlobLogHeader::kHeaderSize;
// set the first value of the range, since that is
// concrete at this time. also necessary to add to open_blob_files_
bfile->ttl_range_ = ttl_guess;
WriteLock wl(&mutex_);
// in case the epoch has shifted in the interim, then check
// check condition again - should be rare.
// NOTE(review): if bfile2 is returned, the bfile created above is never
// registered - confirm what happens to the file that was opened for it.
if (epoch_of_.load() != epoch_read) {
auto bfile2 = FindBlobFileLocked(expiration);
if (bfile2) return bfile2;
}
Status s = writer->WriteHeader(bfile->header_);
if (!s.ok()) {
ROCKS_LOG_ERROR(db_options_.info_log,
"Failed to write header to new blob file: %s"
" status: '%s'",
bfile->PathName().c_str(), s.ToString().c_str());
return nullptr;
}
// Publish the new file and bump the epoch so that concurrent callers notice
// that the set of open files changed.
dir_change_.store(true);
blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile));
open_blob_files_.insert(bfile);
epoch_of_++;
return bfile;
}
// Stores key/value. The expiration, if any, is extracted from the key/value
// pair (which may also rewrite the value), and the write is delegated to
// PutUntil().
Status BlobDBImpl::Put(const WriteOptions& options,
                       ColumnFamilyHandle* column_family, const Slice& key,
                       const Slice& value) {
  std::string rewritten_value;
  Slice effective_value;
  const int32_t expiration =
      ExtractExpiration(key, value, &effective_value, &rewritten_value);
  return PutUntil(options, column_family, key, effective_value, expiration);
}
// Deletes key from the base DB and records the deletion, together with the
// latest sequence number observed just before it, on delete_keys_q_ for
// later book-keeping. The key is queued regardless of the delete's status.
Status BlobDBImpl::Delete(const WriteOptions& options,
                          ColumnFamilyHandle* column_family, const Slice& key) {
  const SequenceNumber seq_before = db_impl_->GetLatestSequenceNumber();
  Status delete_status = db_->Delete(options, column_family, key);

  delete_keys_q_.enqueue({column_family, key.ToString(), seq_before});
  return delete_status;
}
// Single-deletes key from the base DB and records the deletion, together with
// the latest sequence number observed just before it, on delete_keys_q_. The
// key is queued regardless of the delete's status.
Status BlobDBImpl::SingleDelete(const WriteOptions& wopts,
                                ColumnFamilyHandle* column_family,
                                const Slice& key) {
  const SequenceNumber seq_before = db_impl_->GetLatestSequenceNumber();
  Status delete_status = db_->SingleDelete(wopts, column_family, key);

  delete_keys_q_.enqueue({column_family, key.ToString(), seq_before});
  return delete_status;
}
// Applies a write batch.
//
// The batch is first replayed through BlobInserter, which appends every put
// value to a blob file and builds a second batch (updates_blob_) in which each
// put carries the blob's index entry instead of the value. That rewritten
// batch is then written to the base DB. Finally the original batch is replayed
// through DeleteBookkeeper to queue every deleted key on delete_keys_q_.
//
// Returns the rewrite error if the batch could not be rewritten, the base DB
// status if its write failed, and OK otherwise.
Status BlobDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
// Handler that moves put values into a blob file and rebuilds the batch with
// index entries.
class BlobInserter : public WriteBatch::Handler {
private:
BlobDBImpl* impl_;
SequenceNumber sequence_;
WriteBatch updates_blob_;
Status batch_rewrite_status_;
std::shared_ptr<BlobFile> last_file_;
bool has_put_;
std::string new_value_;
public:
explicit BlobInserter(BlobDBImpl* impl, SequenceNumber seq)
: impl_(impl), sequence_(seq), has_put_(false) {}
WriteBatch& updates_blob() { return updates_blob_; }
Status batch_rewrite_status() { return batch_rewrite_status_; }
std::shared_ptr<BlobFile>& last_file() { return last_file_; }
bool has_put() { return has_put_; }
// Appends the value to a blob file and adds (key, index entry) to the
// rewritten batch. All puts of one batch must land in the same blob file;
// otherwise the rewrite fails with NotFound.
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value_slice) override {
Slice value_unc;
int32_t expiration =
impl_->ExtractExpiration(key, value_slice, &value_unc, &new_value_);
// TTL puts select a file by expiration; non-TTL puts reuse the file chosen
// by an earlier put of this batch, if there was one.
std::shared_ptr<BlobFile> bfile =
(expiration != -1)
? impl_->SelectBlobFileTTL(expiration)
: ((last_file_) ? last_file_ : impl_->SelectBlobFile());
if (last_file_ && last_file_ != bfile) {
batch_rewrite_status_ = Status::NotFound("too many blob files");
return batch_rewrite_status_;
}
if (!bfile) {
batch_rewrite_status_ = Status::NotFound("blob file not found");
return batch_rewrite_status_;
}
last_file_ = bfile;
has_put_ = true;
std::string compression_output;
Slice value = impl_->GetCompressedSlice(value_unc, &compression_output);
std::string headerbuf;
Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1);
std::string index_entry;
Status st = impl_->AppendBlob(bfile, headerbuf, key, value, &index_entry);
// The record footer carries the sequence number this handler predicts for
// the entry.
if (st.ok()) {
impl_->AppendSN(last_file_, sequence_);
sequence_++;
}
if (expiration != -1) {
extendTTL(&(bfile->ttl_range_), (uint32_t)expiration);
}
// An append failure is recorded in batch_rewrite_status_ (checked by the
// caller after iteration); this handler still returns OK.
if (!st.ok()) {
batch_rewrite_status_ = st;
} else {
WriteBatchInternal::Put(&updates_blob_, column_family_id, key,
index_entry);
}
return Status::OK();
}
// Deletes are copied into the rewritten batch unchanged.
virtual Status DeleteCF(uint32_t column_family_id,
const Slice& key) override {
WriteBatchInternal::Delete(&updates_blob_, column_family_id, key);
sequence_++;
return Status::OK();
}
// SingleDelete inside a batch is rejected.
virtual Status SingleDeleteCF(uint32_t /*column_family_id*/,
const Slice& /*key*/) override {
batch_rewrite_status_ =
Status::NotSupported("Not supported operation in blob db.");
return batch_rewrite_status_;
}
// Merge inside a batch is rejected.
virtual Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
const Slice& /*value*/) override {
batch_rewrite_status_ =
Status::NotSupported("Not supported operation in blob db.");
return batch_rewrite_status_;
}
// Log data is copied into the rewritten batch unchanged.
virtual void LogData(const Slice& blob) override {
updates_blob_.PutLogData(blob);
}
};
// NOTE(review): the starting sequence number is derived from
// GetLatestSequenceNumber() before db_->Write() runs; nothing here
// establishes that it matches the sequence the base DB actually assigns.
SequenceNumber sequence = db_impl_->GetLatestSequenceNumber() + 1;
BlobInserter blob_inserter(this, sequence);
updates->Iterate(&blob_inserter);
if (!blob_inserter.batch_rewrite_status().ok()) {
return blob_inserter.batch_rewrite_status();
}
Status s = db_->Write(opts, &(blob_inserter.updates_blob()));
if (!s.ok()) {
return s;
}
if (blob_inserter.has_put()) {
CloseIf(blob_inserter.last_file());
}
// add deleted key to list of keys that have been deleted for book-keeping
// The handler counts puts and deletes so that every queued key is paired
// with the sequence number predicted for its delete.
class DeleteBookkeeper : public WriteBatch::Handler {
public:
explicit DeleteBookkeeper(BlobDBImpl* impl, const SequenceNumber& seq)
: impl_(impl), sequence_(seq) {}
virtual Status PutCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
const Slice& /*value*/) override {
sequence_++;
return Status::OK();
}
virtual Status DeleteCF(uint32_t column_family_id,
const Slice& key) override {
ColumnFamilyHandle* cfh =
impl_->db_impl_->GetColumnFamilyHandleUnlocked(column_family_id);
impl_->delete_keys_q_.enqueue({cfh, key.ToString(), sequence_});
sequence_++;
return Status::OK();
}
private:
BlobDBImpl* impl_;
SequenceNumber sequence_;
};
// add deleted key to list of keys that have been deleted for book-keeping
DeleteBookkeeper delete_bookkeeper(this, sequence);
updates->Iterate(&delete_bookkeeper);
return Status::OK();
}
// Stores key/value with a time-to-live: the expiration passed to PutUntil()
// is the current epoch time plus ttl.
Status BlobDBImpl::PutWithTTL(const WriteOptions& options,
                              ColumnFamilyHandle* column_family,
                              const Slice& key, const Slice& value,
                              int32_t ttl) {
  const int32_t expiration = static_cast<int32_t>(EpochNow()) + ttl;
  return PutUntil(options, column_family, key, value, expiration);
}
// Returns the bytes that should be stored for `raw`. With compression
// disabled that is `raw` itself; otherwise `raw` is run through
// CompressBlock() with the configured compression type and the returned slice
// refers to *compression_output.
Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
                                     std::string* compression_output) const {
  if (bdb_options_.compression != kNoCompression) {
    CompressionType type = bdb_options_.compression;
    CompressionOptions compression_opts;
    CompressBlock(raw, compression_opts, &type, kBlockBasedTableVersionFormat,
                  Slice(), compression_output);
    return *compression_output;
  }
  return raw;
}
// TODO(yiwu): We should use uint64_t for expiration.
//
// Stores key/value_unc with an absolute expiration (-1 means no expiration).
//
// The (optionally compressed) value is appended to a blob file and an index
// entry pointing at it is written to the base DB under `key`. After the base
// DB write, the record footer carrying the write's sequence number is
// appended to the blob file.
//
// Returns:
//   * NotFound if no blob file could be selected;
//   * the status of the fallback base-DB write if appending the blob failed;
//   * the base-DB write status if that write failed - it is no longer
//     overwritten by the status of the footer append;
//   * otherwise the status of appending the record footer.
Status BlobDBImpl::PutUntil(const WriteOptions& options,
                            ColumnFamilyHandle* column_family, const Slice& key,
                            const Slice& value_unc, int32_t expiration) {
  UpdateWriteOptions(options);

  std::shared_ptr<BlobFile> bfile =
      (expiration != -1) ? SelectBlobFileTTL(expiration) : SelectBlobFile();
  if (!bfile) return Status::NotFound("Blob file not found");

  std::string compression_output;
  Slice value = GetCompressedSlice(value_unc, &compression_output);

  std::string headerbuf;
  Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1);

  // A safer but slower alternative would be to hold bfile->mutex_ for the
  // whole write path instead of per append.

  // value.size() is a size_t; cast it to a fixed 64-bit type for logging.
  const uint64_t value_size = static_cast<uint64_t>(value.size());

  if (debug_level_ >= 3) {
    ROCKS_LOG_DEBUG(db_options_.info_log,
                    ">Adding KEY FILE: %s: KEY: %s VALSZ: %" PRIu64,
                    bfile->PathName().c_str(), key.ToString().c_str(),
                    value_size);
  }

  std::string index_entry;
  Status s = AppendBlob(bfile, headerbuf, key, value, &index_entry);
  if (!s.ok()) {
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" PRIu64
                    " status: '%s' blob_file: '%s'",
                    bfile->PathName().c_str(), key.ToString().c_str(),
                    value_size, s.ToString().c_str(),
                    bfile->DumpState().c_str());
    // Fallback just write to the LSM and get going
    // NOTE(review): this stores `value` (possibly compressed) directly in the
    // LSM rather than an index entry - confirm readers can tell them apart.
    WriteBatch batch;
    batch.Put(column_family, key, value);
    return db_->Write(options, &batch);
  }

  WriteBatch batch;
  batch.Put(column_family, key, index_entry);

  // this goes to the base db and can be expensive
  const Status write_status = db_->Write(options, &batch);

  // this is the sequence number of the write.
  SequenceNumber sn = WriteBatchInternal::Sequence(&batch);

  if (debug_level_ >= 3) {
    ROCKS_LOG_INFO(db_options_.info_log,
                   "<Adding KEY FILE: %s: KEY: %s SN: %" PRIu64,
                   bfile->PathName().c_str(), key.ToString().c_str(),
                   static_cast<uint64_t>(sn));
  }

  // The footer is appended even if the base-DB write failed, so that the blob
  // record already written to the file is still followed by its footer.
  s = AppendSN(bfile, sn);
  if (!s.ok()) {
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failed to append SN to FILE: %s: KEY: %s VALSZ: %" PRIu64
                    " status: '%s' blob_file: '%s'",
                    bfile->PathName().c_str(), key.ToString().c_str(),
                    value_size, s.ToString().c_str(),
                    bfile->DumpState().c_str());
  }

  if (expiration != -1) {
    extendTTL(&(bfile->ttl_range_), static_cast<uint32_t>(expiration));
  }

  CloseIf(bfile);

  // A failed base-DB write takes precedence: the key was not stored.
  return write_status.ok() ? s : write_status;
}
// TODO(yiwu): We should return uint64_t after updating the rest of the code
// to use uint64_t for expiration.
//
// Asks the configured TTL extractor (if any) for the expiration of key/value.
// *value_slice is set to the value that should actually be stored: the
// extractor's rewritten value (held in *new_value) when it changed the value,
// otherwise the original `value`.
// Returns the expiration, or -1 when there is none.
int32_t BlobDBImpl::ExtractExpiration(const Slice& key, const Slice& value,
                                      Slice* value_slice,
                                      std::string* new_value) {
  uint64_t expiration = kNoExpiration;
  bool value_changed = false;

  if (ttl_extractor_ != nullptr &&
      !ttl_extractor_->ExtractExpiration(key, value, EpochNow(), &expiration,
                                         new_value, &value_changed)) {
    // The extractor found no TTL; discard whatever it left in `expiration`.
    expiration = kNoExpiration;
  }

  if (value_changed) {
    *value_slice = Slice(*new_value);
  } else {
    *value_slice = value;
  }

  if (expiration == kNoExpiration) {
    return -1;
  }
  return static_cast<int32_t>(expiration);
}
// Appends one blob record (header, key, value) to bfile and produces the
// index entry that refers to it.
//
// The physical write happens with bfile->mutex_ held for writing. On success
// the file's blob count and size and the global write counters are advanced
// by the size of the record (record header + key + value), and a BlobHandle
// with the file number, blob offset, value size and configured compression is
// encoded into *index_entry.
//
// Returns IOError if no writer could be obtained, otherwise the status of the
// physical write.
Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
                              const std::string& headerbuf, const Slice& key,
                              const Slice& value, std::string* index_entry) {
  Status s;

  uint64_t blob_offset = 0;
  uint64_t key_offset = 0;
  {
    WriteLock lockbfile_w(&bfile->mutex_);
    std::shared_ptr<Writer> writer = CheckOrCreateWriterLocked(bfile);
    if (!writer) return Status::IOError("Failed to create blob writer");

    // write the blob to the blob log.
    s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset,
                                   &blob_offset);
  }

  if (!s.ok()) {
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Invalid status in AppendBlob: %s status: '%s'",
                    bfile->PathName().c_str(), s.ToString().c_str());
    return s;
  }

  // increment blob count
  bfile->blob_count_++;
  auto size_put = BlobLogRecord::kHeaderSize + key.size() + value.size();

  bfile->file_size_ += size_put;
  last_period_write_ += size_put;
  total_blob_space_ += size_put;

  BlobHandle handle;
  handle.set_filenumber(bfile->BlobFileNumber());
  handle.set_size(value.size());
  handle.set_offset(blob_offset);
  handle.set_compression(bdb_options_.compression);
  handle.EncodeTo(index_entry);

  if (debug_level_ >= 3) {
    // Count, offset and size are wider than int; log them as 64-bit values.
    ROCKS_LOG_INFO(db_options_.info_log,
                   ">Adding KEY FILE: %s: BC: %" PRIu64 " OFFSET: %" PRIu64
                   " SZ: %" PRIu64,
                   bfile->PathName().c_str(),
                   static_cast<uint64_t>(bfile->blob_count_.load()),
                   blob_offset, static_cast<uint64_t>(value.size()));
  }

  return s;
}
// Appends the record footer carrying sequence number sn to bfile.
//
// The footer is written with bfile->mutex_ held for writing; unless sn is the
// maximum SequenceNumber value, the file's sequence-number range is widened
// to include it. On success the file size and the global write counters are
// advanced by the footer size.
//
// Returns IOError if no writer could be obtained, otherwise the status of
// writing the footer.
Status BlobDBImpl::AppendSN(const std::shared_ptr<BlobFile>& bfile,
                            const SequenceNumber& sn) {
  {
    WriteLock lockbfile_w(&bfile->mutex_);
    std::shared_ptr<Writer> writer = CheckOrCreateWriterLocked(bfile);
    if (!writer) {
      return Status::IOError("Failed to create blob writer");
    }

    const Status footer_status = writer->AddRecordFooter(sn);
    if (!footer_status.ok()) {
      ROCKS_LOG_ERROR(db_options_.info_log,
                      "Invalid status in AppendSN: %s status: '%s'",
                      bfile->PathName().c_str(),
                      footer_status.ToString().c_str());
      return footer_status;
    }

    const bool has_real_sn = sn != std::numeric_limits<SequenceNumber>::max();
    if (has_real_sn) {
      extendSN(&(bfile->sn_range_), sn);
    }
  }

  bfile->file_size_ += BlobLogRecord::kFooterSize;
  last_period_write_ += BlobLogRecord::kFooterSize;
  total_blob_space_ += BlobLogRecord::kFooterSize;
  return Status::OK();
}
// Looks up several keys at once. The base DB returns one blob index entry per
// key; each successfully found entry is then resolved to the actual blob
// value through CommonGet().
//
//   values - resized to keys.size(); entry i receives the blob value of
//            keys[i] when the returned status i is OK.
// Returns one status per key: the base DB status when the LSM lookup failed,
// otherwise the status of the blob read.
std::vector<Status> BlobDBImpl::MultiGet(
    const ReadOptions& options,
    const std::vector<ColumnFamilyHandle*>& column_family,
    const std::vector<Slice>& keys, std::vector<std::string>* values) {
  std::vector<std::string> values_lsm;
  values_lsm.resize(keys.size());
  auto statuses = db_->MultiGet(options, column_family, keys, &values_lsm);

  // The output vector is indexed below, so it must hold one slot per key.
  // Without this, a caller passing an empty vector triggers out-of-bounds
  // writes.
  values->resize(keys.size());

  for (size_t i = 0; i < keys.size(); ++i) {
    if (!statuses[i].ok()) continue;

    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
    auto cfd = cfh->cfd();

    Status s = CommonGet(cfd, keys[i], values_lsm[i], &((*values)[i]));
    statuses[i] = s;
  }
  return statuses;
}
// Resolves a blob index entry (as stored in the LSM) into the blob value
// and/or the sequence number stored in the blob record footer.
//
//   cfd         - column family data; its options are used for decompression.
//   key         - user key; its size is needed to locate the record CRC.
//   index_entry - encoded BlobHandle read from the base DB.
//   value       - if non-null, receives the (decompressed) blob value.
//   sequence    - if non-null, receives the sequence number from the footer.
//
// Returns NotFound when the handle is implausible, the blob file is missing
// or obsolete, or the blob/CRC cannot be read; Corruption on CRC mismatch;
// otherwise the status of the last read/decode performed.
Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key,
                             const std::string& index_entry, std::string* value,
                             SequenceNumber* sequence) {
  Slice index_entry_slice(index_entry);
  BlobHandle handle;
  Status s = handle.DecodeFrom(&index_entry_slice);
  if (!s.ok()) return s;

  // offset has to have certain min, as we will read CRC
  // later from the Blob Header, which needs to be also a
  // valid offset.
  if (handle.offset() <
      (BlobLogHeader::kHeaderSize + BlobLogRecord::kHeaderSize + key.size())) {
    if (debug_level_ >= 2) {
      // A Slice is not NUL-terminated, so it must be copied into a string
      // before being printed with "%s".
      ROCKS_LOG_ERROR(
          db_options_.info_log,
          "Invalid blob handle file_number: %" PRIu64 " blob_offset: %" PRIu64
          " blob_size: %" PRIu64 " key: %s",
          handle.filenumber(), handle.offset(), handle.size(),
          key.ToString().c_str());
    }
    return Status::NotFound("Blob Not Found, although found in LSM");
  }

  std::shared_ptr<BlobFile> bfile;
  {
    ReadLock rl(&mutex_);
    auto hitr = blob_files_.find(handle.filenumber());

    // file was deleted
    if (hitr == blob_files_.end()) {
      return Status::NotFound("Blob Not Found as blob file missing");
    }

    bfile = hitr->second;
  }

  if (bfile->Obsolete()) {
    return Status::NotFound(
        "Blob Not Found as blob file was garbage collected");
  }

  // 0 - size
  if (!handle.size() && value != nullptr) {
    value->clear();
    return Status::OK();
  }

  // takes locks when called
  std::shared_ptr<RandomAccessFileReader> reader =
      GetOrOpenRandomAccessReader(bfile, env_, env_options_);

  if (value != nullptr) {
    // With compression enabled the raw bytes are read into a scratch string
    // and decompressed into *value afterwards.
    std::string* valueptr = value;
    std::string value_c;
    if (bdb_options_.compression != kNoCompression) {
      valueptr = &value_c;
    }

    // allocate the buffer. This is safe in C++11
    valueptr->resize(handle.size());
    char* buffer = &(*valueptr)[0];

    Slice blob_value;
    s = reader->Read(handle.offset(), handle.size(), &blob_value, buffer);
    if (!s.ok() || blob_value.size() != handle.size()) {
      if (debug_level_ >= 2) {
        ROCKS_LOG_ERROR(
            db_options_.info_log,
            "Failed to read blob from file: %s blob_offset: %" PRIu64
            " blob_size: %" PRIu64 " read: %d key: %s status: '%s'",
            bfile->PathName().c_str(), handle.offset(), handle.size(),
            static_cast<int>(blob_value.size()), key.ToString().c_str(),
            s.ToString().c_str());
      }
      return Status::NotFound("Blob Not Found as couldnt retrieve Blob");
    }

    // The CRC is read from the 4 bytes located key.size() + 4 bytes before
    // the blob offset.
    Slice crc_slice;
    uint32_t crc_exp;
    std::string crc_str;
    crc_str.resize(sizeof(uint32_t));
    char* crc_buffer = &(crc_str[0]);
    s = reader->Read(handle.offset() - (key.size() + sizeof(uint32_t)),
                     sizeof(uint32_t), &crc_slice, crc_buffer);
    if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) {
      if (debug_level_ >= 2) {
        ROCKS_LOG_ERROR(
            db_options_.info_log,
            "Failed to fetch blob crc file: %s blob_offset: %" PRIu64
            " blob_size: %" PRIu64 " key: %s status: '%s'",
            bfile->PathName().c_str(), handle.offset(), handle.size(),
            key.ToString().c_str(), s.ToString().c_str());
      }
      return Status::NotFound("Blob Not Found as couldnt retrieve CRC");
    }

    uint32_t crc = crc32c::Extend(0, blob_value.data(), blob_value.size());
    crc = crc32c::Mask(crc);  // Adjust for storage
    if (crc != crc_exp) {
      if (debug_level_ >= 2) {
        ROCKS_LOG_ERROR(db_options_.info_log,
                        "Blob crc mismatch file: %s blob_offset: %" PRIu64
                        " blob_size: %" PRIu64 " key: %s status: '%s'",
                        bfile->PathName().c_str(), handle.offset(),
                        handle.size(), key.ToString().c_str(),
                        s.ToString().c_str());
      }
      return Status::Corruption("Corruption. Blob CRC mismatch");
    }

    if (bdb_options_.compression != kNoCompression) {
      BlockContents contents;
      s = UncompressBlockContentsForCompressionType(
          blob_value.data(), blob_value.size(), &contents,
          kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression,
          *(cfd->ioptions()));
      // Do not hand back whatever `contents` holds if decompression failed.
      if (!s.ok()) {
        return s;
      }
      *value = contents.data.ToString();
    }
  }

  if (sequence != nullptr) {
    // The footer is read from the bytes directly following the blob.
    char buffer[BlobLogRecord::kFooterSize];
    Slice footer_slice;
    s = reader->Read(handle.offset() + handle.size(),
                     BlobLogRecord::kFooterSize, &footer_slice, buffer);
    if (!s.ok()) {
      return s;
    }
    BlobLogRecord record;
    s = record.DecodeFooterFrom(footer_slice);
    if (!s.ok()) {
      return s;
    }
    *sequence = record.GetSN();
  }

  return s;
}
// Point lookup: fetches the blob index entry for `key` from the base DB and
// resolves it to the blob value, which is stored in (and pinned to) `value`.
// Returns the base DB status if the LSM lookup fails, otherwise the status of
// the blob read.
Status BlobDBImpl::Get(const ReadOptions& options,
                       ColumnFamilyHandle* column_family, const Slice& key,
                       PinnableSlice* value) {
  auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();

  std::string handle_bytes;
  Status result = db_->Get(options, column_family, key, &handle_bytes);
  if (result.ok()) {
    result = CommonGet(cfd, key, handle_bytes, value->GetSelf());
    value->PinSelf();
    return result;
  }

  // LSM lookup failed; optionally log and pass the status through.
  if (debug_level_ >= 3) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Get Failed on LSM KEY: %s status: '%s'",
                   key.ToString().c_str(), result.ToString().c_str());
  }
  return result;
}
// Returns the blob value for the entry the wrapped iterator points at. The
// index entry from the base iterator is resolved through CommonGet() into
// vpart_, and a Slice over vpart_ is returned.
// NOTE(review): the status of CommonGet() is not inspected here, so a failed
// read is not reported to the caller - confirm this is intended.
Slice BlobDBIterator::value() const {
  const Slice handle_bytes = iter_->value();
  auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh_)->cfd();

  Status s = db_impl_->CommonGet(cfd, iter_->key(),
                                 handle_bytes.ToString(false), &vpart_);
  return Slice(vpart_);
}
std::pair<bool, int64_t> BlobDBImpl::SanityCheck(bool aborted) {
if (aborted) return std::make_pair(false, -1);
ROCKS_LOG_INFO(db_options_.info_log, "Starting Sanity Check");
ROCKS_LOG_INFO(db_options_.info_log, "Number of files %" PRIu64,
blob_files_.size());
ROCKS_LOG_INFO(db_options_.info_log, "Number of open files %" PRIu64,
open_blob_files_.size());
for (auto bfile : open_blob_files_) {
assert(!bfile->Immutable());
}
uint64_t epoch_now = EpochNow();
for (auto bfile_pair : blob_files_) {
auto bfile = bfile_pair.second;
ROCKS_LOG_INFO(
db_options_.info_log,
"Blob File %s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64,
bfile->PathName().c_str(), bfile->GetFileSize(), bfile->BlobCount(),
bfile->deleted_count_, bfile->deleted_size_,
(bfile->ttl_range_.second - epoch_now));
}
// reschedule
return std::make_pair(true, -1);
}
// Stops `bfile` from receiving further writes: removes it from the open-file
// collections under mutex_, then, if it is not closed yet, writes the footer
// and closes it under the file's own write lock.
// Always returns {false, -1}; `aborted` is not consulted.
std::pair<bool, int64_t> BlobDBImpl::CloseSeqWrite(
    std::shared_ptr<BlobFile> bfile, bool aborted) {
  {
    WriteLock db_guard(&mutex_);

    // Once removed from these collections no writer can pick the file up.
    open_blob_files_.erase(bfile);
    auto pos =
        std::find(open_simple_files_.begin(), open_simple_files_.end(), bfile);
    if (pos != open_simple_files_.end()) {
      open_simple_files_.erase(pos);
    }
  }

  const bool already_closed = bfile->closed_.load();
  if (!already_closed) {
    WriteLock file_guard(&bfile->mutex_);
    bfile->WriteFooterAndCloseLocked();
  }

  return std::make_pair(false, -1);
}
// Closes `bfile` for writing once it has grown beyond the configured blob
// file size: the file is removed from the open-file collections right away
// and the actual close (CloseSeqWrite) is queued on the timer queue.
void BlobDBImpl::CloseIf(const std::shared_ptr<BlobFile>& bfile) {
  // atomic read
  if (bfile->GetFileSize() <= bdb_options_.blob_file_size) {
    return;
  }

  if (debug_level_ >= 2) {
    ROCKS_LOG_DEBUG(db_options_.info_log,
                    "Scheduling file for close %s fsize: %" PRIu64
                    " limit: %" PRIu64,
                    bfile->PathName().c_str(), bfile->GetFileSize(),
                    bdb_options_.blob_file_size);
  }

  {
    WriteLock db_guard(&mutex_);

    open_blob_files_.erase(bfile);
    auto pos =
        std::find(open_simple_files_.begin(), open_simple_files_.end(), bfile);
    const bool was_listed = (pos != open_simple_files_.end());
    if (was_listed) {
      open_simple_files_.erase(pos);
    } else {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "File not found while closing %s fsize: %" PRIu64
                     " Multithreaded Writes?",
                     bfile->PathName().c_str(), bfile->GetFileSize());
    }
  }

  tqueue_.add(0, std::bind(&BlobDBImpl::CloseSeqWrite, this, bfile,
                           std::placeholders::_1));
}
bool BlobDBImpl::FileDeleteOk_SnapshotCheckLocked(
const std::shared_ptr<BlobFile>& bfile) {
assert(bfile->Obsolete());
SequenceNumber esn = bfile->GetSNRange().first;
// this is not correct.
// you want to check that there are no snapshots in the
bool notok = db_impl_->HasActiveSnapshotLaterThanSN(esn);
if (notok) {
ROCKS_LOG_INFO(db_options_.info_log,
"Could not delete file due to snapshot failure %s",
bfile->PathName().c_str());
return false;
} else {
ROCKS_LOG_INFO(db_options_.info_log,
"Will delete file due to snapshot success %s",
bfile->PathName().c_str());
return true;
}
}
// Accounts one blob of file `file_number` as deleted: bumps the file's
// deleted count and adds the record's full on-disk size (header + key + blob
// + footer) to its deleted size. `blob_offset` is unused.
// Returns false if the file is no longer tracked, true otherwise.
bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
                                       uint64_t blob_offset,
                                       uint64_t blob_size) {
  (void)blob_offset;

  std::shared_ptr<BlobFile> target;
  {
    ReadLock db_guard(&mutex_);
    auto found = blob_files_.find(file_number);
    if (found == blob_files_.end()) {
      // file was deleted
      return false;
    }
    target = found->second;
  }

  WriteLock file_guard(&target->mutex_);

  target->deleted_count_++;
  target->deleted_size_ += key_size + blob_size + BlobLogRecord::kHeaderSize +
                           BlobLogRecord::kFooterSize;
  return true;
}
bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& lsmValue) {
Slice val(lsmValue);
BlobHandle handle;
Status s = handle.DecodeFrom(&val);
if (!s.ok()) {
ROCKS_LOG_INFO(db_options_.info_log,
"Could not parse lsm val in MarkBlobDeleted %s",
lsmValue.ToString().c_str());
return false;
}
bool succ = FindFileAndEvictABlob(handle.filenumber(), key.size(),
handle.offset(), handle.size());
return succ;
}
std::pair<bool, int64_t> BlobDBImpl::EvictCompacted(bool aborted) {
if (aborted) return std::make_pair(false, -1);
override_packet_t packet;
size_t total_vals = 0;
size_t mark_evicted = 0;
while (override_vals_q_.dequeue(&packet)) {
bool succeeded =
FindFileAndEvictABlob(packet.file_number_, packet.key_size_,
packet.blob_offset_, packet.blob_size_);
total_vals++;
if (succeeded) {
mark_evicted++;
}
}
ROCKS_LOG_INFO(db_options_.info_log,
"Mark %" ROCKSDB_PRIszt
" values to evict, out of %" ROCKSDB_PRIszt
" compacted values.",
mark_evicted, total_vals);
return std::make_pair(true, -1);
}
// Periodic task: drains the queue of deleted keys. For every dequeued key it
// seeks an internal iterator to (key, delete sequence number) and marks the
// blobs of the versions of that key found from there as deleted, stopping at
// the first deletion marker or at the next user key.
// Returns {false, -1} if aborted, {true, -1} to be rescheduled.
std::pair<bool, int64_t> BlobDBImpl::EvictDeletions(bool aborted) {
  if (aborted) return std::make_pair(false, -1);

  ColumnFamilyHandle* last_cfh = nullptr;
  Options last_op;

  Arena arena;
  ScopedArenaIterator iter;

  // we will use same RangeDelAggregator for all cf's.
  // essentially we do not support Range Deletes now
  std::unique_ptr<RangeDelAggregator> range_del_agg;
  delete_packet_t dpacket;
  const Comparator* bwc = BytewiseComparator();
  while (delete_keys_q_.dequeue(&dpacket)) {
    if (last_cfh != dpacket.cfh_) {
      if (!range_del_agg) {
        auto cfhi = reinterpret_cast<ColumnFamilyHandleImpl*>(dpacket.cfh_);
        auto cfd = cfhi->cfd();
        range_del_agg.reset(new RangeDelAggregator(cfd->internal_comparator(),
                                                   kMaxSequenceNumber));
      }

      // this can be expensive
      last_cfh = dpacket.cfh_;
      last_op = db_impl_->GetOptions(last_cfh);
      iter.set(db_impl_->NewInternalIterator(&arena, range_del_agg.get(),
                                             dpacket.cfh_));
      // this will not work for multiple CF's.
    }

    Slice user_key(dpacket.key_);
    InternalKey target(user_key, dpacket.dsn_, kTypeValue);

    Slice eslice = target.Encode();
    iter->Seek(eslice);

    if (!iter->status().ok()) {
      ROCKS_LOG_INFO(db_options_.info_log, "Invalid iterator seek %s",
                     dpacket.key_.c_str());
      continue;
    }

    // The iterator is advanced by the loop header so that every path that
    // continues (including an unparsable internal key) makes progress;
    // skipping Next() on a parse failure would spin forever on that entry.
    for (; iter->Valid(); iter->Next()) {
      if (!bwc->Equal(ExtractUserKey(iter->key()), ExtractUserKey(eslice)))
        break;

      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
      if (!ParseInternalKey(iter->key(), &ikey)) {
        continue;
      }

      // once you hit a DELETE, assume the keys below have been
      // processed previously
      if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) break;

      Slice val = iter->value();
      MarkBlobDeleted(ikey.user_key, val);
    }
  }
  return std::make_pair(true, -1);
}
std::pair<bool, int64_t> BlobDBImpl::CheckSeqFiles(bool aborted) {
if (aborted) return std::make_pair(false, -1);
std::vector<std::shared_ptr<BlobFile>> process_files;
{
uint64_t epoch_now = EpochNow();
ReadLock rl(&mutex_);
for (auto bfile : open_blob_files_) {
{
ReadLock lockbfile_r(&bfile->mutex_);
if (bfile->ttl_range_.second > epoch_now) continue;
process_files.push_back(bfile);
}
}
}
for (auto bfile : process_files) CloseSeqWrite(bfile, false);
return std::make_pair(true, -1);
}
// Periodic task: fsyncs every open blob file that reports it needs syncing
// (per bytes_per_sync) and, if the blob directory was flagged as changed,
// fsyncs the directory and clears the flag.
// Returns {false, -1} if aborted, {true, -1} to be rescheduled.
std::pair<bool, int64_t> BlobDBImpl::FsyncFiles(bool aborted) {
  if (aborted) return std::make_pair(false, -1);

  // Collect candidates under the read lock; the (slow) fsync calls are done
  // after the lock has been released.
  std::vector<std::shared_ptr<BlobFile>> process_files;
  {
    ReadLock rl(&mutex_);
    for (auto fitr : open_blob_files_) {
      if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync))
        process_files.push_back(fitr);
    }

    for (auto fitr : open_simple_files_) {
      if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync))
        process_files.push_back(fitr);
    }
  }

  for (auto fitr : process_files) {
    if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) fitr->Fsync();
  }

  // The strong variant is required here: outside of a retry loop the weak
  // compare-exchange may fail spuriously even though the flag is set, which
  // would skip the directory fsync for this round.
  bool expected = true;
  if (dir_change_.compare_exchange_strong(expected, false)) dir_ent_->Fsync();

  return std::make_pair(true, -1);
}
// Periodic task: once the number of open files reaches kOpenFilesTrigger,
// closes the random access reader of every blob file whose last_access_ is
// not -1.
// Returns {false, -1} if aborted, {true, -1} to be rescheduled.
std::pair<bool, int64_t> BlobDBImpl::ReclaimOpenFiles(bool aborted) {
  if (aborted) return std::make_pair(false, -1);

  const bool below_trigger = open_file_count_.load() < kOpenFilesTrigger;
  if (below_trigger) {
    return std::make_pair(true, -1);
  }

  // in the future, we should sort by last_access_
  // instead of closing every file
  ReadLock db_guard(&mutex_);
  for (auto const& entry : blob_files_) {
    auto file = entry.second;
    if (file->last_access_.load() != -1) {
      WriteLock file_guard(&file->mutex_);
      CloseRandomAccessLocked(file);
    }
  }

  return std::make_pair(true, -1);
}
// Periodic task: rolls the write-amplification counters of the period that
// just ended into a bounded history. The per-period values are appended to
// all_periods_write_/all_periods_ampl_, the running totals are updated, and
// the per-period counters are reset to zero.
// Returns {false, -1} if aborted, {true, -1} to be rescheduled.
std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
  if (aborted) return std::make_pair(false, -1);

  WriteLock wl(&mutex_);

  // Drop the oldest period only once the history is full. The previous test
  // ("size < limit") popped from a not-yet-full - initially empty - history,
  // dereferencing begin() of an empty container, and never bounded it.
  if (!all_periods_write_.empty() &&
      all_periods_write_.size() >= kWriteAmplificationStatsPeriods) {
    total_periods_write_ -= (*all_periods_write_.begin());
    // Subtract (not assign) the evicted period, mirroring the write total.
    total_periods_ampl_ -= (*all_periods_ampl_.begin());

    all_periods_write_.pop_front();
    all_periods_ampl_.pop_front();
  }

  uint64_t val1 = last_period_write_.load();
  uint64_t val2 = last_period_ampl_.load();

  all_periods_write_.push_back(val1);
  all_periods_ampl_.push_back(val2);

  last_period_write_ = 0;
  last_period_ampl_ = 0;

  total_periods_write_ += val1;
  total_periods_ampl_ += val2;

  return std::make_pair(true, -1);
}
////////////////////////////////////////////////////////////////////////////////
// Garbage collects one blob file by iterating over its records sequentially.
//
// For every record:
//  * If the record (or the whole file) has passed its TTL it is counted as a
//    delete. Unless this is the first GC after open, the key is also removed
//    from the base DB - but only if the base DB still holds exactly the
//    version stored in this record. A key that has been overwritten with a
//    newer version must be kept.
//  * Otherwise, unless this is the first GC after open, the record is
//    relocated to a new blob file if the base DB still holds exactly this
//    version, and the base DB entry is rewritten to point at the new blob.
//  * Records whose key has been overwritten or deleted in the base DB are
//    simply dropped.
//
// On the first GC after open only the statistics are gathered.
//
// TODO(yiwu): The currency check should use GetForUpdate() from the
// optimistic transaction, otherwise another writer can sneak in between the
// sequence number check and the delete/put.
//
//   bfptr   - blob file to collect.
//   gcstats - receives counters and, if records were relocated, the new file.
// Returns a non-OK status if the file cannot be read, the column family
// cannot be accessed, or writing to the new blob file fails.
////////////////////////////////////////////////////////////////////////////////
Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
                                      GCStats* gcstats) {
  uint64_t tt = EpochNow();

  std::shared_ptr<Reader> reader =
      bfptr->OpenSequentialReader(env_, db_options_, env_options_);
  if (!reader) {
    // The file name was passed without a matching format specifier before.
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "File sequential reader could not be opened %s",
                    bfptr->PathName().c_str());
    return Status::IOError("failed to create sequential reader");
  }

  BlobLogHeader header;
  Status s = reader->ReadHeader(&header);
  if (!s.ok()) {
    ROCKS_LOG_ERROR(db_options_.info_log,
                    "Failure to read header for blob-file %s",
                    bfptr->PathName().c_str());
    return s;
  }

  bool first_gc = bfptr->gc_once_after_open_;

  ColumnFamilyHandle* cfh = bfptr->GetColumnFamily(db_);
  auto cfhi = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh);
  auto cfd = cfhi->cfd();
  bool has_ttl = header.HasTTL();

  // this reads the key but skips the blob
  Reader::ReadLevel shallow = Reader::kReadHdrKeyFooter;

  assert(opt_db_);

  bool no_relocation_ttl = (has_ttl && tt > bfptr->GetTTLRange().second);

  bool no_relocation_lsmdel = false;
  {
    ReadLock lockbfile_r(&bfptr->mutex_);
    no_relocation_lsmdel = (bfptr->GetFileSize() ==
                            (BlobLogHeader::kHeaderSize + bfptr->deleted_size_ +
                             BlobLogFooter::kFooterSize));
  }

  bool no_relocation = no_relocation_ttl || no_relocation_lsmdel;
  if (!no_relocation) {
    // read the blob because you have to write it back to new file
    shallow = Reader::kReadHdrKeyBlobFooter;
  }

  BlobLogRecord record;
  std::shared_ptr<BlobFile> newfile;
  std::shared_ptr<Writer> new_writer;

  // Reports through *is_current whether the base DB still holds exactly the
  // version of the key stored in `record`.
  auto record_is_current = [this, cfd, &record](bool* is_current) -> Status {
    *is_current = false;
    SequenceNumber seq = kMaxSequenceNumber;
    bool found_record_for_key = false;
    SuperVersion* sv = db_impl_->GetAndRefSuperVersion(cfd);
    if (sv == nullptr) {
      return Status::InvalidArgument("Could not access column family 0");
    }
    Status lookup = db_impl_->GetLatestSequenceForKey(
        sv, record.Key(), false, &seq, &found_record_for_key);
    *is_current =
        lookup.ok() && found_record_for_key && seq == record.GetSN();
    db_impl_->ReturnAndCleanupSuperVersion(cfd, sv);
    return Status::OK();
  };

  while (reader->ReadRecord(&record, shallow).ok()) {
    gcstats->blob_count++;

    // this particular TTL has expired
    const bool expired =
        no_relocation_ttl || (has_ttl && tt > record.GetTTL());
    if (expired) {
      gcstats->num_deletes++;
      gcstats->deleted_size += record.GetBlobSize();
    }

    // The first GC after open only rebuilds the delete accounting.
    if (first_gc) continue;

    bool is_current = false;
    Status cur_status = record_is_current(&is_current);
    if (!cur_status.ok()) {
      return cur_status;
    }
    // Overwritten or deleted in the base DB: the blob is garbage and the
    // base DB must not be touched on behalf of this stale version.
    if (!is_current) continue;

    if (expired) {
      std::unique_ptr<Transaction> txn(opt_db_->BeginTransaction(
          write_options_, OptimisticTransactionOptions(), nullptr));
      txn->Delete(cfh, record.Key());
      Status s1 = txn->Commit();
      // chances that this DELETE will fail is low. If it fails, it would be
      // because a new version of the key came in at this time, which will
      // override the current version being iterated on.
      if (s1.ok()) {
        gcstats->succ_deletes_lsm++;
      } else if (s1.IsBusy()) {
        // assume that failures happen due to new writes.
        gcstats->overrided_while_delete++;
      }
      continue;
    }

    // Relocate the still-current record.
    if (!newfile) {
      // new file
      std::string reason("GC of ");
      reason += bfptr->PathName();
      newfile = NewBlobFile(reason);
      gcstats->newfile = newfile;

      new_writer = CheckOrCreateWriterLocked(newfile);
      if (!new_writer) {
        return Status::IOError("Failed to create blob writer");
      }
      newfile->header_ = std::move(header);
      // Can't use header beyond this point
      newfile->header_valid_ = true;
      newfile->file_size_ = BlobLogHeader::kHeaderSize;
      s = new_writer->WriteHeader(newfile->header_);

      if (!s.ok()) {
        ROCKS_LOG_ERROR(db_options_.info_log,
                        "File: %s - header writing failed",
                        newfile->PathName().c_str());
        return s;
      }

      WriteLock wl(&mutex_);

      dir_change_.store(true);
      blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile));
    }

    gcstats->num_relocs++;
    std::string index_entry;

    uint64_t blob_offset = 0;
    uint64_t key_offset = 0;
    // write the blob to the blob log.
    s = new_writer->AddRecord(record.Key(), record.Blob(), &key_offset,
                              &blob_offset, record.GetTTL());
    if (s.ok()) {
      s = new_writer->AddRecordFooter(record.GetSN());
    }
    if (!s.ok()) {
      // Never point the base DB at a blob that was not written completely.
      ROCKS_LOG_ERROR(db_options_.info_log,
                      "File: %s - relocating record failed: '%s'",
                      newfile->PathName().c_str(), s.ToString().c_str());
      break;
    }

    BlobHandle handle;
    handle.set_filenumber(newfile->BlobFileNumber());
    handle.set_size(record.Blob().size());
    handle.set_offset(blob_offset);
    handle.set_compression(bdb_options_.compression);
    handle.EncodeTo(&index_entry);

    newfile->blob_count_++;
    newfile->file_size_ += BlobLogRecord::kHeaderSize + record.Key().size() +
                           record.Blob().size() + BlobLogRecord::kFooterSize;

    std::unique_ptr<Transaction> txn(opt_db_->BeginTransaction(
        write_options_, OptimisticTransactionOptions(), nullptr);
    txn->Put(cfh, record.Key(), index_entry);
    Status s1 = txn->Commit();
    // chances that this Put will fail is low. If it fails, it would be
    // because a new version of the key came in at this time, which will
    // override the current version being iterated on.
    if (!s1.ok()) {
      ROCKS_LOG_INFO(db_options_.info_log,
                     "Optimistic transaction failed: %s put bn: %" PRIu32,
                     bfptr->PathName().c_str(), gcstats->blob_count);
    } else {
      gcstats->succ_relocs++;
      ROCKS_LOG_DEBUG(db_options_.info_log,
                      "Successfully added put back into LSM: %s bn: %" PRIu32,
                      bfptr->PathName().c_str(), gcstats->blob_count);
    }
  }

  if (gcstats->newfile) total_blob_space_ += newfile->file_size_;

  ROCKS_LOG_INFO(db_options_.info_log,
                 "File: %s Num deletes %" PRIu32 " Num relocs: %" PRIu32
                 " Succ Deletes: %" PRIu32 " Succ relocs: %" PRIu32,
                 bfptr->PathName().c_str(), gcstats->num_deletes,
                 gcstats->num_relocs, gcstats->succ_deletes_lsm,
                 gcstats->succ_relocs);

  return s;
}
// Ideally we should hold the lock during the entire function,
// but under the asusmption that this is only called when a
// file is Immutable, we can reduce the critical section
bool BlobDBImpl::ShouldGCFile(std::shared_ptr<BlobFile> bfile, std::time_t tt,
uint64_t last_id, std::string* reason) {
if (bfile->HasTTL()) {
ttlrange_t ttl_range = bfile->GetTTLRange();
if (tt > ttl_range.second) {
*reason = "entire file ttl expired";
return true;
}
if (!bfile->file_size_.load()) {
ROCKS_LOG_ERROR(db_options_.info_log, "Invalid file size = 0 %s",
bfile->PathName().c_str());
*reason = "file is empty";
return false;
}
if (bfile->gc_once_after_open_.load()) {
return true;
}
if (bdb_options_.ttl_range_secs < kPartialExpirationGCRangeSecs) {
*reason = "has ttl but partial expiration not turned on";
return false;
}
ReadLock lockbfile_r(&bfile->mutex_);
bool ret = ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) >
kPartialExpirationPercentage);
if (ret) {
*reason = "deleted blobs beyond threshold";
} else {
*reason = "deleted blobs below threshold";
}
return ret;
}
// when crash happens, we lose the in-memory account of deleted blobs.
// we are therefore forced to do one GC to make sure delete accounting
// is OK
if (bfile->gc_once_after_open_.load()) {
return true;
}
ReadLock lockbfile_r(&bfile->mutex_);
if ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) >
kPartialExpirationPercentage) {
*reason = "deleted simple blobs beyond threshold";
return true;
}
// if we haven't reached limits of disk space, don't DELETE
if (bdb_options_.blob_dir_size == 0 ||
total_blob_space_.load() < bdb_options_.blob_dir_size) {
*reason = "disk space not exceeded";
return false;
}
bool ret = bfile->BlobFileNumber() == last_id;
if (ret) {
*reason = "eligible last simple blob file";
} else {
*reason = "not eligible since not last simple blob file";
}
return ret;
}
std::pair<bool, int64_t> BlobDBImpl::DeleteObsFiles(bool aborted) {
if (aborted) return std::make_pair(false, -1);
{
ReadLock rl(&mutex_);
if (obsolete_files_.empty()) return std::make_pair(true, -1);
}
std::list<std::shared_ptr<BlobFile>> tobsolete;
{
WriteLock wl(&mutex_);
tobsolete.swap(obsolete_files_);
}
bool file_deleted = false;
for (auto iter = tobsolete.begin(); iter != tobsolete.end();) {
auto bfile = *iter;
{
ReadLock lockbfile_r(&bfile->mutex_);
if (!FileDeleteOk_SnapshotCheckLocked(bfile)) {
++iter;
continue;
}
}
Status s = env_->DeleteFile(bfile->PathName());
if (!s.ok()) {
ROCKS_LOG_ERROR(db_options_.info_log,
"File failed to be deleted as obsolete %s",
bfile->PathName().c_str());
++iter;
continue;
}
file_deleted = true;
total_blob_space_ -= bfile->file_size_;
ROCKS_LOG_INFO(db_options_.info_log,
"File deleted as obsolete from blob dir %s",
bfile->PathName().c_str());
iter = tobsolete.erase(iter);
}
// directory change. Fsync
if (file_deleted) dir_ent_->Fsync();
// put files back into obsolete if for some reason, delete failed
if (!tobsolete.empty()) {
WriteLock wl(&mutex_);
for (auto bfile : tobsolete) obsolete_files_.push_front(bfile);
}
return std::make_pair(!aborted, -1);
}
bool BlobDBImpl::CallbackEvictsImpl(std::shared_ptr<BlobFile> bfile) {
std::shared_ptr<Reader> reader =
bfile->OpenSequentialReader(env_, db_options_, env_options_);
if (!reader) {
ROCKS_LOG_ERROR(
db_options_.info_log,
"File sequential reader could not be opened for evict callback: %s",
bfile->PathName().c_str());
return false;
}
ReadLock lockbfile_r(&bfile->mutex_);
BlobLogHeader header;
Status s = reader->ReadHeader(&header);
if (!s.ok()) {
ROCKS_LOG_ERROR(
db_options_.info_log,
"Failure to read header for blob-file during evict callback %s",
bfile->PathName().c_str());
return false;
}
ColumnFamilyHandle* cfh = bfile->GetColumnFamily(db_);
BlobLogRecord record;
Reader::ReadLevel full = Reader::kReadHdrKeyBlobFooter;
while (reader->ReadRecord(&record, full).ok()) {
bdb_options_.gc_evict_cb_fn(cfh, record.Key(), record.Blob());
}
return true;
}
// Removes the first entry of cb_threads_ that refers to timer queue `tq`,
// under the write lock. Always returns {false, -1}; `aborted` is not
// consulted.
std::pair<bool, int64_t> BlobDBImpl::RemoveTimerQ(TimerQueue* tq,
                                                  bool aborted) {
  WriteLock db_guard(&mutex_);

  auto pos = cb_threads_.begin();
  while (pos != cb_threads_.end() && (*pos).get() != tq) {
    ++pos;
  }
  if (pos != cb_threads_.end()) {
    cb_threads_.erase(pos);
  }

  return std::make_pair(false, -1);
}
// Timer task run for an obsolete blob file when an eviction callback is
// configured: replays the file to the callback, then marks the file as
// deletable and puts it on the obsolete list. When `tq` is non-null, the
// removal of that timer queue is scheduled on the main timer queue.
// Always returns a "do not reschedule" result.
std::pair<bool, int64_t> BlobDBImpl::CallbackEvicts(
    TimerQueue* tq, std::shared_ptr<BlobFile> bfile, bool aborted) {
  if (aborted) return std::make_pair(false, -1);

  if (CallbackEvictsImpl(bfile)) {
    ROCKS_LOG_DEBUG(db_options_.info_log, "Eviction callbacks completed %s",
                    bfile->PathName().c_str());
  }

  WriteLock db_guard(&mutex_);
  bfile->SetCanBeDeleted();
  obsolete_files_.push_front(bfile);

  if (tq != nullptr) {
    // all of the callbacks have been processed
    tqueue_.add(0, std::bind(&BlobDBImpl::RemoveTimerQ, this, tq,
                             std::placeholders::_1));
  }

  return std::make_pair(false, -1);
}
// Appends all known blob files to `*bfiles_copy` under the read lock and
// stores in `*last_id` the file number of the last visited file that has no
// TTL (`*last_id` is left untouched if there is none).
void BlobDBImpl::CopyBlobFiles(
    std::vector<std::shared_ptr<BlobFile>>* bfiles_copy, uint64_t* last_id) {
  ReadLock db_guard(&mutex_);

  // take a copy
  bfiles_copy->reserve(blob_files_.size());
  for (auto const& entry : blob_files_) {
    const auto& file = entry.second;
    bfiles_copy->push_back(file);

    // A. has ttl is immutable, once set, hence no locks required
    // B. blob files are sorted based on number(i.e. index of creation )
    //    so we will return the last blob file
    if (file->HasTTL()) {
      continue;
    }
    *last_id = file->BlobFileNumber();
  }
}
// Picks the blob files to garbage collect in this GC run.
//
// At most `files_to_collect` files are examined. A file is examined if it
// was never examined before (gc_epoch_ == -1) or if its scheduled epoch
// equals `epoch`; its next epoch is then pushed forward. Examined files that
// are immutable, not obsolete and accepted by ShouldGCFile() are appended to
// `*to_process`.
void BlobDBImpl::FilterSubsetOfFiles(
    const std::vector<std::shared_ptr<BlobFile>>& blob_files,
    std::vector<std::shared_ptr<BlobFile>>* to_process, uint64_t epoch,
    uint64_t last_id, size_t files_to_collect) {
  // Number of epochs after which a file is examined again:
  // ceil(100 / kGCFilePercentage).
  uint64_t next_epoch_increment = static_cast<uint64_t>(
      std::ceil(100 / static_cast<double>(kGCFilePercentage)));
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  std::time_t tt = std::chrono::system_clock::to_time_t(now);

  size_t files_processed = 0;
  for (const auto& bfile : blob_files) {
    if (files_processed >= files_to_collect) break;
    // if this is the first time processing the file
    // i.e. gc_epoch == -1, process it.
    // else process the file if its processing epoch matches
    // the current epoch. Typically the #of epochs should be
    // around 5-10
    if (bfile->gc_epoch_ != -1 && (uint64_t)bfile->gc_epoch_ != epoch) {
      continue;
    }

    files_processed++;
    // reset the epoch
    bfile->gc_epoch_ = epoch + next_epoch_increment;

    // file has already been GC'd or is still open for append,
    // then it should not be GC'd
    if (bfile->Obsolete() || !bfile->Immutable()) continue;

    std::string reason;
    bool shouldgc = ShouldGCFile(bfile, tt, last_id, &reason);

    // time_t and the TTL bound are not `int`; printing them with "%d" is a
    // format mismatch, so widen them explicitly and use 64-bit specifiers.
    const int64_t now_secs = static_cast<int64_t>(tt);
    const uint64_t ttl_end = static_cast<uint64_t>(bfile->GetTTLRange().second);

    if (!shouldgc) {
      ROCKS_LOG_DEBUG(db_options_.info_log,
                      "File has been skipped for GC ttl %s %" PRId64
                      " %" PRIu64 " reason='%s'",
                      bfile->PathName().c_str(), now_secs, ttl_end,
                      reason.c_str());
      continue;
    }

    ROCKS_LOG_INFO(db_options_.info_log,
                   "File has been chosen for GC ttl %s %" PRId64 " %" PRIu64
                   " reason='%s'",
                   bfile->PathName().c_str(), now_secs, ttl_end,
                   reason.c_str());
    to_process->push_back(bfile);
  }
}
// Periodic task driving garbage collection.
//
// Advances the GC epoch, selects a subset of the blob files, garbage
// collects each selected file and then retires the collected files: they are
// removed from blob_files_ and either put on the obsolete list directly or,
// when an eviction callback is configured, handed to a dedicated timer queue
// that runs the callback first.
// Files collected for the first time after open only get their delete
// accounting refreshed and are not retired.
// Returns {false, -1} if aborted, {true, -1} to be rescheduled.
std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
  if (aborted) return std::make_pair(false, -1);

  current_epoch_++;

  // collect the ID of the last regular file, in case we need to GC it.
  uint64_t last_id = std::numeric_limits<uint64_t>::max();

  std::vector<std::shared_ptr<BlobFile>> blob_files;
  CopyBlobFiles(&blob_files, &last_id);

  if (!blob_files.size()) return std::make_pair(true, -1);

  // A fraction (kGCFilePercentage percent) of the files is collected each
  // call to space out the IO and CPU consumption. Always examine at least one
  // file: with integer truncation a small number of files would otherwise
  // yield 0 and garbage collection would never run.
  size_t files_to_collect = std::max<size_t>(
      1, (kGCFilePercentage * blob_files.size()) / 100);

  std::vector<std::shared_ptr<BlobFile>> to_process;
  FilterSubsetOfFiles(blob_files, &to_process, current_epoch_, last_id,
                      files_to_collect);

  // in this collect the set of files, which became obsolete
  std::vector<std::shared_ptr<BlobFile>> obsoletes;
  for (auto bfile : to_process) {
    GCStats gcstats;
    Status s = GCFileAndUpdateLSM(bfile, &gcstats);
    if (!s.ok()) continue;

    if (bfile->gc_once_after_open_.load()) {
      WriteLock lockbfile_w(&bfile->mutex_);

      bfile->deleted_size_ = gcstats.deleted_size;
      bfile->deleted_count_ = gcstats.num_deletes;
      bfile->gc_once_after_open_ = false;
    } else {
      obsoletes.push_back(bfile);
    }
  }

  if (!obsoletes.empty()) {
    bool evict_cb = (!!bdb_options_.gc_evict_cb_fn);
    std::shared_ptr<TimerQueue> tq;
    if (evict_cb) tq = std::make_shared<TimerQueue>();

    // if evict callback is present, first schedule the callback thread
    WriteLock wl(&mutex_);
    for (auto bfile : obsoletes) {
      // Only the task of the last file gets the queue pointer, so that the
      // queue removal is scheduled once.
      bool last_file = (bfile == obsoletes.back());
      // remove from global list so writers
      blob_files_.erase(bfile->BlobFileNumber());

      if (!evict_cb) {
        bfile->SetCanBeDeleted();
        obsolete_files_.push_front(bfile);
      } else {
        tq->add(0, std::bind(&BlobDBImpl::CallbackEvicts, this,
                             (last_file) ? tq.get() : nullptr, bfile,
                             std::placeholders::_1));
      }
    }
    if (evict_cb) cb_threads_.emplace_back(tq);
  }

  // reschedule
  return std::make_pair(true, -1);
}
// Creates an iterator over `column_family` that wraps the base DB iterator
// in a BlobDBIterator. The caller receives a newly allocated object.
Iterator* BlobDBImpl::NewIterator(const ReadOptions& opts,
                                  ColumnFamilyHandle* column_family) {
  Iterator* base_iter = db_->NewIterator(opts, column_family);
  return new BlobDBIterator(base_iter, column_family, this);
}
// Destroys a blob DB: deletes all blob files in the blob directory, removes
// the directory and then destroys the base DB.
// Returns the first error encountered while deleting a blob file or while
// destroying the base DB; the results of listing and removing the blob
// directory are not checked.
Status DestroyBlobDB(const std::string& dbname, const Options& options,
                     const BlobDBOptions& bdb_options) {
  const ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
  Env* env = soptions.env;

  // The blob directory is either relative to the DB directory or absolute.
  std::string blobdir;
  if (bdb_options.path_relative) {
    blobdir = dbname + "/" + bdb_options.blob_dir;
  } else {
    blobdir = bdb_options.blob_dir;
  }

  Status first_error;

  std::vector<std::string> children;
  env->GetChildren(blobdir, &children);

  for (const auto& child : children) {
    uint64_t number;
    FileType type;
    const bool is_blob_file =
        ParseFileName(child, &number, &type) && type == kBlobFile;
    if (!is_blob_file) {
      continue;
    }

    Status del = env->DeleteFile(blobdir + "/" + child);
    if (first_error.ok() && !del.ok()) {
      first_error = del;
    }
  }
  env->DeleteDir(blobdir);

  Status destroy = DestroyDB(dbname, options);
  if (first_error.ok() && !destroy.ok()) {
    first_error = destroy;
  }

  return first_error;
}
#ifndef NDEBUG
// Test helper: looks up `key` in the default column family and stores the
// sequence number recorded in its blob record footer in `*sequence`.
Status BlobDBImpl::TEST_GetSequenceNumber(const Slice& key,
                                          SequenceNumber* sequence) {
  std::string handle_bytes;
  Status lookup = db_->Get(ReadOptions(), key, &handle_bytes);
  if (lookup.ok()) {
    auto* default_cfh =
        reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
    // A null value pointer asks CommonGet for the sequence number only.
    return CommonGet(default_cfh->cfd(), key, handle_bytes, nullptr, sequence);
  }
  return lookup;
}
std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetBlobFiles() const {
std::vector<std::shared_ptr<BlobFile>> blob_files;
for (auto& p : blob_files_) {
blob_files.emplace_back(p.second);
}
return blob_files;
}
// Test helper: closes `bfile` for writing right away via CloseSeqWrite().
void BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr<BlobFile>& bfile) {
  const bool abort = false;
  CloseSeqWrite(bfile, abort);
}
// Test helper: runs garbage collection on a single blob file and reports the
// collected statistics through `gc_stats`.
Status BlobDBImpl::TEST_GCFileAndUpdateLSM(std::shared_ptr<BlobFile>& bfile,
                                           GCStats* gc_stats) {
  Status gc_status = GCFileAndUpdateLSM(bfile, gc_stats);
  return gc_status;
}
#endif // !NDEBUG
} // namespace blob_db
} // namespace rocksdb
#endif // ROCKSDB_LITE