Blob storage pr
Summary: The final pull request for Blob Storage.
Closes https://github.com/facebook/rocksdb/pull/2269
Differential Revision: D5033189
Pulled By: yiwu-arbug
fbshipit-source-id: 6356b683ccd58cbf38a1dc55e2ea400feecd5d06

commit d85ff4953c (parent 492fc49a86)
@@ -446,6 +446,10 @@ set(SOURCES
        util/xxhash.cc
        utilities/backupable/backupable_db.cc
        utilities/blob_db/blob_db.cc
+        utilities/blob_db/blob_db_impl.cc
+        utilities/blob_db/blob_log_reader.cc
+        utilities/blob_db/blob_log_writer.cc
+        utilities/blob_db/blob_log_format.cc
        utilities/checkpoint/checkpoint_impl.cc
        utilities/col_buf_decoder.cc
        utilities/col_buf_encoder.cc
@@ -658,6 +662,7 @@ set(TESTS
        util/heap_test.cc
        util/rate_limiter_test.cc
        util/slice_transform_test.cc
+        util/timer_queue_test.cc
        util/thread_list_test.cc
        util/thread_local_test.cc
        utilities/backupable/backupable_db_test.cc
Makefile | 5

@@ -403,7 +403,6 @@ TESTS = \
 	ttl_test \
 	date_tiered_test \
 	backupable_db_test \
-	blob_db_test \
 	document_db_test \
 	json_document_test \
 	sim_cache_test \
@@ -424,6 +423,7 @@ TESTS = \
 	options_settable_test \
 	options_util_test \
 	event_logger_test \
+	timer_queue_test \
 	cuckoo_table_builder_test \
 	cuckoo_table_reader_test \
 	cuckoo_table_db_test \
@@ -1307,6 +1307,9 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS
 event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(AM_LINK)

+timer_queue_test: util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
 sst_dump_test: tools/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(AM_LINK)

TARGETS | 6

@@ -196,6 +196,12 @@ cpp_library(
        "util/xxhash.cc",
        "utilities/backupable/backupable_db.cc",
        "utilities/blob_db/blob_db.cc",
+        "utilities/blob_db/blob_db_impl.cc",
+        "utilities/blob_db/blob_db_options_impl.cc",
+        "utilities/blob_db/blob_file.cc",
+        "utilities/blob_db/blob_log_reader.cc",
+        "utilities/blob_db/blob_log_writer.cc",
+        "utilities/blob_db/blob_log_format.cc",
        "utilities/checkpoint/checkpoint_impl.cc",
        "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc",
        "utilities/convenience/info_log_finder.cc",
@@ -93,7 +93,9 @@ CompactionIterator::CompactionIterator(
     latest_snapshot_ = snapshots_->back();
   }
   if (compaction_filter_ != nullptr) {
-    if (compaction_filter_->IgnoreSnapshots()) ignore_snapshots_ = true;
+    if (compaction_filter_->IgnoreSnapshots()) {
+      ignore_snapshots_ = true;
+    }
   } else {
     ignore_snapshots_ = false;
   }
@@ -1577,6 +1577,14 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
   delete casted_s;
 }

+bool DBImpl::HasActiveSnapshotLaterThanSN(SequenceNumber sn) {
+  InstrumentedMutexLock l(&mutex_);
+  if (snapshots_.empty()) {
+    return false;
+  }
+  return (snapshots_.newest()->GetSequenceNumber() > sn);
+}
+
 #ifndef ROCKSDB_LITE
 Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
                                         TablePropertiesCollection* props) {
@@ -1821,6 +1829,20 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
   return cf_memtables->GetColumnFamilyHandle();
 }

+// REQUIRED: mutex is NOT held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked(
+    uint32_t column_family_id) {
+  ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+  InstrumentedMutexLock l(&mutex_);
+
+  if (!cf_memtables->Seek(column_family_id)) {
+    return nullptr;
+  }
+
+  return cf_memtables->GetColumnFamilyHandle();
+}
+
 void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
                                          const Range& range,
                                          uint64_t* const count,
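
The new DBImpl::HasActiveSnapshotLaterThanSN() answers one narrow question: is the newest live snapshot newer than a given sequence number? Below is a minimal sketch of how a caller might consult it before reclaiming overwritten data; the helper name CanReclaim() and the surrounding policy are hypothetical illustrations, only the DBImpl method itself comes from this commit.

#include "db/db_impl.h"

// Sketch only. HasActiveSnapshotLaterThanSN(sn) returns true iff the newest
// live snapshot has a sequence number greater than sn (see the hunk above).
bool CanReclaim(rocksdb::DBImpl* db_impl, rocksdb::SequenceNumber candidate_sn) {
  if (db_impl->HasActiveSnapshotLaterThanSN(candidate_sn)) {
    // Some snapshot postdates this sequence number; a conservative caller
    // keeps the old data around.
    return false;
  }
  return true;
}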
@@ -203,6 +203,8 @@ class DBImpl : public DB {

   virtual SequenceNumber GetLatestSequenceNumber() const override;

+  bool HasActiveSnapshotLaterThanSN(SequenceNumber sn);
+
 #ifndef ROCKSDB_LITE
   using DB::ResetStats;
   virtual Status ResetStats() override;
@@ -465,6 +467,9 @@ class DBImpl : public DB {
   // mutex is released.
   ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);

+  // Same as above, should called without mutex held and not on write thread.
+  ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id);
+
   // Returns the number of currently running flushes.
   // REQUIREMENT: mutex_ must be held when calling this function.
   int num_running_flushes() {
env/env_posix.cc | 21

@@ -253,13 +253,14 @@ class PosixEnv : public Env {
     return s;
   }

-  virtual Status NewWritableFile(const std::string& fname,
+  virtual Status OpenWritableFile(const std::string& fname,
                                  unique_ptr<WritableFile>* result,
-                                 const EnvOptions& options) override {
+                                 const EnvOptions& options,
+                                 bool reopen = false) {
     result->reset();
     Status s;
     int fd = -1;
-    int flags = O_CREAT | O_TRUNC;
+    int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC);
     // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
     if (options.use_direct_writes && !options.use_mmap_writes) {
       // Note: we should avoid O_APPEND here due to ta the following bug:
@@ -333,6 +334,18 @@ class PosixEnv : public Env {
     return s;
   }

+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) override {
+    return OpenWritableFile(fname, result, options, false);
+  }
+
+  virtual Status ReopenWritableFile(const std::string& fname,
+                                    unique_ptr<WritableFile>* result,
+                                    const EnvOptions& options) override {
+    return OpenWritableFile(fname, result, options, true);
+  }
+
   virtual Status ReuseWritableFile(const std::string& fname,
                                    const std::string& old_fname,
                                    unique_ptr<WritableFile>* result,
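
The PosixEnv change routes both entry points through one OpenWritableFile() helper: NewWritableFile() keeps the truncating O_CREAT | O_TRUNC behavior, while the new ReopenWritableFile() opens with O_CREAT | O_APPEND so existing contents (for example a blob log) survive a restart. The sketch below shows the intended calling pattern through the generic Env interface; the file name and the helper function are illustrative, not taken from this commit.

#include <memory>
#include "rocksdb/env.h"
#include "rocksdb/slice.h"

// Sketch: append a record to a log-style file across process restarts.
// "example.blob" is a made-up name; Env::ReopenWritableFile() is the entry
// point this commit implements in PosixEnv via OpenWritableFile(..., true).
rocksdb::Status AppendRecord(rocksdb::Env* env, const rocksdb::Slice& record) {
  rocksdb::EnvOptions env_options;
  std::unique_ptr<rocksdb::WritableFile> file;
  // Unlike NewWritableFile(), this does not truncate: new writes land after
  // whatever was already persisted in the file.
  rocksdb::Status s = env->ReopenWritableFile("example.blob", &file, env_options);
  if (!s.ok()) {
    return s;
  }
  s = file->Append(record);
  if (s.ok()) {
    s = file->Sync();
  }
  return s;
}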
@@ -468,8 +468,6 @@ class SequentialFile {
   // aligned buffer for Direct I/O
   virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }

-  virtual void Rewind() {}
-
   // Remove any kind of caching of data from the offset to offset+length
   // of this file. If the length is 0, then it refers to the end of file.
   // If the system is not caching the file contents, then this is a noop.
src.mk | 7

@@ -150,6 +150,12 @@ LIB_SOURCES = \
   util/xxhash.cc \
   utilities/backupable/backupable_db.cc \
   utilities/blob_db/blob_db.cc \
+  utilities/blob_db/blob_db_impl.cc \
+  utilities/blob_db/blob_db_options_impl.cc \
+  utilities/blob_db/blob_file.cc \
+  utilities/blob_db/blob_log_reader.cc \
+  utilities/blob_db/blob_log_writer.cc \
+  utilities/blob_db/blob_log_format.cc \
   utilities/checkpoint/checkpoint_impl.cc \
   utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \
   utilities/convenience/info_log_finder.cc \
@@ -308,6 +314,7 @@ MAIN_SOURCES = \
   util/log_write_bench.cc \
   util/rate_limiter_test.cc \
   util/slice_transform_test.cc \
+  util/timer_queue_test.cc \
   util/thread_list_test.cc \
   util/thread_local_test.cc \
   utilities/backupable/backupable_db_test.cc \
@@ -583,6 +583,10 @@ DEFINE_bool(optimistic_transaction_db, false,
             "Open a OptimisticTransactionDB instance. "
             "Required for randomtransaction benchmark.");

+DEFINE_bool(use_blob_db, false,
+            "Open a BlobDB instance. "
+            "Required for largevalue benchmark.");
+
 DEFINE_bool(transaction_db, false,
             "Open a TransactionDB instance. "
             "Required for randomtransaction benchmark.");
@@ -630,8 +634,6 @@ DEFINE_bool(report_bg_io_stats, false,
 DEFINE_bool(use_stderr_info_logger, false,
             "Write info logs to stderr instead of to LOG file. ");

-DEFINE_bool(use_blob_db, false, "Whether to use BlobDB. ");
-
 static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
   assert(ctype);

@@ -1128,6 +1130,15 @@ class RandomGenerator {
     pos_ += len;
     return Slice(data_.data() + pos_ - len, len);
   }
+
+  Slice GenerateWithTTL(unsigned int len) {
+    assert(len <= data_.size());
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
 };

 static void AppendWithSpace(std::string* str, Slice msg) {
@@ -3227,9 +3238,14 @@ void VerifyDBFromDB(std::string& truth_db_name) {
       if (s.ok()) {
         db->db = ptr;
       }
-#endif  // ROCKSDB_LITE
     } else if (FLAGS_use_blob_db) {
-      s = NewBlobDB(options, db_name, &db->db);
+      blob_db::BlobDBOptions blob_db_options;
+      blob_db::BlobDB* ptr;
+      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
+      if (s.ok()) {
+        db->db = ptr;
+      }
+#endif  // ROCKSDB_LITE
     } else {
       s = DB::Open(options, db_name, &db->db);
     }
@@ -3406,8 +3422,12 @@ void VerifyDBFromDB(std::string& truth_db_name) {
         int64_t rand_num = key_gens[id]->Next();
         GenerateKeyFromInt(rand_num, FLAGS_num, &key);
         if (FLAGS_use_blob_db) {
-          s = db_with_cfh->db->Put(write_options_, key,
-                                   gen.Generate(value_size_));
+          Slice val = gen.Generate(value_size_);
+          int ttl = rand() % 86400;
+          blob_db::BlobDB* blobdb =
+              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
+          s = blobdb->PutWithTTL(write_options_, key, val, ttl);
+
         } else if (FLAGS_num_column_families <= 1) {
           batch.Put(key, gen.Generate(value_size_));
         } else {
@@ -48,6 +48,8 @@ class SequentialFileReader {

   Status Skip(uint64_t n);

+  void Rewind();
+
   SequentialFile* file() { return file_.get(); }

   bool use_direct_io() const { return file_->use_direct_io(); }
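
SequentialFileReader gains a Rewind() call (while the old SequentialFile::Rewind() stub in env.h is removed), so a scanner can restart from offset zero without reopening the file. A small sketch of that pattern follows; the two-pass scenario is made up for illustration, only Read() and the new Rewind() are the interfaces touched here.

#include "rocksdb/slice.h"
#include "util/file_reader_writer.h"

// Sketch only: read the same file twice with one SequentialFileReader.
rocksdb::Status ScanTwice(rocksdb::SequentialFileReader* reader) {
  char scratch[4096];
  rocksdb::Slice chunk;
  // First pass, e.g. to validate headers or count records.
  rocksdb::Status s = reader->Read(sizeof(scratch), &chunk, scratch);
  if (!s.ok()) {
    return s;
  }
  // Jump back to the beginning of the file instead of reopening it.
  reader->Rewind();
  // Second pass, this time actually consuming the data.
  return reader->Read(sizeof(scratch), &chunk, scratch);
}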
util/mpsc.h | 158 (new file)

// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Large parts of this file is borrowed from the public domain code below.
// from https://github.com/mstump/queues

// C++ implementation of Dmitry Vyukov's non-intrusive
// lock free unbound MPSC queue
// http://www.1024cores.net/home/
// lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue

// License from mstump/queues
// This is free and unencumbered software released into the public domain.
//
// Anyone is free to copy, modify, publish, use, compile, sell, or
// distribute this software, either in source code form or as a compiled
// binary, for any purpose, commercial or non-commercial, and by any
// means.
//
// In jurisdictions that recognize copyright laws, the author or authors
// of this software dedicate any and all copyright interest in the
// software to the public domain. We make this dedication for the benefit
// of the public at large and to the detriment of our heirs and
// successors. We intend this dedication to be an overt act of
// relinquishment in perpetuity of all present and future rights to this
// software under copyright law.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// For more information, please refer to <http://unlicense.org>

// License from http://www.1024cores.net/home/
// lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue
// Copyright (c) 2010-2011 Dmitry Vyukov. All rights reserved.
// Redistribution and use in source and binary forms, with or
// without modification, are permitted provided that the following
// conditions are met:
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY DMITRY VYUKOV "AS IS" AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL DMITRY VYUKOV OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// The views and conclusions contained in the software and documentation
// are those of the authors and should not be interpreted as representing
// official policies, either expressed or implied, of Dmitry Vyukov.
//

#ifndef UTIL_MPSC_H_
#define UTIL_MPSC_H_

#include <atomic>
#include <cassert>
#include <type_traits>

/**
 * Multiple Producer Single Consumer Lockless Q
 */
template <typename T>
class mpsc_queue_t {
 public:
  struct buffer_node_t {
    T data;
    std::atomic<buffer_node_t*> next;
  };

  mpsc_queue_t() {
    buffer_node_aligned_t* al_st = new buffer_node_aligned_t;
    buffer_node_t* node = new (al_st) buffer_node_t();
    _head.store(node);
    _tail.store(node);

    node->next.store(nullptr, std::memory_order_relaxed);
  }

  ~mpsc_queue_t() {
    T output;
    while (this->dequeue(&output)) {
    }
    buffer_node_t* front = _head.load(std::memory_order_relaxed);
    front->~buffer_node_t();

    ::operator delete(front);
  }

  void enqueue(const T& input) {
    buffer_node_aligned_t* al_st = new buffer_node_aligned_t;
    buffer_node_t* node = new (al_st) buffer_node_t();

    node->data = input;
    node->next.store(nullptr, std::memory_order_relaxed);

    buffer_node_t* prev_head = _head.exchange(node, std::memory_order_acq_rel);
    prev_head->next.store(node, std::memory_order_release);
  }

  bool dequeue(T* output) {
    buffer_node_t* tail = _tail.load(std::memory_order_relaxed);
    buffer_node_t* next = tail->next.load(std::memory_order_acquire);

    if (next == nullptr) {
      return false;
    }

    *output = next->data;
    _tail.store(next, std::memory_order_release);

    tail->~buffer_node_t();

    ::operator delete(tail);
    return true;
  }

  // you can only use pop_all if the queue is SPSC
  buffer_node_t* pop_all() {
    // nobody else can move the tail pointer.
    buffer_node_t* tptr = _tail.load(std::memory_order_relaxed);
    buffer_node_t* next =
        tptr->next.exchange(nullptr, std::memory_order_acquire);
    _head.exchange(tptr, std::memory_order_acquire);

    // there is a race condition here
    return next;
  }

 private:
  typedef typename std::aligned_storage<
      sizeof(buffer_node_t), std::alignment_of<buffer_node_t>::value>::type
      buffer_node_aligned_t;

  std::atomic<buffer_node_t*> _head;
  std::atomic<buffer_node_t*> _tail;

  mpsc_queue_t(const mpsc_queue_t&) = delete;
  mpsc_queue_t& operator=(const mpsc_queue_t&) = delete;
};

#endif  // UTIL_MPSC_H_
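
util/mpsc.h is header-only, so the quickest way to see its contract is a tiny driver: several producer threads call enqueue(), a single consumer drains with dequeue(). The following sketch is written for illustration (the thread and item counts are arbitrary and not part of the commit):

#include <cstdio>
#include <thread>
#include <vector>
#include "util/mpsc.h"

int main() {
  mpsc_queue_t<int> q;

  // Multiple producers may call enqueue() concurrently.
  std::vector<std::thread> producers;
  for (int t = 0; t < 4; ++t) {
    producers.emplace_back([&q, t] {
      for (int i = 0; i < 1000; ++i) {
        q.enqueue(t * 1000 + i);
      }
    });
  }
  for (auto& th : producers) {
    th.join();
  }

  // Only a single consumer may call dequeue(); here it runs after the
  // producers finished, so it simply drains everything that was enqueued.
  int value = 0;
  int drained = 0;
  while (q.dequeue(&value)) {
    ++drained;
  }
  std::printf("drained %d items\n", drained);  // expect 4000
  return 0;
}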
util/timer_queue.h | 217 (new file)

// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Borrowed from
// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
// Timer Queue
//
// License
//
// The source code in this article is licensed under the CC0 license, so feel
// free to copy, modify, share, do whatever you want with it.
// No attribution is required, but I'll be happy if you do.
// CC0 license

// The person who associated a work with this deed has dedicated the work to the
// public domain by waiving all of his or her rights to the work worldwide
// under copyright law, including all related and neighboring rights, to the
// extent allowed by law. You can copy, modify, distribute and perform the
// work, even for commercial purposes, all without asking permission.

#pragma once
#include <assert.h>
#include <chrono>
#include <condition_variable>
#include <functional>
#include <queue>
#include <thread>
#include <utility>
#include <vector>

// Allows execution of handlers at a specified time in the future
// Guarantees:
//  - All handlers are executed ONCE, even if cancelled (aborted parameter will
//    be set to true)
//  - If TimerQueue is destroyed, it will cancel all handlers.
//  - Handlers are ALWAYS executed in the Timer Queue worker thread.
//  - Handlers execution order is NOT guaranteed
//
////////////////////////////////////////////////////////////////////////////////
// borrowed from
// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
class TimerQueue {
 public:
  TimerQueue() : m_th(&TimerQueue::run, this) {}

  ~TimerQueue() {
    cancelAll();
    // Abusing the timer queue to trigger the shutdown.
    add(0, [this](bool) {
      m_finish = true;
      return std::make_pair(false, 0);
    });
    m_th.join();
  }

  // Adds a new timer
  // \return
  //  Returns the ID of the new timer. You can use this ID to cancel the
  //  timer
  uint64_t add(int64_t milliseconds,
               std::function<std::pair<bool, int64_t>(bool)> handler) {
    WorkItem item;
    Clock::time_point tp = Clock::now();
    item.end = tp + std::chrono::milliseconds(milliseconds);
    item.period = milliseconds;
    item.handler = std::move(handler);

    std::unique_lock<std::mutex> lk(m_mtx);
    uint64_t id = ++m_idcounter;
    item.id = id;
    m_items.push(std::move(item));

    // Something changed, so wake up timer thread
    m_checkWork.notify_one();
    return id;
  }

  // Cancels the specified timer
  // \return
  //  1 if the timer was cancelled.
  //  0 if you were too late to cancel (or the timer ID was never valid to
  //  start with)
  size_t cancel(uint64_t id) {
    // Instead of removing the item from the container (thus breaking the
    // heap integrity), we set the item as having no handler, and put
    // that handler on a new item at the top for immediate execution
    // The timer thread will then ignore the original item, since it has no
    // handler.
    std::unique_lock<std::mutex> lk(m_mtx);
    for (auto&& item : m_items.getContainer()) {
      if (item.id == id && item.handler) {
        WorkItem newItem;
        // Zero time, so it stays at the top for immediate execution
        newItem.end = Clock::time_point();
        newItem.id = 0;  // Means it is a canceled item
        // Move the handler from item to newitem (thus clearing item)
        newItem.handler = std::move(item.handler);
        m_items.push(std::move(newItem));

        // Something changed, so wake up timer thread
        m_checkWork.notify_one();
        return 1;
      }
    }
    return 0;
  }

  // Cancels all timers
  // \return
  //  The number of timers cancelled
  size_t cancelAll() {
    // Setting all "end" to 0 (for immediate execution) is ok,
    // since it maintains the heap integrity
    std::unique_lock<std::mutex> lk(m_mtx);
    m_cancel = true;
    for (auto&& item : m_items.getContainer()) {
      if (item.id && item.handler) {
        item.end = Clock::time_point();
        item.id = 0;
      }
    }
    auto ret = m_items.size();

    m_checkWork.notify_one();
    return ret;
  }

 private:
  using Clock = std::chrono::steady_clock;
  TimerQueue(const TimerQueue&) = delete;
  TimerQueue& operator=(const TimerQueue&) = delete;

  void run() {
    std::unique_lock<std::mutex> lk(m_mtx);
    while (!m_finish) {
      auto end = calcWaitTime_lock();
      if (end.first) {
        // Timers found, so wait until it expires (or something else
        // changes)
        m_checkWork.wait_until(lk, end.second);
      } else {
        // No timers exist, so wait forever until something changes
        m_checkWork.wait(lk);
      }

      // Check and execute as much work as possible, such as, all expired
      // timers
      checkWork(&lk);
    }

    // If we are shutting down, we should not have any items left,
    // since the shutdown cancels all items
    assert(m_items.size() == 0);
  }

  std::pair<bool, Clock::time_point> calcWaitTime_lock() {
    while (m_items.size()) {
      if (m_items.top().handler) {
        // Item present, so return the new wait time
        return std::make_pair(true, m_items.top().end);
      } else {
        // Discard empty handlers (they were cancelled)
        m_items.pop();
      }
    }

    // No items found, so return no wait time (causes the thread to wait
    // indefinitely)
    return std::make_pair(false, Clock::time_point());
  }

  void checkWork(std::unique_lock<std::mutex>* lk) {
    while (m_items.size() && m_items.top().end <= Clock::now()) {
      WorkItem item(m_items.top());
      m_items.pop();

      if (item.handler) {
        (*lk).unlock();
        auto reschedule_pair = item.handler(item.id == 0);
        (*lk).lock();
        if (!m_cancel && reschedule_pair.first) {
          int64_t new_period = (reschedule_pair.second == -1)
                                   ? item.period
                                   : reschedule_pair.second;

          item.period = new_period;
          item.end = Clock::now() + std::chrono::milliseconds(new_period);
          m_items.push(std::move(item));
        }
      }
    }
  }

  bool m_finish = false;
  bool m_cancel = false;
  uint64_t m_idcounter = 0;
  std::condition_variable m_checkWork;

  struct WorkItem {
    Clock::time_point end;
    int64_t period;
    uint64_t id;  // id==0 means it was cancelled
    std::function<std::pair<bool, int64_t>(bool)> handler;
    bool operator>(const WorkItem& other) const { return end > other.end; }
  };

  std::mutex m_mtx;
  // Inheriting from priority_queue, so we can access the internal container
  class Queue : public std::priority_queue<WorkItem, std::vector<WorkItem>,
                                           std::greater<WorkItem>> {
   public:
    std::vector<WorkItem>& getContainer() { return this->c; }
  } m_items;
  std::thread m_th;
};
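
TimerQueue handlers receive an `aborted` flag and return a (reschedule, period_ms) pair, so a periodic job is simply a handler that keeps returning true until it is cancelled or the queue shuts down. A small usage sketch (the printed "tick" task is made up; the add()/cancel() API is the one defined above):

#include <cstdio>
#include <thread>
#include "util/timer_queue.h"

int main() {
  TimerQueue tq;

  // Fire roughly every 500 ms until cancelled. Returning {true, -1} keeps the
  // previous period; returning {false, 0} stops rescheduling.
  uint64_t id = tq.add(500, [](bool aborted) {
    if (aborted) {
      return std::make_pair(false, int64_t(0));
    }
    std::printf("periodic tick\n");
    return std::make_pair(true, int64_t(-1));
  });

  std::this_thread::sleep_for(std::chrono::seconds(2));
  tq.cancel(id);  // the handler runs once more, with aborted == true
  return 0;       // ~TimerQueue cancels anything left and joins the worker
}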
util/timer_queue_test.cc | 72 (new file)

// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

// borrowed from
// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
// Timer Queue
//
// License
//
// The source code in this article is licensed under the CC0 license, so feel
// free
// to copy, modify, share, do whatever you want with it.
// No attribution is required, but I'll be happy if you do.
// CC0 license

// The person who associated a work with this deed has dedicated the work to the
// public domain by waiving all of his or her rights to the work worldwide
// under copyright law, including all related and neighboring rights, to the
// extent allowed by law. You can copy, modify, distribute and perform the
// work, even for
// commercial purposes, all without asking permission. See Other Information
// below.
//

#include "util/timer_queue.h"
#include <future>

namespace Timing {

using Clock = std::chrono::high_resolution_clock;
double now() {
  static auto start = Clock::now();
  return std::chrono::duration<double, std::milli>(Clock::now() - start)
      .count();
}

}  // namespace Timing

int main() {
  TimerQueue q;

  double tnow = Timing::now();

  q.add(10000, [tnow](bool aborted) mutable {
    printf("T 1: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
    return std::make_pair(false, 0);
  });
  q.add(10001, [tnow](bool aborted) mutable {
    printf("T 2: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
    return std::make_pair(false, 0);
  });

  q.add(1000, [tnow](bool aborted) mutable {
    printf("T 3: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
    return std::make_pair(!aborted, 1000);
  });

  auto id = q.add(2000, [tnow](bool aborted) mutable {
    printf("T 4: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
    return std::make_pair(!aborted, 2000);
  });

  (void)id;
  // auto ret = q.cancel(id);
  // assert(ret == 1);
  // q.cancelAll();

  return 0;
}
//////////////////////////////////////////
@@ -1,12 +1,15 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-#include "utilities/blob_db/blob_db.h"
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
 #ifndef ROCKSDB_LITE

+#include "utilities/blob_db/blob_db.h"
 #include "db/write_batch_internal.h"
 #include "monitoring/instrumented_mutex.h"
 #include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
@@ -17,194 +20,152 @@
 #include "util/crc32c.h"
 #include "util/file_reader_writer.h"
 #include "util/filename.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
 namespace rocksdb {

-namespace {
-int kBlockBasedTableVersionFormat = 2;
-}  // namespace
-
-class BlobDB : public StackableDB {
- public:
-  using rocksdb::StackableDB::Put;
-  Status Put(const WriteOptions& options, const Slice& key,
-             const Slice& value) override;
-
-  using rocksdb::StackableDB::Get;
-  Status Get(const ReadOptions& options, const Slice& key,
-             std::string* value) override;
-
-  Status Open();
-
-  explicit BlobDB(DB* db);
-
- private:
-  std::string dbname_;
-  ImmutableCFOptions ioptions_;
-  InstrumentedMutex mutex_;
-  std::unique_ptr<RandomAccessFileReader> file_reader_;
-  std::unique_ptr<WritableFileWriter> file_writer_;
-  size_t writer_offset_;
-  size_t next_sync_offset_;
-
-  static const std::string kFileName;
-  static const size_t kBlockHeaderSize;
-  static const size_t kBytesPerSync;
-};
-
-Status NewBlobDB(Options options, std::string dbname, DB** blob_db) {
-  DB* db;
-  Status s = DB::Open(options, dbname, &db);
-  if (!s.ok()) {
-    return s;
-  }
-  BlobDB* bdb = new BlobDB(db);
-  s = bdb->Open();
+namespace blob_db {
+port::Mutex listener_mutex;
+typedef std::shared_ptr<BlobDBFlushBeginListener> FlushBeginListener_t;
+typedef std::shared_ptr<BlobReconcileWalFilter> ReconcileWalFilter_t;
+typedef std::shared_ptr<EvictAllVersionsCompactionListener>
+    CompactionListener_t;
+
+// to ensure the lifetime of the listeners
+std::vector<std::shared_ptr<EventListener>> all_blobdb_listeners;
+std::vector<ReconcileWalFilter_t> all_wal_filters;
+
+Status BlobDB::OpenAndLoad(const Options& options,
+                           const BlobDBOptions& bdb_options,
+                           const std::string& dbname, BlobDB** blob_db,
+                           Options* changed_options) {
+  *changed_options = options;
+  *blob_db = nullptr;
+
+  FlushBeginListener_t fblistener =
+      std::make_shared<BlobDBFlushBeginListener>();
+  ReconcileWalFilter_t rw_filter = std::make_shared<BlobReconcileWalFilter>();
+  CompactionListener_t ce_listener =
+      std::make_shared<EvictAllVersionsCompactionListener>();
+
+  {
+    MutexLock l(&listener_mutex);
+    all_blobdb_listeners.push_back(fblistener);
+    all_blobdb_listeners.push_back(ce_listener);
+    all_wal_filters.push_back(rw_filter);
+  }
+
+  changed_options->listeners.emplace_back(fblistener);
+  changed_options->listeners.emplace_back(ce_listener);
+  changed_options->wal_filter = rw_filter.get();
+
+  DBOptions db_options(*changed_options);
+
+  // we need to open blob db first so that recovery can happen
+  BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options);
+
+  fblistener->SetImplPtr(bdb);
+  ce_listener->SetImplPtr(bdb);
+  rw_filter->SetImplPtr(bdb);
+
+  Status s = bdb->OpenPhase1();
+  if (!s.ok()) return s;
+
+  *blob_db = bdb;
+  return s;
+}
+
+Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options,
+                    const std::string& dbname, BlobDB** blob_db) {
+  *blob_db = nullptr;
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = BlobDB::Open(db_options, bdb_options, dbname, column_families,
+                          &handles, blob_db);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a reference to
+    // default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status BlobDB::Open(const DBOptions& db_options,
+                    const BlobDBOptions& bdb_options, const std::string& dbname,
+                    const std::vector<ColumnFamilyDescriptor>& column_families,
+                    std::vector<ColumnFamilyHandle*>* handles, BlobDB** blob_db,
+                    bool no_base_db) {
+  *blob_db = nullptr;
+
+  DBOptions my_db_options(db_options);
+  FlushBeginListener_t fblistener =
+      std::make_shared<BlobDBFlushBeginListener>();
+  CompactionListener_t ce_listener =
+      std::make_shared<EvictAllVersionsCompactionListener>();
+  ReconcileWalFilter_t rw_filter = std::make_shared<BlobReconcileWalFilter>();
+
+  my_db_options.listeners.emplace_back(fblistener);
+  my_db_options.listeners.emplace_back(ce_listener);
+  my_db_options.wal_filter = rw_filter.get();
+
+  {
+    MutexLock l(&listener_mutex);
+    all_blobdb_listeners.push_back(fblistener);
+    all_blobdb_listeners.push_back(ce_listener);
+    all_wal_filters.push_back(rw_filter);
+  }
+
+  // we need to open blob db first so that recovery can happen
+  BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, my_db_options);
+  fblistener->SetImplPtr(bdb);
+  ce_listener->SetImplPtr(bdb);
+  rw_filter->SetImplPtr(bdb);
+
+  Status s = bdb->OpenPhase1();
+  if (!s.ok()) return s;
+
+  if (no_base_db) return s;
+
+  DB* db = nullptr;
+  s = DB::Open(my_db_options, dbname, column_families, handles, &db);
+  if (!s.ok()) return s;
+
+  // set the implementation pointer
+  s = bdb->LinkToBaseDB(db);
   if (!s.ok()) {
     delete bdb;
+    bdb = nullptr;
   }
   *blob_db = bdb;
   return s;
 }

-const std::string BlobDB::kFileName = "blob_log";
-const size_t BlobDB::kBlockHeaderSize = 8;
-const size_t BlobDB::kBytesPerSync = 1024 * 1024 * 128;
-
-BlobDB::BlobDB(DB* db)
-    : StackableDB(db),
-      ioptions_(db->GetOptions()),
-      writer_offset_(0),
-      next_sync_offset_(kBytesPerSync) {}
-
-Status BlobDB::Open() {
-  unique_ptr<WritableFile> wfile;
-  EnvOptions env_options(db_->GetOptions());
-  Status s = ioptions_.env->NewWritableFile(db_->GetName() + "/" + kFileName,
-                                            &wfile, env_options);
-  if (!s.ok()) {
-    return s;
-  }
-  file_writer_.reset(new WritableFileWriter(std::move(wfile), env_options));
-
-  // Write version
-  std::string version;
-  PutFixed64(&version, 0);
-  s = file_writer_->Append(Slice(version));
-  if (!s.ok()) {
-    return s;
-  }
-  writer_offset_ += version.size();
-
-  std::unique_ptr<RandomAccessFile> rfile;
-  s = ioptions_.env->NewRandomAccessFile(db_->GetName() + "/" + kFileName,
-                                         &rfile, env_options);
-  if (!s.ok()) {
-    return s;
-  }
-  file_reader_.reset(new RandomAccessFileReader(std::move(rfile)));
-  return s;
-}
-
-Status BlobDB::Put(const WriteOptions& options, const Slice& key,
-                   const Slice& value) {
-  BlockBuilder block_builder(1, false);
-  block_builder.Add(key, value);
-
-  CompressionType compression = CompressionType::kLZ4Compression;
-  CompressionOptions compression_opts;
-
-  Slice block_contents;
-  std::string compression_output;
-
-  block_contents = CompressBlock(block_builder.Finish(), compression_opts,
-                                 &compression, kBlockBasedTableVersionFormat,
-                                 Slice() /* dictionary */, &compression_output);
-
-  char header[kBlockHeaderSize];
-  char trailer[kBlockTrailerSize];
-  trailer[0] = compression;
-  auto crc = crc32c::Value(block_contents.data(), block_contents.size());
-  crc = crc32c::Extend(crc, trailer, 1);  // Extend to cover block type
-  EncodeFixed32(trailer + 1, crc32c::Mask(crc));
-
-  BlockHandle handle;
-  std::string index_entry;
-  Status s;
-  {
-    InstrumentedMutexLock l(&mutex_);
-    auto raw_block_size = block_contents.size();
-    EncodeFixed64(header, raw_block_size);
-    s = file_writer_->Append(Slice(header, kBlockHeaderSize));
-    writer_offset_ += kBlockHeaderSize;
-    if (s.ok()) {
-      handle.set_offset(writer_offset_);
-      handle.set_size(raw_block_size);
-      s = file_writer_->Append(block_contents);
-    }
-    if (s.ok()) {
-      s = file_writer_->Append(Slice(trailer, kBlockTrailerSize));
-    }
-    if (s.ok()) {
-      s = file_writer_->Flush();
-    }
-    if (s.ok() && writer_offset_ > next_sync_offset_) {
-      // Sync every kBytesPerSync. This is a hacky way to limit unsynced data.
-      next_sync_offset_ += kBytesPerSync;
-      s = file_writer_->Sync(db_->GetOptions().use_fsync);
-    }
-    if (s.ok()) {
-      writer_offset_ += block_contents.size() + kBlockTrailerSize;
-      // Put file number
-      PutVarint64(&index_entry, 0);
-      handle.EncodeTo(&index_entry);
-      s = db_->Put(options, key, index_entry);
-    }
-  }
-  return s;
-}
-
-Status BlobDB::Get(const ReadOptions& options, const Slice& key,
-                   std::string* value) {
-  Status s;
-  std::string index_entry;
-  s = db_->Get(options, key, &index_entry);
-  if (!s.ok()) {
-    return s;
-  }
-  BlockHandle handle;
-  Slice index_entry_slice(index_entry);
-  uint64_t file_number;
-  if (!GetVarint64(&index_entry_slice, &file_number)) {
-    return Status::Corruption();
-  }
-  assert(file_number == 0);
-  s = handle.DecodeFrom(&index_entry_slice);
-  if (!s.ok()) {
-    return s;
-  }
-  Footer footer(0, kBlockBasedTableVersionFormat);
-  BlockContents contents;
-  s = ReadBlockContents(file_reader_.get(), footer, options, handle, &contents,
-                        ioptions_);
-  if (!s.ok()) {
-    return s;
-  }
-  Block block(std::move(contents), kDisableGlobalSequenceNumber);
-  BlockIter bit;
-  InternalIterator* it = block.NewIterator(nullptr, &bit);
-  it->SeekToFirst();
-  if (!it->status().ok()) {
-    return it->status();
-  }
-  *value = it->value().ToString();
-  return s;
-}
+BlobDB::BlobDB(DB* db) : StackableDB(db) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+//
+// std::function<int(double)> fnCaller =
+//     std::bind(&A::fn, &anInstance, std::placeholders::_1);
+////////////////////////////////////////////////////////////////////////////////
+BlobDBOptions::BlobDBOptions()
+    : blob_dir("blob_dir"),
+      path_relative(true),
+      is_fifo(false),
+      blob_dir_size(1000ULL * 1024ULL * 1024ULL * 1024ULL),
+      ttl_range_secs(3600),
+      min_blob_size(512),
+      bytes_per_sync(0),
+      blob_file_size(256 * 1024 * 1024),
+      num_concurrent_simple_blobs(4),
+      default_ttl_extractor(false),
+      compression(kNoCompression) {}
+
+}  // namespace blob_db
 }  // namespace rocksdb
-#else
-namespace rocksdb {
-Status NewBlobDB(Options options, std::string dbname, DB** blob_db) {
-  return Status::NotSupported();
-}
-}  // namespace rocksdb
-#endif  // ROCKSDB_LITE
+#endif
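
The two-step OpenAndLoad()/LinkToBaseDB() path exists so that a caller who stacks other DB wrappers can let BlobDB inject its listeners and WAL filter into the options before the base DB is opened. A sketch of that calling sequence, assuming a plain DB as the base (the path is illustrative and error handling is trimmed to the essentials):

#include <vector>
#include "rocksdb/db.h"
#include "utilities/blob_db/blob_db.h"

// Sketch of the OpenAndLoad -> open base DB -> LinkToBaseDB sequence.
// "/tmp/blobdb_example" is a made-up path.
rocksdb::Status OpenStackedBlobDB(rocksdb::blob_db::BlobDB** out) {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::blob_db::BlobDBOptions bdb_options;

  // Step 1: create the BlobDB shell and receive the changed options
  // (listeners, WAL filter) that the base DB must be opened with.
  rocksdb::Options changed_options;
  rocksdb::blob_db::BlobDB* bdb = nullptr;
  rocksdb::Status s = rocksdb::blob_db::BlobDB::OpenAndLoad(
      options, bdb_options, "/tmp/blobdb_example", &bdb, &changed_options);
  if (!s.ok()) return s;

  // Step 2: open the base DB (this is where another StackableDB could be
  // layered in) using the changed options.
  rocksdb::DB* base = nullptr;
  s = rocksdb::DB::Open(changed_options, "/tmp/blobdb_example", &base);
  if (!s.ok()) return s;

  // Step 3: hand the base DB to BlobDB.
  s = bdb->LinkToBaseDB(base);
  if (s.ok()) *out = bdb;
  return s;
}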
@ -7,12 +7,19 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
|
||||||
|
#include <functional>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
#include "rocksdb/db.h"
|
#include "rocksdb/db.h"
|
||||||
#include "rocksdb/status.h"
|
#include "rocksdb/status.h"
|
||||||
|
#include "rocksdb/utilities/stackable_db.h"
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
// EXPERIMENAL ONLY
|
|
||||||
|
namespace blob_db {
|
||||||
|
|
||||||
// A wrapped database which puts values of KV pairs in a separate log
|
// A wrapped database which puts values of KV pairs in a separate log
|
||||||
// and store location to the log in the underlying DB.
|
// and store location to the log in the underlying DB.
|
||||||
// It lacks lots of importatant functionalities, e.g. DB restarts,
|
// It lacks lots of importatant functionalities, e.g. DB restarts,
|
||||||
@ -20,5 +27,177 @@ namespace rocksdb {
|
|||||||
//
|
//
|
||||||
// The factory needs to be moved to include/rocksdb/utilities to allow
|
// The factory needs to be moved to include/rocksdb/utilities to allow
|
||||||
// users to use blob DB.
|
// users to use blob DB.
|
||||||
extern Status NewBlobDB(Options options, std::string dbname, DB** blob_db);
|
|
||||||
|
struct BlobDBOptions {
|
||||||
|
// name of the directory under main db, where blobs will be stored.
|
||||||
|
// default is "blob_dir"
|
||||||
|
std::string blob_dir;
|
||||||
|
|
||||||
|
// whether the blob_dir path is relative or absolute.
|
||||||
|
bool path_relative;
|
||||||
|
|
||||||
|
// is the eviction strategy fifo based
|
||||||
|
bool is_fifo;
|
||||||
|
|
||||||
|
// maximum size of the blob dir. Once this gets used, up
|
||||||
|
// evict the blob file which is oldest (is_fifo )
|
||||||
|
// 0 means no limits
|
||||||
|
uint64_t blob_dir_size;
|
||||||
|
|
||||||
|
// a new bucket is opened, for ttl_range. So if ttl_range is 600seconds
|
||||||
|
// (10 minutes), and the first bucket starts at 1471542000
|
||||||
|
// then the blob buckets will be
|
||||||
|
// first bucket is 1471542000 - 1471542600
|
||||||
|
// second bucket is 1471542600 - 1471543200
|
||||||
|
// and so on
|
||||||
|
uint32_t ttl_range_secs;
|
||||||
|
|
||||||
|
// at what size will the blobs be stored in separate log rather than
|
||||||
|
// inline
|
||||||
|
uint64_t min_blob_size;
|
||||||
|
|
||||||
|
// at what bytes will the blob files be synced to blob log.
|
||||||
|
uint64_t bytes_per_sync;
|
||||||
|
|
||||||
|
// the target size of each blob file. File will become immutable
|
||||||
|
// after it exceeds that size
|
||||||
|
uint64_t blob_file_size;
|
||||||
|
|
||||||
|
// how many files to use for simple blobs at one time
|
||||||
|
uint32_t num_concurrent_simple_blobs;
|
||||||
|
|
||||||
|
// this function is to be provided by client if they intend to
|
||||||
|
// use Put API to provide TTL.
|
||||||
|
// the first argument is the value in the Put API
|
||||||
|
// in case you want to do some modifications to the value,
|
||||||
|
// return a new Slice in the second.
|
||||||
|
// otherwise just copy the input value into output.
|
||||||
|
// the ttl should be extracted and returned in last pointer.
|
||||||
|
// otherwise assign it to -1
|
||||||
|
std::function<bool(const Slice&, Slice*, int32_t*)> extract_ttl_fn;
|
||||||
|
|
||||||
|
// eviction callback.
|
||||||
|
// this function will be called for every blob that is getting
|
||||||
|
// evicted.
|
||||||
|
std::function<void(const ColumnFamilyHandle*, const Slice&, const Slice&)>
|
||||||
|
gc_evict_cb_fn;
|
||||||
|
|
||||||
|
// default ttl extactor
|
||||||
|
bool default_ttl_extractor;
|
||||||
|
|
||||||
|
// what compression to use for Blob's
|
||||||
|
CompressionType compression;
|
||||||
|
|
||||||
|
// default constructor
|
||||||
|
BlobDBOptions();
|
||||||
|
|
||||||
|
BlobDBOptions(const BlobDBOptions& in) = default;
|
||||||
|
|
||||||
|
virtual ~BlobDBOptions() = default;
|
||||||
|
};
|
||||||
|
|
||||||
|
class BlobDB : public StackableDB {
|
||||||
|
public:
|
||||||
|
// the suffix to a blob value to represent "ttl:TTLVAL"
|
||||||
|
static const uint64_t kTTLSuffixLength = 8;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using rocksdb::StackableDB::Put;
|
||||||
|
|
||||||
|
// This function needs to be called before destroying
|
||||||
|
// the base DB
|
||||||
|
static Status DestroyBlobDB(const std::string& dbname, const Options& options,
|
||||||
|
const BlobDBOptions& bdb_options);
|
||||||
|
|
||||||
|
virtual Status Put(const WriteOptions& options,
|
||||||
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
|
const Slice& value) override = 0;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::Delete;
|
||||||
|
virtual Status Delete(const WriteOptions& options,
|
||||||
|
ColumnFamilyHandle* column_family,
|
||||||
|
const Slice& key) override = 0;
|
||||||
|
|
||||||
|
virtual Status PutWithTTL(const WriteOptions& options,
|
||||||
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
|
const Slice& value, int32_t ttl) = 0;
|
||||||
|
|
||||||
|
virtual Status PutWithTTL(const WriteOptions& options, const Slice& key,
|
||||||
|
const Slice& value, int32_t ttl) {
|
||||||
|
return PutWithTTL(options, DefaultColumnFamily(), key, value, ttl);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual Status PutUntil(const WriteOptions& options,
|
||||||
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
|
const Slice& value, int32_t expiration) = 0;
|
||||||
|
|
||||||
|
virtual Status PutUntil(const WriteOptions& options, const Slice& key,
|
||||||
|
const Slice& value, int32_t expiration) {
|
||||||
|
return PutUntil(options, DefaultColumnFamily(), key, value, expiration);
|
||||||
|
}
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::Get;
|
||||||
|
virtual Status Get(const ReadOptions& options,
|
||||||
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
|
std::string* value) override = 0;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::MultiGet;
|
||||||
|
virtual std::vector<Status> MultiGet(
|
||||||
|
const ReadOptions& options,
|
||||||
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||||
|
const std::vector<Slice>& keys,
|
||||||
|
std::vector<std::string>* values) override = 0;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::SingleDelete;
|
||||||
|
virtual Status SingleDelete(const WriteOptions& wopts,
|
||||||
|
ColumnFamilyHandle* column_family,
|
||||||
|
const Slice& key) override = 0;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::Merge;
|
||||||
|
virtual Status Merge(const WriteOptions& options,
|
||||||
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
|
const Slice& value) override {
|
||||||
|
return Status::NotSupported("Not supported operation in blob db.");
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual Status Write(const WriteOptions& opts,
|
||||||
|
WriteBatch* updates) override = 0;
|
||||||
|
|
||||||
|
// Starting point for opening a Blob DB.
|
||||||
|
// changed_options - critical. Blob DB loads and inserts listeners
|
||||||
|
// into options which are necessary for recovery and atomicity
|
||||||
|
// Use this pattern if you need control on step 2, i.e. your
|
||||||
|
// BaseDB is not just a simple rocksdb but a stacked DB
|
||||||
|
// 1. ::OpenAndLoad
|
||||||
|
// 2. Open Base DB with the changed_options
|
||||||
|
// 3. ::LinkToBaseDB
|
||||||
|
static Status OpenAndLoad(const Options& options,
|
||||||
|
const BlobDBOptions& bdb_options,
|
||||||
|
const std::string& dbname, BlobDB** blob_db,
|
||||||
|
Options* changed_options);
|
||||||
|
|
||||||
|
// This is another way to open BLOB DB which do not have other
|
||||||
|
// Stackable DB's in play
|
||||||
|
// Steps.
|
||||||
|
// 1. ::Open
|
||||||
|
static Status Open(const Options& options, const BlobDBOptions& bdb_options,
|
||||||
|
const std::string& dbname, BlobDB** blob_db);
|
||||||
|
|
||||||
|
static Status Open(const DBOptions& db_options,
|
||||||
|
const BlobDBOptions& bdb_options,
|
||||||
|
const std::string& dbname,
|
||||||
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||||
|
std::vector<ColumnFamilyHandle*>* handles,
|
||||||
|
BlobDB** blob_db, bool no_base_db = false);
|
||||||
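// Usage sketch for the simple path (no other stacked DBs in play; the path
// below is a placeholder):
//
//   rocksdb::Options options;
//   options.create_if_missing = true;
//   rocksdb::blob_db::BlobDBOptions bdb_options;
//   rocksdb::blob_db::BlobDB* bdb = nullptr;
//   rocksdb::Status s = rocksdb::blob_db::BlobDB::Open(
//       options, bdb_options, "/tmp/blobdb_example", &bdb);
//   std::string value;
//   if (s.ok()) s = bdb->Get(rocksdb::ReadOptions(), "some_key", &value);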
|
|
||||||
|
virtual ~BlobDB() {}
|
||||||
|
|
||||||
|
virtual Status LinkToBaseDB(DB* db_base) = 0;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
explicit BlobDB(DB* db);
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace blob_db
|
||||||
} // namespace rocksdb
|
||||||
|
#endif // ROCKSDB_LITE
|
||||||
|
2210	utilities/blob_db/blob_db_impl.cc (new file; diff suppressed because it is too large)
657	utilities/blob_db/blob_db_impl.h (new file)
@@ -0,0 +1,657 @@
|
|||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <ctime>
|
||||||
|
#include <list>
|
||||||
|
#include <memory>
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "rocksdb/compaction_filter.h"
|
||||||
|
#include "rocksdb/db.h"
|
||||||
|
#include "rocksdb/listener.h"
|
||||||
|
#include "rocksdb/options.h"
|
||||||
|
#include "rocksdb/wal_filter.h"
|
||||||
|
#include "util/file_reader_writer.h"
|
||||||
|
#include "util/mpsc.h"
|
||||||
|
#include "util/mutexlock.h"
|
||||||
|
#include "util/timer_queue.h"
|
||||||
|
#include "utilities/blob_db/blob_db.h"
|
||||||
|
#include "utilities/blob_db/blob_db_options_impl.h"
|
||||||
|
#include "utilities/blob_db/blob_log_format.h"
|
||||||
|
#include "utilities/blob_db/blob_log_reader.h"
|
||||||
|
#include "utilities/blob_db/blob_log_writer.h"
|
||||||
|
|
||||||
|
namespace rocksdb {
|
||||||
|
|
||||||
|
class DBImpl;
|
||||||
|
class ColumnFamilyHandle;
|
||||||
|
class ColumnFamilyData;
|
||||||
|
class OptimisticTransactionDBImpl;
|
||||||
|
struct FlushJobInfo;
|
||||||
|
|
||||||
|
namespace blob_db {
|
||||||
|
|
||||||
|
class BlobFile;
|
||||||
|
class BlobDBImpl;
|
||||||
|
struct GCStats;
|
||||||
|
|
||||||
|
class BlobDBFlushBeginListener : public EventListener {
|
||||||
|
public:
|
||||||
|
explicit BlobDBFlushBeginListener() : impl_(nullptr) {}
|
||||||
|
|
||||||
|
void OnFlushBegin(DB* db, const FlushJobInfo& info) override;
|
||||||
|
|
||||||
|
void SetImplPtr(BlobDBImpl* p) { impl_ = p; }
|
||||||
|
|
||||||
|
protected:
|
||||||
|
BlobDBImpl* impl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// this implements the callback from the WAL which ensures that the
// blob record is present in the blob log. If fsync/fdatasync is not
// happening on every write, there is the possibility that the blob
// log lags the WAL, i.e. recovered keys may point at blob records
// that were never persisted
|
||||||
|
class BlobReconcileWalFilter : public WalFilter {
|
||||||
|
public:
|
||||||
|
virtual WalFilter::WalProcessingOption LogRecordFound(
|
||||||
|
unsigned long long log_number, const std::string& log_file_name,
|
||||||
|
const WriteBatch& batch, WriteBatch* new_batch,
|
||||||
|
bool* batch_changed) override;
|
||||||
|
|
||||||
|
virtual const char* Name() const override { return "BlobDBWalReconciler"; }
|
||||||
|
|
||||||
|
void SetImplPtr(BlobDBImpl* p) { impl_ = p; }
|
||||||
|
|
||||||
|
protected:
|
||||||
|
BlobDBImpl* impl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class EvictAllVersionsCompactionListener : public EventListener {
|
||||||
|
public:
|
||||||
|
class InternalListener : public CompactionEventListener {
|
||||||
|
friend class BlobDBImpl;
|
||||||
|
|
||||||
|
public:
|
||||||
|
virtual void OnCompaction(int level, const Slice& key,
|
||||||
|
CompactionListenerValueType value_type,
|
||||||
|
const Slice& existing_value,
|
||||||
|
const SequenceNumber& sn, bool is_new) override;
|
||||||
|
|
||||||
|
void SetImplPtr(BlobDBImpl* p) { impl_ = p; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
BlobDBImpl* impl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
explicit EvictAllVersionsCompactionListener()
|
||||||
|
: internal_listener_(new InternalListener()) {}
|
||||||
|
|
||||||
|
virtual CompactionEventListener* GetCompactionEventListener() override {
|
||||||
|
return internal_listener_.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
void SetImplPtr(BlobDBImpl* p) { internal_listener_->SetImplPtr(p); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unique_ptr<InternalListener> internal_listener_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
class EvictAllVersionsFilterFactory : public CompactionFilterFactory {
|
||||||
|
private:
|
||||||
|
BlobDBImpl* impl_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
EvictAllVersionsFilterFactory() : impl_(nullptr) {}
|
||||||
|
|
||||||
|
void SetImplPtr(BlobDBImpl* p) { impl_ = p; }
|
||||||
|
|
||||||
|
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
|
||||||
|
const CompactionFilter::Context& context) override;
|
||||||
|
|
||||||
|
virtual const char* Name() const override {
|
||||||
|
return "EvictAllVersionsFilterFactory";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Comparator to sort "TTL" aware Blob files based on the lower value of
|
||||||
|
// TTL range.
|
||||||
|
struct blobf_compare_ttl {
|
||||||
|
bool operator()(const std::shared_ptr<BlobFile>& lhs,
|
||||||
|
const std::shared_ptr<BlobFile>& rhs) const;
|
||||||
|
};
|
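// A minimal sketch of the ordering this comparator is described as providing
// (the real definition lives in blob_db_impl.cc, which is not shown here):
//
//   bool blobf_compare_ttl::operator()(const std::shared_ptr<BlobFile>& lhs,
//                                      const std::shared_ptr<BlobFile>& rhs) const {
//     // order open TTL files by the lower bound of their TTL range
//     return lhs->ttl_range_.first < rhs->ttl_range_.first;
//   }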
||||||
|
|
||||||
|
/**
|
||||||
|
* The implementation class for BlobDB. This manages the value
|
||||||
|
* part in TTL aware sequentially written files. These files are
|
||||||
|
* Garbage Collected.
|
||||||
|
*/
|
||||||
|
class BlobDBImpl : public BlobDB {
|
||||||
|
friend class BlobDBFlushBeginListener;
|
||||||
|
friend class EvictAllVersionsCompactionListener;
|
||||||
|
friend class BlobDB;
|
||||||
|
friend class BlobFile;
|
||||||
|
friend class BlobDBIterator;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using rocksdb::StackableDB::Put;
|
||||||
|
Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
||||||
|
const Slice& key, const Slice& value) override;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::Delete;
|
||||||
|
Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
||||||
|
const Slice& key) override;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::SingleDelete;
|
||||||
|
virtual Status SingleDelete(const WriteOptions& wopts,
|
||||||
|
ColumnFamilyHandle* column_family,
|
||||||
|
const Slice& key) override;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::Get;
|
||||||
|
Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
||||||
|
const Slice& key, std::string* value) override;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::NewIterator;
|
||||||
|
virtual Iterator* NewIterator(const ReadOptions& opts,
|
||||||
|
ColumnFamilyHandle* column_family) override;
|
||||||
|
|
||||||
|
using rocksdb::StackableDB::MultiGet;
|
||||||
|
virtual std::vector<Status> MultiGet(
|
||||||
|
const ReadOptions& options,
|
||||||
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||||
|
const std::vector<Slice>& keys,
|
||||||
|
std::vector<std::string>* values) override;
|
||||||
|
|
||||||
|
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
|
||||||
|
|
||||||
|
using BlobDB::PutWithTTL;
|
||||||
|
Status PutWithTTL(const WriteOptions& options,
|
||||||
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
|
const Slice& value, int32_t ttl) override;
|
||||||
|
|
||||||
|
using BlobDB::PutUntil;
|
||||||
|
Status PutUntil(const WriteOptions& options,
|
||||||
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
|
const Slice& value_unc, int32_t expiration) override;
|
||||||
|
|
||||||
|
Status LinkToBaseDB(DB* db) override;
|
||||||
|
|
||||||
|
BlobDBImpl(DB* db, const BlobDBOptions& bdb_options);
|
||||||
|
|
||||||
|
BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options,
|
||||||
|
const DBOptions& db_options);
|
||||||
|
|
||||||
|
~BlobDBImpl();
|
||||||
|
|
||||||
|
private:
|
||||||
|
static bool ExtractTTLFromBlob(const Slice& value, Slice* newval,
|
||||||
|
int32_t* ttl_val);
|
||||||
|
|
||||||
|
Status OpenPhase1();
|
||||||
|
|
||||||
|
Status CommonGet(const ColumnFamilyData* cfd, const Slice& key,
|
||||||
|
const std::string& index_entry, std::string* value);
|
||||||
|
|
||||||
|
// Just before flush starts acting on memtable files,
|
||||||
|
// this handler is called.
|
||||||
|
void OnFlushBeginHandler(DB* db, const FlushJobInfo& info);
|
||||||
|
|
||||||
|
// timer queue callback to close a file by appending a footer
|
||||||
|
// removes file from open files list
|
||||||
|
std::pair<bool, int64_t> CloseSeqWrite(std::shared_ptr<BlobFile> bfile,
|
||||||
|
bool aborted);
|
||||||
|
|
||||||
|
// is this file ready for Garbage collection: if the TTL of the file
// has expired or if a threshold fraction of the file has been evicted
// tt - current time
// last_id - the id of the non-TTL file to evict
|
||||||
|
bool ShouldGCFile(std::shared_ptr<BlobFile> bfile, std::time_t tt,
|
||||||
|
uint64_t last_id, std::string* reason);
|
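// Illustrative reading of the two triggers named above (not the PR's exact
// logic, which lives in blob_db_impl.cc):
//
//   bool eligible = false;
//   // 1. every blob in a TTL file has expired
//   if (bfile->HasTTL() && bfile->GetTTLRange().second <= tt) eligible = true;
//   // 2. a large enough fraction of the file has already been deleted/evicted
//   if (file_size > 0 &&
//       100 * deleted_size / file_size >= bdb_options_.partial_expiration_pct)
//     eligible = true;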
||||||
|
|
||||||
|
// collect all the blob log files from the blob directory
|
||||||
|
Status GetAllLogFiles(std::set<std::pair<uint64_t, std::string>>* file_nums);
|
||||||
|
|
||||||
|
// appends a task into timer queue to close the file
|
||||||
|
void CloseIf(const std::shared_ptr<BlobFile>& bfile);
|
||||||
|
|
||||||
|
Status AppendBlob(const std::shared_ptr<BlobFile>& bfile,
|
||||||
|
const std::string& headerbuf, const Slice& key,
|
||||||
|
const Slice& value, std::string* index_entry);
|
||||||
|
|
||||||
|
Status AppendSN(const std::shared_ptr<BlobFile>& bfile,
|
||||||
|
const SequenceNumber& sn);
|
||||||
|
|
||||||
|
// find an existing blob log file based on the expiration unix epoch
|
||||||
|
// if such a file does not exist, return nullptr
|
||||||
|
std::shared_ptr<BlobFile> SelectBlobFileTTL(uint32_t expiration);
|
||||||
|
|
||||||
|
// find an existing blob log file to append the value to
|
||||||
|
std::shared_ptr<BlobFile> SelectBlobFile();
|
||||||
|
|
||||||
|
std::shared_ptr<BlobFile> FindBlobFileLocked(uint32_t expiration) const;
|
||||||
|
|
||||||
|
void UpdateWriteOptions(const WriteOptions& options);
|
||||||
|
|
||||||
|
void Shutdown();
|
||||||
|
|
||||||
|
// periodic sanity check. Bunch of checks
|
||||||
|
std::pair<bool, int64_t> SanityCheck(bool aborted);
|
||||||
|
|
||||||
|
// delete files which have been garbage collected and marked
|
||||||
|
// obsolete. Check whether any snapshots exist which refer to
|
||||||
|
// the same
|
||||||
|
std::pair<bool, int64_t> DeleteObsFiles(bool aborted);
|
||||||
|
|
||||||
|
// Major task to garbage collect expired and deleted blobs
|
||||||
|
std::pair<bool, int64_t> RunGC(bool aborted);
|
||||||
|
|
||||||
|
// asynchronous task to fsync/fdatasync the open blob files
|
||||||
|
std::pair<bool, int64_t> FsyncFiles(bool aborted);
|
||||||
|
|
||||||
|
// periodically check whether open blob files' TTLs have expired;
// if expired, close the sequential writer and make the file immutable
|
||||||
|
std::pair<bool, int64_t> CheckSeqFiles(bool aborted);
|
||||||
|
|
||||||
|
// if the number of open files approaches the ulimit, this
// task will close random readers, which are kept around for
// efficiency
|
||||||
|
std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
|
||||||
|
|
||||||
|
// periodically print write amplification statistics
|
||||||
|
std::pair<bool, int64_t> WaStats(bool aborted);
|
||||||
|
|
||||||
|
// background task to do book-keeping of deleted keys
|
||||||
|
std::pair<bool, int64_t> EvictDeletions(bool aborted);
|
||||||
|
|
||||||
|
std::pair<bool, int64_t> EvictCompacted(bool aborted);
|
||||||
|
|
||||||
|
bool CallbackEvictsImpl(std::shared_ptr<BlobFile> bfile);
|
||||||
|
|
||||||
|
std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
|
||||||
|
|
||||||
|
std::pair<bool, int64_t> CallbackEvicts(TimerQueue* tq,
|
||||||
|
std::shared_ptr<BlobFile> bfile,
|
||||||
|
bool aborted);
|
||||||
|
|
||||||
|
// Adds the background tasks to the timer queue
|
||||||
|
void StartBackgroundTasks();
|
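// A sketch of how the timer queue is used here, assuming the
// TimerQueue::add(delay_ms, handler) interface from util/timer_queue.h, where
// the handler receives an "aborted" flag and returns {done, next_delay_ms}:
//
//   tqueue_.add(bdb_options_.gc_check_period_millisecs,
//               std::bind(&BlobDBImpl::RunGC, this, std::placeholders::_1));
//   tqueue_.add(bdb_options_.fsync_files_period_millisecs,
//               std::bind(&BlobDBImpl::FsyncFiles, this, std::placeholders::_1));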
||||||
|
|
||||||
|
// add a new Blob File
|
||||||
|
std::shared_ptr<BlobFile> NewBlobFile(const std::string& reason);
|
||||||
|
|
||||||
|
Status OpenAllFiles();
|
||||||
|
|
||||||
|
// hold write mutex on file and call
|
||||||
|
// creates a Random Access reader for GET call
|
||||||
|
std::shared_ptr<RandomAccessFileReader> GetOrOpenRandomAccessReader(
|
||||||
|
const std::shared_ptr<BlobFile>& bfile, Env* env,
|
||||||
|
const EnvOptions& env_options);
|
||||||
|
|
||||||
|
// hold write mutex on file and call.
|
||||||
|
// Close the above Random Access reader
|
||||||
|
void CloseRandomAccessLocked(const std::shared_ptr<BlobFile>& bfile);
|
||||||
|
|
||||||
|
// hold write mutex on file and call
|
||||||
|
// creates a sequential (append) writer for this blobfile
|
||||||
|
Status CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile);
|
||||||
|
|
||||||
|
// returns a Writer object for the file. If writer is not
|
||||||
|
// already present, creates one. Needs Write Mutex to be held
|
||||||
|
std::shared_ptr<Writer> CheckOrCreateWriterLocked(
|
||||||
|
const std::shared_ptr<BlobFile>& bfile);
|
||||||
|
|
||||||
|
// Iterate through the keys and values of a blob file, rewrite the
// surviving blobs into a separate file, and delete/update the
// pointers in the LSM atomically
|
||||||
|
Status GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||||
|
GCStats* gcstats);
|
||||||
|
|
||||||
|
// checks if there is no snapshot which is referencing the
|
||||||
|
// blobs
|
||||||
|
bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
|
||||||
|
|
||||||
|
bool MarkBlobDeleted(const Slice& key, const Slice& lsmValue);
|
||||||
|
|
||||||
|
bool FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
|
||||||
|
uint64_t blob_offset, uint64_t blob_size);
|
||||||
|
|
||||||
|
void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy,
|
||||||
|
uint64_t* last_id);
|
||||||
|
|
||||||
|
void FilterSubsetOfFiles(
|
||||||
|
const std::vector<std::shared_ptr<BlobFile>>& blob_files,
|
||||||
|
std::vector<std::shared_ptr<BlobFile>>* to_process, uint64_t epoch,
|
||||||
|
uint64_t last_id, size_t files_to_collect);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// the base DB
|
||||||
|
DBImpl* db_impl_;
|
||||||
|
|
||||||
|
Env* myenv_;
|
||||||
|
|
||||||
|
// Optimistic Transaction DB used during Garbage collection
|
||||||
|
// for atomicity
|
||||||
|
std::unique_ptr<OptimisticTransactionDBImpl> opt_db_;
|
||||||
|
|
||||||
|
// a boolean to capture whether write_options has been set
|
||||||
|
std::atomic<bool> wo_set_;
|
||||||
|
WriteOptions write_options_;
|
||||||
|
|
||||||
|
// the options that govern the behavior of Blob Storage
|
||||||
|
BlobDBOptionsImpl bdb_options_;
|
||||||
|
DBOptions db_options_;
|
||||||
|
EnvOptions env_options_;
|
||||||
|
|
||||||
|
// name of the database directory
|
||||||
|
std::string dbname_;
|
||||||
|
|
||||||
|
// by default this is "blob_dir" under dbname_
|
||||||
|
// but can be configured
|
||||||
|
std::string blob_dir_;
|
||||||
|
|
||||||
|
// pointer to directory
|
||||||
|
std::unique_ptr<Directory> dir_ent_;
|
||||||
|
|
||||||
|
std::atomic<bool> dir_change_;
|
||||||
|
|
||||||
|
// Read Write Mutex, which protects all the data structures
|
||||||
|
// HEAVILY TRAFFICKED
|
||||||
|
port::RWMutex mutex_;
|
||||||
|
|
||||||
|
// counter for blob file number
|
||||||
|
std::atomic<uint64_t> next_file_number_;
|
||||||
|
|
||||||
|
// entire metadata of all the BLOB files, held in memory
|
||||||
|
std::unordered_map<uint64_t, std::shared_ptr<BlobFile>> blob_files_;
|
||||||
|
|
||||||
|
// epoch or version of the open files.
|
||||||
|
std::atomic<uint64_t> epoch_of_;
|
||||||
|
|
||||||
|
// typically we keep 4 open blob files (simple i.e. no TTL)
|
||||||
|
std::vector<std::shared_ptr<BlobFile>> open_simple_files_;
|
||||||
|
|
||||||
|
// all the blob files which are currently being appended to based
|
||||||
|
// on variety of incoming TTL's
|
||||||
|
std::multiset<std::shared_ptr<BlobFile>, blobf_compare_ttl> open_blob_files_;
|
||||||
|
|
||||||
|
// packet of information to put in the lockless delete(s) queue
|
||||||
|
struct delete_packet_t {
|
||||||
|
ColumnFamilyHandle* cfh_;
|
||||||
|
std::string key_;
|
||||||
|
SequenceNumber dsn_;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct override_packet_t {
|
||||||
|
uint64_t file_number_;
|
||||||
|
uint64_t key_size_;
|
||||||
|
uint64_t blob_offset_;
|
||||||
|
uint64_t blob_size_;
|
||||||
|
SequenceNumber dsn_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// LOCKLESS multiple producer single consumer queue to quickly append
|
||||||
|
// deletes without taking lock. Can rapidly grow in size!!
|
||||||
|
// deletes happen in LSM, but minor book-keeping needs to happen on
|
||||||
|
// BLOB side (for triggering eviction)
|
||||||
|
mpsc_queue_t<delete_packet_t> delete_keys_q_;
|
||||||
|
|
||||||
|
// LOCKLESS multiple producer single consumer queue for values
|
||||||
|
// that are being compacted
|
||||||
|
mpsc_queue_t<override_packet_t> override_vals_q_;
|
||||||
|
|
||||||
|
// atomic bool to represent shutdown
|
||||||
|
std::atomic<bool> shutdown_;
|
||||||
|
|
||||||
|
// timer based queue to execute tasks
|
||||||
|
TimerQueue tqueue_;
|
||||||
|
|
||||||
|
// timer queues to call eviction callbacks.
|
||||||
|
std::vector<std::shared_ptr<TimerQueue>> cb_threads_;
|
||||||
|
|
||||||
|
// only accessed in GC thread, hence not atomic. The epoch of the
|
||||||
|
// GC task. Each execution is one epoch. Helps us in allocating
|
||||||
|
// files to one execution
|
||||||
|
uint64_t current_epoch_;
|
||||||
|
|
||||||
|
// number of files opened for random access/GET
|
||||||
|
// counter is used to monitor and close excess RA files.
|
||||||
|
std::atomic<uint32_t> open_file_count_;
|
||||||
|
|
||||||
|
// should hold mutex to modify
|
||||||
|
// STATISTICS for WA of Blob Files due to GC
|
||||||
|
// collect by default 24 hourly periods
|
||||||
|
std::list<uint64_t> all_periods_write_;
|
||||||
|
std::list<uint64_t> all_periods_ampl_;
|
||||||
|
|
||||||
|
std::atomic<uint64_t> last_period_write_;
|
||||||
|
std::atomic<uint64_t> last_period_ampl_;
|
||||||
|
|
||||||
|
uint64_t total_periods_write_;
|
||||||
|
uint64_t total_periods_ampl_;
|
||||||
|
|
||||||
|
// total size of all blob files at a given time
|
||||||
|
std::atomic<uint64_t> total_blob_space_;
|
||||||
|
std::list<std::shared_ptr<BlobFile>> obsolete_files_;
|
||||||
|
bool open_p1_done_;
|
||||||
|
|
||||||
|
uint32_t debug_level_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class BlobFile {
|
||||||
|
friend class BlobDBImpl;
|
||||||
|
friend struct blobf_compare_ttl;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// access to parent
|
||||||
|
const BlobDBImpl* parent_;
|
||||||
|
|
||||||
|
// path to blob directory
|
||||||
|
std::string path_to_dir_;
|
||||||
|
|
||||||
|
// the id of the file.
|
||||||
|
// the above 2 are created during file creation and never changed
|
||||||
|
// after that
|
||||||
|
uint64_t file_number_;
|
||||||
|
|
||||||
|
// number of blobs in the file
|
||||||
|
std::atomic<uint64_t> blob_count_;
|
||||||
|
|
||||||
|
// the file will be selected for GC in this future epoch
|
||||||
|
std::atomic<int64_t> gc_epoch_;
|
||||||
|
|
||||||
|
// size of the file
|
||||||
|
std::atomic<uint64_t> file_size_;
|
||||||
|
|
||||||
|
// number of blobs in this particular file which have been evicted
|
||||||
|
uint64_t deleted_count_;
|
||||||
|
|
||||||
|
// size of deleted blobs (used by heuristic to select file for GC)
|
||||||
|
uint64_t deleted_size_;
|
||||||
|
|
||||||
|
BlobLogHeader header_;
|
||||||
|
|
||||||
|
// closed_ = true implies the file is no longer mutable:
// no more blobs will be appended and the footer has been written out
|
||||||
|
std::atomic<bool> closed_;
|
||||||
|
|
||||||
|
// has a pass of garbage collection successfully finished on this file
|
||||||
|
// can_be_deleted_ still needs to do iterator/snapshot checks
|
||||||
|
std::atomic<bool> can_be_deleted_;
|
||||||
|
|
||||||
|
// should this file be gc'd once to reconcile lost deletes/compactions
|
||||||
|
std::atomic<bool> gc_once_after_open_;
|
||||||
|
|
||||||
|
// earliest (et) - latest (lt) expiration of the blobs
ttlrange_t ttl_range_;

// earliest - latest timestamp of the KV pairs.
tsrange_t time_range_;

// earliest - latest sequence number of the blobs
snrange_t sn_range_;
|
||||||
|
|
||||||
|
// Sequential/Append writer for blobs
|
||||||
|
std::shared_ptr<Writer> log_writer_;
|
||||||
|
|
||||||
|
// random access file reader for GET calls
|
||||||
|
std::shared_ptr<RandomAccessFileReader> ra_file_reader_;
|
||||||
|
|
||||||
|
// This Read-Write mutex is per file specific and protects
|
||||||
|
// all the datastructures
|
||||||
|
port::RWMutex mutex_;
|
||||||
|
|
||||||
|
// time when the random access reader was last created.
|
||||||
|
std::atomic<std::time_t> last_access_;
|
||||||
|
|
||||||
|
// last time file was fsync'd/fdatasyncd
|
||||||
|
std::atomic<uint64_t> last_fsync_;
|
||||||
|
|
||||||
|
bool header_valid_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
BlobFile();
|
||||||
|
|
||||||
|
BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum);
|
||||||
|
|
||||||
|
~BlobFile();
|
||||||
|
|
||||||
|
ColumnFamilyHandle* GetColumnFamily(DB* db);
|
||||||
|
|
||||||
|
// Returns log file's pathname relative to the main db dir
|
||||||
|
// Eg. For a live-log-file = blob_dir/000003.blob
|
||||||
|
std::string PathName() const;
|
||||||
|
|
||||||
|
// Primary identifier for blob file.
|
||||||
|
// once the file is created, this never changes
|
||||||
|
uint64_t BlobFileNumber() const { return file_number_; }
|
||||||
|
|
||||||
|
// the following functions are atomic, and don't need
|
||||||
|
// read lock
|
||||||
|
uint64_t BlobCount() const {
|
||||||
|
return blob_count_.load(std::memory_order_acquire);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string DumpState() const;
|
||||||
|
|
||||||
|
// if the file has gone through GC and blobs have been relocated
|
||||||
|
bool Obsolete() const { return can_be_deleted_.load(); }
|
||||||
|
|
||||||
|
// if the file is not taking any more appends.
|
||||||
|
bool Immutable() const { return closed_.load(); }
|
||||||
|
|
||||||
|
// we will assume this is atomic
|
||||||
|
bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const;
|
||||||
|
|
||||||
|
uint64_t GetFileSize() const {
|
||||||
|
return file_size_.load(std::memory_order_acquire);
|
||||||
|
}
|
||||||
|
|
||||||
|
// All Get functions which are not atomic, will need ReadLock on the mutex
|
||||||
|
tsrange_t GetTimeRange() const {
|
||||||
|
assert(HasTimestamp());
|
||||||
|
return time_range_;
|
||||||
|
}
|
||||||
|
|
||||||
|
ttlrange_t GetTTLRange() const { return ttl_range_; }
|
||||||
|
|
||||||
|
snrange_t GetSNRange() const { return sn_range_; }
|
||||||
|
|
||||||
|
bool HasTTL() const {
|
||||||
|
assert(header_valid_);
|
||||||
|
return header_.HasTTL();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HasTimestamp() const {
|
||||||
|
assert(header_valid_);
|
||||||
|
return header_.HasTimestamp();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<Writer> GetWriter() const { return log_writer_; }
|
||||||
|
|
||||||
|
void Fsync();
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::shared_ptr<Reader> OpenSequentialReader(
|
||||||
|
Env* env, const DBOptions& db_options,
|
||||||
|
const EnvOptions& env_options) const;
|
||||||
|
|
||||||
|
Status ReadFooter(BlobLogFooter* footer);
|
||||||
|
|
||||||
|
Status WriteFooterAndCloseLocked();
|
||||||
|
|
||||||
|
std::shared_ptr<RandomAccessFileReader> GetOrOpenRandomAccessReader(
|
||||||
|
Env* env, const EnvOptions& env_options, bool* fresh_open);
|
||||||
|
|
||||||
|
void CloseRandomAccessLocked();
|
||||||
|
|
||||||
|
// this is used, when you are reading only the footer of a
|
||||||
|
// previously closed file
|
||||||
|
Status SetFromFooterLocked(const BlobLogFooter& footer);
|
||||||
|
|
||||||
|
void set_time_range(const tsrange_t& tr) { time_range_ = tr; }
|
||||||
|
|
||||||
|
void set_ttl_range(const ttlrange_t& ttl) { ttl_range_ = ttl; }
|
||||||
|
|
||||||
|
void SetSNRange(const snrange_t& snr) { sn_range_ = snr; }
|
||||||
|
|
||||||
|
// The following functions are atomic, and don't need locks
|
||||||
|
void SetFileSize(uint64_t fs) { file_size_ = fs; }
|
||||||
|
|
||||||
|
void SetBlobCount(uint64_t bc) { blob_count_ = bc; }
|
||||||
|
|
||||||
|
void SetCanBeDeleted() { can_be_deleted_ = true; }
|
||||||
|
};
|
||||||
|
|
||||||
|
class BlobDBIterator : public Iterator {
|
||||||
|
public:
|
||||||
|
explicit BlobDBIterator(Iterator* iter, ColumnFamilyHandle* column_family,
|
||||||
|
BlobDBImpl* impl)
|
||||||
|
: iter_(iter), cfh_(column_family), db_impl_(impl) {
|
||||||
|
assert(iter_);
|
||||||
|
}
|
||||||
|
|
||||||
|
~BlobDBIterator() { delete iter_; }
|
||||||
|
|
||||||
|
bool Valid() const override { return iter_->Valid(); }
|
||||||
|
|
||||||
|
void SeekToFirst() override { iter_->SeekToFirst(); }
|
||||||
|
|
||||||
|
void SeekToLast() override { iter_->SeekToLast(); }
|
||||||
|
|
||||||
|
void Seek(const Slice& target) override { iter_->Seek(target); }
|
||||||
|
|
||||||
|
void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); }
|
||||||
|
|
||||||
|
void Next() override { iter_->Next(); }
|
||||||
|
|
||||||
|
void Prev() override { iter_->Prev(); }
|
||||||
|
|
||||||
|
Slice key() const override { return iter_->key(); }
|
||||||
|
|
||||||
|
Slice value() const override;
|
||||||
|
|
||||||
|
Status status() const override { return iter_->status(); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
Iterator* iter_;
|
||||||
|
ColumnFamilyHandle* cfh_;
|
||||||
|
BlobDBImpl* db_impl_;
|
||||||
|
mutable std::string vpart_;
|
||||||
|
};
|
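// Usage sketch: a BlobDBIterator is obtained through the usual NewIterator
// call and walked like any rocksdb::Iterator; key() comes straight from the
// LSM, while value() is resolved from the blob log.
//
//   std::unique_ptr<rocksdb::Iterator> it(
//       bdb->NewIterator(rocksdb::ReadOptions(), bdb->DefaultColumnFamily()));
//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
//     // use it->key() and it->value()
//   }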
||||||
|
|
||||||
|
} // namespace blob_db
|
||||||
|
} // namespace rocksdb
|
||||||
|
#endif // ROCKSDB_LITE
|
66	utilities/blob_db/blob_db_options_impl.cc (new file)
@@ -0,0 +1,66 @@
|
|||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
|
||||||
|
#include "utilities/blob_db/blob_db_options_impl.h"
|
||||||
|
|
||||||
|
namespace rocksdb {
|
||||||
|
|
||||||
|
namespace blob_db {
|
||||||
|
|
||||||
|
BlobDBOptionsImpl::BlobDBOptionsImpl(const BlobDBOptions& in)
|
||||||
|
: BlobDBOptions(in),
|
||||||
|
deletion_check_period_millisecs(2 * 1000),
|
||||||
|
gc_file_pct(20),
|
||||||
|
gc_check_period_millisecs(60 * 1000),
|
||||||
|
sanity_check_period_millisecs(20 * 60 * 1000),
|
||||||
|
open_files_trigger(100),
|
||||||
|
wa_num_stats_periods(24),
|
||||||
|
wa_stats_period_millisecs(3600 * 1000),
|
||||||
|
partial_expiration_gc_range_secs(4 * 3600),
|
||||||
|
partial_expiration_pct(75),
|
||||||
|
fsync_files_period_millisecs(10 * 1000),
|
||||||
|
reclaim_of_period_millisecs(1 * 1000),
|
||||||
|
delete_obsf_period_millisecs(10 * 1000),
|
||||||
|
check_seqf_period_millisecs(10 * 1000) {}
|
||||||
|
|
||||||
|
BlobDBOptionsImpl::BlobDBOptionsImpl()
|
||||||
|
: deletion_check_period_millisecs(2 * 1000),
|
||||||
|
gc_file_pct(20),
|
||||||
|
gc_check_period_millisecs(60 * 1000),
|
||||||
|
sanity_check_period_millisecs(20 * 60 * 1000),
|
||||||
|
open_files_trigger(100),
|
||||||
|
wa_num_stats_periods(24),
|
||||||
|
wa_stats_period_millisecs(3600 * 1000),
|
||||||
|
partial_expiration_gc_range_secs(4 * 3600),
|
||||||
|
partial_expiration_pct(75),
|
||||||
|
fsync_files_period_millisecs(10 * 1000),
|
||||||
|
reclaim_of_period_millisecs(1 * 1000),
|
||||||
|
delete_obsf_period_millisecs(10 * 1000),
|
||||||
|
check_seqf_period_millisecs(10 * 1000) {}
|
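// The defaults above can be overridden field by field before opening the DB,
// which is how the tests later in this diff exercise the GC machinery.
// For example (illustrative values only):
//
//   rocksdb::blob_db::BlobDBOptionsImpl bdb_options;
//   bdb_options.gc_check_period_millisecs = 20 * 1000;  // run GC every 20s
//   bdb_options.partial_expiration_pct = 75;            // GC at 75% dead data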
||||||
|
|
||||||
|
BlobDBOptionsImpl& BlobDBOptionsImpl::operator=(const BlobDBOptionsImpl& in) {
|
||||||
|
BlobDBOptions::operator=(in);
|
||||||
|
if (this != &in) {
|
||||||
|
deletion_check_period_millisecs = in.deletion_check_period_millisecs;
|
||||||
|
gc_file_pct = in.gc_file_pct;
|
||||||
|
gc_check_period_millisecs = in.gc_check_period_millisecs;
|
||||||
|
sanity_check_period_millisecs = in.sanity_check_period_millisecs;
|
||||||
|
open_files_trigger = in.open_files_trigger;
|
||||||
|
wa_num_stats_periods = in.wa_num_stats_periods;
|
||||||
|
wa_stats_period_millisecs = in.wa_stats_period_millisecs;
|
||||||
|
partial_expiration_gc_range_secs = in.partial_expiration_gc_range_secs;
|
||||||
|
partial_expiration_pct = in.partial_expiration_pct;
|
||||||
|
fsync_files_period_millisecs = in.fsync_files_period_millisecs;
|
||||||
|
reclaim_of_period_millisecs = in.reclaim_of_period_millisecs;
|
||||||
|
delete_obsf_period_millisecs = in.delete_obsf_period_millisecs;
|
||||||
|
check_seqf_period_millisecs = in.check_seqf_period_millisecs;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace blob_db
|
||||||
|
} // namespace rocksdb
|
||||||
|
#endif // ROCKSDB_LITE
|
73	utilities/blob_db/blob_db_options_impl.h (new file)
@@ -0,0 +1,73 @@
|
|||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
|
||||||
|
#include "utilities/blob_db/blob_db.h"
|
||||||
|
|
||||||
|
namespace rocksdb {
|
||||||
|
|
||||||
|
namespace blob_db {
|
||||||
|
|
||||||
|
struct BlobDBOptionsImpl : public BlobDBOptions {
|
||||||
|
// deletions check period
|
||||||
|
uint32_t deletion_check_period_millisecs;
|
||||||
|
|
||||||
|
// gc percentage each check period
|
||||||
|
uint32_t gc_file_pct;
|
||||||
|
|
||||||
|
// gc period
|
||||||
|
uint32_t gc_check_period_millisecs;
|
||||||
|
|
||||||
|
// sanity check task
|
||||||
|
uint32_t sanity_check_period_millisecs;
|
||||||
|
|
||||||
|
// how many random access open files can we tolerate
|
||||||
|
uint32_t open_files_trigger;
|
||||||
|
|
||||||
|
// how many periods of stats do we keep.
|
||||||
|
uint32_t wa_num_stats_periods;
|
||||||
|
|
||||||
|
// what is the length of any period
|
||||||
|
uint32_t wa_stats_period_millisecs;
|
||||||
|
|
||||||
|
// we will garbage collect blob files whose entire
// contents have expired. However, if the
// ttl_range of a file is very large, say a day, we
// would have to wait for the entire day before we
// recover most of the space.
|
||||||
|
uint32_t partial_expiration_gc_range_secs;
|
||||||
|
|
||||||
|
// this should be based on allowed write amplification:
// if 50% of the space of a blob file has been deleted/expired,
// the file becomes a candidate for garbage collection
|
||||||
|
uint32_t partial_expiration_pct;
|
||||||
|
|
||||||
|
// how often should we schedule a job to fsync open files
|
||||||
|
uint32_t fsync_files_period_millisecs;
|
||||||
|
|
||||||
|
// how often to schedule reclaim open files.
|
||||||
|
uint32_t reclaim_of_period_millisecs;
|
||||||
|
|
||||||
|
// how often to schedule the delete-obsolete-files task
|
||||||
|
uint32_t delete_obsf_period_millisecs;
|
||||||
|
|
||||||
|
// how often to schedule the check-sequential-files task
|
||||||
|
uint32_t check_seqf_period_millisecs;
|
||||||
|
|
||||||
|
// default constructor
|
||||||
|
BlobDBOptionsImpl();
|
||||||
|
|
||||||
|
explicit BlobDBOptionsImpl(const BlobDBOptions& in);
|
||||||
|
|
||||||
|
BlobDBOptionsImpl& operator=(const BlobDBOptionsImpl& in);
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace blob_db
|
||||||
|
|
||||||
|
} // namespace rocksdb
|
||||||
|
|
||||||
|
#endif  // ROCKSDB_LITE
|
utilities/blob_db/blob_db_test.cc
@@ -1,66 +1,567 @@
|
|||||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
// Use of this source code is governed by a BSD-style license that can be
|
// This source code is licensed under the BSD-style license found in the
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
//
|
||||||
#ifndef ROCKSDB_LITE
|
#ifndef ROCKSDB_LITE
|
||||||
|
|
||||||
#include "utilities/blob_db/blob_db.h"
|
#include "utilities/blob_db/blob_db.h"
|
||||||
|
#include <cstdlib>
|
||||||
|
#include "db/db_test_util.h"
|
||||||
#include "util/random.h"
|
#include "util/random.h"
|
||||||
#include "util/testharness.h"
|
#include "util/testharness.h"
|
||||||
#include "util/testutil.h"
|
#include "util/testutil.h"
|
||||||
|
#include "utilities/blob_db/blob_db_options_impl.h"
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
class BlobDBTest : public testing::Test {
|
|
||||||
public:
|
namespace blob_db {
|
||||||
BlobDBTest() {
|
Random s_rnd(301);
|
||||||
dbname_ = test::TmpDir() + "/blob_db_test";
|
|
||||||
Options options;
|
void gen_random(char *s, const int len) {
|
||||||
options.create_if_missing = true;
|
static const char alphanum[] =
|
||||||
EXPECT_TRUE(NewBlobDB(options, dbname_, &db_).ok());
|
"0123456789"
|
||||||
|
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||||
|
"abcdefghijklmnopqrstuvwxyz";
|
||||||
|
|
||||||
|
for (int i = 0; i < len; ++i) {
|
||||||
|
s[i] = alphanum[s_rnd.Next() % (sizeof(alphanum) - 1)];
|
||||||
}
|
}
|
||||||
|
|
||||||
~BlobDBTest() { delete db_; }
|
s[len] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
DB* db_;
|
class BlobDBTest : public testing::Test {
|
||||||
|
public:
|
||||||
|
BlobDBTest() : blobdb_(nullptr) {
|
||||||
|
dbname_ = test::TmpDir() + "/blob_db_test";
|
||||||
|
// Reopen1(BlobDBOptionsImpl());
|
||||||
|
}
|
||||||
|
|
||||||
|
~BlobDBTest() {
|
||||||
|
if (blobdb_) {
|
||||||
|
delete blobdb_;
|
||||||
|
blobdb_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Reopen1(const BlobDBOptionsImpl &bdboptions,
|
||||||
|
const Options &options = Options()) {
|
||||||
|
if (blobdb_) {
|
||||||
|
delete blobdb_;
|
||||||
|
blobdb_ = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
BlobDBOptionsImpl bblobdb_options = bdboptions;
|
||||||
|
Options myoptions = options;
|
||||||
|
BlobDB::DestroyBlobDB(dbname_, myoptions, bblobdb_options);
|
||||||
|
|
||||||
|
DestroyDB(dbname_, myoptions);
|
||||||
|
|
||||||
|
myoptions.create_if_missing = true;
|
||||||
|
EXPECT_TRUE(
|
||||||
|
BlobDB::Open(myoptions, bblobdb_options, dbname_, &blobdb_).ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
void insert_blobs() {
|
||||||
|
WriteOptions wo;
|
||||||
|
ReadOptions ro;
|
||||||
|
std::string value;
|
||||||
|
|
||||||
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
|
Random rnd(301);
|
||||||
|
for (size_t i = 0; i < 100000; i++) {
|
||||||
|
int len = rnd.Next() % 16384;
|
||||||
|
if (!len) continue;
|
||||||
|
|
||||||
|
char *val = new char[len + 1];
|
||||||
|
gen_random(val, len);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i % 500);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + 1);
|
||||||
|
|
||||||
|
int ttl = rnd.Next() % 86400;
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, keyslice, valslice, ttl));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 10; i++) {
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i % 500);
|
||||||
|
Slice keyslice(key);
|
||||||
|
blobdb_->Delete(wo, dcfh, keyslice);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BlobDB *blobdb_;
|
||||||
std::string dbname_;
|
std::string dbname_;
|
||||||
}; // class BlobDBTest
|
}; // class BlobDBTest
|
||||||
|
|
||||||
TEST_F(BlobDBTest, Basic) {
|
TEST_F(BlobDBTest, DeleteComplex) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
bdboptions.partial_expiration_pct = 75;
|
||||||
|
bdboptions.gc_check_period_millisecs = 20 * 1000;
|
||||||
|
bdboptions.blob_file_size = 219 * 1024;
|
||||||
|
|
||||||
|
Reopen1(bdboptions);
|
||||||
|
|
||||||
WriteOptions wo;
|
WriteOptions wo;
|
||||||
ReadOptions ro;
|
ReadOptions ro;
|
||||||
std::string value;
|
std::string value;
|
||||||
|
|
||||||
ASSERT_OK(db_->Put(wo, "foo", "v1"));
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
ASSERT_OK(db_->Put(wo, "bar", "v2"));
|
|
||||||
|
|
||||||
ASSERT_OK(db_->Get(ro, "foo", &value));
|
Random rnd(301);
|
||||||
ASSERT_EQ("v1", value);
|
for (size_t i = 0; i < 100; i++) {
|
||||||
ASSERT_OK(db_->Get(ro, "bar", &value));
|
int len = rnd.Next() % 16384;
|
||||||
ASSERT_EQ("v2", value);
|
if (!len) continue;
|
||||||
|
|
||||||
|
char *val = new char[len + 1];
|
||||||
|
gen_random(val, len);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + 1);
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 99; i++) {
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
blobdb_->Delete(wo, dcfh, keyslice);
|
||||||
|
}
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(60 * 1000 * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(BlobDBTest, OverrideTest) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
bdboptions.ttl_range_secs = 30;
|
||||||
|
bdboptions.gc_file_pct = 100;
|
||||||
|
bdboptions.gc_check_period_millisecs = 20 * 1000;
|
||||||
|
bdboptions.num_concurrent_simple_blobs = 2;
|
||||||
|
bdboptions.blob_file_size = 876 * 1024 * 10;
|
||||||
|
|
||||||
|
Options options;
|
||||||
|
options.write_buffer_size = 256 * 1024;
|
||||||
|
options.info_log_level = INFO_LEVEL;
|
||||||
|
|
||||||
|
Reopen1(bdboptions, options);
|
||||||
|
|
||||||
|
WriteOptions wo;
|
||||||
|
ReadOptions ro;
|
||||||
|
std::string value;
|
||||||
|
|
||||||
|
Random rnd(301);
|
||||||
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
int len = rnd.Next() % 16384;
|
||||||
|
if (!len) continue;
|
||||||
|
|
||||||
|
char *val = new char[len + 1];
|
||||||
|
gen_random(val, len);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
char x[10];
|
||||||
|
std::sprintf(x, "%04d", i);
|
||||||
|
key += std::string(x);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + 1);
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
// override all the keys
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
int len = rnd.Next() % 16384;
|
||||||
|
if (!len) continue;
|
||||||
|
|
||||||
|
char *val = new char[len + 1];
|
||||||
|
gen_random(val, len);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
char x[10];
|
||||||
|
std::sprintf(x, "%04d", i);
|
||||||
|
key += std::string(x);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + 1);
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
blobdb_->Flush(FlushOptions());
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
blobdb_->GetBaseDB()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
|
||||||
|
reinterpret_cast<DBImpl *>(blobdb_->GetBaseDB())->TEST_WaitForFlushMemTable();
|
||||||
|
reinterpret_cast<DBImpl *>(blobdb_->GetBaseDB())->TEST_WaitForCompact();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(BlobDBTest, DeleteTest) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
bdboptions.ttl_range_secs = 30;
|
||||||
|
bdboptions.gc_file_pct = 100;
|
||||||
|
bdboptions.partial_expiration_pct = 18;
|
||||||
|
bdboptions.gc_check_period_millisecs = 20 * 1000;
|
||||||
|
bdboptions.num_concurrent_simple_blobs = 1;
|
||||||
|
bdboptions.blob_file_size = 876 * 1024;
|
||||||
|
|
||||||
|
Reopen1(bdboptions);
|
||||||
|
|
||||||
|
WriteOptions wo;
|
||||||
|
ReadOptions ro;
|
||||||
|
std::string value;
|
||||||
|
|
||||||
|
Random rnd(301);
|
||||||
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 100; i++) {
|
||||||
|
int len = rnd.Next() % 16384;
|
||||||
|
if (!len) continue;
|
||||||
|
|
||||||
|
char *val = new char[len + 1];
|
||||||
|
gen_random(val, len);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + 1);
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 100; i += 5) {
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
blobdb_->Delete(wo, dcfh, keyslice);
|
||||||
|
}
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(60 * 1000 * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(BlobDBTest, GCTestWithWrite) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
bdboptions.ttl_range_secs = 30;
|
||||||
|
bdboptions.gc_file_pct = 100;
|
||||||
|
bdboptions.gc_check_period_millisecs = 20 * 1000;
|
||||||
|
bdboptions.default_ttl_extractor = true;
|
||||||
|
|
||||||
|
Reopen1(bdboptions);
|
||||||
|
|
||||||
|
WriteOptions wo;
|
||||||
|
ReadOptions ro;
|
||||||
|
std::string value;
|
||||||
|
|
||||||
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
|
WriteBatch WB;
|
||||||
|
|
||||||
|
Random rnd(301);
|
||||||
|
for (size_t i = 0; i < 100; i++) {
|
||||||
|
int len = rnd.Next() % 16384;
|
||||||
|
if (!len) continue;
|
||||||
|
|
||||||
|
int ttl = 30;
|
||||||
|
|
||||||
|
char *val = new char[len + BlobDB::kTTLSuffixLength];
|
||||||
|
gen_random(val, len);
|
||||||
|
strncpy(val + len, "ttl:", 4);
|
||||||
|
EncodeFixed32(val + len + 4, ttl);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + BlobDB::kTTLSuffixLength);
|
||||||
|
|
||||||
|
WB.Put(dcfh, keyslice, valslice);
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->Write(wo, &WB));
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
|
||||||
|
}
|
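// The TTL-extractor tests above rely on the default extractor's value layout:
// the raw payload followed by a 4-byte "ttl:" tag and a fixed 32-bit TTL,
// which is what BlobDB::kTTLSuffixLength accounts for (4 + 4 bytes, judging
// from the test code). A small helper expressing the same encoding
// (illustrative, not part of the test file):
//
//   std::string MakeTTLValue(const rocksdb::Slice& payload, int32_t ttl) {
//     std::string val(payload.data(), payload.size());
//     val.append("ttl:", 4);
//     rocksdb::PutFixed32(&val, static_cast<uint32_t>(ttl));  // same bytes as EncodeFixed32
//     return val;
//   }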
||||||
|
|
||||||
|
void cb_evict(const ColumnFamilyHandle *cfh, const Slice &key,
|
||||||
|
const Slice &val) {
|
||||||
|
fprintf(stderr, "key evicted: %s\n", key.ToString().c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char *LONG_STRING =
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFJFJFJTWFNLLFKFFMFMFMFMFMFMFMFMFMFMFMFMFMMF "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJAJFJFJFJFJTWBFNMFLLWMFMFMFMWKWMFMFMFMFMFMFM "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
|
||||||
|
"AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH ";
|
||||||
|
|
||||||
|
TEST_F(BlobDBTest, GetWithCompression) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
bdboptions.gc_file_pct = 100;
|
||||||
|
bdboptions.gc_check_period_millisecs = 20 * 1000;
|
||||||
|
bdboptions.default_ttl_extractor = true;
|
||||||
|
bdboptions.gc_evict_cb_fn = &cb_evict;
|
||||||
|
bdboptions.compression = CompressionType::kLZ4Compression;
|
||||||
|
|
||||||
|
Reopen1(bdboptions);
|
||||||
|
|
||||||
|
WriteOptions wo;
|
||||||
|
ReadOptions ro;
|
||||||
|
std::string value;
|
||||||
|
Random rnd(301);
|
||||||
|
|
||||||
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
|
std::string orig(LONG_STRING);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 10000; i++) {
|
||||||
|
int len = orig.length();
|
||||||
|
int ttl = 3000 * (rnd.Next() % 10);
|
||||||
|
|
||||||
|
char *val = new char[len + BlobDB::kTTLSuffixLength];
|
||||||
|
strncpy(val, LONG_STRING, len);
|
||||||
|
strncpy(val + len, "ttl:", 4);
|
||||||
|
EncodeFixed32(val + len + 4, ttl);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + BlobDB::kTTLSuffixLength);
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 10000; i++) {
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
std::string val;
|
||||||
|
Status s = blobdb_->Get(ro, dcfh, keyslice, &val);
|
||||||
|
ASSERT_TRUE(orig == val);
|
||||||
|
}
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(BlobDBTest, GCTestWithPutAndCompression) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
bdboptions.ttl_range_secs = 30;
|
||||||
|
bdboptions.gc_file_pct = 100;
|
||||||
|
bdboptions.gc_check_period_millisecs = 20 * 1000;
|
||||||
|
bdboptions.default_ttl_extractor = true;
|
||||||
|
bdboptions.gc_evict_cb_fn = &cb_evict;
|
||||||
|
bdboptions.compression = CompressionType::kLZ4Compression;
|
||||||
|
|
||||||
|
Reopen1(bdboptions);
|
||||||
|
|
||||||
|
WriteOptions wo;
|
||||||
|
ReadOptions ro;
|
||||||
|
std::string value;
|
||||||
|
Random rnd(301);
|
||||||
|
|
||||||
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 100; i++) {
|
||||||
|
int len = rnd.Next() % 16384;
|
||||||
|
if (!len) continue;
|
||||||
|
|
||||||
|
int ttl = 30;
|
||||||
|
|
||||||
|
char *val = new char[len + BlobDB::kTTLSuffixLength];
|
||||||
|
gen_random(val, len);
|
||||||
|
strncpy(val + len, "ttl:", 4);
|
||||||
|
EncodeFixed32(val + len + 4, ttl);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + BlobDB::kTTLSuffixLength);
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(BlobDBTest, GCTestWithPut) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
bdboptions.ttl_range_secs = 30;
|
||||||
|
bdboptions.gc_file_pct = 100;
|
||||||
|
bdboptions.gc_check_period_millisecs = 20 * 1000;
|
||||||
|
bdboptions.default_ttl_extractor = true;
|
||||||
|
bdboptions.gc_evict_cb_fn = &cb_evict;
|
||||||
|
|
||||||
|
Reopen1(bdboptions);
|
||||||
|
|
||||||
|
WriteOptions wo;
|
||||||
|
ReadOptions ro;
|
||||||
|
std::string value;
|
||||||
|
Random rnd(301);
|
||||||
|
|
||||||
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 100; i++) {
|
||||||
|
int len = rnd.Next() % 16384;
|
||||||
|
if (!len) continue;
|
||||||
|
|
||||||
|
int ttl = 30;
|
||||||
|
|
||||||
|
char *val = new char[len + BlobDB::kTTLSuffixLength];
|
||||||
|
gen_random(val, len);
|
||||||
|
strncpy(val + len, "ttl:", 4);
|
||||||
|
EncodeFixed32(val + len + 4, ttl);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + BlobDB::kTTLSuffixLength);
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(BlobDBTest, GCTest) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
bdboptions.ttl_range_secs = 30;
|
||||||
|
bdboptions.gc_file_pct = 100;
|
||||||
|
|
||||||
|
Reopen1(bdboptions);
|
||||||
|
|
||||||
|
WriteOptions wo;
|
||||||
|
ReadOptions ro;
|
||||||
|
std::string value;
|
||||||
|
Random rnd(301);
|
||||||
|
|
||||||
|
ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < 100; i++) {
|
||||||
|
int len = rnd.Next() % 16384;
|
||||||
|
if (!len) continue;
|
||||||
|
|
||||||
|
char *val = new char[len + 1];
|
||||||
|
gen_random(val, len);
|
||||||
|
|
||||||
|
std::string key("key");
|
||||||
|
key += std::to_string(i);
|
||||||
|
|
||||||
|
Slice keyslice(key);
|
||||||
|
Slice valslice(val, len + 1);
|
||||||
|
|
||||||
|
int ttl = 30;
|
||||||
|
|
||||||
|
ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, keyslice, valslice, ttl));
|
||||||
|
delete[] val;
|
||||||
|
}
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(240 * 1000 * 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(BlobDBTest, DISABLED_MultipleWriters) {
|
||||||
|
BlobDBOptionsImpl bdboptions;
|
||||||
|
Reopen1(bdboptions);
|
||||||
|
|
||||||
|
ASSERT_TRUE(blobdb_ != nullptr);
|
||||||
|
|
||||||
|
std::vector<std::thread> workers;
|
||||||
|
for (size_t ii = 0; ii < 10; ii++)
|
||||||
|
workers.push_back(std::thread(&BlobDBTest::insert_blobs, this));
|
||||||
|
|
||||||
|
for (std::thread &t : workers) {
|
||||||
|
if (t.joinable()) {
|
||||||
|
t.join();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Env::Default()->SleepForMicroseconds(180 * 1000 * 1000);
|
||||||
|
// ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "bar", "v2", 60));
|
||||||
|
// ASSERT_OK(blobdb_->Get(ro, dcfh, "foo", &value));
|
||||||
|
// ASSERT_EQ("v1", value);
|
||||||
|
// ASSERT_OK(blobdb_->Get(ro, dcfh, "bar", &value));
|
||||||
|
// ASSERT_EQ("v2", value);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
TEST_F(BlobDBTest, Large) {
|
TEST_F(BlobDBTest, Large) {
|
||||||
|
ASSERT_TRUE(blobdb_ != nullptr);
|
||||||
|
|
||||||
WriteOptions wo;
|
WriteOptions wo;
|
||||||
ReadOptions ro;
|
ReadOptions ro;
|
||||||
std::string value1, value2, value3;
|
std::string value1, value2, value3;
|
||||||
Random rnd(301);
|
Random rnd(301);
|
||||||
|
ColumnFamilyHandle* dcfh = blobdb_->DefaultColumnFamily();
|
||||||
|
|
||||||
value1.assign(8999, '1');
|
value1.assign(8999, '1');
|
||||||
ASSERT_OK(db_->Put(wo, "foo", value1));
|
ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "foo", value1, 3600));
|
||||||
value2.assign(9001, '2');
|
value2.assign(9001, '2');
|
||||||
ASSERT_OK(db_->Put(wo, "bar", value2));
|
ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "bar", value2, 3600));
|
||||||
test::RandomString(&rnd, 13333, &value3);
|
test::RandomString(&rnd, 13333, &value3);
|
||||||
ASSERT_OK(db_->Put(wo, "barfoo", value3));
|
ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "barfoo", value3, 3600));
|
||||||
|
|
||||||
std::string value;
|
std::string value;
|
||||||
ASSERT_OK(db_->Get(ro, "foo", &value));
|
ASSERT_OK(blobdb_->Get(ro, dcfh, "foo", &value));
|
||||||
ASSERT_EQ(value1, value);
|
ASSERT_EQ(value1, value);
|
||||||
ASSERT_OK(db_->Get(ro, "bar", &value));
|
ASSERT_OK(blobdb_->Get(ro, dcfh, "bar", &value));
|
||||||
ASSERT_EQ(value2, value);
|
ASSERT_EQ(value2, value);
|
||||||
ASSERT_OK(db_->Get(ro, "barfoo", &value));
|
ASSERT_OK(blobdb_->Get(ro, dcfh, "barfoo", &value));
|
||||||
ASSERT_EQ(value3, value);
|
ASSERT_EQ(value3, value);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // namespace blob_db
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
// A black-box test for the ttl wrapper around rocksdb
|
// A black-box test for the ttl wrapper around rocksdb
|
||||||
|
225	utilities/blob_db/blob_file.cc (new file)
@@ -0,0 +1,225 @@
|
|||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
|
||||||
|
#include <chrono>
|
||||||
|
#include <cinttypes>
|
||||||
|
#include <memory>
|
||||||
|
#include "utilities/blob_db/blob_db_impl.h"
|
||||||
|
|
||||||
|
#include "util/filename.h"
|
||||||
|
|
||||||
|
namespace rocksdb {
|
||||||
|
|
||||||
|
namespace blob_db {
|
||||||
|
|
||||||
|
BlobFile::BlobFile()
|
||||||
|
: parent_(nullptr),
|
||||||
|
file_number_(0),
|
||||||
|
blob_count_(0),
|
||||||
|
gc_epoch_(-1),
|
||||||
|
file_size_(0),
|
||||||
|
deleted_count_(0),
|
||||||
|
deleted_size_(0),
|
||||||
|
closed_(false),
|
||||||
|
can_be_deleted_(false),
|
||||||
|
gc_once_after_open_(false),
|
||||||
|
ttl_range_(std::make_pair(0, 0)),
|
||||||
|
time_range_(std::make_pair(0, 0)),
|
||||||
|
sn_range_(std::make_pair(0, 0)),
|
||||||
|
last_access_(-1),
|
||||||
|
last_fsync_(0),
|
||||||
|
header_valid_(false) {}
|
||||||
|
|
||||||
|
BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn)
|
||||||
|
: parent_(p),
|
||||||
|
path_to_dir_(bdir),
|
||||||
|
file_number_(fn),
|
||||||
|
blob_count_(0),
|
||||||
|
gc_epoch_(-1),
|
||||||
|
file_size_(0),
|
||||||
|
deleted_count_(0),
|
||||||
|
deleted_size_(0),
|
||||||
|
closed_(false),
|
||||||
|
can_be_deleted_(false),
|
||||||
|
gc_once_after_open_(false),
|
||||||
|
ttl_range_(std::make_pair(0, 0)),
|
||||||
|
time_range_(std::make_pair(0, 0)),
|
||||||
|
sn_range_(std::make_pair(0, 0)),
|
||||||
|
last_access_(-1),
|
||||||
|
last_fsync_(0),
|
||||||
|
header_valid_(false) {}
|
||||||
|
|
||||||
|
BlobFile::~BlobFile() {
|
||||||
|
if (can_be_deleted_) {
|
||||||
|
std::string pn(PathName());
|
||||||
|
Status s = Env::Default()->DeleteFile(PathName());
|
||||||
|
if (!s.ok()) {
|
||||||
|
// Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
||||||
|
// "File could not be deleted %s", pn.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string BlobFile::PathName() const {
|
||||||
|
return BlobFileName(path_to_dir_, file_number_);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<Reader> BlobFile::OpenSequentialReader(
|
||||||
|
Env* env, const DBOptions& db_options,
|
||||||
|
const EnvOptions& env_options) const {
|
||||||
|
std::unique_ptr<SequentialFile> sfile;
|
||||||
|
Status s = env->NewSequentialFile(PathName(), &sfile, env_options);
|
||||||
|
if (!s.ok()) {
|
||||||
|
// report something here.
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<SequentialFileReader> sfile_reader;
|
||||||
|
sfile_reader.reset(new SequentialFileReader(std::move(sfile)));
|
||||||
|
|
||||||
|
std::shared_ptr<Reader> log_reader =
|
||||||
|
std::make_shared<Reader>(db_options.info_log, std::move(sfile_reader));
|
||||||
|
|
||||||
|
return log_reader;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string BlobFile::DumpState() const {
|
||||||
|
char str[1000];
|
||||||
|
std::snprintf(str, sizeof(str),
|
||||||
|
"path: %s fn: %" PRIu64 " blob_count: %" PRIu64
|
||||||
|
" gc_epoch: %" PRIu64 " file_size: %" PRIu64
|
||||||
|
" deleted_count: %" PRIu64 " deleted_size: %" PRIu64
|
||||||
|
" closed: %d can_be_deleted: %d ttl_range: (%d, %d)"
|
||||||
|
" sn_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d",
|
||||||
|
path_to_dir_.c_str(), file_number_, blob_count_.load(),
|
||||||
|
gc_epoch_.load(), file_size_.load(), deleted_count_,
|
||||||
|
deleted_size_, closed_.load(), can_be_deleted_.load(),
|
||||||
|
ttl_range_.first, ttl_range_.second, sn_range_.first,
|
||||||
|
sn_range_.second, (!!log_writer_), (!!ra_file_reader_));
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool BlobFile::NeedsFsync(bool hard, uint64_t bytes_per_sync) const {
|
||||||
|
assert(last_fsync_ <= file_size_);
|
||||||
|
return (hard) ? file_size_ > last_fsync_
|
||||||
|
: (file_size_ - last_fsync_) >= bytes_per_sync;
|
||||||
|
}
|
||||||
|
|
||||||
|
Status BlobFile::WriteFooterAndCloseLocked() {
|
||||||
|
Log(InfoLogLevel::INFO_LEVEL, parent_->db_options_.info_log,
|
||||||
|
"File is being closed after footer %s", PathName().c_str());
|
||||||
|
|
||||||
|
BlobLogFooter footer;
|
||||||
|
footer.blob_count_ = blob_count_;
|
||||||
|
if (HasTTL()) footer.set_ttl_range(ttl_range_);
|
||||||
|
|
||||||
|
footer.sn_range_ = sn_range_;
|
||||||
|
if (HasTimestamp()) footer.set_time_range(time_range_);
|
||||||
|
|
||||||
|
// this will close the file and reset the Writable File Pointer.
|
||||||
|
Status s = log_writer_->AppendFooter(footer);
|
||||||
|
if (s.ok()) {
|
||||||
|
closed_ = true;
|
||||||
|
file_size_ += BlobLogFooter::kFooterSize;
|
||||||
|
} else {
|
||||||
|
Log(InfoLogLevel::ERROR_LEVEL, parent_->db_options_.info_log,
|
||||||
|
"Failure to read Header for blob-file %s", PathName().c_str());
|
||||||
|
}
|
||||||
|
// delete the sequential writer
|
||||||
|
log_writer_.reset();
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
Status BlobFile::ReadFooter(BlobLogFooter* bf) {
|
||||||
|
if (file_size_ < (BlobLogHeader::kHeaderSize + BlobLogFooter::kFooterSize)) {
|
||||||
|
return Status::IOError("File does not have footer", PathName());
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t footer_offset = file_size_ - BlobLogFooter::kFooterSize;
|
||||||
|
// assume that ra_file_reader_ is valid before we enter this
|
||||||
|
assert(ra_file_reader_);
|
||||||
|
|
||||||
|
Slice result;
|
||||||
|
char scratch[BlobLogFooter::kFooterSize + 10];
|
||||||
|
Status s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kFooterSize,
|
||||||
|
&result, scratch);
|
||||||
|
if (!s.ok()) return s;
|
||||||
|
if (result.size() != BlobLogFooter::kFooterSize) {
|
||||||
|
// should not happen
|
||||||
|
return Status::IOError("EOF reached before footer");
|
||||||
|
}
|
||||||
|
|
||||||
|
s = bf->DecodeFrom(&result);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) {
|
||||||
|
if (footer.HasTTL() != header_.HasTTL()) {
|
||||||
|
return Status::Corruption("has_ttl mismatch");
|
||||||
|
}
|
||||||
|
if (footer.HasTimestamp() != header_.HasTimestamp()) {
|
||||||
|
return Status::Corruption("has_ts mismatch");
|
||||||
|
}
|
||||||
|
|
||||||
|
// assume that file has been fully fsync'd
|
||||||
|
last_fsync_.store(file_size_);
|
||||||
|
blob_count_ = footer.GetBlobCount();
|
||||||
|
ttl_range_ = footer.GetTTLRange();
|
||||||
|
time_range_ = footer.GetTimeRange();
|
||||||
|
sn_range_ = footer.GetSNRange();
|
||||||
|
closed_ = true;
|
||||||
|
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
void BlobFile::Fsync() {
|
||||||
|
if (log_writer_.get()) {
|
||||||
|
log_writer_->Sync();
|
||||||
|
last_fsync_.store(file_size_.load());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BlobFile::CloseRandomAccessLocked() {
|
||||||
|
ra_file_reader_.reset();
|
||||||
|
last_access_ = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<RandomAccessFileReader> BlobFile::GetOrOpenRandomAccessReader(
|
||||||
|
Env* env, const EnvOptions& env_options, bool* fresh_open) {
|
||||||
|
*fresh_open = false;
|
||||||
|
last_access_ =
|
||||||
|
std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
|
||||||
|
{
|
||||||
|
ReadLock lockbfile_r(&mutex_);
|
||||||
|
if (ra_file_reader_) return ra_file_reader_;
|
||||||
|
}
|
||||||
|
|
||||||
|
WriteLock lockbfile_w(&mutex_);
|
||||||
|
if (ra_file_reader_) return ra_file_reader_;
|
||||||
|
|
||||||
|
std::unique_ptr<RandomAccessFile> rfile;
|
||||||
|
Status s = env->NewRandomAccessFile(PathName(), &rfile, env_options);
|
||||||
|
if (!s.ok()) {
|
||||||
|
Log(InfoLogLevel::ERROR_LEVEL, parent_->db_options_.info_log,
|
||||||
|
"Failed to open blob file for random-read: %s status: '%s'"
|
||||||
|
" exists: '%s'",
|
||||||
|
PathName().c_str(), s.ToString().c_str(),
|
||||||
|
env->FileExists(PathName()).ToString().c_str());
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
ra_file_reader_ = std::make_shared<RandomAccessFileReader>(std::move(rfile));
|
||||||
|
*fresh_open = true;
|
||||||
|
return ra_file_reader_;
|
||||||
|
}
|
||||||
|
|
||||||
|
ColumnFamilyHandle* BlobFile::GetColumnFamily(DB* db) {
|
||||||
|
return db->DefaultColumnFamily();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace blob_db
|
||||||
|
} // namespace rocksdb
|
||||||
|
#endif // ROCKSDB_LITE
|
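The read-then-write locking in BlobFile::GetOrOpenRandomAccessReader() above is an open-once, double-checked idiom: the common already-open case takes only a shared (read) lock, and the exclusive (write) lock is taken, with the check repeated, only when the reader still has to be created. Below is a minimal standalone sketch of that idiom; it uses std::shared_mutex (C++17) instead of RocksDB's ReadLock/WriteLock wrappers, and the class and member names are hypothetical.

#include <memory>
#include <shared_mutex>  // C++17; illustration only, not the port classes RocksDB uses

// Sketch of the open-once idiom in BlobFile::GetOrOpenRandomAccessReader().
class LazyReader {
 public:
  std::shared_ptr<int> GetOrOpen() {
    {
      std::shared_lock<std::shared_mutex> rlock(mutex_);
      if (reader_) return reader_;  // fast path: already open, shared lock only
    }
    std::unique_lock<std::shared_mutex> wlock(mutex_);
    if (reader_) return reader_;    // re-check: another thread may have opened it
    reader_ = std::make_shared<int>(42);  // stands in for the file reader
    return reader_;
  }

 private:
  std::shared_mutex mutex_;
  std::shared_ptr<int> reader_;
};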
313
utilities/blob_db/blob_log_format.cc
Normal file
@ -0,0 +1,313 @@

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#ifndef ROCKSDB_LITE

#include "utilities/blob_db/blob_log_format.h"
#include "util/coding.h"
#include "util/crc32c.h"

namespace rocksdb {
namespace blob_db {

const uint32_t kMagicNumber = 2395959;
const uint32_t kVersion1 = 1;
const size_t kBlockSize = 32768;

BlobLogHeader::BlobLogHeader()
    : magic_number_(kMagicNumber), compression_(kNoCompression) {}

BlobLogHeader& BlobLogHeader::operator=(BlobLogHeader&& in) noexcept {
  if (this != &in) {
    magic_number_ = in.magic_number_;
    version_ = in.version_;
    ttl_guess_ = std::move(in.ttl_guess_);
    ts_guess_ = std::move(in.ts_guess_);
    compression_ = in.compression_;
  }
  return *this;
}

BlobLogFooter::BlobLogFooter() : magic_number_(kMagicNumber), blob_count_(0) {}

Status BlobLogFooter::DecodeFrom(Slice* input) {
  uint32_t val;
  if (!GetFixed32(input, &val)) {
    return Status::Corruption("Invalid Blob Footer: flags");
  }

  bool has_ttl = false;
  bool has_ts = false;
  val >>= 8;
  RecordSubType st = static_cast<RecordSubType>(val);
  switch (st) {
    case kRegularType:
      break;
    case kTTLType:
      has_ttl = true;
      break;
    case kTimestampType:
      has_ts = true;
      break;
    default:
      return Status::Corruption("Invalid Blob Footer: flags_val");
  }

  if (!GetFixed64(input, &blob_count_)) {
    return Status::Corruption("Invalid Blob Footer: blob_count");
  }

  ttlrange_t temp_ttl;
  if (!GetFixed32(input, &temp_ttl.first) ||
      !GetFixed32(input, &temp_ttl.second)) {
    return Status::Corruption("Invalid Blob Footer: ttl_range");
  }
  if (has_ttl) {
    ttl_range_.reset(new ttlrange_t(temp_ttl));
  }

  if (!GetFixed64(input, &sn_range_.first) ||
      !GetFixed64(input, &sn_range_.second)) {
    return Status::Corruption("Invalid Blob Footer: sn_range");
  }

  tsrange_t temp_ts;
  if (!GetFixed64(input, &temp_ts.first) ||
      !GetFixed64(input, &temp_ts.second)) {
    return Status::Corruption("Invalid Blob Footer: ts_range");
  }
  if (has_ts) ts_range_.reset(new tsrange_t(temp_ts));

  if (!GetFixed32(input, &magic_number_) || magic_number_ != kMagicNumber) {
    return Status::Corruption("Invalid Blob Footer: magic");
  }

  return Status::OK();
}

void BlobLogFooter::EncodeTo(std::string* dst) const {
  dst->reserve(kFooterSize);

  RecordType rt = kFullType;
  RecordSubType st = kRegularType;
  if (HasTTL()) {
    st = kTTLType;
  } else if (HasTimestamp()) {
    st = kTimestampType;
  }
  uint32_t val = static_cast<uint32_t>(rt) | (static_cast<uint32_t>(st) << 8);
  PutFixed32(dst, val);

  PutFixed64(dst, blob_count_);
  bool has_ttl = HasTTL();
  bool has_ts = HasTimestamp();

  if (has_ttl) {
    PutFixed32(dst, ttl_range_.get()->first);
    PutFixed32(dst, ttl_range_.get()->second);
  } else {
    PutFixed32(dst, 0);
    PutFixed32(dst, 0);
  }
  PutFixed64(dst, sn_range_.first);
  PutFixed64(dst, sn_range_.second);

  if (has_ts) {
    PutFixed64(dst, ts_range_.get()->first);
    PutFixed64(dst, ts_range_.get()->second);
  } else {
    PutFixed64(dst, 0);
    PutFixed64(dst, 0);
  }

  PutFixed32(dst, magic_number_);
}

void BlobLogHeader::EncodeTo(std::string* dst) const {
  dst->reserve(kHeaderSize);

  PutFixed32(dst, magic_number_);

  PutFixed32(dst, version_);

  RecordSubType st = kRegularType;
  bool has_ttl = HasTTL();
  bool has_ts = HasTimestamp();

  if (has_ttl) {
    st = kTTLType;
  } else if (has_ts) {
    st = kTimestampType;
  }
  uint32_t val =
      static_cast<uint32_t>(st) | (static_cast<uint32_t>(compression_) << 8);
  PutFixed32(dst, val);

  if (has_ttl) {
    PutFixed32(dst, ttl_guess_.get()->first);
    PutFixed32(dst, ttl_guess_.get()->second);
  } else {
    PutFixed32(dst, 0);
    PutFixed32(dst, 0);
  }

  if (has_ts) {
    PutFixed64(dst, ts_guess_.get()->first);
    PutFixed64(dst, ts_guess_.get()->second);
  } else {
    PutFixed64(dst, 0);
    PutFixed64(dst, 0);
  }
}

Status BlobLogHeader::DecodeFrom(Slice* input) {
  if (!GetFixed32(input, &magic_number_) || magic_number_ != kMagicNumber) {
    return Status::Corruption("Invalid Blob Log Header: magic");
  }

  // as of today, we only support 1 version
  if (!GetFixed32(input, &version_) || version_ != kVersion1) {
    return Status::Corruption("Invalid Blob Log Header: version");
  }

  uint32_t val;
  if (!GetFixed32(input, &val)) {
    return Status::Corruption("Invalid Blob Log Header: subtype");
  }

  bool has_ttl = false;
  bool has_ts = false;
  RecordSubType st = static_cast<RecordSubType>(val & 0xff);
  compression_ = static_cast<CompressionType>((val >> 8) & 0xff);
  switch (st) {
    case kRegularType:
      break;
    case kTTLType:
      has_ttl = true;
      break;
    case kTimestampType:
      has_ts = true;
      break;
    default:
      return Status::Corruption("Invalid Blob Log Header: subtype_2");
  }

  ttlrange_t temp_ttl;
  if (!GetFixed32(input, &temp_ttl.first) ||
      !GetFixed32(input, &temp_ttl.second)) {
    return Status::Corruption("Invalid Blob Log Header: ttl");
  }
  if (has_ttl) set_ttl_guess(temp_ttl);

  tsrange_t temp_ts;
  if (!GetFixed64(input, &temp_ts.first) ||
      !GetFixed64(input, &temp_ts.second)) {
    return Status::Corruption("Invalid Blob Log Header: timestamp");
  }
  if (has_ts) set_ts_guess(temp_ts);

  return Status::OK();
}

BlobLogRecord::BlobLogRecord()
    : checksum_(0),
      header_cksum_(0),
      key_size_(0),
      blob_size_(0),
      time_val_(0),
      ttl_val_(0),
      sn_(0),
      type_(0),
      subtype_(0) {}

BlobLogRecord::~BlobLogRecord() {}

void BlobLogRecord::ResizeKeyBuffer(size_t kbs) {
  if (kbs > key_buffer_.size()) {
    key_buffer_.resize(kbs);
  }
}

void BlobLogRecord::ResizeBlobBuffer(size_t bbs) {
  if (bbs > blob_buffer_.size()) {
    blob_buffer_.resize(bbs);
  }
}

void BlobLogRecord::Clear() {
  checksum_ = 0;
  header_cksum_ = 0;
  key_size_ = 0;
  blob_size_ = 0;
  time_val_ = 0;
  ttl_val_ = 0;
  sn_ = 0;
  type_ = subtype_ = 0;
  key_.clear();
  blob_.clear();
}

Status BlobLogRecord::DecodeHeaderFrom(const Slice& hdrslice) {
  Slice input = hdrslice;
  if (input.size() < kHeaderSize) {
    return Status::Corruption("Invalid Blob Record Header: size");
  }

  if (!GetFixed32(&input, &key_size_)) {
    return Status::Corruption("Invalid Blob Record Header: key_size");
  }
  if (!GetFixed64(&input, &blob_size_)) {
    return Status::Corruption("Invalid Blob Record Header: blob_size");
  }
  if (!GetFixed32(&input, &ttl_val_)) {
    return Status::Corruption("Invalid Blob Record Header: ttl_val");
  }
  if (!GetFixed64(&input, &time_val_)) {
    return Status::Corruption("Invalid Blob Record Header: time_val");
  }

  type_ = *(input.data());
  input.remove_prefix(1);
  subtype_ = *(input.data());
  input.remove_prefix(1);

  if (!GetFixed32(&input, &header_cksum_)) {
    return Status::Corruption("Invalid Blob Record Header: header_cksum");
  }
  if (!GetFixed32(&input, &checksum_)) {
    return Status::Corruption("Invalid Blob Record Header: checksum");
  }

  return Status::OK();
}

Status BlobLogRecord::DecodeFooterFrom(const Slice& footerslice) {
  Slice input = footerslice;
  if (input.size() < kFooterSize) {
    return Status::Corruption("Invalid Blob Record Footer: size");
  }

  uint32_t f_crc = crc32c::Extend(0, input.data(), 8);
  f_crc = crc32c::Mask(f_crc);

  if (!GetFixed64(&input, &sn_)) {
    return Status::Corruption("Invalid Blob Record Footer: sn");
  }

  if (!GetFixed32(&input, &footer_cksum_)) {
    return Status::Corruption("Invalid Blob Record Footer: cksum");
  }

  if (f_crc != footer_cksum_) {
    return Status::Corruption("Record Checksum mismatch: footer_cksum");
  }

  return Status::OK();
}

}  // namespace blob_db
}  // namespace rocksdb
#endif  // ROCKSDB_LITE
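As a small, self-contained illustration (not part of the diff) of the flag packing used by BlobLogFooter::EncodeTo() and DecodeFrom() above: the record type sits in the low byte of a fixed32 and the record sub-type in the next byte, so the decoder recovers the sub-type by shifting the low byte away. The enum values mirror blob_log_format.h (kFullType = 0, kTTLType = 1).

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustration of the footer flag word: type in the low byte, sub-type above it.
int main() {
  const uint32_t record_type = 0;  // kFullType
  const uint32_t sub_type = 1;     // kTTLType
  uint32_t packed = record_type | (sub_type << 8);

  // Decode side: the sub-type is recovered by shifting the low byte away.
  uint32_t decoded_sub_type = packed >> 8;
  assert(decoded_sub_type == sub_type);
  std::printf("packed=0x%04x sub_type=%u\n", static_cast<unsigned>(packed),
              static_cast<unsigned>(decoded_sub_type));
  return 0;
}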
226
utilities/blob_db/blob_log_format.h
Normal file
@ -0,0 +1,226 @@

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Log format information shared by reader and writer.

#pragma once

#ifndef ROCKSDB_LITE

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"

namespace rocksdb {

namespace blob_db {
class BlobFile;
class BlobDBImpl;

enum RecordType : uint8_t {
  // Zero is reserved for preallocated files
  kFullType = 0,

  // For fragments
  kFirstType = 1,
  kMiddleType = 2,
  kLastType = 3,
  kMaxRecordType = kLastType
};

enum RecordSubType : uint8_t {
  kRegularType = 0,
  kTTLType = 1,
  kTimestampType = 2,
};

extern const uint32_t kMagicNumber;

class Reader;

typedef std::pair<uint32_t, uint32_t> ttlrange_t;
typedef std::pair<uint64_t, uint64_t> tsrange_t;
typedef std::pair<rocksdb::SequenceNumber, rocksdb::SequenceNumber> snrange_t;

class BlobLogHeader {
  friend class BlobFile;
  friend class BlobDBImpl;

 private:
  uint32_t magic_number_ = 0;
  uint32_t version_ = 1;
  CompressionType compression_;
  std::unique_ptr<ttlrange_t> ttl_guess_;
  std::unique_ptr<tsrange_t> ts_guess_;

 private:
  void set_ttl_guess(const ttlrange_t& ttl) {
    ttl_guess_.reset(new ttlrange_t(ttl));
  }

  void set_version(uint32_t v) { version_ = v; }

  void set_ts_guess(const tsrange_t& ts) { ts_guess_.reset(new tsrange_t(ts)); }

 public:
  // magic number (4) + version (4) + flags (4) + ttl guess (4 + 4)
  // + timestamp range guess (8 + 8) = 36
  static const size_t kHeaderSize = 4 + 4 + 4 + 4 * 2 + 8 * 2;

  void EncodeTo(std::string* dst) const;

  Status DecodeFrom(Slice* input);

  BlobLogHeader();

  bool HasTTL() const { return !!ttl_guess_; }

  bool HasTimestamp() const { return !!ts_guess_; }

  BlobLogHeader& operator=(BlobLogHeader&& in) noexcept;
};

// Footer encapsulates the fixed information stored at the tail
// end of every blob log file.
class BlobLogFooter {
  friend class BlobFile;

 public:
  // Use this constructor when you plan to write out the footer using
  // EncodeTo(). Never use this constructor with DecodeFrom().
  BlobLogFooter();

  uint64_t magic_number() const { return magic_number_; }

  void EncodeTo(std::string* dst) const;

  Status DecodeFrom(Slice* input);

  // convert this object to a human readable form
  std::string ToString() const;

  // footer size = 4 bytes type/sub-type flags
  //               4 bytes magic number
  //               8 bytes blob count
  //               4, 4 - ttl range
  //               8, 8 - sn range
  //               8, 8 - ts range
  //             = 56
  static const size_t kFooterSize = 4 + 4 + 8 + (4 * 2) + (8 * 2) + (8 * 2);

  bool HasTTL() const { return !!ttl_range_; }

  bool HasTimestamp() const { return !!ts_range_; }

  uint64_t GetBlobCount() const { return blob_count_; }

  ttlrange_t GetTTLRange() const {
    if (ttl_range_) {
      return *ttl_range_;
    }
    return {0, 0};
  }

  tsrange_t GetTimeRange() const {
    if (ts_range_) {
      return *ts_range_;
    }
    return {0, 0};
  }

  const snrange_t& GetSNRange() const { return sn_range_; }

 private:
  uint32_t magic_number_ = 0;
  uint64_t blob_count_ = 0;

  std::unique_ptr<ttlrange_t> ttl_range_;
  std::unique_ptr<tsrange_t> ts_range_;
  snrange_t sn_range_;

 private:
  void set_ttl_range(const ttlrange_t& ttl) {
    ttl_range_.reset(new ttlrange_t(ttl));
  }
  void set_time_range(const tsrange_t& ts) {
    ts_range_.reset(new tsrange_t(ts));
  }
};

extern const size_t kBlockSize;

class BlobLogRecord {
  friend class Reader;

 private:
  // this might not be set.
  uint32_t checksum_;
  uint32_t header_cksum_;
  uint32_t key_size_;
  uint64_t blob_size_;
  uint64_t time_val_;
  uint32_t ttl_val_;
  SequenceNumber sn_;
  uint32_t footer_cksum_;
  char type_;
  char subtype_;
  Slice key_;
  Slice blob_;
  std::string key_buffer_;
  std::string blob_buffer_;

 private:
  void Clear();

  char* GetKeyBuffer() { return &(key_buffer_[0]); }

  char* GetBlobBuffer() { return &(blob_buffer_[0]); }

  void ResizeKeyBuffer(size_t kbs);

  void ResizeBlobBuffer(size_t bbs);

 public:
  // Record header is
  // key length (4 bytes), blob length (8 bytes), ttl (4 bytes),
  // timestamp (8 bytes), type (1 byte), subtype (1 byte),
  // header checksum (4 bytes), blob checksum (4 bytes)
  // = 34
  static const size_t kHeaderSize = 4 + 4 + 4 + 8 + 4 + 8 + 1 + 1;

  static const size_t kFooterSize = 8 + 4;

 public:
  BlobLogRecord();

  ~BlobLogRecord();

  const Slice& Key() const { return key_; }

  const Slice& Blob() const { return blob_; }

  uint32_t GetKeySize() const { return key_size_; }

  uint64_t GetBlobSize() const { return blob_size_; }

  uint32_t GetTTL() const { return ttl_val_; }

  uint64_t GetTimeVal() const { return time_val_; }

  SequenceNumber GetSN() const { return sn_; }

  Status DecodeHeaderFrom(const Slice& hdrslice);

  Status DecodeFooterFrom(const Slice& footerslice);
};

}  // namespace blob_db
}  // namespace rocksdb
#endif  // ROCKSDB_LITE
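The size constants above can be restated as compile-time checks. The fragment below is illustrative only, not part of the diff; it simply spells out the arithmetic behind BlobLogHeader::kHeaderSize, BlobLogFooter::kFooterSize and the two BlobLogRecord sizes.

// Illustrative only: compile-time restatement of the layout comments above.
static_assert(4 + 4 + 4 + 4 * 2 + 8 * 2 == 36,
              "file header: magic + version + flags + ttl guess + ts guess");
static_assert(4 + 4 + 8 + (4 * 2) + (8 * 2) + (8 * 2) == 56,
              "file footer: flags + magic + count + ttl + sn + ts ranges");
static_assert(4 + 4 + 4 + 8 + 4 + 8 + 1 + 1 == 34,
              "record header: key/blob sizes, ttl, time, type, subtype, crcs");
static_assert(8 + 4 == 12, "record footer: sequence number + crc");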
163
utilities/blob_db/blob_log_reader.cc
Normal file
@ -0,0 +1,163 @@

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#ifndef ROCKSDB_LITE

#include "utilities/blob_db/blob_log_reader.h"

#include <cstdio>
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/file_reader_writer.h"

namespace rocksdb {
namespace blob_db {

Reader::Reader(std::shared_ptr<Logger> info_log,
               unique_ptr<SequentialFileReader>&& _file)
    : info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) {
  backing_store_.resize(kBlockSize);
}

Reader::~Reader() {}

Status Reader::ReadHeader(BlobLogHeader* header) {
  assert(file_.get() != nullptr);
  assert(next_byte_ == 0);
  Status status =
      file_->Read(BlobLogHeader::kHeaderSize, &buffer_, GetReadBuffer());
  next_byte_ += buffer_.size();
  if (!status.ok()) return status;

  if (buffer_.size() != BlobLogHeader::kHeaderSize) {
    return Status::IOError("EOF reached before file header");
  }

  status = header->DecodeFrom(&buffer_);
  return status;
}

Status Reader::ReadRecord(BlobLogRecord* record, ReadLevel level,
                          WALRecoveryMode wal_recovery_mode) {
  record->Clear();
  buffer_.clear();
  backing_store_[0] = '\0';

  Status status =
      file_->Read(BlobLogRecord::kHeaderSize, &buffer_, GetReadBuffer());
  next_byte_ += buffer_.size();
  if (!status.ok()) return status;
  if (buffer_.size() != BlobLogRecord::kHeaderSize) {
    return Status::IOError("EOF reached before record header");
  }

  status = record->DecodeHeaderFrom(buffer_);
  if (!status.ok()) return status;

  uint32_t header_crc = 0;
  uint32_t blob_crc = 0;
  size_t crc_data_size = BlobLogRecord::kHeaderSize - 2 * sizeof(uint32_t);
  header_crc = crc32c::Extend(header_crc, buffer_.data(), crc_data_size);

  uint64_t kb_size = record->GetKeySize() + record->GetBlobSize();
  switch (level) {
    case kReadHdrFooter:
      file_->Skip(kb_size);
      next_byte_ += kb_size;
      status =
          file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer());
      next_byte_ += buffer_.size();
      if (!status.ok()) return status;
      if (buffer_.size() != BlobLogRecord::kFooterSize) {
        return Status::IOError("EOF reached before record footer");
      }

      status = record->DecodeFooterFrom(buffer_);
      return status;

    case kReadHdrKeyFooter:
      record->ResizeKeyBuffer(record->GetKeySize());
      status = file_->Read(record->GetKeySize(), &record->key_,
                           record->GetKeyBuffer());
      next_byte_ += record->key_.size();
      if (!status.ok()) return status;
      if (record->key_.size() != record->GetKeySize()) {
        return Status::IOError("EOF reached before key read");
      }

      header_crc =
          crc32c::Extend(header_crc, record->key_.data(), record->GetKeySize());
      header_crc = crc32c::Mask(header_crc);
      if (header_crc != record->header_cksum_) {
        return Status::Corruption("Record Checksum mismatch: header_cksum");
      }

      file_->Skip(record->GetBlobSize());
      next_byte_ += record->GetBlobSize();

      status =
          file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer());
      next_byte_ += buffer_.size();
      if (!status.ok()) return status;
      if (buffer_.size() != BlobLogRecord::kFooterSize) {
        return Status::IOError("EOF reached during footer read");
      }

      status = record->DecodeFooterFrom(buffer_);
      return status;

    case kReadHdrKeyBlobFooter:
      record->ResizeKeyBuffer(record->GetKeySize());
      status = file_->Read(record->GetKeySize(), &record->key_,
                           record->GetKeyBuffer());
      next_byte_ += record->key_.size();
      if (!status.ok()) return status;
      if (record->key_.size() != record->GetKeySize()) {
        return Status::IOError("EOF reached before key read");
      }

      header_crc =
          crc32c::Extend(header_crc, record->key_.data(), record->GetKeySize());
      header_crc = crc32c::Mask(header_crc);
      if (header_crc != record->header_cksum_) {
        return Status::Corruption("Record Checksum mismatch: header_cksum");
      }

      record->ResizeBlobBuffer(record->GetBlobSize());
      status = file_->Read(record->GetBlobSize(), &record->blob_,
                           record->GetBlobBuffer());
      next_byte_ += record->blob_.size();
      if (!status.ok()) return status;
      if (record->blob_.size() != record->GetBlobSize()) {
        return Status::IOError("EOF reached during blob read");
      }

      blob_crc =
          crc32c::Extend(blob_crc, record->blob_.data(), record->blob_.size());
      blob_crc = crc32c::Mask(blob_crc);
      if (blob_crc != record->checksum_) {
        return Status::Corruption("Blob Checksum mismatch");
      }

      status =
          file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer());
      next_byte_ += buffer_.size();
      if (!status.ok()) return status;
      if (buffer_.size() != BlobLogRecord::kFooterSize) {
        return Status::IOError("EOF reached during blob footer read");
      }

      status = record->DecodeFooterFrom(buffer_);
      return status;
    default:
      assert(0);
      return status;
  }
}

}  // namespace blob_db
}  // namespace rocksdb
#endif  // ROCKSDB_LITE
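As a sketch of the checksum handshake between Writer::ConstructBlobHeader() and Reader::ReadRecord() above: the writer stores a masked crc32c of the blob payload, and the reader recomputes it over the bytes it just read before comparing. The snippet below is illustrative, not part of the diff; it only assumes RocksDB's util/crc32c.h API (Extend and Mask).

#include <cassert>
#include <cstdint>
#include <cstring>

#include "util/crc32c.h"

// Writer side stores a masked crc of the value; reader side recomputes it.
int main() {
  const char* blob = "blob-payload";
  uint32_t stored = rocksdb::crc32c::Mask(
      rocksdb::crc32c::Extend(0, blob, std::strlen(blob)));

  // Reader side: recompute over the bytes that were read back and compare.
  uint32_t recomputed = rocksdb::crc32c::Mask(
      rocksdb::crc32c::Extend(0, blob, std::strlen(blob)));
  assert(recomputed == stored);  // a mismatch would surface as Status::Corruption
  return 0;
}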
93
utilities/blob_db/blob_log_reader.h
Normal file
@ -0,0 +1,93 @@

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once

#ifndef ROCKSDB_LITE

#include <cstdint>
#include <memory>
#include <string>

#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "utilities/blob_db/blob_log_format.h"

namespace rocksdb {

class SequentialFileReader;
class Logger;

namespace blob_db {

/**
 * Reader is a general purpose log stream reader implementation. The actual job
 * of reading from the device is implemented by the SequentialFile interface.
 *
 * Please see Writer for details on the file and record layout.
 */
class Reader {
 public:
  enum ReadLevel {
    kReadHdrFooter,
    kReadHdrKeyFooter,
    kReadHdrKeyBlobFooter,
  };

  // Create a reader that will return log records from "*file".
  // "*file" must remain live while this Reader is in use.
  Reader(std::shared_ptr<Logger> info_log,
         std::unique_ptr<SequentialFileReader>&& file);

  ~Reader();

  Status ReadHeader(BlobLogHeader* header);

  // Read the next record into *record. The contents filled in *record
  // will only be valid until the next mutating operation on this reader.
  Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHdrFooter,
                    WALRecoveryMode wal_recovery_mode =
                        WALRecoveryMode::kTolerateCorruptedTailRecords);

  SequentialFileReader* file() { return file_.get(); }

  void ResetNextByte() { next_byte_ = 0; }

  uint64_t GetNextByte() const { return next_byte_; }

 private:
  char* GetReadBuffer() { return &(backing_store_[0]); }

 private:
  std::shared_ptr<Logger> info_log_;
  const std::unique_ptr<SequentialFileReader> file_;

  std::string backing_store_;
  Slice buffer_;

  // which byte to read next. For asserting proper usage
  uint64_t next_byte_;

  // No copying allowed
  Reader(const Reader&) = delete;
  Reader& operator=(const Reader&) = delete;
};

}  // namespace blob_db
}  // namespace rocksdb
#endif  // ROCKSDB_LITE
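A minimal usage sketch (assumptions, not part of the diff) of the Reader declared above, mirroring how BlobFile::OpenSequentialReader() wires it up: open a SequentialFile, wrap it in a SequentialFileReader, read the file header once, then pull records. The function name, the file path argument and the choice of kReadHdrKeyBlobFooter are illustrative, and util/file_reader_writer.h is an internal header.

#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "util/file_reader_writer.h"
#include "utilities/blob_db/blob_log_format.h"
#include "utilities/blob_db/blob_log_reader.h"

// Hypothetical helper: scan every record of one blob log file.
rocksdb::Status DumpBlobFile(rocksdb::Env* env, const std::string& path) {
  std::unique_ptr<rocksdb::SequentialFile> file;
  rocksdb::Status s = env->NewSequentialFile(path, &file, rocksdb::EnvOptions());
  if (!s.ok()) return s;

  std::unique_ptr<rocksdb::SequentialFileReader> file_reader(
      new rocksdb::SequentialFileReader(std::move(file)));
  rocksdb::blob_db::Reader reader(nullptr /* info_log */,
                                  std::move(file_reader));

  rocksdb::blob_db::BlobLogHeader header;
  s = reader.ReadHeader(&header);
  if (!s.ok()) return s;

  rocksdb::blob_db::BlobLogRecord record;
  // Read key and blob payloads as well as the per-record footer; the loop
  // stops when ReadRecord reports EOF (a non-OK status).
  while (reader.ReadRecord(&record,
                           rocksdb::blob_db::Reader::kReadHdrKeyBlobFooter)
             .ok()) {
    // record.Key() / record.Blob() stay valid only until the next ReadRecord().
  }
  return rocksdb::Status::OK();
}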
172
utilities/blob_db/blob_log_writer.cc
Normal file
@ -0,0 +1,172 @@

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#ifndef ROCKSDB_LITE

#include "utilities/blob_db/blob_log_writer.h"

#include <cstdint>
#include <limits>
#include <string>
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/file_reader_writer.h"

namespace rocksdb {
namespace blob_db {

Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
               uint64_t bpsync, bool use_fs, uint64_t boffset)
    : dest_(std::move(dest)),
      log_number_(log_number),
      block_offset_(boffset),
      bytes_per_sync_(bpsync),
      next_sync_offset_(0),
      use_fsync_(use_fs),
      last_elem_type_(kEtNone) {
  for (int i = 0; i <= kMaxRecordType; i++) {
    char t = static_cast<char>(i);
    type_crc_[i] = crc32c::Value(&t, 1);
  }
}

Writer::~Writer() {}

void Writer::Sync() { dest_->Sync(use_fsync_); }

Status Writer::WriteHeader(const BlobLogHeader& header) {
  assert(block_offset_ == 0);
  assert(last_elem_type_ == kEtNone);
  std::string str;
  header.EncodeTo(&str);

  Status s = dest_->Append(Slice(str));
  if (s.ok()) {
    block_offset_ += str.size();
    s = dest_->Flush();
  }
  last_elem_type_ = kEtFileHdr;
  return s;
}

Status Writer::AppendFooter(const BlobLogFooter& footer) {
  assert(block_offset_ != 0);
  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);

  std::string str;
  footer.EncodeTo(&str);

  Status s = dest_->Append(Slice(str));
  if (s.ok()) {
    block_offset_ += str.size();
    s = dest_->Close();
    dest_.reset();
  }

  last_elem_type_ = kEtFileFooter;
  return s;
}

Status Writer::AddRecord(const Slice& key, const Slice& val,
                         uint64_t* key_offset, uint64_t* blob_offset,
                         uint32_t ttl) {
  assert(block_offset_ != 0);
  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);

  std::string buf;
  ConstructBlobHeader(&buf, key, val, ttl, -1);

  Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
  return s;
}

Status Writer::AddRecord(const Slice& key, const Slice& val,
                         uint64_t* key_offset, uint64_t* blob_offset) {
  assert(block_offset_ != 0);
  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);

  std::string buf;
  ConstructBlobHeader(&buf, key, val, -1, -1);

  Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
  return s;
}

void Writer::ConstructBlobHeader(std::string* headerbuf, const Slice& key,
                                 const Slice& val, int32_t ttl, int64_t ts) {
  headerbuf->reserve(BlobLogRecord::kHeaderSize);

  uint32_t key_size = static_cast<uint32_t>(key.size());
  PutFixed32(headerbuf, key_size);
  PutFixed64(headerbuf, val.size());

  uint32_t ttl_write = (ttl != -1) ? static_cast<uint32_t>(ttl)
                                   : std::numeric_limits<uint32_t>::max();
  PutFixed32(headerbuf, ttl_write);

  uint64_t ts_write = (ts != -1) ? static_cast<uint64_t>(ts)
                                 : std::numeric_limits<uint64_t>::max();
  PutFixed64(headerbuf, ts_write);

  RecordType t = kFullType;
  headerbuf->push_back(static_cast<char>(t));

  RecordSubType st = kRegularType;
  if (ttl != -1) st = kTTLType;
  headerbuf->push_back(static_cast<char>(st));

  uint32_t header_crc = 0;
  header_crc =
      crc32c::Extend(header_crc, headerbuf->c_str(), headerbuf->size());
  header_crc = crc32c::Extend(header_crc, key.data(), key.size());
  header_crc = crc32c::Mask(header_crc);
  PutFixed32(headerbuf, header_crc);

  uint32_t crc = 0;
  // Compute the crc of the record type and the payload.
  crc = crc32c::Extend(crc, val.data(), val.size());
  crc = crc32c::Mask(crc);  // Adjust for storage
  PutFixed32(headerbuf, crc);
}

Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
                                  const Slice& key, const Slice& val,
                                  uint64_t* key_offset, uint64_t* blob_offset) {
  Status s = dest_->Append(Slice(headerbuf));
  if (s.ok()) {
    s = dest_->Append(key);
    if (s.ok()) s = dest_->Append(val);
  }

  *key_offset = block_offset_ + BlobLogRecord::kHeaderSize;
  *blob_offset = *key_offset + key.size();
  block_offset_ = *blob_offset + val.size();
  last_elem_type_ = kEtRecord;
  return s;
}

Status Writer::AddRecordFooter(const SequenceNumber& seq) {
  assert(last_elem_type_ == kEtRecord);

  std::string buf;
  PutFixed64(&buf, seq);

  uint32_t footer_crc = crc32c::Extend(0, buf.c_str(), buf.size());
  footer_crc = crc32c::Mask(footer_crc);
  PutFixed32(&buf, footer_crc);

  Status s = dest_->Append(Slice(buf));
  block_offset_ += BlobLogRecord::kFooterSize;

  if (s.ok()) dest_->Flush();

  last_elem_type_ = kEtFooter;
  return s;
}

}  // namespace blob_db
}  // namespace rocksdb
#endif  // ROCKSDB_LITE
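A worked example (illustrative values, not from the diff) of the offset bookkeeping in Writer::EmitPhysicalRecord() above: key_offset points just past the 34-byte record header, blob_offset just past the key, and block_offset advances past the value.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t kRecordHeaderSize = 34;  // BlobLogRecord::kHeaderSize
  uint64_t block_offset = 36;             // just past the 36-byte file header
  const uint64_t key_size = 3;
  const uint64_t value_size = 10;

  uint64_t key_offset = block_offset + kRecordHeaderSize;  // 70
  uint64_t blob_offset = key_offset + key_size;            // 73
  block_offset = blob_offset + value_size;                 // 83
  assert(key_offset == 70 && blob_offset == 73 && block_offset == 83);
  return 0;
}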
98
utilities/blob_db/blob_log_writer.h
Normal file
@ -0,0 +1,98 @@

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once

#ifndef ROCKSDB_LITE

#include <cstdint>
#include <memory>
#include <string>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "utilities/blob_db/blob_log_format.h"

namespace rocksdb {

class WritableFileWriter;

namespace blob_db {

/**
 * Writer is the blob log stream writer. It provides an append-only
 * abstraction for writing blob data.
 *
 * Look at blob_log_format.h to see the details of the record formats.
 */
class Writer {
 public:
  // Create a writer that will append data to "*dest".
  // "*dest" must be initially empty.
  // "*dest" must remain live while this Writer is in use.
  explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
                  uint64_t log_number, uint64_t bpsync, bool use_fsync,
                  uint64_t boffset = 0);
  ~Writer();

  static void ConstructBlobHeader(std::string* headerbuf, const Slice& key,
                                  const Slice& val, int32_t ttl, int64_t ts);

  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
                   uint64_t* blob_offset);

  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
                   uint64_t* blob_offset, uint32_t ttl);

  Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
                            const Slice& val, uint64_t* key_offset,
                            uint64_t* blob_offset);

  Status AddRecordFooter(const SequenceNumber& sn);

  Status AppendFooter(const BlobLogFooter& footer);

  Status WriteHeader(const BlobLogHeader& header);

  WritableFileWriter* file() { return dest_.get(); }

  const WritableFileWriter* file() const { return dest_.get(); }

  uint64_t get_log_number() const { return log_number_; }

  bool ShouldSync() const { return block_offset_ > next_sync_offset_; }

  void Sync();

  void ResetSyncPointer() { next_sync_offset_ += bytes_per_sync_; }

 private:
  std::unique_ptr<WritableFileWriter> dest_;
  uint64_t log_number_;
  uint64_t block_offset_;  // Current offset in block
  uint64_t bytes_per_sync_;
  uint64_t next_sync_offset_;
  bool use_fsync_;

  // crc32c values for all supported record types. These are
  // pre-computed to reduce the overhead of computing the crc of the
  // record type stored in the header.
  uint32_t type_crc_[kMaxRecordType + 1];

  // No copying allowed
  Writer(const Writer&) = delete;
  Writer& operator=(const Writer&) = delete;

 public:
  enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFooter, kEtFileFooter };
  ElemType last_elem_type_;
};

}  // namespace blob_db
}  // namespace rocksdb
#endif  // ROCKSDB_LITE
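A minimal write-side sketch (assumptions, not part of the diff) of the call sequence the Writer above expects: WriteHeader() once, AddRecord() plus AddRecordFooter() per blob, then AppendFooter() to seal the file, which is what BlobFile::WriteFooterAndCloseLocked() does. The path, log number, sync settings and function name are illustrative, and util/file_reader_writer.h is an internal header.

#include <cstdint>
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "util/file_reader_writer.h"
#include "utilities/blob_db/blob_log_format.h"
#include "utilities/blob_db/blob_log_writer.h"

// Hypothetical helper: write one blob record into a fresh blob log file.
rocksdb::Status WriteOneBlob(rocksdb::Env* env, const std::string& path) {
  std::unique_ptr<rocksdb::WritableFile> file;
  rocksdb::Status s = env->NewWritableFile(path, &file, rocksdb::EnvOptions());
  if (!s.ok()) return s;

  std::unique_ptr<rocksdb::WritableFileWriter> file_writer(
      new rocksdb::WritableFileWriter(std::move(file), rocksdb::EnvOptions()));
  rocksdb::blob_db::Writer writer(std::move(file_writer), 1 /* log_number */,
                                  4096 /* bytes_per_sync */,
                                  false /* use_fsync */);

  rocksdb::blob_db::BlobLogHeader header;
  s = writer.WriteHeader(header);
  if (!s.ok()) return s;

  uint64_t key_offset = 0;
  uint64_t blob_offset = 0;
  s = writer.AddRecord("key1", "blob-payload", &key_offset, &blob_offset);
  if (!s.ok()) return s;
  s = writer.AddRecordFooter(1 /* sequence number */);
  if (!s.ok()) return s;

  rocksdb::blob_db::BlobLogFooter footer;
  return writer.AppendFooter(footer);  // also closes the underlying file
}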
@ -16,10 +16,14 @@ namespace rocksdb {

class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
 public:
  explicit OptimisticTransactionDBImpl(DB* db)
      : OptimisticTransactionDB(db), db_(db) {}
  explicit OptimisticTransactionDBImpl(DB* db, bool take_ownership = true)
      : OptimisticTransactionDB(db), db_(db), db_owner_(take_ownership) {}

  ~OptimisticTransactionDBImpl() {}
  ~OptimisticTransactionDBImpl() {
    if (!db_owner_) {
      db_.release();
    }
  }

  Transaction* BeginTransaction(const WriteOptions& write_options,
                                const OptimisticTransactionOptions& txn_options,
@ -29,6 +33,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {

 private:
  std::unique_ptr<DB> db_;
  bool db_owner_;

  void ReinitializeTransaction(Transaction* txn,
                               const WriteOptions& write_options,
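The new take_ownership flag in this hunk lets the wrapper hold a DB it does not own: with take_ownership == false the destructor calls db_.release() instead of letting the unique_ptr delete the DB. A hedged sketch of the intent follows; the include path is an assumption, and OptimisticTransactionDBImpl is an internal class, so this is illustrative rather than public API usage.

#include "utilities/transactions/optimistic_transaction_db_impl.h"  // assumed path

// Wrap an externally owned DB without transferring ownership of it.
void WrapWithoutOwnership(rocksdb::DB* base) {
  rocksdb::OptimisticTransactionDBImpl wrapper(base,
                                               false /* take_ownership */);
  // ... wrapper.BeginTransaction(...) could be used here ...
}  // wrapper's destructor releases, rather than deletes, `base`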