2014-01-24 23:30:28 +01:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
#include "db/column_family.h"
|
2014-01-24 23:30:28 +01:00
|
|
|
|
|
|
|
#include <vector>
|
|
|
|
#include <string>
|
|
|
|
#include <algorithm>
|
2014-05-21 20:43:35 +02:00
|
|
|
#include <limits>
|
2014-01-24 23:30:28 +01:00
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
#include "db/db_impl.h"
|
2014-01-22 20:44:53 +01:00
|
|
|
#include "db/version_set.h"
|
2014-02-05 02:45:19 +01:00
|
|
|
#include "db/internal_stats.h"
|
2014-02-01 00:30:27 +01:00
|
|
|
#include "db/compaction_picker.h"
|
2014-02-05 01:31:18 +01:00
|
|
|
#include "db/table_properties_collector.h"
|
2014-02-07 00:42:16 +01:00
|
|
|
#include "util/autovector.h"
|
2014-02-05 01:31:18 +01:00
|
|
|
#include "util/hash_skiplist_rep.h"
|
2014-01-22 20:44:53 +01:00
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
// Builds a handle around `cfd`, remembering the owning `db` and the `mutex`
// the destructor locks when releasing the column family.
// A null `cfd` is accepted; in that case no reference is taken.
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
                                               DBImpl* db, port::Mutex* mutex)
    : cfd_(cfd), db_(db), mutex_(mutex) {
  if (cfd_ == nullptr) {
    return;
  }
  // The handle holds one reference on the column family data.
  cfd_->Ref();
}
|
|
|
|
|
|
|
|
// Releases the handle's reference on the column family. With mutex_ held, the
// column family data is deleted if Unref() reports it should be, and obsolete
// files are collected; the actual purge runs after the mutex is released.
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
  if (cfd_ == nullptr) {
    // Nothing was referenced by this handle.
    return;
  }

  DBImpl::DeletionState deletion_state;

  mutex_->Lock();
  if (cfd_->Unref()) {
    delete cfd_;
  }
  db_->FindObsoleteFiles(deletion_state, false, true);
  mutex_->Unlock();

  // File purging is done without holding the mutex.
  if (!deletion_state.HaveSomethingToDelete()) {
    return;
  }
  db_->PurgeObsoleteFiles(deletion_state);
}
|
|
|
|
|
2014-02-26 02:30:54 +01:00
|
|
|
// Returns the ID reported by the column family data behind this handle.
uint32_t ColumnFamilyHandleImpl::GetID() const {
  return cfd()->GetID();
}
|
|
|
|
|
2014-02-05 01:31:18 +01:00
|
|
|
namespace {
// Clamps the value behind `ptr` into [minvalue, maxvalue].
// The value is compared after conversion to V. The upper bound is applied
// first and the lower bound second, so if minvalue > maxvalue the result is
// minvalue.
template <class T, class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
  T& value = *ptr;
  if (static_cast<V>(value) > maxvalue) {
    value = maxvalue;
  }
  if (static_cast<V>(value) < minvalue) {
    value = minvalue;
  }
}
}  // anonymous namespace
|
|
|
|
|
|
|
|
ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
|
|
|
|
const InternalFilterPolicy* ipolicy,
|
|
|
|
const ColumnFamilyOptions& src) {
|
|
|
|
ColumnFamilyOptions result = src;
|
|
|
|
result.comparator = icmp;
|
|
|
|
result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
|
2014-04-17 00:15:22 +02:00
|
|
|
#ifdef OS_MACOSX
|
|
|
|
// TODO(icanadi) make write_buffer_size uint64_t instead of size_t
|
|
|
|
ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
|
|
|
|
#else
|
2014-02-05 01:31:18 +01:00
|
|
|
ClipToRange(&result.write_buffer_size,
|
|
|
|
((size_t)64) << 10, ((size_t)64) << 30);
|
2014-04-17 00:15:22 +02:00
|
|
|
#endif
|
2014-02-05 01:31:18 +01:00
|
|
|
// if user sets arena_block_size, we trust user to use this value. Otherwise,
|
|
|
|
// calculate a proper value from writer_buffer_size;
|
|
|
|
if (result.arena_block_size <= 0) {
|
|
|
|
result.arena_block_size = result.write_buffer_size / 10;
|
|
|
|
}
|
|
|
|
result.min_write_buffer_number_to_merge =
|
|
|
|
std::min(result.min_write_buffer_number_to_merge,
|
|
|
|
result.max_write_buffer_number - 1);
|
|
|
|
if (result.block_cache == nullptr && !result.no_block_cache) {
|
|
|
|
result.block_cache = NewLRUCache(8 << 20);
|
|
|
|
}
|
|
|
|
result.compression_per_level = src.compression_per_level;
|
|
|
|
if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
|
|
|
|
result.block_size_deviation = 0;
|
|
|
|
}
|
|
|
|
if (result.max_mem_compaction_level >= result.num_levels) {
|
|
|
|
result.max_mem_compaction_level = result.num_levels - 1;
|
|
|
|
}
|
|
|
|
if (result.soft_rate_limit > result.hard_rate_limit) {
|
|
|
|
result.soft_rate_limit = result.hard_rate_limit;
|
|
|
|
}
|
2014-06-17 01:26:46 +02:00
|
|
|
if (result.max_write_buffer_number < 2) {
|
|
|
|
result.max_write_buffer_number = 2;
|
|
|
|
}
|
2014-03-11 01:25:10 +01:00
|
|
|
if (!result.prefix_extractor) {
|
|
|
|
assert(result.memtable_factory);
|
|
|
|
Slice name = result.memtable_factory->Name();
|
|
|
|
if (name.compare("HashSkipListRepFactory") == 0 ||
|
|
|
|
name.compare("HashLinkListRepFactory") == 0) {
|
2014-02-05 01:31:18 +01:00
|
|
|
result.memtable_factory = std::make_shared<SkipListFactory>();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// -- Sanitize the table properties collector
|
|
|
|
// All user defined properties collectors will be wrapped by
|
|
|
|
// UserKeyTablePropertiesCollector since for them they only have the
|
|
|
|
// knowledge of the user keys; internal keys are invisible to them.
|
TablePropertiesCollectorFactory
Summary:
This diff addresses task #4296714 and rethinks how users provide us with TablePropertiesCollectors as part of Options.
Here's description of task #4296714:
I'm debugging #4295529 and noticed that our count of user properties kDeletedKeys is wrong. We're sharing one single InternalKeyPropertiesCollector with all Table Builders. In LOG Files, we're outputting number of kDeletedKeys as connected with a single table, while it's actually the total count of deleted keys since creation of the DB.
For example, this table has 3155 entries and 1391828 deleted keys.
The problem with current approach that we call methods on a single TablePropertiesCollector for all the tables we create. Even worse, we could do it from multiple threads at the same time and TablePropertiesCollector has no way of knowing which table we're calling it for.
Good part: Looks like nobody inside Facebook is using Options::table_properties_collectors. This means we should be able to painfully change the API.
In this change, I introduce TablePropertiesCollectorFactory. For every table we create, we call `CreateTablePropertiesCollector`, which creates a TablePropertiesCollector for a single table. We then use it sequentially from a single thread, which means it doesn't have to be thread-safe.
Test Plan:
Added a test in table_properties_collector_test that fails on master (build two tables, assert that kDeletedKeys count is correct for the second one).
Also, all other tests
Reviewers: sdong, dhruba, haobo, kailiu
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18579
2014-05-13 21:30:55 +02:00
|
|
|
auto& collector_factories = result.table_properties_collector_factories;
|
|
|
|
for (size_t i = 0; i < result.table_properties_collector_factories.size();
|
|
|
|
++i) {
|
|
|
|
assert(collector_factories[i]);
|
|
|
|
collector_factories[i] =
|
|
|
|
std::make_shared<UserKeyTablePropertiesCollectorFactory>(
|
|
|
|
collector_factories[i]);
|
2014-02-05 01:31:18 +01:00
|
|
|
}
|
|
|
|
// Add collector to collect internal key statistics
|
TablePropertiesCollectorFactory
Summary:
This diff addresses task #4296714 and rethinks how users provide us with TablePropertiesCollectors as part of Options.
Here's description of task #4296714:
I'm debugging #4295529 and noticed that our count of user properties kDeletedKeys is wrong. We're sharing one single InternalKeyPropertiesCollector with all Table Builders. In LOG Files, we're outputting number of kDeletedKeys as connected with a single table, while it's actually the total count of deleted keys since creation of the DB.
For example, this table has 3155 entries and 1391828 deleted keys.
The problem with current approach that we call methods on a single TablePropertiesCollector for all the tables we create. Even worse, we could do it from multiple threads at the same time and TablePropertiesCollector has no way of knowing which table we're calling it for.
Good part: Looks like nobody inside Facebook is using Options::table_properties_collectors. This means we should be able to painfully change the API.
In this change, I introduce TablePropertiesCollectorFactory. For every table we create, we call `CreateTablePropertiesCollector`, which creates a TablePropertiesCollector for a single table. We then use it sequentially from a single thread, which means it doesn't have to be thread-safe.
Test Plan:
Added a test in table_properties_collector_test that fails on master (build two tables, assert that kDeletedKeys count is correct for the second one).
Also, all other tests
Reviewers: sdong, dhruba, haobo, kailiu
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18579
2014-05-13 21:30:55 +02:00
|
|
|
collector_factories.push_back(
|
|
|
|
std::make_shared<InternalKeyPropertiesCollectorFactory>());
|
2014-02-05 01:31:18 +01:00
|
|
|
|
2014-05-21 20:43:35 +02:00
|
|
|
if (result.compaction_style == kCompactionStyleFIFO) {
|
|
|
|
result.num_levels = 1;
|
|
|
|
// since we delete level0 files in FIFO compaction when there are too many
|
|
|
|
// of them, these options don't really mean anything
|
|
|
|
result.level0_file_num_compaction_trigger = std::numeric_limits<int>::max();
|
|
|
|
result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
|
|
|
|
result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
|
|
|
|
}
|
|
|
|
|
2014-02-05 01:31:18 +01:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2014-03-08 01:59:47 +01:00
|
|
|
// Storage whose address provides a unique, non-null sentinel pointer.
int SuperVersion::dummy = 0;
// Sentinel: points at `dummy`, so it is never null.
void* const SuperVersion::kSVInUse = &dummy;
// Sentinel: the null pointer.
void* const SuperVersion::kSVObsolete = nullptr;
|
2014-02-05 01:31:18 +01:00
|
|
|
|
2014-01-24 23:30:28 +01:00
|
|
|
// Deletes every memtable that was queued in `to_delete`.
SuperVersion::~SuperVersion() {
  for (auto it = to_delete.begin(); it != to_delete.end(); ++it) {
    delete *it;
  }
}
|
|
|
|
|
|
|
|
// Adds one reference and returns `this` so the call can be chained.
SuperVersion* SuperVersion::Ref() {
  // The increment itself is performed with relaxed ordering.
  refs.fetch_add(1, std::memory_order_relaxed);
  return this;
}
|
|
|
|
|
|
|
|
bool SuperVersion::Unref() {
|
|
|
|
// fetch_sub returns the previous value of ref
|
2014-02-04 00:28:03 +01:00
|
|
|
uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
|
|
|
|
assert(previous_refs > 0);
|
|
|
|
return previous_refs == 1;
|
2014-01-24 23:30:28 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void SuperVersion::Cleanup() {
|
|
|
|
assert(refs.load(std::memory_order_relaxed) == 0);
|
|
|
|
imm->Unref(&to_delete);
|
|
|
|
MemTable* m = mem->Unref();
|
|
|
|
if (m != nullptr) {
|
|
|
|
to_delete.push_back(m);
|
|
|
|
}
|
|
|
|
current->Unref();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Points this SuperVersion at the given memtable, immutable memtable list and
// version, takes a reference on each of them, and sets the SuperVersion's own
// reference count to 1.
void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
                        Version* new_current) {
  mem = new_mem;
  mem->Ref();

  imm = new_imm;
  imm->Ref();

  current = new_current;
  current->Ref();

  refs.store(1, std::memory_order_relaxed);
}
|
|
|
|
|
2014-03-04 02:54:04 +01:00
|
|
|
namespace {
|
|
|
|
void SuperVersionUnrefHandle(void* ptr) {
|
2014-03-08 01:59:47 +01:00
|
|
|
// UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
|
|
|
|
// destroyed. When former happens, the thread shouldn't see kSVInUse.
|
|
|
|
// When latter happens, we are in ~ColumnFamilyData(), no get should happen as
|
|
|
|
// well.
|
2014-03-04 02:54:04 +01:00
|
|
|
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
|
|
|
|
if (sv->Unref()) {
|
|
|
|
sv->db_mutex->Lock();
|
|
|
|
sv->Cleanup();
|
|
|
|
sv->db_mutex->Unlock();
|
|
|
|
delete sv;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
2014-02-05 18:07:55 +01:00
|
|
|
// Constructs the per-column-family state. The supplied options are sanitized
// before being stored. The new object starts with one reference (taken by the
// Ref() call below).
//
// A null `dummy_versions` marks a dummy column family: for it no internal
// stats, table cache or compaction picker are created and nothing is logged.
ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
                                   const std::string& name,
                                   Version* dummy_versions, Cache* table_cache,
                                   const ColumnFamilyOptions& options,
                                   const DBOptions* db_options,
                                   const EnvOptions& storage_options,
                                   ColumnFamilySet* column_family_set)
    : id_(id),
      name_(name),
      dummy_versions_(dummy_versions),
      current_(nullptr),
      refs_(0),
      dropped_(false),
      internal_comparator_(options.comparator),
      internal_filter_policy_(options.filter_policy),
      options_(*db_options, SanitizeOptions(&internal_comparator_,
                                            &internal_filter_policy_, options)),
      mem_(nullptr),
      imm_(options_.min_write_buffer_number_to_merge),
      super_version_(nullptr),
      super_version_number_(0),
      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
      next_(nullptr),
      prev_(nullptr),
      log_number_(0),
      need_slowdown_for_num_level0_files_(false),
      column_family_set_(column_family_set) {
  Ref();

  // if dummy_versions is nullptr, then this is a dummy column family.
  if (dummy_versions == nullptr) {
    return;
  }

  internal_stats_.reset(new InternalStats(
      options_.num_levels, db_options->env, db_options->statistics.get()));
  table_cache_.reset(
      new TableCache(dbname, &options_, storage_options, table_cache));

  // Select the compaction picker that matches the configured style.
  switch (options_.compaction_style) {
    case kCompactionStyleUniversal:
      compaction_picker_.reset(
          new UniversalCompactionPicker(&options_, &internal_comparator_));
      break;
    case kCompactionStyleLevel:
      compaction_picker_.reset(
          new LevelCompactionPicker(&options_, &internal_comparator_));
      break;
    default:
      // Anything that is neither universal nor level is expected to be FIFO.
      assert(options_.compaction_style == kCompactionStyleFIFO);
      compaction_picker_.reset(
          new FIFOCompactionPicker(&options_, &internal_comparator_));
      break;
  }

  Log(options_.info_log, "Options for column family \"%s\":\n",
      name.c_str());
  // Dump through the ColumnFamilyOptions view of options_.
  const ColumnFamilyOptions& cf_options = options_;
  cf_options.Dump(options_.info_log.get());
}
|
2014-01-22 20:44:53 +01:00
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
// DB mutex held
|
2014-01-22 20:44:53 +01:00
|
|
|
ColumnFamilyData::~ColumnFamilyData() {
|
2014-02-11 02:04:44 +01:00
|
|
|
assert(refs_ == 0);
|
|
|
|
// remove from linked list
|
|
|
|
auto prev = prev_;
|
|
|
|
auto next = next_;
|
|
|
|
prev->next_ = next;
|
|
|
|
next->prev_ = prev;
|
|
|
|
|
|
|
|
// it's nullptr for dummy CFD
|
|
|
|
if (column_family_set_ != nullptr) {
|
|
|
|
// remove from column_family_set
|
2014-03-11 22:52:17 +01:00
|
|
|
column_family_set_->RemoveColumnFamily(this);
|
2014-02-11 02:04:44 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (current_ != nullptr) {
|
|
|
|
current_->Unref();
|
|
|
|
}
|
|
|
|
|
2014-03-11 22:52:17 +01:00
|
|
|
if (super_version_ != nullptr) {
|
|
|
|
// Release SuperVersion reference kept in ThreadLocalPtr.
|
|
|
|
// This must be done outside of mutex_ since unref handler can lock mutex.
|
|
|
|
super_version_->db_mutex->Unlock();
|
|
|
|
local_sv_.reset();
|
|
|
|
super_version_->db_mutex->Lock();
|
|
|
|
|
|
|
|
bool is_last_reference __attribute__((unused));
|
|
|
|
is_last_reference = super_version_->Unref();
|
|
|
|
assert(is_last_reference);
|
|
|
|
super_version_->Cleanup();
|
|
|
|
delete super_version_;
|
|
|
|
super_version_ = nullptr;
|
|
|
|
}
|
2014-03-04 18:03:56 +01:00
|
|
|
|
2014-01-31 01:49:46 +01:00
|
|
|
if (dummy_versions_ != nullptr) {
|
|
|
|
// List must be empty
|
|
|
|
assert(dummy_versions_->next_ == dummy_versions_);
|
|
|
|
delete dummy_versions_;
|
|
|
|
}
|
2014-01-24 23:30:28 +01:00
|
|
|
|
2014-01-29 22:28:50 +01:00
|
|
|
if (mem_ != nullptr) {
|
|
|
|
delete mem_->Unref();
|
2014-01-24 23:30:28 +01:00
|
|
|
}
|
2014-02-07 00:42:16 +01:00
|
|
|
autovector<MemTable*> to_delete;
|
2014-01-29 22:28:50 +01:00
|
|
|
imm_.current()->Unref(&to_delete);
|
2014-01-24 23:30:28 +01:00
|
|
|
for (MemTable* m : to_delete) {
|
|
|
|
delete m;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-14 19:48:01 +02:00
|
|
|
// Returns a pointer to the storage options held by the owning ColumnFamilySet.
const EnvOptions* ColumnFamilyData::soptions() const {
  const EnvOptions& set_wide_options = column_family_set_->storage_options_;
  return &set_wide_options;
}
|
|
|
|
|
2014-01-31 00:23:13 +01:00
|
|
|
// Installs `current` as the current version and recomputes whether the number
// of level-0 files has reached the slowdown trigger. A negative trigger never
// requests a slowdown.
void ColumnFamilyData::SetCurrent(Version* current) {
  current_ = current;

  const auto slowdown_trigger = options_.level0_slowdown_writes_trigger;
  bool slowdown_needed = false;
  if (slowdown_trigger >= 0) {
    slowdown_needed = current_->NumLevelFiles(0) >= slowdown_trigger;
  }
  need_slowdown_for_num_level0_files_ = slowdown_needed;
}
|
|
|
|
|
2014-01-24 23:30:28 +01:00
|
|
|
void ColumnFamilyData::CreateNewMemtable() {
|
2014-01-29 22:28:50 +01:00
|
|
|
assert(current_ != nullptr);
|
|
|
|
if (mem_ != nullptr) {
|
|
|
|
delete mem_->Unref();
|
2014-01-24 23:30:28 +01:00
|
|
|
}
|
2014-02-05 01:31:18 +01:00
|
|
|
mem_ = new MemTable(internal_comparator_, options_);
|
2014-01-29 22:28:50 +01:00
|
|
|
mem_->Ref();
|
|
|
|
}
|
|
|
|
|
2014-03-06 01:55:51 +01:00
|
|
|
// Asks this column family's compaction picker for a compaction on the current
// version and returns whatever it yields.
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
  Compaction* const picked =
      compaction_picker_->PickCompaction(current_, log_buffer);
  return picked;
}
|
|
|
|
|
|
|
|
// Forwards a manual compaction request for [begin, end] from `input_level`
// to `output_level` to the compaction picker, operating on the current
// version. `compaction_end` is passed through to the picker unchanged.
Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
                                           const InternalKey* begin,
                                           const InternalKey* end,
                                           InternalKey** compaction_end) {
  Compaction* const requested = compaction_picker_->CompactRange(
      current_, input_level, output_level, begin, end, compaction_end);
  return requested;
}
|
|
|
|
|
2014-04-14 18:34:59 +02:00
|
|
|
// Returns the SuperVersion with one extra reference taken on behalf of the
// caller. When thread-local caching is allowed, the thread-local copy is used
// and then handed back to thread-local storage; otherwise the reference is
// taken under `db_mutex`.
SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
    port::Mutex* db_mutex) {
  if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
    SuperVersion* cached = GetThreadLocalSuperVersion(db_mutex);
    // Reference for the caller.
    cached->Ref();
    if (!ReturnThreadLocalSuperVersion(cached)) {
      // The thread-local slot did not take the SuperVersion back, so drop the
      // reference that belonged to the slot.
      cached->Unref();
    }
    return cached;
  }

  db_mutex->Lock();
  SuperVersion* shared = super_version_->Ref();
  db_mutex->Unlock();
  return shared;
}
|
|
|
|
|
|
|
|
// Takes the SuperVersion out of this thread's local slot (leaving kSVInUse
// there) and returns it. If the slot was empty/obsolete or held a stale
// version, a fresh reference to the installed SuperVersion is obtained under
// `db_mutex` instead.
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
    port::Mutex* db_mutex) {
  // The SuperVersion is cached in thread local storage to avoid acquiring
  // mutex when SuperVersion does not change since the last use. When a new
  // SuperVersion is installed, the compaction or flush thread cleans up
  // cached SuperVersion in all existing thread local storage. To avoid
  // acquiring mutex for this operation, we use atomic Swap() on the thread
  // local pointer to guarantee exclusive access. If the thread local pointer
  // is being used while a new SuperVersion is installed, the cached
  // SuperVersion can become stale. In that case, the background thread would
  // have swapped in kSVObsolete. We re-check the value when returning the
  // SuperVersion back to thread local, with an atomic compare and swap.
  // The superversion will need to be released if detected to be stale.
  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
  // Invariant:
  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
  // (if no Scrape happens).
  assert(ptr != SuperVersion::kSVInUse);

  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
  const bool cache_is_current =
      sv != SuperVersion::kSVObsolete &&
      sv->version_number == super_version_number_.load();

  if (!cache_is_current) {
    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
    SuperVersion* sv_to_delete = nullptr;

    // Drop the stale cached reference, if there was one.
    const bool dropped_last_ref = sv && sv->Unref();
    if (dropped_last_ref) {
      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
    }

    db_mutex->Lock();
    if (dropped_last_ref) {
      // NOTE: underlying resources held by superversion (sst files) might
      // not be released until the next background job.
      sv->Cleanup();
      sv_to_delete = sv;
    }
    sv = super_version_->Ref();
    db_mutex->Unlock();

    // The stale SuperVersion is deleted after the mutex is released.
    delete sv_to_delete;
  }

  assert(sv != nullptr);
  return sv;
}
|
|
|
|
|
|
|
|
// Tries to put `sv` back into this thread's local slot. Returns true when the
// slot still held kSVInUse and now holds `sv`; returns false when the slot
// was changed in the meantime, in which case the caller keeps ownership of
// its reference.
bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
  assert(sv != nullptr);

  // Put the SuperVersion back
  void* expected = SuperVersion::kSVInUse;
  const bool put_back =
      local_sv_->CompareAndSwap(static_cast<void*>(sv), expected);

  // Success: we saw kSVInUse in the ThreadLocal, so ThreadLocal storage has
  // not been altered and no Scrape has happened. The SuperVersion is still
  // current.
  //
  // Failure: a ThreadLocal scrape happened in the process of this GetImpl
  // call (after thread local Swap() at the beginning and before
  // CompareAndSwap()). This means the SuperVersion it holds is obsolete.
  // NOTE(review): the check below presumes CompareAndSwap writes the observed
  // value into `expected` on failure -- confirm against ThreadLocalPtr.
  assert(put_back || expected == SuperVersion::kSVObsolete);
  return put_back;
}
|
|
|
|
|
2014-01-29 22:28:50 +01:00
|
|
|
// Initializes `new_superversion` from the current memtable, immutable
// memtable list and version, installs it, and bumps the super version number.
// Returns the previous SuperVersion if this call released its last reference
// (the caller is expected to delete it), or nullptr otherwise.
SuperVersion* ColumnFamilyData::InstallSuperVersion(
    SuperVersion* new_superversion, port::Mutex* db_mutex) {
  new_superversion->db_mutex = db_mutex;
  new_superversion->Init(mem_, imm_.current(), current_);

  SuperVersion* const previous = super_version_;
  super_version_ = new_superversion;

  ++super_version_number_;
  super_version_->version_number = super_version_number_;

  // Reset SuperVersions cached in thread local storage
  if (column_family_set_->db_options_->allow_thread_local) {
    ResetThreadLocalSuperVersions();
  }

  if (previous == nullptr || !previous->Unref()) {
    // Either there was no previous SuperVersion or it is still referenced.
    return nullptr;
  }
  previous->Cleanup();
  return previous;  // will let caller delete outside of mutex
}
|
|
|
|
|
2014-03-04 02:54:04 +01:00
|
|
|
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
|
|
|
|
autovector<void*> sv_ptrs;
|
2014-03-08 01:59:47 +01:00
|
|
|
local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
|
2014-03-04 02:54:04 +01:00
|
|
|
for (auto ptr : sv_ptrs) {
|
|
|
|
assert(ptr);
|
2014-03-08 01:59:47 +01:00
|
|
|
if (ptr == SuperVersion::kSVInUse) {
|
|
|
|
continue;
|
|
|
|
}
|
2014-03-04 02:54:04 +01:00
|
|
|
auto sv = static_cast<SuperVersion*>(ptr);
|
|
|
|
if (sv->Unref()) {
|
|
|
|
sv->Cleanup();
|
|
|
|
delete sv;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
2014-02-05 18:07:55 +01:00
|
|
|
// Creates an empty column family set. A dummy ColumnFamilyData (ID 0, empty
// name, no versions, no table cache, default options) is allocated to act as
// the head of the circular doubly-linked list of column families.
//
// NOTE(review): `storage_options_` is handed to the dummy ColumnFamilyData
// ahead of its own entry in this initializer list. Presumably harmless since
// the dummy is built with null dummy_versions -- confirm against the member
// declaration order and the ColumnFamilyData constructor.
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
                                 const DBOptions* db_options,
                                 const EnvOptions& storage_options,
                                 Cache* table_cache)
    : max_column_family_(0),
      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
                                      ColumnFamilyOptions(), db_options,
                                      storage_options_, nullptr)),
      default_cfd_cache_(nullptr),
      db_name_(dbname),
      db_options_(db_options),
      storage_options_(storage_options),
      table_cache_(table_cache),
      spin_lock_(ATOMIC_FLAG_INIT) {
  // initialize linked list: an empty list is the dummy node linked to itself
  dummy_cfd_->next_ = dummy_cfd_;
  dummy_cfd_->prev_ = dummy_cfd_;
}
|
2014-01-22 20:44:53 +01:00
|
|
|
|
|
|
|
// Releases and deletes every remaining column family, then the dummy list
// head.
ColumnFamilySet::~ColumnFamilySet() {
  // The loop terminates because each ColumnFamilyData destructor removes its
  // own entry from column_family_data_.
  while (!column_family_data_.empty()) {
    // cfd destructor will delete itself from column_family_data_
    auto first_entry = column_family_data_.begin();
    auto cfd = first_entry->second;
    cfd->Unref();
    delete cfd;
  }

  dummy_cfd_->Unref();
  delete dummy_cfd_;
}
|
|
|
|
|
|
|
|
// Returns the cached pointer to the default column family. The cache must
// already have been populated.
ColumnFamilyData* ColumnFamilySet::GetDefault() const {
  ColumnFamilyData* const default_cfd = default_cfd_cache_;
  assert(default_cfd != nullptr);
  return default_cfd;
}
|
|
|
|
|
|
|
|
// Looks up a column family by ID. Returns nullptr when the ID is unknown.
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
  const auto entry = column_family_data_.find(id);
  if (entry == column_family_data_.end()) {
    return nullptr;
  }
  return entry->second;
}
|
|
|
|
|
2014-02-28 23:05:11 +01:00
|
|
|
// Looks up a column family by name. Returns nullptr when the name is unknown.
// A known name is expected to always resolve to column family data via its ID.
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
    const {
  const auto name_entry = column_families_.find(name);
  if (name_entry == column_families_.end()) {
    return nullptr;
  }

  // Resolve the name's ID through the by-ID lookup.
  auto cfd = GetColumnFamily(name_entry->second);
  assert(cfd != nullptr);
  return cfd;
}
|
|
|
|
|
|
|
|
uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
|
|
|
|
return ++max_column_family_;
|
|
|
|
}
|
|
|
|
|
2014-03-05 21:13:44 +01:00
|
|
|
// Returns the largest column family ID recorded so far.
uint32_t ColumnFamilySet::GetMaxColumnFamily() {
  return max_column_family_;
}
|
|
|
|
|
|
|
|
// Raises the recorded maximum column family ID to `new_max_column_family` if
// that is larger; the recorded maximum never decreases here.
void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
  if (max_column_family_ < new_max_column_family) {
    max_column_family_ = new_max_column_family;
  }
}
|
|
|
|
|
2014-06-03 00:33:54 +02:00
|
|
|
size_t ColumnFamilySet::NumberOfColumnFamilies() const {
|
|
|
|
return column_families_.size();
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
// under a DB mutex
|
2014-01-22 20:44:53 +01:00
|
|
|
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
|
|
|
|
const std::string& name, uint32_t id, Version* dummy_versions,
|
|
|
|
const ColumnFamilyOptions& options) {
|
|
|
|
assert(column_families_.find(name) == column_families_.end());
|
|
|
|
ColumnFamilyData* new_cfd =
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
2014-02-05 18:07:55 +01:00
|
|
|
new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
|
2014-02-11 02:04:44 +01:00
|
|
|
options, db_options_, storage_options_, this);
|
|
|
|
Lock();
|
|
|
|
column_families_.insert({name, id});
|
2014-01-22 20:44:53 +01:00
|
|
|
column_family_data_.insert({id, new_cfd});
|
2014-02-11 02:04:44 +01:00
|
|
|
Unlock();
|
2014-01-22 20:44:53 +01:00
|
|
|
max_column_family_ = std::max(max_column_family_, id);
|
2014-01-31 01:49:46 +01:00
|
|
|
// add to linked list
|
2014-02-11 02:04:44 +01:00
|
|
|
new_cfd->next_ = dummy_cfd_;
|
|
|
|
auto prev = dummy_cfd_->prev_;
|
|
|
|
new_cfd->prev_ = prev;
|
|
|
|
prev->next_ = new_cfd;
|
|
|
|
dummy_cfd_->prev_ = new_cfd;
|
2014-03-11 22:52:17 +01:00
|
|
|
if (id == 0) {
|
|
|
|
default_cfd_cache_ = new_cfd;
|
|
|
|
}
|
2014-01-22 20:44:53 +01:00
|
|
|
return new_cfd;
|
|
|
|
}
|
|
|
|
|
2014-03-11 22:52:17 +01:00
|
|
|
void ColumnFamilySet::Lock() {
|
|
|
|
// spin lock
|
|
|
|
while (spin_lock_.test_and_set(std::memory_order_acquire)) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Releases the spin lock taken by Lock() by clearing the flag.
void ColumnFamilySet::Unlock() {
  spin_lock_.clear(std::memory_order_release);
}
|
|
|
|
|
2014-04-07 23:21:25 +02:00
|
|
|
// REQUIRES: DB mutex held
|
|
|
|
void ColumnFamilySet::FreeDeadColumnFamilies() {
|
|
|
|
autovector<ColumnFamilyData*> to_delete;
|
|
|
|
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
|
|
|
|
if (cfd->refs_ == 0) {
|
|
|
|
to_delete.push_back(cfd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (auto cfd : to_delete) {
|
|
|
|
// this is very rare, so it's not a problem that we do it under a mutex
|
|
|
|
delete cfd;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
// under a DB mutex
|
2014-03-11 22:52:17 +01:00
|
|
|
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
|
2014-02-11 02:04:44 +01:00
|
|
|
auto cfd_iter = column_family_data_.find(cfd->GetID());
|
2014-01-31 01:49:46 +01:00
|
|
|
assert(cfd_iter != column_family_data_.end());
|
2014-02-11 02:04:44 +01:00
|
|
|
Lock();
|
2014-01-31 01:49:46 +01:00
|
|
|
column_family_data_.erase(cfd_iter);
|
2014-02-11 02:04:44 +01:00
|
|
|
column_families_.erase(cfd->GetName());
|
|
|
|
Unlock();
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
|
|
|
|
2014-02-06 01:02:48 +01:00
|
|
|
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
|
2014-03-11 22:52:17 +01:00
|
|
|
if (column_family_id == 0) {
|
|
|
|
// optimization for common case
|
|
|
|
current_ = column_family_set_->GetDefault();
|
|
|
|
} else {
|
|
|
|
// maybe outside of db mutex, should lock
|
|
|
|
column_family_set_->Lock();
|
|
|
|
current_ = column_family_set_->GetColumnFamily(column_family_id);
|
|
|
|
column_family_set_->Unlock();
|
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
handle_.SetCFD(current_);
|
2014-02-06 01:02:48 +01:00
|
|
|
return current_ != nullptr;
|
|
|
|
}
|
2014-01-28 20:05:04 +01:00
|
|
|
|
2014-02-06 01:02:48 +01:00
|
|
|
// Returns the log number of the current column family.
// A successful Seek() must have set current_ first (asserted).
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
  assert(current_ != nullptr);
  const uint64_t log_number = current_->GetLogNumber();
  return log_number;
}
|
|
|
|
|
|
|
|
// Returns the memtable of the current column family, as reported by mem().
// A successful Seek() must have set current_ first (asserted).
MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
  assert(current_ != nullptr);
  MemTable* memtable = current_->mem();
  return memtable;
}
|
|
|
|
|
2014-03-11 22:52:17 +01:00
|
|
|
// Returns the options of the current column family.
// A successful Seek() must have set current_ first (asserted).
const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
  assert(current_ != nullptr);
  const Options* current_options = current_->options();
  return current_options;
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
// Returns a pointer to the member handle_, which Seek() points at the
// current column family. The handle is owned by this object.
// A successful Seek() must have set current_ first (asserted).
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
  assert(current_ != nullptr);
  ColumnFamilyHandle* handle = &handle_;
  return handle;
}
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
} // namespace rocksdb
|