Merge branch 'master' of github.com:facebook/rocksdb into HEAD
This commit is contained in:
commit
c65448f95a
6
.gitignore
vendored
6
.gitignore
vendored
@ -13,6 +13,10 @@ build_config.mk
|
||||
*_bench
|
||||
*_stress
|
||||
*.out
|
||||
*.class
|
||||
*.jar
|
||||
*.*jnilib*
|
||||
*.d-e
|
||||
|
||||
ldb
|
||||
manifest_dump
|
||||
@ -23,3 +27,5 @@ coverage/COVERAGE_REPORT
|
||||
.gdbhistory
|
||||
.phutil_module_cache
|
||||
tags
|
||||
java/*.log
|
||||
java/include/org_rocksdb_*.h
|
||||
|
13
HISTORY.md
13
HISTORY.md
@ -1,11 +1,15 @@
|
||||
# Rocksdb Change Log
|
||||
|
||||
## Unreleased
|
||||
## Unreleased (will be released in 3.0)
|
||||
* Column family support
|
||||
|
||||
### Public API changes
|
||||
|
||||
## 2.8.0 (04/04/2014)
|
||||
|
||||
* Removed arena.h from public header files.
|
||||
* By default, checksums are verified on every read from database
|
||||
* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
|
||||
* Added is_manual_compaction to CompactionFilter::Context
|
||||
* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op.
|
||||
* Removed BackupEngine::DeleteBackupsNewerThan() function
|
||||
@ -15,11 +19,18 @@
|
||||
* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
|
||||
* Added a command "checkconsistency" in ldb tool, which checks
|
||||
if file system state matches DB state (file existence and file sizes)
|
||||
* Separate options related to block based table to a new struct BlockBasedTableOptions
|
||||
* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy
|
||||
* Add more counters to perf context.
|
||||
* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
|
||||
|
||||
### New Features
|
||||
* If we find one truncated record at the end of the MANIFEST or WAL files,
|
||||
we will ignore it. We assume that writers of these records were interrupted
|
||||
and that we can safely ignore it.
|
||||
* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
|
||||
* A new mem table implementation, hash linked list, optimized for the case where there are only a few keys for each prefix. It can be created through NewHashLinkListRepFactory().
|
||||
* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands.
|
||||
* Compaction filter now has a V2 interface. It buffers the kv-pairs sharing the same key prefix, processes them in batches, and returns the batched results back to the DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1.
|
||||
* Geo-spatial support for locations and radial-search.
|
||||
|
||||
|
@ -67,6 +67,9 @@ libraries. You are on your own.
|
||||
* Please note that some of the optimizations/features are disabled in OSX.
|
||||
We did not run any production workloads on it.
|
||||
|
||||
* **iOS**:
|
||||
* Run: `TARGET_OS=IOS make static_lib`
|
||||
|
||||
## Compilation
|
||||
`make clean; make` will compile librocksdb.a (RocksDB static library) and all
|
||||
the unit tests. You can run all unit tests with `make check`.
|
||||
|
36
Makefile
36
Makefile
@ -23,6 +23,14 @@ $(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platf
|
||||
# this file is generated by the previous line to set build flags and sources
|
||||
include build_config.mk
|
||||
|
||||
ifneq ($(PLATFORM), IOS)
|
||||
CFLAGS += -g
|
||||
CXXFLAGS += -g
|
||||
else
|
||||
# no debug info for IOS, that will make our library big
|
||||
OPT += -DNDEBUG
|
||||
endif
|
||||
|
||||
# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
|
||||
ifdef COMPILE_WITH_ASAN
|
||||
# ASAN compile flags
|
||||
@ -37,8 +45,8 @@ else
|
||||
endif
|
||||
|
||||
WARNING_FLAGS = -Wall -Werror -Wno-sign-compare
|
||||
CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
|
||||
CXXFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
|
||||
CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
|
||||
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
|
||||
|
||||
LDFLAGS += $(PLATFORM_LDFLAGS)
|
||||
|
||||
@ -57,6 +65,7 @@ TESTS = \
|
||||
db_test \
|
||||
block_hash_index_test \
|
||||
autovector_test \
|
||||
column_family_test \
|
||||
table_properties_collector_test \
|
||||
arena_test \
|
||||
auto_roll_logger_test \
|
||||
@ -148,11 +157,15 @@ $(SHARED3):
|
||||
endif # PLATFORM_SHARED_EXT
|
||||
|
||||
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
|
||||
release tags valgrind_check whitebox_crash_test format shared_lib all \
|
||||
release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
|
||||
dbg
|
||||
|
||||
all: $(LIBRARY) $(PROGRAMS)
|
||||
|
||||
static_lib: $(LIBRARY)
|
||||
|
||||
shared_lib: $(SHARED)
|
||||
|
||||
dbg: $(LIBRARY) $(PROGRAMS)
|
||||
|
||||
# Will also generate shared libraries.
|
||||
@ -218,8 +231,6 @@ tags:
|
||||
format:
|
||||
build_tools/format-diff.sh
|
||||
|
||||
shared_lib: $(SHARED)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unit tests and tools
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -260,6 +271,9 @@ arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
|
||||
|
||||
@ -404,7 +418,7 @@ ldb: tools/ldb.o $(LIBOBJECTS)
|
||||
# ---------------------------------------------------------------------------
|
||||
JNI_NATIVE_SOURCES = ./java/rocksjni/rocksjni.cc ./java/rocksjni/options.cc ./java/rocksjni/write_batch.cc
|
||||
|
||||
JAVA_INCLUDE = -I/usr/lib/jvm/java-openjdk/include/ -I/usr/lib/jvm/java-openjdk/include/linux
|
||||
JAVA_INCLUDE = -I/usr/lib/jvm/java-openjdk/include/ -I/usr/lib/jvm/java-openjdk/include/linux
|
||||
ROCKSDBJNILIB = ./java/librocksdbjni.so
|
||||
|
||||
ifeq ($(PLATFORM), OS_MACOSX)
|
||||
@ -435,20 +449,20 @@ ifeq ($(PLATFORM), IOS)
|
||||
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
|
||||
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
|
||||
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
|
||||
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString)
|
||||
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
|
||||
|
||||
.cc.o:
|
||||
mkdir -p ios-x86/$(dir $@)
|
||||
$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS)
|
||||
$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
|
||||
mkdir -p ios-arm/$(dir $@)
|
||||
$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS)
|
||||
xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
|
||||
lipo ios-x86/$@ ios-arm/$@ -create -output $@
|
||||
|
||||
.c.o:
|
||||
mkdir -p ios-x86/$(dir $@)
|
||||
$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
|
||||
$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
|
||||
mkdir -p ios-arm/$(dir $@)
|
||||
$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
|
||||
xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
|
||||
lipo ios-x86/$@ ios-arm/$@ -create -output $@
|
||||
|
||||
else
|
||||
|
@ -87,7 +87,7 @@ PLATFORM_SHARED_CFLAGS="-fPIC"
|
||||
PLATFORM_SHARED_VERSIONED=false
|
||||
|
||||
# generic port files (working on all platforms via #ifdef) go directly in /port
|
||||
GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "`
|
||||
GENERIC_PORT_FILES=`cd $ROCKSDB_ROOT; find port -name '*.cc' | tr "\n" " "`
|
||||
|
||||
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
|
||||
case "$TARGET_OS" in
|
||||
@ -98,6 +98,13 @@ case "$TARGET_OS" in
|
||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
|
||||
# PORT_FILES=port/darwin/darwin_specific.cc
|
||||
;;
|
||||
IOS)
|
||||
PLATFORM=IOS
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE"
|
||||
PLATFORM_SHARED_EXT=dylib
|
||||
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
|
||||
CROSS_COMPILE=true
|
||||
;;
|
||||
Linux)
|
||||
PLATFORM=OS_LINUX
|
||||
COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
|
||||
|
82
db/c.cc
82
db/c.cc
@ -25,12 +25,14 @@
|
||||
#include "rocksdb/universal_compaction.h"
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/table.h"
|
||||
|
||||
using rocksdb::Cache;
|
||||
using rocksdb::Comparator;
|
||||
using rocksdb::CompressionType;
|
||||
using rocksdb::DB;
|
||||
using rocksdb::Env;
|
||||
using rocksdb::InfoLogLevel;
|
||||
using rocksdb::FileLock;
|
||||
using rocksdb::FilterPolicy;
|
||||
using rocksdb::FlushOptions;
|
||||
@ -656,6 +658,11 @@ void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
|
||||
}
|
||||
}
|
||||
|
||||
// Sets the info-log verbosity on `opt`. The integer `v` is converted to
// rocksdb::InfoLogLevel as-is; no range check is performed here.
void rocksdb_options_set_info_log_level(rocksdb_options_t* opt, int v) {
  const InfoLogLevel level = static_cast<InfoLogLevel>(v);
  opt->rep.info_log_level = level;
}
|
||||
|
||||
// Stores `s` as the write_buffer_size option of `opt`.
void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
  auto& options = opt->rep;
  options.write_buffer_size = s;
}
|
||||
@ -714,6 +721,14 @@ void rocksdb_options_set_max_grandparent_overlap_factor(
|
||||
opt->rep.max_grandparent_overlap_factor = n;
|
||||
}
|
||||
|
||||
// Replaces max_bytes_for_level_multiplier_additional with a copy of the
// first `num_levels` entries of `level_values`.
void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
    rocksdb_options_t* opt, int* level_values, size_t num_levels) {
  auto& multipliers = opt->rep.max_bytes_for_level_multiplier_additional;
  multipliers.assign(level_values, level_values + num_levels);
}
|
||||
|
||||
// Attaches a statistics object obtained from rocksdb::CreateDBStatistics()
// to `opt`.
void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
  auto& options = opt->rep;
  options.statistics = rocksdb::CreateDBStatistics();
}
|
||||
@ -857,6 +872,24 @@ void rocksdb_options_set_advise_random_on_open(
|
||||
opt->rep.advise_random_on_open = v;
|
||||
}
|
||||
|
||||
// Maps the integer `v` onto the access_hint_on_compaction_start option:
//   0 -> NONE, 1 -> NORMAL, 2 -> SEQUENTIAL, 3 -> WILLNEED.
// Any other value leaves the option untouched.
void rocksdb_options_set_access_hint_on_compaction_start(
    rocksdb_options_t* opt, int v) {
  auto& options = opt->rep;
  if (v == 0) {
    options.access_hint_on_compaction_start = rocksdb::Options::NONE;
  } else if (v == 1) {
    options.access_hint_on_compaction_start = rocksdb::Options::NORMAL;
  } else if (v == 2) {
    options.access_hint_on_compaction_start = rocksdb::Options::SEQUENTIAL;
  } else if (v == 3) {
    options.access_hint_on_compaction_start = rocksdb::Options::WILLNEED;
  }
}
|
||||
|
||||
void rocksdb_options_set_use_adaptive_mutex(
|
||||
rocksdb_options_t* opt, unsigned char v) {
|
||||
opt->rep.use_adaptive_mutex = v;
|
||||
@ -867,6 +900,11 @@ void rocksdb_options_set_bytes_per_sync(
|
||||
opt->rep.bytes_per_sync = v;
|
||||
}
|
||||
|
||||
// Stores `v` into the verify_checksums_in_compaction option of `opt`.
void rocksdb_options_set_verify_checksums_in_compaction(
    rocksdb_options_t* opt, unsigned char v) {
  auto& options = opt->rep;
  options.verify_checksums_in_compaction = v;
}
|
||||
|
||||
void rocksdb_options_set_filter_deletes(
|
||||
rocksdb_options_t* opt, unsigned char v) {
|
||||
opt->rep.filter_deletes = v;
|
||||
@ -1003,11 +1041,48 @@ void rocksdb_options_set_hash_link_list_rep(
|
||||
opt->rep.memtable_factory.reset(factory);
|
||||
}
|
||||
|
||||
// Installs a PlainTable table factory on `opt`.
//
// user_key_len, bloom_bits_per_key, hash_table_ratio and index_sparseness are
// forwarded unchanged to rocksdb::NewPlainTableFactory().
//
// A new factory is created on every call. Caching a single factory in a
// function-local static would make later calls ignore their arguments and
// would hand the same raw pointer to table_factory.reset() for every options
// object, so more than one owner would try to release it.
void rocksdb_options_set_plain_table_factory(
    rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key,
    double hash_table_ratio, size_t index_sparseness) {
  rocksdb::TableFactory* factory = rocksdb::NewPlainTableFactory(
      user_key_len, bloom_bits_per_key,
      hash_table_ratio, index_sparseness);
  opt->rep.table_factory.reset(factory);
}
|
||||
|
||||
// Stores `v` into the max_successive_merges option of `opt`.
void rocksdb_options_set_max_successive_merges(
    rocksdb_options_t* opt, size_t v) {
  auto& options = opt->rep;
  options.max_successive_merges = v;
}
|
||||
|
||||
// Stores `v` into the min_partial_merge_operands option of `opt`.
void rocksdb_options_set_min_partial_merge_operands(
    rocksdb_options_t* opt, uint32_t v) {
  auto& options = opt->rep;
  options.min_partial_merge_operands = v;
}
|
||||
|
||||
// Stores `v` into the bloom_locality option of `opt`.
void rocksdb_options_set_bloom_locality(
    rocksdb_options_t* opt, uint32_t v) {
  auto& options = opt->rep;
  options.bloom_locality = v;
}
|
||||
|
||||
// Stores `v` into the allow_thread_local option of `opt`.
void rocksdb_options_set_allow_thread_local(
    rocksdb_options_t* opt, unsigned char v) {
  auto& options = opt->rep;
  options.allow_thread_local = v;
}
|
||||
|
||||
// Stores `v` into the inplace_update_support option of `opt`.
void rocksdb_options_set_inplace_update_support(
    rocksdb_options_t* opt, unsigned char v) {
  auto& options = opt->rep;
  options.inplace_update_support = v;
}
|
||||
|
||||
// Stores `v` into the inplace_update_num_locks option of `opt`.
void rocksdb_options_set_inplace_update_num_locks(
    rocksdb_options_t* opt, size_t v) {
  auto& options = opt->rep;
  options.inplace_update_num_locks = v;
}
|
||||
|
||||
// Sets the compaction style of `opt`. `style` is converted to
// rocksdb::CompactionStyle as-is; no range check is performed here.
void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
  const rocksdb::CompactionStyle chosen =
      static_cast<rocksdb::CompactionStyle>(style);
  opt->rep.compaction_style = chosen;
}
|
||||
@ -1022,21 +1097,14 @@ DB::OpenForReadOnly
|
||||
DB::MultiGet
|
||||
DB::KeyMayExist
|
||||
DB::GetOptions
|
||||
DB::GetLiveFiles
|
||||
DB::GetSortedWalFiles
|
||||
DB::GetLatestSequenceNumber
|
||||
DB::GetUpdatesSince
|
||||
DB::DeleteFile
|
||||
DB::GetDbIdentity
|
||||
DB::RunManualCompaction
|
||||
custom cache
|
||||
compaction_filter
|
||||
max_bytes_for_level_multiplier_additional
|
||||
access_hint_on_compaction_start
|
||||
table_factory
|
||||
table_properties_collectors
|
||||
inplace_update_support
|
||||
inplace_update_num_locks
|
||||
*/
|
||||
|
||||
rocksdb_comparator_t* rocksdb_comparator_create(
|
||||
|
@ -443,6 +443,7 @@ int main(int argc, char** argv) {
|
||||
rocksdb_options_set_filter_policy(options, policy);
|
||||
rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
|
||||
rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
|
||||
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
|
||||
|
||||
db = rocksdb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
|
489
db/column_family.cc
Normal file
489
db/column_family.cc
Normal file
@ -0,0 +1,489 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/column_family.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/internal_stats.h"
|
||||
#include "db/compaction_picker.h"
|
||||
#include "db/table_properties_collector.h"
|
||||
#include "util/autovector.h"
|
||||
#include "util/hash_skiplist_rep.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Wraps `cfd` in a handle. When `cfd` is non-null the handle takes one
// reference on it; the destructor drops that reference.
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
                                               DBImpl* db, port::Mutex* mutex)
    : cfd_(cfd), db_(db), mutex_(mutex) {
  if (cfd_ == nullptr) {
    return;
  }
  cfd_->Ref();
}
|
||||
|
||||
// Drops the handle's reference on the column family (deleting it when that
// was the last one), then collects and purges files that became obsolete.
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
  if (cfd_ == nullptr) {
    // Nothing was referenced, so there is nothing to release.
    return;
  }

  DBImpl::DeletionState deletion_state;

  // Unref and the obsolete-file scan both run with the mutex held.
  mutex_->Lock();
  if (cfd_->Unref()) {
    delete cfd_;
  }
  db_->FindObsoleteFiles(deletion_state, false, true);
  mutex_->Unlock();

  // The purge itself runs after the mutex has been released.
  if (!deletion_state.HaveSomethingToDelete()) {
    return;
  }
  db_->PurgeObsoleteFiles(deletion_state);
}
|
||||
|
||||
// Returns the ID of the column family behind this handle.
uint32_t ColumnFamilyHandleImpl::GetID() const {
  const uint32_t id = cfd()->GetID();
  return id;
}
|
||||
|
||||
namespace {
// Clamps *ptr into [minvalue, maxvalue]. The upper bound is applied first and
// the lower bound second, so the lower bound wins if the two bounds cross.
// Comparisons are done after converting *ptr to V.
template <class T, class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
  if (maxvalue < static_cast<V>(*ptr)) {
    *ptr = maxvalue;
  }
  if (minvalue > static_cast<V>(*ptr)) {
    *ptr = minvalue;
  }
}
}  // anonymous namespace
|
||||
|
||||
// Returns a copy of `src` with user-supplied values adjusted to be usable:
// the comparator and filter policy are replaced by the internal wrappers
// `icmp` / `ipolicy`, numeric options are clipped, and the table properties
// collectors are wrapped.
ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
                                    const InternalFilterPolicy* ipolicy,
                                    const ColumnFamilyOptions& src) {
  ColumnFamilyOptions result = src;
  result.comparator = icmp;
  // Only install the internal filter policy when the user asked for one.
  if (src.filter_policy != nullptr) {
    result.filter_policy = ipolicy;
  } else {
    result.filter_policy = nullptr;
  }

  // Keep write_buffer_size within [64 << 10, 64 << 30].
  ClipToRange(&result.write_buffer_size, static_cast<size_t>(64) << 10,
              static_cast<size_t>(64) << 30);

  // A user-provided arena_block_size is trusted. Otherwise derive one from
  // write_buffer_size.
  if (result.arena_block_size <= 0) {
    result.arena_block_size = result.write_buffer_size / 10;
  }

  result.min_write_buffer_number_to_merge =
      std::min(result.min_write_buffer_number_to_merge,
               result.max_write_buffer_number - 1);

  // Provide a default 8 << 20 LRU block cache unless caching was disabled.
  if (!result.no_block_cache && result.block_cache == nullptr) {
    result.block_cache = NewLRUCache(8 << 20);
  }
  result.compression_per_level = src.compression_per_level;

  // block_size_deviation outside [0, 100] is reset to 0.
  const bool deviation_out_of_range =
      result.block_size_deviation < 0 || result.block_size_deviation > 100;
  if (deviation_out_of_range) {
    result.block_size_deviation = 0;
  }

  if (result.max_mem_compaction_level >= result.num_levels) {
    result.max_mem_compaction_level = result.num_levels - 1;
  }
  if (result.soft_rate_limit > result.hard_rate_limit) {
    result.soft_rate_limit = result.hard_rate_limit;
  }

  // Without a prefix extractor, the hash-based memtable factories are
  // replaced by the plain skip list factory.
  if (!result.prefix_extractor) {
    assert(result.memtable_factory);
    Slice factory_name = result.memtable_factory->Name();
    const bool is_hash_rep =
        factory_name.compare("HashSkipListRepFactory") == 0 ||
        factory_name.compare("HashLinkListRepFactory") == 0;
    if (is_hash_rep) {
      result.memtable_factory = std::make_shared<SkipListFactory>();
    }
  }

  // -- Sanitize the table properties collectors.
  // User-defined collectors only know about user keys, so each one is wrapped
  // in a UserKeyTablePropertiesCollector.
  auto& collectors = result.table_properties_collectors;
  for (auto& collector : collectors) {
    assert(collector);
    collector = std::make_shared<UserKeyTablePropertiesCollector>(collector);
  }
  // Append the collector that gathers internal key statistics.
  collectors.push_back(std::make_shared<InternalKeyPropertiesCollector>());

  return result;
}
|
||||
|
||||
// Storage whose address serves as the kSVInUse marker value.
int SuperVersion::dummy = 0;

// Marker pointer values; they are compared against thread-local SuperVersion
// slots (see ResetThreadLocalSuperVersions).
void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
void* const SuperVersion::kSVObsolete = nullptr;
|
||||
|
||||
// Frees every memtable that was queued in to_delete.
SuperVersion::~SuperVersion() {
  for (auto* memtable : to_delete) {
    delete memtable;
  }
}
|
||||
|
||||
// Adds one reference and returns this object so calls can be chained.
SuperVersion* SuperVersion::Ref() {
  refs.fetch_add(1, std::memory_order_relaxed);
  SuperVersion* const self = this;
  return self;
}
|
||||
|
||||
bool SuperVersion::Unref() {
|
||||
// fetch_sub returns the previous value of ref
|
||||
uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
|
||||
assert(previous_refs > 0);
|
||||
return previous_refs == 1;
|
||||
}
|
||||
|
||||
void SuperVersion::Cleanup() {
|
||||
assert(refs.load(std::memory_order_relaxed) == 0);
|
||||
imm->Unref(&to_delete);
|
||||
MemTable* m = mem->Unref();
|
||||
if (m != nullptr) {
|
||||
to_delete.push_back(m);
|
||||
}
|
||||
current->Unref();
|
||||
}
|
||||
|
||||
// Points this SuperVersion at the given memtable, immutable memtable list
// version and Version, takes a reference on each, and sets its own reference
// count to 1.
void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
                        Version* new_current) {
  mem = new_mem;
  mem->Ref();

  imm = new_imm;
  imm->Ref();

  current = new_current;
  current->Ref();

  refs.store(1, std::memory_order_relaxed);
}
|
||||
|
||||
namespace {
|
||||
void SuperVersionUnrefHandle(void* ptr) {
|
||||
// UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
|
||||
// destroyed. When former happens, the thread shouldn't see kSVInUse.
|
||||
// When latter happens, we are in ~ColumnFamilyData(), no get should happen as
|
||||
// well.
|
||||
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
|
||||
if (sv->Unref()) {
|
||||
sv->db_mutex->Lock();
|
||||
sv->Cleanup();
|
||||
sv->db_mutex->Unlock();
|
||||
delete sv;
|
||||
}
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
// Builds the state of one column family. `options` is sanitized (see
// SanitizeOptions) and merged with `*db_options` into options_. A null
// `dummy_versions` marks a dummy column family, for which no stats, table
// cache or compaction picker are created.
// NOTE(review): imm_ and InternalStats read the caller's unsanitized
// `options`, not options_ -- confirm this is intended.
ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
                                   const std::string& name,
                                   Version* dummy_versions, Cache* table_cache,
                                   const ColumnFamilyOptions& options,
                                   const DBOptions* db_options,
                                   const EnvOptions& storage_options,
                                   ColumnFamilySet* column_family_set)
    : id_(id),
      name_(name),
      dummy_versions_(dummy_versions),
      current_(nullptr),
      refs_(0),
      dropped_(false),
      internal_comparator_(options.comparator),
      internal_filter_policy_(options.filter_policy),
      options_(*db_options, SanitizeOptions(&internal_comparator_,
                                            &internal_filter_policy_, options)),
      mem_(nullptr),
      imm_(options.min_write_buffer_number_to_merge),
      super_version_(nullptr),
      super_version_number_(0),
      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
      next_(nullptr),
      prev_(nullptr),
      log_number_(0),
      need_slowdown_for_num_level0_files_(false),
      column_family_set_(column_family_set) {
  // Start out holding one reference.
  Ref();

  // A dummy column family (dummy_versions == nullptr) needs nothing further.
  if (dummy_versions == nullptr) {
    return;
  }

  internal_stats_.reset(new InternalStats(options.num_levels, db_options->env,
                                          db_options->statistics.get()));
  table_cache_.reset(
      new TableCache(dbname, &options_, storage_options, table_cache));

  // Pick the compaction picker that matches the configured compaction style.
  const bool universal =
      (options_.compaction_style == kCompactionStyleUniversal);
  if (universal) {
    compaction_picker_.reset(
        new UniversalCompactionPicker(&options_, &internal_comparator_));
  } else {
    compaction_picker_.reset(
        new LevelCompactionPicker(&options_, &internal_comparator_));
  }

  Log(options_.info_log, "Options for column family \"%s\":\n",
      name.c_str());
  // Dump through a ColumnFamilyOptions pointer so that the
  // ColumnFamilyOptions view of options_ is what gets logged.
  const ColumnFamilyOptions* cf_view = &options_;
  cf_view->Dump(options_.info_log.get());
}
|
||||
|
||||
// DB mutex held
|
||||
ColumnFamilyData::~ColumnFamilyData() {
|
||||
assert(refs_ == 0);
|
||||
// remove from linked list
|
||||
auto prev = prev_;
|
||||
auto next = next_;
|
||||
prev->next_ = next;
|
||||
next->prev_ = prev;
|
||||
|
||||
// it's nullptr for dummy CFD
|
||||
if (column_family_set_ != nullptr) {
|
||||
// remove from column_family_set
|
||||
column_family_set_->RemoveColumnFamily(this);
|
||||
}
|
||||
|
||||
if (current_ != nullptr) {
|
||||
current_->Unref();
|
||||
}
|
||||
|
||||
if (super_version_ != nullptr) {
|
||||
// Release SuperVersion reference kept in ThreadLocalPtr.
|
||||
// This must be done outside of mutex_ since unref handler can lock mutex.
|
||||
super_version_->db_mutex->Unlock();
|
||||
local_sv_.reset();
|
||||
super_version_->db_mutex->Lock();
|
||||
|
||||
bool is_last_reference __attribute__((unused));
|
||||
is_last_reference = super_version_->Unref();
|
||||
assert(is_last_reference);
|
||||
super_version_->Cleanup();
|
||||
delete super_version_;
|
||||
super_version_ = nullptr;
|
||||
}
|
||||
|
||||
if (dummy_versions_ != nullptr) {
|
||||
// List must be empty
|
||||
assert(dummy_versions_->next_ == dummy_versions_);
|
||||
delete dummy_versions_;
|
||||
}
|
||||
|
||||
if (mem_ != nullptr) {
|
||||
delete mem_->Unref();
|
||||
}
|
||||
autovector<MemTable*> to_delete;
|
||||
imm_.current()->Unref(&to_delete);
|
||||
for (MemTable* m : to_delete) {
|
||||
delete m;
|
||||
}
|
||||
}
|
||||
|
||||
// Installs `current` as the current Version and recomputes whether writes
// should be slowed down because of the number of level-0 files.
void ColumnFamilyData::SetCurrent(Version* current) {
  current_ = current;

  // A negative trigger disables the slowdown check.
  const auto slowdown_trigger = options_.level0_slowdown_writes_trigger;
  need_slowdown_for_num_level0_files_ =
      slowdown_trigger >= 0 &&
      current_->NumLevelFiles(0) >= slowdown_trigger;
}
|
||||
|
||||
void ColumnFamilyData::CreateNewMemtable() {
|
||||
assert(current_ != nullptr);
|
||||
if (mem_ != nullptr) {
|
||||
delete mem_->Unref();
|
||||
}
|
||||
mem_ = new MemTable(internal_comparator_, options_);
|
||||
mem_->Ref();
|
||||
}
|
||||
|
||||
// Asks the compaction picker for a compaction over the current Version and
// returns whatever it returns.
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
  Compaction* const picked =
      compaction_picker_->PickCompaction(current_, log_buffer);
  return picked;
}
|
||||
|
||||
// Forwards a range compaction request for the current Version to the
// compaction picker and returns its result unchanged.
Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
                                           const InternalKey* begin,
                                           const InternalKey* end,
                                           InternalKey** compaction_end) {
  Compaction* const compaction = compaction_picker_->CompactRange(
      current_, input_level, output_level, begin, end, compaction_end);
  return compaction;
}
|
||||
|
||||
// Initializes `new_superversion` from the current memtable, immutable list
// and Version, and installs it as super_version_ with the next version
// number. Returns the previous SuperVersion when this call dropped its last
// reference, so that the caller can delete it outside of the mutex;
// otherwise returns nullptr.
SuperVersion* ColumnFamilyData::InstallSuperVersion(
    SuperVersion* new_superversion, port::Mutex* db_mutex) {
  new_superversion->db_mutex = db_mutex;
  new_superversion->Init(mem_, imm_.current(), current_);

  SuperVersion* const previous = super_version_;
  super_version_ = new_superversion;
  ++super_version_number_;
  super_version_->version_number = super_version_number_;

  if (previous == nullptr || !previous->Unref()) {
    return nullptr;
  }
  previous->Cleanup();
  return previous;
}
|
||||
|
||||
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
|
||||
autovector<void*> sv_ptrs;
|
||||
local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
|
||||
for (auto ptr : sv_ptrs) {
|
||||
assert(ptr);
|
||||
if (ptr == SuperVersion::kSVInUse) {
|
||||
continue;
|
||||
}
|
||||
auto sv = static_cast<SuperVersion*>(ptr);
|
||||
if (sv->Unref()) {
|
||||
sv->Cleanup();
|
||||
delete sv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Creates an empty column family set. dummy_cfd_ is a dummy ColumnFamilyData
// (null dummy_versions and table cache) that anchors the circular
// doubly-linked list of column families; it starts out linked to itself.
//
// The dummy is constructed from the `storage_options` parameter rather than
// the storage_options_ member: storage_options_ appears later in this
// initializer list, so it may not have been constructed yet at the point
// where dummy_cfd_ is initialized.
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
                                 const DBOptions* db_options,
                                 const EnvOptions& storage_options,
                                 Cache* table_cache)
    : max_column_family_(0),
      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
                                      ColumnFamilyOptions(), db_options,
                                      storage_options, nullptr)),
      default_cfd_cache_(nullptr),
      db_name_(dbname),
      db_options_(db_options),
      storage_options_(storage_options),
      table_cache_(table_cache),
      spin_lock_(ATOMIC_FLAG_INIT) {
  // An empty circular list: the dummy node points at itself both ways.
  dummy_cfd_->prev_ = dummy_cfd_;
  dummy_cfd_->next_ = dummy_cfd_;
}
|
||||
|
||||
// Unrefs and deletes every remaining column family, then the dummy node.
ColumnFamilySet::~ColumnFamilySet() {
  while (!column_family_data_.empty()) {
    // The cfd destructor removes its own entry from column_family_data_, so
    // the map shrinks on every iteration.
    ColumnFamilyData* const victim = column_family_data_.begin()->second;
    victim->Unref();
    delete victim;
  }
  dummy_cfd_->Unref();
  delete dummy_cfd_;
}
|
||||
|
||||
// Returns the cached pointer to the column family with ID 0. It must already
// have been created (see CreateColumnFamily).
ColumnFamilyData* ColumnFamilySet::GetDefault() const {
  ColumnFamilyData* const default_cfd = default_cfd_cache_;
  assert(default_cfd != nullptr);
  return default_cfd;
}
|
||||
|
||||
// Looks up a column family by ID. Returns nullptr when the ID is unknown.
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
  const auto found = column_family_data_.find(id);
  return found == column_family_data_.end() ? nullptr : found->second;
}
|
||||
|
||||
// Looks up a column family by name. Returns nullptr when the name is
// unknown. A known name must always resolve to a live ColumnFamilyData.
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
    const {
  const auto name_entry = column_families_.find(name);
  if (name_entry == column_families_.end()) {
    return nullptr;
  }
  ColumnFamilyData* const cfd = GetColumnFamily(name_entry->second);
  assert(cfd != nullptr);
  return cfd;
}
|
||||
|
||||
uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
|
||||
return ++max_column_family_;
|
||||
}
|
||||
|
||||
// Returns the largest column family ID recorded so far.
uint32_t ColumnFamilySet::GetMaxColumnFamily() {
  const uint32_t largest_id = max_column_family_;
  return largest_id;
}
|
||||
|
||||
// Raises the recorded maximum column family ID to `new_max_column_family`
// when that is larger; the maximum is never lowered.
void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
  if (new_max_column_family > max_column_family_) {
    max_column_family_ = new_max_column_family;
  }
}
|
||||
|
||||
// under a DB mutex
|
||||
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
|
||||
const std::string& name, uint32_t id, Version* dummy_versions,
|
||||
const ColumnFamilyOptions& options) {
|
||||
assert(column_families_.find(name) == column_families_.end());
|
||||
ColumnFamilyData* new_cfd =
|
||||
new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
|
||||
options, db_options_, storage_options_, this);
|
||||
Lock();
|
||||
column_families_.insert({name, id});
|
||||
column_family_data_.insert({id, new_cfd});
|
||||
Unlock();
|
||||
max_column_family_ = std::max(max_column_family_, id);
|
||||
// add to linked list
|
||||
new_cfd->next_ = dummy_cfd_;
|
||||
auto prev = dummy_cfd_->prev_;
|
||||
new_cfd->prev_ = prev;
|
||||
prev->next_ = new_cfd;
|
||||
dummy_cfd_->prev_ = new_cfd;
|
||||
if (id == 0) {
|
||||
default_cfd_cache_ = new_cfd;
|
||||
}
|
||||
return new_cfd;
|
||||
}
|
||||
|
||||
void ColumnFamilySet::Lock() {
|
||||
// spin lock
|
||||
while (spin_lock_.test_and_set(std::memory_order_acquire)) {
|
||||
}
|
||||
}
|
||||
|
||||
// Releases the spin lock taken by Lock().
void ColumnFamilySet::Unlock() {
  spin_lock_.clear(std::memory_order_release);
}
|
||||
|
||||
// REQUIRES: DB mutex held
|
||||
void ColumnFamilySet::FreeDeadColumnFamilies() {
|
||||
autovector<ColumnFamilyData*> to_delete;
|
||||
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
|
||||
if (cfd->refs_ == 0) {
|
||||
to_delete.push_back(cfd);
|
||||
}
|
||||
}
|
||||
for (auto cfd : to_delete) {
|
||||
// this is very rare, so it's not a problem that we do it under a mutex
|
||||
delete cfd;
|
||||
}
|
||||
}
|
||||
|
||||
// under a DB mutex
|
||||
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
|
||||
auto cfd_iter = column_family_data_.find(cfd->GetID());
|
||||
assert(cfd_iter != column_family_data_.end());
|
||||
Lock();
|
||||
column_family_data_.erase(cfd_iter);
|
||||
column_families_.erase(cfd->GetName());
|
||||
Unlock();
|
||||
}
|
||||
|
||||
// Selects the column family with the given ID as the current one and points
// the internal handle at it. Returns false if no such column family exists.
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
  if (column_family_id != 0) {
    // maybe outside of db mutex, should lock
    column_family_set_->Lock();
    current_ = column_family_set_->GetColumnFamily(column_family_id);
    column_family_set_->Unlock();
  } else {
    // optimization for common case: the default column family
    current_ = column_family_set_->GetDefault();
  }
  handle_.SetCFD(current_);
  return current_ != nullptr;
}
|
||||
|
||||
// Returns the log number of the currently selected column family.
// REQUIRES: a column family is selected (current_ is non-null).
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
  assert(current_ != nullptr);
  const uint64_t log_number = current_->GetLogNumber();
  return log_number;
}
|
||||
|
||||
// Returns the memtable of the currently selected column family.
// REQUIRES: a column family is selected (current_ is non-null).
MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
  assert(current_ != nullptr);
  MemTable* const memtable = current_->mem();
  return memtable;
}
|
||||
|
||||
// Returns the options of the currently selected column family.
// REQUIRES: a column family is selected (current_ is non-null).
const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
  assert(current_ != nullptr);
  const Options* const selected_options = current_->options();
  return selected_options;
}
|
||||
|
||||
// Returns the address of the internal handle member; Seek() points that
// handle at the selected column family.
// REQUIRES: a column family is selected (current_ is non-null).
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
  assert(current_ != nullptr);
  ColumnFamilyHandle* const handle = &handle_;
  return handle;
}
|
||||
|
||||
} // namespace rocksdb
|
408
db/column_family.h
Normal file
408
db/column_family.h
Normal file
@ -0,0 +1,408 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <atomic>
|
||||
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "db/memtable_list.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "util/thread_local.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Version;
|
||||
class VersionSet;
|
||||
class MemTable;
|
||||
class MemTableListVersion;
|
||||
class CompactionPicker;
|
||||
class Compaction;
|
||||
class InternalKey;
|
||||
class InternalStats;
|
||||
class ColumnFamilyData;
|
||||
class DBImpl;
|
||||
class LogBuffer;
|
||||
|
||||
// ColumnFamilyHandleImpl is the class that clients use to access different
|
||||
// column families. It has non-trivial destructor, which gets called when client
|
||||
// is done using the column family
|
||||
class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
|
||||
public:
|
||||
// create while holding the mutex
|
||||
ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
|
||||
// destroy without mutex
|
||||
virtual ~ColumnFamilyHandleImpl();
|
||||
// Returns the ColumnFamilyData pointer stored in this handle. Virtual so
// that a subclass can hand out a different pointer.
virtual ColumnFamilyData* cfd() const { return cfd_; }
|
||||
|
||||
// NOTE(review): defined out of line; presumably the ID of the underlying
// column family -- confirm against the definition.
virtual uint32_t GetID() const;
|
||||
|
||||
private:
|
||||
// Column family this handle refers to (what cfd() returns).
ColumnFamilyData* cfd_;
|
||||
// NOTE(review): db_ and mutex_ are not used in this header; presumably
// needed by the out-of-line destructor -- confirm.
DBImpl* db_;
|
||||
port::Mutex* mutex_;
|
||||
};
|
||||
|
||||
// Does not ref-count ColumnFamilyData
|
||||
// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
|
||||
// calls DBImpl methods. When this happens, MemTableInserter need access to
|
||||
// ColumnFamilyHandle (same as the client would need). In that case, we feed
|
||||
// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
|
||||
// methods
|
||||
class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
|
||||
public:
|
||||
ColumnFamilyHandleInternal()
|
||||
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}
|
||||
|
||||
void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
|
||||
virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
|
||||
|
||||
private:
|
||||
ColumnFamilyData* internal_cfd_;
|
||||
};
|
||||
|
||||
// holds references to memtable, all immutable memtables and version
|
||||
struct SuperVersion {
|
||||
// The three objects this SuperVersion holds references to.
MemTable* mem;
|
||||
MemTableListVersion* imm;
|
||||
Version* current;
|
||||
// NOTE(review): presumably the reference count manipulated by Ref()/Unref()
// -- confirm against the definitions.
std::atomic<uint32_t> refs;
|
||||
// We need to_delete because during Cleanup(), imm->Unref() returns
|
||||
// all memtables that we need to free through this vector. We then
|
||||
// delete all those memtables outside of mutex, during destruction
|
||||
autovector<MemTable*> to_delete;
|
||||
// Version number of the current SuperVersion
|
||||
uint64_t version_number;
|
||||
port::Mutex* db_mutex;
|
||||
|
||||
// should be called outside the mutex
|
||||
// NOTE: the defaulted constructor does not initialize the pointer and
// integer members declared above.
SuperVersion() = default;
|
||||
~SuperVersion();
|
||||
SuperVersion* Ref();
|
||||
|
||||
// NOTE(review): presumably returns true when the last reference was
// dropped -- confirm against the definition.
bool Unref();
|
||||
|
||||
// call these two methods with db mutex held
|
||||
// Cleanup unrefs mem, imm and current. Also, it stores all memtables
|
||||
// that need to be deleted in the to_delete vector. Unrefing those
|
||||
// objects needs to be done in the mutex
|
||||
void Cleanup();
|
||||
void Init(MemTable* new_mem, MemTableListVersion* new_imm,
|
||||
Version* new_current);
|
||||
|
||||
// The value of dummy is not actually used. kSVInUse takes its address as a
|
||||
// mark in the thread local storage to indicate the SuperVersion is in use
|
||||
// by thread. This way, the value of kSVInUse is guaranteed to have no
|
||||
// conflict with SuperVersion object address and portable on different
|
||||
// platform.
|
||||
static int dummy;
|
||||
static void* const kSVInUse;
|
||||
// NOTE(review): undocumented here; presumably a second thread-local marker
// meaning the cached SuperVersion is stale -- confirm.
static void* const kSVObsolete;
|
||||
};
|
||||
|
||||
// NOTE(review): only the declaration is visible here. Presumably returns a
// copy of `src` adjusted for use with `icmp` and `ipolicy` -- confirm against
// the definition.
extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
|
||||
const InternalFilterPolicy* ipolicy,
|
||||
const ColumnFamilyOptions& src);
|
||||
|
||||
class ColumnFamilySet;
|
||||
|
||||
// This class keeps all the data that a column family needs. It's mostly dumb and
|
||||
// used just to provide access to metadata.
|
||||
// Most methods require DB mutex held, unless otherwise noted
|
||||
class ColumnFamilyData {
|
||||
public:
|
||||
~ColumnFamilyData();
|
||||
|
||||
// thread-safe
|
||||
uint32_t GetID() const { return id_; }
|
||||
// thread-safe
|
||||
const std::string& GetName() const { return name_; }
|
||||
|
||||
void Ref() { ++refs_; }
|
||||
// will just decrease reference count to 0, but will not delete it. returns
|
||||
// true if the ref count was decreased to zero. in that case, it can be
|
||||
// deleted by the caller immediatelly, or later, by calling
|
||||
// FreeDeadColumnFamilies()
|
||||
bool Unref() {
|
||||
assert(refs_ > 0);
|
||||
return --refs_ == 0;
|
||||
}
|
||||
|
||||
// This can only be called from single-threaded VersionSet::LogAndApply()
|
||||
// After dropping column family no other operation on that column family
|
||||
// will be executed. All the files and memory will be, however, kept around
|
||||
// until client drops the column family handle. That way, client can still
|
||||
// access data from dropped column family.
|
||||
// Column family can be dropped and still alive. In that state:
|
||||
// *) Column family is not included in the iteration.
|
||||
// *) Compaction and flush is not executed on the dropped column family.
|
||||
// *) Client can continue writing and reading from column family. However, all
|
||||
// writes stay in the current memtable.
|
||||
// When the dropped column family is unreferenced, then we:
|
||||
// *) delete all memory associated with that column family
|
||||
// *) delete all the files associated with that column family
|
||||
void SetDropped() {
|
||||
// can't drop default CF
|
||||
assert(id_ != 0);
|
||||
dropped_ = true;
|
||||
}
|
||||
bool IsDropped() const { return dropped_; }
|
||||
|
||||
// thread-safe
|
||||
int NumberLevels() const { return options_.num_levels; }
|
||||
|
||||
void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
|
||||
uint64_t GetLogNumber() const { return log_number_; }
|
||||
|
||||
// thread-safe
|
||||
const Options* options() const { return &options_; }
|
||||
|
||||
InternalStats* internal_stats() { return internal_stats_.get(); }
|
||||
|
||||
MemTableList* imm() { return &imm_; }
|
||||
MemTable* mem() { return mem_; }
|
||||
Version* current() { return current_; }
|
||||
Version* dummy_versions() { return dummy_versions_; }
|
||||
void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
|
||||
void SetCurrent(Version* current);
|
||||
void CreateNewMemtable();
|
||||
|
||||
TableCache* table_cache() { return table_cache_.get(); }
|
||||
|
||||
// See documentation in compaction_picker.h
|
||||
Compaction* PickCompaction(LogBuffer* log_buffer);
|
||||
Compaction* CompactRange(int input_level, int output_level,
|
||||
const InternalKey* begin, const InternalKey* end,
|
||||
InternalKey** compaction_end);
|
||||
|
||||
CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
|
||||
// thread-safe
|
||||
const Comparator* user_comparator() const {
|
||||
return internal_comparator_.user_comparator();
|
||||
}
|
||||
// thread-safe
|
||||
const InternalKeyComparator& internal_comparator() const {
|
||||
return internal_comparator_;
|
||||
}
|
||||
|
||||
SuperVersion* GetSuperVersion() { return super_version_; }
|
||||
// thread-safe
|
||||
ThreadLocalPtr* GetThreadLocalSuperVersion() const { return local_sv_.get(); }
|
||||
// thread-safe
|
||||
uint64_t GetSuperVersionNumber() const {
|
||||
return super_version_number_.load();
|
||||
}
|
||||
// will return a pointer to SuperVersion* if previous SuperVersion
|
||||
// if its reference count is zero and needs deletion or nullptr if not
|
||||
// As argument takes a pointer to allocated SuperVersion to enable
|
||||
// the clients to allocate SuperVersion outside of mutex.
|
||||
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
|
||||
port::Mutex* db_mutex);
|
||||
|
||||
void ResetThreadLocalSuperVersions();
|
||||
|
||||
// A Flag indicating whether write needs to slowdown because of there are
|
||||
// too many number of level0 files.
|
||||
bool NeedSlowdownForNumLevel0Files() const {
|
||||
return need_slowdown_for_num_level0_files_;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class ColumnFamilySet;
|
||||
ColumnFamilyData(const std::string& dbname, uint32_t id,
|
||||
const std::string& name, Version* dummy_versions,
|
||||
Cache* table_cache, const ColumnFamilyOptions& options,
|
||||
const DBOptions* db_options,
|
||||
const EnvOptions& storage_options,
|
||||
ColumnFamilySet* column_family_set);
|
||||
|
||||
uint32_t id_;
|
||||
const std::string name_;
|
||||
Version* dummy_versions_; // Head of circular doubly-linked list of versions.
|
||||
Version* current_; // == dummy_versions->prev_
|
||||
|
||||
int refs_; // outstanding references to ColumnFamilyData
|
||||
bool dropped_; // true if client dropped it
|
||||
|
||||
const InternalKeyComparator internal_comparator_;
|
||||
const InternalFilterPolicy internal_filter_policy_;
|
||||
|
||||
Options const options_;
|
||||
|
||||
std::unique_ptr<TableCache> table_cache_;
|
||||
|
||||
std::unique_ptr<InternalStats> internal_stats_;
|
||||
|
||||
MemTable* mem_;
|
||||
MemTableList imm_;
|
||||
SuperVersion* super_version_;
|
||||
|
||||
// An ordinal representing the current SuperVersion. Updated by
|
||||
// InstallSuperVersion(), i.e. incremented every time super_version_
|
||||
// changes.
|
||||
std::atomic<uint64_t> super_version_number_;
|
||||
|
||||
// Thread's local copy of SuperVersion pointer
|
||||
// This needs to be destructed before mutex_
|
||||
std::unique_ptr<ThreadLocalPtr> local_sv_;
|
||||
|
||||
// pointers for a circular linked list. we use it to support iterations
|
||||
// that can be concurrent with writes
|
||||
ColumnFamilyData* next_;
|
||||
ColumnFamilyData* prev_;
|
||||
|
||||
// This is the earliest log file number that contains data from this
|
||||
// Column Family. All earlier log files must be ignored and not
|
||||
// recovered from
|
||||
uint64_t log_number_;
|
||||
|
||||
// A flag indicating whether we should delay writes because
|
||||
// we have too many level 0 files
|
||||
bool need_slowdown_for_num_level0_files_;
|
||||
|
||||
// An object that keeps all the compaction stats
|
||||
// and picks the next compaction
|
||||
std::unique_ptr<CompactionPicker> compaction_picker_;
|
||||
|
||||
ColumnFamilySet* column_family_set_;
|
||||
};
|
||||
|
||||
// ColumnFamilySet has interesting thread-safety requirements
|
||||
// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
|
||||
// mutex. Inside, column_family_data_ and column_families_ will be protected
|
||||
// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
|
||||
// VersionSet::LogAndApply() in the normal runtime. It is also called
|
||||
// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
|
||||
// from ColumnFamilyData destructor
|
||||
// * Iteration -- hold DB mutex, but you can release it in the body of
|
||||
// iteration. If you release DB mutex in body, reference the column
|
||||
// family before the mutex and unreference after you unlock, since the column
|
||||
// family might get dropped when the DB mutex is released
|
||||
// * GetDefault() -- thread safe
|
||||
// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
|
||||
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily() --
|
||||
// inside of DB mutex
|
||||
class ColumnFamilySet {
|
||||
public:
|
||||
// ColumnFamilySet supports iteration
|
||||
class iterator {
|
||||
public:
|
||||
explicit iterator(ColumnFamilyData* cfd)
|
||||
: current_(cfd) {}
|
||||
iterator& operator++() {
|
||||
// dummy is never dead or dropped, so this will never be infinite
|
||||
do {
|
||||
current_ = current_->next_;
|
||||
} while (current_->refs_ == 0 || current_->IsDropped());
|
||||
return *this;
|
||||
}
|
||||
bool operator!=(const iterator& other) {
|
||||
return this->current_ != other.current_;
|
||||
}
|
||||
ColumnFamilyData* operator*() { return current_; }
|
||||
|
||||
private:
|
||||
ColumnFamilyData* current_;
|
||||
};
|
||||
|
||||
ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
|
||||
const EnvOptions& storage_options, Cache* table_cache);
|
||||
~ColumnFamilySet();
|
||||
|
||||
ColumnFamilyData* GetDefault() const;
|
||||
// GetColumnFamily() calls return nullptr if column family is not found
|
||||
ColumnFamilyData* GetColumnFamily(uint32_t id) const;
|
||||
ColumnFamilyData* GetColumnFamily(const std::string& name) const;
|
||||
// this call will return the next available column family ID. it guarantees
|
||||
// that there is no column family with id greater than or equal to the
|
||||
// returned value in the current running instance or anytime in RocksDB
|
||||
// instance history.
|
||||
uint32_t GetNextColumnFamilyID();
|
||||
uint32_t GetMaxColumnFamily();
|
||||
void UpdateMaxColumnFamily(uint32_t new_max_column_family);
|
||||
|
||||
ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
|
||||
Version* dummy_version,
|
||||
const ColumnFamilyOptions& options);
|
||||
|
||||
iterator begin() { return iterator(dummy_cfd_->next_); }
|
||||
iterator end() { return iterator(dummy_cfd_); }
|
||||
|
||||
void Lock();
|
||||
void Unlock();
|
||||
|
||||
// REQUIRES: DB mutex held
|
||||
// Don't call while iterating over ColumnFamilySet
|
||||
void FreeDeadColumnFamilies();
|
||||
|
||||
private:
|
||||
friend class ColumnFamilyData;
|
||||
// helper function that gets called from cfd destructor
|
||||
// REQUIRES: DB mutex held
|
||||
void RemoveColumnFamily(ColumnFamilyData* cfd);
|
||||
|
||||
// column_families_ and column_family_data_ need to be protected:
|
||||
// * when mutating: 1. DB mutex locked first, 2. spinlock locked second
|
||||
// * when reading, either: 1. lock DB mutex, or 2. lock spinlock
|
||||
// (if both, respect the ordering to avoid deadlock!)
|
||||
std::unordered_map<std::string, uint32_t> column_families_;
|
||||
std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
|
||||
|
||||
uint32_t max_column_family_;
|
||||
ColumnFamilyData* dummy_cfd_;
|
||||
// We don't hold the refcount here, since default column family always exists
|
||||
// We are also not responsible for cleaning up default_cfd_cache_. This is
|
||||
// just a cache that makes common case (accessing default column family)
|
||||
// faster
|
||||
ColumnFamilyData* default_cfd_cache_;
|
||||
|
||||
const std::string db_name_;
|
||||
const DBOptions* const db_options_;
|
||||
const EnvOptions storage_options_;
|
||||
Cache* table_cache_;
|
||||
std::atomic_flag spin_lock_;
|
||||
};
|
||||
|
||||
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
|
||||
// memtables of different column families (specified by ID in the write batch)
|
||||
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
|
||||
public:
|
||||
// Starts with no column family selected (current_ is nullptr); call Seek()
// before using the accessors below.
explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
|
||||
: column_family_set_(column_family_set), current_(nullptr) {}
|
||||
|
||||
// sets current_ to ColumnFamilyData with column_family_id
|
||||
// returns false if column family doesn't exist
|
||||
bool Seek(uint32_t column_family_id) override;
|
||||
|
||||
// Returns log number of the selected column family
|
||||
// REQUIRES: Seek() called first (the definition asserts current_ != nullptr)
uint64_t GetLogNumber() const override;
|
||||
|
||||
// REQUIRES: Seek() called first
|
||||
virtual MemTable* GetMemTable() const override;
|
||||
|
||||
// Returns options for selected column family
|
||||
// REQUIRES: Seek() called first
|
||||
virtual const Options* GetOptions() const override;
|
||||
|
||||
// Returns column family handle for the selected column family
|
||||
// REQUIRES: Seek() called first (the definition asserts current_ != nullptr)
// The returned pointer is the address of the handle_ member below.
virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
|
||||
|
||||
private:
|
||||
ColumnFamilySet* column_family_set_;
|
||||
// Column family selected by the last Seek(); nullptr if none.
ColumnFamilyData* current_;
|
||||
// Handle that Seek() points at current_.
ColumnFamilyHandleInternal handle_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
857
db/column_family_test.cc
Normal file
857
db/column_family_test.cc
Normal file
@ -0,0 +1,857 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
#include "util/coding.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
// Convenience wrapper around test::RandomString() that returns the generated
// string by value instead of through an out-parameter.
std::string RandomString(Random* rnd, int len) {
  std::string generated;
  test::RandomString(rnd, len, &generated);
  return generated;
}
|
||||
} // anonymous namespace
|
||||
|
||||
class ColumnFamilyTest {
|
||||
public:
|
||||
ColumnFamilyTest() : rnd_(139) {
|
||||
env_ = Env::Default();
|
||||
dbname_ = test::TmpDir() + "/column_family_test";
|
||||
db_options_.create_if_missing = true;
|
||||
DestroyDB(dbname_, Options(db_options_, column_family_options_));
|
||||
}
|
||||
|
||||
void Close() {
|
||||
for (auto h : handles_) {
|
||||
delete h;
|
||||
}
|
||||
handles_.clear();
|
||||
names_.clear();
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
}
|
||||
|
||||
Status TryOpen(std::vector<std::string> cf,
|
||||
std::vector<ColumnFamilyOptions> options = {}) {
|
||||
std::vector<ColumnFamilyDescriptor> column_families;
|
||||
names_.clear();
|
||||
for (size_t i = 0; i < cf.size(); ++i) {
|
||||
column_families.push_back(ColumnFamilyDescriptor(
|
||||
cf[i], options.size() == 0 ? column_family_options_ : options[i]));
|
||||
names_.push_back(cf[i]);
|
||||
}
|
||||
return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
|
||||
}
|
||||
|
||||
void Open(std::vector<std::string> cf,
|
||||
std::vector<ColumnFamilyOptions> options = {}) {
|
||||
ASSERT_OK(TryOpen(cf, options));
|
||||
}
|
||||
|
||||
void Open() {
|
||||
Open({"default"});
|
||||
}
|
||||
|
||||
DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
|
||||
|
||||
int GetProperty(int cf, std::string property) {
|
||||
std::string value;
|
||||
ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
|
||||
return std::stoi(value);
|
||||
}
|
||||
|
||||
void Destroy() {
|
||||
for (auto h : handles_) {
|
||||
delete h;
|
||||
}
|
||||
handles_.clear();
|
||||
names_.clear();
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
|
||||
}
|
||||
|
||||
void CreateColumnFamilies(
|
||||
const std::vector<std::string>& cfs,
|
||||
const std::vector<ColumnFamilyOptions> options = {}) {
|
||||
int cfi = handles_.size();
|
||||
handles_.resize(cfi + cfs.size());
|
||||
names_.resize(cfi + cfs.size());
|
||||
for (size_t i = 0; i < cfs.size(); ++i) {
|
||||
ASSERT_OK(db_->CreateColumnFamily(
|
||||
options.size() == 0 ? column_family_options_ : options[i], cfs[i],
|
||||
&handles_[cfi]));
|
||||
names_[cfi] = cfs[i];
|
||||
cfi++;
|
||||
}
|
||||
}
|
||||
|
||||
void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
|
||||
std::vector<std::string> names;
|
||||
for (auto name : names_) {
|
||||
if (name != "") {
|
||||
names.push_back(name);
|
||||
}
|
||||
}
|
||||
Close();
|
||||
assert(options.size() == 0 || names.size() == options.size());
|
||||
Open(names, options);
|
||||
}
|
||||
|
||||
void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
|
||||
CreateColumnFamilies(cfs);
|
||||
Reopen();
|
||||
}
|
||||
|
||||
void DropColumnFamilies(const std::vector<int>& cfs) {
|
||||
for (auto cf : cfs) {
|
||||
ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
|
||||
delete handles_[cf];
|
||||
handles_[cf] = nullptr;
|
||||
names_[cf] = "";
|
||||
}
|
||||
}
|
||||
|
||||
void PutRandomData(int cf, int num, int key_value_size) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
// 10 bytes for key, rest is value
|
||||
ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10),
|
||||
RandomString(&rnd_, key_value_size - 10)));
|
||||
}
|
||||
}
|
||||
|
||||
void WaitForFlush(int cf) {
|
||||
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
|
||||
}
|
||||
|
||||
void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
|
||||
|
||||
Status Put(int cf, const std::string& key, const std::string& value) {
|
||||
return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
|
||||
}
|
||||
Status Merge(int cf, const std::string& key, const std::string& value) {
|
||||
return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
|
||||
}
|
||||
Status Flush(int cf) {
|
||||
return db_->Flush(FlushOptions(), handles_[cf]);
|
||||
}
|
||||
|
||||
std::string Get(int cf, const std::string& key) {
|
||||
ReadOptions options;
|
||||
options.verify_checksums = true;
|
||||
std::string result;
|
||||
Status s = db_->Get(options, handles_[cf], Slice(key), &result);
|
||||
if (s.IsNotFound()) {
|
||||
result = "NOT_FOUND";
|
||||
} else if (!s.ok()) {
|
||||
result = s.ToString();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void CompactAll(int cf) {
|
||||
ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
|
||||
}
|
||||
|
||||
void Compact(int cf, const Slice& start, const Slice& limit) {
|
||||
ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
|
||||
}
|
||||
|
||||
int NumTableFilesAtLevel(int level, int cf) {
|
||||
return GetProperty(cf,
|
||||
"rocksdb.num-files-at-level" + std::to_string(level));
|
||||
}
|
||||
|
||||
// Return spread of files per level
|
||||
std::string FilesPerLevel(int cf) {
|
||||
std::string result;
|
||||
int last_non_zero_offset = 0;
|
||||
for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
|
||||
int f = NumTableFilesAtLevel(level, cf);
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
|
||||
result += buf;
|
||||
if (f > 0) {
|
||||
last_non_zero_offset = result.size();
|
||||
}
|
||||
}
|
||||
result.resize(last_non_zero_offset);
|
||||
return result;
|
||||
}
|
||||
|
||||
int CountLiveFiles(int cf) {
|
||||
std::vector<LiveFileMetaData> metadata;
|
||||
db_->GetLiveFilesMetaData(&metadata);
|
||||
return static_cast<int>(metadata.size());
|
||||
}
|
||||
|
||||
// Do n memtable flushes, each of which produces an sstable
|
||||
// covering the range [small,large].
|
||||
void MakeTables(int cf, int n, const std::string& small,
|
||||
const std::string& large) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
ASSERT_OK(Put(cf, small, "begin"));
|
||||
ASSERT_OK(Put(cf, large, "end"));
|
||||
ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
|
||||
}
|
||||
}
|
||||
|
||||
int CountLiveLogFiles() {
|
||||
int micros_wait_for_log_deletion = 20000;
|
||||
env_->SleepForMicroseconds(micros_wait_for_log_deletion);
|
||||
int ret = 0;
|
||||
VectorLogPtr wal_files;
|
||||
Status s;
|
||||
// GetSortedWalFiles is a flakey function -- it gets all the wal_dir
|
||||
// children files and then later checks for their existance. if some of the
|
||||
// log files doesn't exist anymore, it reports an error. it does all of this
|
||||
// without DB mutex held, so if a background process deletes the log file
|
||||
// while the function is being executed, it returns an error. We retry the
|
||||
// function 10 times to avoid the error failing the test
|
||||
for (int retries = 0; retries < 10; ++retries) {
|
||||
wal_files.clear();
|
||||
s = db_->GetSortedWalFiles(wal_files);
|
||||
if (s.ok()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
ASSERT_OK(s);
|
||||
for (const auto& wal : wal_files) {
|
||||
if (wal->Type() == kAliveLogFile) {
|
||||
++ret;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
|
||||
assert(num_per_cf.size() == handles_.size());
|
||||
|
||||
for (size_t i = 0; i < num_per_cf.size(); ++i) {
|
||||
ASSERT_EQ(num_per_cf[i],
|
||||
GetProperty(i, "rocksdb.num-immutable-mem-table"));
|
||||
}
|
||||
}
|
||||
|
||||
void CopyFile(const std::string& source, const std::string& destination,
|
||||
uint64_t size = 0) {
|
||||
const EnvOptions soptions;
|
||||
unique_ptr<SequentialFile> srcfile;
|
||||
ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
|
||||
unique_ptr<WritableFile> destfile;
|
||||
ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
|
||||
|
||||
if (size == 0) {
|
||||
// default argument means copy everything
|
||||
ASSERT_OK(env_->GetFileSize(source, &size));
|
||||
}
|
||||
|
||||
char buffer[4096];
|
||||
Slice slice;
|
||||
while (size > 0) {
|
||||
uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
|
||||
ASSERT_OK(srcfile->Read(one, &slice, buffer));
|
||||
ASSERT_OK(destfile->Append(slice));
|
||||
size -= slice.size();
|
||||
}
|
||||
ASSERT_OK(destfile->Close());
|
||||
}
|
||||
|
||||
std::vector<ColumnFamilyHandle*> handles_;
|
||||
std::vector<std::string> names_;
|
||||
ColumnFamilyOptions column_family_options_;
|
||||
DBOptions db_options_;
|
||||
std::string dbname_;
|
||||
DB* db_ = nullptr;
|
||||
Env* env_;
|
||||
Random rnd_;
|
||||
};
|
||||
|
||||
// Verifies that the ID of a dropped column family is never handed out again,
// with reopens at different points depending on the round.
TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
  for (int round = 0; round < 3; ++round) {
    Open();
    CreateColumnFamilies({"one", "two", "three"});
    // Freshly created column families get consecutive IDs matching their
    // position in handles_.
    for (size_t idx = 0; idx < handles_.size(); ++idx) {
      auto handle_impl = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[idx]);
      ASSERT_EQ(idx, handle_impl->GetID());
    }
    if (round == 1) {
      Reopen();
    }
    DropColumnFamilies({3});
    Reopen();
    if (round == 2) {
      // this tests if max_column_family is correctly persisted with
      // WriteSnapshot()
      Reopen();
    }
    CreateColumnFamilies({"three2"});
    // ID 3 that was used for dropped column family "three" should not be reused
    auto replacement = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
    ASSERT_EQ(4, replacement->GetID());
    Close();
    Destroy();
  }
}
|
||||
|
||||
|
||||
// Creates and drops column families, checks reads and writes in between, and
// finally checks the list of column families stored on disk.
TEST(ColumnFamilyTest, AddDrop) {
  Open();
  CreateColumnFamilies({"one", "two", "three"});
  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
  ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
  DropColumnFamilies({2});
  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
  CreateColumnFamilies({"four"});
  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
  ASSERT_OK(Put(1, "fodor", "mirko"));
  ASSERT_EQ("mirko", Get(1, "fodor"));
  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
  Close();

  // Opening with only the default column family must be rejected.
  Status partial_open = TryOpen({"default"});
  ASSERT_TRUE(partial_open.IsInvalidArgument());
  Open({"default", "one", "three", "four"});
  DropColumnFamilies({1});
  Reopen();
  Close();

  std::vector<std::string> on_disk;
  ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &on_disk));
  std::sort(on_disk.begin(), on_disk.end());
  const std::vector<std::string> expected({"default", "four", "three"});
  ASSERT_TRUE(on_disk == expected);
}
|
||||
|
||||
// Checks that dropping a column family removes its live files.
TEST(ColumnFamilyTest, DropTest) {
  // round 0 - don't reopen the DB before dropping
  // round 1 - reopen the DB before dropping
  for (int round = 0; round < 2; ++round) {
    Open({"default"});
    CreateColumnFamiliesAndReopen({"pikachu"});
    for (int k = 0; k < 100; ++k) {
      ASSERT_OK(Put(1, std::to_string(k), "bar" + std::to_string(k)));
    }
    ASSERT_OK(Flush(1));

    if (round == 1) {
      Reopen();
    }
    ASSERT_EQ("bar1", Get(1, "1"));

    ASSERT_EQ(CountLiveFiles(1), 1);
    DropColumnFamilies({1});
    // make sure that all files are deleted when we drop the column family
    ASSERT_EQ(CountLiveFiles(1), 0);
    Destroy();
  }
}
|
||||
|
||||
// A batch that targets a column family is accepted while that column family
// exists and rejected with InvalidArgument once it has been dropped.
TEST(ColumnFamilyTest, WriteBatchFailure) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  WriteBatch batch;
  batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
  ASSERT_OK(db_->Write(WriteOptions(), &batch));
  DropColumnFamilies({1});
  Status after_drop = db_->Write(WriteOptions(), &batch);
  ASSERT_TRUE(after_drop.IsInvalidArgument());
  Close();
}
|
||||
|
||||
// Writes to three column families and checks that each one sees only its own
// keys, both before and after reopening the DB.
TEST(ColumnFamilyTest, ReadWrite) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  ASSERT_OK(Put(0, "foo", "v1"));
  ASSERT_OK(Put(0, "bar", "v2"));
  ASSERT_OK(Put(1, "mirko", "v3"));
  ASSERT_OK(Put(0, "foo", "v2"));
  ASSERT_OK(Put(2, "fodor", "v5"));

  // Four verification passes; the DB is reopened after the first two.
  for (int pass = 0; pass < 4; ++pass) {
    ASSERT_EQ("v2", Get(0, "foo"));
    ASSERT_EQ("v2", Get(0, "bar"));
    ASSERT_EQ("v3", Get(1, "mirko"));
    ASSERT_EQ("v5", Get(2, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
    if (pass < 2) {
      Reopen();
    }
  }
  Close();
}
|
||||
|
||||
// Verifies that recovery results stay the same when WAL files that were
// already recovered once show up in the WAL directory again.
//
// The test fills three column families using a uint64-add merge operator,
// backs up the WAL files, and then opens the DB twice. Between the two opens
// the backed-up logs are copied back into the WAL directory; the merged
// values must be identical on both opens.
TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
  std::string backup_logs = dbname_ + "/backup_logs";

  // delete old files in backup_logs directory
  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
  std::vector<std::string> old_files;
  // A failed listing would leave stale backups behind unnoticed, so check it.
  ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
  for (auto& file : old_files) {
    if (file != "." && file != "..") {
      ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
    }
  }

  column_family_options_.merge_operator =
      MergeOperators::CreateUInt64AddOperator();
  db_options_.wal_dir = dbname_ + "/logs";
  Destroy();
  Open();
  CreateColumnFamilies({"cf1", "cf2"});

  // fill up the DB
  std::string one, two, three;
  PutFixed64(&one, 1);
  PutFixed64(&two, 2);
  PutFixed64(&three, 3);
  ASSERT_OK(Merge(0, "foo", one));
  ASSERT_OK(Merge(1, "mirko", one));
  ASSERT_OK(Merge(0, "foo", one));
  ASSERT_OK(Merge(2, "bla", one));
  ASSERT_OK(Merge(2, "fodor", one));
  ASSERT_OK(Merge(0, "bar", one));
  ASSERT_OK(Merge(2, "bla", one));
  ASSERT_OK(Merge(1, "mirko", two));
  ASSERT_OK(Merge(1, "franjo", one));

  // copy the logs to backup
  std::vector<std::string> logs;
  // An unchecked failure here would make the copy loops below no-ops and the
  // test would pass without exercising anything.
  ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs));
  for (auto& log : logs) {
    if (log != ".." && log != ".") {
      CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
    }
  }

  // recover the DB
  Close();

  // 1. check consistency
  // 2. copy the logs from backup back to WAL dir. If recovery were to replay
  //    these same log files a second time, the merge operands would be
  //    applied twice and the values read below would be wrong.
  // 3. check consistency
  for (int iter = 0; iter < 2; ++iter) {
    // assert consistency
    Open({"default", "cf1", "cf2"});
    ASSERT_EQ(two, Get(0, "foo"));
    ASSERT_EQ(one, Get(0, "bar"));
    ASSERT_EQ(three, Get(1, "mirko"));
    ASSERT_EQ(one, Get(1, "franjo"));
    ASSERT_EQ(one, Get(2, "fodor"));
    ASSERT_EQ(two, Get(2, "bla"));
    Close();

    if (iter == 0) {
      // copy the logs from backup back to wal dir
      for (auto& log : logs) {
        if (log != ".." && log != ".") {
          CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
        }
      }
    }
  }
}
|
||||
|
||||
// Writes keys into three column families, flushes every family, and checks
// that the data reads back correctly after the flush and across further
// reopens of the DB.
TEST(ColumnFamilyTest, FlushTest) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  ASSERT_OK(Put(0, "foo", "v1"));
  ASSERT_OK(Put(0, "bar", "v2"));
  ASSERT_OK(Put(1, "mirko", "v3"));
  ASSERT_OK(Put(0, "foo", "v2"));
  ASSERT_OK(Put(2, "fodor", "v5"));
  for (int i = 0; i < 3; ++i) {
    // A flush that fails must fail the test; do not drop the Status.
    ASSERT_OK(Flush(i));
  }
  Reopen();

  for (int iter = 0; iter <= 2; ++iter) {
    ASSERT_EQ("v2", Get(0, "foo"));
    ASSERT_EQ("v2", Get(0, "bar"));
    ASSERT_EQ("v3", Get(1, "mirko"));
    ASSERT_EQ("v5", Get(2, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
    // reopen after the first two rounds only
    if (iter <= 1) {
      Reopen();
    }
  }
  Close();
}
|
||||
|
||||
// Makes sure that obsolete log files get deleted
|
||||
TEST(ColumnFamilyTest, LogDeletionTest) {
  column_family_options_.write_buffer_size = 100000;  // 100KB
  Open();
  CreateColumnFamilies({"one", "two", "three", "four"});

  // One small write into column family `cf`.
  auto touch = [&](int cf) { PutRandomData(cf, 1, 100); };
  // A large batch of writes into `cf`, then wait for its flush to complete.
  auto fill_and_flush = [&](int cf) {
    PutRandomData(cf, 1000, 100);
    WaitForFlush(cf);
  };

  // Notation used below: every [...] is one log file and lists the column
  // families that have data in it. A number in (parentheses) means that
  // family's data has been flushed, so the log is no longer needed for it.

  // []
  ASSERT_EQ(CountLiveLogFiles(), 0);
  touch(0);
  // [0]
  touch(1);
  // [0, 1]
  fill_and_flush(1);
  // [0, (1)] [1]
  ASSERT_EQ(CountLiveLogFiles(), 2);
  touch(0);
  // [0, (1)] [0, 1]
  ASSERT_EQ(CountLiveLogFiles(), 2);
  touch(2);
  // [0, (1)] [0, 1, 2]
  fill_and_flush(2);
  // [0, (1)] [0, 1, (2)] [2]
  ASSERT_EQ(CountLiveLogFiles(), 3);
  fill_and_flush(2);
  // [0, (1)] [0, 1, (2)] [(2)] [2]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  touch(3);
  // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
  touch(1);
  // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  fill_and_flush(1);
  // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
  ASSERT_EQ(CountLiveLogFiles(), 5);
  fill_and_flush(0);
  // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
  // delete obsolete logs -->
  // [(1), 2, 3] [1, (0)] [0]
  ASSERT_EQ(CountLiveLogFiles(), 3);
  fill_and_flush(0);
  // [(1), 2, 3] [1, (0)], [(0)] [0]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  fill_and_flush(1);
  // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
  ASSERT_EQ(CountLiveLogFiles(), 5);
  fill_and_flush(2);
  // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
  ASSERT_EQ(CountLiveLogFiles(), 6);
  fill_and_flush(3);
  // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
  // delete obsolete logs -->
  // [0, (1)] [1, (2)], [2, (3)] [3]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  Close();
}
|
||||
|
||||
// Makes sure that obsolete log files get deleted
|
||||
TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
|
||||
Open();
|
||||
CreateColumnFamilies({"one", "two", "three"});
|
||||
ColumnFamilyOptions default_cf, one, two, three;
|
||||
// setup options. all column families have max_write_buffer_number setup to 10
|
||||
// "default" -> 100KB memtable, start flushing immediatelly
|
||||
// "one" -> 200KB memtable, start flushing with two immutable memtables
|
||||
// "two" -> 1MB memtable, start flushing with three immutable memtables
|
||||
// "three" -> 90KB memtable, start flushing with four immutable memtables
|
||||
default_cf.write_buffer_size = 100000;
|
||||
default_cf.max_write_buffer_number = 10;
|
||||
default_cf.min_write_buffer_number_to_merge = 1;
|
||||
one.write_buffer_size = 200000;
|
||||
one.max_write_buffer_number = 10;
|
||||
one.min_write_buffer_number_to_merge = 2;
|
||||
two.write_buffer_size = 1000000;
|
||||
two.max_write_buffer_number = 10;
|
||||
two.min_write_buffer_number_to_merge = 3;
|
||||
three.write_buffer_size = 90000;
|
||||
three.max_write_buffer_number = 10;
|
||||
three.min_write_buffer_number_to_merge = 4;
|
||||
|
||||
Reopen({default_cf, one, two, three});
|
||||
|
||||
int micros_wait_for_flush = 10000;
|
||||
PutRandomData(0, 100, 1000);
|
||||
WaitForFlush(0);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 1);
|
||||
PutRandomData(1, 200, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 2);
|
||||
PutRandomData(2, 1000, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 1, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 3);
|
||||
PutRandomData(2, 1000, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 4);
|
||||
PutRandomData(3, 90, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 1});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 5);
|
||||
PutRandomData(3, 90, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 2});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 6);
|
||||
PutRandomData(3, 90, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 3});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 7);
|
||||
PutRandomData(0, 100, 1000);
|
||||
WaitForFlush(0);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 2, 3});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 8);
|
||||
PutRandomData(2, 100, 10000);
|
||||
WaitForFlush(2);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 0, 3});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 9);
|
||||
PutRandomData(3, 90, 1000);
|
||||
WaitForFlush(3);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 10);
|
||||
PutRandomData(3, 90, 1000);
|
||||
env_->SleepForMicroseconds(micros_wait_for_flush);
|
||||
AssertNumberOfImmutableMemtables({0, 1, 0, 1});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 11);
|
||||
PutRandomData(1, 200, 1000);
|
||||
WaitForFlush(1);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 1});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 5);
|
||||
PutRandomData(3, 90*6, 1000);
|
||||
WaitForFlush(3);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 12);
|
||||
PutRandomData(0, 100, 1000);
|
||||
WaitForFlush(0);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 12);
|
||||
PutRandomData(2, 3*100, 10000);
|
||||
WaitForFlush(2);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 12);
|
||||
PutRandomData(1, 2*200, 1000);
|
||||
WaitForFlush(1);
|
||||
AssertNumberOfImmutableMemtables({0, 0, 0, 0});
|
||||
ASSERT_EQ(CountLiveLogFiles(), 7);
|
||||
Close();
|
||||
}
|
||||
|
||||
// Opens three column families with different merge operators (none,
// uint64-add, string-append) and checks that the same Put/Put/Merge sequence
// produces the result that matches each family's operator.
TEST(ColumnFamilyTest, DifferentMergeOperators) {
  Open();
  CreateColumnFamilies({"first", "second"});

  ColumnFamilyOptions default_cf, first, second;
  // default_cf is deliberately left without a merge operator
  first.merge_operator = MergeOperators::CreateUInt64AddOperator();
  second.merge_operator = MergeOperators::CreateStringAppendOperator();
  Reopen({default_cf, first, second});

  // fixed64 encodings of 1, 2 and 3, used as values and merge operands
  std::string one;
  std::string two;
  std::string three;
  PutFixed64(&one, 1);
  PutFixed64(&two, 2);
  PutFixed64(&three, 3);

  // family 0: Merge is rejected as NotSupported, last Put stays visible
  ASSERT_OK(Put(0, "foo", two));
  ASSERT_OK(Put(0, "foo", one));
  Status merge_without_operator = Merge(0, "foo", two);
  ASSERT_TRUE(merge_without_operator.IsNotSupported());
  ASSERT_EQ(Get(0, "foo"), one);

  // family 1: uint64 add, 1 + 2 == 3
  ASSERT_OK(Put(1, "foo", two));
  ASSERT_OK(Put(1, "foo", one));
  ASSERT_OK(Merge(1, "foo", two));
  ASSERT_EQ(Get(1, "foo"), three);

  // family 2: string append, operands joined with ","
  ASSERT_OK(Put(2, "foo", two));
  ASSERT_OK(Put(2, "foo", one));
  ASSERT_OK(Merge(2, "foo", two));
  const std::string appended = one + "," + two;
  ASSERT_EQ(Get(2, "foo"), appended);

  Close();
}
|
||||
|
||||
// Runs three column families with different compaction setups side by side
// ("default": level style with seek compaction enabled, "one": universal
// style, "two": level style with 4 levels), prepares each of them so that one
// more step triggers a compaction, triggers all three, and verifies the
// resulting file layout per family.
TEST(ColumnFamilyTest, DifferentCompactionStyles) {
  Open();
  CreateColumnFamilies({"one", "two"});
  ColumnFamilyOptions default_cf, one, two;
  db_options_.max_open_files = 20;  // only 10 files in file cache
  db_options_.disableDataSync = true;

  // "default": level style, 3 levels, seek compaction left enabled
  default_cf.compaction_style = kCompactionStyleLevel;
  default_cf.num_levels = 3;
  default_cf.write_buffer_size = 64 << 10;  // 64KB
  default_cf.target_file_size_base = 30 << 10;
  default_cf.filter_policy = nullptr;
  default_cf.no_block_cache = true;
  default_cf.source_compaction_factor = 100;
  default_cf.disable_seek_compaction = false;

  // "one": universal style, compaction is triggered with >= 4 files
  one.compaction_style = kCompactionStyleUniversal;
  one.level0_file_num_compaction_trigger = 4;
  one.write_buffer_size = 100000;

  // "two": level style, 4 levels, compaction is triggered with 3 L0 files
  two.compaction_style = kCompactionStyleLevel;
  two.num_levels = 4;
  two.max_mem_compaction_level = 0;
  two.level0_file_num_compaction_trigger = 3;
  two.write_buffer_size = 100000;

  Reopen({default_cf, one, two});

  // ---- SETUP "default": prepare for read (seek) compaction ----
  ASSERT_EQ("", FilesPerLevel(0));
  PutRandomData(0, 1, 4096);
  ASSERT_OK(Flush(0));
  ASSERT_EQ("0,0,1", FilesPerLevel(0));
  // write 8MB
  PutRandomData(0, 2000, 4096);
  ASSERT_OK(Flush(0));
  // clear levels 0 and 1
  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
  // write some new keys into level 0 and 1
  PutRandomData(0, 1024, 512);
  ASSERT_OK(Flush(0));
  WaitForCompaction();
  PutRandomData(0, 10, 512);
  ASSERT_OK(Flush(0));
  // remember how many files each level holds before compaction is triggered
  int files_before_l0 = NumTableFilesAtLevel(0, 0);
  int files_before_l1 = NumTableFilesAtLevel(1, 0);
  int files_before_l2 = NumTableFilesAtLevel(2, 0);
  ASSERT_NE(files_before_l0, 0);
  ASSERT_NE(files_before_l1, 0);
  ASSERT_NE(files_before_l2, 0);

  // ---- SETUP "one" (universal): stop one file short of the trigger ----
  for (int files = 1; files < one.level0_file_num_compaction_trigger;
       ++files) {
    PutRandomData(1, 11, 10000);
    WaitForFlush(1);
    ASSERT_EQ(std::to_string(files), FilesPerLevel(1));
  }

  // ---- SETUP "two" (level, 4 levels): stop one file short as well ----
  for (int files = 1; files < two.level0_file_num_compaction_trigger;
       ++files) {
    PutRandomData(2, 15, 10000);
    WaitForFlush(2);
    ASSERT_EQ(std::to_string(files), FilesPerLevel(2));
  }

  // ---- TRIGGER "default": many reads to provoke a read compaction ----
  for (int key = 0; key < 200000; ++key) {
    Get(0, std::to_string(key));
  }

  // ---- TRIGGER "one" ----
  PutRandomData(1, 12, 10000);

  // ---- TRIGGER "two" ----
  PutRandomData(2, 10, 10000);

  // ---- WAIT for compactions ----
  WaitForCompaction();

  // ---- VERIFY "default": some level must hold fewer files than before,
  // which indicates that a compaction took place ----
  ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < files_before_l0 ||
              NumTableFilesAtLevel(1, 0) < files_before_l1 ||
              NumTableFilesAtLevel(2, 0) < files_before_l2);

  // ---- VERIFY "one" ----
  ASSERT_EQ("1", FilesPerLevel(1));

  // ---- VERIFY "two" ----
  ASSERT_EQ("0,1", FilesPerLevel(2));
  CompactAll(2);
  ASSERT_EQ("0,1", FilesPerLevel(2));

  Close();
}
|
||||
|
||||
namespace {
|
||||
std::string IterStatus(Iterator* iter) {
|
||||
std::string result;
|
||||
if (iter->Valid()) {
|
||||
result = iter->key().ToString() + "->" + iter->value().ToString();
|
||||
} else {
|
||||
result = "(invalid)";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
// Creates one iterator per column family through DB::NewIterators() and checks
// what each of them sees, including after a write that happens once the
// iterators already exist.
TEST(ColumnFamilyTest, NewIteratorsTest) {
  // iter == 0 -- no tailing
  // iter == 1 -- tailing
  for (int iter = 0; iter < 2; ++iter) {
    const bool tailing = (iter == 1);

    Open();
    CreateColumnFamiliesAndReopen({"one", "two"});
    ASSERT_OK(Put(0, "a", "b"));
    ASSERT_OK(Put(1, "b", "a"));
    ASSERT_OK(Put(2, "c", "m"));
    ASSERT_OK(Put(2, "v", "t"));

    ReadOptions options;
    options.tailing = tailing;
    std::vector<Iterator*> iterators;
    ASSERT_OK(db_->NewIterators(options, handles_, &iterators));

    for (Iterator* it : iterators) {
      it->SeekToFirst();
    }
    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
    ASSERT_EQ(IterStatus(iterators[1]), "b->a");
    ASSERT_EQ(IterStatus(iterators[2]), "c->m");

    // written after the iterators were created
    ASSERT_OK(Put(1, "x", "x"));

    for (Iterator* it : iterators) {
      it->Next();
    }

    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
    if (tailing) {
      // the tailing iterator picks up the new key
      ASSERT_EQ(IterStatus(iterators[1]), "x->x");
    } else {
      // the regular iterator does not see the new key
      ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
    }
    ASSERT_EQ(IterStatus(iterators[2]), "v->t");

    // the iterators are raw pointers handed out by NewIterators()
    for (Iterator* it : iterators) {
      delete it;
    }
    Destroy();
  }
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
@ -8,6 +8,7 @@
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/compaction.h"
|
||||
#include "db/column_family.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -29,6 +30,7 @@ Compaction::Compaction(Version* input_version, int level, int out_level,
|
||||
max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
|
||||
input_version_(input_version),
|
||||
number_levels_(input_version_->NumberLevels()),
|
||||
cfd_(input_version_->cfd_),
|
||||
seek_compaction_(seek_compaction),
|
||||
enable_compression_(enable_compression),
|
||||
grandparent_index_(0),
|
||||
@ -42,8 +44,10 @@ Compaction::Compaction(Version* input_version, int level, int out_level,
|
||||
is_manual_compaction_(false),
|
||||
level_ptrs_(std::vector<size_t>(number_levels_)) {
|
||||
|
||||
cfd_->Ref();
|
||||
input_version_->Ref();
|
||||
edit_ = new VersionEdit();
|
||||
edit_->SetColumnFamily(cfd_->GetID());
|
||||
for (int i = 0; i < number_levels_; i++) {
|
||||
level_ptrs_[i] = 0;
|
||||
}
|
||||
@ -54,6 +58,11 @@ Compaction::~Compaction() {
|
||||
if (input_version_ != nullptr) {
|
||||
input_version_->Unref();
|
||||
}
|
||||
if (cfd_ != nullptr) {
|
||||
if (cfd_->Unref()) {
|
||||
delete cfd_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Compaction::IsTrivialMove() const {
|
||||
@ -77,12 +86,11 @@ void Compaction::AddInputDeletions(VersionEdit* edit) {
|
||||
}
|
||||
|
||||
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
|
||||
if (input_version_->vset_->options_->compaction_style ==
|
||||
kCompactionStyleUniversal) {
|
||||
if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
|
||||
return bottommost_level_;
|
||||
}
|
||||
// Maybe use binary search to find right entry instead of linear search?
|
||||
const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
|
||||
const Comparator* user_cmp = cfd_->user_comparator();
|
||||
for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
|
||||
const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
|
||||
for (; level_ptrs_[lvl] < files.size(); ) {
|
||||
@ -103,7 +111,7 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
|
||||
|
||||
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
|
||||
// Scan to find earliest grandparent file that contains key.
|
||||
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
|
||||
const InternalKeyComparator* icmp = &cfd_->internal_comparator();
|
||||
while (grandparent_index_ < grandparents_.size() &&
|
||||
icmp->Compare(internal_key,
|
||||
grandparents_[grandparent_index_]->largest.Encode()) > 0) {
|
||||
@ -141,8 +149,7 @@ void Compaction::MarkFilesBeingCompacted(bool value) {
|
||||
|
||||
// Is this compaction producing files at the bottommost level?
|
||||
void Compaction::SetupBottomMostLevel(bool isManual) {
|
||||
if (input_version_->vset_->options_->compaction_style ==
|
||||
kCompactionStyleUniversal) {
|
||||
if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
|
||||
// If universal compaction style is used and manual
|
||||
// compaction is occurring, then we are guaranteed that
|
||||
// all files will be picked in a single compaction
|
||||
@ -155,8 +162,7 @@ void Compaction::SetupBottomMostLevel(bool isManual) {
|
||||
return;
|
||||
}
|
||||
bottommost_level_ = true;
|
||||
int num_levels = input_version_->vset_->NumberLevels();
|
||||
for (int i = output_level() + 1; i < num_levels; i++) {
|
||||
for (int i = output_level() + 1; i < number_levels_; i++) {
|
||||
if (input_version_->NumLevelFiles(i) > 0) {
|
||||
bottommost_level_ = false;
|
||||
break;
|
||||
@ -169,6 +175,16 @@ void Compaction::ReleaseInputs() {
|
||||
input_version_->Unref();
|
||||
input_version_ = nullptr;
|
||||
}
|
||||
if (cfd_ != nullptr) {
|
||||
if (cfd_->Unref()) {
|
||||
delete cfd_;
|
||||
}
|
||||
cfd_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void Compaction::ReleaseCompactionFiles(Status status) {
|
||||
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
|
||||
}
|
||||
|
||||
void Compaction::ResetNextCompactionIndex() {
|
||||
|
@ -13,6 +13,7 @@
|
||||
namespace rocksdb {
|
||||
|
||||
class Version;
|
||||
class ColumnFamilyData;
|
||||
|
||||
// A Compaction encapsulates information about a compaction.
|
||||
class Compaction {
|
||||
@ -36,6 +37,8 @@ class Compaction {
|
||||
// Returns input version of the compaction
|
||||
Version* input_version() const { return input_version_; }
|
||||
|
||||
ColumnFamilyData* column_family_data() const { return cfd_; }
|
||||
|
||||
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
|
||||
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
|
||||
|
||||
@ -67,6 +70,10 @@ class Compaction {
|
||||
// is successful.
|
||||
void ReleaseInputs();
|
||||
|
||||
// Clear all files to indicate that they are not being compacted
|
||||
// Delete this compaction from the list of running compactions.
|
||||
void ReleaseCompactionFiles(Status status);
|
||||
|
||||
void Summary(char* output, int len);
|
||||
|
||||
// Return the score that was used to pick this compaction run.
|
||||
@ -97,6 +104,7 @@ class Compaction {
|
||||
Version* input_version_;
|
||||
VersionEdit* edit_;
|
||||
int number_levels_;
|
||||
ColumnFamilyData* cfd_;
|
||||
|
||||
bool seek_compaction_;
|
||||
bool enable_compression_;
|
||||
|
@ -277,14 +277,10 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) {
|
||||
Log(options_->info_log,
|
||||
"Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)"
|
||||
"\n",
|
||||
(unsigned long)level,
|
||||
(unsigned long)(c->inputs_[0].size()),
|
||||
(unsigned long)(c->inputs_[1].size()),
|
||||
(unsigned long)inputs0_size,
|
||||
(unsigned long)inputs1_size,
|
||||
(unsigned long)(expanded0.size()),
|
||||
(unsigned long)(expanded1.size()),
|
||||
(unsigned long)expanded0_size,
|
||||
(unsigned long)level, (unsigned long)(c->inputs_[0].size()),
|
||||
(unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
|
||||
(unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
|
||||
(unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
|
||||
(unsigned long)inputs1_size);
|
||||
smallest = new_start;
|
||||
largest = new_limit;
|
||||
@ -587,7 +583,7 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version,
|
||||
options_->level0_file_num_compaction_trigger;
|
||||
if ((c = PickCompactionUniversalReadAmp(
|
||||
version, score, UINT_MAX, num_files, log_buffer)) != nullptr) {
|
||||
Log(options_->info_log, "Universal: compacting for file num\n");
|
||||
LogToBuffer(log_buffer, "Universal: compacting for file num\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -653,7 +649,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
|
||||
FileMetaData* f = nullptr;
|
||||
bool done = false;
|
||||
int start_index = 0;
|
||||
unsigned int candidate_count;
|
||||
unsigned int candidate_count = 0;
|
||||
assert(file_by_time.size() == version->files_[level].size());
|
||||
|
||||
unsigned int max_files_to_compact = std::min(max_merge_width,
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include "db/compaction.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/env.h"
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
@ -118,6 +119,7 @@ class CompactionPicker {
|
||||
std::unique_ptr<uint64_t[]> level_max_bytes_;
|
||||
|
||||
const Options* const options_;
|
||||
|
||||
private:
|
||||
int num_levels_;
|
||||
|
||||
|
633
db/db_bench.cc
633
db/db_bench.cc
@ -42,7 +42,6 @@
|
||||
|
||||
|
||||
DEFINE_string(benchmarks,
|
||||
|
||||
"fillseq,"
|
||||
"fillsync,"
|
||||
"fillrandom,"
|
||||
@ -53,6 +52,7 @@ DEFINE_string(benchmarks,
|
||||
"readreverse,"
|
||||
"compact,"
|
||||
"readrandom,"
|
||||
"multireadrandom,"
|
||||
"readseq,"
|
||||
"readtocache,"
|
||||
"readreverse,"
|
||||
@ -64,8 +64,7 @@ DEFINE_string(benchmarks,
|
||||
"crc32c,"
|
||||
"compress,"
|
||||
"uncompress,"
|
||||
"acquireload,"
|
||||
"fillfromstdin,",
|
||||
"acquireload,",
|
||||
|
||||
"Comma-separated list of operations to run in the specified order"
|
||||
"Actual benchmarks:\n"
|
||||
@ -129,16 +128,8 @@ DEFINE_int64(merge_keys, -1,
|
||||
DEFINE_int64(reads, -1, "Number of read operations to do. "
|
||||
"If negative, do FLAGS_num reads.");
|
||||
|
||||
DEFINE_int64(read_range, 1, "When ==1 reads use ::Get, when >1 reads use"
|
||||
" an iterator");
|
||||
|
||||
DEFINE_bool(use_prefix_blooms, false, "Whether to place prefixes in blooms");
|
||||
|
||||
DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
|
||||
|
||||
DEFINE_bool(use_prefix_api, false, "Whether to set ReadOptions.prefix for"
|
||||
" prefixscanrandom. If true, use_prefix_blooms must also be true.");
|
||||
|
||||
DEFINE_int64(seed, 0, "Seed base for random number generators. "
|
||||
"When 0 it is deterministic.");
|
||||
|
||||
@ -278,12 +269,6 @@ DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
|
||||
|
||||
DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
|
||||
|
||||
DEFINE_bool(use_snapshot, false, "If true, create a snapshot per query when"
|
||||
" randomread benchmark is used");
|
||||
|
||||
DEFINE_bool(get_approx, false, "If true, call GetApproximateSizes per query"
|
||||
" when read_range is > 1 and randomread benchmark is used");
|
||||
|
||||
DEFINE_int32(num_levels, 7, "The total number of levels");
|
||||
|
||||
DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1");
|
||||
@ -461,20 +446,9 @@ DEFINE_string(compaction_fadvice, "NORMAL",
|
||||
static auto FLAGS_compaction_fadvice_e =
|
||||
rocksdb::Options().access_hint_on_compaction_start;
|
||||
|
||||
DEFINE_bool(use_multiget, false,
|
||||
"Use multiget to access a series of keys instead of get");
|
||||
|
||||
DEFINE_bool(use_tailing_iterator, false,
|
||||
"Use tailing iterator to access a series of keys instead of get");
|
||||
|
||||
DEFINE_int64(keys_per_multiget, 90, "If use_multiget is true, determines number"
|
||||
" of keys to group per call Arbitrary default is good because it"
|
||||
" agrees with readwritepercent");
|
||||
|
||||
// TODO: Apply this flag to generic Get calls too. Currently only with Multiget
|
||||
DEFINE_bool(warn_missing_keys, true, "Print a message to user when a key is"
|
||||
" missing in a Get/MultiGet call");
|
||||
|
||||
DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
|
||||
"Use adaptive mutex");
|
||||
|
||||
@ -798,7 +772,7 @@ class Duration {
|
||||
start_at_ = FLAGS_env->NowMicros();
|
||||
}
|
||||
|
||||
bool Done(int increment) {
|
||||
bool Done(int64_t increment) {
|
||||
if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
|
||||
ops_ += increment;
|
||||
|
||||
@ -834,13 +808,12 @@ class Benchmark {
|
||||
int key_size_;
|
||||
int prefix_size_;
|
||||
int64_t keys_per_prefix_;
|
||||
int entries_per_batch_;
|
||||
int64_t entries_per_batch_;
|
||||
WriteOptions write_options_;
|
||||
int64_t reads_;
|
||||
int64_t writes_;
|
||||
int64_t readwrites_;
|
||||
int64_t merge_keys_;
|
||||
int heap_counter_;
|
||||
void PrintHeader() {
|
||||
PrintEnvironment();
|
||||
fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size);
|
||||
@ -1037,8 +1010,7 @@ class Benchmark {
|
||||
readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num :
|
||||
((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)
|
||||
),
|
||||
merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
|
||||
heap_counter_(0) {
|
||||
merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) {
|
||||
if (FLAGS_prefix_size > FLAGS_key_size) {
|
||||
fprintf(stderr, "prefix size is larger than key size");
|
||||
exit(1);
|
||||
@ -1062,6 +1034,10 @@ class Benchmark {
|
||||
delete prefix_extractor_;
|
||||
}
|
||||
|
||||
Slice AllocateKey() {
|
||||
return Slice(new char[key_size_], key_size_);
|
||||
}
|
||||
|
||||
// Generate key according to the given specification and random number.
|
||||
// The resulting key will have the following format (if keys_per_prefix_
|
||||
// is positive), extra trailing bytes are either cut off or padded with '0'.
|
||||
@ -1074,10 +1050,8 @@ class Benchmark {
|
||||
// ----------------------------
|
||||
// | key 00000 |
|
||||
// ----------------------------
|
||||
std::string GenerateKeyFromInt(uint64_t v, int64_t num_keys) {
|
||||
std::string key;
|
||||
key.resize(key_size_);
|
||||
char* start = &(key[0]);
|
||||
void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
|
||||
char* start = const_cast<char*>(key->data());
|
||||
char* pos = start;
|
||||
if (keys_per_prefix_ > 0) {
|
||||
int64_t num_prefix = num_keys / keys_per_prefix_;
|
||||
@ -1109,8 +1083,6 @@ class Benchmark {
|
||||
if (key_size_ > pos - start) {
|
||||
memset(pos, '0', key_size_ - (pos - start));
|
||||
}
|
||||
|
||||
return key;
|
||||
}
|
||||
|
||||
void Run() {
|
||||
@ -1155,15 +1127,12 @@ class Benchmark {
|
||||
} else if (name == Slice("fillrandom")) {
|
||||
fresh_db = true;
|
||||
method = &Benchmark::WriteRandom;
|
||||
} else if (name == Slice("fillfromstdin")) {
|
||||
fresh_db = true;
|
||||
method = &Benchmark::WriteFromStdin;
|
||||
} else if (name == Slice("filluniquerandom")) {
|
||||
fresh_db = true;
|
||||
if (num_threads > 1) {
|
||||
fprintf(stderr, "filluniquerandom multithreaded not supported"
|
||||
" set --threads=1");
|
||||
exit(1);
|
||||
", use 1 thread");
|
||||
num_threads = 1;
|
||||
}
|
||||
method = &Benchmark::WriteUniqueRandom;
|
||||
} else if (name == Slice("overwrite")) {
|
||||
@ -1189,19 +1158,18 @@ class Benchmark {
|
||||
method = &Benchmark::ReadReverse;
|
||||
} else if (name == Slice("readrandom")) {
|
||||
method = &Benchmark::ReadRandom;
|
||||
} else if (name == Slice("multireadrandom")) {
|
||||
method = &Benchmark::MultiReadRandom;
|
||||
} else if (name == Slice("readmissing")) {
|
||||
method = &Benchmark::ReadMissing;
|
||||
++key_size_;
|
||||
method = &Benchmark::ReadRandom;
|
||||
} else if (name == Slice("newiterator")) {
|
||||
method = &Benchmark::IteratorCreation;
|
||||
} else if (name == Slice("seekrandom")) {
|
||||
method = &Benchmark::SeekRandom;
|
||||
} else if (name == Slice("readhot")) {
|
||||
method = &Benchmark::ReadHot;
|
||||
} else if (name == Slice("readrandomsmall")) {
|
||||
reads_ /= 1000;
|
||||
method = &Benchmark::ReadRandom;
|
||||
} else if (name == Slice("prefixscanrandom")) {
|
||||
method = &Benchmark::PrefixScanRandom;
|
||||
} else if (name == Slice("deleteseq")) {
|
||||
method = &Benchmark::DeleteSeq;
|
||||
} else if (name == Slice("deleterandom")) {
|
||||
@ -1215,10 +1183,9 @@ class Benchmark {
|
||||
if (FLAGS_merge_operator.empty()) {
|
||||
fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
|
||||
name.ToString().c_str());
|
||||
method = nullptr;
|
||||
} else {
|
||||
method = &Benchmark::ReadRandomMergeRandom;
|
||||
exit(1);
|
||||
}
|
||||
method = &Benchmark::ReadRandomMergeRandom;
|
||||
} else if (name == Slice("updaterandom")) {
|
||||
method = &Benchmark::UpdateRandom;
|
||||
} else if (name == Slice("appendrandom")) {
|
||||
@ -1227,10 +1194,9 @@ class Benchmark {
|
||||
if (FLAGS_merge_operator.empty()) {
|
||||
fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
|
||||
name.ToString().c_str());
|
||||
method = nullptr;
|
||||
} else {
|
||||
method = &Benchmark::MergeRandom;
|
||||
exit(1);
|
||||
}
|
||||
method = &Benchmark::MergeRandom;
|
||||
} else if (name == Slice("randomwithverify")) {
|
||||
method = &Benchmark::RandomWithVerify;
|
||||
} else if (name == Slice("compact")) {
|
||||
@ -1243,8 +1209,6 @@ class Benchmark {
|
||||
method = &Benchmark::Compress;
|
||||
} else if (name == Slice("uncompress")) {
|
||||
method = &Benchmark::Uncompress;
|
||||
} else if (name == Slice("heapprofile")) {
|
||||
HeapProfile();
|
||||
} else if (name == Slice("stats")) {
|
||||
PrintStats("rocksdb.stats");
|
||||
} else if (name == Slice("levelstats")) {
|
||||
@ -1254,6 +1218,7 @@ class Benchmark {
|
||||
} else {
|
||||
if (name != Slice()) { // No error message for empty name
|
||||
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1540,7 +1505,7 @@ class Benchmark {
|
||||
options.compaction_style = FLAGS_compaction_style_e;
|
||||
options.block_size = FLAGS_block_size;
|
||||
options.filter_policy = filter_policy_;
|
||||
if (FLAGS_use_plain_table || FLAGS_use_prefix_blooms) {
|
||||
if (FLAGS_use_plain_table) {
|
||||
options.prefix_extractor.reset(
|
||||
NewFixedPrefixTransform(FLAGS_prefix_size));
|
||||
}
|
||||
@ -1715,54 +1680,6 @@ class Benchmark {
|
||||
DoWrite(thread, UNIQUE_RANDOM);
|
||||
}
|
||||
|
||||
void writeOrFail(WriteBatch& batch) {
|
||||
Status s = db_->Write(write_options_, &batch);
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
void WriteFromStdin(ThreadState* thread) {
|
||||
size_t count = 0;
|
||||
WriteBatch batch;
|
||||
const size_t bufferLen = 32 << 20;
|
||||
unique_ptr<char[]> line = unique_ptr<char[]>(new char[bufferLen]);
|
||||
char* linep = line.get();
|
||||
const int batchSize = 100 << 10;
|
||||
const char columnSeparator = '\t';
|
||||
const char lineSeparator = '\n';
|
||||
|
||||
while (fgets(linep, bufferLen, stdin) != nullptr) {
|
||||
++count;
|
||||
char* tab = std::find(linep, linep + bufferLen, columnSeparator);
|
||||
if (tab == linep + bufferLen) {
|
||||
fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count);
|
||||
continue;
|
||||
}
|
||||
Slice key(linep, tab - linep);
|
||||
tab++;
|
||||
char* endLine = std::find(tab, linep + bufferLen, lineSeparator);
|
||||
if (endLine == linep + bufferLen) {
|
||||
fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count);
|
||||
continue;
|
||||
}
|
||||
Slice value(tab, endLine - tab);
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
thread->stats.AddBytes(endLine - linep - 1);
|
||||
|
||||
if (batch.Count() < batchSize) {
|
||||
batch.Put(key, value);
|
||||
continue;
|
||||
}
|
||||
writeOrFail(batch);
|
||||
batch.Clear();
|
||||
}
|
||||
if (batch.Count() > 0) {
|
||||
writeOrFail(batch);
|
||||
}
|
||||
}
|
||||
|
||||
void DoWrite(ThreadState* thread, WriteMode write_mode) {
|
||||
const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
|
||||
const int64_t num_ops = writes_ == 0 ? num_ : writes_;
|
||||
@ -1783,10 +1700,13 @@ class Benchmark {
|
||||
WriteBatch batch;
|
||||
Status s;
|
||||
int64_t bytes = 0;
|
||||
int i = 0;
|
||||
int64_t i = 0;
|
||||
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
while (!duration.Done(entries_per_batch_)) {
|
||||
batch.Clear();
|
||||
for (int j = 0; j < entries_per_batch_; j++) {
|
||||
for (int64_t j = 0; j < entries_per_batch_; j++) {
|
||||
int64_t k = 0;
|
||||
switch(write_mode) {
|
||||
case SEQUENTIAL:
|
||||
@ -1825,9 +1745,9 @@ class Benchmark {
|
||||
break;
|
||||
}
|
||||
};
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
GenerateKeyFromInt(k, FLAGS_num, &key);
|
||||
batch.Put(key, gen.Generate(value_size_));
|
||||
bytes += value_size_ + key.size();
|
||||
bytes += value_size_ + key_size_;
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
s = db_->Write(write_options_, &batch);
|
||||
@ -1866,135 +1786,22 @@ class Benchmark {
|
||||
thread->stats.AddBytes(bytes);
|
||||
}
|
||||
|
||||
// Calls MultiGet over a list of keys from a random distribution.
|
||||
// Returns the total number of keys found.
|
||||
long MultiGetRandom(ReadOptions& options, int num_keys,
|
||||
Random64* rand, int64_t range, const char* suffix) {
|
||||
assert(num_keys > 0);
|
||||
std::vector<Slice> keys(num_keys);
|
||||
std::vector<std::string> values(num_keys);
|
||||
std::vector<std::string> gen_keys(num_keys);
|
||||
|
||||
int i;
|
||||
int64_t k;
|
||||
|
||||
// Fill the keys vector
|
||||
for(i=0; i<num_keys; ++i) {
|
||||
k = rand->Next() % range;
|
||||
gen_keys[i] = GenerateKeyFromInt(k, range) + suffix;
|
||||
keys[i] = gen_keys[i];
|
||||
}
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
options.snapshot = db_->GetSnapshot();
|
||||
}
|
||||
|
||||
// Apply the operation
|
||||
std::vector<Status> statuses = db_->MultiGet(options, keys, &values);
|
||||
assert((long)statuses.size() == num_keys);
|
||||
assert((long)keys.size() == num_keys); // Should always be the case.
|
||||
assert((long)values.size() == num_keys);
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
db_->ReleaseSnapshot(options.snapshot);
|
||||
options.snapshot = nullptr;
|
||||
}
|
||||
|
||||
// Count number found
|
||||
long found = 0;
|
||||
for(i=0; i<num_keys; ++i) {
|
||||
if (statuses[i].ok()){
|
||||
++found;
|
||||
} else if (FLAGS_warn_missing_keys == true) {
|
||||
// Key not found, or error.
|
||||
fprintf(stderr, "get error: %s\n", statuses[i].ToString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
void ReadRandom(ThreadState* thread) {
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
Duration duration(FLAGS_duration, reads_);
|
||||
|
||||
int64_t found = 0;
|
||||
int64_t read = 0;
|
||||
if (FLAGS_use_multiget) { // MultiGet
|
||||
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group
|
||||
long keys_left = reads_;
|
||||
int64_t found = 0;
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
std::string value;
|
||||
|
||||
// Recalculate number of keys per group, and call MultiGet until done
|
||||
long num_keys;
|
||||
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
|
||||
read += num_keys;
|
||||
found +=
|
||||
MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, "");
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
keys_left -= num_keys;
|
||||
Duration duration(FLAGS_duration, reads_);
|
||||
while (!duration.Done(1)) {
|
||||
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
|
||||
read++;
|
||||
if (db_->Get(options, key, &value).ok()) {
|
||||
found++;
|
||||
}
|
||||
} else if (FLAGS_use_tailing_iterator) { // use tailing iterator for gets
|
||||
options.tailing = true;
|
||||
Iterator* iter = db_->NewIterator(options);
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
|
||||
iter->Seek(key);
|
||||
read++;
|
||||
if (iter->Valid() && iter->key().compare(Slice(key)) == 0) {
|
||||
found++;
|
||||
}
|
||||
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
delete iter;
|
||||
} else { // Regular case. Do one "get" at a time Get
|
||||
options.tailing = true;
|
||||
options.prefix_seek = (FLAGS_prefix_size == 0);
|
||||
Iterator* iter = db_->NewIterator(options);
|
||||
std::string value;
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
if (FLAGS_use_snapshot) {
|
||||
options.snapshot = db_->GetSnapshot();
|
||||
}
|
||||
|
||||
if (FLAGS_read_range < 2) {
|
||||
read++;
|
||||
if (db_->Get(options, key, &value).ok()) {
|
||||
found++;
|
||||
}
|
||||
} else {
|
||||
int count = 1;
|
||||
|
||||
if (FLAGS_get_approx) {
|
||||
std::string key2 =
|
||||
GenerateKeyFromInt(k + static_cast<int>(FLAGS_read_range),
|
||||
FLAGS_num + FLAGS_read_range);
|
||||
Range range(key, key2);
|
||||
uint64_t sizes;
|
||||
db_->GetApproximateSizes(&range, 1, &sizes);
|
||||
}
|
||||
|
||||
read += FLAGS_read_range;
|
||||
for (iter->Seek(key);
|
||||
iter->Valid() && count <= FLAGS_read_range;
|
||||
++count, iter->Next()) {
|
||||
found++;
|
||||
}
|
||||
}
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
db_->ReleaseSnapshot(options.snapshot);
|
||||
options.snapshot = nullptr;
|
||||
}
|
||||
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
|
||||
delete iter;
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
|
||||
char msg[100];
|
||||
@ -2008,113 +1815,41 @@ class Benchmark {
|
||||
}
|
||||
}
|
||||
|
||||
void PrefixScanRandom(ThreadState* thread) {
|
||||
if (FLAGS_use_prefix_api) {
|
||||
assert(FLAGS_use_prefix_blooms);
|
||||
assert(FLAGS_bloom_bits >= 1);
|
||||
// Calls MultiGet over a list of keys from a random distribution.
|
||||
// Returns the total number of keys found.
|
||||
void MultiReadRandom(ThreadState* thread) {
|
||||
int64_t read = 0;
|
||||
int64_t found = 0;
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
std::vector<Slice> keys(entries_per_batch_);
|
||||
std::vector<std::string> values(entries_per_batch_);
|
||||
while (keys.size() < entries_per_batch_) {
|
||||
keys.push_back(AllocateKey());
|
||||
}
|
||||
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
Duration duration(FLAGS_duration, reads_);
|
||||
|
||||
int64_t found = 0;
|
||||
|
||||
while (!duration.Done(1)) {
|
||||
std::string value;
|
||||
const int k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
Slice skey(key);
|
||||
Slice prefix = prefix_extractor_->Transform(skey);
|
||||
options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr;
|
||||
|
||||
Iterator* iter = db_->NewIterator(options);
|
||||
for (iter->Seek(skey);
|
||||
iter->Valid() && iter->key().starts_with(prefix);
|
||||
iter->Next()) {
|
||||
found++;
|
||||
for (int64_t i = 0; i < entries_per_batch_; ++i) {
|
||||
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num,
|
||||
FLAGS_num, &keys[i]);
|
||||
}
|
||||
delete iter;
|
||||
std::vector<Status> statuses = db_->MultiGet(options, keys, &values);
|
||||
assert(statuses.size() == entries_per_batch_);
|
||||
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
|
||||
char msg[100];
|
||||
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
|
||||
found, reads_);
|
||||
thread->stats.AddMessage(msg);
|
||||
}
|
||||
|
||||
void ReadMissing(ThreadState* thread) {
|
||||
FLAGS_warn_missing_keys = false; // Never warn about missing keys
|
||||
|
||||
Duration duration(FLAGS_duration, reads_);
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
|
||||
if (FLAGS_use_multiget) {
|
||||
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group
|
||||
long keys_left = reads_;
|
||||
|
||||
// Recalculate number of keys per group, and call MultiGet until done
|
||||
long num_keys;
|
||||
long found;
|
||||
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
|
||||
found =
|
||||
MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, ".");
|
||||
|
||||
// We should not find any key since the key we try to get has a
|
||||
// different suffix
|
||||
if (found) {
|
||||
assert(false);
|
||||
}
|
||||
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
keys_left -= num_keys;
|
||||
}
|
||||
} else { // Regular case (not MultiGet)
|
||||
std::string value;
|
||||
Status s;
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num) + ".";
|
||||
s = db_->Get(options, key, &value);
|
||||
assert(!s.ok() && s.IsNotFound());
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ReadHot(ThreadState* thread) {
|
||||
Duration duration(FLAGS_duration, reads_);
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
const int64_t range = (FLAGS_num + 99) / 100;
|
||||
int64_t found = 0;
|
||||
|
||||
if (FLAGS_use_multiget) {
|
||||
const int64_t kpg = FLAGS_keys_per_multiget; // keys per multiget group
|
||||
int64_t keys_left = reads_;
|
||||
|
||||
// Recalculate number of keys per group, and call MultiGet until done
|
||||
long num_keys;
|
||||
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
|
||||
found += MultiGetRandom(options, num_keys, &thread->rand, range, "");
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
keys_left -= num_keys;
|
||||
}
|
||||
} else {
|
||||
std::string value;
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % range;
|
||||
std::string key = GenerateKeyFromInt(k, range);
|
||||
if (db_->Get(options, key, &value).ok()) {
|
||||
read += entries_per_batch_;
|
||||
for (int64_t i = 0; i < entries_per_batch_; ++i) {
|
||||
if (statuses[i].ok()) {
|
||||
++found;
|
||||
}
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
}
|
||||
for (auto& k : keys) {
|
||||
delete k.data();
|
||||
}
|
||||
|
||||
char msg[100];
|
||||
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
|
||||
found, reads_);
|
||||
found, read);
|
||||
thread->stats.AddMessage(msg);
|
||||
}
|
||||
|
||||
@ -2129,44 +1864,53 @@ class Benchmark {
|
||||
}
|
||||
|
||||
void SeekRandom(ThreadState* thread) {
|
||||
Duration duration(FLAGS_duration, reads_);
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
std::string value;
|
||||
int64_t read = 0;
|
||||
int64_t found = 0;
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
options.tailing = FLAGS_use_tailing_iterator;
|
||||
auto* iter = db_->NewIterator(options);
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
|
||||
Duration duration(FLAGS_duration, reads_);
|
||||
while (!duration.Done(1)) {
|
||||
Iterator* iter = db_->NewIterator(options);
|
||||
const int64_t k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
|
||||
iter->Seek(key);
|
||||
if (iter->Valid() && iter->key() == Slice(key)) found++;
|
||||
delete iter;
|
||||
read++;
|
||||
if (iter->Valid() && iter->key().compare(key) == 0) {
|
||||
found++;
|
||||
}
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
delete iter;
|
||||
|
||||
char msg[100];
|
||||
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
|
||||
found, num_);
|
||||
found, read);
|
||||
thread->stats.AddMessage(msg);
|
||||
}
|
||||
|
||||
void DoDelete(ThreadState* thread, bool seq) {
|
||||
WriteBatch batch;
|
||||
Status s;
|
||||
Duration duration(seq ? 0 : FLAGS_duration, num_);
|
||||
long i = 0;
|
||||
int64_t i = 0;
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
|
||||
while (!duration.Done(entries_per_batch_)) {
|
||||
batch.Clear();
|
||||
for (int j = 0; j < entries_per_batch_; j++) {
|
||||
const int64_t k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
for (int64_t j = 0; j < entries_per_batch_; ++j) {
|
||||
const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
|
||||
GenerateKeyFromInt(k, FLAGS_num, &key);
|
||||
batch.Delete(key);
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
s = db_->Write(write_options_, &batch);
|
||||
auto s = db_->Write(write_options_, &batch);
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "del error: %s\n", s.ToString().c_str());
|
||||
exit(1);
|
||||
}
|
||||
++i;
|
||||
i += entries_per_batch_;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2197,6 +1941,9 @@ class Benchmark {
|
||||
// Don't merge stats from this thread with the readers.
|
||||
thread->stats.SetExcludeFromMerge();
|
||||
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
|
||||
while (true) {
|
||||
{
|
||||
MutexLock l(&thread->shared->mu);
|
||||
@ -2206,8 +1953,7 @@ class Benchmark {
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
|
||||
Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
|
||||
@ -2235,7 +1981,7 @@ class Benchmark {
|
||||
// Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
|
||||
// in DB atomically i.e in a single batch. Also refer GetMany.
|
||||
Status PutMany(const WriteOptions& writeoptions,
|
||||
const Slice& key, const Slice& value) {
|
||||
const Slice& key, const Slice& value) {
|
||||
std::string suffixes[3] = {"2", "1", "0"};
|
||||
std::string keys[3];
|
||||
|
||||
@ -2273,7 +2019,7 @@ class Benchmark {
|
||||
// in the same snapshot, and verifies that all the values are identical.
|
||||
// ASSUMES that PutMany was used to put (K, V) into the DB.
|
||||
Status GetMany(const ReadOptions& readoptions,
|
||||
const Slice& key, std::string* value) {
|
||||
const Slice& key, std::string* value) {
|
||||
std::string suffixes[3] = {"0", "1", "2"};
|
||||
std::string keys[3];
|
||||
Slice key_slices[3];
|
||||
@ -2328,16 +2074,19 @@ class Benchmark {
|
||||
int64_t puts_done = 0;
|
||||
int64_t deletes_done = 0;
|
||||
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
|
||||
// the number of iterations is the larger of read_ or write_
|
||||
for (int64_t i = 0; i < readwrites_; i++) {
|
||||
const int64_t k = thread->rand.Next() % (FLAGS_numdistinct);
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_numdistinct);
|
||||
if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
|
||||
// one batch completed, reinitialize for next batch
|
||||
get_weight = FLAGS_readwritepercent;
|
||||
delete_weight = FLAGS_deletepercent;
|
||||
put_weight = 100 - get_weight - delete_weight;
|
||||
}
|
||||
GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
|
||||
FLAGS_numdistinct, &key);
|
||||
if (get_weight > 0) {
|
||||
// do all the gets first
|
||||
Status s = GetMany(options, key, &value);
|
||||
@ -2383,12 +2132,6 @@ class Benchmark {
|
||||
// This is different from ReadWhileWriting because it does not use
|
||||
// an extra thread.
|
||||
void ReadRandomWriteRandom(ThreadState* thread) {
|
||||
if (FLAGS_use_multiget){
|
||||
// Separate function for multiget (for ease of reading)
|
||||
ReadRandomWriteRandomMultiGet(thread);
|
||||
return;
|
||||
}
|
||||
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
RandomGenerator gen;
|
||||
std::string value;
|
||||
@ -2399,28 +2142,18 @@ class Benchmark {
|
||||
int64_t writes_done = 0;
|
||||
Duration duration(FLAGS_duration, readwrites_);
|
||||
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
|
||||
// the number of iterations is the larger of read_ or write_
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
|
||||
if (get_weight == 0 && put_weight == 0) {
|
||||
// one batch completed, reinitialize for next batch
|
||||
get_weight = FLAGS_readwritepercent;
|
||||
put_weight = 100 - get_weight;
|
||||
}
|
||||
if (get_weight > 0) {
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
options.snapshot = db_->GetSnapshot();
|
||||
}
|
||||
|
||||
if (FLAGS_get_approx) {
|
||||
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
|
||||
Range range(key, key2);
|
||||
uint64_t sizes;
|
||||
db_->GetApproximateSizes(&range, 1, &sizes);
|
||||
}
|
||||
|
||||
// do all the gets first
|
||||
Status s = db_->Get(options, key, &value);
|
||||
if (!s.ok() && !s.IsNotFound()) {
|
||||
@ -2430,14 +2163,8 @@ class Benchmark {
|
||||
} else if (!s.IsNotFound()) {
|
||||
found++;
|
||||
}
|
||||
|
||||
get_weight--;
|
||||
reads_done++;
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
db_->ReleaseSnapshot(options.snapshot);
|
||||
}
|
||||
|
||||
} else if (put_weight > 0) {
|
||||
// then do all the corresponding number of puts
|
||||
// for all the gets we have done earlier
|
||||
@ -2458,82 +2185,6 @@ class Benchmark {
|
||||
thread->stats.AddMessage(msg);
|
||||
}
|
||||
|
||||
// ReadRandomWriteRandom (with multiget)
|
||||
// Does FLAGS_keys_per_multiget reads (per multiget), followed by some puts.
|
||||
// FLAGS_readwritepercent will specify the ratio of gets to puts.
|
||||
// e.g.: If FLAGS_keys_per_multiget == 100 and FLAGS_readwritepercent == 75
|
||||
// Then each block will do 100 multigets and 33 puts
|
||||
// So there are 133 operations in-total: 100 of them (75%) are gets, and 33
|
||||
// of them (25%) are puts.
|
||||
void ReadRandomWriteRandomMultiGet(ThreadState* thread) {
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
RandomGenerator gen;
|
||||
|
||||
// For multiget
|
||||
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group
|
||||
|
||||
long keys_left = readwrites_; // number of keys still left to read
|
||||
long num_keys; // number of keys to read in current group
|
||||
long num_put_keys; // number of keys to put in current group
|
||||
|
||||
int64_t found = 0;
|
||||
int64_t reads_done = 0;
|
||||
int64_t writes_done = 0;
|
||||
int64_t multigets_done = 0;
|
||||
|
||||
// the number of iterations is the larger of read_ or write_
|
||||
Duration duration(FLAGS_duration, readwrites_);
|
||||
while(true) {
|
||||
// Read num_keys keys, then write num_put_keys keys.
|
||||
// The ratio of num_keys to num_put_keys is always FLAGS_readwritepercent
|
||||
// And num_keys is set to be FLAGS_keys_per_multiget (kpg)
|
||||
// num_put_keys is calculated accordingly (to maintain the ratio)
|
||||
// Note: On the final iteration, num_keys and num_put_keys will be smaller
|
||||
num_keys = std::min(keys_left*(FLAGS_readwritepercent + 99)/100, kpg);
|
||||
num_put_keys = num_keys * (100-FLAGS_readwritepercent)
|
||||
/ FLAGS_readwritepercent;
|
||||
|
||||
// This will break the loop when duration is complete
|
||||
if (duration.Done(num_keys + num_put_keys)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// A quick check to make sure our formula doesn't break on edge cases
|
||||
assert(num_keys >= 1);
|
||||
assert(num_keys + num_put_keys <= keys_left);
|
||||
|
||||
// Apply the MultiGet operations
|
||||
found += MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, "");
|
||||
++multigets_done;
|
||||
reads_done+=num_keys;
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
|
||||
// Now do the puts
|
||||
int i;
|
||||
int64_t k;
|
||||
for(i=0; i<num_put_keys; ++i) {
|
||||
k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
Status s = db_->Put(write_options_, key,
|
||||
gen.Generate(value_size_));
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
|
||||
exit(1);
|
||||
}
|
||||
writes_done++;
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
|
||||
keys_left -= (num_keys + num_put_keys);
|
||||
}
|
||||
char msg[100];
|
||||
snprintf(msg, sizeof(msg),
|
||||
"( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64 \
|
||||
" multiget_ops:%" PRIu64 " found:%" PRIu64 ")",
|
||||
reads_done, writes_done, readwrites_, multigets_done, found);
|
||||
thread->stats.AddMessage(msg);
|
||||
}
|
||||
|
||||
//
|
||||
// Read-modify-write for random keys
|
||||
void UpdateRandom(ThreadState* thread) {
|
||||
@ -2543,30 +2194,16 @@ class Benchmark {
|
||||
int64_t found = 0;
|
||||
Duration duration(FLAGS_duration, readwrites_);
|
||||
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
// the number of iterations is the larger of read_ or write_
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
options.snapshot = db_->GetSnapshot();
|
||||
}
|
||||
|
||||
if (FLAGS_get_approx) {
|
||||
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
|
||||
Range range(key, key2);
|
||||
uint64_t sizes;
|
||||
db_->GetApproximateSizes(&range, 1, &sizes);
|
||||
}
|
||||
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
|
||||
|
||||
if (db_->Get(options, key, &value).ok()) {
|
||||
found++;
|
||||
}
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
db_->ReleaseSnapshot(options.snapshot);
|
||||
}
|
||||
|
||||
Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
|
||||
@ -2589,22 +2226,12 @@ class Benchmark {
|
||||
std::string value;
|
||||
int64_t found = 0;
|
||||
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
// The number of iterations is the larger of read_ or write_
|
||||
Duration duration(FLAGS_duration, readwrites_);
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % FLAGS_num;
|
||||
std::string key = GenerateKeyFromInt(k, FLAGS_num);
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
options.snapshot = db_->GetSnapshot();
|
||||
}
|
||||
|
||||
if (FLAGS_get_approx) {
|
||||
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
|
||||
Range range(key, key2);
|
||||
uint64_t sizes;
|
||||
db_->GetApproximateSizes(&range, 1, &sizes);
|
||||
}
|
||||
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
|
||||
|
||||
// Get the existing value
|
||||
if (db_->Get(options, key, &value).ok()) {
|
||||
@ -2614,10 +2241,6 @@ class Benchmark {
|
||||
value.clear();
|
||||
}
|
||||
|
||||
if (FLAGS_use_snapshot) {
|
||||
db_->ReleaseSnapshot(options.snapshot);
|
||||
}
|
||||
|
||||
// Update the value (by appending data)
|
||||
Slice operand = gen.Generate(value_size_);
|
||||
if (value.size() > 0) {
|
||||
@ -2634,6 +2257,7 @@ class Benchmark {
|
||||
}
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
|
||||
char msg[100];
|
||||
snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
|
||||
readwrites_, found);
|
||||
@ -2653,11 +2277,12 @@ class Benchmark {
|
||||
void MergeRandom(ThreadState* thread) {
|
||||
RandomGenerator gen;
|
||||
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
// The number of iterations is the larger of read_ or write_
|
||||
Duration duration(FLAGS_duration, readwrites_);
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % merge_keys_;
|
||||
std::string key = GenerateKeyFromInt(k, merge_keys_);
|
||||
GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
|
||||
|
||||
Status s = db_->Merge(write_options_, key, gen.Generate(value_size_));
|
||||
|
||||
@ -2690,12 +2315,12 @@ class Benchmark {
|
||||
int64_t num_merges = 0;
|
||||
size_t max_length = 0;
|
||||
|
||||
Slice key = AllocateKey();
|
||||
std::unique_ptr<const char[]> key_guard(key.data());
|
||||
// the number of iterations is the larger of read_ or write_
|
||||
Duration duration(FLAGS_duration, readwrites_);
|
||||
|
||||
while (!duration.Done(1)) {
|
||||
const int64_t k = thread->rand.Next() % merge_keys_;
|
||||
std::string key = GenerateKeyFromInt(k, merge_keys_);
|
||||
GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
|
||||
|
||||
bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
|
||||
|
||||
@ -2727,6 +2352,7 @@ class Benchmark {
|
||||
|
||||
thread->stats.FinishedSingleOp(db_);
|
||||
}
|
||||
|
||||
char msg[100];
|
||||
snprintf(msg, sizeof(msg),
|
||||
"(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \
|
||||
@ -2735,7 +2361,6 @@ class Benchmark {
|
||||
thread->stats.AddMessage(msg);
|
||||
}
|
||||
|
||||
|
||||
void Compact(ThreadState* thread) {
|
||||
db_->CompactRange(nullptr, nullptr);
|
||||
}
|
||||
@ -2747,28 +2372,6 @@ class Benchmark {
|
||||
}
|
||||
fprintf(stdout, "\n%s\n", stats.c_str());
|
||||
}
|
||||
|
||||
static void WriteToFile(void* arg, const char* buf, int n) {
|
||||
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
|
||||
}
|
||||
|
||||
void HeapProfile() {
|
||||
char fname[100];
|
||||
EnvOptions soptions;
|
||||
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db.c_str(),
|
||||
++heap_counter_);
|
||||
unique_ptr<WritableFile> file;
|
||||
Status s = FLAGS_env->NewWritableFile(fname, &file, soptions);
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "%s\n", s.ToString().c_str());
|
||||
return;
|
||||
}
|
||||
bool ok = port::GetHeapProfile(WriteToFile, file.get());
|
||||
if (!ok) {
|
||||
fprintf(stderr, "heap profiling not supported\n");
|
||||
FLAGS_env->DeleteFile(fname);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -7,6 +7,8 @@
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
@ -17,6 +19,7 @@
|
||||
#include "rocksdb/env.h"
|
||||
#include "port/port.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/sync_point.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -60,21 +63,36 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
|
||||
|
||||
*manifest_file_size = 0;
|
||||
|
||||
mutex_.Lock();
|
||||
|
||||
if (flush_memtable) {
|
||||
// flush all dirty data to disk.
|
||||
Status status = Flush(FlushOptions());
|
||||
Status status;
|
||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||
cfd->Ref();
|
||||
mutex_.Unlock();
|
||||
status = FlushMemTable(cfd, FlushOptions());
|
||||
mutex_.Lock();
|
||||
cfd->Unref();
|
||||
if (!status.ok()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
||||
|
||||
if (!status.ok()) {
|
||||
mutex_.Unlock();
|
||||
Log(options_.info_log, "Cannot Flush data %s\n",
|
||||
status.ToString().c_str());
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
MutexLock l(&mutex_);
|
||||
|
||||
// Make a set of all of the live *.sst files
|
||||
std::set<uint64_t> live;
|
||||
versions_->current()->AddLiveFiles(&live);
|
||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||
cfd->current()->AddLiveFiles(&live);
|
||||
}
|
||||
|
||||
ret.clear();
|
||||
ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
|
||||
@ -91,24 +109,60 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
|
||||
// find length of manifest file while holding the mutex lock
|
||||
*manifest_file_size = versions_->ManifestFileSize();
|
||||
|
||||
mutex_.Unlock();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
|
||||
// First get sorted files in archive dir, then append sorted files from main
|
||||
// dir to maintain sorted order
|
||||
|
||||
// list wal files in archive dir.
|
||||
// First get sorted files in db dir, then get sorted files from archived
|
||||
// dir, to avoid a race condition where a log file is moved to archived
|
||||
// dir in between.
|
||||
Status s;
|
||||
// list wal files in main db dir.
|
||||
VectorLogPtr logs;
|
||||
s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// Reproduce the race condition where a log file is moved
|
||||
// to archived dir, between these two sync points, used in
|
||||
// (DBTest,TransactionLogIteratorRace)
|
||||
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
|
||||
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
|
||||
|
||||
files.clear();
|
||||
// list wal files in archive dir.
|
||||
std::string archivedir = ArchivalDirectory(options_.wal_dir);
|
||||
if (env_->FileExists(archivedir)) {
|
||||
s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile);
|
||||
s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
// list wal files in main db dir.
|
||||
return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile);
|
||||
|
||||
uint64_t latest_archived_log_number = 0;
|
||||
if (!files.empty()) {
|
||||
latest_archived_log_number = files.back()->LogNumber();
|
||||
Log(options_.info_log, "Latest Archived log: %" PRIu64,
|
||||
latest_archived_log_number);
|
||||
}
|
||||
|
||||
files.reserve(files.size() + logs.size());
|
||||
for (auto& log : logs) {
|
||||
if (log->LogNumber() > latest_archived_log_number) {
|
||||
files.push_back(std::move(log));
|
||||
} else {
|
||||
// When the race condition happens, we could see the
|
||||
// same log in both db dir and archived dir. Simply
|
||||
// ignore the one in db dir. Note that, if we read
|
||||
// archived dir first, we would have missed the log file.
|
||||
Log(options_.info_log, "%s already moved to archive",
|
||||
log->PathName().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
}
|
||||
|
1779
db/db_impl.cc
1779
db/db_impl.cc
File diff suppressed because it is too large
Load Diff
287
db/db_impl.h
287
db/db_impl.h
@ -13,10 +13,12 @@
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/snapshot.h"
|
||||
#include "db/column_family.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "memtable_list.h"
|
||||
#include "port/port.h"
|
||||
@ -40,44 +42,79 @@ class CompactionFilterV2;
|
||||
|
||||
class DBImpl : public DB {
|
||||
public:
|
||||
DBImpl(const Options& options, const std::string& dbname);
|
||||
DBImpl(const DBOptions& options, const std::string& dbname);
|
||||
virtual ~DBImpl();
|
||||
|
||||
// Implementations of the DB interface
|
||||
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
|
||||
virtual Status Merge(const WriteOptions&, const Slice& key,
|
||||
using DB::Put;
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value);
|
||||
using DB::Merge;
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value);
|
||||
virtual Status Delete(const WriteOptions&, const Slice& key);
|
||||
using DB::Delete;
|
||||
virtual Status Delete(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key);
|
||||
using DB::Write;
|
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
|
||||
using DB::Get;
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value);
|
||||
virtual std::vector<Status> MultiGet(const ReadOptions& options,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values);
|
||||
using DB::MultiGet;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys, std::vector<std::string>* values);
|
||||
|
||||
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
|
||||
const std::string& column_family,
|
||||
ColumnFamilyHandle** handle);
|
||||
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
|
||||
|
||||
// Returns false if key doesn't exist in the database and true if it may.
|
||||
// If value_found is not passed in as null, then return the value if found in
|
||||
// memory. On return, if value was found, then value_found will be set to true
|
||||
// , otherwise false.
|
||||
using DB::KeyMayExist;
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found = nullptr);
|
||||
virtual Iterator* NewIterator(const ReadOptions&);
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value, bool* value_found = nullptr);
|
||||
using DB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family);
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
std::vector<Iterator*>* iterators);
|
||||
virtual const Snapshot* GetSnapshot();
|
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
||||
virtual bool GetProperty(const Slice& property, std::string* value);
|
||||
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
|
||||
virtual Status CompactRange(const Slice* begin, const Slice* end,
|
||||
using DB::GetProperty;
|
||||
virtual bool GetProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, std::string* value);
|
||||
using DB::GetApproximateSizes;
|
||||
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
||||
const Range* range, int n, uint64_t* sizes);
|
||||
using DB::CompactRange;
|
||||
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false, int target_level = -1);
|
||||
virtual int NumberLevels();
|
||||
virtual int MaxMemCompactionLevel();
|
||||
virtual int Level0StopWriteTrigger();
|
||||
|
||||
using DB::NumberLevels;
|
||||
virtual int NumberLevels(ColumnFamilyHandle* column_family);
|
||||
using DB::MaxMemCompactionLevel;
|
||||
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
|
||||
using DB::Level0StopWriteTrigger;
|
||||
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
|
||||
virtual const std::string& GetName() const;
|
||||
virtual Env* GetEnv() const;
|
||||
virtual const Options& GetOptions() const;
|
||||
virtual Status Flush(const FlushOptions& options);
|
||||
using DB::GetOptions;
|
||||
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
|
||||
using DB::Flush;
|
||||
virtual Status Flush(const FlushOptions& options,
|
||||
ColumnFamilyHandle* column_family);
|
||||
virtual Status DisableFileDeletions();
|
||||
virtual Status EnableFileDeletions(bool force);
|
||||
// All the returned filenames start with "/"
|
||||
@ -92,8 +129,7 @@ class DBImpl : public DB {
|
||||
read_options = TransactionLogIterator::ReadOptions());
|
||||
virtual Status DeleteFile(std::string name);
|
||||
|
||||
virtual void GetLiveFilesMetaData(
|
||||
std::vector<LiveFileMetaData> *metadata);
|
||||
virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
|
||||
|
||||
// checks if all live files exist on file system and that their file sizes
|
||||
// match to our in-memory records
|
||||
@ -101,23 +137,21 @@ class DBImpl : public DB {
|
||||
|
||||
virtual Status GetDbIdentity(std::string& identity);
|
||||
|
||||
Status RunManualCompaction(int input_level,
|
||||
int output_level,
|
||||
const Slice* begin,
|
||||
Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
|
||||
int output_level, const Slice* begin,
|
||||
const Slice* end);
|
||||
|
||||
// Extra methods (for testing) that are not in the public DB interface
|
||||
|
||||
// Compact any files in the named level that overlap [*begin, *end]
|
||||
Status TEST_CompactRange(int level,
|
||||
const Slice* begin,
|
||||
const Slice* end);
|
||||
Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
|
||||
ColumnFamilyHandle* column_family = nullptr);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status TEST_FlushMemTable(bool wait = true);
|
||||
|
||||
// Wait for memtable compaction
|
||||
Status TEST_WaitForFlushMemTable();
|
||||
Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
|
||||
|
||||
// Wait for any compaction
|
||||
Status TEST_WaitForCompact();
|
||||
@ -125,14 +159,13 @@ class DBImpl : public DB {
|
||||
// Return an internal iterator over the current state of the database.
|
||||
// The keys of this iterator are internal keys (see format.h).
|
||||
// The returned iterator should be deleted when no longer needed.
|
||||
Iterator* TEST_NewInternalIterator();
|
||||
Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
|
||||
nullptr);
|
||||
|
||||
// Return the maximum overlapping data (in bytes) at next level for any
|
||||
// file at a level >= 1.
|
||||
int64_t TEST_MaxNextLevelOverlappingBytes();
|
||||
|
||||
// Simulate a db crash, no elegant closing of database.
|
||||
void TEST_Destroy_DBImpl();
|
||||
int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
|
||||
nullptr);
|
||||
|
||||
// Return the current manifest file no.
|
||||
uint64_t TEST_Current_Manifest_FileNo();
|
||||
@ -148,61 +181,8 @@ class DBImpl : public DB {
|
||||
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
|
||||
}
|
||||
|
||||
void TEST_GetFilesMetaData(std::vector<std::vector<FileMetaData>>* metadata);
|
||||
|
||||
// holds references to memtable, all immutable memtables and version
|
||||
struct SuperVersion {
|
||||
MemTable* mem;
|
||||
MemTableListVersion* imm;
|
||||
Version* current;
|
||||
std::atomic<uint32_t> refs;
|
||||
// We need to_delete because during Cleanup(), imm->Unref() returns
|
||||
// all memtables that we need to free through this vector. We then
|
||||
// delete all those memtables outside of mutex, during destruction
|
||||
autovector<MemTable*> to_delete;
|
||||
// Version number of the current SuperVersion
|
||||
uint64_t version_number;
|
||||
DBImpl* db;
|
||||
|
||||
// should be called outside the mutex
|
||||
SuperVersion() = default;
|
||||
~SuperVersion();
|
||||
SuperVersion* Ref();
|
||||
// Returns true if this was the last reference and caller should
|
||||
// call Cleanup() and delete the object
|
||||
bool Unref();
|
||||
|
||||
// call these two methods with db mutex held
|
||||
// Cleanup unrefs mem, imm and current. Also, it stores all memtables
|
||||
// that need to be deleted in the to_delete vector. Unrefing those
|
||||
// objects needs to be done in the mutex
|
||||
void Cleanup();
|
||||
void Init(MemTable* new_mem, MemTableListVersion* new_imm,
|
||||
Version* new_current);
|
||||
|
||||
// The value of dummy is not actually used. kSVInUse takes its address as a
|
||||
// mark in the thread local storage to indicate the SuperVersion is in use
|
||||
// by thread. This way, the value of kSVInUse is guaranteed to have no
|
||||
// conflict with SuperVersion object address and portable on different
|
||||
// platform.
|
||||
static int dummy;
|
||||
static void* const kSVInUse;
|
||||
static void* const kSVObsolete;
|
||||
};
|
||||
|
||||
static void SuperVersionUnrefHandle(void* ptr) {
|
||||
// UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
|
||||
// destroyed. When former happens, the thread shouldn't see kSVInUse.
|
||||
// When latter happens, we are in ~DBImpl(), no get should happen as well.
|
||||
assert(ptr != SuperVersion::kSVInUse);
|
||||
DBImpl::SuperVersion* sv = static_cast<DBImpl::SuperVersion*>(ptr);
|
||||
if (sv->Unref()) {
|
||||
sv->db->mutex_.Lock();
|
||||
sv->Cleanup();
|
||||
sv->db->mutex_.Unlock();
|
||||
delete sv;
|
||||
}
|
||||
}
|
||||
void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
|
||||
std::vector<std::vector<FileMetaData>>* metadata);
|
||||
|
||||
// needed for CleanupIteratorState
|
||||
struct DeletionState {
|
||||
@ -231,7 +211,7 @@ class DBImpl : public DB {
|
||||
|
||||
autovector<SuperVersion*> superversions_to_free;
|
||||
|
||||
SuperVersion* new_superversion; // if nullptr no new superversion
|
||||
SuperVersion* new_superversion; // if nullptr no new superversion
|
||||
|
||||
// the current manifest_file_number, log_number and prev_log_number
|
||||
// that corresponds to the set of files in 'live'.
|
||||
@ -243,8 +223,7 @@ class DBImpl : public DB {
|
||||
pending_manifest_file_number = 0;
|
||||
log_number = 0;
|
||||
prev_log_number = 0;
|
||||
new_superversion =
|
||||
create_superversion ? new SuperVersion() : nullptr;
|
||||
new_superversion = create_superversion ? new SuperVersion() : nullptr;
|
||||
}
|
||||
|
||||
~DeletionState() {
|
||||
@ -277,23 +256,16 @@ class DBImpl : public DB {
|
||||
// It is not necessary to hold the mutex when invoking this method.
|
||||
void PurgeObsoleteFiles(DeletionState& deletion_state);
|
||||
|
||||
ColumnFamilyHandle* DefaultColumnFamily() const;
|
||||
|
||||
protected:
|
||||
Env* const env_;
|
||||
const std::string dbname_;
|
||||
unique_ptr<VersionSet> versions_;
|
||||
const InternalKeyComparator internal_comparator_;
|
||||
const Options options_; // options_.comparator == &internal_comparator_
|
||||
const DBOptions options_;
|
||||
|
||||
const Comparator* user_comparator() const {
|
||||
return internal_comparator_.user_comparator();
|
||||
}
|
||||
|
||||
SuperVersion* GetSuperVersion() {
|
||||
return super_version_;
|
||||
}
|
||||
|
||||
Iterator* NewInternalIterator(const ReadOptions&,
|
||||
SequenceNumber* latest_snapshot);
|
||||
Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
|
||||
SuperVersion* super_version);
|
||||
|
||||
private:
|
||||
friend class DB;
|
||||
@ -306,8 +278,10 @@ class DBImpl : public DB {
|
||||
Status NewDB();
|
||||
|
||||
// Recover the descriptor from persistent storage. May do a significant
|
||||
// amount of work to recover recently logged updates.
|
||||
Status Recover(bool read_only = false, bool error_if_log_file_exist = false);
|
||||
// amount of work to recover recently logged updates. Any changes to
|
||||
// be made to the descriptor are added to *edit.
|
||||
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
bool read_only = false, bool error_if_log_file_exist = false);
|
||||
|
||||
void MaybeIgnoreError(Status* s) const;
|
||||
|
||||
@ -318,7 +292,7 @@ class DBImpl : public DB {
|
||||
|
||||
// Flush the in-memory write buffer to storage. Switches to a new
|
||||
// log-file/memtable and writes a new descriptor iff successful.
|
||||
Status FlushMemTableToOutputFile(bool* madeProgress,
|
||||
Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
|
||||
DeletionState& deletion_state,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
@ -330,25 +304,26 @@ class DBImpl : public DB {
|
||||
// database is opened) and is heavyweight because it holds the mutex
|
||||
// for the entire period. The second method WriteLevel0Table supports
|
||||
// concurrent flush memtables to storage.
|
||||
Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
|
||||
Status WriteLevel0Table(autovector<MemTable*>& mems, VersionEdit* edit,
|
||||
uint64_t* filenumber,
|
||||
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
|
||||
VersionEdit* edit);
|
||||
Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
|
||||
VersionEdit* edit, uint64_t* filenumber,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
uint64_t SlowdownAmount(int n, double bottom, double top);
|
||||
// MakeRoomForWrite will return superversion_to_free through an argument,
|
||||
// which the caller needs to delete. We do it because caller can delete
|
||||
// the superversion outside of mutex
|
||||
Status MakeRoomForWrite(bool force /* compact even if there is room? */,
|
||||
SuperVersion** superversion_to_free);
|
||||
|
||||
// TODO(icanadi) free superversion_to_free and old_log outside of mutex
|
||||
Status MakeRoomForWrite(ColumnFamilyData* cfd,
|
||||
bool force /* flush even if there is room? */);
|
||||
|
||||
void BuildBatchGroup(Writer** last_writer,
|
||||
autovector<WriteBatch*>* write_batch_group);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status FlushMemTable(const FlushOptions& options);
|
||||
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
|
||||
|
||||
// Wait for memtable flushed
|
||||
Status WaitForFlushMemTable();
|
||||
Status WaitForFlushMemTable(ColumnFamilyData* cfd);
|
||||
|
||||
void MaybeScheduleLogDBDeployStats();
|
||||
static void BGLogDBDeployStats(void* db);
|
||||
@ -368,6 +343,13 @@ class DBImpl : public DB {
|
||||
DeletionState& deletion_state,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
// This function is called as part of compaction. It enables Flush process to
|
||||
// preempt compaction, since it has higher priority
|
||||
// Returns: micros spent executing
|
||||
uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
|
||||
DeletionState& deletion_state,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
// Call compaction filter if is_compaction_v2 is not true. Then iterate
|
||||
// through input and compact the kv-pairs
|
||||
Status ProcessKeyValueCompaction(
|
||||
@ -388,15 +370,16 @@ class DBImpl : public DB {
|
||||
|
||||
Status OpenCompactionOutputFile(CompactionState* compact);
|
||||
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
|
||||
Status InstallCompactionResults(CompactionState* compact);
|
||||
Status InstallCompactionResults(CompactionState* compact,
|
||||
LogBuffer* log_buffer);
|
||||
void AllocateCompactionOutputFileNumbers(CompactionState* compact);
|
||||
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
|
||||
|
||||
void PurgeObsoleteWALFiles();
|
||||
|
||||
Status AppendSortedWalsOfType(const std::string& path,
|
||||
VectorLogPtr& log_files,
|
||||
WalFileType type);
|
||||
Status GetSortedWalsOfType(const std::string& path,
|
||||
VectorLogPtr& log_files,
|
||||
WalFileType type);
|
||||
|
||||
// Requires: all_logs should be sorted with earliest log file first
|
||||
// Retains all log files in all_logs which contain updates with seq no.
|
||||
@ -419,30 +402,23 @@ class DBImpl : public DB {
|
||||
|
||||
// Return the minimum empty level that could hold the total data in the
|
||||
// input level. Return the input level, if such level could not be found.
|
||||
int FindMinimumEmptyLevelFitting(int level);
|
||||
int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
|
||||
|
||||
// Move the files in the input level to the target level.
|
||||
// If target_level < 0, automatically calculate the minimum level that could
|
||||
// hold the data set.
|
||||
Status ReFitLevel(int level, int target_level = -1);
|
||||
|
||||
// Returns the current SuperVersion number.
|
||||
uint64_t CurrentVersionNumber() const;
|
||||
Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
|
||||
|
||||
// Returns a pair of iterators (mutable-only and immutable-only) used
|
||||
// internally by TailingIterator and stores CurrentVersionNumber() in
|
||||
// internally by TailingIterator and stores cfd->GetSuperVersionNumber() in
|
||||
// *superversion_number. These iterators are always up-to-date, i.e. can
|
||||
// be used to read new data.
|
||||
std::pair<Iterator*, Iterator*> GetTailingIteratorPair(
|
||||
const ReadOptions& options,
|
||||
uint64_t* superversion_number);
|
||||
|
||||
// Constant after construction
|
||||
const InternalFilterPolicy internal_filter_policy_;
|
||||
bool owns_info_log_;
|
||||
const ReadOptions& options, ColumnFamilyData* cfd,
|
||||
uint64_t* superversion_number);
|
||||
|
||||
// table_cache_ provides its own synchronization
|
||||
unique_ptr<TableCache> table_cache_;
|
||||
std::shared_ptr<Cache> table_cache_;
|
||||
|
||||
// Lock over the persistent DB state. Non-nullptr iff successfully acquired.
|
||||
FileLock* db_lock_;
|
||||
@ -451,20 +427,11 @@ class DBImpl : public DB {
|
||||
port::Mutex mutex_;
|
||||
port::AtomicPointer shutting_down_;
|
||||
port::CondVar bg_cv_; // Signalled when background work finishes
|
||||
MemTable* mem_;
|
||||
MemTableList imm_; // Memtable that are not changing
|
||||
uint64_t logfile_number_;
|
||||
unique_ptr<log::Writer> log_;
|
||||
|
||||
SuperVersion* super_version_;
|
||||
|
||||
// An ordinal representing the current SuperVersion. Updated by
|
||||
// InstallSuperVersion(), i.e. incremented every time super_version_
|
||||
// changes.
|
||||
std::atomic<uint64_t> super_version_number_;
|
||||
// Thread's local copy of SuperVersion pointer
|
||||
// This needs to be destructed after mutex_
|
||||
ThreadLocalPtr* local_sv_;
|
||||
ColumnFamilyHandleImpl* default_cf_handle_;
|
||||
unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
|
||||
std::deque<uint64_t> alive_log_files_;
|
||||
|
||||
std::string host_name_;
|
||||
|
||||
@ -500,6 +467,7 @@ class DBImpl : public DB {
|
||||
|
||||
// Information for a manual compaction
|
||||
struct ManualCompaction {
|
||||
ColumnFamilyData* cfd;
|
||||
int input_level;
|
||||
int output_level;
|
||||
bool done;
|
||||
@ -541,8 +509,6 @@ class DBImpl : public DB {
|
||||
|
||||
bool flush_on_destroy_; // Used when disableWAL is true.
|
||||
|
||||
InternalStats internal_stats_;
|
||||
|
||||
static const int KEEP_LOG_FILE_NUM = 1000;
|
||||
std::string db_absolute_path_;
|
||||
|
||||
@ -575,28 +541,21 @@ class DBImpl : public DB {
|
||||
std::vector<SequenceNumber>& snapshots,
|
||||
SequenceNumber* prev_snapshot);
|
||||
|
||||
// will return a pointer to SuperVersion* if previous SuperVersion
|
||||
// if its reference count is zero and needs deletion or nullptr if not
|
||||
// As argument takes a pointer to allocated SuperVersion
|
||||
// Foreground threads call this function directly (they don't carry
|
||||
// deletion state and have to handle their own creation and deletion
|
||||
// of SuperVersion)
|
||||
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
|
||||
// Background threads call this function, which is just a wrapper around
|
||||
// the InstallSuperVersion() function above. Background threads carry
|
||||
// the cfd->InstallSuperVersion() function. Background threads carry
|
||||
// deletion_state which can have new_superversion already allocated.
|
||||
void InstallSuperVersion(DeletionState& deletion_state);
|
||||
void InstallSuperVersion(ColumnFamilyData* cfd,
|
||||
DeletionState& deletion_state);
|
||||
|
||||
void ResetThreadLocalSuperVersions(DeletionState* deletion_state);
|
||||
|
||||
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props)
|
||||
using DB::GetPropertiesOfAllTables;
|
||||
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
||||
TablePropertiesCollection* props)
|
||||
override;
|
||||
|
||||
// Function that Get and KeyMayExist call with no_io true or false
|
||||
// Note: 'value_found' from KeyMayExist propagates here
|
||||
Status GetImpl(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value,
|
||||
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
||||
const Slice& key, std::string* value,
|
||||
bool* value_found = nullptr);
|
||||
};
|
||||
|
||||
@ -606,7 +565,7 @@ extern Options SanitizeOptions(const std::string& db,
|
||||
const InternalKeyComparator* icmp,
|
||||
const InternalFilterPolicy* ipolicy,
|
||||
const Options& src);
|
||||
|
||||
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
|
||||
|
||||
// Determine compression type, based on user options, level of the output
|
||||
// file and whether compression is disabled.
|
||||
|
@ -42,8 +42,8 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
DBImplReadOnly::DBImplReadOnly(const Options& options,
|
||||
const std::string& dbname)
|
||||
DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
|
||||
const std::string& dbname)
|
||||
: DBImpl(options, dbname) {
|
||||
Log(options_.info_log, "Opening the db in read only mode");
|
||||
}
|
||||
@ -53,42 +53,57 @@ DBImplReadOnly::~DBImplReadOnly() {
|
||||
|
||||
// Implementations of the DB interface
|
||||
Status DBImplReadOnly::Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value) {
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) {
|
||||
Status s;
|
||||
SequenceNumber snapshot = versions_->LastSequence();
|
||||
SuperVersion* super_version = GetSuperVersion();
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
auto cfd = cfh->cfd();
|
||||
SuperVersion* super_version = cfd->GetSuperVersion();
|
||||
MergeContext merge_context;
|
||||
LookupKey lkey(key, snapshot);
|
||||
if (super_version->mem->Get(lkey, value, &s, merge_context, options_)) {
|
||||
if (super_version->mem->Get(lkey, value, &s, merge_context,
|
||||
*cfd->options())) {
|
||||
} else {
|
||||
Version::GetStats stats;
|
||||
super_version->current->Get(options, lkey, value, &s, &merge_context,
|
||||
&stats, options_);
|
||||
&stats, *cfd->options());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) {
|
||||
SequenceNumber latest_snapshot;
|
||||
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
|
||||
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family) {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
auto cfd = cfh->cfd();
|
||||
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
|
||||
SequenceNumber latest_snapshot = versions_->LastSequence();
|
||||
Iterator* internal_iter = NewInternalIterator(options, cfd, super_version);
|
||||
return NewDBIterator(
|
||||
&dbname_, env_, options_, user_comparator(),internal_iter,
|
||||
&dbname_, env_, *cfd->options(), cfd->user_comparator(), internal_iter,
|
||||
(options.snapshot != nullptr
|
||||
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
|
||||
: latest_snapshot));
|
||||
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
|
||||
: latest_snapshot));
|
||||
}
|
||||
|
||||
|
||||
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
|
||||
DB** dbptr, bool error_if_log_file_exist) {
|
||||
DB** dbptr, bool error_if_log_file_exist) {
|
||||
*dbptr = nullptr;
|
||||
|
||||
DBImplReadOnly* impl = new DBImplReadOnly(options, dbname);
|
||||
DBOptions db_options(options);
|
||||
ColumnFamilyOptions cf_options(options);
|
||||
std::vector<ColumnFamilyDescriptor> column_families;
|
||||
column_families.push_back(
|
||||
ColumnFamilyDescriptor(default_column_family_name, cf_options));
|
||||
|
||||
DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
|
||||
impl->mutex_.Lock();
|
||||
Status s = impl->Recover(true /* read only */, error_if_log_file_exist);
|
||||
Status s = impl->Recover(column_families, true /* read only */,
|
||||
error_if_log_file_exist);
|
||||
if (s.ok()) {
|
||||
delete impl->InstallSuperVersion(new DBImpl::SuperVersion());
|
||||
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
||||
delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
|
||||
}
|
||||
}
|
||||
impl->mutex_.Unlock();
|
||||
if (s.ok()) {
|
||||
|
@ -12,6 +12,8 @@
|
||||
|
||||
#include <deque>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/snapshot.h"
|
||||
@ -23,57 +25,79 @@
|
||||
namespace rocksdb {
|
||||
|
||||
class DBImplReadOnly : public DBImpl {
|
||||
public:
|
||||
DBImplReadOnly(const Options& options, const std::string& dbname);
|
||||
virtual ~DBImplReadOnly();
|
||||
public:
|
||||
DBImplReadOnly(const DBOptions& options, const std::string& dbname);
|
||||
virtual ~DBImplReadOnly();
|
||||
|
||||
// Implementations of the DB interface
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value);
|
||||
// Implementations of the DB interface
|
||||
using DB::Get;
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value);
|
||||
|
||||
// TODO: Implement ReadOnly MultiGet?
|
||||
// TODO: Implement ReadOnly MultiGet?
|
||||
|
||||
virtual Iterator* NewIterator(const ReadOptions&);
|
||||
using DBImpl::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions&,
|
||||
ColumnFamilyHandle* column_family);
|
||||
|
||||
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status Merge(const WriteOptions&, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status Delete(const WriteOptions&, const Slice& key) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status CompactRange(const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false, int target_level = -1) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status DisableFileDeletions() {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status EnableFileDeletions(bool force) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status GetLiveFiles(std::vector<std::string>&,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable = true) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status Flush(const FlushOptions& options) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
std::vector<Iterator*>* iterators) {
|
||||
// TODO
|
||||
return Status::NotSupported("Not supported yet.");
|
||||
}
|
||||
|
||||
private:
|
||||
friend class DB;
|
||||
using DBImpl::Put;
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
using DBImpl::Merge;
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
using DBImpl::Delete;
|
||||
virtual Status Delete(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
using DBImpl::CompactRange;
|
||||
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false,
|
||||
int target_level = -1) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status DisableFileDeletions() {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status EnableFileDeletions(bool force) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
virtual Status GetLiveFiles(std::vector<std::string>&,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable = true) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
using DBImpl::Flush;
|
||||
virtual Status Flush(const FlushOptions& options,
|
||||
ColumnFamilyHandle* column_family) {
|
||||
return Status::NotSupported("Not supported operation in read only mode.");
|
||||
}
|
||||
|
||||
// No copying allowed
|
||||
DBImplReadOnly(const DBImplReadOnly&);
|
||||
void operator=(const DBImplReadOnly&);
|
||||
private:
|
||||
friend class DB;
|
||||
|
||||
// No copying allowed
|
||||
DBImplReadOnly(const DBImplReadOnly&);
|
||||
void operator=(const DBImplReadOnly&);
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -39,71 +39,6 @@ static void DumpInternalIter(Iterator* iter) {
|
||||
|
||||
namespace {
|
||||
|
||||
class IterLookupKey {
|
||||
public:
|
||||
IterLookupKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}
|
||||
|
||||
~IterLookupKey() { Clear(); }
|
||||
|
||||
Slice GetKey() const {
|
||||
if (key_ != nullptr) {
|
||||
return Slice(key_, key_size_);
|
||||
} else {
|
||||
return Slice();
|
||||
}
|
||||
}
|
||||
|
||||
bool Valid() const { return key_ != nullptr; }
|
||||
|
||||
void Clear() {
|
||||
if (key_ != nullptr && key_ != space_) {
|
||||
delete[] key_;
|
||||
}
|
||||
key_ = space_;
|
||||
buf_size_ = sizeof(buf_size_);
|
||||
}
|
||||
|
||||
// Enlarge the buffer size if needed based on key_size.
|
||||
// By default, static allocated buffer is used. Once there is a key
|
||||
// larger than the static allocated buffer, another buffer is dynamically
|
||||
// allocated, until a larger key buffer is requested. In that case, we
|
||||
// reallocate buffer and delete the old one.
|
||||
void EnlargeBufferIfNeeded(size_t key_size) {
|
||||
// If size is smaller than buffer size, continue using current buffer,
|
||||
// or the static allocated one, as default
|
||||
if (key_size > buf_size_) {
|
||||
// Need to enlarge the buffer.
|
||||
Clear();
|
||||
key_ = new char[key_size];
|
||||
buf_size_ = key_size;
|
||||
}
|
||||
key_size_ = key_size;
|
||||
}
|
||||
|
||||
void SetUserKey(const Slice& user_key) {
|
||||
size_t size = user_key.size();
|
||||
EnlargeBufferIfNeeded(size);
|
||||
memcpy(key_, user_key.data(), size);
|
||||
}
|
||||
|
||||
void SetInternalKey(const Slice& user_key, SequenceNumber s) {
|
||||
size_t usize = user_key.size();
|
||||
EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
|
||||
memcpy(key_, user_key.data(), usize);
|
||||
EncodeFixed64(key_ + usize, PackSequenceAndType(s, kValueTypeForSeek));
|
||||
}
|
||||
|
||||
private:
|
||||
char* key_;
|
||||
size_t buf_size_;
|
||||
size_t key_size_;
|
||||
char space_[32]; // Avoid allocation for short keys
|
||||
|
||||
// No copying allowed
|
||||
IterLookupKey(const IterLookupKey&) = delete;
|
||||
void operator=(const LookupKey&) = delete;
|
||||
};
|
||||
|
||||
// Memtables and sstables that make the DB representation contain
|
||||
// (userkey,seq,type) => uservalue entries. DBIter
|
||||
// combines multiple entries for the same userkey found in the DB
|
||||
@ -191,7 +126,7 @@ class DBIter: public Iterator {
|
||||
SequenceNumber const sequence_;
|
||||
|
||||
Status status_;
|
||||
IterLookupKey saved_key_; // == current key when direction_==kReverse
|
||||
IterKey saved_key_; // == current key when direction_==kReverse
|
||||
std::string saved_value_; // == current raw value when direction_==kReverse
|
||||
std::string skip_key_;
|
||||
Direction direction_;
|
||||
@ -254,10 +189,9 @@ void DBIter::Next() {
|
||||
// NOTE: In between, saved_key_ can point to a user key that has
|
||||
// a delete marker
|
||||
inline void DBIter::FindNextUserEntry(bool skipping) {
|
||||
StopWatchNano timer(env_, false);
|
||||
StartPerfTimer(&timer);
|
||||
PERF_TIMER_AUTO(find_next_user_entry_time);
|
||||
FindNextUserEntryInternal(skipping);
|
||||
BumpPerfTime(&perf_context.find_next_user_entry_time, &timer);
|
||||
PERF_TIMER_STOP(find_next_user_entry_time);
|
||||
}
|
||||
|
||||
// Actual implementation of DBIter::FindNextUserEntry()
|
||||
@ -273,7 +207,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
|
||||
if (skipping &&
|
||||
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
|
||||
num_skipped++; // skip this entry
|
||||
BumpPerfCount(&perf_context.internal_key_skipped_count);
|
||||
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
|
||||
} else {
|
||||
skipping = false;
|
||||
switch (ikey.type) {
|
||||
@ -283,7 +217,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
|
||||
saved_key_.SetUserKey(ikey.user_key);
|
||||
skipping = true;
|
||||
num_skipped = 0;
|
||||
BumpPerfCount(&perf_context.internal_delete_skipped_count);
|
||||
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
|
||||
break;
|
||||
case kTypeValue:
|
||||
valid_ = true;
|
||||
@ -488,10 +422,9 @@ void DBIter::Seek(const Slice& target) {
|
||||
saved_key_.Clear();
|
||||
// now saved_key_ is used to store the internal key.
|
||||
saved_key_.SetInternalKey(target, sequence_);
|
||||
StopWatchNano internal_seek_timer(env_, false);
|
||||
StartPerfTimer(&internal_seek_timer);
|
||||
PERF_TIMER_AUTO(seek_internal_seek_time);
|
||||
iter_->Seek(saved_key_.GetKey());
|
||||
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
|
||||
PERF_TIMER_STOP(seek_internal_seek_time);
|
||||
if (iter_->Valid()) {
|
||||
direction_ = kForward;
|
||||
ClearSavedValue();
|
||||
@ -504,10 +437,9 @@ void DBIter::Seek(const Slice& target) {
|
||||
void DBIter::SeekToFirst() {
|
||||
direction_ = kForward;
|
||||
ClearSavedValue();
|
||||
StopWatchNano internal_seek_timer(env_, false);
|
||||
StartPerfTimer(&internal_seek_timer);
|
||||
PERF_TIMER_AUTO(seek_internal_seek_time);
|
||||
iter_->SeekToFirst();
|
||||
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
|
||||
PERF_TIMER_STOP(seek_internal_seek_time);
|
||||
if (iter_->Valid()) {
|
||||
FindNextUserEntry(false /* not skipping */);
|
||||
} else {
|
||||
@ -526,10 +458,9 @@ void DBIter::SeekToLast() {
|
||||
|
||||
direction_ = kReverse;
|
||||
ClearSavedValue();
|
||||
StopWatchNano internal_seek_timer(env_, false);
|
||||
StartPerfTimer(&internal_seek_timer);
|
||||
PERF_TIMER_AUTO(seek_internal_seek_time);
|
||||
iter_->SeekToLast();
|
||||
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
|
||||
PERF_TIMER_STOP(seek_internal_seek_time);
|
||||
FindPrevUserEntry();
|
||||
}
|
||||
|
||||
|
@ -65,7 +65,7 @@ void DBImpl::LogDBDeployStats() {
|
||||
|
||||
uint64_t file_total_size = 0;
|
||||
uint32_t file_total_num = 0;
|
||||
Version* current = versions_->current();
|
||||
Version* current = default_cf_handle_->cfd()->current();
|
||||
for (int i = 0; i < current->NumberLevels(); i++) {
|
||||
file_total_num += current->NumLevelFiles(i);
|
||||
file_total_size += current->NumLevelBytes(i);
|
||||
|
2361
db/db_test.cc
2361
db/db_test.cc
File diff suppressed because it is too large
Load Diff
@ -59,7 +59,7 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
|
||||
// decreasing sequence number
|
||||
// decreasing type (though sequence# should be enough to disambiguate)
|
||||
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
|
||||
BumpPerfCount(&perf_context.user_key_comparison_count);
|
||||
PERF_COUNTER_ADD(user_key_comparison_count, 1);
|
||||
if (r == 0) {
|
||||
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
|
||||
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
|
||||
@ -79,7 +79,7 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a,
|
||||
// decreasing sequence number
|
||||
// decreasing type (though sequence# should be enough to disambiguate)
|
||||
int r = user_comparator_->Compare(a.user_key, b.user_key);
|
||||
BumpPerfCount(&perf_context.user_key_comparison_count);
|
||||
PERF_COUNTER_ADD(user_key_comparison_count, 1);
|
||||
if (r == 0) {
|
||||
if (a.sequence > b.sequence) {
|
||||
r = -1;
|
||||
|
@ -32,6 +32,9 @@ enum ValueType : unsigned char {
|
||||
kTypeValue = 0x1,
|
||||
kTypeMerge = 0x2,
|
||||
kTypeLogData = 0x3,
|
||||
kTypeColumnFamilyDeletion = 0x4,
|
||||
kTypeColumnFamilyValue = 0x5,
|
||||
kTypeColumnFamilyMerge = 0x6,
|
||||
kMaxValue = 0x7F
|
||||
};
|
||||
|
||||
@ -235,4 +238,74 @@ inline LookupKey::~LookupKey() {
|
||||
if (start_ != space_) delete[] start_;
|
||||
}
|
||||
|
||||
class IterKey {
|
||||
public:
|
||||
IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}
|
||||
|
||||
~IterKey() { Clear(); }
|
||||
|
||||
Slice GetKey() const {
|
||||
if (key_ != nullptr) {
|
||||
return Slice(key_, key_size_);
|
||||
} else {
|
||||
return Slice();
|
||||
}
|
||||
}
|
||||
|
||||
bool Valid() const { return key_ != nullptr; }
|
||||
|
||||
void Clear() {
|
||||
if (key_ != nullptr && key_ != space_) {
|
||||
delete[] key_;
|
||||
}
|
||||
key_ = space_;
|
||||
buf_size_ = sizeof(buf_size_);
|
||||
}
|
||||
|
||||
// Enlarge the buffer size if needed based on key_size.
|
||||
// By default, static allocated buffer is used. Once there is a key
|
||||
// larger than the static allocated buffer, another buffer is dynamically
|
||||
// allocated, until a larger key buffer is requested. In that case, we
|
||||
// reallocate buffer and delete the old one.
|
||||
void EnlargeBufferIfNeeded(size_t key_size) {
|
||||
// If size is smaller than buffer size, continue using current buffer,
|
||||
// or the static allocated one, as default
|
||||
if (key_size > buf_size_) {
|
||||
// Need to enlarge the buffer.
|
||||
Clear();
|
||||
key_ = new char[key_size];
|
||||
buf_size_ = key_size;
|
||||
}
|
||||
key_size_ = key_size;
|
||||
}
|
||||
|
||||
void SetUserKey(const Slice& user_key) {
|
||||
size_t size = user_key.size();
|
||||
EnlargeBufferIfNeeded(size);
|
||||
memcpy(key_, user_key.data(), size);
|
||||
}
|
||||
|
||||
void SetInternalKey(const Slice& user_key, SequenceNumber s,
|
||||
ValueType value_type = kValueTypeForSeek) {
|
||||
size_t usize = user_key.size();
|
||||
EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
|
||||
memcpy(key_, user_key.data(), usize);
|
||||
EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
|
||||
}
|
||||
|
||||
void SetInternalKey(const ParsedInternalKey& parsed_key) {
|
||||
SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
|
||||
}
|
||||
|
||||
private:
|
||||
char* key_;
|
||||
size_t buf_size_;
|
||||
size_t key_size_;
|
||||
char space_[32]; // Avoid allocation for short keys
|
||||
|
||||
// No copying allowed
|
||||
IterKey(const IterKey&) = delete;
|
||||
void operator=(const IterKey&) = delete;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -7,8 +7,7 @@
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/internal_stats.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/memtable_list.h"
|
||||
#include "db/column_family.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
@ -44,10 +43,8 @@ DBPropertyType GetPropertyType(const Slice& property) {
|
||||
|
||||
bool InternalStats::GetProperty(DBPropertyType property_type,
|
||||
const Slice& property, std::string* value,
|
||||
DBImpl* db) {
|
||||
VersionSet* version_set = db->versions_.get();
|
||||
Version* current = version_set->current();
|
||||
const MemTableList& imm = db->imm_;
|
||||
ColumnFamilyData* cfd) {
|
||||
Version* current = cfd->current();
|
||||
Slice in = property;
|
||||
|
||||
switch (property_type) {
|
||||
@ -110,7 +107,6 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
|
||||
write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL);
|
||||
}
|
||||
|
||||
// Pardon the long line but I think it is easier to read this way.
|
||||
snprintf(
|
||||
buf, sizeof(buf),
|
||||
" Compactions\n"
|
||||
@ -159,7 +155,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
|
||||
"%9lu\n",
|
||||
level, files, current->NumLevelBytes(level) / 1048576.0,
|
||||
current->NumLevelBytes(level) /
|
||||
version_set->MaxBytesForLevel(level),
|
||||
cfd->compaction_picker()->MaxBytesForLevel(level),
|
||||
compaction_stats_[level].micros / 1e6,
|
||||
bytes_read / 1048576.0,
|
||||
compaction_stats_[level].bytes_written / 1048576.0,
|
||||
@ -334,11 +330,11 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
|
||||
*value = current->DebugString();
|
||||
return true;
|
||||
case kNumImmutableMemTable:
|
||||
*value = std::to_string(imm.size());
|
||||
*value = std::to_string(cfd->imm()->size());
|
||||
return true;
|
||||
case kMemtableFlushPending:
|
||||
// Return number of mem tables that are ready to flush (made immutable)
|
||||
*value = std::to_string(imm.IsFlushPending() ? 1 : 0);
|
||||
*value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0);
|
||||
return true;
|
||||
case kCompactionPending:
|
||||
// 1 if the system has already determined that at least one compaction is needed.
|
||||
@ -351,7 +347,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
|
||||
return true;
|
||||
case kCurSizeActiveMemTable:
|
||||
// Current size of the active memtable
|
||||
*value = std::to_string(db->mem_->ApproximateMemoryUsage());
|
||||
*value = std::to_string(cfd->mem()->ApproximateMemoryUsage());
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
@ -16,6 +16,8 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
class ColumnFamilyData;
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class MemTableList;
|
||||
@ -126,7 +128,7 @@ class InternalStats {
|
||||
uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
|
||||
|
||||
bool GetProperty(DBPropertyType property_type, const Slice& property,
|
||||
std::string* value, DBImpl* db);
|
||||
std::string* value, ColumnFamilyData* cfd);
|
||||
|
||||
private:
|
||||
std::vector<CompactionStats> compaction_stats_;
|
||||
|
@ -29,7 +29,8 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
|
||||
MemTable::MemTable(const InternalKeyComparator& cmp,
|
||||
const Options& options)
|
||||
: comparator_(cmp),
|
||||
refs_(0),
|
||||
kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
|
||||
@ -42,7 +43,6 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
|
||||
file_number_(0),
|
||||
first_seqno_(0),
|
||||
mem_next_logfile_number_(0),
|
||||
mem_logfile_number_(0),
|
||||
locks_(options.inplace_update_support ? options.inplace_update_num_locks
|
||||
: 0),
|
||||
prefix_extractor_(options.prefix_extractor.get()),
|
||||
@ -142,6 +142,11 @@ Slice MemTableRep::UserKey(const char* key) const {
|
||||
return Slice(slice.data(), slice.size() - 8);
|
||||
}
|
||||
|
||||
// Obtains `len` bytes from arena_, stores the start of that region in *buf,
// and returns the same address converted to a KeyHandle.
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
  char* const region = arena_->Allocate(len);
  *buf = region;
  return static_cast<KeyHandle>(region);
}
|
||||
|
||||
// Encode a suitable internal key target for "target" and return it.
|
||||
// Uses *scratch as scratch space, and the returned pointer will point
|
||||
// into this scratch space.
|
||||
@ -243,7 +248,9 @@ void MemTable::Add(SequenceNumber s, ValueType type,
|
||||
const size_t encoded_len =
|
||||
VarintLength(internal_key_size) + internal_key_size +
|
||||
VarintLength(val_size) + val_size;
|
||||
char* buf = arena_.Allocate(encoded_len);
|
||||
char* buf = nullptr;
|
||||
KeyHandle handle = table_->Allocate(encoded_len, &buf);
|
||||
assert(buf != nullptr);
|
||||
char* p = EncodeVarint32(buf, internal_key_size);
|
||||
memcpy(p, key.data(), key_size);
|
||||
p += key_size;
|
||||
@ -252,7 +259,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
|
||||
p = EncodeVarint32(p, val_size);
|
||||
memcpy(p, value.data(), val_size);
|
||||
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
|
||||
table_->Insert(buf);
|
||||
table_->Insert(handle);
|
||||
|
||||
if (prefix_bloom_) {
|
||||
assert(prefix_extractor_);
|
||||
@ -370,8 +377,7 @@ static bool SaveValue(void* arg, const char* entry) {
|
||||
|
||||
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext& merge_context, const Options& options) {
|
||||
StopWatchNano memtable_get_timer(options.env, false);
|
||||
StartPerfTimer(&memtable_get_timer);
|
||||
PERF_TIMER_AUTO(get_from_memtable_time);
|
||||
|
||||
Slice user_key = key.user_key();
|
||||
bool found_final_value = false;
|
||||
@ -401,8 +407,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
|
||||
if (!found_final_value && merge_in_progress) {
|
||||
*s = Status::MergeInProgress("");
|
||||
}
|
||||
BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer);
|
||||
BumpPerfCount(&perf_context.get_from_memtable_count);
|
||||
PERF_TIMER_STOP(get_from_memtable_time);
|
||||
PERF_COUNTER_ADD(get_from_memtable_count, 1);
|
||||
return found_final_value;
|
||||
}
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
#include <deque>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/skiplist.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "util/arena.h"
|
||||
@ -39,7 +39,7 @@ class MemTable {
|
||||
// MemTables are reference counted. The initial reference count
|
||||
// is zero and the caller must call Ref() at least once.
|
||||
explicit MemTable(const InternalKeyComparator& comparator,
|
||||
const Options& options = Options());
|
||||
const Options& options);
|
||||
|
||||
~MemTable();
|
||||
|
||||
@ -147,14 +147,6 @@ class MemTable {
|
||||
// be flushed to storage
|
||||
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
|
||||
|
||||
// Returns the logfile number that can be safely deleted when this
|
||||
// memstore is flushed to storage
|
||||
uint64_t GetLogNumber() { return mem_logfile_number_; }
|
||||
|
||||
// Sets the logfile number that can be safely deleted when this
|
||||
// memstore is flushed to storage
|
||||
void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; }
|
||||
|
||||
// Notify the underlying storage that no more items will be added
|
||||
void MarkImmutable() { table_->MarkReadOnly(); }
|
||||
|
||||
@ -197,10 +189,6 @@ class MemTable {
|
||||
// The log files earlier than this number can be deleted.
|
||||
uint64_t mem_next_logfile_number_;
|
||||
|
||||
// The log file that backs this memtable (to be deleted when
|
||||
// memtable flush is done)
|
||||
uint64_t mem_logfile_number_;
|
||||
|
||||
// rw locks for inplace updates
|
||||
std::vector<port::RWMutex> locks_;
|
||||
|
||||
|
@ -8,9 +8,11 @@
|
||||
#include <string>
|
||||
#include "rocksdb/db.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/version_set.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/log_buffer.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -120,7 +122,8 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
|
||||
}
|
||||
|
||||
void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
|
||||
uint64_t file_number, std::set<uint64_t>* pending_outputs) {
|
||||
uint64_t file_number,
|
||||
std::set<uint64_t>* pending_outputs) {
|
||||
assert(!mems.empty());
|
||||
|
||||
// If the flush was not successful, then just reset state.
|
||||
@ -140,10 +143,10 @@ void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
|
||||
|
||||
// Record a successful flush in the manifest file
|
||||
Status MemTableList::InstallMemtableFlushResults(
|
||||
const autovector<MemTable*>& mems, VersionSet* vset,
|
||||
ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
|
||||
port::Mutex* mu, Logger* info_log, uint64_t file_number,
|
||||
std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
|
||||
Directory* db_directory) {
|
||||
Directory* db_directory, LogBuffer* log_buffer) {
|
||||
mu->AssertHeld();
|
||||
|
||||
// flush was successful
|
||||
@ -173,12 +176,11 @@ Status MemTableList::InstallMemtableFlushResults(
|
||||
break;
|
||||
}
|
||||
|
||||
Log(info_log,
|
||||
"Level-0 commit table #%lu started",
|
||||
(unsigned long)m->file_number_);
|
||||
LogToBuffer(log_buffer, "Level-0 commit table #%lu started",
|
||||
(unsigned long)m->file_number_);
|
||||
|
||||
// this can release and reacquire the mutex.
|
||||
s = vset->LogAndApply(&m->edit_, mu, db_directory);
|
||||
s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
|
||||
|
||||
// we will be changing the version in the next code path,
|
||||
// so we better create a new one, since versions are immutable
|
||||
@ -189,10 +191,8 @@ Status MemTableList::InstallMemtableFlushResults(
|
||||
uint64_t mem_id = 1; // how many memtables has been flushed.
|
||||
do {
|
||||
if (s.ok()) { // commit new state
|
||||
Log(info_log,
|
||||
"Level-0 commit table #%lu: memtable #%lu done",
|
||||
(unsigned long)m->file_number_,
|
||||
(unsigned long)mem_id);
|
||||
LogToBuffer(log_buffer, "Level-0 commit table #%lu: memtable #%lu done",
|
||||
(unsigned long)m->file_number_, (unsigned long)mem_id);
|
||||
current_->Remove(m);
|
||||
assert(m->file_number_ > 0);
|
||||
|
||||
|
@ -7,19 +7,25 @@
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <deque>
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/skiplist.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "db/memtable.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "util/autovector.h"
|
||||
#include "util/log_buffer.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class ColumnFamilyData;
|
||||
class InternalKeyComparator;
|
||||
class Mutex;
|
||||
|
||||
@ -99,12 +105,14 @@ class MemTableList {
|
||||
std::set<uint64_t>* pending_outputs);
|
||||
|
||||
// Commit a successful flush in the manifest file
|
||||
Status InstallMemtableFlushResults(const autovector<MemTable*>& m,
|
||||
Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
|
||||
const autovector<MemTable*>& m,
|
||||
VersionSet* vset, port::Mutex* mu,
|
||||
Logger* info_log, uint64_t file_number,
|
||||
std::set<uint64_t>& pending_outputs,
|
||||
autovector<MemTable*>* to_delete,
|
||||
Directory* db_directory);
|
||||
Directory* db_directory,
|
||||
LogBuffer* log_buffer);
|
||||
|
||||
// New memtables are inserted at the front of the list.
|
||||
// Takes ownership of the reference held on *m by the caller of Add().
|
||||
|
@ -429,6 +429,48 @@ TEST(PlainTableDBTest, Iterator) {
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a key consisting of `length` copies of the character `c`.
std::string MakeLongKey(size_t length, char c) {
  std::string key;
  key.assign(length, c);
  return key;
}
|
||||
|
||||
// Writes keys of varying lengths (16 to 90 bytes) to a total-order plain
// table, flushes the memtable, and checks that an iterator seeked to the
// first key returns every key/value pair in array order and then becomes
// invalid.
TEST(PlainTableDBTest, IteratorLargeKeys) {
  Options options = CurrentOptions();
  options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
  options.create_if_missing = true;
  options.prefix_extractor.reset();
  DestroyAndReopen(&options);

  // Each key starts with a different, increasing digit; the test expects the
  // iterator to return them in this same order.
  std::string key_list[] = {
      MakeLongKey(30, '0'),
      MakeLongKey(16, '1'),
      MakeLongKey(32, '2'),
      MakeLongKey(60, '3'),
      MakeLongKey(90, '4'),
      MakeLongKey(50, '5'),
      MakeLongKey(26, '6')
  };
  // Derive the count from the array so both loops stay in sync with it.
  const size_t num_keys = sizeof(key_list) / sizeof(key_list[0]);

  for (size_t i = 0; i < num_keys; i++) {
    ASSERT_OK(Put(key_list[i], std::to_string(i)));
  }

  dbfull()->TEST_FlushMemTable();

  Iterator* iter = dbfull()->NewIterator(ro_);
  iter->Seek(key_list[0]);

  for (size_t i = 0; i < num_keys; i++) {
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(key_list[i], iter->key().ToString());
    ASSERT_EQ(std::to_string(i), iter->value().ToString());
    iter->Next();
  }

  // All keys have been consumed, so the iterator must be exhausted.
  ASSERT_TRUE(!iter->Valid());

  delete iter;
}
|
||||
|
||||
// A test comparator which compare two strings in this way:
|
||||
// (1) first compare prefix of 8 bytes in alphabet order,
|
||||
// (2) if two strings share the same prefix, sort the other part of the string
|
||||
|
15
db/repair.cc
15
db/repair.cc
@ -55,14 +55,20 @@ class Repairer {
|
||||
icmp_(options.comparator),
|
||||
ipolicy_(options.filter_policy),
|
||||
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
|
||||
raw_table_cache_(
|
||||
// TableCache can be small since we expect each table to be opened
|
||||
// once.
|
||||
NewLRUCache(10, options_.table_cache_numshardbits,
|
||||
options_.table_cache_remove_scan_count_limit)),
|
||||
next_file_number_(1) {
|
||||
// TableCache can be small since we expect each table to be opened once.
|
||||
table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10);
|
||||
table_cache_ = new TableCache(dbname_, &options_, storage_options_,
|
||||
raw_table_cache_.get());
|
||||
edit_ = new VersionEdit();
|
||||
}
|
||||
|
||||
~Repairer() {
|
||||
delete table_cache_;
|
||||
raw_table_cache_.reset();
|
||||
delete edit_;
|
||||
}
|
||||
|
||||
@ -102,6 +108,7 @@ class Repairer {
|
||||
InternalKeyComparator const icmp_;
|
||||
InternalFilterPolicy const ipolicy_;
|
||||
Options const options_;
|
||||
std::shared_ptr<Cache> raw_table_cache_;
|
||||
TableCache* table_cache_;
|
||||
VersionEdit* edit_;
|
||||
|
||||
@ -197,6 +204,7 @@ class Repairer {
|
||||
Slice record;
|
||||
WriteBatch batch;
|
||||
MemTable* mem = new MemTable(icmp_, options_);
|
||||
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
|
||||
mem->Ref();
|
||||
int counter = 0;
|
||||
while (reader.ReadRecord(&record, &scratch)) {
|
||||
@ -206,7 +214,7 @@ class Repairer {
|
||||
continue;
|
||||
}
|
||||
WriteBatchInternal::SetContents(&batch, record);
|
||||
status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
|
||||
status = WriteBatchInternal::InsertInto(&batch, cf_mems_default);
|
||||
if (status.ok()) {
|
||||
counter += WriteBatchInternal::Count(&batch);
|
||||
} else {
|
||||
@ -226,6 +234,7 @@ class Repairer {
|
||||
iter, &meta, icmp_, 0, 0, kNoCompression);
|
||||
delete iter;
|
||||
delete mem->Unref();
|
||||
delete cf_mems_default;
|
||||
mem = nullptr;
|
||||
if (status.ok()) {
|
||||
if (meta.file_size > 0) {
|
||||
|
@ -35,18 +35,13 @@ static Slice GetSliceForFileNumber(uint64_t* file_number) {
|
||||
sizeof(*file_number));
|
||||
}
|
||||
|
||||
TableCache::TableCache(const std::string& dbname,
|
||||
const Options* options,
|
||||
const EnvOptions& storage_options,
|
||||
int entries)
|
||||
TableCache::TableCache(const std::string& dbname, const Options* options,
|
||||
const EnvOptions& storage_options, Cache* const cache)
|
||||
: env_(options->env),
|
||||
dbname_(dbname),
|
||||
options_(options),
|
||||
storage_options_(storage_options),
|
||||
cache_(
|
||||
NewLRUCache(entries, options->table_cache_numshardbits,
|
||||
options->table_cache_remove_scan_count_limit)) {
|
||||
}
|
||||
cache_(cache) {}
|
||||
|
||||
TableCache::~TableCache() {
|
||||
}
|
||||
@ -124,7 +119,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
|
||||
TableReader* table_reader = GetTableReaderFromHandle(handle);
|
||||
Iterator* result = table_reader->NewIterator(options);
|
||||
if (!file_meta.table_reader_handle) {
|
||||
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
|
||||
result->RegisterCleanup(&UnrefEntry, cache_, handle);
|
||||
}
|
||||
if (table_reader_ptr != nullptr) {
|
||||
*table_reader_ptr = table_reader;
|
||||
@ -216,8 +211,8 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options,
|
||||
return may_match;
|
||||
}
|
||||
|
||||
void TableCache::Evict(uint64_t file_number) {
|
||||
cache_->Erase(GetSliceForFileNumber(&file_number));
|
||||
void TableCache::Evict(Cache* cache, uint64_t file_number) {
|
||||
cache->Erase(GetSliceForFileNumber(&file_number));
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -30,7 +30,7 @@ struct FileMetaData;
|
||||
class TableCache {
|
||||
public:
|
||||
TableCache(const std::string& dbname, const Options* options,
|
||||
const EnvOptions& storage_options, int entries);
|
||||
const EnvOptions& storage_options, Cache* cache);
|
||||
~TableCache();
|
||||
|
||||
// Return an iterator for the specified file number (the corresponding
|
||||
@ -64,7 +64,7 @@ class TableCache {
|
||||
const Slice& internal_prefix, bool* table_io);
|
||||
|
||||
// Evict any entry for the specified file number
|
||||
void Evict(uint64_t file_number);
|
||||
static void Evict(Cache* cache, uint64_t file_number);
|
||||
|
||||
// Find table reader
|
||||
Status FindTable(const EnvOptions& toptions,
|
||||
@ -95,7 +95,7 @@ class TableCache {
|
||||
const std::string dbname_;
|
||||
const Options* options_;
|
||||
const EnvOptions& storage_options_;
|
||||
std::shared_ptr<Cache> cache_;
|
||||
Cache* const cache_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -8,15 +8,19 @@
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include "db/db_impl.h"
|
||||
#include "db/column_family.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
TailingIterator::TailingIterator(DBImpl* db, const ReadOptions& options,
|
||||
const Comparator* comparator)
|
||||
: db_(db), options_(options), comparator_(comparator),
|
||||
version_number_(0), current_(nullptr),
|
||||
ColumnFamilyData* cfd)
|
||||
: db_(db),
|
||||
options_(options),
|
||||
cfd_(cfd),
|
||||
version_number_(0),
|
||||
current_(nullptr),
|
||||
status_(Status::InvalidArgument("Seek() not called on this iterator")) {}
|
||||
|
||||
bool TailingIterator::Valid() const {
|
||||
@ -53,10 +57,9 @@ void TailingIterator::Seek(const Slice& target) {
|
||||
// 'target' -- in this case, prev_key_ is included in the interval, so
|
||||
// prev_inclusive_ has to be set.
|
||||
|
||||
if (!is_prev_set_ ||
|
||||
comparator_->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
|
||||
(immutable_->Valid() &&
|
||||
comparator_->Compare(target, immutable_->key()) > 0) ||
|
||||
const Comparator* cmp = cfd_->user_comparator();
|
||||
if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
|
||||
(immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) ||
|
||||
(options_.prefix_seek && !IsSamePrefix(target))) {
|
||||
SeekImmutable(target);
|
||||
}
|
||||
@ -121,7 +124,7 @@ void TailingIterator::SeekToLast() {
|
||||
|
||||
void TailingIterator::CreateIterators() {
|
||||
std::pair<Iterator*, Iterator*> iters =
|
||||
db_->GetTailingIteratorPair(options_, &version_number_);
|
||||
db_->GetTailingIteratorPair(options_, cfd_, &version_number_);
|
||||
|
||||
assert(iters.first && iters.second);
|
||||
|
||||
@ -137,9 +140,10 @@ void TailingIterator::UpdateCurrent() {
|
||||
if (mutable_->Valid()) {
|
||||
current_ = mutable_.get();
|
||||
}
|
||||
const Comparator* cmp = cfd_->user_comparator();
|
||||
if (immutable_->Valid() &&
|
||||
(current_ == nullptr ||
|
||||
comparator_->Compare(immutable_->key(), current_->key()) < 0)) {
|
||||
cmp->Compare(immutable_->key(), current_->key()) < 0)) {
|
||||
current_ = immutable_.get();
|
||||
}
|
||||
|
||||
@ -151,11 +155,11 @@ void TailingIterator::UpdateCurrent() {
|
||||
|
||||
bool TailingIterator::IsCurrentVersion() const {
|
||||
return mutable_ != nullptr && immutable_ != nullptr &&
|
||||
version_number_ == db_->CurrentVersionNumber();
|
||||
version_number_ == cfd_->GetSuperVersionNumber();
|
||||
}
|
||||
|
||||
bool TailingIterator::IsSamePrefix(const Slice& target) const {
|
||||
const SliceTransform* extractor = db_->options_.prefix_extractor.get();
|
||||
const SliceTransform* extractor = cfd_->options()->prefix_extractor.get();
|
||||
|
||||
assert(extractor);
|
||||
assert(is_prev_set_);
|
||||
|
@ -13,6 +13,7 @@
|
||||
namespace rocksdb {
|
||||
|
||||
class DBImpl;
|
||||
class ColumnFamilyData;
|
||||
|
||||
/**
|
||||
* TailingIterator is a special type of iterator that doesn't use an (implicit)
|
||||
@ -25,7 +26,7 @@ class DBImpl;
|
||||
class TailingIterator : public Iterator {
|
||||
public:
|
||||
TailingIterator(DBImpl* db, const ReadOptions& options,
|
||||
const Comparator* comparator);
|
||||
ColumnFamilyData* cfd);
|
||||
virtual ~TailingIterator() {}
|
||||
|
||||
virtual bool Valid() const override;
|
||||
@ -41,7 +42,7 @@ class TailingIterator : public Iterator {
|
||||
private:
|
||||
DBImpl* const db_;
|
||||
const ReadOptions options_;
|
||||
const Comparator* const comparator_;
|
||||
ColumnFamilyData* const cfd_;
|
||||
uint64_t version_number_;
|
||||
|
||||
// TailingIterator merges the contents of the two iterators below (one using
|
||||
|
@ -9,7 +9,7 @@
|
||||
namespace rocksdb {
|
||||
|
||||
TransactionLogIteratorImpl::TransactionLogIteratorImpl(
|
||||
const std::string& dir, const Options* options,
|
||||
const std::string& dir, const DBOptions* options,
|
||||
const TransactionLogIterator::ReadOptions& read_options,
|
||||
const EnvOptions& soptions, const SequenceNumber seq,
|
||||
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
|
||||
|
@ -67,7 +67,7 @@ class LogFileImpl : public LogFile {
|
||||
class TransactionLogIteratorImpl : public TransactionLogIterator {
|
||||
public:
|
||||
TransactionLogIteratorImpl(
|
||||
const std::string& dir, const Options* options,
|
||||
const std::string& dir, const DBOptions* options,
|
||||
const TransactionLogIterator::ReadOptions& read_options,
|
||||
const EnvOptions& soptions, const SequenceNumber seqNum,
|
||||
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
|
||||
@ -82,7 +82,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
|
||||
|
||||
private:
|
||||
const std::string& dir_;
|
||||
const Options* options_;
|
||||
const DBOptions* options_;
|
||||
const TransactionLogIterator::ReadOptions read_options_;
|
||||
const EnvOptions& soptions_;
|
||||
SequenceNumber startingSequenceNumber_;
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
#include "db/version_set.h"
|
||||
#include "util/coding.h"
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -29,6 +30,11 @@ enum Tag {
|
||||
|
||||
// these are new formats divergent from open source leveldb
|
||||
kNewFile2 = 100, // store smallest & largest seqno
|
||||
|
||||
kColumnFamily = 200, // specify column family for version edit
|
||||
kColumnFamilyAdd = 201,
|
||||
kColumnFamilyDrop = 202,
|
||||
kMaxColumnFamily = 203,
|
||||
};
|
||||
|
||||
void VersionEdit::Clear() {
|
||||
@ -38,13 +44,19 @@ void VersionEdit::Clear() {
|
||||
prev_log_number_ = 0;
|
||||
last_sequence_ = 0;
|
||||
next_file_number_ = 0;
|
||||
max_column_family_ = 0;
|
||||
has_comparator_ = false;
|
||||
has_log_number_ = false;
|
||||
has_prev_log_number_ = false;
|
||||
has_next_file_number_ = false;
|
||||
has_last_sequence_ = false;
|
||||
has_max_column_family_ = false;
|
||||
deleted_files_.clear();
|
||||
new_files_.clear();
|
||||
column_family_ = 0;
|
||||
is_column_family_add_ = 0;
|
||||
is_column_family_drop_ = 0;
|
||||
column_family_name_.clear();
|
||||
}
|
||||
|
||||
void VersionEdit::EncodeTo(std::string* dst) const {
|
||||
@ -68,6 +80,10 @@ void VersionEdit::EncodeTo(std::string* dst) const {
|
||||
PutVarint32(dst, kLastSequence);
|
||||
PutVarint64(dst, last_sequence_);
|
||||
}
|
||||
if (has_max_column_family_) {
|
||||
PutVarint32(dst, kMaxColumnFamily);
|
||||
PutVarint32(dst, max_column_family_);
|
||||
}
|
||||
|
||||
for (const auto& deleted : deleted_files_) {
|
||||
PutVarint32(dst, kDeletedFile);
|
||||
@ -86,6 +102,21 @@ void VersionEdit::EncodeTo(std::string* dst) const {
|
||||
PutVarint64(dst, f.smallest_seqno);
|
||||
PutVarint64(dst, f.largest_seqno);
|
||||
}
|
||||
|
||||
// 0 is default and does not need to be explicitly written
|
||||
if (column_family_ != 0) {
|
||||
PutVarint32(dst, kColumnFamily);
|
||||
PutVarint32(dst, column_family_);
|
||||
}
|
||||
|
||||
if (is_column_family_add_) {
|
||||
PutVarint32(dst, kColumnFamilyAdd);
|
||||
PutLengthPrefixedSlice(dst, Slice(column_family_name_));
|
||||
}
|
||||
|
||||
if (is_column_family_drop_) {
|
||||
PutVarint32(dst, kColumnFamilyDrop);
|
||||
}
|
||||
}
|
||||
|
||||
static bool GetInternalKey(Slice* input, InternalKey* dst) {
|
||||
@ -167,6 +198,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
|
||||
}
|
||||
break;
|
||||
|
||||
case kMaxColumnFamily:
|
||||
if (GetVarint32(&input, &max_column_family_)) {
|
||||
has_max_column_family_ = true;
|
||||
} else {
|
||||
msg = "max column family";
|
||||
}
|
||||
break;
|
||||
|
||||
case kCompactPointer:
|
||||
if (GetLevel(&input, &level, &msg) &&
|
||||
GetInternalKey(&input, &key)) {
|
||||
@ -221,6 +260,29 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
|
||||
}
|
||||
break;
|
||||
|
||||
case kColumnFamily:
|
||||
if (!GetVarint32(&input, &column_family_)) {
|
||||
if (!msg) {
|
||||
msg = "set column family id";
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case kColumnFamilyAdd:
|
||||
if (GetLengthPrefixedSlice(&input, &str)) {
|
||||
is_column_family_add_ = true;
|
||||
column_family_name_ = str.ToString();
|
||||
} else {
|
||||
if (!msg) {
|
||||
msg = "column family add";
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case kColumnFamilyDrop:
|
||||
is_column_family_drop_ = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
msg = "unknown tag";
|
||||
break;
|
||||
@ -282,6 +344,19 @@ std::string VersionEdit::DebugString(bool hex_key) const {
|
||||
r.append(" .. ");
|
||||
r.append(f.largest.DebugString(hex_key));
|
||||
}
|
||||
r.append("\n ColumnFamily: ");
|
||||
AppendNumberTo(&r, column_family_);
|
||||
if (is_column_family_add_) {
|
||||
r.append("\n ColumnFamilyAdd: ");
|
||||
r.append(column_family_name_);
|
||||
}
|
||||
if (is_column_family_drop_) {
|
||||
r.append("\n ColumnFamilyDrop");
|
||||
}
|
||||
if (has_max_column_family_) {
|
||||
r.append("\n MaxColumnFamily: ");
|
||||
AppendNumberTo(&r, max_column_family_);
|
||||
}
|
||||
r.append("\n}\n");
|
||||
return r;
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "rocksdb/cache.h"
|
||||
#include "db/dbformat.h"
|
||||
|
||||
@ -32,11 +33,14 @@ struct FileMetaData {
|
||||
// Needs to be disposed when refs becomes 0.
|
||||
Cache::Handle* table_reader_handle;
|
||||
|
||||
FileMetaData(uint64_t number, uint64_t file_size) :
|
||||
refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size),
|
||||
being_compacted(false), table_reader_handle(nullptr) {
|
||||
}
|
||||
FileMetaData() : FileMetaData(0, 0) { }
|
||||
FileMetaData(uint64_t number, uint64_t file_size)
|
||||
: refs(0),
|
||||
allowed_seeks(1 << 30),
|
||||
number(number),
|
||||
file_size(file_size),
|
||||
being_compacted(false),
|
||||
table_reader_handle(nullptr) {}
|
||||
FileMetaData() : FileMetaData(0, 0) {}
|
||||
};
|
||||
|
||||
class VersionEdit {
|
||||
@ -66,6 +70,10 @@ class VersionEdit {
|
||||
has_last_sequence_ = true;
|
||||
last_sequence_ = seq;
|
||||
}
|
||||
// Stores `max_column_family` in this edit and marks the field as present.
// EncodeTo() only writes a kMaxColumnFamily record when that flag is set.
void SetMaxColumnFamily(uint32_t max_column_family) {
  max_column_family_ = max_column_family;
  has_max_column_family_ = true;
}
|
||||
|
||||
// Add the specified file at the specified number.
|
||||
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
|
||||
@ -97,6 +105,31 @@ class VersionEdit {
|
||||
return new_files_.size() + deleted_files_.size();
|
||||
}
|
||||
|
||||
bool IsColumnFamilyManipulation() {
|
||||
return is_column_family_add_ || is_column_family_drop_;
|
||||
}
|
||||
|
||||
// Sets the ID of the column family this edit applies to. An edit that never
// calls this keeps ID 0, which denotes the default column family.
void SetColumnFamily(uint32_t column_family_id) { column_family_ = column_family_id; }
|
||||
|
||||
// set column family ID by calling SetColumnFamily()
|
||||
void AddColumnFamily(const std::string& name) {
|
||||
assert(!is_column_family_drop_);
|
||||
assert(!is_column_family_add_);
|
||||
assert(NumEntries() == 0);
|
||||
is_column_family_add_ = true;
|
||||
column_family_name_ = name;
|
||||
}
|
||||
|
||||
// set column family ID by calling SetColumnFamily()
|
||||
void DropColumnFamily() {
|
||||
assert(!is_column_family_drop_);
|
||||
assert(!is_column_family_add_);
|
||||
assert(NumEntries() == 0);
|
||||
is_column_family_drop_ = true;
|
||||
}
|
||||
|
||||
void EncodeTo(std::string* dst) const;
|
||||
Status DecodeFrom(const Slice& src);
|
||||
|
||||
@ -114,15 +147,27 @@ class VersionEdit {
|
||||
uint64_t log_number_;
|
||||
uint64_t prev_log_number_;
|
||||
uint64_t next_file_number_;
|
||||
uint32_t max_column_family_;
|
||||
SequenceNumber last_sequence_;
|
||||
bool has_comparator_;
|
||||
bool has_log_number_;
|
||||
bool has_prev_log_number_;
|
||||
bool has_next_file_number_;
|
||||
bool has_last_sequence_;
|
||||
bool has_max_column_family_;
|
||||
|
||||
DeletedFileSet deleted_files_;
|
||||
std::vector<std::pair<int, FileMetaData> > new_files_;
|
||||
std::vector<std::pair<int, FileMetaData>> new_files_;
|
||||
|
||||
// Each version edit record should have column_family_id set
|
||||
// If it's not set, it is default (0)
|
||||
uint32_t column_family_;
|
||||
// a version edit can be either column_family add or
|
||||
// column_family drop. If it's column family add,
|
||||
// it also includes column family name.
|
||||
bool is_column_family_drop_;
|
||||
bool is_column_family_add_;
|
||||
std::string column_family_name_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -45,6 +45,19 @@ TEST(VersionEditTest, EncodeDecode) {
|
||||
TestEncodeDecode(edit);
|
||||
}
|
||||
|
||||
// Feeds column-family-specific edits to TestEncodeDecode() (defined earlier
// in this file): first an "add" record, then a "drop" record.
TEST(VersionEditTest, ColumnFamilyTest) {
  VersionEdit edit;

  // "Add" record: max column family, target ID and the new family's name.
  edit.SetMaxColumnFamily(5);
  edit.SetColumnFamily(2);
  edit.AddColumnFamily("column_family");
  TestEncodeDecode(edit);

  // "Drop" record: the same object is reused after Clear() resets it.
  edit.Clear();
  edit.SetColumnFamily(3);
  edit.DropColumnFamily();
  TestEncodeDecode(edit);
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
1023
db/version_set.cc
1023
db/version_set.cc
File diff suppressed because it is too large
Load Diff
131
db/version_set.h
131
db/version_set.h
@ -24,12 +24,15 @@
|
||||
#include <vector>
|
||||
#include <deque>
|
||||
#include <atomic>
|
||||
#include <limits>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "port/port.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/compaction.h"
|
||||
#include "db/compaction_picker.h"
|
||||
#include "db/column_family.h"
|
||||
#include "db/log_reader.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -41,10 +44,12 @@ class Iterator;
|
||||
class LogBuffer;
|
||||
class LookupKey;
|
||||
class MemTable;
|
||||
class MergeContext;
|
||||
class TableCache;
|
||||
class Version;
|
||||
class VersionSet;
|
||||
class MergeContext;
|
||||
class ColumnFamilyData;
|
||||
class ColumnFamilySet;
|
||||
class TableCache;
|
||||
|
||||
// Return the smallest index i such that files[i]->largest >= key.
|
||||
// Return files.size() if there is no such file.
|
||||
@ -208,6 +213,7 @@ class Version {
|
||||
friend class Compaction;
|
||||
friend class VersionSet;
|
||||
friend class DBImpl;
|
||||
friend class ColumnFamilyData;
|
||||
friend class CompactionPicker;
|
||||
friend class LevelCompactionPicker;
|
||||
friend class UniversalCompactionPicker;
|
||||
@ -223,6 +229,7 @@ class Version {
|
||||
// record results in files_by_size_. The largest files are listed first.
|
||||
void UpdateFilesBySize();
|
||||
|
||||
ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
|
||||
VersionSet* vset_; // VersionSet to which this Version belongs
|
||||
Version* next_; // Next version in linked list
|
||||
Version* prev_; // Previous version in linked list
|
||||
@ -268,7 +275,7 @@ class Version {
|
||||
// used for debugging and logging purposes only.
|
||||
uint64_t version_number_;
|
||||
|
||||
explicit Version(VersionSet* vset, uint64_t version_number = 0);
|
||||
Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
|
||||
|
||||
~Version();
|
||||
|
||||
@ -285,22 +292,29 @@ class Version {
|
||||
|
||||
class VersionSet {
|
||||
public:
|
||||
VersionSet(const std::string& dbname, const Options* options,
|
||||
const EnvOptions& storage_options, TableCache* table_cache,
|
||||
const InternalKeyComparator*);
|
||||
VersionSet(const std::string& dbname, const DBOptions* options,
|
||||
const EnvOptions& storage_options, Cache* table_cache);
|
||||
~VersionSet();
|
||||
|
||||
// Apply *edit to the current version to form a new descriptor that
|
||||
// is both saved to persistent state and installed as the new
|
||||
// current version. Will release *mu while actually writing to the file.
|
||||
// column_family_options has to be set if edit is column family add
|
||||
// REQUIRES: *mu is held on entry.
|
||||
// REQUIRES: no other thread concurrently calls LogAndApply()
|
||||
Status LogAndApply(VersionEdit* edit, port::Mutex* mu,
|
||||
Directory* db_directory = nullptr,
|
||||
bool new_descriptor_log = false);
|
||||
Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit,
|
||||
port::Mutex* mu, Directory* db_directory = nullptr,
|
||||
bool new_descriptor_log = false,
|
||||
const ColumnFamilyOptions* column_family_options =
|
||||
nullptr);
|
||||
|
||||
// Recover the last saved descriptor from persistent storage.
|
||||
Status Recover();
|
||||
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families);
|
||||
|
||||
// Reads a manifest file and returns a list of column families in
|
||||
// column_families.
|
||||
static Status ListColumnFamilies(std::vector<std::string>* column_families,
|
||||
const std::string& dbname, Env* env);
|
||||
|
||||
// Try to reduce the number of levels. This call is valid when
|
||||
// only one level from the new max level to the old
|
||||
@ -316,15 +330,6 @@ class VersionSet {
|
||||
const EnvOptions& storage_options,
|
||||
int new_levels);
|
||||
|
||||
// Return the current version.
|
||||
Version* current() const { return current_; }
|
||||
|
||||
// A Flag indicating whether write needs to slowdown because of there are
|
||||
// too many number of level0 files.
|
||||
bool NeedSlowdownForNumLevel0Files() const {
|
||||
return need_slowdown_for_num_level0_files_;
|
||||
}
|
||||
|
||||
// Return the current manifest file number
|
||||
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
|
||||
|
||||
@ -358,37 +363,21 @@ class VersionSet {
|
||||
// Mark the specified file number as used.
|
||||
void MarkFileNumberUsed(uint64_t number);
|
||||
|
||||
// Return the current log file number.
|
||||
uint64_t LogNumber() const { return log_number_; }
|
||||
|
||||
// Return the log file number for the log file that is currently
|
||||
// being compacted, or zero if there is no such log file.
|
||||
uint64_t PrevLogNumber() const { return prev_log_number_; }
|
||||
|
||||
int NumberLevels() const { return num_levels_; }
|
||||
|
||||
// Pick level and inputs for a new compaction.
|
||||
// Returns nullptr if there is no compaction to be done.
|
||||
// Otherwise returns a pointer to a heap-allocated object that
|
||||
// describes the compaction. Caller should delete the result.
|
||||
Compaction* PickCompaction(LogBuffer* log_buffer);
|
||||
|
||||
// Return a compaction object for compacting the range [begin,end] in
|
||||
// the specified level. Returns nullptr if there is nothing in that
|
||||
// level that overlaps the specified range. Caller should delete
|
||||
// the result.
|
||||
//
|
||||
// The returned Compaction might not include the whole requested range.
|
||||
// In that case, compaction_end will be set to the next key that needs
|
||||
// compacting. In case the compaction will compact the whole range,
|
||||
// compaction_end will be set to nullptr.
|
||||
// Client is responsible for compaction_end storage -- when called,
|
||||
// *compaction_end should point to valid InternalKey!
|
||||
Compaction* CompactRange(int input_level,
|
||||
int output_level,
|
||||
const InternalKey* begin,
|
||||
const InternalKey* end,
|
||||
InternalKey** compaction_end);
|
||||
// Returns the minimum log number such that all
|
||||
// log numbers less than or equal to it can be deleted
|
||||
uint64_t MinLogNumber() const {
|
||||
uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
|
||||
for (auto cfd : *column_family_set_) {
|
||||
if (min_log_num > cfd->GetLogNumber()) {
|
||||
min_log_num = cfd->GetLogNumber();
|
||||
}
|
||||
}
|
||||
return min_log_num;
|
||||
}
|
||||
|
||||
// Create an iterator that reads over the compaction inputs for "*c".
|
||||
// The caller should delete the iterator when no longer needed.
|
||||
@ -414,62 +403,53 @@ class VersionSet {
|
||||
// pick the same files to compact.
|
||||
bool VerifyCompactionFileConsistency(Compaction* c);
|
||||
|
||||
double MaxBytesForLevel(int level);
|
||||
|
||||
// Get the max file size in a given level.
|
||||
uint64_t MaxFileSizeForLevel(int level);
|
||||
|
||||
void ReleaseCompactionFiles(Compaction* c, Status status);
|
||||
|
||||
Status GetMetadataForFile(
|
||||
uint64_t number, int *filelevel, FileMetaData **metadata);
|
||||
Status GetMetadataForFile(uint64_t number, int* filelevel,
|
||||
FileMetaData** metadata, ColumnFamilyData** cfd);
|
||||
|
||||
void GetLiveFilesMetaData(
|
||||
std::vector<LiveFileMetaData> *metadata);
|
||||
|
||||
void GetObsoleteFiles(std::vector<FileMetaData*>* files);
|
||||
|
||||
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
|
||||
|
||||
private:
|
||||
class Builder;
|
||||
struct ManifestWriter;
|
||||
|
||||
friend class Compaction;
|
||||
friend class Version;
|
||||
|
||||
struct LogReporter : public log::Reader::Reporter {
|
||||
Status* status;
|
||||
virtual void Corruption(size_t bytes, const Status& s) {
|
||||
if (this->status->ok()) *this->status = s;
|
||||
}
|
||||
};
|
||||
|
||||
// Save current contents to *log
|
||||
Status WriteSnapshot(log::Writer* log);
|
||||
|
||||
void AppendVersion(Version* v);
|
||||
void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
|
||||
|
||||
bool ManifestContains(uint64_t manifest_file_number,
|
||||
const std::string& record) const;
|
||||
|
||||
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
|
||||
VersionEdit* edit);
|
||||
|
||||
std::unique_ptr<ColumnFamilySet> column_family_set_;
|
||||
|
||||
Env* const env_;
|
||||
const std::string dbname_;
|
||||
const Options* const options_;
|
||||
TableCache* const table_cache_;
|
||||
const InternalKeyComparator icmp_;
|
||||
const DBOptions* const options_;
|
||||
uint64_t next_file_number_;
|
||||
uint64_t manifest_file_number_;
|
||||
uint64_t pending_manifest_file_number_;
|
||||
std::atomic<uint64_t> last_sequence_;
|
||||
uint64_t log_number_;
|
||||
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
|
||||
|
||||
int num_levels_;
|
||||
|
||||
// Opened lazily
|
||||
unique_ptr<log::Writer> descriptor_log_;
|
||||
Version dummy_versions_; // Head of circular doubly-linked list of versions.
|
||||
Version* current_; // == dummy_versions_.prev_
|
||||
|
||||
// A flag indicating whether we should delay writes because
|
||||
// we have too many level 0 files
|
||||
bool need_slowdown_for_num_level0_files_;
|
||||
|
||||
// An object that keeps all the compaction stats
|
||||
// and picks the next compaction
|
||||
std::unique_ptr<CompactionPicker> compaction_picker_;
|
||||
|
||||
// generates an increasing version number for every new version
|
||||
uint64_t current_version_number_;
|
||||
@ -493,8 +473,9 @@ class VersionSet {
|
||||
VersionSet(const VersionSet&);
|
||||
void operator=(const VersionSet&);
|
||||
|
||||
void LogAndApplyHelper(Builder*b, Version* v,
|
||||
VersionEdit* edit, port::Mutex* mu);
|
||||
void LogAndApplyCFHelper(VersionEdit* edit);
|
||||
void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
|
||||
VersionEdit* edit, port::Mutex* mu);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -15,6 +15,9 @@
|
||||
// kTypeValue varstring varstring
|
||||
// kTypeMerge varstring varstring
|
||||
// kTypeDeletion varstring
|
||||
// kTypeColumnFamilyValue varint32 varstring varstring
|
||||
// kTypeColumnFamilyMerge varint32 varstring varstring
|
||||
// kTypeColumnFamilyDeletion varint32 varstring
|
||||
// varstring :=
|
||||
// len: varint32
|
||||
// data: uint8[len]
|
||||
@ -45,10 +48,20 @@ WriteBatch::~WriteBatch() { }
|
||||
|
||||
WriteBatch::Handler::~Handler() { }
|
||||
|
||||
// Default Put() handler: always throws std::runtime_error. A handler
// subclass needs to implement either Put or PutCF.
void WriteBatch::Handler::Put(const Slice& /*key*/, const Slice& /*value*/) {
  throw std::runtime_error("Handler::Put not implemented!");
}
|
||||
|
||||
// Default Merge() handler: always throws std::runtime_error.
void WriteBatch::Handler::Merge(const Slice& /*key*/, const Slice& /*value*/) {
  throw std::runtime_error("Handler::Merge not implemented!");
}
|
||||
|
||||
// Default Delete() handler: always throws std::runtime_error. A handler
// subclass needs to implement either Delete or DeleteCF.
void WriteBatch::Handler::Delete(const Slice& /*key*/) {
  throw std::runtime_error("Handler::Delete not implemented!");
}
|
||||
|
||||
void WriteBatch::Handler::LogData(const Slice& blob) {
|
||||
// If the user has not specified something to do with blobs, then we ignore
|
||||
// them.
|
||||
@ -76,31 +89,48 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
||||
input.remove_prefix(kHeader);
|
||||
Slice key, value, blob;
|
||||
int found = 0;
|
||||
while (!input.empty() && handler->Continue()) {
|
||||
Status s;
|
||||
while (s.ok() && !input.empty() && handler->Continue()) {
|
||||
char tag = input[0];
|
||||
input.remove_prefix(1);
|
||||
uint32_t column_family = 0; // default
|
||||
switch (tag) {
|
||||
case kTypeColumnFamilyValue:
|
||||
if (!GetVarint32(&input, &column_family)) {
|
||||
return Status::Corruption("bad WriteBatch Put");
|
||||
}
|
||||
// intentional fallthrough
|
||||
case kTypeValue:
|
||||
if (GetLengthPrefixedSlice(&input, &key) &&
|
||||
GetLengthPrefixedSlice(&input, &value)) {
|
||||
handler->Put(key, value);
|
||||
s = handler->PutCF(column_family, key, value);
|
||||
found++;
|
||||
} else {
|
||||
return Status::Corruption("bad WriteBatch Put");
|
||||
}
|
||||
break;
|
||||
case kTypeColumnFamilyDeletion:
|
||||
if (!GetVarint32(&input, &column_family)) {
|
||||
return Status::Corruption("bad WriteBatch Delete");
|
||||
}
|
||||
// intentional fallthrough
|
||||
case kTypeDeletion:
|
||||
if (GetLengthPrefixedSlice(&input, &key)) {
|
||||
handler->Delete(key);
|
||||
s = handler->DeleteCF(column_family, key);
|
||||
found++;
|
||||
} else {
|
||||
return Status::Corruption("bad WriteBatch Delete");
|
||||
}
|
||||
break;
|
||||
case kTypeColumnFamilyMerge:
|
||||
if (!GetVarint32(&input, &column_family)) {
|
||||
return Status::Corruption("bad WriteBatch Merge");
|
||||
}
|
||||
// intentional fallthrough
|
||||
case kTypeMerge:
|
||||
if (GetLengthPrefixedSlice(&input, &key) &&
|
||||
GetLengthPrefixedSlice(&input, &value)) {
|
||||
handler->Merge(key, value);
|
||||
s = handler->MergeCF(column_family, key, value);
|
||||
found++;
|
||||
} else {
|
||||
return Status::Corruption("bad WriteBatch Merge");
|
||||
@ -117,7 +147,10 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
||||
return Status::Corruption("unknown WriteBatch tag");
|
||||
}
|
||||
}
|
||||
if (found != WriteBatchInternal::Count(this)) {
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
if (found != WriteBatchInternal::Count(this)) {
|
||||
return Status::Corruption("WriteBatch has wrong count");
|
||||
} else {
|
||||
return Status::OK();
|
||||
@ -140,29 +173,76 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
|
||||
EncodeFixed64(&b->rep_[0], seq);
|
||||
}
|
||||
|
||||
void WriteBatch::Put(const Slice& key, const Slice& value) {
|
||||
void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) {
|
||||
uint32_t column_family_id = 0;
|
||||
if (column_family != nullptr) {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
column_family_id = cfh->GetID();
|
||||
}
|
||||
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeValue));
|
||||
if (column_family_id == 0) {
|
||||
rep_.push_back(static_cast<char>(kTypeValue));
|
||||
} else {
|
||||
rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
|
||||
PutVarint32(&rep_, column_family_id);
|
||||
}
|
||||
PutLengthPrefixedSlice(&rep_, key);
|
||||
PutLengthPrefixedSlice(&rep_, value);
|
||||
}
|
||||
|
||||
void WriteBatch::Put(const SliceParts& key, const SliceParts& value) {
|
||||
void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
|
||||
const SliceParts& value) {
|
||||
uint32_t column_family_id = 0;
|
||||
if (column_family != nullptr) {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
column_family_id = cfh->GetID();
|
||||
}
|
||||
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeValue));
|
||||
if (column_family_id == 0) {
|
||||
rep_.push_back(static_cast<char>(kTypeValue));
|
||||
} else {
|
||||
rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
|
||||
PutVarint32(&rep_, column_family_id);
|
||||
}
|
||||
PutLengthPrefixedSliceParts(&rep_, key);
|
||||
PutLengthPrefixedSliceParts(&rep_, value);
|
||||
}
|
||||
|
||||
void WriteBatch::Delete(const Slice& key) {
|
||||
void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
|
||||
uint32_t column_family_id = 0;
|
||||
if (column_family != nullptr) {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
column_family_id = cfh->GetID();
|
||||
}
|
||||
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeDeletion));
|
||||
if (column_family_id == 0) {
|
||||
rep_.push_back(static_cast<char>(kTypeDeletion));
|
||||
} else {
|
||||
rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
|
||||
PutVarint32(&rep_, column_family_id);
|
||||
}
|
||||
PutLengthPrefixedSlice(&rep_, key);
|
||||
}
|
||||
|
||||
void WriteBatch::Merge(const Slice& key, const Slice& value) {
|
||||
void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) {
|
||||
uint32_t column_family_id = 0;
|
||||
if (column_family != nullptr) {
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
column_family_id = cfh->GetID();
|
||||
}
|
||||
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeMerge));
|
||||
if (column_family_id == 0) {
|
||||
rep_.push_back(static_cast<char>(kTypeMerge));
|
||||
} else {
|
||||
rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
|
||||
PutVarint32(&rep_, column_family_id);
|
||||
}
|
||||
PutLengthPrefixedSlice(&rep_, key);
|
||||
PutLengthPrefixedSlice(&rep_, value);
|
||||
}
|
||||
@ -176,33 +256,70 @@ namespace {
|
||||
class MemTableInserter : public WriteBatch::Handler {
|
||||
public:
|
||||
SequenceNumber sequence_;
|
||||
MemTable* mem_;
|
||||
const Options* options_;
|
||||
ColumnFamilyMemTables* cf_mems_;
|
||||
bool recovery_;
|
||||
uint64_t log_number_;
|
||||
DBImpl* db_;
|
||||
const bool filter_deletes_;
|
||||
const bool dont_filter_deletes_;
|
||||
|
||||
MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts,
|
||||
DB* db, const bool filter_deletes)
|
||||
: sequence_(sequence),
|
||||
mem_(mem),
|
||||
options_(opts),
|
||||
db_(reinterpret_cast<DBImpl*>(db)),
|
||||
filter_deletes_(filter_deletes) {
|
||||
assert(mem_);
|
||||
if (filter_deletes_) {
|
||||
assert(options_);
|
||||
MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
|
||||
bool recovery, uint64_t log_number, DB* db,
|
||||
const bool dont_filter_deletes)
|
||||
: sequence_(sequence),
|
||||
cf_mems_(cf_mems),
|
||||
recovery_(recovery),
|
||||
log_number_(log_number),
|
||||
db_(reinterpret_cast<DBImpl*>(db)),
|
||||
dont_filter_deletes_(dont_filter_deletes) {
|
||||
assert(cf_mems);
|
||||
if (!dont_filter_deletes_) {
|
||||
assert(db_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Put(const Slice& key, const Slice& value) {
|
||||
if (!options_->inplace_update_support) {
|
||||
mem_->Add(sequence_, kTypeValue, key, value);
|
||||
} else if (options_->inplace_callback == nullptr) {
|
||||
mem_->Update(sequence_, key, value);
|
||||
RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED);
|
||||
bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
|
||||
bool found = cf_mems_->Seek(column_family_id);
|
||||
if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) {
|
||||
// if in recovery envoronment:
|
||||
// * If column family was not found, it might mean that the WAL write
|
||||
// batch references to the column family that was dropped after the
|
||||
// insert. We don't want to fail the whole write batch in that case -- we
|
||||
// just ignore the update.
|
||||
// * If log_number_ < cf_mems_->GetLogNumber(), this means that column
|
||||
// family already contains updates from this log. We can't apply updates
|
||||
// twice because of update-in-place or merge workloads -- ignore the
|
||||
// update
|
||||
*s = Status::OK();
|
||||
return false;
|
||||
}
|
||||
if (!found) {
|
||||
assert(!recovery_);
|
||||
// If the column family was not found in non-recovery enviornment
|
||||
// (client's write code-path), we have to fail the write and return
|
||||
// the failure status to the client.
|
||||
*s = Status::InvalidArgument(
|
||||
"Invalid column family specified in write batch");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) {
|
||||
Status seek_status;
|
||||
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
|
||||
++sequence_;
|
||||
return seek_status;
|
||||
}
|
||||
MemTable* mem = cf_mems_->GetMemTable();
|
||||
const Options* options = cf_mems_->GetOptions();
|
||||
if (!options->inplace_update_support) {
|
||||
mem->Add(sequence_, kTypeValue, key, value);
|
||||
} else if (options->inplace_callback == nullptr) {
|
||||
mem->Update(sequence_, key, value);
|
||||
RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED);
|
||||
} else {
|
||||
if (mem_->UpdateCallback(sequence_, key, value, *options_)) {
|
||||
if (mem->UpdateCallback(sequence_, key, value, *options)) {
|
||||
} else {
|
||||
// key not found in memtable. Do sst get, update, add
|
||||
SnapshotImpl read_from_snapshot;
|
||||
@ -212,21 +329,26 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
|
||||
std::string prev_value;
|
||||
std::string merged_value;
|
||||
Status s = db_->Get(ropts, key, &prev_value);
|
||||
|
||||
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
|
||||
if (cf_handle == nullptr) {
|
||||
cf_handle = db_->DefaultColumnFamily();
|
||||
}
|
||||
Status s = db_->Get(ropts, cf_handle, key, &prev_value);
|
||||
|
||||
char* prev_buffer = const_cast<char*>(prev_value.c_str());
|
||||
uint32_t prev_size = prev_value.size();
|
||||
auto status =
|
||||
options_->inplace_callback(s.ok() ? prev_buffer: nullptr,
|
||||
s.ok() ? &prev_size: nullptr,
|
||||
value, &merged_value);
|
||||
auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr,
|
||||
s.ok() ? &prev_size : nullptr,
|
||||
value, &merged_value);
|
||||
if (status == UpdateStatus::UPDATED_INPLACE) {
|
||||
// prev_value is updated in-place with final value.
|
||||
mem_->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
|
||||
RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN);
|
||||
mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
|
||||
RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
|
||||
} else if (status == UpdateStatus::UPDATED) {
|
||||
// merged_value contains the final value.
|
||||
mem_->Add(sequence_, kTypeValue, key, Slice(merged_value));
|
||||
RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN);
|
||||
mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
|
||||
RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -234,19 +356,28 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
// sequence number. Even if the update eventually fails and does not result
|
||||
// in memtable add/update.
|
||||
sequence_++;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual void Merge(const Slice& key, const Slice& value) {
|
||||
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) {
|
||||
Status seek_status;
|
||||
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
|
||||
++sequence_;
|
||||
return seek_status;
|
||||
}
|
||||
MemTable* mem = cf_mems_->GetMemTable();
|
||||
const Options* options = cf_mems_->GetOptions();
|
||||
bool perform_merge = false;
|
||||
|
||||
if (options_->max_successive_merges > 0 && db_ != nullptr) {
|
||||
if (options->max_successive_merges > 0 && db_ != nullptr) {
|
||||
LookupKey lkey(key, sequence_);
|
||||
|
||||
// Count the number of successive merges at the head
|
||||
// of the key in the memtable
|
||||
size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey);
|
||||
size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
|
||||
|
||||
if (num_merges >= options_->max_successive_merges) {
|
||||
if (num_merges >= options->max_successive_merges) {
|
||||
perform_merge = true;
|
||||
}
|
||||
}
|
||||
@ -262,62 +393,78 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
ReadOptions read_options;
|
||||
read_options.snapshot = &read_from_snapshot;
|
||||
|
||||
db_->Get(read_options, key, &get_value);
|
||||
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
|
||||
if (cf_handle == nullptr) {
|
||||
cf_handle = db_->DefaultColumnFamily();
|
||||
}
|
||||
db_->Get(read_options, cf_handle, key, &get_value);
|
||||
Slice get_value_slice = Slice(get_value);
|
||||
|
||||
// 2) Apply this merge
|
||||
auto merge_operator = options_->merge_operator.get();
|
||||
auto merge_operator = options->merge_operator.get();
|
||||
assert(merge_operator);
|
||||
|
||||
std::deque<std::string> operands;
|
||||
operands.push_front(value.ToString());
|
||||
std::string new_value;
|
||||
if (!merge_operator->FullMerge(key,
|
||||
&get_value_slice,
|
||||
operands,
|
||||
&new_value,
|
||||
options_->info_log.get())) {
|
||||
if (!merge_operator->FullMerge(key, &get_value_slice, operands,
|
||||
&new_value, options->info_log.get())) {
|
||||
// Failed to merge!
|
||||
RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES);
|
||||
RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES);
|
||||
|
||||
// Store the delta in memtable
|
||||
perform_merge = false;
|
||||
} else {
|
||||
// 3) Add value to memtable
|
||||
mem_->Add(sequence_, kTypeValue, key, new_value);
|
||||
mem->Add(sequence_, kTypeValue, key, new_value);
|
||||
}
|
||||
}
|
||||
|
||||
if (!perform_merge) {
|
||||
// Add merge operator to memtable
|
||||
mem_->Add(sequence_, kTypeMerge, key, value);
|
||||
mem->Add(sequence_, kTypeMerge, key, value);
|
||||
}
|
||||
|
||||
sequence_++;
|
||||
return Status::OK();
|
||||
}
|
||||
virtual void Delete(const Slice& key) {
|
||||
if (filter_deletes_) {
|
||||
|
||||
virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
|
||||
Status seek_status;
|
||||
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
|
||||
++sequence_;
|
||||
return seek_status;
|
||||
}
|
||||
MemTable* mem = cf_mems_->GetMemTable();
|
||||
const Options* options = cf_mems_->GetOptions();
|
||||
if (!dont_filter_deletes_ && options->filter_deletes) {
|
||||
SnapshotImpl read_from_snapshot;
|
||||
read_from_snapshot.number_ = sequence_;
|
||||
ReadOptions ropts;
|
||||
ropts.snapshot = &read_from_snapshot;
|
||||
std::string value;
|
||||
if (!db_->KeyMayExist(ropts, key, &value)) {
|
||||
RecordTick(options_->statistics.get(), NUMBER_FILTERED_DELETES);
|
||||
return;
|
||||
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
|
||||
if (cf_handle == nullptr) {
|
||||
cf_handle = db_->DefaultColumnFamily();
|
||||
}
|
||||
if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
|
||||
RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES);
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
mem_->Add(sequence_, kTypeDeletion, key, Slice());
|
||||
mem->Add(sequence_, kTypeDeletion, key, Slice());
|
||||
sequence_++;
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem,
|
||||
const Options* opts, DB* db,
|
||||
const bool filter_deletes) {
|
||||
MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db,
|
||||
filter_deletes);
|
||||
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
|
||||
ColumnFamilyMemTables* memtables,
|
||||
bool recovery, uint64_t log_number,
|
||||
DB* db, const bool dont_filter_deletes) {
|
||||
MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables,
|
||||
recovery, log_number, db, dont_filter_deletes);
|
||||
return b->Iterate(&inserter);
|
||||
}
|
||||
|
||||
|
@ -17,6 +17,49 @@ namespace rocksdb {
|
||||
|
||||
class MemTable;
|
||||
|
||||
class ColumnFamilyMemTables {
|
||||
public:
|
||||
virtual ~ColumnFamilyMemTables() {}
|
||||
virtual bool Seek(uint32_t column_family_id) = 0;
|
||||
// returns the column family's log number; an update read from a log with a smaller number is ignored
|
||||
// (useful when recovering from log whose updates have already
|
||||
// been processed)
|
||||
virtual uint64_t GetLogNumber() const = 0;
|
||||
virtual MemTable* GetMemTable() const = 0;
|
||||
virtual const Options* GetOptions() const = 0;
|
||||
virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
|
||||
};
|
||||
|
||||
class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
|
||||
public:
|
||||
ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options)
|
||||
: ok_(false), mem_(mem), options_(options) {}
|
||||
|
||||
bool Seek(uint32_t column_family_id) override {
|
||||
ok_ = (column_family_id == 0);
|
||||
return ok_;
|
||||
}
|
||||
|
||||
uint64_t GetLogNumber() const override { return 0; }
|
||||
|
||||
MemTable* GetMemTable() const override {
|
||||
assert(ok_);
|
||||
return mem_;
|
||||
}
|
||||
|
||||
const Options* GetOptions() const override {
|
||||
assert(ok_);
|
||||
return options_;
|
||||
}
|
||||
|
||||
ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
|
||||
|
||||
private:
|
||||
bool ok_;
|
||||
MemTable* mem_;
|
||||
const Options* const options_;
|
||||
};
|
||||
|
||||
// WriteBatchInternal provides static methods for manipulating a
|
||||
// WriteBatch that we don't want in the public WriteBatch interface.
|
||||
class WriteBatchInternal {
|
||||
@ -45,11 +88,21 @@ class WriteBatchInternal {
|
||||
static void SetContents(WriteBatch* batch, const Slice& contents);
|
||||
|
||||
// Inserts batch entries into memtable
|
||||
// Drops deletes in batch if filter_del is set to true and
|
||||
// db->KeyMayExist returns false
|
||||
static Status InsertInto(const WriteBatch* batch, MemTable* memtable,
|
||||
const Options* opts, DB* db = nullptr,
|
||||
const bool filter_del = false);
|
||||
// If dont_filter_deletes is false AND options.filter_deletes is true,
|
||||
// then --> Drops deletes in batch if db->KeyMayExist returns false
|
||||
// If recovery == true, this means InsertInto is executed on a recovery
|
||||
// code-path. WriteBatch referencing a dropped column family can be
|
||||
// found on a recovery code-path and should be ignored (recovery should not
|
||||
// fail). Additionally, the memtable will be updated only if
|
||||
// memtables->GetLogNumber() <= log_number
|
||||
// However, if recovery == false, any WriteBatch referencing
|
||||
// non-existing column family will return a failure. Also, log_number is
|
||||
// ignored in that case
|
||||
static Status InsertInto(const WriteBatch* batch,
|
||||
ColumnFamilyMemTables* memtables,
|
||||
bool recovery = false, uint64_t log_number = 0,
|
||||
DB* db = nullptr,
|
||||
const bool dont_filter_deletes = true);
|
||||
|
||||
static void Append(WriteBatch* dst, const WriteBatch* src);
|
||||
};
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
#include <memory>
|
||||
#include "db/memtable.h"
|
||||
#include "db/column_family.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
@ -27,7 +28,8 @@ static std::string PrintContents(WriteBatch* b) {
|
||||
MemTable* mem = new MemTable(cmp, options);
|
||||
mem->Ref();
|
||||
std::string state;
|
||||
Status s = WriteBatchInternal::InsertInto(b, mem, &options);
|
||||
ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
|
||||
Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default);
|
||||
int count = 0;
|
||||
Iterator* iter = mem->NewIterator();
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
@ -144,17 +146,37 @@ TEST(WriteBatchTest, Append) {
|
||||
namespace {
|
||||
struct TestHandler : public WriteBatch::Handler {
|
||||
std::string seen;
|
||||
virtual void Put(const Slice& key, const Slice& value) {
|
||||
seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
|
||||
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) {
|
||||
if (column_family_id == 0) {
|
||||
seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
|
||||
} else {
|
||||
seen += "PutCF(" + std::to_string(column_family_id) + ", " +
|
||||
key.ToString() + ", " + value.ToString() + ")";
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
virtual void Merge(const Slice& key, const Slice& value) {
|
||||
seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
|
||||
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) {
|
||||
if (column_family_id == 0) {
|
||||
seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
|
||||
} else {
|
||||
seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
|
||||
key.ToString() + ", " + value.ToString() + ")";
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
virtual void LogData(const Slice& blob) {
|
||||
seen += "LogData(" + blob.ToString() + ")";
|
||||
}
|
||||
virtual void Delete(const Slice& key) {
|
||||
seen += "Delete(" + key.ToString() + ")";
|
||||
virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
|
||||
if (column_family_id == 0) {
|
||||
seen += "Delete(" + key.ToString() + ")";
|
||||
} else {
|
||||
seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
|
||||
key.ToString() + ")";
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
}
|
||||
@ -194,21 +216,23 @@ TEST(WriteBatchTest, Continue) {
|
||||
|
||||
struct Handler : public TestHandler {
|
||||
int num_seen = 0;
|
||||
virtual void Put(const Slice& key, const Slice& value) {
|
||||
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) {
|
||||
++num_seen;
|
||||
TestHandler::Put(key, value);
|
||||
return TestHandler::PutCF(column_family_id, key, value);
|
||||
}
|
||||
virtual void Merge(const Slice& key, const Slice& value) {
|
||||
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) {
|
||||
++num_seen;
|
||||
TestHandler::Merge(key, value);
|
||||
return TestHandler::MergeCF(column_family_id, key, value);
|
||||
}
|
||||
virtual void LogData(const Slice& blob) {
|
||||
++num_seen;
|
||||
TestHandler::LogData(blob);
|
||||
}
|
||||
virtual void Delete(const Slice& key) {
|
||||
virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
|
||||
++num_seen;
|
||||
TestHandler::Delete(key);
|
||||
return TestHandler::DeleteCF(column_family_id, key);
|
||||
}
|
||||
virtual bool Continue() override {
|
||||
return num_seen < 3;
|
||||
@ -256,6 +280,42 @@ TEST(WriteBatchTest, PutGatherSlices) {
|
||||
ASSERT_EQ(3, batch.Count());
|
||||
}
|
||||
|
||||
namespace {
|
||||
class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
|
||||
public:
|
||||
ColumnFamilyHandleImplDummy(int id)
|
||||
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
|
||||
uint32_t GetID() const override { return id_; }
|
||||
|
||||
private:
|
||||
uint32_t id_;
|
||||
};
|
||||
} // namespace anonymous
|
||||
|
||||
TEST(WriteBatchTest, ColumnFamiliesBatchTest) {
|
||||
WriteBatch batch;
|
||||
ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
|
||||
batch.Put(&zero, Slice("foo"), Slice("bar"));
|
||||
batch.Put(&two, Slice("twofoo"), Slice("bar2"));
|
||||
batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
|
||||
batch.Delete(&eight, Slice("eightfoo"));
|
||||
batch.Merge(&three, Slice("threethree"), Slice("3three"));
|
||||
batch.Put(&zero, Slice("foo"), Slice("bar"));
|
||||
batch.Merge(Slice("omom"), Slice("nom"));
|
||||
|
||||
TestHandler handler;
|
||||
batch.Iterate(&handler);
|
||||
ASSERT_EQ(
|
||||
"Put(foo, bar)"
|
||||
"PutCF(2, twofoo, bar2)"
|
||||
"PutCF(8, eightfoo, bar8)"
|
||||
"DeleteCF(8, eightfoo)"
|
||||
"MergeCF(3, threethree, 3three)"
|
||||
"Put(foo, bar)"
|
||||
"Merge(omom, nom)",
|
||||
handler.seen);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
@ -243,6 +243,7 @@ extern void rocksdb_options_set_paranoid_checks(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
|
||||
extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
|
||||
extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
|
||||
@ -275,6 +276,8 @@ extern void rocksdb_options_set_expanded_compaction_factor(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_grandparent_overlap_factor(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
|
||||
rocksdb_options_t*, int* level_values, size_t num_levels);
|
||||
extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
|
||||
|
||||
extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
|
||||
@ -330,10 +333,14 @@ extern void rocksdb_options_set_block_size_deviation(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_advise_random_on_open(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_access_hint_on_compaction_start(
|
||||
rocksdb_options_t*, int);
|
||||
extern void rocksdb_options_set_use_adaptive_mutex(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_bytes_per_sync(
|
||||
rocksdb_options_t*, uint64_t);
|
||||
extern void rocksdb_options_set_verify_checksums_in_compaction(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_filter_deletes(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_max_sequential_skip_in_iterations(
|
||||
@ -348,6 +355,7 @@ extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
|
||||
extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
|
||||
extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
|
||||
extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
|
||||
|
||||
extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n);
|
||||
extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec);
|
||||
@ -360,6 +368,16 @@ extern void rocksdb_options_set_memtable_prefix_bloom_probes(
|
||||
rocksdb_options_t*, uint32_t);
|
||||
extern void rocksdb_options_set_max_successive_merges(
|
||||
rocksdb_options_t*, size_t);
|
||||
extern void rocksdb_options_set_min_partial_merge_operands(
|
||||
rocksdb_options_t*, uint32_t);
|
||||
extern void rocksdb_options_set_bloom_locality(
|
||||
rocksdb_options_t*, uint32_t);
|
||||
extern void rocksdb_options_set_allow_thread_local(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_inplace_update_support(
|
||||
rocksdb_options_t*, unsigned char);
|
||||
extern void rocksdb_options_set_inplace_update_num_locks(
|
||||
rocksdb_options_t*, size_t);
|
||||
|
||||
enum {
|
||||
rocksdb_no_compression = 0,
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <stdio.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
@ -23,8 +24,24 @@ namespace rocksdb {
|
||||
|
||||
using std::unique_ptr;
|
||||
|
||||
class ColumnFamilyHandle {
|
||||
public:
|
||||
virtual ~ColumnFamilyHandle() {}
|
||||
};
|
||||
extern const std::string default_column_family_name;
|
||||
|
||||
struct ColumnFamilyDescriptor {
|
||||
std::string name;
|
||||
ColumnFamilyOptions options;
|
||||
ColumnFamilyDescriptor()
|
||||
: name(default_column_family_name), options(ColumnFamilyOptions()) {}
|
||||
ColumnFamilyDescriptor(const std::string& name,
|
||||
const ColumnFamilyOptions& options)
|
||||
: name(name), options(options) {}
|
||||
};
|
||||
|
||||
// Update Makefile if you change these
|
||||
static const int kMajorVersion = 2;
|
||||
static const int kMajorVersion = 3;
|
||||
static const int kMinorVersion = 0;
|
||||
|
||||
struct Options;
|
||||
@ -87,33 +104,80 @@ class DB {
|
||||
// that modify data, like put/delete, will return error.
|
||||
// If the db is opened in read only mode, then no compactions
|
||||
// will happen.
|
||||
// TODO(icanadi): implement OpenForReadOnly that specifies column families.
|
||||
// User can open DB in read-only mode even if not specifying all column
|
||||
// families
|
||||
static Status OpenForReadOnly(const Options& options,
|
||||
const std::string& name, DB** dbptr,
|
||||
bool error_if_log_file_exist = false);
|
||||
|
||||
// Open DB with column families.
|
||||
// db_options specify database specific options
|
||||
// column_families is the vector of all column families you'd like to open,
|
||||
// containing column family name and options. The default column family name
|
||||
// is 'default'.
|
||||
// If everything is OK, handles will on return be the same size
|
||||
// as column_families --- handles[i] will be a handle that you
|
||||
// will use to operate on column family column_families[i]
|
||||
static Status Open(const DBOptions& db_options, const std::string& name,
|
||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
|
||||
|
||||
// ListColumnFamilies will open the DB specified by argument name
|
||||
// and return the list of all column families in that DB
|
||||
// through column_families argument. The ordering of
|
||||
// column families in column_families is unspecified.
|
||||
static Status ListColumnFamilies(const DBOptions& db_options,
|
||||
const std::string& name,
|
||||
std::vector<std::string>* column_families);
|
||||
|
||||
DB() { }
|
||||
virtual ~DB();
|
||||
|
||||
// Create a column_family and return the handle of column family
|
||||
// through the argument handle.
|
||||
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
|
||||
const std::string& column_family_name,
|
||||
ColumnFamilyHandle** handle);
|
||||
|
||||
// Drop a column family specified by column_family handle. This call
|
||||
// only records a drop record in the manifest and prevents the column
|
||||
// family from flushing and compacting.
|
||||
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
|
||||
|
||||
// Set the database entry for "key" to "value".
|
||||
// Returns OK on success, and a non-OK status on error.
|
||||
// Note: consider setting options.sync = true.
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) = 0;
|
||||
Status Put(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Put(options, DefaultColumnFamily(), key, value);
|
||||
}
|
||||
|
||||
// Remove the database entry (if any) for "key". Returns OK on
|
||||
// success, and a non-OK status on error. It is not an error if "key"
|
||||
// did not exist in the database.
|
||||
// Note: consider setting options.sync = true.
|
||||
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
|
||||
virtual Status Delete(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) = 0;
|
||||
Status Delete(const WriteOptions& options, const Slice& key) {
|
||||
return Delete(options, DefaultColumnFamily(), key);
|
||||
}
|
||||
|
||||
// Merge the database entry for "key" with "value". Returns OK on success,
|
||||
// and a non-OK status on error. The semantics of this operation is
|
||||
// determined by the user provided merge_operator when opening DB.
|
||||
// Note: consider setting options.sync = true.
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) = 0;
|
||||
Status Merge(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value) {
|
||||
return Merge(options, DefaultColumnFamily(), key, value);
|
||||
}
|
||||
|
||||
// Apply the specified updates to the database.
|
||||
// Returns OK on success, non-OK on failure.
|
||||
@ -128,8 +192,11 @@ class DB {
|
||||
//
|
||||
// May return some other Status on an error.
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) = 0;
|
||||
Status Get(const ReadOptions& options, const Slice& key, std::string* value) {
|
||||
return Get(options, DefaultColumnFamily(), key, value);
|
||||
}
|
||||
|
||||
// If keys[i] does not exist in the database, then the i'th returned
|
||||
// status will be one for which Status::IsNotFound() is true, and
|
||||
@ -141,9 +208,17 @@ class DB {
|
||||
// Similarly, the number of returned statuses will be the number of keys.
|
||||
// Note: keys will not be "de-duplicated". Duplicate keys will return
|
||||
// duplicate values in order.
|
||||
virtual std::vector<Status> MultiGet(const ReadOptions& options,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) = 0;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
|
||||
std::vector<Status> MultiGet(const ReadOptions& options,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) {
|
||||
return MultiGet(options, std::vector<ColumnFamilyHandle*>(
|
||||
keys.size(), DefaultColumnFamily()),
|
||||
keys, values);
|
||||
}
|
||||
|
||||
// If the key definitely does not exist in the database, then this method
|
||||
// returns false, else true. If the caller wants to obtain value when the key
|
||||
@ -153,14 +228,17 @@ class DB {
|
||||
// to make this lighter weight is to avoid doing any IOs.
|
||||
// Default implementation here returns true and sets 'value_found' to false
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found = nullptr) {
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value, bool* value_found = nullptr) {
|
||||
if (value_found != nullptr) {
|
||||
*value_found = false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool KeyMayExist(const ReadOptions& options, const Slice& key,
|
||||
std::string* value, bool* value_found = nullptr) {
|
||||
return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
|
||||
}
|
||||
|
||||
// Return a heap-allocated iterator over the contents of the database.
|
||||
// The result of NewIterator() is initially invalid (caller must
|
||||
@ -168,7 +246,18 @@ class DB {
|
||||
//
|
||||
// Caller should delete the iterator when it is no longer needed.
|
||||
// The returned iterator should be deleted before this db is deleted.
|
||||
virtual Iterator* NewIterator(const ReadOptions& options) = 0;
|
||||
virtual Iterator* NewIterator(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family) = 0;
|
||||
Iterator* NewIterator(const ReadOptions& options) {
|
||||
return NewIterator(options, DefaultColumnFamily());
|
||||
}
|
||||
// Returns iterators from a consistent database state across multiple
|
||||
// column families. Iterators are heap allocated and need to be deleted
|
||||
// before the db is deleted
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
std::vector<Iterator*>* iterators) = 0;
|
||||
|
||||
// Return a handle to the current DB state. Iterators created with
|
||||
// this handle will all observe a stable snapshot of the current DB
|
||||
@ -194,7 +283,11 @@ class DB {
|
||||
// about the internal operation of the DB.
|
||||
// "rocksdb.sstables" - returns a multi-line string that describes all
|
||||
// of the sstables that make up the db contents.
|
||||
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
|
||||
virtual bool GetProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, std::string* value) = 0;
|
||||
bool GetProperty(const Slice& property, std::string* value) {
|
||||
return GetProperty(DefaultColumnFamily(), property, value);
|
||||
}
|
||||
|
||||
// For each i in [0,n-1], store in "sizes[i]", the approximate
|
||||
// file system space used by keys in "[range[i].start .. range[i].limit)".
|
||||
@ -204,8 +297,12 @@ class DB {
|
||||
// sizes will be one-tenth the size of the corresponding user data size.
|
||||
//
|
||||
// The results may not include the sizes of recently written data.
|
||||
virtual void GetApproximateSizes(const Range* range, int n,
|
||||
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
||||
const Range* range, int n,
|
||||
uint64_t* sizes) = 0;
|
||||
void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
|
||||
GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
|
||||
}
|
||||
|
||||
// Compact the underlying storage for the key range [*begin,*end].
|
||||
// The actual compaction interval might be superset of [*begin, *end].
|
||||
@ -224,19 +321,32 @@ class DB {
|
||||
// hosting all the files. In this case, client could set reduce_level
|
||||
// to true, to move the files back to the minimum level capable of holding
|
||||
// the data set or a given level (specified by non-negative target_level).
|
||||
virtual Status CompactRange(const Slice* begin, const Slice* end,
|
||||
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false,
|
||||
int target_level = -1) = 0;
|
||||
Status CompactRange(const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false, int target_level = -1) {
|
||||
return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
|
||||
target_level);
|
||||
}
|
||||
|
||||
// Number of levels used for this DB.
|
||||
virtual int NumberLevels() = 0;
|
||||
virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
|
||||
int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
|
||||
|
||||
// Maximum level to which a new compacted memtable is pushed if it
|
||||
// does not create overlap.
|
||||
virtual int MaxMemCompactionLevel() = 0;
|
||||
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
|
||||
int MaxMemCompactionLevel() {
|
||||
return MaxMemCompactionLevel(DefaultColumnFamily());
|
||||
}
|
||||
|
||||
// Number of files in level-0 that would stop writes.
|
||||
virtual int Level0StopWriteTrigger() = 0;
|
||||
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
|
||||
int Level0StopWriteTrigger() {
|
||||
return Level0StopWriteTrigger(DefaultColumnFamily());
|
||||
}
|
||||
|
||||
// Get DB name -- the exact same name that was provided as an argument to
|
||||
// DB::Open()
|
||||
@ -246,10 +356,18 @@ class DB {
|
||||
virtual Env* GetEnv() const = 0;
|
||||
|
||||
// Get DB Options that we use
|
||||
virtual const Options& GetOptions() const = 0;
|
||||
virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
|
||||
const = 0;
|
||||
const Options& GetOptions() const {
|
||||
return GetOptions(DefaultColumnFamily());
|
||||
}
|
||||
|
||||
// Flush all mem-table data.
|
||||
virtual Status Flush(const FlushOptions& options) = 0;
|
||||
virtual Status Flush(const FlushOptions& options,
|
||||
ColumnFamilyHandle* column_family) = 0;
|
||||
Status Flush(const FlushOptions& options) {
|
||||
return Flush(options, DefaultColumnFamily());
|
||||
}
|
||||
|
||||
// Prevent file deletions. Compactions will continue to occur,
|
||||
// but no obsolete files will be deleted. Calling this multiple
|
||||
@ -279,9 +397,12 @@ class DB {
|
||||
// Setting flush_memtable to true does Flush before recording the live files.
|
||||
// Setting flush_memtable to false is useful when we don't want to wait for
|
||||
// flush which may have to wait for compaction to complete taking an
|
||||
// indeterminate time. But this will have to use GetSortedWalFiles after
|
||||
// GetLiveFiles to compensate for memtables missed in this snapshot due to the
|
||||
// absence of Flush, by WAL files to recover the database consistently later
|
||||
// indeterminate time.
|
||||
//
|
||||
// In case you have multiple column families, even if flush_memtable is true,
|
||||
// you still need to call GetSortedWalFiles after GetLiveFiles to compensate
|
||||
// for new data that arrived to already-flushed column families while other
|
||||
// column families were flushing
|
||||
virtual Status GetLiveFiles(std::vector<std::string>&,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable = true) = 0;
|
||||
@ -319,7 +440,14 @@ class DB {
|
||||
// be set properly
|
||||
virtual Status GetDbIdentity(std::string& identity) = 0;
|
||||
|
||||
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) = 0;
|
||||
// Returns default column family handle
|
||||
virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
|
||||
|
||||
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
||||
TablePropertiesCollection* props) = 0;
|
||||
Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
|
||||
return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
|
||||
}
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
|
@ -34,7 +34,7 @@ class Slice;
|
||||
class WritableFile;
|
||||
class RandomRWFile;
|
||||
class Directory;
|
||||
struct Options;
|
||||
struct DBOptions;
|
||||
|
||||
using std::unique_ptr;
|
||||
using std::shared_ptr;
|
||||
@ -47,7 +47,7 @@ struct EnvOptions {
|
||||
EnvOptions();
|
||||
|
||||
// construct from Options
|
||||
explicit EnvOptions(const Options& options);
|
||||
explicit EnvOptions(const DBOptions& options);
|
||||
|
||||
// If true, then allow caching of data in environment buffers
|
||||
bool use_os_buffer = true;
|
||||
|
@ -45,6 +45,8 @@ class LookupKey;
|
||||
class Slice;
|
||||
class SliceTransform;
|
||||
|
||||
typedef void* KeyHandle;
|
||||
|
||||
class MemTableRep {
|
||||
public:
|
||||
// KeyComparator provides a means to compare keys, which are internal keys
|
||||
@ -62,11 +64,19 @@ class MemTableRep {
|
||||
virtual ~KeyComparator() { }
|
||||
};
|
||||
|
||||
explicit MemTableRep(Arena* arena) : arena_(arena) {}
|
||||
|
||||
// Allocate a buf of len size for storing key. The idea is that a specific
|
||||
// memtable representation knows its underlying data structure better. By
|
||||
// allowing it to allocate memory, it can possibly put correlated stuff
|
||||
// in consecutive memory area to make processor prefetching more efficient.
|
||||
virtual KeyHandle Allocate(const size_t len, char** buf);
|
||||
|
||||
// Insert key into the collection. (The caller will pack key and value into a
|
||||
// single buffer and pass that in as the parameter to Insert)
|
||||
// single buffer and pass that in as the parameter to Insert).
|
||||
// REQUIRES: nothing that compares equal to key is currently in the
|
||||
// collection.
|
||||
virtual void Insert(const char* key) = 0;
|
||||
virtual void Insert(KeyHandle handle) = 0;
|
||||
|
||||
// Returns true iff an entry that compares equal to key is in the collection.
|
||||
virtual bool Contains(const char* key) const = 0;
|
||||
@ -153,6 +163,8 @@ class MemTableRep {
|
||||
// When *key is an internal key concatenated with the value, returns the
|
||||
// user key.
|
||||
virtual Slice UserKey(const char* key) const;
|
||||
|
||||
Arena* arena_;
|
||||
};
|
||||
|
||||
// This is the base class for all factories that are used by RocksDB to create
|
||||
|
@ -72,8 +72,9 @@ enum UpdateStatus { // Return status For inplace update callback
|
||||
UPDATED = 2, // No inplace update. Merged value set
|
||||
};
|
||||
|
||||
// Options to control the behavior of a database (passed to DB::Open)
|
||||
struct Options {
|
||||
struct Options;
|
||||
|
||||
struct ColumnFamilyOptions {
|
||||
// -------------------
|
||||
// Parameters that affect behavior
|
||||
|
||||
@ -130,38 +131,6 @@ struct Options {
|
||||
// Default: a factory that doesn't provide any object
|
||||
std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
|
||||
|
||||
// If true, the database will be created if it is missing.
|
||||
// Default: false
|
||||
bool create_if_missing;
|
||||
|
||||
// If true, an error is raised if the database already exists.
|
||||
// Default: false
|
||||
bool error_if_exists;
|
||||
|
||||
// If true, the implementation will do aggressive checking of the
|
||||
// data it is processing and will stop early if it detects any
|
||||
// errors. This may have unforeseen ramifications: for example, a
|
||||
// corruption of one DB entry may cause a large number of entries to
|
||||
// become unreadable or for the entire DB to become unopenable.
|
||||
// If any of the writes to the database fails (Put, Delete, Merge, Write),
|
||||
// the database will switch to read-only mode and fail all other
|
||||
// Write operations.
|
||||
// Default: true
|
||||
bool paranoid_checks;
|
||||
|
||||
// Use the specified object to interact with the environment,
|
||||
// e.g. to read/write files, schedule background work, etc.
|
||||
// Default: Env::Default()
|
||||
Env* env;
|
||||
|
||||
// Any internal progress/error information generated by the db will
|
||||
// be written to info_log if it is non-nullptr, or to a file stored
|
||||
// in the same directory as the DB contents if info_log is nullptr.
|
||||
// Default: nullptr
|
||||
shared_ptr<Logger> info_log;
|
||||
|
||||
InfoLogLevel info_log_level;
|
||||
|
||||
// -------------------
|
||||
// Parameters that affect performance
|
||||
|
||||
@ -193,15 +162,6 @@ struct Options {
|
||||
// individual write buffers. Default: 1
|
||||
int min_write_buffer_number_to_merge;
|
||||
|
||||
// Number of open files that can be used by the DB. You may need to
|
||||
// increase this if your database has a large working set. Value -1 means
|
||||
// files opened are always kept open. You can estimate number of files based
|
||||
// on target_file_size_base and target_file_size_multiplier for level-based
|
||||
// compaction. For universal-style compaction, you can usually set it to -1.
|
||||
//
|
||||
// Default: 5000
|
||||
int max_open_files;
|
||||
|
||||
// Control over blocks (user data is stored in a set of blocks, and
|
||||
// a block is the unit of reading from disk).
|
||||
|
||||
@ -369,93 +329,12 @@ struct Options {
|
||||
// stop building a single file in a level->level+1 compaction.
|
||||
int max_grandparent_overlap_factor;
|
||||
|
||||
// If non-null, then we should collect metrics about database operations
|
||||
// Statistics objects should not be shared between DB instances as
|
||||
// it does not use any locks to prevent concurrent updates.
|
||||
shared_ptr<Statistics> statistics;
|
||||
|
||||
// If true, then the contents of data files are not synced
|
||||
// to stable storage. Their contents remain in the OS buffers till the
|
||||
// OS decides to flush them. This option is good for bulk-loading
|
||||
// of data. Once the bulk-loading is complete, please issue a
|
||||
// sync to the OS to flush all dirty buffers to stable storage.
|
||||
// Default: false
|
||||
bool disableDataSync;
|
||||
|
||||
// If true, then every store to stable storage will issue a fsync.
|
||||
// If false, then every store to stable storage will issue a fdatasync.
|
||||
// This parameter should be set to true while storing data to
|
||||
// filesystem like ext3 that can lose files after a reboot.
|
||||
// Default: false
|
||||
bool use_fsync;
|
||||
|
||||
// This number controls how often a new scribe log about
|
||||
// db deploy stats is written out.
|
||||
// -1 indicates no logging at all.
|
||||
// Default value is 1800 (half an hour).
|
||||
int db_stats_log_interval;
|
||||
|
||||
// This specifies the info LOG dir.
|
||||
// If it is empty, the log files will be in the same dir as data.
|
||||
// If it is non empty, the log files will be in the specified dir,
|
||||
// and the db data dir's absolute path will be used as the log file
|
||||
// name's prefix.
|
||||
std::string db_log_dir;
|
||||
|
||||
// This specifies the absolute dir path for write-ahead logs (WAL).
|
||||
// If it is empty, the log files will be in the same dir as data,
|
||||
// dbname is used as the data dir by default
|
||||
// If it is non empty, the log files will be kept in the specified dir.
|
||||
// When destroying the db,
|
||||
// all log files in wal_dir and the dir itself are deleted
|
||||
std::string wal_dir;
|
||||
|
||||
// Disable compaction triggered by seek.
|
||||
// With bloomfilter and fast storage, a miss on one level
|
||||
// is very cheap if the file handle is cached in table cache
|
||||
// (which is true if max_open_files is large).
|
||||
bool disable_seek_compaction;
|
||||
|
||||
// The periodicity when obsolete files get deleted. The default
|
||||
// value is 6 hours. The files that get out of scope by compaction
|
||||
// process will still get automatically deleted on every compaction,
|
||||
// regardless of this setting
|
||||
uint64_t delete_obsolete_files_period_micros;
|
||||
|
||||
// Maximum number of concurrent background jobs, submitted to
|
||||
// the default LOW priority thread pool
|
||||
// Default: 1
|
||||
int max_background_compactions;
|
||||
|
||||
// Maximum number of concurrent background memtable flush jobs, submitted to
|
||||
// the HIGH priority thread pool.
|
||||
// By default, all background jobs (major compaction and memtable flush) go
|
||||
// to the LOW priority pool. If this option is set to a positive number,
|
||||
// memtable flush jobs will be submitted to the HIGH priority pool.
|
||||
// It is important when the same Env is shared by multiple db instances.
|
||||
// Without a separate pool, long running major compaction jobs could
|
||||
// potentially block memtable flush jobs of other db instances, leading to
|
||||
// unnecessary Put stalls.
|
||||
// Default: 1
|
||||
int max_background_flushes;
|
||||
|
||||
// Specify the maximal size of the info log file. If the log file
|
||||
// is larger than `max_log_file_size`, a new info log file will
|
||||
// be created.
|
||||
// If max_log_file_size == 0, all logs will be written to one
|
||||
// log file.
|
||||
size_t max_log_file_size;
|
||||
|
||||
// Time for the info log file to roll (in seconds).
|
||||
// If specified with non-zero value, log file will be rolled
|
||||
// if it has been active longer than `log_file_time_to_roll`.
|
||||
// Default: 0 (disabled)
|
||||
size_t log_file_time_to_roll;
|
||||
|
||||
// Maximal info log files to be kept.
|
||||
// Default: 1000
|
||||
size_t keep_log_file_num;
|
||||
|
||||
// Puts are delayed 0-1 ms when any level has a compaction score that exceeds
|
||||
// soft_rate_limit. This is ignored when == 0.0.
|
||||
// CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
|
||||
@ -473,32 +352,14 @@ struct Options {
|
||||
// Default: 1000
|
||||
unsigned int rate_limit_delay_max_milliseconds;
|
||||
|
||||
// manifest file is rolled over on reaching this limit.
|
||||
// The older manifest file will be deleted.
|
||||
// The default value is MAX_INT so that roll-over does not take place.
|
||||
uint64_t max_manifest_file_size;
|
||||
|
||||
// Disable block cache. If this is set to true,
|
||||
// then no block cache should be used, and the block_cache should
|
||||
// point to a nullptr object.
|
||||
// Default: false
|
||||
bool no_block_cache;
|
||||
|
||||
// Number of shards used for table cache.
|
||||
int table_cache_numshardbits;
|
||||
|
||||
// During data eviction of table's LRU cache, it would be inefficient
|
||||
// to strictly follow LRU because this piece of memory will not really
|
||||
// be released unless its refcount falls to zero. Instead, make two
|
||||
// passes: the first pass will release items with refcount = 1,
|
||||
// and if not enough space is released after scanning the number of
|
||||
// elements specified by this parameter, we will remove items in LRU
|
||||
// order.
|
||||
int table_cache_remove_scan_count_limit;
|
||||
|
||||
// Size of one block in arena memory allocation.
|
||||
//
|
||||
// If <= 0, a proper value is automatically calculated (usually about 1/10 of
|
||||
// size of one block in arena memory allocation.
|
||||
// If <= 0, a proper value is automatically calculated (usually 1/10 of
|
||||
// write_buffer_size).
|
||||
//
|
||||
// There are two additional restrictions on the specified size:
|
||||
@ -512,71 +373,14 @@ struct Options {
|
||||
// Default: 0
|
||||
size_t arena_block_size;
|
||||
|
||||
// Create an Options object with default values for all fields.
|
||||
Options();
|
||||
|
||||
void Dump(Logger* log) const;
|
||||
|
||||
// Set appropriate parameters for bulk loading.
|
||||
// The reason that this is a function that returns "this" instead of a
|
||||
// constructor is to enable chaining of multiple similar calls in the future.
|
||||
//
|
||||
// All data will be in level 0 without any automatic compaction.
|
||||
// It's recommended to manually call CompactRange(NULL, NULL) before reading
|
||||
// from the database, because otherwise the read can be very slow.
|
||||
Options* PrepareForBulkLoad();
|
||||
|
||||
// Disable automatic compactions. Manual compactions can still
|
||||
// be issued on this database.
|
||||
// be issued on this column family
|
||||
bool disable_auto_compactions;
|
||||
|
||||
// The following two fields affect how archived logs will be deleted.
|
||||
// 1. If both set to 0, logs will be deleted asap and will not get into
|
||||
// the archive.
|
||||
// 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
|
||||
// WAL files will be checked every 10 min and if total size is greater
|
||||
// than WAL_size_limit_MB, they will be deleted starting with the
|
||||
// earliest until size_limit is met. All empty files will be deleted.
|
||||
// 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
|
||||
// WAL files will be checked every WAL_ttl_seconds / 2 and those that
|
||||
// are older than WAL_ttl_seconds will be deleted.
|
||||
// 4. If both are not 0, WAL files will be checked every 10 min and both
|
||||
// checks will be performed with ttl being first.
|
||||
uint64_t WAL_ttl_seconds;
|
||||
uint64_t WAL_size_limit_MB;
|
||||
|
||||
// Number of bytes to preallocate (via fallocate) the manifest
|
||||
// files. Default is 4mb, which is reasonable to reduce random IO
|
||||
// as well as prevent overallocation for mounts that preallocate
|
||||
// large amounts of data (such as xfs's allocsize option).
|
||||
size_t manifest_preallocation_size;
|
||||
|
||||
// Purge duplicate/deleted keys when a memtable is flushed to storage.
|
||||
// Default: true
|
||||
bool purge_redundant_kvs_while_flush;
|
||||
|
||||
// Data being read from file storage may be buffered in the OS
|
||||
// Default: true
|
||||
bool allow_os_buffer;
|
||||
|
||||
// Allow the OS to mmap file for reading sst tables. Default: false
|
||||
bool allow_mmap_reads;
|
||||
|
||||
// Allow the OS to mmap file for writing. Default: false
|
||||
bool allow_mmap_writes;
|
||||
|
||||
// Prevent child processes from inheriting open files. Default: true
|
||||
bool is_fd_close_on_exec;
|
||||
|
||||
// Skip log corruption error on recovery (If client is ok with
|
||||
// losing most recent changes)
|
||||
// Default: false
|
||||
bool skip_log_error_on_recovery;
|
||||
|
||||
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
|
||||
// Default: 3600 (1 hour)
|
||||
unsigned int stats_dump_period_sec;
|
||||
|
||||
// This is used to close a block before it reaches the configured
|
||||
// 'block_size'. If the percentage of free space in the current block is less
|
||||
// than this specified number and adding a new record to the block will
|
||||
@ -585,45 +389,17 @@ struct Options {
|
||||
// Default is 10.
|
||||
int block_size_deviation;
|
||||
|
||||
// If set true, will hint the underlying file system that the file
|
||||
// access pattern is random, when a sst file is opened.
|
||||
// Default: true
|
||||
bool advise_random_on_open;
|
||||
|
||||
// Specify the file access pattern once a compaction is started.
|
||||
// It will be applied to all input files of a compaction.
|
||||
// Default: NORMAL
|
||||
enum {
|
||||
NONE,
|
||||
NORMAL,
|
||||
SEQUENTIAL,
|
||||
WILLNEED
|
||||
} access_hint_on_compaction_start;
|
||||
|
||||
// Use adaptive mutex, which spins in the user space before resorting
|
||||
// to kernel. This could reduce context switch when the mutex is not
|
||||
// heavily contended. However, if the mutex is hot, we could end up
|
||||
// wasting spin time.
|
||||
// Default: false
|
||||
bool use_adaptive_mutex;
|
||||
|
||||
// Allows OS to incrementally sync files to disk while they are being
|
||||
// written, asynchronously, in the background.
|
||||
// Issue one request for every bytes_per_sync written. 0 turns it off.
|
||||
// Default: 0
|
||||
uint64_t bytes_per_sync;
|
||||
|
||||
// The compaction style. Default: kCompactionStyleLevel
|
||||
CompactionStyle compaction_style;
|
||||
|
||||
// The options needed to support Universal Style compactions
|
||||
CompactionOptionsUniversal compaction_options_universal;
|
||||
|
||||
// If true, compaction will verify checksum on every read that happens
|
||||
// as part of compaction
|
||||
// Default: true
|
||||
bool verify_checksums_in_compaction;
|
||||
|
||||
// The options needed to support Universal Style compactions
|
||||
CompactionOptionsUniversal compaction_options_universal;
|
||||
|
||||
// Use KeyMayExist API to filter deletes when this is true.
|
||||
// If KeyMayExist returns false, i.e. the key definitely does not exist, then
|
||||
// the delete is a noop. KeyMayExist only incurs in-memory look up.
|
||||
@ -653,7 +429,7 @@ struct Options {
|
||||
// Default: empty vector -- no user-defined statistics collection will be
|
||||
// performed.
|
||||
typedef std::vector<std::shared_ptr<TablePropertiesCollector>>
|
||||
TablePropertiesCollectors;
|
||||
TablePropertiesCollectors;
|
||||
TablePropertiesCollectors table_properties_collectors;
|
||||
|
||||
// Allows thread-safe inplace updates.
|
||||
@ -750,9 +526,266 @@ struct Options {
|
||||
// Default: 2
|
||||
uint32_t min_partial_merge_operands;
|
||||
|
||||
// Create ColumnFamilyOptions with default values for all fields
|
||||
ColumnFamilyOptions();
|
||||
// Create ColumnFamilyOptions from Options
|
||||
explicit ColumnFamilyOptions(const Options& options);
|
||||
|
||||
void Dump(Logger* log) const;
|
||||
};
|
||||
|
||||
struct DBOptions {
|
||||
// If true, the database will be created if it is missing.
|
||||
// Default: false
|
||||
bool create_if_missing;
|
||||
|
||||
// If true, an error is raised if the database already exists.
|
||||
// Default: false
|
||||
bool error_if_exists;
|
||||
|
||||
// If true, the implementation will do aggressive checking of the
|
||||
// data it is processing and will stop early if it detects any
|
||||
// errors. This may have unforeseen ramifications: for example, a
|
||||
// corruption of one DB entry may cause a large number of entries to
|
||||
// become unreadable or for the entire DB to become unopenable.
|
||||
// If any of the writes to the database fails (Put, Delete, Merge, Write),
|
||||
// the database will switch to read-only mode and fail all other
|
||||
// Write operations.
|
||||
// Default: true
|
||||
bool paranoid_checks;
|
||||
|
||||
// Use the specified object to interact with the environment,
|
||||
// e.g. to read/write files, schedule background work, etc.
|
||||
// Default: Env::Default()
|
||||
Env* env;
|
||||
|
||||
// Any internal progress/error information generated by the db will
|
||||
// be written to info_log if it is non-nullptr, or to a file stored
|
||||
// in the same directory as the DB contents if info_log is nullptr.
|
||||
// Default: nullptr
|
||||
shared_ptr<Logger> info_log;
|
||||
|
||||
InfoLogLevel info_log_level;
|
||||
|
||||
// Number of open files that can be used by the DB. You may need to
|
||||
// increase this if your database has a large working set. Value -1 means
|
||||
// files opened are always kept open. You can estimate number of files based
|
||||
// on target_file_size_base and target_file_size_multiplier for level-based
|
||||
// compaction. For universal-style compaction, you can usually set it to -1.
|
||||
// Default: 5000
|
||||
int max_open_files;
|
||||
|
||||
// If non-null, then we should collect metrics about database operations
|
||||
// Statistics objects should not be shared between DB instances as
|
||||
// it does not use any locks to prevent concurrent updates.
|
||||
shared_ptr<Statistics> statistics;
|
||||
|
||||
// If true, then the contents of data files are not synced
|
||||
// to stable storage. Their contents remain in the OS buffers till the
|
||||
// OS decides to flush them. This option is good for bulk-loading
|
||||
// of data. Once the bulk-loading is complete, please issue a
|
||||
// sync to the OS to flush all dirty buffers to stable storage.
|
||||
// Default: false
|
||||
bool disableDataSync;
|
||||
|
||||
// If true, then every store to stable storage will issue a fsync.
|
||||
// If false, then every store to stable storage will issue a fdatasync.
|
||||
// This parameter should be set to true while storing data to
|
||||
// filesystem like ext3 that can lose files after a reboot.
|
||||
// Default: false
|
||||
bool use_fsync;
|
||||
|
||||
// This number controls how often a new scribe log about
|
||||
// db deploy stats is written out.
|
||||
// -1 indicates no logging at all.
|
||||
// Default value is 1800 (half an hour).
|
||||
int db_stats_log_interval;
|
||||
|
||||
// This specifies the info LOG dir.
|
||||
// If it is empty, the log files will be in the same dir as data.
|
||||
// If it is non empty, the log files will be in the specified dir,
|
||||
// and the db data dir's absolute path will be used as the log file
|
||||
// name's prefix.
|
||||
std::string db_log_dir;
|
||||
|
||||
// This specifies the absolute dir path for write-ahead logs (WAL).
|
||||
// If it is empty, the log files will be in the same dir as data,
|
||||
// dbname is used as the data dir by default
|
||||
// If it is non empty, the log files will be kept in the specified dir.
|
||||
// When destroying the db,
|
||||
// all log files in wal_dir and the dir itself are deleted
|
||||
std::string wal_dir;
|
||||
|
||||
// The periodicity when obsolete files get deleted. The default
|
||||
// value is 6 hours. The files that get out of scope by compaction
|
||||
// process will still get automatically deleted on every compaction,
|
||||
// regardless of this setting
|
||||
uint64_t delete_obsolete_files_period_micros;
|
||||
|
||||
// Maximum number of concurrent background compaction jobs, submitted to
|
||||
// the default LOW priority thread pool.
|
||||
// If you're increasing this, also consider increasing number of threads in
|
||||
// LOW priority thread pool. For more information, see
|
||||
// Env::SetBackgroundThreads
|
||||
// Default: 1
|
||||
int max_background_compactions;
|
||||
|
||||
// Maximum number of concurrent background memtable flush jobs, submitted to
|
||||
// the HIGH priority thread pool.
|
||||
//
|
||||
// By default, all background jobs (major compaction and memtable flush) go
|
||||
// to the LOW priority pool. If this option is set to a positive number,
|
||||
// memtable flush jobs will be submitted to the HIGH priority pool.
|
||||
// It is important when the same Env is shared by multiple db instances.
|
||||
// Without a separate pool, long running major compaction jobs could
|
||||
// potentially block memtable flush jobs of other db instances, leading to
|
||||
// unnecessary Put stalls.
|
||||
//
|
||||
// If you're increasing this, also consider increasing number of threads in
|
||||
// HIGH priority thread pool. For more information, see
|
||||
// Env::SetBackgroundThreads
|
||||
// Default: 1
|
||||
int max_background_flushes;
|
||||
|
||||
// Specify the maximal size of the info log file. If the log file
|
||||
// is larger than `max_log_file_size`, a new info log file will
|
||||
// be created.
|
||||
// If max_log_file_size == 0, all logs will be written to one
|
||||
// log file.
|
||||
size_t max_log_file_size;
|
||||
|
||||
// Time for the info log file to roll (in seconds).
|
||||
// If specified with non-zero value, log file will be rolled
|
||||
// if it has been active longer than `log_file_time_to_roll`.
|
||||
// Default: 0 (disabled)
|
||||
size_t log_file_time_to_roll;
|
||||
|
||||
// Maximal info log files to be kept.
|
||||
// Default: 1000
|
||||
size_t keep_log_file_num;
|
||||
|
||||
// manifest file is rolled over on reaching this limit.
|
||||
// The older manifest file will be deleted.
|
||||
// The default value is MAX_INT so that roll-over does not take place.
|
||||
uint64_t max_manifest_file_size;
|
||||
|
||||
// Number of shards used for table cache.
|
||||
int table_cache_numshardbits;
|
||||
|
||||
// During data eviction of table's LRU cache, it would be inefficient
|
||||
// to strictly follow LRU because this piece of memory will not really
|
||||
// be released unless its refcount falls to zero. Instead, make two
|
||||
// passes: the first pass will release items with refcount = 1,
|
||||
// and if not enough space is released after scanning the number of
|
||||
// elements specified by this parameter, we will remove items in LRU
|
||||
// order.
|
||||
int table_cache_remove_scan_count_limit;
|
||||
|
||||
// The following two fields affect how archived logs will be deleted.
|
||||
// 1. If both set to 0, logs will be deleted asap and will not get into
|
||||
// the archive.
|
||||
// 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
|
||||
// WAL files will be checked every 10 min and if total size is greater
|
||||
// than WAL_size_limit_MB, they will be deleted starting with the
|
||||
// earliest until size_limit is met. All empty files will be deleted.
|
||||
// 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
|
||||
// WAL files will be checked every WAL_ttl_seconds / 2 and those that
|
||||
// are older than WAL_ttl_seconds will be deleted.
|
||||
// 4. If both are not 0, WAL files will be checked every 10 min and both
|
||||
// checks will be performed with ttl being first.
|
||||
uint64_t WAL_ttl_seconds;
|
||||
uint64_t WAL_size_limit_MB;
|
||||
|
||||
// Number of bytes to preallocate (via fallocate) the manifest
|
||||
// files. Default is 4mb, which is reasonable to reduce random IO
|
||||
// as well as prevent overallocation for mounts that preallocate
|
||||
// large amounts of data (such as xfs's allocsize option).
|
||||
size_t manifest_preallocation_size;
|
||||
|
||||
// Data being read from file storage may be buffered in the OS
|
||||
// Default: true
|
||||
bool allow_os_buffer;
|
||||
|
||||
// Allow the OS to mmap file for reading sst tables. Default: false
|
||||
bool allow_mmap_reads;
|
||||
|
||||
// Allow the OS to mmap file for writing. Default: false
|
||||
bool allow_mmap_writes;
|
||||
|
||||
// Prevent child processes from inheriting open files. Default: true
|
||||
bool is_fd_close_on_exec;
|
||||
|
||||
// Skip log corruption error on recovery (If client is ok with
|
||||
// losing most recent changes)
|
||||
// Default: false
|
||||
bool skip_log_error_on_recovery;
|
||||
|
||||
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
|
||||
// Default: 3600 (1 hour)
|
||||
unsigned int stats_dump_period_sec;
|
||||
|
||||
// If set true, will hint the underlying file system that the file
|
||||
// access pattern is random, when a sst file is opened.
|
||||
// Default: true
|
||||
bool advise_random_on_open;
|
||||
|
||||
// Specify the file access pattern once a compaction is started.
|
||||
// It will be applied to all input files of a compaction.
|
||||
// Default: NORMAL
|
||||
enum {
|
||||
NONE,
|
||||
NORMAL,
|
||||
SEQUENTIAL,
|
||||
WILLNEED
|
||||
} access_hint_on_compaction_start;
|
||||
|
||||
// Use adaptive mutex, which spins in the user space before resorting
|
||||
// to kernel. This could reduce context switch when the mutex is not
|
||||
// heavily contended. However, if the mutex is hot, we could end up
|
||||
// wasting spin time.
|
||||
// Default: false
|
||||
bool use_adaptive_mutex;
|
||||
|
||||
// Allows OS to incrementally sync files to disk while they are being
|
||||
// written, asynchronously, in the background.
|
||||
// Issue one request for every bytes_per_sync written. 0 turns it off.
|
||||
// Default: 0
|
||||
uint64_t bytes_per_sync;
|
||||
|
||||
// Allow RocksDB to use thread local storage to optimize performance.
|
||||
// Default: true
|
||||
bool allow_thread_local;
|
||||
|
||||
// Create DBOptions with default values for all fields
|
||||
DBOptions();
|
||||
// Create DBOptions from Options
|
||||
explicit DBOptions(const Options& options);
|
||||
|
||||
void Dump(Logger* log) const;
|
||||
};
|
||||
|
||||
// Options to control the behavior of a database (passed to DB::Open)
|
||||
struct Options : public DBOptions, public ColumnFamilyOptions {
|
||||
// Create an Options object with default values for all fields.
|
||||
Options() :
|
||||
DBOptions(),
|
||||
ColumnFamilyOptions() {}
|
||||
|
||||
Options(const DBOptions& db_options,
|
||||
const ColumnFamilyOptions& column_family_options)
|
||||
: DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
|
||||
|
||||
void Dump(Logger* log) const;
|
||||
|
||||
// Set appropriate parameters for bulk loading.
|
||||
// The reason that this is a function that returns "this" instead of a
|
||||
// constructor is to enable chaining of multiple similar calls in the future.
|
||||
//
|
||||
|
||||
// All data will be in level 0 without any automatic compaction.
|
||||
// It's recommended to manually call CompactRange(NULL, NULL) before reading
|
||||
// from the database, because otherwise the read can be very slow.
|
||||
Options* PrepareForBulkLoad();
|
||||
};
|
||||
|
||||
//
|
||||
|
@ -64,7 +64,11 @@ struct PerfContext {
|
||||
uint64_t write_memtable_time;
|
||||
};
|
||||
|
||||
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
|
||||
extern PerfContext perf_context;
|
||||
#else
|
||||
extern __thread PerfContext perf_context;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
@ -31,6 +31,7 @@
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
class ColumnFamilyHandle;
|
||||
struct SliceParts;
|
||||
|
||||
class WriteBatch {
|
||||
@ -39,19 +40,34 @@ class WriteBatch {
|
||||
~WriteBatch();
|
||||
|
||||
// Store the mapping "key->value" in the database.
|
||||
void Put(const Slice& key, const Slice& value);
|
||||
void Put(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value);
|
||||
void Put(const Slice& key, const Slice& value) {
|
||||
Put(nullptr, key, value);
|
||||
}
|
||||
|
||||
// Variant of Put() that gathers output like writev(2). The key and value
|
||||
// that will be written to the database are concatenations of arrays of
|
||||
// slices.
|
||||
void Put(const SliceParts& key, const SliceParts& value);
|
||||
void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
|
||||
const SliceParts& value);
|
||||
void Put(const SliceParts& key, const SliceParts& value) {
|
||||
Put(nullptr, key, value);
|
||||
}
|
||||
|
||||
// Merge "value" with the existing value of "key" in the database.
|
||||
// "key->merge(existing, value)"
|
||||
void Merge(const Slice& key, const Slice& value);
|
||||
void Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value);
|
||||
void Merge(const Slice& key, const Slice& value) {
|
||||
Merge(nullptr, key, value);
|
||||
}
|
||||
|
||||
// If the database contains a mapping for "key", erase it. Else do nothing.
|
||||
void Delete(const Slice& key);
|
||||
void Delete(ColumnFamilyHandle* column_family, const Slice& key);
|
||||
void Delete(const Slice& key) {
|
||||
Delete(nullptr, key);
|
||||
}
|
||||
|
||||
// Append a blob of arbitrary size to the records in this batch. The blob will
|
||||
// be stored in the transaction log but not in any other file. In particular,
|
||||
@ -72,14 +88,46 @@ class WriteBatch {
|
||||
class Handler {
|
||||
public:
|
||||
virtual ~Handler();
|
||||
virtual void Put(const Slice& key, const Slice& value) = 0;
|
||||
// default implementation will just call Put without column family for
|
||||
// backwards compatibility. If the column family is not default,
|
||||
// the function fails with Status::InvalidArgument.
|
||||
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) {
|
||||
if (column_family_id == 0) {
|
||||
// Put() historically doesn't return status. We didn't want to be
|
||||
// backwards incompatible so we didn't change the return status
|
||||
// (this is a public API). We do an ordinary Put and return Status::OK()
|
||||
Put(key, value);
|
||||
return Status::OK();
|
||||
}
|
||||
return Status::InvalidArgument(
|
||||
"non-default column family and PutCF not implemented");
|
||||
}
|
||||
virtual void Put(const Slice& key, const Slice& value);
|
||||
// Merge and LogData are not pure virtual. Otherwise, we would break
|
||||
// existing clients of Handler on a source code level. The default
|
||||
// implementation of Merge simply throws a runtime exception.
|
||||
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) {
|
||||
if (column_family_id == 0) {
|
||||
Merge(key, value);
|
||||
return Status::OK();
|
||||
}
|
||||
return Status::InvalidArgument(
|
||||
"non-default column family and MergeCF not implemented");
|
||||
}
|
||||
virtual void Merge(const Slice& key, const Slice& value);
|
||||
// The default implementation of LogData does nothing.
|
||||
virtual void LogData(const Slice& blob);
|
||||
virtual void Delete(const Slice& key) = 0;
|
||||
virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
|
||||
if (column_family_id == 0) {
|
||||
Delete(key);
|
||||
return Status::OK();
|
||||
}
|
||||
return Status::InvalidArgument(
|
||||
"non-default column family and DeleteCF not implemented");
|
||||
}
|
||||
virtual void Delete(const Slice& key);
|
||||
// Continue is called by WriteBatch::Iterate. If it returns false,
|
||||
// iteration is halted. Otherwise, it continues iterating. The default
|
||||
// implementation always returns true.
|
||||
|
@ -21,40 +21,49 @@ class StackableDB : public DB {
|
||||
return db_;
|
||||
}
|
||||
|
||||
using DB::Put;
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& val) override {
|
||||
return db_->Put(options, key, val);
|
||||
return db_->Put(options, column_family, key, val);
|
||||
}
|
||||
|
||||
using DB::Get;
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) override {
|
||||
return db_->Get(options, key, value);
|
||||
return db_->Get(options, column_family, key, value);
|
||||
}
|
||||
|
||||
virtual std::vector<Status> MultiGet(const ReadOptions& options,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values)
|
||||
override {
|
||||
return db_->MultiGet(options, keys, values);
|
||||
using DB::MultiGet;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) override {
|
||||
return db_->MultiGet(options, column_family, keys, values);
|
||||
}
|
||||
|
||||
using DB::KeyMayExist;
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found = nullptr) override {
|
||||
return db_->KeyMayExist(options, key, value, value_found);
|
||||
return db_->KeyMayExist(options, column_family, key, value, value_found);
|
||||
}
|
||||
|
||||
virtual Status Delete(const WriteOptions& wopts, const Slice& key) override {
|
||||
return db_->Delete(wopts, key);
|
||||
using DB::Delete;
|
||||
virtual Status Delete(const WriteOptions& wopts,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) override {
|
||||
return db_->Delete(wopts, column_family, key);
|
||||
}
|
||||
|
||||
using DB::Merge;
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) override {
|
||||
return db_->Merge(options, key, value);
|
||||
return db_->Merge(options, column_family, key, value);
|
||||
}
|
||||
|
||||
|
||||
@ -63,10 +72,20 @@ class StackableDB : public DB {
|
||||
return db_->Write(opts, updates);
|
||||
}
|
||||
|
||||
virtual Iterator* NewIterator(const ReadOptions& opts) override {
|
||||
return db_->NewIterator(opts);
|
||||
using DB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& opts,
|
||||
ColumnFamilyHandle* column_family) override {
|
||||
return db_->NewIterator(opts, column_family);
|
||||
}
|
||||
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
std::vector<Iterator*>* iterators) {
|
||||
return db_->NewIterators(options, column_families, iterators);
|
||||
}
|
||||
|
||||
|
||||
virtual const Snapshot* GetSnapshot() override {
|
||||
return db_->GetSnapshot();
|
||||
}
|
||||
@ -75,32 +94,43 @@ class StackableDB : public DB {
|
||||
return db_->ReleaseSnapshot(snapshot);
|
||||
}
|
||||
|
||||
virtual bool GetProperty(const Slice& property, std::string* value)
|
||||
override {
|
||||
return db_->GetProperty(property, value);
|
||||
using DB::GetProperty;
|
||||
virtual bool GetProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, std::string* value) override {
|
||||
return db_->GetProperty(column_family, property, value);
|
||||
}
|
||||
|
||||
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes)
|
||||
override {
|
||||
return db_->GetApproximateSizes(r, n, sizes);
|
||||
using DB::GetApproximateSizes;
|
||||
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
||||
const Range* r, int n,
|
||||
uint64_t* sizes) override {
|
||||
return db_->GetApproximateSizes(column_family, r, n, sizes);
|
||||
}
|
||||
|
||||
virtual Status CompactRange(const Slice* begin, const Slice* end,
|
||||
using DB::CompactRange;
|
||||
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false,
|
||||
int target_level = -1) override {
|
||||
return db_->CompactRange(begin, end, reduce_level, target_level);
|
||||
return db_->CompactRange(column_family, begin, end, reduce_level,
|
||||
target_level);
|
||||
}
|
||||
|
||||
virtual int NumberLevels() override {
|
||||
return db_->NumberLevels();
|
||||
using DB::NumberLevels;
|
||||
virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
|
||||
return db_->NumberLevels(column_family);
|
||||
}
|
||||
|
||||
virtual int MaxMemCompactionLevel() override {
|
||||
return db_->MaxMemCompactionLevel();
|
||||
using DB::MaxMemCompactionLevel;
|
||||
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
|
||||
override {
|
||||
return db_->MaxMemCompactionLevel(column_family);
|
||||
}
|
||||
|
||||
virtual int Level0StopWriteTrigger() override {
|
||||
return db_->Level0StopWriteTrigger();
|
||||
using DB::Level0StopWriteTrigger;
|
||||
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
|
||||
override {
|
||||
return db_->Level0StopWriteTrigger(column_family);
|
||||
}
|
||||
|
||||
virtual const std::string& GetName() const override {
|
||||
@ -111,12 +141,16 @@ class StackableDB : public DB {
|
||||
return db_->GetEnv();
|
||||
}
|
||||
|
||||
virtual const Options& GetOptions() const override {
|
||||
return db_->GetOptions();
|
||||
using DB::GetOptions;
|
||||
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
|
||||
override {
|
||||
return db_->GetOptions(column_family);
|
||||
}
|
||||
|
||||
virtual Status Flush(const FlushOptions& fopts) override {
|
||||
return db_->Flush(fopts);
|
||||
using DB::Flush;
|
||||
virtual Status Flush(const FlushOptions& fopts,
|
||||
ColumnFamilyHandle* column_family) override {
|
||||
return db_->Flush(fopts, column_family);
|
||||
}
|
||||
|
||||
virtual Status DisableFileDeletions() override {
|
||||
@ -148,8 +182,10 @@ class StackableDB : public DB {
|
||||
return db_->GetDbIdentity(identity);
|
||||
}
|
||||
|
||||
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
|
||||
return db_->GetPropertiesOfAllTables(props);
|
||||
using DB::GetPropertiesOfAllTables;
|
||||
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
||||
TablePropertiesCollection* props) {
|
||||
return db_->GetPropertiesOfAllTables(column_family, props);
|
||||
}
|
||||
|
||||
virtual Status GetUpdatesSince(
|
||||
@ -158,6 +194,10 @@ class StackableDB : public DB {
|
||||
return db_->GetUpdatesSince(seq_number, iter, read_options);
|
||||
}
|
||||
|
||||
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
|
||||
return db_->DefaultColumnFamily();
|
||||
}
|
||||
|
||||
protected:
|
||||
DB* db_;
|
||||
};
|
||||
|
@ -208,7 +208,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
|
||||
rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options);
|
||||
mem->Ref();
|
||||
std::string state;
|
||||
rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, mem, &options);
|
||||
rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
|
||||
rocksdb::Status s =
|
||||
rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
|
||||
int count = 0;
|
||||
rocksdb::Iterator* iter = mem->NewIterator();
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
|
@ -127,13 +127,6 @@ extern bool Snappy_GetUncompressedLength(const char* input, size_t length,
|
||||
extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
|
||||
char* output);
|
||||
|
||||
// ------------------ Miscellaneous -------------------
|
||||
|
||||
// If heap profiling is not supported, returns false.
|
||||
// Else repeatedly calls (*func)(arg, data, n) and then returns true.
|
||||
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
|
||||
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
|
||||
|
||||
} // namespace port
|
||||
} // namespace rocksdb
|
||||
|
||||
|
@ -476,10 +476,6 @@ inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input,
|
||||
return false;
|
||||
}
|
||||
|
||||
inline bool GetHeapProfile(void (*func)(void *, const char *, int), void *arg) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#define CACHE_LINE_SIZE 64U
|
||||
|
||||
} // namespace port
|
||||
|
@ -45,7 +45,9 @@ namespace {
|
||||
// The longest the prefix of the cache key used to identify blocks can be.
|
||||
// We are using the fact that we know for Posix files the unique ID is three
|
||||
// varints.
|
||||
const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
|
||||
// For some reason, compiling for iOS complains that this variable is unused
|
||||
const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
|
||||
kMaxVarint64Length * 3 + 1;
|
||||
|
||||
// Read the block identified by "handle" from "file".
|
||||
// The only relevant option is options.verify_checksums for now.
|
||||
@ -105,7 +107,7 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
|
||||
Statistics* statistics) {
|
||||
auto cache_handle = block_cache->Lookup(key);
|
||||
if (cache_handle != nullptr) {
|
||||
BumpPerfCount(&perf_context.block_cache_hit_count);
|
||||
PERF_COUNTER_ADD(block_cache_hit_count, 1);
|
||||
// overall cache hit
|
||||
RecordTick(statistics, BLOCK_CACHE_HIT);
|
||||
// block-type specific cache hit
|
||||
|
@ -46,6 +46,9 @@ class FilterBlockBuilder {
|
||||
bool SamePrefix(const Slice &key1, const Slice &key2) const;
|
||||
void GenerateFilter();
|
||||
|
||||
// important: all of these might point to invalid addresses
|
||||
// at the time of destruction of this filter block. destructor
|
||||
// should NOT dereference them.
|
||||
const FilterPolicy* policy_;
|
||||
const SliceTransform* prefix_extractor_;
|
||||
bool whole_key_filtering_;
|
||||
|
@ -125,12 +125,11 @@ Status ReadBlockContents(RandomAccessFile* file,
|
||||
char* buf = new char[n + kBlockTrailerSize];
|
||||
Slice contents;
|
||||
|
||||
StopWatchNano timer(env);
|
||||
StartPerfTimer(&timer);
|
||||
PERF_TIMER_AUTO(block_read_time);
|
||||
Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
|
||||
BumpPerfCount(&perf_context.block_read_count);
|
||||
BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize);
|
||||
BumpPerfTime(&perf_context.block_read_time, &timer);
|
||||
PERF_TIMER_MEASURE(block_read_time);
|
||||
PERF_COUNTER_ADD(block_read_count, 1);
|
||||
PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);
|
||||
|
||||
if (!s.ok()) {
|
||||
delete[] buf;
|
||||
@ -151,7 +150,7 @@ Status ReadBlockContents(RandomAccessFile* file,
|
||||
s = Status::Corruption("block checksum mismatch");
|
||||
return s;
|
||||
}
|
||||
BumpPerfTime(&perf_context.block_checksum_time, &timer);
|
||||
PERF_TIMER_MEASURE(block_checksum_time);
|
||||
}
|
||||
|
||||
// If the caller has requested that the block not be uncompressed
|
||||
@ -175,7 +174,7 @@ Status ReadBlockContents(RandomAccessFile* file,
|
||||
s = UncompressBlockContents(data, n, result);
|
||||
delete[] buf;
|
||||
}
|
||||
BumpPerfTime(&perf_context.block_decompress_time, &timer);
|
||||
PERF_TIMER_STOP(block_decompress_time);
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -25,16 +25,14 @@ namespace {
|
||||
|
||||
class MergingIterator : public Iterator {
|
||||
public:
|
||||
MergingIterator(Env* const env, const Comparator* comparator,
|
||||
Iterator** children, int n)
|
||||
MergingIterator(const Comparator* comparator, Iterator** children, int n)
|
||||
: comparator_(comparator),
|
||||
children_(n),
|
||||
current_(nullptr),
|
||||
use_heap_(true),
|
||||
env_(env),
|
||||
direction_(kForward),
|
||||
maxHeap_(NewMaxIterHeap(comparator_)),
|
||||
minHeap_ (NewMinIterHeap(comparator_)) {
|
||||
minHeap_(NewMinIterHeap(comparator_)) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
children_[i].Set(children[i]);
|
||||
}
|
||||
@ -79,13 +77,13 @@ class MergingIterator : public Iterator {
|
||||
// Invalidate the heap.
|
||||
use_heap_ = false;
|
||||
IteratorWrapper* first_child = nullptr;
|
||||
StopWatchNano child_seek_timer(env_, false);
|
||||
StopWatchNano min_heap_timer(env_, false);
|
||||
PERF_TIMER_DECLARE();
|
||||
|
||||
for (auto& child : children_) {
|
||||
StartPerfTimer(&child_seek_timer);
|
||||
PERF_TIMER_START(seek_child_seek_time);
|
||||
child.Seek(target);
|
||||
BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer);
|
||||
BumpPerfCount(&perf_context.seek_child_seek_count);
|
||||
PERF_TIMER_STOP(seek_child_seek_time);
|
||||
PERF_COUNTER_ADD(seek_child_seek_count, 1);
|
||||
|
||||
if (child.Valid()) {
|
||||
// This child has valid key
|
||||
@ -97,26 +95,24 @@ class MergingIterator : public Iterator {
|
||||
} else {
|
||||
// We have more than one child with valid keys. Initialize
|
||||
// the heap and put the first child into the heap.
|
||||
StartPerfTimer(&min_heap_timer);
|
||||
PERF_TIMER_START(seek_min_heap_time);
|
||||
ClearHeaps();
|
||||
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
|
||||
StartPerfTimer(&min_heap_timer);
|
||||
minHeap_.push(first_child);
|
||||
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
|
||||
PERF_TIMER_STOP(seek_min_heap_time);
|
||||
}
|
||||
}
|
||||
if (use_heap_) {
|
||||
StartPerfTimer(&min_heap_timer);
|
||||
PERF_TIMER_START(seek_min_heap_time);
|
||||
minHeap_.push(&child);
|
||||
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
|
||||
PERF_TIMER_STOP(seek_min_heap_time);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (use_heap_) {
|
||||
// If heap is valid, need to put the smallest key to current_.
|
||||
StartPerfTimer(&min_heap_timer);
|
||||
PERF_TIMER_START(seek_min_heap_time);
|
||||
FindSmallest();
|
||||
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
|
||||
PERF_TIMER_STOP(seek_min_heap_time);
|
||||
} else {
|
||||
// The heap is not valid, then the current_ iterator is the first
|
||||
// one, or null if there is no first child.
|
||||
@ -232,7 +228,6 @@ class MergingIterator : public Iterator {
|
||||
// This flag is always true for reverse direction, as we always use heap for
|
||||
// the reverse iterating case.
|
||||
bool use_heap_;
|
||||
Env* const env_;
|
||||
// Which direction is the iterator moving?
|
||||
enum Direction {
|
||||
kForward,
|
||||
@ -272,15 +267,14 @@ void MergingIterator::ClearHeaps() {
|
||||
}
|
||||
} // namespace
|
||||
|
||||
Iterator* NewMergingIterator(Env* const env, const Comparator* cmp,
|
||||
Iterator** list, int n) {
|
||||
Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
|
||||
assert(n >= 0);
|
||||
if (n == 0) {
|
||||
return NewEmptyIterator();
|
||||
} else if (n == 1) {
|
||||
return list[0];
|
||||
} else {
|
||||
return new MergingIterator(env, cmp, list, n);
|
||||
return new MergingIterator(cmp, list, n);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -23,8 +23,7 @@ class Env;
|
||||
// key is present in K child iterators, it will be yielded K times.
|
||||
//
|
||||
// REQUIRES: n >= 0
|
||||
extern Iterator* NewMergingIterator(Env* const env,
|
||||
const Comparator* comparator,
|
||||
extern Iterator* NewMergingIterator(const Comparator* comparator,
|
||||
Iterator** children, int n);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -81,10 +81,9 @@ class PlainTableIterator : public Iterator {
|
||||
bool use_prefix_seek_;
|
||||
uint32_t offset_;
|
||||
uint32_t next_offset_;
|
||||
Slice key_;
|
||||
IterKey key_;
|
||||
Slice value_;
|
||||
Status status_;
|
||||
std::string tmp_str_;
|
||||
// No copying allowed
|
||||
PlainTableIterator(const PlainTableIterator&) = delete;
|
||||
void operator=(const Iterator&) = delete;
|
||||
@ -720,9 +719,7 @@ void PlainTableIterator::Next() {
|
||||
status_ = table_->Next(&next_offset_, &parsed_key, &value_);
|
||||
if (status_.ok()) {
|
||||
// Make a copy in this case. TODO optimize.
|
||||
tmp_str_.clear();
|
||||
AppendInternalKey(&tmp_str_, parsed_key);
|
||||
key_ = Slice(tmp_str_);
|
||||
key_.SetInternalKey(parsed_key);
|
||||
} else {
|
||||
offset_ = next_offset_ = table_->data_end_offset_;
|
||||
}
|
||||
@ -735,7 +732,7 @@ void PlainTableIterator::Prev() {
|
||||
|
||||
Slice PlainTableIterator::key() const {
|
||||
assert(Valid());
|
||||
return key_;
|
||||
return key_.GetKey();
|
||||
}
|
||||
|
||||
Slice PlainTableIterator::value() const {
|
||||
|
@ -1554,7 +1554,8 @@ TEST(MemTableTest, Simple) {
|
||||
batch.Put(std::string("k2"), std::string("v2"));
|
||||
batch.Put(std::string("k3"), std::string("v3"));
|
||||
batch.Put(std::string("largekey"), std::string("vlarge"));
|
||||
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, &options).ok());
|
||||
ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options);
|
||||
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok());
|
||||
|
||||
Iterator* iter = memtable->NewIterator();
|
||||
iter->SeekToFirst();
|
||||
|
71
tools/auto_sanity_test.sh
Executable file
71
tools/auto_sanity_test.sh
Executable file
@ -0,0 +1,71 @@
|
||||
# Scratch directory; one database directory per commit is placed beneath it.
TMP_DIR="/tmp/rocksdb-sanity-test"
|
||||
|
||||
# Pick the two commits to compare. With fewer than two arguments, print the
# usage text and fall back to the first and last of the 10 most recent commit
# hashes listed by `git log`.
if [ "$#" -lt 2 ]; then
|
||||
echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]"
|
||||
echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits."
|
||||
# Keep only the "commit <hash>" lines of `git log`, take the first 10, and
# strip the "commit " prefix so one hash per line remains.
recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'`
|
||||
commit_new=`echo "$recent_commits" | head -n1`
|
||||
commit_old=`echo "$recent_commits" | tail -n1`
|
||||
echo "the most recent commits are:"
|
||||
echo "$recent_commits"
|
||||
else
|
||||
commit_new=$1
|
||||
commit_old=$2
|
||||
fi
|
||||
|
||||
# Create the scratch directory if it does not exist yet.
if [ ! -d $TMP_DIR ]; then
|
||||
mkdir $TMP_DIR
|
||||
fi
|
||||
# Database directories, named after the commit each one is associated with.
dir_new="${TMP_DIR}/${commit_new}"
|
||||
dir_old="${TMP_DIR}/${commit_old}"
|
||||
|
||||
# Rebuilds the db_sanity_test binary: runs `make clean`, then
# `make db_sanity_test -j32`, discarding the stdout of both commands.
# If the db_sanity_test build fails, prints an error and exits the script
# with status 1.
function makestuff() {
|
||||
echo "make clean"
|
||||
make clean > /dev/null
|
||||
echo "make db_sanity_test -j32"
|
||||
make db_sanity_test -j32 > /dev/null
|
||||
# $? is the exit status of the db_sanity_test build directly above.
if [ $? -ne 0 ]; then
|
||||
echo "[ERROR] Failed to perform 'make db_sanity_test'"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Start from empty database directories for both commits.
rm -r -f $dir_new
|
||||
rm -r -f $dir_old
|
||||
|
||||
echo "Running db sanity check with commits $commit_new and $commit_old."
|
||||
|
||||
echo "============================================================="
|
||||
# Build the binary labelled "new" and use it to create a database in $dir_new.
# NOTE(review): no `git checkout` of $commit_new or $commit_old is visible in
# this script, so both makestuff calls appear to build whatever tree is
# currently checked out -- confirm the commits are switched somewhere.
echo "Making build $commit_new"
|
||||
makestuff
|
||||
# Rename the binary so the next build does not overwrite it.
mv db_sanity_test new_db_sanity_test
|
||||
echo "Creating db based on the new commit --- $commit_new"
|
||||
./new_db_sanity_test $dir_new create
|
||||
|
||||
echo "============================================================="
|
||||
# Build the binary labelled "old" and use it to create a database in $dir_old.
echo "Making build $commit_old"
|
||||
makestuff
|
||||
mv db_sanity_test old_db_sanity_test
|
||||
echo "Creating db based on the old commit --- $commit_old"
|
||||
./old_db_sanity_test $dir_old create
|
||||
|
||||
echo "============================================================="
|
||||
# Cross-check 1: the "old" binary verifies the database created by the "new"
# binary. A non-zero exit status aborts the script with status 2.
echo "Verifying new db $dir_new using the old commit --- $commit_old"
|
||||
./old_db_sanity_test $dir_new verify
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "[ERROR] Verification of $dir_new using commit $commit_old failed."
|
||||
exit 2
|
||||
fi
|
||||
|
||||
echo "============================================================="
|
||||
# Cross-check 2: the "new" binary verifies the database created by the "old"
# binary. A non-zero exit status aborts the script with status 2.
echo "Verifying old db $dir_old using the new commit --- $commit_new"
|
||||
./new_db_sanity_test $dir_old verify
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "[ERROR] Verification of $dir_old using commit $commit_new failed."
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Both checks passed: remove the two renamed test binaries.
rm old_db_sanity_test
|
||||
rm new_db_sanity_test
|
||||
|
||||
echo "Auto sanity test passed!"
|
@ -88,6 +88,7 @@ def main(argv):
|
||||
--open_files=500000
|
||||
--verify_checksum=1
|
||||
--sync=0
|
||||
--progress_reports=0
|
||||
--disable_wal=0
|
||||
--disable_data_sync=1
|
||||
--target_file_size_base=2097152
|
||||
|
@ -101,6 +101,7 @@ def main(argv):
|
||||
--open_files=500000
|
||||
--verify_checksum=1
|
||||
--sync=0
|
||||
--progress_reports=0
|
||||
--disable_wal=0
|
||||
--disable_data_sync=1
|
||||
--target_file_size_base=2097152
|
||||
|
@ -60,14 +60,16 @@ static bool ValidateUint32Range(const char* flagname, uint64_t value) {
|
||||
return true;
|
||||
}
|
||||
DEFINE_uint64(seed, 2341234, "Seed for PRNG");
|
||||
static const bool FLAGS_seed_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
|
||||
static const bool FLAGS_seed_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
|
||||
|
||||
DEFINE_int64(max_key, 1 * KB * KB * KB,
|
||||
DEFINE_int64(max_key, 1 * KB* KB,
|
||||
"Max number of key/values to place in database");
|
||||
|
||||
DEFINE_int32(column_families, 10, "Number of column families");
|
||||
|
||||
DEFINE_bool(test_batches_snapshots, false,
|
||||
"If set, the test uses MultiGet(), MultiPut() and MultiDelete()"
|
||||
"If set, the test uses MultiGet(), Multiut() and MultiDelete()"
|
||||
" which read/write/delete multiple keys in a batch. In this mode,"
|
||||
" we do not verify db content by comparing the content with the "
|
||||
"pre-allocated array. Instead, we do partial verification inside"
|
||||
@ -95,7 +97,10 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings");
|
||||
DEFINE_bool(destroy_db_initially, true,
|
||||
"Destroys the database dir before start if this is true");
|
||||
|
||||
DEFINE_bool (verbose, false, "Verbose");
|
||||
DEFINE_bool(verbose, false, "Verbose");
|
||||
|
||||
DEFINE_bool(progress_reports, true,
|
||||
"If true, db_stress will report number of finished operations");
|
||||
|
||||
DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size,
|
||||
"Number of bytes to buffer in memtable before compacting");
|
||||
@ -146,6 +151,10 @@ DEFINE_int32(max_background_compactions,
|
||||
"The maximum number of concurrent background compactions "
|
||||
"that can occur in parallel.");
|
||||
|
||||
DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes,
|
||||
"The maximum number of concurrent background flushes "
|
||||
"that can occur in parallel.");
|
||||
|
||||
DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
|
||||
" compaction in universal style");
|
||||
|
||||
@ -158,6 +167,11 @@ DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
|
||||
DEFINE_int32(universal_max_size_amplification_percent, 0,
|
||||
"The max size amplification for universal style compaction");
|
||||
|
||||
DEFINE_int32(clear_column_family_one_in, 1000000,
|
||||
"With a chance of 1/N, delete a column family and then recreate "
|
||||
"it again. If N == 0, never drop/create column families. "
|
||||
"When test_batches_snapshots is true, this flag has no effect");
|
||||
|
||||
DEFINE_int64(cache_size, 2 * KB * KB * KB,
|
||||
"Number of bytes to use as a cache of uncompressed data.");
|
||||
|
||||
@ -170,8 +184,8 @@ static bool ValidateInt32Positive(const char* flagname, int32_t value) {
|
||||
return true;
|
||||
}
|
||||
DEFINE_int32(reopen, 10, "Number of times database reopens");
|
||||
static const bool FLAGS_reopen_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
|
||||
static const bool FLAGS_reopen_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
|
||||
|
||||
DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
|
||||
"Negative means use default settings.");
|
||||
@ -198,9 +212,9 @@ DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
|
||||
DEFINE_int32(kill_random_test, 0,
|
||||
"If non-zero, kill at various points in source code with "
|
||||
"probability 1/this");
|
||||
static const bool FLAGS_kill_random_test_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_kill_random_test,
|
||||
&ValidateInt32Positive);
|
||||
static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_kill_random_test,
|
||||
&ValidateInt32Positive);
|
||||
extern int rocksdb_kill_odds;
|
||||
|
||||
DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
|
||||
@ -226,42 +240,37 @@ static bool ValidateInt32Percent(const char* flagname, int32_t value) {
|
||||
}
|
||||
DEFINE_int32(readpercent, 10,
|
||||
"Ratio of reads to total workload (expressed as a percentage)");
|
||||
static const bool FLAGS_readpercent_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
|
||||
static const bool FLAGS_readpercent_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
|
||||
|
||||
DEFINE_int32(prefixpercent, 20,
|
||||
"Ratio of prefix iterators to total workload (expressed as a"
|
||||
" percentage)");
|
||||
static const bool FLAGS_prefixpercent_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
|
||||
static const bool FLAGS_prefixpercent_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
|
||||
|
||||
// Percentage of the total workload made up of write operations. The help
// text must say "writes": "deletes" is what the separate delpercent flag
// describes.
DEFINE_int32(writepercent, 45,
|
||||
" Ratio of writes to total workload (expressed as a percentage)");
|
||||
static const bool FLAGS_writepercent_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
|
||||
static const bool FLAGS_writepercent_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
|
||||
|
||||
DEFINE_int32(delpercent, 15,
|
||||
"Ratio of deletes to total workload (expressed as a percentage)");
|
||||
static const bool FLAGS_delpercent_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
|
||||
static const bool FLAGS_delpercent_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
|
||||
|
||||
DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
|
||||
" (expressed as a percentage)");
|
||||
static const bool FLAGS_iterpercent_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
|
||||
static const bool FLAGS_iterpercent_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
|
||||
|
||||
DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
|
||||
static const bool FLAGS_num_iterations_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
|
||||
static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
|
||||
|
||||
// Boolean flag, off by default; its help text describes it as disabling
// compaction that is triggered by reads.
DEFINE_bool(disable_seek_compaction, false,
|
||||
"Option to disable compaction triggered by read.");
|
||||
|
||||
DEFINE_uint64(delete_obsolete_files_period_micros, 0,
|
||||
"Option to delete obsolete files periodically"
|
||||
"0 means that obsolete files are "
|
||||
" deleted after every compaction run.");
|
||||
|
||||
enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
|
||||
assert(ctype);
|
||||
|
||||
@ -290,21 +299,21 @@ DEFINE_string(hdfs, "", "Name of hdfs environment");
|
||||
// posix or hdfs environment
|
||||
static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
|
||||
|
||||
DEFINE_uint64(ops_per_thread, 600000, "Number of operations per thread.");
|
||||
static const bool FLAGS_ops_per_thread_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
|
||||
DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
|
||||
static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
|
||||
|
||||
DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
|
||||
static const bool FLAGS_log2_keys_per_lock_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
|
||||
&ValidateUint32Range);
|
||||
static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
|
||||
&ValidateUint32Range);
|
||||
|
||||
DEFINE_int32(purge_redundant_percent, 50,
|
||||
"Percentage of times we want to purge redundant keys in memory "
|
||||
"before flushing");
|
||||
static const bool FLAGS_purge_redundant_percent_dummy =
|
||||
google::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
|
||||
&ValidateInt32Percent);
|
||||
static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) =
|
||||
google::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
|
||||
&ValidateInt32Percent);
|
||||
|
||||
DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
|
||||
" the delete if key not present");
|
||||
@ -438,16 +447,18 @@ class Stats {
|
||||
last_op_finish_ = now;
|
||||
}
|
||||
|
||||
done_++;
|
||||
if (done_ >= next_report_) {
|
||||
if (next_report_ < 1000) next_report_ += 100;
|
||||
else if (next_report_ < 5000) next_report_ += 500;
|
||||
else if (next_report_ < 10000) next_report_ += 1000;
|
||||
else if (next_report_ < 50000) next_report_ += 5000;
|
||||
else if (next_report_ < 100000) next_report_ += 10000;
|
||||
else if (next_report_ < 500000) next_report_ += 50000;
|
||||
else next_report_ += 100000;
|
||||
fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
|
||||
done_++;
|
||||
if (FLAGS_progress_reports) {
|
||||
if (done_ >= next_report_) {
|
||||
if (next_report_ < 1000) next_report_ += 100;
|
||||
else if (next_report_ < 5000) next_report_ += 500;
|
||||
else if (next_report_ < 10000) next_report_ += 1000;
|
||||
else if (next_report_ < 50000) next_report_ += 5000;
|
||||
else if (next_report_ < 100000) next_report_ += 10000;
|
||||
else if (next_report_ < 500000) next_report_ += 50000;
|
||||
else next_report_ += 100000;
|
||||
fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -515,7 +526,7 @@ class Stats {
|
||||
// State shared by all concurrent executions of the same benchmark.
|
||||
class SharedState {
|
||||
public:
|
||||
static const uint32_t SENTINEL = 0xffffffff;
|
||||
static const uint32_t SENTINEL;
|
||||
|
||||
explicit SharedState(StressTest* stress_test) :
|
||||
cv_(&mu_),
|
||||
@ -531,28 +542,27 @@ class SharedState {
|
||||
start_verify_(false),
|
||||
stress_test_(stress_test) {
|
||||
if (FLAGS_test_batches_snapshots) {
|
||||
key_locks_ = nullptr;
|
||||
values_ = nullptr;
|
||||
fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
|
||||
return;
|
||||
}
|
||||
values_ = new uint32_t[max_key_];
|
||||
for (long i = 0; i < max_key_; i++) {
|
||||
values_[i] = SENTINEL;
|
||||
values_.resize(FLAGS_column_families);
|
||||
|
||||
for (int i = 0; i < FLAGS_column_families; ++i) {
|
||||
values_[i] = std::vector<uint32_t>(max_key_, SENTINEL);
|
||||
}
|
||||
|
||||
long num_locks = (max_key_ >> log2_keys_per_lock_);
|
||||
if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
|
||||
num_locks ++;
|
||||
num_locks++;
|
||||
}
|
||||
fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
|
||||
key_locks_.resize(FLAGS_column_families);
|
||||
for (int i = 0; i < FLAGS_column_families; ++i) {
|
||||
key_locks_[i] = std::vector<port::Mutex>(num_locks);
|
||||
}
|
||||
fprintf(stdout, "Creating %ld locks\n", num_locks);
|
||||
key_locks_ = new port::Mutex[num_locks];
|
||||
}
|
||||
|
||||
~SharedState() {
|
||||
delete[] values_;
|
||||
delete[] key_locks_;
|
||||
}
|
||||
~SharedState() {}
|
||||
|
||||
port::Mutex* GetMutex() {
|
||||
return &mu_;
|
||||
@ -622,26 +632,36 @@ class SharedState {
|
||||
return start_verify_;
|
||||
}
|
||||
|
||||
port::Mutex* GetMutexForKey(long key) {
|
||||
return &key_locks_[key >> log2_keys_per_lock_];
|
||||
port::Mutex* GetMutexForKey(int cf, long key) {
|
||||
return &key_locks_[cf][key >> log2_keys_per_lock_];
|
||||
}
|
||||
|
||||
void Put(long key, uint32_t value_base) {
|
||||
values_[key] = value_base;
|
||||
void LockColumnFamily(int cf) {
|
||||
for (auto& mutex : key_locks_[cf]) {
|
||||
mutex.Lock();
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t Get(long key) const {
|
||||
return values_[key];
|
||||
void UnlockColumnFamily(int cf) {
|
||||
for (auto& mutex : key_locks_[cf]) {
|
||||
mutex.Unlock();
|
||||
}
|
||||
}
|
||||
|
||||
void Delete(long key) const {
|
||||
values_[key] = SENTINEL;
|
||||
void ClearColumnFamily(int cf) {
|
||||
std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL);
|
||||
}
|
||||
|
||||
uint32_t GetSeed() const {
|
||||
return seed_;
|
||||
void Put(int cf, long key, uint32_t value_base) {
|
||||
values_[cf][key] = value_base;
|
||||
}
|
||||
|
||||
uint32_t Get(int cf, long key) const { return values_[cf][key]; }
|
||||
|
||||
void Delete(int cf, long key) { values_[cf][key] = SENTINEL; }
|
||||
|
||||
uint32_t GetSeed() const { return seed_; }
|
||||
|
||||
private:
|
||||
port::Mutex mu_;
|
||||
port::CondVar cv_;
|
||||
@ -657,11 +677,12 @@ class SharedState {
|
||||
bool start_verify_;
|
||||
StressTest* stress_test_;
|
||||
|
||||
uint32_t *values_;
|
||||
port::Mutex *key_locks_;
|
||||
|
||||
std::vector<std::vector<uint32_t>> values_;
|
||||
std::vector<std::vector<port::Mutex>> key_locks_;
|
||||
};
|
||||
|
||||
const uint32_t SharedState::SENTINEL = 0xffffffff;
|
||||
|
||||
// Per-thread state for concurrent executions of the same benchmark.
|
||||
struct ThreadState {
|
||||
uint32_t tid; // 0..n-1
|
||||
@ -682,13 +703,14 @@ class StressTest {
|
||||
public:
|
||||
StressTest()
|
||||
: cache_(NewLRUCache(FLAGS_cache_size)),
|
||||
compressed_cache_(FLAGS_compressed_cache_size >= 0 ?
|
||||
NewLRUCache(FLAGS_compressed_cache_size) :
|
||||
nullptr),
|
||||
compressed_cache_(FLAGS_compressed_cache_size >= 0
|
||||
? NewLRUCache(FLAGS_compressed_cache_size)
|
||||
: nullptr),
|
||||
filter_policy_(FLAGS_bloom_bits >= 0
|
||||
? NewBloomFilterPolicy(FLAGS_bloom_bits)
|
||||
: nullptr),
|
||||
? NewBloomFilterPolicy(FLAGS_bloom_bits)
|
||||
: nullptr),
|
||||
db_(nullptr),
|
||||
new_column_family_name_(0),
|
||||
num_times_reopened_(0) {
|
||||
if (FLAGS_destroy_db_initially) {
|
||||
std::vector<std::string> files;
|
||||
@ -703,6 +725,10 @@ class StressTest {
|
||||
}
|
||||
|
||||
~StressTest() {
|
||||
for (auto cf : column_families_) {
|
||||
delete cf;
|
||||
}
|
||||
column_families_.clear();
|
||||
delete db_;
|
||||
delete filter_policy_;
|
||||
}
|
||||
@ -817,9 +843,9 @@ class StressTest {
|
||||
// Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
|
||||
// ("9"+K, "9"+V) in DB atomically i.e in a single batch.
|
||||
// Also refer MultiGet.
|
||||
Status MultiPut(ThreadState* thread,
|
||||
const WriteOptions& writeoptions,
|
||||
const Slice& key, const Slice& value, size_t sz) {
|
||||
Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value, size_t sz) {
|
||||
std::string keys[10] = {"9", "8", "7", "6", "5",
|
||||
"4", "3", "2", "1", "0"};
|
||||
std::string values[10] = {"9", "8", "7", "6", "5",
|
||||
@ -832,9 +858,9 @@ class StressTest {
|
||||
values[i] += value.ToString();
|
||||
value_slices[i] = values[i];
|
||||
if (FLAGS_use_merge) {
|
||||
batch.Merge(keys[i], value_slices[i]);
|
||||
batch.Merge(column_family, keys[i], value_slices[i]);
|
||||
} else {
|
||||
batch.Put(keys[i], value_slices[i]);
|
||||
batch.Put(column_family, keys[i], value_slices[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -852,9 +878,8 @@ class StressTest {
|
||||
|
||||
// Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
|
||||
// in DB atomically i.e in a single batch. Also refer MultiGet.
|
||||
Status MultiDelete(ThreadState* thread,
|
||||
const WriteOptions& writeoptions,
|
||||
const Slice& key) {
|
||||
Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions,
|
||||
ColumnFamilyHandle* column_family, const Slice& key) {
|
||||
std::string keys[10] = {"9", "7", "5", "3", "1",
|
||||
"8", "6", "4", "2", "0"};
|
||||
|
||||
@ -862,7 +887,7 @@ class StressTest {
|
||||
Status s;
|
||||
for (int i = 0; i < 10; i++) {
|
||||
keys[i] += key.ToString();
|
||||
batch.Delete(keys[i]);
|
||||
batch.Delete(column_family, keys[i]);
|
||||
}
|
||||
|
||||
s = db_->Write(writeoptions, &batch);
|
||||
@ -880,9 +905,9 @@ class StressTest {
|
||||
// in the same snapshot, and verifies that all the values are of the form
|
||||
// "0"+V, "1"+V,..."9"+V.
|
||||
// ASSUMES that MultiPut was used to put (K, V) into the DB.
|
||||
Status MultiGet(ThreadState* thread,
|
||||
const ReadOptions& readoptions,
|
||||
const Slice& key, std::string* value) {
|
||||
Status MultiGet(ThreadState* thread, const ReadOptions& readoptions,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) {
|
||||
std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
|
||||
Slice key_slices[10];
|
||||
std::string values[10];
|
||||
@ -892,7 +917,7 @@ class StressTest {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
keys[i] += key.ToString();
|
||||
key_slices[i] = keys[i];
|
||||
s = db_->Get(readoptionscopy, key_slices[i], value);
|
||||
s = db_->Get(readoptionscopy, column_family, key_slices[i], value);
|
||||
if (!s.ok() && !s.IsNotFound()) {
|
||||
fprintf(stderr, "get error: %s\n", s.ToString().c_str());
|
||||
values[i] = "";
|
||||
@ -937,8 +962,8 @@ class StressTest {
|
||||
// each series should be the same length, and it is verified for each
|
||||
// index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
|
||||
// ASSUMES that MultiPut was used to put (K, V)
|
||||
Status MultiPrefixScan(ThreadState* thread,
|
||||
const ReadOptions& readoptions,
|
||||
Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) {
|
||||
std::string prefixes[10] = {"0", "1", "2", "3", "4",
|
||||
"5", "6", "7", "8", "9"};
|
||||
@ -954,7 +979,7 @@ class StressTest {
|
||||
readoptionscopy[i] = readoptions;
|
||||
readoptionscopy[i].prefix_seek = true;
|
||||
readoptionscopy[i].snapshot = snapshot;
|
||||
iters[i] = db_->NewIterator(readoptionscopy[i]);
|
||||
iters[i] = db_->NewIterator(readoptionscopy[i], column_family);
|
||||
iters[i]->Seek(prefix_slices[i]);
|
||||
}
|
||||
|
||||
@ -1012,15 +1037,14 @@ class StressTest {
|
||||
|
||||
// Given a key K, this creates an iterator which scans to K and then
|
||||
// does a random sequence of Next/Prev operations.
|
||||
Status MultiIterate(ThreadState* thread,
|
||||
const ReadOptions& readoptions,
|
||||
const Slice& key) {
|
||||
Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions,
|
||||
ColumnFamilyHandle* column_family, const Slice& key) {
|
||||
Status s;
|
||||
const Snapshot* snapshot = db_->GetSnapshot();
|
||||
ReadOptions readoptionscopy = readoptions;
|
||||
readoptionscopy.snapshot = snapshot;
|
||||
readoptionscopy.prefix_seek = FLAGS_prefix_size > 0;
|
||||
unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy));
|
||||
unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy, column_family));
|
||||
|
||||
iter->Seek(key);
|
||||
for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
|
||||
@ -1075,15 +1099,50 @@ class StressTest {
|
||||
}
|
||||
}
|
||||
|
||||
if (!FLAGS_test_batches_snapshots &&
|
||||
FLAGS_clear_column_family_one_in != 0) {
|
||||
if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) {
|
||||
// drop column family and then create it again (can't drop default)
|
||||
int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
|
||||
std::string new_name =
|
||||
std::to_string(new_column_family_name_.fetch_add(1));
|
||||
{
|
||||
MutexLock l(thread->shared->GetMutex());
|
||||
fprintf(
|
||||
stdout,
|
||||
"[CF %d] Dropping and recreating column family. new name: %s\n",
|
||||
cf, new_name.c_str());
|
||||
}
|
||||
thread->shared->LockColumnFamily(cf);
|
||||
Status s __attribute__((unused));
|
||||
s = db_->DropColumnFamily(column_families_[cf]);
|
||||
delete column_families_[cf];
|
||||
assert(s.ok());
|
||||
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
|
||||
&column_families_[cf]);
|
||||
column_family_names_[cf] = new_name;
|
||||
thread->shared->ClearColumnFamily(cf);
|
||||
assert(s.ok());
|
||||
thread->shared->UnlockColumnFamily(cf);
|
||||
}
|
||||
}
|
||||
|
||||
long rand_key = thread->rand.Next() % max_key;
|
||||
int rand_column_family = thread->rand.Next() % FLAGS_column_families;
|
||||
std::string keystr = Key(rand_key);
|
||||
Slice key = keystr;
|
||||
int prob_op = thread->rand.Uniform(100);
|
||||
std::unique_ptr<MutexLock> l;
|
||||
if (!FLAGS_test_batches_snapshots) {
|
||||
l.reset(new MutexLock(
|
||||
thread->shared->GetMutexForKey(rand_column_family, rand_key)));
|
||||
}
|
||||
auto column_family = column_families_[rand_column_family];
|
||||
|
||||
if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) {
|
||||
// OPERATION read
|
||||
if (!FLAGS_test_batches_snapshots) {
|
||||
Status s = db_->Get(read_opts, key, &from_db);
|
||||
Status s = db_->Get(read_opts, column_family, key, &from_db);
|
||||
if (s.ok()) {
|
||||
// found case
|
||||
thread->stats.AddGets(1, 1);
|
||||
@ -1095,7 +1154,7 @@ class StressTest {
|
||||
thread->stats.AddErrors(1);
|
||||
}
|
||||
} else {
|
||||
MultiGet(thread, read_opts, key, &from_db);
|
||||
MultiGet(thread, read_opts, column_family, key, &from_db);
|
||||
}
|
||||
} else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
|
||||
// OPERATION prefix scan
|
||||
@ -1106,7 +1165,7 @@ class StressTest {
|
||||
if (!FLAGS_test_batches_snapshots) {
|
||||
Slice prefix = Slice(key.data(), FLAGS_prefix_size);
|
||||
read_opts.prefix_seek = true;
|
||||
Iterator* iter = db_->NewIterator(read_opts);
|
||||
Iterator* iter = db_->NewIterator(read_opts, column_family);
|
||||
int64_t count = 0;
|
||||
for (iter->Seek(prefix);
|
||||
iter->Valid() && iter->key().starts_with(prefix); iter->Next()) {
|
||||
@ -1121,7 +1180,7 @@ class StressTest {
|
||||
}
|
||||
delete iter;
|
||||
} else {
|
||||
MultiPrefixScan(thread, read_opts, key);
|
||||
MultiPrefixScan(thread, read_opts, column_family, key);
|
||||
}
|
||||
} else if (prefixBound <= prob_op && prob_op < writeBound) {
|
||||
// OPERATION write
|
||||
@ -1129,42 +1188,36 @@ class StressTest {
|
||||
size_t sz = GenerateValue(value_base, value, sizeof(value));
|
||||
Slice v(value, sz);
|
||||
if (!FLAGS_test_batches_snapshots) {
|
||||
MutexLock l(thread->shared->GetMutexForKey(rand_key));
|
||||
if (FLAGS_verify_before_write) {
|
||||
std::string keystr2 = Key(rand_key);
|
||||
Slice k = keystr2;
|
||||
Status s = db_->Get(read_opts, k, &from_db);
|
||||
VerifyValue(rand_key,
|
||||
read_opts,
|
||||
*(thread->shared),
|
||||
from_db,
|
||||
s,
|
||||
true);
|
||||
Status s = db_->Get(read_opts, column_family, k, &from_db);
|
||||
VerifyValue(rand_column_family, rand_key, read_opts,
|
||||
*(thread->shared), from_db, s, true);
|
||||
}
|
||||
thread->shared->Put(rand_key, value_base);
|
||||
thread->shared->Put(rand_column_family, rand_key, value_base);
|
||||
if (FLAGS_use_merge) {
|
||||
db_->Merge(write_opts, key, v);
|
||||
db_->Merge(write_opts, column_family, key, v);
|
||||
} else {
|
||||
db_->Put(write_opts, key, v);
|
||||
db_->Put(write_opts, column_family, key, v);
|
||||
}
|
||||
thread->stats.AddBytesForWrites(1, sz);
|
||||
} else {
|
||||
MultiPut(thread, write_opts, key, v, sz);
|
||||
MultiPut(thread, write_opts, column_family, key, v, sz);
|
||||
}
|
||||
PrintKeyValue(rand_key, value, sz);
|
||||
PrintKeyValue(rand_column_family, rand_key, value, sz);
|
||||
} else if (writeBound <= prob_op && prob_op < delBound) {
|
||||
// OPERATION delete
|
||||
if (!FLAGS_test_batches_snapshots) {
|
||||
MutexLock l(thread->shared->GetMutexForKey(rand_key));
|
||||
thread->shared->Delete(rand_key);
|
||||
db_->Delete(write_opts, key);
|
||||
thread->shared->Delete(rand_column_family, rand_key);
|
||||
db_->Delete(write_opts, column_family, key);
|
||||
thread->stats.AddDeletes(1);
|
||||
} else {
|
||||
MultiDelete(thread, write_opts, key);
|
||||
MultiDelete(thread, write_opts, column_family, key);
|
||||
}
|
||||
} else {
|
||||
// OPERATION iterate
|
||||
MultiIterate(thread, read_opts, key);
|
||||
MultiIterate(thread, read_opts, column_family, key);
|
||||
}
|
||||
thread->stats.FinishedSingleOp();
|
||||
}
|
||||
@ -1182,97 +1235,100 @@ class StressTest {
|
||||
if (thread->tid == shared.GetNumThreads() - 1) {
|
||||
end = max_key;
|
||||
}
|
||||
|
||||
if (!thread->rand.OneIn(2)) {
|
||||
options.prefix_seek = FLAGS_prefix_size > 0;
|
||||
// Use iterator to verify this range
|
||||
unique_ptr<Iterator> iter(db_->NewIterator(options));
|
||||
iter->Seek(Key(start));
|
||||
for (long i = start; i < end; i++) {
|
||||
// TODO(ljin): update "long" to uint64_t
|
||||
// Reseek when the prefix changes
|
||||
if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) == 0) {
|
||||
iter->Seek(Key(i));
|
||||
}
|
||||
std::string from_db;
|
||||
std::string keystr = Key(i);
|
||||
Slice k = keystr;
|
||||
Status s = iter->status();
|
||||
if (iter->Valid()) {
|
||||
if (iter->key().compare(k) > 0) {
|
||||
s = Status::NotFound(Slice());
|
||||
} else if (iter->key().compare(k) == 0) {
|
||||
from_db = iter->value().ToString();
|
||||
iter->Next();
|
||||
} else if (iter->key().compare(k) < 0) {
|
||||
VerificationAbort("An out of range key was found", i);
|
||||
for (size_t cf = 0; cf < column_families_.size(); ++cf) {
|
||||
if (!thread->rand.OneIn(2)) {
|
||||
// Use iterator to verify this range
|
||||
options.prefix_seek = FLAGS_prefix_size > 0;
|
||||
unique_ptr<Iterator> iter(
|
||||
db_->NewIterator(options, column_families_[cf]));
|
||||
iter->Seek(Key(start));
|
||||
for (long i = start; i < end; i++) {
|
||||
// TODO(ljin): update "long" to uint64_t
|
||||
// Reseek when the prefix changes
|
||||
if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) ==
|
||||
0) {
|
||||
iter->Seek(Key(i));
|
||||
}
|
||||
std::string from_db;
|
||||
std::string keystr = Key(i);
|
||||
Slice k = keystr;
|
||||
Status s = iter->status();
|
||||
if (iter->Valid()) {
|
||||
if (iter->key().compare(k) > 0) {
|
||||
s = Status::NotFound(Slice());
|
||||
} else if (iter->key().compare(k) == 0) {
|
||||
from_db = iter->value().ToString();
|
||||
iter->Next();
|
||||
} else if (iter->key().compare(k) < 0) {
|
||||
VerificationAbort("An out of range key was found", cf, i);
|
||||
}
|
||||
} else {
|
||||
// The iterator found no value for the key in question, so do not
|
||||
// move to the next item in the iterator
|
||||
s = Status::NotFound(Slice());
|
||||
}
|
||||
VerifyValue(cf, i, options, shared, from_db, s, true);
|
||||
if (from_db.length()) {
|
||||
PrintKeyValue(cf, i, from_db.data(), from_db.length());
|
||||
}
|
||||
} else {
|
||||
// The iterator found no value for the key in question, so do not
|
||||
// move to the next item in the iterator
|
||||
s = Status::NotFound(Slice());
|
||||
}
|
||||
VerifyValue(i, options, shared, from_db, s, true);
|
||||
if (from_db.length()) {
|
||||
PrintKeyValue(i, from_db.data(), from_db.length());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Use Get to verify this range
|
||||
for (long i = start; i < end; i++) {
|
||||
std::string from_db;
|
||||
std::string keystr = Key(i);
|
||||
Slice k = keystr;
|
||||
Status s = db_->Get(options, k, &from_db);
|
||||
VerifyValue(i, options, shared, from_db, s, true);
|
||||
if (from_db.length()) {
|
||||
PrintKeyValue(i, from_db.data(), from_db.length());
|
||||
} else {
|
||||
// Use Get to verify this range
|
||||
for (long i = start; i < end; i++) {
|
||||
std::string from_db;
|
||||
std::string keystr = Key(i);
|
||||
Slice k = keystr;
|
||||
Status s = db_->Get(options, column_families_[cf], k, &from_db);
|
||||
VerifyValue(cf, i, options, shared, from_db, s, true);
|
||||
if (from_db.length()) {
|
||||
PrintKeyValue(cf, i, from_db.data(), from_db.length());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void VerificationAbort(std::string msg, long key) const {
|
||||
fprintf(stderr, "Verification failed for key %ld: %s\n",
|
||||
key, msg.c_str());
|
||||
void VerificationAbort(std::string msg, int cf, long key) const {
|
||||
fprintf(stderr, "Verification failed for column family %d key %ld: %s\n",
|
||||
cf, key, msg.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void VerifyValue(long key,
|
||||
const ReadOptions &opts,
|
||||
const SharedState &shared,
|
||||
const std::string &value_from_db,
|
||||
Status s,
|
||||
bool strict=false) const {
|
||||
void VerifyValue(int cf, long key, const ReadOptions& opts,
|
||||
const SharedState& shared, const std::string& value_from_db,
|
||||
Status s, bool strict = false) const {
|
||||
// compare value_from_db with the value in the shared state
|
||||
char value[100];
|
||||
uint32_t value_base = shared.Get(key);
|
||||
uint32_t value_base = shared.Get(cf, key);
|
||||
if (value_base == SharedState::SENTINEL && !strict) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (s.ok()) {
|
||||
if (value_base == SharedState::SENTINEL) {
|
||||
VerificationAbort("Unexpected value found", key);
|
||||
VerificationAbort("Unexpected value found", cf, key);
|
||||
}
|
||||
size_t sz = GenerateValue(value_base, value, sizeof(value));
|
||||
if (value_from_db.length() != sz) {
|
||||
VerificationAbort("Length of value read is not equal", key);
|
||||
VerificationAbort("Length of value read is not equal", cf, key);
|
||||
}
|
||||
if (memcmp(value_from_db.data(), value, sz) != 0) {
|
||||
VerificationAbort("Contents of value read don't match", key);
|
||||
VerificationAbort("Contents of value read don't match", cf, key);
|
||||
}
|
||||
} else {
|
||||
if (value_base != SharedState::SENTINEL) {
|
||||
VerificationAbort("Value not found", key);
|
||||
VerificationAbort("Value not found", cf, key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void PrintKeyValue(uint32_t key, const char *value, size_t sz) {
|
||||
if (!FLAGS_verbose) return;
|
||||
fprintf(stdout, "%u ==> (%u) ", key, (unsigned int)sz);
|
||||
for (size_t i=0; i<sz; i++) {
|
||||
static void PrintKeyValue(int cf, uint32_t key, const char* value,
|
||||
size_t sz) {
|
||||
if (!FLAGS_verbose) {
|
||||
return;
|
||||
}
|
||||
fprintf(stdout, "[CF %d] %u ==> (%u) ", cf, key, (unsigned int)sz);
|
||||
for (size_t i = 0; i < sz; i++) {
|
||||
fprintf(stdout, "%X", value[i]);
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
@ -1290,8 +1346,13 @@ class StressTest {
|
||||
}
|
||||
|
||||
void PrintEnv() const {
|
||||
fprintf(stdout, "LevelDB version : %d.%d\n",
|
||||
kMajorVersion, kMinorVersion);
|
||||
fprintf(stdout, "RocksDB version : %d.%d\n", kMajorVersion,
|
||||
kMinorVersion);
|
||||
fprintf(stdout, "Column families : %d\n", FLAGS_column_families);
|
||||
if (!FLAGS_test_batches_snapshots) {
|
||||
fprintf(stdout, "Clear CFs one in : %d\n",
|
||||
FLAGS_clear_column_family_one_in);
|
||||
}
|
||||
fprintf(stdout, "Number of threads : %d\n", FLAGS_threads);
|
||||
fprintf(stdout,
|
||||
"Ops per thread : %lu\n",
|
||||
@ -1368,43 +1429,41 @@ class StressTest {
|
||||
|
||||
void Open() {
|
||||
assert(db_ == nullptr);
|
||||
Options options;
|
||||
options.block_cache = cache_;
|
||||
options.block_cache_compressed = compressed_cache_;
|
||||
options.write_buffer_size = FLAGS_write_buffer_size;
|
||||
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
|
||||
options.min_write_buffer_number_to_merge =
|
||||
FLAGS_min_write_buffer_number_to_merge;
|
||||
options.max_background_compactions = FLAGS_max_background_compactions;
|
||||
options.compaction_style =
|
||||
static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
|
||||
options.block_size = FLAGS_block_size;
|
||||
options.filter_policy = filter_policy_;
|
||||
options.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
|
||||
options.max_open_files = FLAGS_open_files;
|
||||
options.statistics = dbstats;
|
||||
options.env = FLAGS_env;
|
||||
options.disableDataSync = FLAGS_disable_data_sync;
|
||||
options.use_fsync = FLAGS_use_fsync;
|
||||
options.allow_mmap_reads = FLAGS_mmap_read;
|
||||
options_.block_cache = cache_;
|
||||
options_.block_cache_compressed = compressed_cache_;
|
||||
options_.write_buffer_size = FLAGS_write_buffer_size;
|
||||
options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
|
||||
options_.min_write_buffer_number_to_merge =
|
||||
FLAGS_min_write_buffer_number_to_merge;
|
||||
options_.max_background_compactions = FLAGS_max_background_compactions;
|
||||
options_.max_background_flushes = FLAGS_max_background_flushes;
|
||||
options_.compaction_style =
|
||||
static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
|
||||
options_.block_size = FLAGS_block_size;
|
||||
options_.filter_policy = filter_policy_;
|
||||
options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
|
||||
options_.max_open_files = FLAGS_open_files;
|
||||
options_.statistics = dbstats;
|
||||
options_.env = FLAGS_env;
|
||||
options_.disableDataSync = FLAGS_disable_data_sync;
|
||||
options_.use_fsync = FLAGS_use_fsync;
|
||||
options_.allow_mmap_reads = FLAGS_mmap_read;
|
||||
rocksdb_kill_odds = FLAGS_kill_random_test;
|
||||
options.target_file_size_base = FLAGS_target_file_size_base;
|
||||
options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
|
||||
options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
|
||||
options.max_bytes_for_level_multiplier =
|
||||
options_.target_file_size_base = FLAGS_target_file_size_base;
|
||||
options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
|
||||
options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
|
||||
options_.max_bytes_for_level_multiplier =
|
||||
FLAGS_max_bytes_for_level_multiplier;
|
||||
options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
|
||||
options.level0_slowdown_writes_trigger =
|
||||
FLAGS_level0_slowdown_writes_trigger;
|
||||
options.level0_file_num_compaction_trigger =
|
||||
FLAGS_level0_file_num_compaction_trigger;
|
||||
options.compression = FLAGS_compression_type_e;
|
||||
options.create_if_missing = true;
|
||||
options.disable_seek_compaction = FLAGS_disable_seek_compaction;
|
||||
options.delete_obsolete_files_period_micros =
|
||||
FLAGS_delete_obsolete_files_period_micros;
|
||||
options.max_manifest_file_size = 1024;
|
||||
options.filter_deletes = FLAGS_filter_deletes;
|
||||
options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
|
||||
options_.level0_slowdown_writes_trigger =
|
||||
FLAGS_level0_slowdown_writes_trigger;
|
||||
options_.level0_file_num_compaction_trigger =
|
||||
FLAGS_level0_file_num_compaction_trigger;
|
||||
options_.compression = FLAGS_compression_type_e;
|
||||
options_.create_if_missing = true;
|
||||
options_.disable_seek_compaction = FLAGS_disable_seek_compaction;
|
||||
options_.max_manifest_file_size = 10 * 1024;
|
||||
options_.filter_deletes = FLAGS_filter_deletes;
|
||||
if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
|
||||
fprintf(stderr,
|
||||
"prefix_size should be non-zero iff memtablerep == prefix_hash\n");
|
||||
@ -1412,51 +1471,107 @@ class StressTest {
|
||||
}
|
||||
switch (FLAGS_rep_factory) {
|
||||
case kHashSkipList:
|
||||
options.memtable_factory.reset(NewHashSkipListRepFactory());
|
||||
options_.memtable_factory.reset(NewHashSkipListRepFactory());
|
||||
break;
|
||||
case kSkipList:
|
||||
// no need to do anything
|
||||
break;
|
||||
case kVectorRep:
|
||||
options.memtable_factory.reset(new VectorRepFactory());
|
||||
options_.memtable_factory.reset(new VectorRepFactory());
|
||||
break;
|
||||
}
|
||||
static Random purge_percent(1000); // no benefit from non-determinism here
|
||||
if (static_cast<int32_t>(purge_percent.Uniform(100)) <
|
||||
FLAGS_purge_redundant_percent - 1) {
|
||||
options.purge_redundant_kvs_while_flush = false;
|
||||
options_.purge_redundant_kvs_while_flush = false;
|
||||
}
|
||||
|
||||
if (FLAGS_use_merge) {
|
||||
options.merge_operator = MergeOperators::CreatePutOperator();
|
||||
options_.merge_operator = MergeOperators::CreatePutOperator();
|
||||
}
|
||||
|
||||
// set universal style compaction configurations, if applicable
|
||||
if (FLAGS_universal_size_ratio != 0) {
|
||||
options.compaction_options_universal.size_ratio =
|
||||
FLAGS_universal_size_ratio;
|
||||
options_.compaction_options_universal.size_ratio =
|
||||
FLAGS_universal_size_ratio;
|
||||
}
|
||||
if (FLAGS_universal_min_merge_width != 0) {
|
||||
options.compaction_options_universal.min_merge_width =
|
||||
FLAGS_universal_min_merge_width;
|
||||
options_.compaction_options_universal.min_merge_width =
|
||||
FLAGS_universal_min_merge_width;
|
||||
}
|
||||
if (FLAGS_universal_max_merge_width != 0) {
|
||||
options.compaction_options_universal.max_merge_width =
|
||||
FLAGS_universal_max_merge_width;
|
||||
options_.compaction_options_universal.max_merge_width =
|
||||
FLAGS_universal_max_merge_width;
|
||||
}
|
||||
if (FLAGS_universal_max_size_amplification_percent != 0) {
|
||||
options.compaction_options_universal.max_size_amplification_percent =
|
||||
FLAGS_universal_max_size_amplification_percent;
|
||||
options_.compaction_options_universal.max_size_amplification_percent =
|
||||
FLAGS_universal_max_size_amplification_percent;
|
||||
}
|
||||
|
||||
fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
|
||||
|
||||
Status s;
|
||||
if (FLAGS_ttl == -1) {
|
||||
s = DB::Open(options, FLAGS_db, &db_);
|
||||
std::vector<std::string> existing_column_families;
|
||||
s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
|
||||
&existing_column_families); // ignore errors
|
||||
if (!s.ok()) {
|
||||
// DB doesn't exist
|
||||
assert(existing_column_families.empty());
|
||||
assert(column_family_names_.empty());
|
||||
column_family_names_.push_back(default_column_family_name);
|
||||
} else if (column_family_names_.empty()) {
|
||||
// this is the first call to the function Open()
|
||||
column_family_names_ = existing_column_families;
|
||||
} else {
|
||||
// this is a reopen. just assert that existing column_family_names are
|
||||
// equivalent to what we remember
|
||||
auto sorted_cfn = column_family_names_;
|
||||
sort(sorted_cfn.begin(), sorted_cfn.end());
|
||||
sort(existing_column_families.begin(), existing_column_families.end());
|
||||
if (sorted_cfn != existing_column_families) {
|
||||
fprintf(stderr,
|
||||
"Expected column families differ from the existing:\n");
|
||||
printf("Expected: {");
|
||||
for (auto cf : sorted_cfn) {
|
||||
printf("%s ", cf.c_str());
|
||||
}
|
||||
printf("}\n");
|
||||
printf("Existing: {");
|
||||
for (auto cf : existing_column_families) {
|
||||
printf("%s ", cf.c_str());
|
||||
}
|
||||
printf("}\n");
|
||||
}
|
||||
assert(sorted_cfn == existing_column_families);
|
||||
}
|
||||
std::vector<ColumnFamilyDescriptor> cf_descriptors;
|
||||
for (auto name : column_family_names_) {
|
||||
if (name != default_column_family_name) {
|
||||
new_column_family_name_ =
|
||||
std::max(new_column_family_name_.load(), std::stoi(name) + 1);
|
||||
}
|
||||
cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
|
||||
}
|
||||
s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
|
||||
&column_families_, &db_);
|
||||
if (s.ok()) {
|
||||
while (s.ok() &&
|
||||
column_families_.size() < (size_t)FLAGS_column_families) {
|
||||
ColumnFamilyHandle* cf = nullptr;
|
||||
std::string name = std::to_string(new_column_family_name_.load());
|
||||
new_column_family_name_++;
|
||||
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), name, &cf);
|
||||
column_families_.push_back(cf);
|
||||
column_family_names_.push_back(name);
|
||||
}
|
||||
}
|
||||
assert(!s.ok() || column_families_.size() ==
|
||||
static_cast<size_t>(FLAGS_column_families));
|
||||
} else {
|
||||
s = UtilityDB::OpenTtlDB(options, FLAGS_db, &sdb_, FLAGS_ttl);
|
||||
db_ = sdb_;
|
||||
StackableDB* sdb;
|
||||
s = UtilityDB::OpenTtlDB(options_, FLAGS_db, &sdb, FLAGS_ttl);
|
||||
db_ = sdb;
|
||||
}
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
|
||||
@ -1465,13 +1580,11 @@ class StressTest {
|
||||
}
|
||||
|
||||
void Reopen() {
|
||||
// do not close the db. Just delete the lock file. This
|
||||
// simulates a crash-recovery kind of situation.
|
||||
if (FLAGS_ttl != -1) {
|
||||
((DBWithTTL*) db_)->TEST_Destroy_DBWithTtl();
|
||||
} else {
|
||||
((DBImpl*) db_)->TEST_Destroy_DBImpl();
|
||||
for (auto cf : column_families_) {
|
||||
delete cf;
|
||||
}
|
||||
column_families_.clear();
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
|
||||
num_times_reopened_++;
|
||||
@ -1493,14 +1606,15 @@ class StressTest {
|
||||
shared_ptr<Cache> compressed_cache_;
|
||||
const FilterPolicy* filter_policy_;
|
||||
DB* db_;
|
||||
StackableDB* sdb_;
|
||||
Options options_;
|
||||
std::vector<ColumnFamilyHandle*> column_families_;
|
||||
std::vector<std::string> column_family_names_;
|
||||
std::atomic<int> new_column_family_name_;
|
||||
int num_times_reopened_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
|
||||
" [OPTIONS]...");
|
||||
|
@ -81,7 +81,7 @@ Status CreateLoggerFromOptions(
|
||||
const std::string& dbname,
|
||||
const std::string& db_log_dir,
|
||||
Env* env,
|
||||
const Options& options,
|
||||
const DBOptions& options,
|
||||
std::shared_ptr<Logger>* logger) {
|
||||
std::string db_absolute_path;
|
||||
env->GetAbsolutePath(dbname, &db_absolute_path);
|
||||
|
@ -85,7 +85,7 @@ Status CreateLoggerFromOptions(
|
||||
const std::string& dbname,
|
||||
const std::string& db_log_dir,
|
||||
Env* env,
|
||||
const Options& options,
|
||||
const DBOptions& options,
|
||||
std::shared_ptr<Logger>* logger);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -197,7 +197,7 @@ TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
|
||||
}
|
||||
|
||||
TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
|
||||
Options options;
|
||||
DBOptions options;
|
||||
shared_ptr<Logger> logger;
|
||||
|
||||
// Normal logger
|
||||
|
@ -314,24 +314,12 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
|
||||
}
|
||||
|
||||
static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
|
||||
#ifdef __SSE4_2__
|
||||
#ifdef __SSE4_2__
|
||||
*l = _mm_crc32_u64(*l, LE_LOAD64(*p));
|
||||
*p += 8;
|
||||
#else
|
||||
#else
|
||||
Slow_CRC32(l, p);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Detect if SS42 or not.
|
||||
static bool isSSE42() {
|
||||
#ifdef __GNUC__
|
||||
uint32_t c_;
|
||||
uint32_t d_;
|
||||
__asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
|
||||
return c_ & (1U << 20); // copied from CpuId.h in Folly.
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
template<void (*CRC32)(uint64_t*, uint8_t const**)>
|
||||
@ -377,6 +365,18 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
|
||||
return l ^ 0xffffffffu;
|
||||
}
|
||||
|
||||
// Detects whether the running CPU reports SSE4.2 support.
//
// The probe executes CPUID with EAX = 1 and tests bit 20 of the value
// returned in ECX. It is only compiled for GCC-compatible x86-64 builds that
// are not iOS cross-compiles; every other configuration returns false.
static bool isSSE42() {
#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
  uint32_t ecx_bits;
  uint32_t edx_bits;  // written by CPUID but not inspected here
  __asm__("cpuid" : "=c"(ecx_bits), "=d"(edx_bits) : "a"(1) : "ebx");
  const uint32_t kFeatureBit = 1U << 20;  // copied from CpuId.h in Folly.
  return (ecx_bits & kFeatureBit) != 0;
#else
  return false;
#endif
}
|
||||
|
||||
// Signature shared by the CRC extend implementations: takes
// (uint32_t, const char*, size_t) and returns a uint32_t.
using Function = uint32_t (*)(uint32_t, const char*, size_t);
|
||||
|
||||
static inline Function Choose_Extend() {
|
||||
|
@ -3,6 +3,8 @@
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <algorithm>
|
||||
#include <gflags/gflags.h>
|
||||
|
||||
@ -74,11 +76,12 @@ TEST(DynamicBloomTest, VaryingLengths) {
|
||||
// Count number of filters that significantly exceed the false positive rate
|
||||
int mediocre_filters = 0;
|
||||
int good_filters = 0;
|
||||
uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
|
||||
|
||||
fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
|
||||
FLAGS_bits_per_key, FLAGS_num_probes);
|
||||
FLAGS_bits_per_key, num_probes);
|
||||
|
||||
for (uint32_t cl_per_block = 0; cl_per_block < FLAGS_num_probes;
|
||||
for (uint32_t cl_per_block = 0; cl_per_block < num_probes;
|
||||
++cl_per_block) {
|
||||
for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
|
||||
uint32_t bloom_bits = 0;
|
||||
@ -88,7 +91,7 @@ TEST(DynamicBloomTest, VaryingLengths) {
|
||||
bloom_bits = std::max(num * FLAGS_bits_per_key,
|
||||
cl_per_block * CACHE_LINE_SIZE * 8);
|
||||
}
|
||||
DynamicBloom bloom(bloom_bits, cl_per_block, FLAGS_num_probes);
|
||||
DynamicBloom bloom(bloom_bits, cl_per_block, num_probes);
|
||||
for (uint64_t i = 0; i < num; i++) {
|
||||
bloom.Add(Key(i, buffer));
|
||||
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
|
||||
@ -127,6 +130,7 @@ TEST(DynamicBloomTest, VaryingLengths) {
|
||||
|
||||
TEST(DynamicBloomTest, perf) {
|
||||
StopWatchNano timer(Env::Default());
|
||||
uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
|
||||
|
||||
if (!FLAGS_enable_perf) {
|
||||
return;
|
||||
@ -134,9 +138,9 @@ TEST(DynamicBloomTest, perf) {
|
||||
|
||||
for (uint64_t m = 1; m <= 8; ++m) {
|
||||
const uint64_t num_keys = m * 8 * 1024 * 1024;
|
||||
fprintf(stderr, "testing %luM keys\n", m * 8);
|
||||
fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8);
|
||||
|
||||
DynamicBloom std_bloom(num_keys * 10, 0, FLAGS_num_probes);
|
||||
DynamicBloom std_bloom(num_keys * 10, 0, num_probes);
|
||||
|
||||
timer.Start();
|
||||
for (uint64_t i = 1; i <= num_keys; ++i) {
|
||||
@ -144,7 +148,7 @@ TEST(DynamicBloomTest, perf) {
|
||||
}
|
||||
|
||||
uint64_t elapsed = timer.ElapsedNanos();
|
||||
fprintf(stderr, "standard bloom, avg add latency %lu\n",
|
||||
fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n",
|
||||
elapsed / num_keys);
|
||||
|
||||
uint64_t count = 0;
|
||||
@ -155,13 +159,13 @@ TEST(DynamicBloomTest, perf) {
|
||||
}
|
||||
}
|
||||
elapsed = timer.ElapsedNanos();
|
||||
fprintf(stderr, "standard bloom, avg query latency %lu\n",
|
||||
fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n",
|
||||
elapsed / count);
|
||||
ASSERT_TRUE(count == num_keys);
|
||||
|
||||
for (int cl_per_block = 1; cl_per_block <= FLAGS_num_probes;
|
||||
for (uint32_t cl_per_block = 1; cl_per_block <= num_probes;
|
||||
++cl_per_block) {
|
||||
DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, FLAGS_num_probes);
|
||||
DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes);
|
||||
|
||||
timer.Start();
|
||||
for (uint64_t i = 1; i <= num_keys; ++i) {
|
||||
@ -169,7 +173,7 @@ TEST(DynamicBloomTest, perf) {
|
||||
}
|
||||
|
||||
uint64_t elapsed = timer.ElapsedNanos();
|
||||
fprintf(stderr, "blocked bloom(%d), avg add latency %lu\n",
|
||||
fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n",
|
||||
cl_per_block, elapsed / num_keys);
|
||||
|
||||
uint64_t count = 0;
|
||||
@ -182,7 +186,7 @@ TEST(DynamicBloomTest, perf) {
|
||||
}
|
||||
|
||||
elapsed = timer.ElapsedNanos();
|
||||
fprintf(stderr, "blocked bloom(%d), avg query latency %lu\n",
|
||||
fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n",
|
||||
cl_per_block, elapsed / count);
|
||||
ASSERT_TRUE(count == num_keys);
|
||||
}
|
||||
|
@ -231,7 +231,7 @@ EnvWrapper::~EnvWrapper() {
|
||||
|
||||
namespace { // anonymous namespace
|
||||
|
||||
void AssignEnvOptions(EnvOptions* env_options, const Options& options) {
|
||||
void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
|
||||
env_options->use_os_buffer = options.allow_os_buffer;
|
||||
env_options->use_mmap_reads = options.allow_mmap_reads;
|
||||
env_options->use_mmap_writes = options.allow_mmap_writes;
|
||||
@ -249,12 +249,12 @@ EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
|
||||
return env_options;
|
||||
}
|
||||
|
||||
EnvOptions::EnvOptions(const Options& options) {
|
||||
EnvOptions::EnvOptions(const DBOptions& options) {
|
||||
AssignEnvOptions(this, options);
|
||||
}
|
||||
|
||||
EnvOptions::EnvOptions() {
|
||||
Options options;
|
||||
DBOptions options;
|
||||
AssignEnvOptions(this, options);
|
||||
}
|
||||
|
||||
|
@ -22,12 +22,6 @@ namespace {
|
||||
typedef const char* Key;
|
||||
|
||||
struct Node {
|
||||
explicit Node(const Key& k) :
|
||||
key(k) {
|
||||
}
|
||||
|
||||
Key const key;
|
||||
|
||||
// Accessors/mutators for links. Wrapped in methods so we can
|
||||
// add the appropriate barriers as necessary.
|
||||
Node* Next() {
|
||||
@ -40,17 +34,19 @@ struct Node {
|
||||
// pointer observes a fully initialized version of the inserted node.
|
||||
next_.Release_Store(x);
|
||||
}
|
||||
|
||||
// No-barrier variants that can be safely used in a few locations.
|
||||
Node* NoBarrier_Next() {
|
||||
return reinterpret_cast<Node*>(next_.NoBarrier_Load());
|
||||
}
|
||||
|
||||
void NoBarrier_SetNext(Node* x) {
|
||||
next_.NoBarrier_Store(x);
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
port::AtomicPointer next_;
|
||||
public:
|
||||
char key[0];
|
||||
};
|
||||
|
||||
class HashLinkListRep : public MemTableRep {
|
||||
@ -58,7 +54,9 @@ class HashLinkListRep : public MemTableRep {
|
||||
HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
|
||||
const SliceTransform* transform, size_t bucket_size);
|
||||
|
||||
virtual void Insert(const char* key) override;
|
||||
virtual KeyHandle Allocate(const size_t len, char** buf) override;
|
||||
|
||||
virtual void Insert(KeyHandle handle) override;
|
||||
|
||||
virtual bool Contains(const char* key) const override;
|
||||
|
||||
@ -93,8 +91,6 @@ class HashLinkListRep : public MemTableRep {
|
||||
const SliceTransform* transform_;
|
||||
|
||||
const MemTableRep::KeyComparator& compare_;
|
||||
// immutable after construction
|
||||
Arena* const arena_;
|
||||
|
||||
bool BucketContains(Node* head, const Slice& key) const;
|
||||
|
||||
@ -114,11 +110,6 @@ class HashLinkListRep : public MemTableRep {
|
||||
return GetBucket(GetHash(slice));
|
||||
}
|
||||
|
||||
Node* NewNode(const Key& key) {
|
||||
char* mem = arena_->AllocateAligned(sizeof(Node));
|
||||
return new (mem) Node(key);
|
||||
}
|
||||
|
||||
bool Equal(const Slice& a, const Key& b) const {
|
||||
return (compare_(b, a) == 0);
|
||||
}
|
||||
@ -318,10 +309,10 @@ class HashLinkListRep : public MemTableRep {
|
||||
HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
|
||||
Arena* arena, const SliceTransform* transform,
|
||||
size_t bucket_size)
|
||||
: bucket_size_(bucket_size),
|
||||
: MemTableRep(arena),
|
||||
bucket_size_(bucket_size),
|
||||
transform_(transform),
|
||||
compare_(compare),
|
||||
arena_(arena) {
|
||||
compare_(compare) {
|
||||
char* mem = arena_->AllocateAligned(
|
||||
sizeof(port::AtomicPointer) * bucket_size);
|
||||
|
||||
@ -335,15 +326,22 @@ HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
|
||||
HashLinkListRep::~HashLinkListRep() {
|
||||
}
|
||||
|
||||
void HashLinkListRep::Insert(const char* key) {
|
||||
assert(!Contains(key));
|
||||
Slice internal_key = GetLengthPrefixedSlice(key);
|
||||
// Reserves storage for one list node plus `len` trailing bytes of key data.
//
// The memory comes from arena_->AllocateAligned(); a Node is constructed in
// place at the start of it. On return, *buf points at the node's key storage
// and the node itself is handed back as an opaque KeyHandle.
KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
  const size_t bytes_needed = sizeof(Node) + len;
  Node* node = new (arena_->AllocateAligned(bytes_needed)) Node();
  *buf = node->key;
  return static_cast<void*>(node);
}
|
||||
|
||||
void HashLinkListRep::Insert(KeyHandle handle) {
|
||||
Node* x = static_cast<Node*>(handle);
|
||||
assert(!Contains(x->key));
|
||||
Slice internal_key = GetLengthPrefixedSlice(x->key);
|
||||
auto transformed = GetPrefix(internal_key);
|
||||
auto& bucket = buckets_[GetHash(transformed)];
|
||||
Node* head = static_cast<Node*>(bucket.Acquire_Load());
|
||||
|
||||
if (!head) {
|
||||
Node* x = NewNode(key);
|
||||
// NoBarrier_SetNext() suffices since we will add a barrier when
|
||||
// we publish a pointer to "x" in prev[i].
|
||||
x->NoBarrier_SetNext(nullptr);
|
||||
@ -372,9 +370,7 @@ void HashLinkListRep::Insert(const char* key) {
|
||||
}
|
||||
|
||||
// Our data structure does not allow duplicate insertion
|
||||
assert(cur == nullptr || !Equal(key, cur->key));
|
||||
|
||||
Node* x = NewNode(key);
|
||||
assert(cur == nullptr || !Equal(x->key, cur->key));
|
||||
|
||||
// NoBarrier_SetNext() suffices since we will add a barrier when
|
||||
// we publish a pointer to "x" in prev[i].
|
||||
|
@ -25,7 +25,7 @@ class HashSkipListRep : public MemTableRep {
|
||||
const SliceTransform* transform, size_t bucket_size,
|
||||
int32_t skiplist_height, int32_t skiplist_branching_factor);
|
||||
|
||||
virtual void Insert(const char* key) override;
|
||||
virtual void Insert(KeyHandle handle) override;
|
||||
|
||||
virtual bool Contains(const char* key) const override;
|
||||
|
||||
@ -225,7 +225,8 @@ HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare,
|
||||
Arena* arena, const SliceTransform* transform,
|
||||
size_t bucket_size, int32_t skiplist_height,
|
||||
int32_t skiplist_branching_factor)
|
||||
: bucket_size_(bucket_size),
|
||||
: MemTableRep(arena),
|
||||
bucket_size_(bucket_size),
|
||||
skiplist_height_(skiplist_height),
|
||||
skiplist_branching_factor_(skiplist_branching_factor),
|
||||
transform_(transform),
|
||||
@ -255,7 +256,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
|
||||
return bucket;
|
||||
}
|
||||
|
||||
void HashSkipListRep::Insert(const char* key) {
|
||||
void HashSkipListRep::Insert(KeyHandle handle) {
|
||||
auto* key = static_cast<char*>(handle);
|
||||
assert(!Contains(key));
|
||||
auto transformed = transform_->Transform(UserKey(key));
|
||||
auto bucket = GetInitializedBucket(transformed);
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "db/filename.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/write_batch.h"
|
||||
#include "rocksdb/cache.h"
|
||||
#include "util/coding.h"
|
||||
|
||||
#include <ctime>
|
||||
@ -152,6 +153,8 @@ LDBCommand* LDBCommand::SelectCommand(
|
||||
return new DBLoaderCommand(cmdParams, option_map, flags);
|
||||
} else if (cmd == ManifestDumpCommand::Name()) {
|
||||
return new ManifestDumpCommand(cmdParams, option_map, flags);
|
||||
} else if (cmd == ListColumnFamiliesCommand::Name()) {
|
||||
return new ListColumnFamiliesCommand(cmdParams, option_map, flags);
|
||||
} else if (cmd == InternalDumpCommand::Name()) {
|
||||
return new InternalDumpCommand(cmdParams, option_map, flags);
|
||||
} else if (cmd == CheckConsistencyCommand::Name()) {
|
||||
@ -540,11 +543,10 @@ void ManifestDumpCommand::DoCommand() {
|
||||
EnvOptions sopt;
|
||||
std::string file(manifestfile);
|
||||
std::string dbname("dummy");
|
||||
TableCache* tc = new TableCache(dbname, &options, sopt, 10);
|
||||
const InternalKeyComparator* cmp =
|
||||
new InternalKeyComparator(options.comparator);
|
||||
|
||||
VersionSet* versions = new VersionSet(dbname, &options, sopt, tc, cmp);
|
||||
std::shared_ptr<Cache> tc(NewLRUCache(
|
||||
options.max_open_files - 10, options.table_cache_numshardbits,
|
||||
options.table_cache_remove_scan_count_limit));
|
||||
VersionSet* versions = new VersionSet(dbname, &options, sopt, tc.get());
|
||||
Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_);
|
||||
if (!s.ok()) {
|
||||
printf("Error in processing file %s %s\n", manifestfile.c_str(),
|
||||
@ -557,6 +559,48 @@ void ManifestDumpCommand::DoCommand() {
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Appends the usage line for this command to `ret`: the command name
// followed by the placeholder for the database directory argument.
void ListColumnFamiliesCommand::Help(string& ret) {
  ret.append(" ")
      .append(ListColumnFamiliesCommand::Name())
      .append(" full_path_to_db_directory ")
      .append("\n");
}
|
||||
|
||||
// Constructs the command from its positional parameters.
//
// Exactly one parameter is accepted and stored as dbname_. Any other
// parameter count records a FAILED execution state instead.
ListColumnFamiliesCommand::ListColumnFamiliesCommand(
    const vector<string>& params, const map<string, string>& options,
    const vector<string>& flags)
    : LDBCommand(options, flags, false, {}) {
  if (params.size() == 1) {
    dbname_ = params[0];
    return;
  }

  exec_state_ = LDBCommandExecuteResult::FAILED(
      "dbname must be specified for the list_column_families command");
}
|
||||
|
||||
void ListColumnFamiliesCommand::DoCommand() {
|
||||
vector<string> column_families;
|
||||
Status s = DB::ListColumnFamilies(DBOptions(), dbname_, &column_families);
|
||||
if (!s.ok()) {
|
||||
printf("Error in processing db %s %s\n", dbname_.c_str(),
|
||||
s.ToString().c_str());
|
||||
} else {
|
||||
printf("Column families in %s: \n{", dbname_.c_str());
|
||||
bool first = true;
|
||||
for (auto cf : column_families) {
|
||||
if (!first) {
|
||||
printf(", ");
|
||||
}
|
||||
first = false;
|
||||
printf("%s", cf.c_str());
|
||||
}
|
||||
printf("}\n");
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
string ReadableTime(int unixtime) {
|
||||
char time_buffer [80];
|
||||
time_t rawtime = unixtime;
|
||||
@ -1018,19 +1062,26 @@ Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
|
||||
Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
|
||||
int* levels) {
|
||||
EnvOptions soptions;
|
||||
TableCache tc(db_path_, &opt, soptions, 10);
|
||||
std::shared_ptr<Cache> tc(
|
||||
NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits,
|
||||
opt.table_cache_remove_scan_count_limit));
|
||||
const InternalKeyComparator cmp(opt.comparator);
|
||||
VersionSet versions(db_path_, &opt, soptions, &tc, &cmp);
|
||||
VersionSet versions(db_path_, &opt, soptions, tc.get());
|
||||
std::vector<ColumnFamilyDescriptor> dummy;
|
||||
ColumnFamilyDescriptor dummy_descriptor(default_column_family_name,
|
||||
ColumnFamilyOptions(opt));
|
||||
dummy.push_back(dummy_descriptor);
|
||||
// We rely on VersionSet::Recover to tell us the internal data structures
|
||||
// in the db. And the Recover() should never do any change
|
||||
// (like LogAndApply) to the manifest file.
|
||||
Status st = versions.Recover();
|
||||
Status st = versions.Recover(dummy);
|
||||
if (!st.ok()) {
|
||||
return st;
|
||||
}
|
||||
int max = -1;
|
||||
for (int i = 0; i < versions.NumberLevels(); i++) {
|
||||
if (versions.current()->NumLevelFiles(i)) {
|
||||
auto default_cfd = versions.GetColumnFamilySet()->GetDefault();
|
||||
for (int i = 0; i < default_cfd->NumberLevels(); i++) {
|
||||
if (default_cfd->current()->NumLevelFiles(i)) {
|
||||
max = i;
|
||||
}
|
||||
}
|
||||
@ -1075,7 +1126,6 @@ void ReduceDBLevelsCommand::DoCommand() {
|
||||
CloseDB();
|
||||
|
||||
EnvOptions soptions;
|
||||
|
||||
st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_);
|
||||
if (!st.ok()) {
|
||||
exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());
|
||||
|
@ -484,6 +484,23 @@ private:
|
||||
static const string ARG_PATH;
|
||||
};
|
||||
|
||||
class ListColumnFamiliesCommand : public LDBCommand {
|
||||
public:
|
||||
static string Name() { return "list_column_families"; }
|
||||
|
||||
ListColumnFamiliesCommand(const vector<string>& params,
|
||||
const map<string, string>& options,
|
||||
const vector<string>& flags);
|
||||
|
||||
static void Help(string& ret);
|
||||
virtual void DoCommand();
|
||||
|
||||
virtual bool NoDBOpen() { return true; }
|
||||
|
||||
private:
|
||||
string dbname_;
|
||||
};
|
||||
|
||||
class ReduceDBLevelsCommand : public LDBCommand {
|
||||
public:
|
||||
static string Name() { return "reduce_levels"; }
|
||||
|
@ -64,6 +64,7 @@ public:
|
||||
DBDumperCommand::Help(ret);
|
||||
DBLoaderCommand::Help(ret);
|
||||
ManifestDumpCommand::Help(ret);
|
||||
ListColumnFamiliesCommand::Help(ret);
|
||||
InternalDumpCommand::Help(ret);
|
||||
|
||||
fprintf(stderr, "%s\n", ret.c_str());
|
||||
|
327
util/options.cc
327
util/options.cc
@ -26,23 +26,17 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
Options::Options()
|
||||
ColumnFamilyOptions::ColumnFamilyOptions()
|
||||
: comparator(BytewiseComparator()),
|
||||
merge_operator(nullptr),
|
||||
compaction_filter(nullptr),
|
||||
compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
|
||||
new DefaultCompactionFilterFactory())),
|
||||
compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()),
|
||||
create_if_missing(false),
|
||||
error_if_exists(false),
|
||||
paranoid_checks(true),
|
||||
env(Env::Default()),
|
||||
info_log(nullptr),
|
||||
info_log_level(INFO),
|
||||
compaction_filter_factory_v2(
|
||||
new DefaultCompactionFilterFactoryV2()),
|
||||
write_buffer_size(4 << 20),
|
||||
max_write_buffer_number(2),
|
||||
min_write_buffer_number_to_merge(1),
|
||||
max_open_files(5000),
|
||||
block_cache(nullptr),
|
||||
block_cache_compressed(nullptr),
|
||||
block_size(4096),
|
||||
@ -64,42 +58,15 @@ Options::Options()
|
||||
expanded_compaction_factor(25),
|
||||
source_compaction_factor(1),
|
||||
max_grandparent_overlap_factor(10),
|
||||
disableDataSync(false),
|
||||
use_fsync(false),
|
||||
db_stats_log_interval(1800),
|
||||
db_log_dir(""),
|
||||
wal_dir(""),
|
||||
disable_seek_compaction(true),
|
||||
delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL),
|
||||
max_background_compactions(1),
|
||||
max_background_flushes(1),
|
||||
max_log_file_size(0),
|
||||
log_file_time_to_roll(0),
|
||||
keep_log_file_num(1000),
|
||||
soft_rate_limit(0.0),
|
||||
hard_rate_limit(0.0),
|
||||
rate_limit_delay_max_milliseconds(1000),
|
||||
max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
|
||||
no_block_cache(false),
|
||||
table_cache_numshardbits(4),
|
||||
table_cache_remove_scan_count_limit(16),
|
||||
arena_block_size(0),
|
||||
disable_auto_compactions(false),
|
||||
WAL_ttl_seconds(0),
|
||||
WAL_size_limit_MB(0),
|
||||
manifest_preallocation_size(4 * 1024 * 1024),
|
||||
purge_redundant_kvs_while_flush(true),
|
||||
allow_os_buffer(true),
|
||||
allow_mmap_reads(false),
|
||||
allow_mmap_writes(false),
|
||||
is_fd_close_on_exec(true),
|
||||
skip_log_error_on_recovery(false),
|
||||
stats_dump_period_sec(3600),
|
||||
block_size_deviation(10),
|
||||
advise_random_on_open(true),
|
||||
access_hint_on_compaction_start(NORMAL),
|
||||
use_adaptive_mutex(false),
|
||||
bytes_per_sync(0),
|
||||
compaction_style(kCompactionStyleLevel),
|
||||
verify_checksums_in_compaction(true),
|
||||
filter_deletes(false),
|
||||
@ -114,38 +81,229 @@ Options::Options()
|
||||
memtable_prefix_bloom_probes(6),
|
||||
bloom_locality(0),
|
||||
max_successive_merges(0),
|
||||
min_partial_merge_operands(2),
|
||||
allow_thread_local(true) {
|
||||
min_partial_merge_operands(2) {
|
||||
assert(memtable_factory.get() != nullptr);
|
||||
}
|
||||
|
||||
// Builds a ColumnFamilyOptions by copying, member by member, each of the
// fields listed below from `options`. Asserts afterwards that a memtable
// factory is present.
ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
    : comparator(options.comparator),
      merge_operator(options.merge_operator),
      compaction_filter(options.compaction_filter),
      compaction_filter_factory(options.compaction_filter_factory),
      compaction_filter_factory_v2(options.compaction_filter_factory_v2),
      write_buffer_size(options.write_buffer_size),
      max_write_buffer_number(options.max_write_buffer_number),
      min_write_buffer_number_to_merge(
          options.min_write_buffer_number_to_merge),
      block_cache(options.block_cache),
      block_cache_compressed(options.block_cache_compressed),
      block_size(options.block_size),
      block_restart_interval(options.block_restart_interval),
      compression(options.compression),
      compression_per_level(options.compression_per_level),
      compression_opts(options.compression_opts),
      filter_policy(options.filter_policy),
      prefix_extractor(options.prefix_extractor),
      whole_key_filtering(options.whole_key_filtering),
      num_levels(options.num_levels),
      level0_file_num_compaction_trigger(
          options.level0_file_num_compaction_trigger),
      level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
      level0_stop_writes_trigger(options.level0_stop_writes_trigger),
      max_mem_compaction_level(options.max_mem_compaction_level),
      target_file_size_base(options.target_file_size_base),
      target_file_size_multiplier(options.target_file_size_multiplier),
      max_bytes_for_level_base(options.max_bytes_for_level_base),
      max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
      max_bytes_for_level_multiplier_additional(
          options.max_bytes_for_level_multiplier_additional),
      expanded_compaction_factor(options.expanded_compaction_factor),
      source_compaction_factor(options.source_compaction_factor),
      max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
      disable_seek_compaction(options.disable_seek_compaction),
      soft_rate_limit(options.soft_rate_limit),
      hard_rate_limit(options.hard_rate_limit),
      rate_limit_delay_max_milliseconds(
          options.rate_limit_delay_max_milliseconds),
      no_block_cache(options.no_block_cache),
      arena_block_size(options.arena_block_size),
      disable_auto_compactions(options.disable_auto_compactions),
      purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush),
      block_size_deviation(options.block_size_deviation),
      compaction_style(options.compaction_style),
      verify_checksums_in_compaction(options.verify_checksums_in_compaction),
      compaction_options_universal(options.compaction_options_universal),
      filter_deletes(options.filter_deletes),
      max_sequential_skip_in_iterations(
          options.max_sequential_skip_in_iterations),
      memtable_factory(options.memtable_factory),
      table_factory(options.table_factory),
      table_properties_collectors(options.table_properties_collectors),
      inplace_update_support(options.inplace_update_support),
      inplace_update_num_locks(options.inplace_update_num_locks),
      inplace_callback(options.inplace_callback),
      memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
      memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
      bloom_locality(options.bloom_locality),
      max_successive_merges(options.max_successive_merges),
      min_partial_merge_operands(options.min_partial_merge_operands) {
  // A missing memtable factory is treated as a programming error.
  assert(memtable_factory.get() != nullptr);
}
|
||||
|
||||
// Default-constructs DBOptions with the built-in default for every field
// listed in the initializer list below.
DBOptions::DBOptions()
    : create_if_missing(false),
      error_if_exists(false),
      paranoid_checks(true),
      env(Env::Default()),
      info_log(nullptr),
      info_log_level(INFO),
      max_open_files(5000),
      statistics(nullptr),
      disableDataSync(false),
      use_fsync(false),
      db_stats_log_interval(1800),
      db_log_dir(""),
      wal_dir(""),
      // Six hours in microseconds (21,600,000,000). The product is computed
      // in unsigned long long so it cannot wrap on platforms where
      // unsigned long is only 32 bits wide.
      delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000),
      max_background_compactions(1),
      max_background_flushes(1),
      max_log_file_size(0),
      log_file_time_to_roll(0),
      keep_log_file_num(1000),
      max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
      table_cache_numshardbits(4),
      table_cache_remove_scan_count_limit(16),
      WAL_ttl_seconds(0),
      WAL_size_limit_MB(0),
      manifest_preallocation_size(4 * 1024 * 1024),
      allow_os_buffer(true),
      allow_mmap_reads(false),
      allow_mmap_writes(false),
      is_fd_close_on_exec(true),
      skip_log_error_on_recovery(false),
      stats_dump_period_sec(3600),
      advise_random_on_open(true),
      access_hint_on_compaction_start(NORMAL),
      use_adaptive_mutex(false),
      bytes_per_sync(0),
      allow_thread_local(true) {}
|
||||
|
||||
// Builds a DBOptions by copying, member by member, each of the fields listed
// below from `options`.
DBOptions::DBOptions(const Options& options)
    : create_if_missing(options.create_if_missing),
      error_if_exists(options.error_if_exists),
      paranoid_checks(options.paranoid_checks),
      env(options.env),
      info_log(options.info_log),
      info_log_level(options.info_log_level),
      max_open_files(options.max_open_files),
      statistics(options.statistics),
      disableDataSync(options.disableDataSync),
      use_fsync(options.use_fsync),
      db_stats_log_interval(options.db_stats_log_interval),
      db_log_dir(options.db_log_dir),
      wal_dir(options.wal_dir),
      delete_obsolete_files_period_micros(
          options.delete_obsolete_files_period_micros),
      max_background_compactions(options.max_background_compactions),
      max_background_flushes(options.max_background_flushes),
      max_log_file_size(options.max_log_file_size),
      log_file_time_to_roll(options.log_file_time_to_roll),
      keep_log_file_num(options.keep_log_file_num),
      max_manifest_file_size(options.max_manifest_file_size),
      table_cache_numshardbits(options.table_cache_numshardbits),
      table_cache_remove_scan_count_limit(
          options.table_cache_remove_scan_count_limit),
      WAL_ttl_seconds(options.WAL_ttl_seconds),
      WAL_size_limit_MB(options.WAL_size_limit_MB),
      manifest_preallocation_size(options.manifest_preallocation_size),
      allow_os_buffer(options.allow_os_buffer),
      allow_mmap_reads(options.allow_mmap_reads),
      allow_mmap_writes(options.allow_mmap_writes),
      is_fd_close_on_exec(options.is_fd_close_on_exec),
      skip_log_error_on_recovery(options.skip_log_error_on_recovery),
      stats_dump_period_sec(options.stats_dump_period_sec),
      advise_random_on_open(options.advise_random_on_open),
      access_hint_on_compaction_start(options.access_hint_on_compaction_start),
      use_adaptive_mutex(options.use_adaptive_mutex),
      bytes_per_sync(options.bytes_per_sync),
      allow_thread_local(options.allow_thread_local) {}
|
||||
|
||||
// Printable names for access hints.
// NOTE(review): presumably indexed by the access-hint enum value used for
// access_hint_on_compaction_start -- confirm the ordering against the enum.
static const char* const access_hints[] = {
    "NONE",
    "NORMAL",
    "SEQUENTIAL",
    "WILLNEED",
};
|
||||
|
||||
void
|
||||
Options::Dump(Logger* log) const
|
||||
{
|
||||
Log(log," Options.comparator: %s", comparator->Name());
|
||||
Log(log," Options.merge_operator: %s",
|
||||
merge_operator? merge_operator->Name() : "None");
|
||||
Log(log," Options.compaction_filter: %s",
|
||||
compaction_filter? compaction_filter->Name() : "None");
|
||||
Log(log," Options.compaction_filter_factory: %s",
|
||||
compaction_filter_factory->Name());
|
||||
Log(log, " Options.compaction_filter_factory_v2: %s",
|
||||
compaction_filter_factory_v2->Name());
|
||||
Log(log," Options.memtable_factory: %s",
|
||||
memtable_factory->Name());
|
||||
Log(log," Options.table_factory: %s", table_factory->Name());
|
||||
void DBOptions::Dump(Logger* log) const {
|
||||
Log(log," Options.error_if_exists: %d", error_if_exists);
|
||||
Log(log," Options.create_if_missing: %d", create_if_missing);
|
||||
Log(log," Options.paranoid_checks: %d", paranoid_checks);
|
||||
Log(log," Options.env: %p", env);
|
||||
Log(log," Options.info_log: %p", info_log.get());
|
||||
Log(log," Options.write_buffer_size: %zd", write_buffer_size);
|
||||
Log(log," Options.max_write_buffer_number: %d", max_write_buffer_number);
|
||||
Log(log," Options.max_open_files: %d", max_open_files);
|
||||
Log(log, " Options.disableDataSync: %d", disableDataSync);
|
||||
Log(log, " Options.use_fsync: %d", use_fsync);
|
||||
Log(log, " Options.max_log_file_size: %zu", max_log_file_size);
|
||||
Log(log, "Options.max_manifest_file_size: %lu",
|
||||
(unsigned long)max_manifest_file_size);
|
||||
Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
|
||||
Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num);
|
||||
Log(log, " Options.db_stats_log_interval: %d", db_stats_log_interval);
|
||||
Log(log, " Options.allow_os_buffer: %d", allow_os_buffer);
|
||||
Log(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
|
||||
Log(log, " Options.allow_mmap_writes: %d", allow_mmap_writes);
|
||||
Log(log, " Options.db_log_dir: %s",
|
||||
db_log_dir.c_str());
|
||||
Log(log, " Options.wal_dir: %s",
|
||||
wal_dir.c_str());
|
||||
Log(log, " Options.table_cache_numshardbits: %d",
|
||||
table_cache_numshardbits);
|
||||
Log(log, " Options.table_cache_remove_scan_count_limit: %d",
|
||||
table_cache_remove_scan_count_limit);
|
||||
Log(log, " Options.delete_obsolete_files_period_micros: %lu",
|
||||
(unsigned long)delete_obsolete_files_period_micros);
|
||||
Log(log, " Options.max_background_compactions: %d",
|
||||
max_background_compactions);
|
||||
Log(log, " Options.max_background_flushes: %d",
|
||||
max_background_flushes);
|
||||
Log(log, " Options.WAL_ttl_seconds: %lu",
|
||||
(unsigned long)WAL_ttl_seconds);
|
||||
Log(log, " Options.WAL_size_limit_MB: %lu",
|
||||
(unsigned long)WAL_size_limit_MB);
|
||||
Log(log, " Options.manifest_preallocation_size: %zu",
|
||||
manifest_preallocation_size);
|
||||
Log(log, " Options.allow_os_buffer: %d",
|
||||
allow_os_buffer);
|
||||
Log(log, " Options.allow_mmap_reads: %d",
|
||||
allow_mmap_reads);
|
||||
Log(log, " Options.allow_mmap_writes: %d",
|
||||
allow_mmap_writes);
|
||||
Log(log, " Options.is_fd_close_on_exec: %d",
|
||||
is_fd_close_on_exec);
|
||||
Log(log, " Options.skip_log_error_on_recovery: %d",
|
||||
skip_log_error_on_recovery);
|
||||
Log(log, " Options.stats_dump_period_sec: %u",
|
||||
stats_dump_period_sec);
|
||||
Log(log, " Options.advise_random_on_open: %d",
|
||||
advise_random_on_open);
|
||||
Log(log, " Options.access_hint_on_compaction_start: %s",
|
||||
access_hints[access_hint_on_compaction_start]);
|
||||
Log(log, " Options.use_adaptive_mutex: %d",
|
||||
use_adaptive_mutex);
|
||||
Log(log, " Options.bytes_per_sync: %lu",
|
||||
(unsigned long)bytes_per_sync);
|
||||
} // DBOptions::Dump
|
||||
|
||||
void ColumnFamilyOptions::Dump(Logger* log) const {
|
||||
Log(log, " Options.comparator: %s", comparator->Name());
|
||||
Log(log, " Options.merge_operator: %s",
|
||||
merge_operator ? merge_operator->Name() : "None");
|
||||
Log(log, " Options.compaction_filter_factory: %s",
|
||||
compaction_filter_factory->Name());
|
||||
Log(log, " Options.compaction_filter_factory_v2: %s",
|
||||
compaction_filter_factory_v2->Name());
|
||||
Log(log, " Options.memtable_factory: %s", memtable_factory->Name());
|
||||
Log(log, " Options.table_factory: %s", table_factory->Name());
|
||||
Log(log, " Options.write_buffer_size: %zd", write_buffer_size);
|
||||
Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number);
|
||||
Log(log," Options.block_cache: %p", block_cache.get());
|
||||
Log(log," Options.block_cache_compressed: %p",
|
||||
block_cache_compressed.get());
|
||||
@ -173,18 +331,6 @@ Options::Dump(Logger* log) const
|
||||
prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
|
||||
Log(log," Options.whole_key_filtering: %d", whole_key_filtering);
|
||||
Log(log," Options.num_levels: %d", num_levels);
|
||||
Log(log," Options.disableDataSync: %d", disableDataSync);
|
||||
Log(log," Options.use_fsync: %d", use_fsync);
|
||||
Log(log," Options.max_log_file_size: %zu", max_log_file_size);
|
||||
Log(log,"Options.max_manifest_file_size: %lu",
|
||||
(unsigned long)max_manifest_file_size);
|
||||
Log(log," Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
|
||||
Log(log," Options.keep_log_file_num: %zu", keep_log_file_num);
|
||||
Log(log," Options.db_stats_log_interval: %d",
|
||||
db_stats_log_interval);
|
||||
Log(log," Options.allow_os_buffer: %d", allow_os_buffer);
|
||||
Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads);
|
||||
Log(log," Options.allow_mmap_writes: %d", allow_mmap_writes);
|
||||
Log(log," Options.min_write_buffer_number_to_merge: %d",
|
||||
min_write_buffer_number_to_merge);
|
||||
Log(log," Options.purge_redundant_kvs_while_flush: %d",
|
||||
@ -223,26 +369,12 @@ Options::Dump(Logger* log) const
|
||||
source_compaction_factor);
|
||||
Log(log," Options.max_grandparent_overlap_factor: %d",
|
||||
max_grandparent_overlap_factor);
|
||||
Log(log," Options.db_log_dir: %s",
|
||||
db_log_dir.c_str());
|
||||
Log(log," Options.wal_dir: %s",
|
||||
wal_dir.c_str());
|
||||
Log(log," Options.disable_seek_compaction: %d",
|
||||
disable_seek_compaction);
|
||||
Log(log," Options.no_block_cache: %d",
|
||||
no_block_cache);
|
||||
Log(log," Options.table_cache_numshardbits: %d",
|
||||
table_cache_numshardbits);
|
||||
Log(log," Options.table_cache_remove_scan_count_limit: %d",
|
||||
table_cache_remove_scan_count_limit);
|
||||
Log(log," Options.arena_block_size: %zu",
|
||||
arena_block_size);
|
||||
Log(log," Options.delete_obsolete_files_period_micros: %lu",
|
||||
(unsigned long)delete_obsolete_files_period_micros);
|
||||
Log(log," Options.max_background_compactions: %d",
|
||||
max_background_compactions);
|
||||
Log(log," Options.max_background_flushes: %d",
|
||||
max_background_flushes);
|
||||
Log(log," Options.soft_rate_limit: %.2f",
|
||||
soft_rate_limit);
|
||||
Log(log," Options.hard_rate_limit: %.2f",
|
||||
@ -251,36 +383,10 @@ Options::Dump(Logger* log) const
|
||||
rate_limit_delay_max_milliseconds);
|
||||
Log(log," Options.disable_auto_compactions: %d",
|
||||
disable_auto_compactions);
|
||||
Log(log," Options.WAL_ttl_seconds: %lu",
|
||||
(unsigned long)WAL_ttl_seconds);
|
||||
Log(log," Options.WAL_size_limit_MB: %lu",
|
||||
(unsigned long)WAL_size_limit_MB);
|
||||
Log(log," Options.manifest_preallocation_size: %zu",
|
||||
manifest_preallocation_size);
|
||||
Log(log," Options.purge_redundant_kvs_while_flush: %d",
|
||||
purge_redundant_kvs_while_flush);
|
||||
Log(log," Options.allow_os_buffer: %d",
|
||||
allow_os_buffer);
|
||||
Log(log," Options.allow_mmap_reads: %d",
|
||||
allow_mmap_reads);
|
||||
Log(log," Options.allow_mmap_writes: %d",
|
||||
allow_mmap_writes);
|
||||
Log(log," Options.is_fd_close_on_exec: %d",
|
||||
is_fd_close_on_exec);
|
||||
Log(log," Options.skip_log_error_on_recovery: %d",
|
||||
skip_log_error_on_recovery);
|
||||
Log(log," Options.stats_dump_period_sec: %u",
|
||||
stats_dump_period_sec);
|
||||
Log(log," Options.block_size_deviation: %d",
|
||||
block_size_deviation);
|
||||
Log(log," Options.advise_random_on_open: %d",
|
||||
advise_random_on_open);
|
||||
Log(log," Options.access_hint_on_compaction_start: %s",
|
||||
access_hints[access_hint_on_compaction_start]);
|
||||
Log(log," Options.use_adaptive_mutex: %d",
|
||||
use_adaptive_mutex);
|
||||
Log(log," Options.bytes_per_sync: %lu",
|
||||
(unsigned long)bytes_per_sync);
|
||||
Log(log," Options.filter_deletes: %d",
|
||||
filter_deletes);
|
||||
Log(log, " Options.verify_checksums_in_compaction: %d",
|
||||
@ -317,8 +423,15 @@ Options::Dump(Logger* log) const
|
||||
memtable_prefix_bloom_bits);
|
||||
Log(log, " Options.memtable_prefix_bloom_probes: %d",
|
||||
memtable_prefix_bloom_probes);
|
||||
Log(log, " Options.bloom_locality: %d",
|
||||
bloom_locality);
|
||||
Log(log, " Options.max_successive_merges: %zd",
|
||||
max_successive_merges);
|
||||
} // ColumnFamilyOptions::Dump
|
||||
|
||||
// Writes every option held by this object to `log`: the DBOptions part is
// dumped first, followed by the ColumnFamilyOptions part.
void Options::Dump(Logger* log) const {
  this->DBOptions::Dump(log);
  this->ColumnFamilyOptions::Dump(log);
}  // Options::Dump
|
||||
|
||||
//
|
||||
|
@ -9,12 +9,21 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// by default, enable counts only
|
||||
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
|
||||
PerfLevel perf_level = kEnableCount;
|
||||
// This is a dummy variable since some place references it
|
||||
PerfContext perf_context;
|
||||
#else
|
||||
__thread PerfLevel perf_level = kEnableCount;
|
||||
__thread PerfContext perf_context;
|
||||
#endif
|
||||
|
||||
void SetPerfLevel(PerfLevel level) { perf_level = level; }
|
||||
void SetPerfLevel(PerfLevel level) {
|
||||
perf_level = level;
|
||||
}
|
||||
|
||||
void PerfContext::Reset() {
|
||||
#if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE)
|
||||
user_key_comparison_count = 0;
|
||||
block_cache_hit_count = 0;
|
||||
block_read_count = 0;
|
||||
@ -38,11 +47,15 @@ void PerfContext::Reset() {
|
||||
find_next_user_entry_time = 0;
|
||||
write_pre_and_post_process_time = 0;
|
||||
write_memtable_time = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define OUTPUT(counter) #counter << " = " << counter << ", "
|
||||
|
||||
std::string PerfContext::ToString() const {
|
||||
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
|
||||
return "";
|
||||
#else
|
||||
std::ostringstream ss;
|
||||
ss << OUTPUT(user_key_comparison_count)
|
||||
<< OUTPUT(block_cache_hit_count)
|
||||
@ -67,8 +80,7 @@ std::string PerfContext::ToString() const {
|
||||
<< OUTPUT(write_pre_and_post_process_time)
|
||||
<< OUTPUT(write_memtable_time);
|
||||
return ss.str();
|
||||
#endif
|
||||
}
|
||||
|
||||
__thread PerfContext perf_context;
|
||||
|
||||
}
|
||||
|
@ -9,26 +9,80 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
extern enum PerfLevel perf_level;
|
||||
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
|
||||
|
||||
inline void StartPerfTimer(StopWatchNano* timer) {
|
||||
if (perf_level >= PerfLevel::kEnableTime) {
|
||||
timer->Start();
|
||||
#define PERF_TIMER_DECLARE()
|
||||
#define PERF_TIMER_START(metric)
|
||||
#define PERF_TIMER_AUTO(metric)
|
||||
#define PERF_TIMER_MEASURE(metric)
|
||||
#define PERF_TIMER_STOP(metric)
|
||||
#define PERF_COUNTER_ADD(metric, value)
|
||||
|
||||
#else
|
||||
|
||||
extern __thread PerfLevel perf_level;
|
||||
|
||||
class PerfStepTimer {
|
||||
public:
|
||||
PerfStepTimer()
|
||||
: enabled_(perf_level >= PerfLevel::kEnableTime),
|
||||
env_(enabled_ ? Env::Default() : nullptr),
|
||||
start_(0) {
|
||||
}
|
||||
}
|
||||
|
||||
inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) {
|
||||
if (perf_level >= PerfLevel::kEnableCount) {
|
||||
*count += delta;
|
||||
void Start() {
|
||||
if (enabled_) {
|
||||
start_ = env_->NowNanos();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void BumpPerfTime(uint64_t* time,
|
||||
StopWatchNano* timer,
|
||||
bool reset = true) {
|
||||
if (perf_level >= PerfLevel::kEnableTime) {
|
||||
*time += timer->ElapsedNanos(reset);
|
||||
void Measure(uint64_t* metric) {
|
||||
if (start_) {
|
||||
uint64_t now = env_->NowNanos();
|
||||
*metric += now - start_;
|
||||
start_ = now;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Stop(uint64_t* metric) {
|
||||
if (start_) {
|
||||
*metric += env_->NowNanos() - start_;
|
||||
start_ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const bool enabled_;
|
||||
Env* const env_;
|
||||
uint64_t start_;
|
||||
};
|
||||
|
||||
// Declare the local timer object to be used later on
|
||||
#define PERF_TIMER_DECLARE() \
|
||||
PerfStepTimer perf_step_timer;
|
||||
|
||||
// Set start time of the timer
|
||||
#define PERF_TIMER_START(metric) \
|
||||
perf_step_timer.Start();
|
||||
|
||||
// Declare and set start time of the timer
|
||||
#define PERF_TIMER_AUTO(metric) \
|
||||
PerfStepTimer perf_step_timer; \
|
||||
perf_step_timer.Start();
|
||||
|
||||
// Update metric with time elapsed since last START. start time is reset
|
||||
// to current timestamp.
|
||||
#define PERF_TIMER_MEASURE(metric) \
|
||||
perf_step_timer.Measure(&(perf_context.metric));
|
||||
|
||||
// Update metric with time elapsed since last START, then stop the timer: the
// start time is cleared instead of being reset to the current timestamp.
|
||||
#define PERF_TIMER_STOP(metric) \
|
||||
perf_step_timer.Stop(&(perf_context.metric));
|
||||
|
||||
// Increase metric value
|
||||
#define PERF_COUNTER_ADD(metric, value) \
|
||||
perf_context.metric += value;
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
@ -13,13 +13,13 @@ class SkipListRep : public MemTableRep {
|
||||
SkipList<const char*, const MemTableRep::KeyComparator&> skip_list_;
|
||||
public:
|
||||
explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena)
|
||||
: skip_list_(compare, arena) {
|
||||
: MemTableRep(arena), skip_list_(compare, arena) {
|
||||
}
|
||||
|
||||
// Insert key into the list.
|
||||
// REQUIRES: nothing that compares equal to key is currently in the list.
|
||||
virtual void Insert(const char* key) override {
|
||||
skip_list_.Insert(key);
|
||||
virtual void Insert(KeyHandle handle) override {
|
||||
skip_list_.Insert(static_cast<char*>(handle));
|
||||
}
|
||||
|
||||
// Returns true iff an entry that compares equal to key is in the list.
|
||||
|
62
util/sync_point.cc
Normal file
62
util/sync_point.cc
Normal file
@ -0,0 +1,62 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "util/sync_point.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Returns the single shared SyncPoint object. It is a function-local static,
// so it is constructed the first time this function runs and the same address
// is handed back on every later call.
SyncPoint* SyncPoint::GetInstance() {
  static SyncPoint instance;
  return &instance;
}
|
||||
|
||||
void SyncPoint::LoadDependency(const std::vector<Dependency>& dependencies) {
|
||||
successors_.clear();
|
||||
predecessors_.clear();
|
||||
cleared_points_.clear();
|
||||
for (const auto& dependency : dependencies) {
|
||||
successors_[dependency.predecessor].push_back(dependency.successor);
|
||||
predecessors_[dependency.successor].push_back(dependency.predecessor);
|
||||
}
|
||||
}
|
||||
|
||||
bool SyncPoint::PredecessorsAllCleared(const std::string& point) {
|
||||
for (const auto& pred : predecessors_[point]) {
|
||||
if (cleared_points_.count(pred) == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void SyncPoint::EnableProcessing() {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
enabled_ = true;
|
||||
}
|
||||
|
||||
void SyncPoint::DisableProcessing() {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
enabled_ = false;
|
||||
}
|
||||
|
||||
void SyncPoint::ClearTrace() {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
cleared_points_.clear();
|
||||
}
|
||||
|
||||
void SyncPoint::Process(const std::string& point) {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
|
||||
if (!enabled_) return;
|
||||
|
||||
while (!PredecessorsAllCleared(point)) {
|
||||
cv_.wait(lock);
|
||||
}
|
||||
|
||||
cleared_points_.insert(point);
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
79
util/sync_point.h
Normal file
79
util/sync_point.h
Normal file
@ -0,0 +1,79 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
|
||||
|
||||
#include <condition_variable>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// This class provides facility to reproduce race conditions deterministically
|
||||
// in unit tests.
|
||||
// Developer could specify sync points in the codebase via TEST_SYNC_POINT.
|
||||
// Each sync point represents a position in the execution stream of a thread.
|
||||
// In the unit test, 'Happens After' relationship among sync points could be
|
||||
// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of
|
||||
// threads execution.
|
||||
// Refer to (DBTest, TransactionLogIteratorRace) for an example use case.
|
||||
|
||||
// Registry of named sync points and the ordering constraints between them.
// Code reports reaching a point through Process(); a test configures the
// constraints through LoadDependency() and switches enforcement on and off
// with EnableProcessing() / DisableProcessing().
class SyncPoint {
|
||||
public:
|
||||
// Returns the shared SyncPoint instance.
static SyncPoint* GetInstance();
|
||||
|
||||
// One ordering edge between two sync point names: `successor` has to wait
// until `predecessor` has been processed.
struct Dependency {
|
||||
std::string predecessor;
|
||||
std::string successor;
|
||||
};
|
||||
// call once at the beginning of a test to setup the dependency between
|
||||
// sync points
|
||||
// Any previously loaded dependencies and the recorded trace are discarded.
void LoadDependency(const std::vector<Dependency>& dependencies);
|
||||
|
||||
// enable sync point processing (disabled on startup)
|
||||
void EnableProcessing();
|
||||
|
||||
// disable sync point processing
|
||||
void DisableProcessing();
|
||||
|
||||
// remove the execution trace of all sync points
|
||||
void ClearTrace();
|
||||
|
||||
// triggered by TEST_SYNC_POINT, blocking execution until all predecessors
|
||||
// are executed.
|
||||
// Returns immediately when processing is disabled.
void Process(const std::string& point);
|
||||
|
||||
// TODO: it might be useful to provide a function that blocks until all
|
||||
// sync points are cleared.
|
||||
|
||||
private:
|
||||
// True when every predecessor registered for `point` is in cleared_points_.
bool PredecessorsAllCleared(const std::string& point);
|
||||
|
||||
// successor/predecessor map loaded from LoadDependency
|
||||
// successors_ is keyed by predecessor name, predecessors_ by successor name.
std::unordered_map<std::string, std::vector<std::string>> successors_;
|
||||
std::unordered_map<std::string, std::vector<std::string>> predecessors_;
|
||||
|
||||
// Locked by EnableProcessing, DisableProcessing, ClearTrace and Process while
// they touch enabled_ / cleared_points_.
std::mutex mutex_;
|
||||
// Process() waits on this until its predecessors are cleared and notifies it
// after recording a point.
std::condition_variable cv_;
|
||||
// sync points that have been passed through
|
||||
std::unordered_set<std::string> cleared_points_;
|
||||
// Process() is a no-op while this is false.
bool enabled_ = false;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
// Use TEST_SYNC_POINT to specify sync points inside code base.
|
||||
// Sync points can have happens-after dependency on other sync points,
|
||||
// configured at runtime via SyncPoint::LoadDependency. This could be
|
||||
// utilized to re-produce race conditions between threads.
|
||||
// See TransactionLogIteratorRace in db_test.cc for an example use case.
|
||||
// TEST_SYNC_POINT is no op in release build.
|
||||
#ifdef NDEBUG
|
||||
// NDEBUG build: the macro expands to nothing, so its argument is never
// evaluated.
#define TEST_SYNC_POINT(x)
|
||||
#else
|
||||
// Otherwise: report reaching point `x` to the shared SyncPoint instance.
#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x)
|
||||
#endif
|
@ -16,6 +16,7 @@
|
||||
|
||||
#include "util/autovector.h"
|
||||
#include "port/port_posix.h"
|
||||
#include "util/thread_local.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
|
@ -30,7 +30,7 @@ class VectorRep : public MemTableRep {
|
||||
// single buffer and pass that in as the parameter to Insert)
|
||||
// REQUIRES: nothing that compares equal to key is currently in the
|
||||
// collection.
|
||||
virtual void Insert(const char* key) override;
|
||||
virtual void Insert(KeyHandle handle) override;
|
||||
|
||||
// Returns true iff an entry that compares equal to key is in the collection.
|
||||
virtual bool Contains(const char* key) const override;
|
||||
@ -106,7 +106,8 @@ class VectorRep : public MemTableRep {
|
||||
const KeyComparator& compare_;
|
||||
};
|
||||
|
||||
void VectorRep::Insert(const char* key) {
|
||||
void VectorRep::Insert(KeyHandle handle) {
|
||||
auto* key = static_cast<char*>(handle);
|
||||
assert(!Contains(key));
|
||||
WriteLock l(&rwlock_);
|
||||
assert(!immutable_);
|
||||
@ -134,7 +135,8 @@ size_t VectorRep::ApproximateMemoryUsage() {
|
||||
}
|
||||
|
||||
VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count)
|
||||
: bucket_(new Bucket()),
|
||||
: MemTableRep(arena),
|
||||
bucket_(new Bucket()),
|
||||
immutable_(false),
|
||||
sorted_(false),
|
||||
compare_(compare) { bucket_.get()->reserve(count); }
|
||||
|
@ -44,7 +44,9 @@ class DummyDB : public StackableDB {
|
||||
return options_.env;
|
||||
}
|
||||
|
||||
virtual const Options& GetOptions() const override {
|
||||
using DB::GetOptions;
|
||||
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
|
||||
override {
|
||||
return options_;
|
||||
}
|
||||
|
||||
@ -68,6 +70,10 @@ class DummyDB : public StackableDB {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Stub override: always returns a null handle.
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
class DummyLogFile : public LogFile {
|
||||
public:
|
||||
/* implicit */
|
||||
@ -345,7 +351,7 @@ class BackupableDBTest {
|
||||
options_.wal_dir = dbname_;
|
||||
// set up backup db options
|
||||
CreateLoggerFromOptions(dbname_, backupdir_, env_,
|
||||
Options(), &logger_);
|
||||
DBOptions(), &logger_);
|
||||
backupable_options_.reset(new BackupableDBOptions(
|
||||
backupdir_, test_backup_env_.get(), true, logger_.get(), true));
|
||||
|
||||
@ -425,6 +431,19 @@ class BackupableDBTest {
|
||||
}
|
||||
}
|
||||
|
||||
void DeleteLogFiles() {
|
||||
std::vector<std::string> delete_logs;
|
||||
env_->GetChildren(dbname_, &delete_logs);
|
||||
for (auto f : delete_logs) {
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
bool ok = ParseFileName(f, &number, &type);
|
||||
if (ok && type == kLogFile) {
|
||||
env_->DeleteFile(dbname_ + "/" + f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// files
|
||||
std::string dbname_;
|
||||
std::string backupdir_;
|
||||
@ -721,10 +740,11 @@ TEST(BackupableDBTest, FailOverwritingBackups) {
|
||||
// create backups 1, 2, 3, 4, 5
|
||||
OpenBackupableDB(true);
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
CloseBackupableDB();
|
||||
DeleteLogFiles();
|
||||
OpenBackupableDB(false);
|
||||
FillDB(db_.get(), 100 * i, 100 * (i + 1));
|
||||
ASSERT_OK(db_->CreateNewBackup(true));
|
||||
CloseBackupableDB();
|
||||
OpenBackupableDB(false);
|
||||
}
|
||||
CloseBackupableDB();
|
||||
|
||||
@ -826,7 +846,7 @@ TEST(BackupableDBTest, RateLimiting) {
|
||||
auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) /
|
||||
backupable_options_->backup_rate_limit;
|
||||
ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time);
|
||||
ASSERT_LT(backup_time, 1.5 * rate_limited_backup_time);
|
||||
ASSERT_LT(backup_time, 2.5 * rate_limited_backup_time);
|
||||
|
||||
CloseBackupableDB();
|
||||
|
||||
@ -838,7 +858,7 @@ TEST(BackupableDBTest, RateLimiting) {
|
||||
auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) /
|
||||
backupable_options_->restore_rate_limit;
|
||||
ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time);
|
||||
ASSERT_LT(restore_time, 1.5 * rate_limited_restore_time);
|
||||
ASSERT_LT(restore_time, 2.5 * rate_limited_restore_time);
|
||||
|
||||
AssertBackupConsistency(0, 0, 100000, 100010);
|
||||
}
|
||||
|
@ -35,7 +35,7 @@ class GeoDBTest {
|
||||
}
|
||||
};
|
||||
|
||||
const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault/";
|
||||
const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault";
|
||||
Options GeoDBTest::options = Options();
|
||||
|
||||
// Insert, Get and Remove
|
||||
@ -106,14 +106,14 @@ TEST(GeoDBTest, Search) {
|
||||
std::vector<GeoObject> values;
|
||||
status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values);
|
||||
ASSERT_TRUE(status.ok());
|
||||
ASSERT_EQ(values.size(), 1);
|
||||
ASSERT_EQ(values.size(), 1U);
|
||||
|
||||
// search all objects centered at 46 degree latitude with
|
||||
// a radius of 2 kilometers. There should be none.
|
||||
values.clear();
|
||||
status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values);
|
||||
ASSERT_TRUE(status.ok());
|
||||
ASSERT_EQ(values.size(), 0);
|
||||
ASSERT_EQ(values.size(), 0U);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -119,15 +119,16 @@ Status DBWithTTL::StripTS(std::string* str) {
|
||||
return st;
|
||||
}
|
||||
|
||||
Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key,
|
||||
Status DBWithTTL::Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& val) {
|
||||
WriteBatch batch;
|
||||
batch.Put(key, val);
|
||||
return Write(opt, &batch);
|
||||
return Write(options, &batch);
|
||||
}
|
||||
|
||||
Status DBWithTTL::Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) {
|
||||
Status st = db_->Get(options, key, value);
|
||||
if (!st.ok()) {
|
||||
@ -140,18 +141,18 @@ Status DBWithTTL::Get(const ReadOptions& options,
|
||||
return StripTS(value);
|
||||
}
|
||||
|
||||
std::vector<Status> DBWithTTL::MultiGet(const ReadOptions& options,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) {
|
||||
std::vector<Status> DBWithTTL::MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
||||
return std::vector<Status>(keys.size(),
|
||||
Status::NotSupported("MultiGet not\
|
||||
supported with TTL"));
|
||||
}
|
||||
|
||||
bool DBWithTTL::KeyMayExist(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found) {
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value, bool* value_found) {
|
||||
bool ret = db_->KeyMayExist(options, key, value, value_found);
|
||||
if (ret && value != nullptr && value_found != nullptr && *value_found) {
|
||||
if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) {
|
||||
@ -161,12 +162,12 @@ bool DBWithTTL::KeyMayExist(const ReadOptions& options,
|
||||
return ret;
|
||||
}
|
||||
|
||||
Status DBWithTTL::Merge(const WriteOptions& opt,
|
||||
const Slice& key,
|
||||
Status DBWithTTL::Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) {
|
||||
WriteBatch batch;
|
||||
batch.Merge(key, value);
|
||||
return Write(opt, &batch);
|
||||
return Write(options, &batch);
|
||||
}
|
||||
|
||||
Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) {
|
||||
@ -208,12 +209,9 @@ Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) {
|
||||
}
|
||||
}
|
||||
|
||||
Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) {
|
||||
return new TtlIterator(db_->NewIterator(opts));
|
||||
}
|
||||
|
||||
void DBWithTTL::TEST_Destroy_DBWithTtl() {
|
||||
((DBImpl*) db_)->TEST_Destroy_DBImpl();
|
||||
// Asks the wrapped DB for an iterator over `column_family` and returns it
// wrapped in a newly allocated TtlIterator.
Iterator* DBWithTTL::NewIterator(const ReadOptions& opts,
                                 ColumnFamilyHandle* column_family) {
  auto base_iter = db_->NewIterator(opts, column_family);
  return new TtlIterator(base_iter);
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -23,30 +23,39 @@ class DBWithTTL : public StackableDB {
|
||||
|
||||
virtual ~DBWithTTL();
|
||||
|
||||
virtual Status Put(const WriteOptions& o, const Slice& key,
|
||||
using StackableDB::Put;
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& val) override;
|
||||
|
||||
virtual Status Get(const ReadOptions& options, const Slice& key,
|
||||
using StackableDB::Get;
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value) override;
|
||||
|
||||
using StackableDB::MultiGet;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options, const std::vector<Slice>& keys,
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) override;
|
||||
|
||||
using StackableDB::KeyMayExist;
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found = nullptr) override;
|
||||
|
||||
virtual Status Merge(const WriteOptions& options, const Slice& key,
|
||||
using StackableDB::Merge;
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) override;
|
||||
|
||||
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
|
||||
|
||||
virtual Iterator* NewIterator(const ReadOptions& opts) override;
|
||||
|
||||
// Simulate a db crash, no elegant closing of database.
|
||||
void TEST_Destroy_DBWithTtl();
|
||||
using StackableDB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& opts,
|
||||
ColumnFamilyHandle* column_family) override;
|
||||
|
||||
virtual DB* GetBaseDB() {
|
||||
return db_;
|
||||
|
Loading…
x
Reference in New Issue
Block a user