diff --git a/.gitignore b/.gitignore index 974991fd8..a3a70ee31 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,10 @@ build_config.mk *_bench *_stress *.out +*.class +*.jar +*.*jnilib* +*.d-e ldb manifest_dump @@ -23,3 +27,5 @@ coverage/COVERAGE_REPORT .gdbhistory .phutil_module_cache tags +java/*.log +java/include/org_rocksdb_*.h diff --git a/HISTORY.md b/HISTORY.md index 0946d441e..003313767 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,11 +1,15 @@ # Rocksdb Change Log -## Unreleased +## Unreleased (will be released in 3.0) +* Column family support ### Public API changes +## 2.8.0 (04/04/2014) + * Removed arena.h from public header files. * By default, checksums are verified on every read from database +* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false * Added is_manual_compaction to CompactionFilter::Context * Added "virtual void WaitForJoin()" in class Env. Default operation is no-op. * Removed BackupEngine::DeleteBackupsNewerThan() function @@ -15,11 +19,18 @@ * Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools * Added a command "checkconsistency" in ldb tool, which checks if file system state matches DB state (file existence and file sizes) +* Separate options related to block based table to a new struct BlockBasedTableOptions +* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy +* Add more counters to perf context. +* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table. ### New Features * If we find one truncated record at the end of the MANIFEST or WAL files, we will ignore it. We assume that writers of these records were interrupted and that we can safely ignore it. 
+* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory(). +* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory(). +* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands. * Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. * Geo-spatial support for locations and radial-search. diff --git a/INSTALL.md b/INSTALL.md index 86934db69..2a91be697 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -67,6 +67,9 @@ libraries. You are on your own. * Please note that some of the optimizations/features are disabled in OSX. We did not run any production workloads on it. +* **iOS**: + * Run: `TARGET_OS=IOS make static_lib` + ## Compilation `make clean; make` will compile librocksdb.a (RocksDB static library) and all the unit tests. You can run all unit tests with `make check`. diff --git a/Makefile b/Makefile index e1e982f15..47aeb5847 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,14 @@ $(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platf # this file is generated by the previous line to set build flags and sources include build_config.mk +ifneq ($(PLATFORM), IOS) +CFLAGS += -g +CXXFLAGS += -g +else +# no debug info for IOS, that will make our library big +OPT += -DNDEBUG +endif + # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. 
ifdef COMPILE_WITH_ASAN # ASAN compile flags @@ -37,8 +45,8 @@ else endif WARNING_FLAGS = -Wall -Werror -Wno-sign-compare -CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) -CXXFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual +CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) +CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual LDFLAGS += $(PLATFORM_LDFLAGS) @@ -57,6 +65,7 @@ TESTS = \ db_test \ block_hash_index_test \ autovector_test \ + column_family_test \ table_properties_collector_test \ arena_test \ auto_roll_logger_test \ @@ -148,11 +157,15 @@ $(SHARED3): endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test format shared_lib all \ + release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ dbg all: $(LIBRARY) $(PROGRAMS) +static_lib: $(LIBRARY) + +shared_lib: $(SHARED) + dbg: $(LIBRARY) $(PROGRAMS) # Will also generate shared libraries. 
@@ -218,8 +231,6 @@ tags: format: build_tools/format-diff.sh -shared_lib: $(SHARED) - # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -260,6 +271,9 @@ arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -404,7 +418,7 @@ ldb: tools/ldb.o $(LIBOBJECTS) # --------------------------------------------------------------------------- JNI_NATIVE_SOURCES = ./java/rocksjni/rocksjni.cc ./java/rocksjni/options.cc ./java/rocksjni/write_batch.cc -JAVA_INCLUDE = -I/usr/lib/jvm/java-openjdk/include/ -I/usr/lib/jvm/java-openjdk/include/linux +JAVA_INCLUDE = -I/usr/lib/jvm/java-openjdk/include/ -I/usr/lib/jvm/java-openjdk/include/linux ROCKSDBJNILIB = ./java/librocksdbjni.so ifeq ($(PLATFORM), OS_MACOSX) @@ -435,20 +449,20 @@ ifeq ($(PLATFORM), IOS) PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer -IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString) +IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString) .cc.o: mkdir -p ios-x86/$(dir $@) - $(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot 
$(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS) + $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS) + xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ .c.o: mkdir -p ios-x86/$(dir $@) - $(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ + $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@ mkdir -p ios-arm/$(dir $@) - $(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ + xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@ lipo ios-x86/$@ ios-arm/$@ -create -output $@ else diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 5a15aca33..94aafd62e 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -87,7 +87,7 @@ PLATFORM_SHARED_CFLAGS="-fPIC" PLATFORM_SHARED_VERSIONED=false # generic port files (working on all platform by #ifdef) go directly in /port -GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` +GENERIC_PORT_FILES=`cd $ROCKSDB_ROOT; find port -name '*.cc' | tr "\n" " "` # On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp case "$TARGET_OS" in @@ -98,6 +98,13 @@ case "$TARGET_OS" in PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name 
" # PORT_FILES=port/darwin/darwin_specific.cc ;; + IOS) + PLATFORM=IOS + COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE" + PLATFORM_SHARED_EXT=dylib + PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " + CROSS_COMPILE=true + ;; Linux) PLATFORM=OS_LINUX COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX" diff --git a/db/c.cc b/db/c.cc index 2e55c0ea1..b566daf64 100644 --- a/db/c.cc +++ b/db/c.cc @@ -25,12 +25,14 @@ #include "rocksdb/universal_compaction.h" #include "rocksdb/statistics.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" using rocksdb::Cache; using rocksdb::Comparator; using rocksdb::CompressionType; using rocksdb::DB; using rocksdb::Env; +using rocksdb::InfoLogLevel; using rocksdb::FileLock; using rocksdb::FilterPolicy; using rocksdb::FlushOptions; @@ -656,6 +658,11 @@ void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) { } } +void rocksdb_options_set_info_log_level( + rocksdb_options_t* opt, int v) { + opt->rep.info_log_level = static_cast(v); +} + void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.write_buffer_size = s; } @@ -714,6 +721,14 @@ void rocksdb_options_set_max_grandparent_overlap_factor( opt->rep.max_grandparent_overlap_factor = n; } +void rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t* opt, int* level_values, size_t num_levels) { + opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels); + for (size_t i = 0; i < num_levels; ++i) { + opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i]; + } +} + void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { opt->rep.statistics = rocksdb::CreateDBStatistics(); } @@ -857,6 +872,24 @@ void rocksdb_options_set_advise_random_on_open( opt->rep.advise_random_on_open = v; } +void rocksdb_options_set_access_hint_on_compaction_start( + rocksdb_options_t* opt, int v) { + switch(v) { + case 0: + opt->rep.access_hint_on_compaction_start = 
rocksdb::Options::NONE; + break; + case 1: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::NORMAL; + break; + case 2: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::SEQUENTIAL; + break; + case 3: + opt->rep.access_hint_on_compaction_start = rocksdb::Options::WILLNEED; + break; + } +} + void rocksdb_options_set_use_adaptive_mutex( rocksdb_options_t* opt, unsigned char v) { opt->rep.use_adaptive_mutex = v; @@ -867,6 +900,11 @@ void rocksdb_options_set_bytes_per_sync( opt->rep.bytes_per_sync = v; } +void rocksdb_options_set_verify_checksums_in_compaction( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.verify_checksums_in_compaction = v; +} + void rocksdb_options_set_filter_deletes( rocksdb_options_t* opt, unsigned char v) { opt->rep.filter_deletes = v; @@ -1003,11 +1041,48 @@ void rocksdb_options_set_hash_link_list_rep( opt->rep.memtable_factory.reset(factory); } +void rocksdb_options_set_plain_table_factory( + rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness) { + static rocksdb::TableFactory* factory = 0; + if (!factory) { + factory = rocksdb::NewPlainTableFactory( + user_key_len, bloom_bits_per_key, + hash_table_ratio, index_sparseness); + } + opt->rep.table_factory.reset(factory); +} + void rocksdb_options_set_max_successive_merges( rocksdb_options_t* opt, size_t v) { opt->rep.max_successive_merges = v; } +void rocksdb_options_set_min_partial_merge_operands( + rocksdb_options_t* opt, uint32_t v) { + opt->rep.min_partial_merge_operands = v; +} + +void rocksdb_options_set_bloom_locality( + rocksdb_options_t* opt, uint32_t v) { + opt->rep.bloom_locality = v; +} + +void rocksdb_options_set_allow_thread_local( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_thread_local = v; +} + +void rocksdb_options_set_inplace_update_support( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.inplace_update_support = v; +} + +void 
rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t* opt, size_t v) { + opt->rep.inplace_update_num_locks = v; +} + void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { opt->rep.compaction_style = static_cast(style); } @@ -1022,21 +1097,14 @@ DB::OpenForReadOnly DB::MultiGet DB::KeyMayExist DB::GetOptions -DB::GetLiveFiles DB::GetSortedWalFiles DB::GetLatestSequenceNumber DB::GetUpdatesSince -DB::DeleteFile DB::GetDbIdentity DB::RunManualCompaction custom cache compaction_filter -max_bytes_for_level_multiplier_additional -access_hint_on_compaction_start -table_factory table_properties_collectors -inplace_update_support -inplace_update_num_locks */ rocksdb_comparator_t* rocksdb_comparator_create( diff --git a/db/c_test.c b/db/c_test.c index 4a7957b14..e6c5a9e67 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -443,6 +443,7 @@ int main(int argc, char** argv) { rocksdb_options_set_filter_policy(options, policy); rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4); + rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16); db = rocksdb_open(options, dbname, &err); CheckNoError(err); diff --git a/db/column_family.cc b/db/column_family.cc new file mode 100644 index 000000000..45ea22c23 --- /dev/null +++ b/db/column_family.cc @@ -0,0 +1,489 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/column_family.h" + +#include +#include +#include + +#include "db/db_impl.h" +#include "db/version_set.h" +#include "db/internal_stats.h" +#include "db/compaction_picker.h" +#include "db/table_properties_collector.h" +#include "util/autovector.h" +#include "util/hash_skiplist_rep.h" + +namespace rocksdb { + +ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, + DBImpl* db, port::Mutex* mutex) + : cfd_(cfd), db_(db), mutex_(mutex) { + if (cfd_ != nullptr) { + cfd_->Ref(); + } +} + +ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { + if (cfd_ != nullptr) { + DBImpl::DeletionState deletion_state; + mutex_->Lock(); + if (cfd_->Unref()) { + delete cfd_; + } + db_->FindObsoleteFiles(deletion_state, false, true); + mutex_->Unlock(); + if (deletion_state.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles(deletion_state); + } + } +} + +uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } + +namespace { +// Fix user-supplied options to be reasonable +template +static void ClipToRange(T* ptr, V minvalue, V maxvalue) { + if (static_cast(*ptr) > maxvalue) *ptr = maxvalue; + if (static_cast(*ptr) < minvalue) *ptr = minvalue; +} +} // anonymous namespace + +ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, + const ColumnFamilyOptions& src) { + ColumnFamilyOptions result = src; + result.comparator = icmp; + result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr; + ClipToRange(&result.write_buffer_size, + ((size_t)64) << 10, ((size_t)64) << 30); + // if user sets arena_block_size, we trust user to use this value. 
Otherwise, + // calculate a proper value from writer_buffer_size; + if (result.arena_block_size <= 0) { + result.arena_block_size = result.write_buffer_size / 10; + } + result.min_write_buffer_number_to_merge = + std::min(result.min_write_buffer_number_to_merge, + result.max_write_buffer_number - 1); + if (result.block_cache == nullptr && !result.no_block_cache) { + result.block_cache = NewLRUCache(8 << 20); + } + result.compression_per_level = src.compression_per_level; + if (result.block_size_deviation < 0 || result.block_size_deviation > 100) { + result.block_size_deviation = 0; + } + if (result.max_mem_compaction_level >= result.num_levels) { + result.max_mem_compaction_level = result.num_levels - 1; + } + if (result.soft_rate_limit > result.hard_rate_limit) { + result.soft_rate_limit = result.hard_rate_limit; + } + if (!result.prefix_extractor) { + assert(result.memtable_factory); + Slice name = result.memtable_factory->Name(); + if (name.compare("HashSkipListRepFactory") == 0 || + name.compare("HashLinkListRepFactory") == 0) { + result.memtable_factory = std::make_shared(); + } + } + + // -- Sanitize the table properties collector + // All user defined properties collectors will be wrapped by + // UserKeyTablePropertiesCollector since for them they only have the + // knowledge of the user keys; internal keys are invisible to them. 
+ auto& collectors = result.table_properties_collectors; + for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) { + assert(collectors[i]); + collectors[i] = + std::make_shared(collectors[i]); + } + // Add collector to collect internal key statistics + collectors.push_back(std::make_shared()); + + return result; +} + +int SuperVersion::dummy = 0; +void* const SuperVersion::kSVInUse = &SuperVersion::dummy; +void* const SuperVersion::kSVObsolete = nullptr; + +SuperVersion::~SuperVersion() { + for (auto td : to_delete) { + delete td; + } +} + +SuperVersion* SuperVersion::Ref() { + refs.fetch_add(1, std::memory_order_relaxed); + return this; +} + +bool SuperVersion::Unref() { + // fetch_sub returns the previous value of ref + uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed); + assert(previous_refs > 0); + return previous_refs == 1; +} + +void SuperVersion::Cleanup() { + assert(refs.load(std::memory_order_relaxed) == 0); + imm->Unref(&to_delete); + MemTable* m = mem->Unref(); + if (m != nullptr) { + to_delete.push_back(m); + } + current->Unref(); +} + +void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm, + Version* new_current) { + mem = new_mem; + imm = new_imm; + current = new_current; + mem->Ref(); + imm->Ref(); + current->Ref(); + refs.store(1, std::memory_order_relaxed); +} + +namespace { +void SuperVersionUnrefHandle(void* ptr) { + // UnrefHandle is called when a thread exists or a ThreadLocalPtr gets + // destroyed. When former happens, the thread shouldn't see kSVInUse. + // When latter happens, we are in ~ColumnFamilyData(), no get should happen as + // well. 
+ SuperVersion* sv = static_cast(ptr); + if (sv->Unref()) { + sv->db_mutex->Lock(); + sv->Cleanup(); + sv->db_mutex->Unlock(); + delete sv; + } +} +} // anonymous namespace + +ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id, + const std::string& name, + Version* dummy_versions, Cache* table_cache, + const ColumnFamilyOptions& options, + const DBOptions* db_options, + const EnvOptions& storage_options, + ColumnFamilySet* column_family_set) + : id_(id), + name_(name), + dummy_versions_(dummy_versions), + current_(nullptr), + refs_(0), + dropped_(false), + internal_comparator_(options.comparator), + internal_filter_policy_(options.filter_policy), + options_(*db_options, SanitizeOptions(&internal_comparator_, + &internal_filter_policy_, options)), + mem_(nullptr), + imm_(options.min_write_buffer_number_to_merge), + super_version_(nullptr), + super_version_number_(0), + local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)), + next_(nullptr), + prev_(nullptr), + log_number_(0), + need_slowdown_for_num_level0_files_(false), + column_family_set_(column_family_set) { + Ref(); + + // if dummy_versions is nullptr, then this is a dummy column family. 
+ if (dummy_versions != nullptr) { + internal_stats_.reset(new InternalStats(options.num_levels, db_options->env, + db_options->statistics.get())); + table_cache_.reset( + new TableCache(dbname, &options_, storage_options, table_cache)); + if (options_.compaction_style == kCompactionStyleUniversal) { + compaction_picker_.reset( + new UniversalCompactionPicker(&options_, &internal_comparator_)); + } else { + compaction_picker_.reset( + new LevelCompactionPicker(&options_, &internal_comparator_)); + } + + Log(options_.info_log, "Options for column family \"%s\":\n", + name.c_str()); + const ColumnFamilyOptions* cf_options = &options_; + cf_options->Dump(options_.info_log.get()); + } +} + +// DB mutex held +ColumnFamilyData::~ColumnFamilyData() { + assert(refs_ == 0); + // remove from linked list + auto prev = prev_; + auto next = next_; + prev->next_ = next; + next->prev_ = prev; + + // it's nullptr for dummy CFD + if (column_family_set_ != nullptr) { + // remove from column_family_set + column_family_set_->RemoveColumnFamily(this); + } + + if (current_ != nullptr) { + current_->Unref(); + } + + if (super_version_ != nullptr) { + // Release SuperVersion reference kept in ThreadLocalPtr. + // This must be done outside of mutex_ since unref handler can lock mutex. 
+ super_version_->db_mutex->Unlock(); + local_sv_.reset(); + super_version_->db_mutex->Lock(); + + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + super_version_ = nullptr; + } + + if (dummy_versions_ != nullptr) { + // List must be empty + assert(dummy_versions_->next_ == dummy_versions_); + delete dummy_versions_; + } + + if (mem_ != nullptr) { + delete mem_->Unref(); + } + autovector to_delete; + imm_.current()->Unref(&to_delete); + for (MemTable* m : to_delete) { + delete m; + } +} + +void ColumnFamilyData::SetCurrent(Version* current) { + current_ = current; + need_slowdown_for_num_level0_files_ = + (options_.level0_slowdown_writes_trigger >= 0 && + current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger); +} + +void ColumnFamilyData::CreateNewMemtable() { + assert(current_ != nullptr); + if (mem_ != nullptr) { + delete mem_->Unref(); + } + mem_ = new MemTable(internal_comparator_, options_); + mem_->Ref(); +} + +Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) { + return compaction_picker_->PickCompaction(current_, log_buffer); +} + +Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { + return compaction_picker_->CompactRange(current_, input_level, output_level, + begin, end, compaction_end); +} + +SuperVersion* ColumnFamilyData::InstallSuperVersion( + SuperVersion* new_superversion, port::Mutex* db_mutex) { + new_superversion->db_mutex = db_mutex; + new_superversion->Init(mem_, imm_.current(), current_); + SuperVersion* old_superversion = super_version_; + super_version_ = new_superversion; + ++super_version_number_; + super_version_->version_number = super_version_number_; + if (old_superversion != nullptr && old_superversion->Unref()) { + old_superversion->Cleanup(); + return 
old_superversion; // will let caller delete outside of mutex + } + return nullptr; +} + +void ColumnFamilyData::ResetThreadLocalSuperVersions() { + autovector sv_ptrs; + local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete); + for (auto ptr : sv_ptrs) { + assert(ptr); + if (ptr == SuperVersion::kSVInUse) { + continue; + } + auto sv = static_cast(ptr); + if (sv->Unref()) { + sv->Cleanup(); + delete sv; + } + } +} + +ColumnFamilySet::ColumnFamilySet(const std::string& dbname, + const DBOptions* db_options, + const EnvOptions& storage_options, + Cache* table_cache) + : max_column_family_(0), + dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr, + ColumnFamilyOptions(), db_options, + storage_options_, nullptr)), + default_cfd_cache_(nullptr), + db_name_(dbname), + db_options_(db_options), + storage_options_(storage_options), + table_cache_(table_cache), + spin_lock_(ATOMIC_FLAG_INIT) { + // initialize linked list + dummy_cfd_->prev_ = dummy_cfd_; + dummy_cfd_->next_ = dummy_cfd_; +} + +ColumnFamilySet::~ColumnFamilySet() { + while (column_family_data_.size() > 0) { + // cfd destructor will delete itself from column_family_data_ + auto cfd = column_family_data_.begin()->second; + cfd->Unref(); + delete cfd; + } + dummy_cfd_->Unref(); + delete dummy_cfd_; +} + +ColumnFamilyData* ColumnFamilySet::GetDefault() const { + assert(default_cfd_cache_ != nullptr); + return default_cfd_cache_; +} + +ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const { + auto cfd_iter = column_family_data_.find(id); + if (cfd_iter != column_family_data_.end()) { + return cfd_iter->second; + } else { + return nullptr; + } +} + +ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name) + const { + auto cfd_iter = column_families_.find(name); + if (cfd_iter != column_families_.end()) { + auto cfd = GetColumnFamily(cfd_iter->second); + assert(cfd != nullptr); + return cfd; + } else { + return nullptr; + } +} + +uint32_t 
ColumnFamilySet::GetNextColumnFamilyID() { + return ++max_column_family_; +} + +uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; } + +void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) { + max_column_family_ = std::max(new_max_column_family, max_column_family_); +} + +// under a DB mutex +ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( + const std::string& name, uint32_t id, Version* dummy_versions, + const ColumnFamilyOptions& options) { + assert(column_families_.find(name) == column_families_.end()); + ColumnFamilyData* new_cfd = + new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_, + options, db_options_, storage_options_, this); + Lock(); + column_families_.insert({name, id}); + column_family_data_.insert({id, new_cfd}); + Unlock(); + max_column_family_ = std::max(max_column_family_, id); + // add to linked list + new_cfd->next_ = dummy_cfd_; + auto prev = dummy_cfd_->prev_; + new_cfd->prev_ = prev; + prev->next_ = new_cfd; + dummy_cfd_->prev_ = new_cfd; + if (id == 0) { + default_cfd_cache_ = new_cfd; + } + return new_cfd; +} + +void ColumnFamilySet::Lock() { + // spin lock + while (spin_lock_.test_and_set(std::memory_order_acquire)) { + } +} + +void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); } + +// REQUIRES: DB mutex held +void ColumnFamilySet::FreeDeadColumnFamilies() { + autovector to_delete; + for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) { + if (cfd->refs_ == 0) { + to_delete.push_back(cfd); + } + } + for (auto cfd : to_delete) { + // this is very rare, so it's not a problem that we do it under a mutex + delete cfd; + } +} + +// under a DB mutex +void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { + auto cfd_iter = column_family_data_.find(cfd->GetID()); + assert(cfd_iter != column_family_data_.end()); + Lock(); + column_family_data_.erase(cfd_iter); + column_families_.erase(cfd->GetName()); + Unlock(); +} + 
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) { + if (column_family_id == 0) { + // optimization for common case + current_ = column_family_set_->GetDefault(); + } else { + // maybe outside of db mutex, should lock + column_family_set_->Lock(); + current_ = column_family_set_->GetColumnFamily(column_family_id); + column_family_set_->Unlock(); + } + handle_.SetCFD(current_); + return current_ != nullptr; +} + +uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const { + assert(current_ != nullptr); + return current_->GetLogNumber(); +} + +MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const { + assert(current_ != nullptr); + return current_->mem(); +} + +const Options* ColumnFamilyMemTablesImpl::GetOptions() const { + assert(current_ != nullptr); + return current_->options(); +} + +ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { + assert(current_ != nullptr); + return &handle_; +} + +} // namespace rocksdb diff --git a/db/column_family.h b/db/column_family.h new file mode 100644 index 000000000..42e02f07d --- /dev/null +++ b/db/column_family.h @@ -0,0 +1,408 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "db/memtable_list.h" +#include "db/write_batch_internal.h" +#include "db/table_cache.h" +#include "util/thread_local.h" + +namespace rocksdb { + +class Version; +class VersionSet; +class MemTable; +class MemTableListVersion; +class CompactionPicker; +class Compaction; +class InternalKey; +class InternalStats; +class ColumnFamilyData; +class DBImpl; +class LogBuffer; + +// ColumnFamilyHandleImpl is the class that clients use to access different +// column families. It has non-trivial destructor, which gets called when client +// is done using the column family +class ColumnFamilyHandleImpl : public ColumnFamilyHandle { + public: + // create while holding the mutex + ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex); + // destroy without mutex + virtual ~ColumnFamilyHandleImpl(); + virtual ColumnFamilyData* cfd() const { return cfd_; } + + virtual uint32_t GetID() const; + + private: + ColumnFamilyData* cfd_; + DBImpl* db_; + port::Mutex* mutex_; +}; + +// Does not ref-count ColumnFamilyData +// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter +// calls DBImpl methods. When this happens, MemTableInserter need access to +// ColumnFamilyHandle (same as the client would need). 
In that case, we feed +// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl +// methods +class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { + public: + ColumnFamilyHandleInternal() + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {} + + void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; } + virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } + + private: + ColumnFamilyData* internal_cfd_; +}; + +// holds references to memtable, all immutable memtables and version +struct SuperVersion { + MemTable* mem; + MemTableListVersion* imm; + Version* current; + std::atomic refs; + // We need to_delete because during Cleanup(), imm->Unref() returns + // all memtables that we need to free through this vector. We then + // delete all those memtables outside of mutex, during destruction + autovector to_delete; + // Version number of the current SuperVersion + uint64_t version_number; + port::Mutex* db_mutex; + + // should be called outside the mutex + SuperVersion() = default; + ~SuperVersion(); + SuperVersion* Ref(); + + bool Unref(); + + // call these two methods with db mutex held + // Cleanup unrefs mem, imm and current. Also, it stores all memtables + // that needs to be deleted in to_delete vector. Unrefing those + // objects needs to be done in the mutex + void Cleanup(); + void Init(MemTable* new_mem, MemTableListVersion* new_imm, + Version* new_current); + + // The value of dummy is not actually used. kSVInUse takes its address as a + // mark in the thread local storage to indicate the SuperVersion is in use + // by thread. This way, the value of kSVInUse is guaranteed to have no + // conflict with SuperVersion object address and portable on different + // platform. 
+ static int dummy; + static void* const kSVInUse; + static void* const kSVObsolete; +}; + +extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, + const InternalFilterPolicy* ipolicy, + const ColumnFamilyOptions& src); + +class ColumnFamilySet; + +// This class keeps all the data that a column family needs. It's mosly dumb and +// used just to provide access to metadata. +// Most methods require DB mutex held, unless otherwise noted +class ColumnFamilyData { + public: + ~ColumnFamilyData(); + + // thread-safe + uint32_t GetID() const { return id_; } + // thread-safe + const std::string& GetName() const { return name_; } + + void Ref() { ++refs_; } + // will just decrease reference count to 0, but will not delete it. returns + // true if the ref count was decreased to zero. in that case, it can be + // deleted by the caller immediatelly, or later, by calling + // FreeDeadColumnFamilies() + bool Unref() { + assert(refs_ > 0); + return --refs_ == 0; + } + + // This can only be called from single-threaded VersionSet::LogAndApply() + // After dropping column family no other operation on that column family + // will be executed. All the files and memory will be, however, kept around + // until client drops the column family handle. That way, client can still + // access data from dropped column family. + // Column family can be dropped and still alive. In that state: + // *) Column family is not included in the iteration. + // *) Compaction and flush is not executed on the dropped column family. + // *) Client can continue writing and reading from column family. However, all + // writes stay in the current memtable. 
+ // When the dropped column family is unreferenced, then we: + // *) delete all memory associated with that column family + // *) delete all the files associated with that column family + void SetDropped() { + // can't drop default CF + assert(id_ != 0); + dropped_ = true; + } + bool IsDropped() const { return dropped_; } + + // thread-safe + int NumberLevels() const { return options_.num_levels; } + + void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } + uint64_t GetLogNumber() const { return log_number_; } + + // thread-safe + const Options* options() const { return &options_; } + + InternalStats* internal_stats() { return internal_stats_.get(); } + + MemTableList* imm() { return &imm_; } + MemTable* mem() { return mem_; } + Version* current() { return current_; } + Version* dummy_versions() { return dummy_versions_; } + void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } + void SetCurrent(Version* current); + void CreateNewMemtable(); + + TableCache* table_cache() { return table_cache_.get(); } + + // See documentation in compaction_picker.h + Compaction* PickCompaction(LogBuffer* log_buffer); + Compaction* CompactRange(int input_level, int output_level, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end); + + CompactionPicker* compaction_picker() { return compaction_picker_.get(); } + // thread-safe + const Comparator* user_comparator() const { + return internal_comparator_.user_comparator(); + } + // thread-safe + const InternalKeyComparator& internal_comparator() const { + return internal_comparator_; + } + + SuperVersion* GetSuperVersion() { return super_version_; } + // thread-safe + ThreadLocalPtr* GetThreadLocalSuperVersion() const { return local_sv_.get(); } + // thread-safe + uint64_t GetSuperVersionNumber() const { + return super_version_number_.load(); + } + // will return a pointer to SuperVersion* if previous SuperVersion + // if its reference count is zero and needs deletion or nullptr if not + 
// As argument takes a pointer to allocated SuperVersion to enable
+  // the clients to allocate SuperVersion outside of mutex.
+  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
+                                    port::Mutex* db_mutex);
+
+  void ResetThreadLocalSuperVersions();
+
+  // A Flag indicating whether write needs to slowdown because of there are
+  // too many number of level0 files.
+  bool NeedSlowdownForNumLevel0Files() const {
+    return need_slowdown_for_num_level0_files_;
+  }
+
+ private:
+  friend class ColumnFamilySet;
+  ColumnFamilyData(const std::string& dbname, uint32_t id,
+                   const std::string& name, Version* dummy_versions,
+                   Cache* table_cache, const ColumnFamilyOptions& options,
+                   const DBOptions* db_options,
+                   const EnvOptions& storage_options,
+                   ColumnFamilySet* column_family_set);
+
+  uint32_t id_;
+  const std::string name_;
+  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;         // == dummy_versions->prev_
+
+  int refs_;      // outstanding references to ColumnFamilyData
+  bool dropped_;  // true if client dropped it
+
+  const InternalKeyComparator internal_comparator_;
+  const InternalFilterPolicy internal_filter_policy_;
+
+  Options const options_;
+
+  std::unique_ptr<TableCache> table_cache_;
+
+  std::unique_ptr<InternalStats> internal_stats_;
+
+  MemTable* mem_;
+  MemTableList imm_;
+  SuperVersion* super_version_;
+
+  // An ordinal representing the current SuperVersion. Updated by
+  // InstallSuperVersion(), i.e. incremented every time super_version_
+  // changes.
+  std::atomic<uint64_t> super_version_number_;
+
+  // Thread's local copy of SuperVersion pointer
+  // This needs to be destructed before mutex_
+  std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+  // pointers for a circular linked list. we use it to support iterations
+  // that can be concurrent with writes
+  ColumnFamilyData* next_;
+  ColumnFamilyData* prev_;
+
+  // This is the earliest log file number that contains data from this
+  // Column Family.
All earlier log files must be ignored and not + // recovered from + uint64_t log_number_; + + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; + + // An object that keeps all the compaction stats + // and picks the next compaction + std::unique_ptr compaction_picker_; + + ColumnFamilySet* column_family_set_; +}; + +// ColumnFamilySet has interesting thread-safety requirements +// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB +// mutex. Inside, column_family_data_ and column_families_ will be protected +// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from +// VersionSet::LogAndApply() in the normal runtime. It is also called +// during Recovery and in DumpManifest(). RemoveColumnFamily() is called +// from ColumnFamilyData destructor +// * Iteration -- hold DB mutex, but you can release it in the body of +// iteration. If you release DB mutex in body, reference the column +// family before the mutex and unreference after you unlock, since the column +// family might get dropped when the DB mutex is released +// * GetDefault() -- thread safe +// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock() +// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily() -- +// inside of DB mutex +class ColumnFamilySet { + public: + // ColumnFamilySet supports iteration + class iterator { + public: + explicit iterator(ColumnFamilyData* cfd) + : current_(cfd) {} + iterator& operator++() { + // dummy is never dead or dropped, so this will never be infinite + do { + current_ = current_->next_; + } while (current_->refs_ == 0 || current_->IsDropped()); + return *this; + } + bool operator!=(const iterator& other) { + return this->current_ != other.current_; + } + ColumnFamilyData* operator*() { return current_; } + + private: + ColumnFamilyData* current_; + }; + + ColumnFamilySet(const std::string& dbname, 
const DBOptions* db_options,
+                  const EnvOptions& storage_options, Cache* table_cache);
+  ~ColumnFamilySet();
+
+  ColumnFamilyData* GetDefault() const;
+  // GetColumnFamily() calls return nullptr if column family is not found
+  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+  // this call will return the next available column family ID. it guarantees
+  // that there is no column family with id greater than or equal to the
+  // returned value in the current running instance or anytime in RocksDB
+  // instance history.
+  uint32_t GetNextColumnFamilyID();
+  uint32_t GetMaxColumnFamily();
+  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+
+  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+                                       Version* dummy_version,
+                                       const ColumnFamilyOptions& options);
+
+  iterator begin() { return iterator(dummy_cfd_->next_); }
+  iterator end() { return iterator(dummy_cfd_); }
+
+  void Lock();
+  void Unlock();
+
+  // REQUIRES: DB mutex held
+  // Don't call while iterating over ColumnFamilySet
+  void FreeDeadColumnFamilies();
+
+ private:
+  friend class ColumnFamilyData;
+  // helper function that gets called from cfd destructor
+  // REQUIRES: DB mutex held
+  void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+  // column_families_ and column_family_data_ need to be protected:
+  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
+  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
+  //   (if both, respect the ordering to avoid deadlock!)
+  std::unordered_map<std::string, uint32_t> column_families_;
+  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
+
+  uint32_t max_column_family_;
+  ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since default column family always exists
+  // We are also not responsible for cleaning up default_cfd_cache_.
This is + // just a cache that makes common case (accessing default column family) + // faster + ColumnFamilyData* default_cfd_cache_; + + const std::string db_name_; + const DBOptions* const db_options_; + const EnvOptions storage_options_; + Cache* table_cache_; + std::atomic_flag spin_lock_; +}; + +// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access +// memtables of different column families (specified by ID in the write batch) +class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { + public: + explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) + : column_family_set_(column_family_set), current_(nullptr) {} + + // sets current_ to ColumnFamilyData with column_family_id + // returns false if column family doesn't exist + bool Seek(uint32_t column_family_id) override; + + // Returns log number of the selected column family + uint64_t GetLogNumber() const override; + + // REQUIRES: Seek() called first + virtual MemTable* GetMemTable() const override; + + // Returns options for selected column family + // REQUIRES: Seek() called first + virtual const Options* GetOptions() const override; + + // Returns column family handle for the selected column family + virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; + + private: + ColumnFamilySet* column_family_set_; + ColumnFamilyData* current_; + ColumnFamilyHandleInternal handle_; +}; + +} // namespace rocksdb diff --git a/db/column_family_test.cc b/db/column_family_test.cc new file mode 100644 index 000000000..16e98629f --- /dev/null +++ b/db/column_family_test.cc @@ -0,0 +1,857 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <vector>
+#include <string>
+
+#include "db/db_impl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/db.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+namespace {
+std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+}  // anonymous namespace
+
+class ColumnFamilyTest {
+ public:
+  ColumnFamilyTest() : rnd_(139) {
+    env_ = Env::Default();
+    dbname_ = test::TmpDir() + "/column_family_test";
+    db_options_.create_if_missing = true;
+    DestroyDB(dbname_, Options(db_options_, column_family_options_));
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    names_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  Status TryOpen(std::vector<std::string> cf,
+                 std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    names_.clear();
+    for (size_t i = 0; i < cf.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(
+          cf[i], options.size() == 0 ?
column_family_options_ : options[i])); + names_.push_back(cf[i]); + } + return DB::Open(db_options_, dbname_, column_families, &handles_, &db_); + } + + void Open(std::vector cf, + std::vector options = {}) { + ASSERT_OK(TryOpen(cf, options)); + } + + void Open() { + Open({"default"}); + } + + DBImpl* dbfull() { return reinterpret_cast(db_); } + + int GetProperty(int cf, std::string property) { + std::string value; + ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value)); + return std::stoi(value); + } + + void Destroy() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + names_.clear(); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); + } + + void CreateColumnFamilies( + const std::vector& cfs, + const std::vector options = {}) { + int cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + names_.resize(cfi + cfs.size()); + for (size_t i = 0; i < cfs.size(); ++i) { + ASSERT_OK(db_->CreateColumnFamily( + options.size() == 0 ? 
column_family_options_ : options[i], cfs[i], + &handles_[cfi])); + names_[cfi] = cfs[i]; + cfi++; + } + } + + void Reopen(const std::vector options = {}) { + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + Close(); + assert(options.size() == 0 || names.size() == options.size()); + Open(names, options); + } + + void CreateColumnFamiliesAndReopen(const std::vector& cfs) { + CreateColumnFamilies(cfs); + Reopen(); + } + + void DropColumnFamilies(const std::vector& cfs) { + for (auto cf : cfs) { + ASSERT_OK(db_->DropColumnFamily(handles_[cf])); + delete handles_[cf]; + handles_[cf] = nullptr; + names_[cf] = ""; + } + } + + void PutRandomData(int cf, int num, int key_value_size) { + for (int i = 0; i < num; ++i) { + // 10 bytes for key, rest is value + ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10), + RandomString(&rnd_, key_value_size - 10))); + } + } + + void WaitForFlush(int cf) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + } + + void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); } + + Status Put(int cf, const std::string& key, const std::string& value) { + return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value)); + } + Status Merge(int cf, const std::string& key, const std::string& value) { + return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value)); + } + Status Flush(int cf) { + return db_->Flush(FlushOptions(), handles_[cf]); + } + + std::string Get(int cf, const std::string& key) { + ReadOptions options; + options.verify_checksums = true; + std::string result; + Status s = db_->Get(options, handles_[cf], Slice(key), &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + void CompactAll(int cf) { + ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr)); + } + + void Compact(int cf, const Slice& start, const Slice& limit) { + 
ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit)); + } + + int NumTableFilesAtLevel(int level, int cf) { + return GetProperty(cf, + "rocksdb.num-files-at-level" + std::to_string(level)); + } + + // Return spread of files per level + std::string FilesPerLevel(int cf) { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) { + int f = NumTableFilesAtLevel(level, cf); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + int CountLiveFiles(int cf) { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + return static_cast(metadata.size()); + } + + // Do n memtable flushes, each of which produces an sstable + // covering the range [small,large]. + void MakeTables(int cf, int n, const std::string& small, + const std::string& large) { + for (int i = 0; i < n; i++) { + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf])); + } + } + + int CountLiveLogFiles() { + int micros_wait_for_log_deletion = 20000; + env_->SleepForMicroseconds(micros_wait_for_log_deletion); + int ret = 0; + VectorLogPtr wal_files; + Status s; + // GetSortedWalFiles is a flakey function -- it gets all the wal_dir + // children files and then later checks for their existance. if some of the + // log files doesn't exist anymore, it reports an error. it does all of this + // without DB mutex held, so if a background process deletes the log file + // while the function is being executed, it returns an error. 
We retry the
+    // function 10 times to avoid the error failing the test
+    for (int retries = 0; retries < 10; ++retries) {
+      wal_files.clear();
+      s = db_->GetSortedWalFiles(wal_files);
+      if (s.ok()) {
+        break;
+      }
+    }
+    ASSERT_OK(s);
+    for (const auto& wal : wal_files) {
+      if (wal->Type() == kAliveLogFile) {
+        ++ret;
+      }
+    }
+    return ret;
+  }
+
+  void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+    assert(num_per_cf.size() == handles_.size());
+
+    for (size_t i = 0; i < num_per_cf.size(); ++i) {
+      ASSERT_EQ(num_per_cf[i],
+                GetProperty(i, "rocksdb.num-immutable-mem-table"));
+    }
+  }
+
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0) {
+    const EnvOptions soptions;
+    unique_ptr<SequentialFile> srcfile;
+    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+    unique_ptr<WritableFile> destfile;
+    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+    if (size == 0) {
+      // default argument means copy everything
+      ASSERT_OK(env_->GetFileSize(source, &size));
+    }
+
+    char buffer[4096];
+    Slice slice;
+    while (size > 0) {
+      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+      ASSERT_OK(srcfile->Read(one, &slice, buffer));
+      ASSERT_OK(destfile->Append(slice));
+      size -= slice.size();
+    }
+    ASSERT_OK(destfile->Close());
+  }
+
+  std::vector<ColumnFamilyHandle*> handles_;
+  std::vector<std::string> names_;
+  ColumnFamilyOptions column_family_options_;
+  DBOptions db_options_;
+  std::string dbname_;
+  DB* db_ = nullptr;
+  Env* env_;
+  Random rnd_;
+};
+
+TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
+  for (int iter = 0; iter < 3; ++iter) {
+    Open();
+    CreateColumnFamilies({"one", "two", "three"});
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      ASSERT_EQ(i, cfh->GetID());
+    }
+    if (iter == 1) {
+      Reopen();
+    }
+    DropColumnFamilies({3});
+    Reopen();
+    if (iter == 2) {
+      // this tests if max_column_family is correctly persisted with
+      // WriteSnapshot()
+      Reopen();
+    }
+
CreateColumnFamilies({"three2"}); + // ID 3 that was used for dropped column family "three" should not be reused + auto cfh3 = reinterpret_cast(handles_[3]); + ASSERT_EQ(4, cfh3->GetID()); + Close(); + Destroy(); + } +} + + +TEST(ColumnFamilyTest, AddDrop) { + Open(); + CreateColumnFamilies({"one", "two", "three"}); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(2, "fodor")); + DropColumnFamilies({2}); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + CreateColumnFamilies({"four"}); + ASSERT_EQ("NOT_FOUND", Get(3, "fodor")); + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_EQ("mirko", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(3, "fodor")); + Close(); + ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument()); + Open({"default", "one", "three", "four"}); + DropColumnFamilies({1}); + Reopen(); + Close(); + + std::vector families; + ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families)); + sort(families.begin(), families.end()); + ASSERT_TRUE(families == + std::vector({"default", "four", "three"})); +} + +TEST(ColumnFamilyTest, DropTest) { + // first iteration - dont reopen DB before dropping + // second iteration - reopen DB before dropping + for (int iter = 0; iter < 2; ++iter) { + Open({"default"}); + CreateColumnFamiliesAndReopen({"pikachu"}); + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i))); + } + ASSERT_OK(Flush(1)); + + if (iter == 1) { + Reopen(); + } + ASSERT_EQ("bar1", Get(1, "1")); + + ASSERT_EQ(CountLiveFiles(1), 1); + DropColumnFamilies({1}); + // make sure that all files are deleted when we drop the column family + ASSERT_EQ(CountLiveFiles(1), 0); + Destroy(); + } +} + +TEST(ColumnFamilyTest, WriteBatchFailure) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + WriteBatch batch; + batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + DropColumnFamilies({1}); + Status s = 
db_->Write(WriteOptions(), &batch); + ASSERT_TRUE(s.IsInvalidArgument()); + Close(); +} + +TEST(ColumnFamilyTest, ReadWrite) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + ASSERT_OK(Put(0, "foo", "v1")); + ASSERT_OK(Put(0, "bar", "v2")); + ASSERT_OK(Put(1, "mirko", "v3")); + ASSERT_OK(Put(0, "foo", "v2")); + ASSERT_OK(Put(2, "fodor", "v5")); + + for (int iter = 0; iter <= 3; ++iter) { + ASSERT_EQ("v2", Get(0, "foo")); + ASSERT_EQ("v2", Get(0, "bar")); + ASSERT_EQ("v3", Get(1, "mirko")); + ASSERT_EQ("v5", Get(2, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(0, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(2, "foo")); + if (iter <= 1) { + Reopen(); + } + } + Close(); +} + +TEST(ColumnFamilyTest, IgnoreRecoveredLog) { + std::string backup_logs = dbname_ + "/backup_logs"; + + // delete old files in backup_logs directory + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); + std::vector old_files; + env_->GetChildren(backup_logs, &old_files); + for (auto& file : old_files) { + if (file != "." && file != "..") { + env_->DeleteFile(backup_logs + "/" + file); + } + } + + column_family_options_.merge_operator = + MergeOperators::CreateUInt64AddOperator(); + db_options_.wal_dir = dbname_ + "/logs"; + Destroy(); + Open(); + CreateColumnFamilies({"cf1", "cf2"}); + + // fill up the DB + std::string one, two, three; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + ASSERT_OK(Merge(0, "foo", one)); + ASSERT_OK(Merge(1, "mirko", one)); + ASSERT_OK(Merge(0, "foo", one)); + ASSERT_OK(Merge(2, "bla", one)); + ASSERT_OK(Merge(2, "fodor", one)); + ASSERT_OK(Merge(0, "bar", one)); + ASSERT_OK(Merge(2, "bla", one)); + ASSERT_OK(Merge(1, "mirko", two)); + ASSERT_OK(Merge(1, "franjo", one)); + + // copy the logs to backup + std::vector logs; + env_->GetChildren(db_options_.wal_dir, &logs); + for (auto& log : logs) { + if (log != ".." 
&& log != ".") { + CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); + } + } + + // recover the DB + Close(); + + // 1. check consistency + // 2. copy the logs from backup back to WAL dir. if the recovery happens + // again on the same log files, this should lead to incorrect results + // due to applying merge operator twice + // 3. check consistency + for (int iter = 0; iter < 2; ++iter) { + // assert consistency + Open({"default", "cf1", "cf2"}); + ASSERT_EQ(two, Get(0, "foo")); + ASSERT_EQ(one, Get(0, "bar")); + ASSERT_EQ(three, Get(1, "mirko")); + ASSERT_EQ(one, Get(1, "franjo")); + ASSERT_EQ(one, Get(2, "fodor")); + ASSERT_EQ(two, Get(2, "bla")); + Close(); + + if (iter == 0) { + // copy the logs from backup back to wal dir + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); + } + } + } + } +} + +TEST(ColumnFamilyTest, FlushTest) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + ASSERT_OK(Put(0, "foo", "v1")); + ASSERT_OK(Put(0, "bar", "v2")); + ASSERT_OK(Put(1, "mirko", "v3")); + ASSERT_OK(Put(0, "foo", "v2")); + ASSERT_OK(Put(2, "fodor", "v5")); + for (int i = 0; i < 3; ++i) { + Flush(i); + } + Reopen(); + + for (int iter = 0; iter <= 2; ++iter) { + ASSERT_EQ("v2", Get(0, "foo")); + ASSERT_EQ("v2", Get(0, "bar")); + ASSERT_EQ("v3", Get(1, "mirko")); + ASSERT_EQ("v5", Get(2, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(0, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(2, "foo")); + if (iter <= 1) { + Reopen(); + } + } + Close(); +} + +// Makes sure that obsolete log files get deleted +TEST(ColumnFamilyTest, LogDeletionTest) { + column_family_options_.write_buffer_size = 100000; // 100KB + Open(); + CreateColumnFamilies({"one", "two", "three", "four"}); + // Each bracket is one log file. 
if number is in (), it means + // we don't need it anymore (it's been flushed) + // [] + ASSERT_EQ(CountLiveLogFiles(), 0); + PutRandomData(0, 1, 100); + // [0] + PutRandomData(1, 1, 100); + // [0, 1] + PutRandomData(1, 1000, 100); + WaitForFlush(1); + // [0, (1)] [1] + ASSERT_EQ(CountLiveLogFiles(), 2); + PutRandomData(0, 1, 100); + // [0, (1)] [0, 1] + ASSERT_EQ(CountLiveLogFiles(), 2); + PutRandomData(2, 1, 100); + // [0, (1)] [0, 1, 2] + PutRandomData(2, 1000, 100); + WaitForFlush(2); + // [0, (1)] [0, 1, (2)] [2] + ASSERT_EQ(CountLiveLogFiles(), 3); + PutRandomData(2, 1000, 100); + WaitForFlush(2); + // [0, (1)] [0, 1, (2)] [(2)] [2] + ASSERT_EQ(CountLiveLogFiles(), 4); + PutRandomData(3, 1, 100); + // [0, (1)] [0, 1, (2)] [(2)] [2, 3] + PutRandomData(1, 1, 100); + // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3] + ASSERT_EQ(CountLiveLogFiles(), 4); + PutRandomData(1, 1000, 100); + WaitForFlush(1); + // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1] + ASSERT_EQ(CountLiveLogFiles(), 5); + PutRandomData(0, 1000, 100); + WaitForFlush(0); + // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0] + // delete obsolete logs --> + // [(1), 2, 3] [1, (0)] [0] + ASSERT_EQ(CountLiveLogFiles(), 3); + PutRandomData(0, 1000, 100); + WaitForFlush(0); + // [(1), 2, 3] [1, (0)], [(0)] [0] + ASSERT_EQ(CountLiveLogFiles(), 4); + PutRandomData(1, 1000, 100); + WaitForFlush(1); + // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1] + ASSERT_EQ(CountLiveLogFiles(), 5); + PutRandomData(2, 1000, 100); + WaitForFlush(2); + // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2] + ASSERT_EQ(CountLiveLogFiles(), 6); + PutRandomData(3, 1000, 100); + WaitForFlush(3); + // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3] + // delete obsolete logs --> + // [0, (1)] [1, (2)], [2, (3)] [3] + ASSERT_EQ(CountLiveLogFiles(), 4); + Close(); +} + +// Makes sure that obsolete log files get deleted +TEST(ColumnFamilyTest, DifferentWriteBufferSizes) { + Open(); + CreateColumnFamilies({"one", 
"two", "three"}); + ColumnFamilyOptions default_cf, one, two, three; + // setup options. all column families have max_write_buffer_number setup to 10 + // "default" -> 100KB memtable, start flushing immediatelly + // "one" -> 200KB memtable, start flushing with two immutable memtables + // "two" -> 1MB memtable, start flushing with three immutable memtables + // "three" -> 90KB memtable, start flushing with four immutable memtables + default_cf.write_buffer_size = 100000; + default_cf.max_write_buffer_number = 10; + default_cf.min_write_buffer_number_to_merge = 1; + one.write_buffer_size = 200000; + one.max_write_buffer_number = 10; + one.min_write_buffer_number_to_merge = 2; + two.write_buffer_size = 1000000; + two.max_write_buffer_number = 10; + two.min_write_buffer_number_to_merge = 3; + three.write_buffer_size = 90000; + three.max_write_buffer_number = 10; + three.min_write_buffer_number_to_merge = 4; + + Reopen({default_cf, one, two, three}); + + int micros_wait_for_flush = 10000; + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 1); + PutRandomData(1, 200, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 2); + PutRandomData(2, 1000, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 1, 0}); + ASSERT_EQ(CountLiveLogFiles(), 3); + PutRandomData(2, 1000, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 0}); + ASSERT_EQ(CountLiveLogFiles(), 4); + PutRandomData(3, 90, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 1}); + ASSERT_EQ(CountLiveLogFiles(), 5); + PutRandomData(3, 90, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 2}); + ASSERT_EQ(CountLiveLogFiles(), 6); + 
PutRandomData(3, 90, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 2, 3}); + ASSERT_EQ(CountLiveLogFiles(), 7); + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 1, 2, 3}); + ASSERT_EQ(CountLiveLogFiles(), 8); + PutRandomData(2, 100, 10000); + WaitForFlush(2); + AssertNumberOfImmutableMemtables({0, 1, 0, 3}); + ASSERT_EQ(CountLiveLogFiles(), 9); + PutRandomData(3, 90, 1000); + WaitForFlush(3); + AssertNumberOfImmutableMemtables({0, 1, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 10); + PutRandomData(3, 90, 1000); + env_->SleepForMicroseconds(micros_wait_for_flush); + AssertNumberOfImmutableMemtables({0, 1, 0, 1}); + ASSERT_EQ(CountLiveLogFiles(), 11); + PutRandomData(1, 200, 1000); + WaitForFlush(1); + AssertNumberOfImmutableMemtables({0, 0, 0, 1}); + ASSERT_EQ(CountLiveLogFiles(), 5); + PutRandomData(3, 90*6, 1000); + WaitForFlush(3); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 12); + PutRandomData(0, 100, 1000); + WaitForFlush(0); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 12); + PutRandomData(2, 3*100, 10000); + WaitForFlush(2); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 12); + PutRandomData(1, 2*200, 1000); + WaitForFlush(1); + AssertNumberOfImmutableMemtables({0, 0, 0, 0}); + ASSERT_EQ(CountLiveLogFiles(), 7); + Close(); +} + +TEST(ColumnFamilyTest, DifferentMergeOperators) { + Open(); + CreateColumnFamilies({"first", "second"}); + ColumnFamilyOptions default_cf, first, second; + first.merge_operator = MergeOperators::CreateUInt64AddOperator(); + second.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen({default_cf, first, second}); + + std::string one, two, three; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + + ASSERT_OK(Put(0, "foo", two)); + ASSERT_OK(Put(0, "foo", one)); + ASSERT_TRUE(Merge(0, "foo", 
two).IsNotSupported()); + ASSERT_EQ(Get(0, "foo"), one); + + ASSERT_OK(Put(1, "foo", two)); + ASSERT_OK(Put(1, "foo", one)); + ASSERT_OK(Merge(1, "foo", two)); + ASSERT_EQ(Get(1, "foo"), three); + + ASSERT_OK(Put(2, "foo", two)); + ASSERT_OK(Put(2, "foo", one)); + ASSERT_OK(Merge(2, "foo", two)); + ASSERT_EQ(Get(2, "foo"), one + "," + two); + Close(); +} + +TEST(ColumnFamilyTest, DifferentCompactionStyles) { + Open(); + CreateColumnFamilies({"one", "two"}); + ColumnFamilyOptions default_cf, one, two; + db_options_.max_open_files = 20; // only 10 files in file cache + db_options_.disableDataSync = true; + + default_cf.compaction_style = kCompactionStyleLevel; + default_cf.num_levels = 3; + default_cf.write_buffer_size = 64 << 10; // 64KB + default_cf.target_file_size_base = 30 << 10; + default_cf.filter_policy = nullptr; + default_cf.no_block_cache = true; + default_cf.source_compaction_factor = 100; + default_cf.disable_seek_compaction = false; + + one.compaction_style = kCompactionStyleUniversal; + // trigger compaction if there are >= 4 files + one.level0_file_num_compaction_trigger = 4; + one.write_buffer_size = 100000; + + two.compaction_style = kCompactionStyleLevel; + two.num_levels = 4; + two.max_mem_compaction_level = 0; + two.level0_file_num_compaction_trigger = 3; + two.write_buffer_size = 100000; + + Reopen({default_cf, one, two}); + + // SETUP column family "default" - test read compaction + ASSERT_EQ("", FilesPerLevel(0)); + PutRandomData(0, 1, 4096); + ASSERT_OK(Flush(0)); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + // write 8MB + PutRandomData(0, 2000, 4096); + ASSERT_OK(Flush(0)); + // clear levels 0 and 1 + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]); + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); + // write some new keys into level 0 and 1 + PutRandomData(0, 1024, 512); + ASSERT_OK(Flush(0)); + WaitForCompaction(); + 
PutRandomData(0, 10, 512); + ASSERT_OK(Flush(0)); + // remember number of files in each level + int l1 = NumTableFilesAtLevel(0, 0); + int l2 = NumTableFilesAtLevel(1, 0); + int l3 = NumTableFilesAtLevel(2, 0); + ASSERT_NE(l1, 0); + ASSERT_NE(l2, 0); + ASSERT_NE(l3, 0); + + // SETUP column family "one" -- universal style + for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) { + PutRandomData(1, 11, 10000); + WaitForFlush(1); + ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1)); + } + + // SETUP column family "two" -- level style with 4 levels + for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) { + PutRandomData(2, 15, 10000); + WaitForFlush(2); + ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2)); + } + + // TRIGGER compaction "default" + // read a bunch of times, trigger read compaction + for (int i = 0; i < 200000; ++i) { + Get(0, std::to_string(i)); + } + + // TRIGGER compaction "one" + PutRandomData(1, 12, 10000); + + // TRIGGER compaction "two" + PutRandomData(2, 10, 10000); + + // WAIT for compactions + WaitForCompaction(); + + // VERIFY compaction "default" + // verify that the number of files have decreased + // in some level, indicating that there was a compaction + ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 || + NumTableFilesAtLevel(1, 0) < l2 || + NumTableFilesAtLevel(2, 0) < l3); + + // VERIFY compaction "one" + ASSERT_EQ("1", FilesPerLevel(1)); + + // VERIFY compaction "two" + ASSERT_EQ("0,1", FilesPerLevel(2)); + CompactAll(2); + ASSERT_EQ("0,1", FilesPerLevel(2)); + + Close(); +} + +namespace { +std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; +} +} // anonymous namespace + +TEST(ColumnFamilyTest, NewIteratorsTest) { + // iter == 0 -- no tailing + // iter == 2 -- tailing + for (int iter = 0; iter < 2; ++iter) { + Open(); + 
CreateColumnFamiliesAndReopen({"one", "two"}); + ASSERT_OK(Put(0, "a", "b")); + ASSERT_OK(Put(1, "b", "a")); + ASSERT_OK(Put(2, "c", "m")); + ASSERT_OK(Put(2, "v", "t")); + std::vector iterators; + ReadOptions options; + options.tailing = (iter == 1); + ASSERT_OK(db_->NewIterators(options, handles_, &iterators)); + + for (auto it : iterators) { + it->SeekToFirst(); + } + ASSERT_EQ(IterStatus(iterators[0]), "a->b"); + ASSERT_EQ(IterStatus(iterators[1]), "b->a"); + ASSERT_EQ(IterStatus(iterators[2]), "c->m"); + + ASSERT_OK(Put(1, "x", "x")); + + for (auto it : iterators) { + it->Next(); + } + + ASSERT_EQ(IterStatus(iterators[0]), "(invalid)"); + if (iter == 0) { + // no tailing + ASSERT_EQ(IterStatus(iterators[1]), "(invalid)"); + } else { + // tailing + ASSERT_EQ(IterStatus(iterators[1]), "x->x"); + } + ASSERT_EQ(IterStatus(iterators[2]), "v->t"); + + for (auto it : iterators) { + delete it; + } + Destroy(); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/compaction.cc b/db/compaction.cc index 8bb4f9c61..bafb5b4ea 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/compaction.h" +#include "db/column_family.h" namespace rocksdb { @@ -29,6 +30,7 @@ Compaction::Compaction(Version* input_version, int level, int out_level, max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), input_version_(input_version), number_levels_(input_version_->NumberLevels()), + cfd_(input_version_->cfd_), seek_compaction_(seek_compaction), enable_compression_(enable_compression), grandparent_index_(0), @@ -42,8 +44,10 @@ Compaction::Compaction(Version* input_version, int level, int out_level, is_manual_compaction_(false), level_ptrs_(std::vector(number_levels_)) { + cfd_->Ref(); input_version_->Ref(); edit_ = new VersionEdit(); + edit_->SetColumnFamily(cfd_->GetID()); for (int i = 0; i < number_levels_; i++) { level_ptrs_[i] = 0; } @@ -54,6 +58,11 @@ Compaction::~Compaction() { if (input_version_ != nullptr) { input_version_->Unref(); } + if (cfd_ != nullptr) { + if (cfd_->Unref()) { + delete cfd_; + } + } } bool Compaction::IsTrivialMove() const { @@ -77,12 +86,11 @@ void Compaction::AddInputDeletions(VersionEdit* edit) { } bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { return bottommost_level_; } // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + const Comparator* user_cmp = cfd_->user_comparator(); for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { const std::vector& files = input_version_->files_[lvl]; for (; level_ptrs_[lvl] < files.size(); ) { @@ -103,7 +111,7 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) { bool Compaction::ShouldStopBefore(const Slice& internal_key) { // Scan to find earliest grandparent file that contains key. 
- const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + const InternalKeyComparator* icmp = &cfd_->internal_comparator(); while (grandparent_index_ < grandparents_.size() && icmp->Compare(internal_key, grandparents_[grandparent_index_]->largest.Encode()) > 0) { @@ -141,8 +149,7 @@ void Compaction::MarkFilesBeingCompacted(bool value) { // Is this compaction producing files at the bottommost level? void Compaction::SetupBottomMostLevel(bool isManual) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { // If universal compaction style is used and manual // compaction is occuring, then we are guaranteed that // all files will be picked in a single compaction @@ -155,8 +162,7 @@ void Compaction::SetupBottomMostLevel(bool isManual) { return; } bottommost_level_ = true; - int num_levels = input_version_->vset_->NumberLevels(); - for (int i = output_level() + 1; i < num_levels; i++) { + for (int i = output_level() + 1; i < number_levels_; i++) { if (input_version_->NumLevelFiles(i) > 0) { bottommost_level_ = false; break; @@ -169,6 +175,16 @@ void Compaction::ReleaseInputs() { input_version_->Unref(); input_version_ = nullptr; } + if (cfd_ != nullptr) { + if (cfd_->Unref()) { + delete cfd_; + } + cfd_ = nullptr; + } +} + +void Compaction::ReleaseCompactionFiles(Status status) { + cfd_->compaction_picker()->ReleaseCompactionFiles(this, status); } void Compaction::ResetNextCompactionIndex() { diff --git a/db/compaction.h b/db/compaction.h index ead1d87a2..8fd95f909 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -13,6 +13,7 @@ namespace rocksdb { class Version; +class ColumnFamilyData; // A Compaction encapsulates information about a compaction. 
class Compaction { @@ -36,6 +37,8 @@ class Compaction { // Returns input version of the compaction Version* input_version() const { return input_version_; } + ColumnFamilyData* column_family_data() const { return cfd_; } + // Return the ith input file at "level()+which" ("which" must be 0 or 1). FileMetaData* input(int which, int i) const { return inputs_[which][i]; } @@ -67,6 +70,10 @@ class Compaction { // is successful. void ReleaseInputs(); + // Clear all files to indicate that they are not being compacted + // Delete this compaction from the list of running compactions. + void ReleaseCompactionFiles(Status status); + void Summary(char* output, int len); // Return the score that was used to pick this compaction run. @@ -97,6 +104,7 @@ class Compaction { Version* input_version_; VersionEdit* edit_; int number_levels_; + ColumnFamilyData* cfd_; bool seek_compaction_; bool enable_compression_; diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index ccdbce72b..b7ec66d96 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -277,14 +277,10 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) { Log(options_->info_log, "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" "\n", - (unsigned long)level, - (unsigned long)(c->inputs_[0].size()), - (unsigned long)(c->inputs_[1].size()), - (unsigned long)inputs0_size, - (unsigned long)inputs1_size, - (unsigned long)(expanded0.size()), - (unsigned long)(expanded1.size()), - (unsigned long)expanded0_size, + (unsigned long)level, (unsigned long)(c->inputs_[0].size()), + (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size, + (unsigned long)inputs1_size, (unsigned long)(expanded0.size()), + (unsigned long)(expanded1.size()), (unsigned long)expanded0_size, (unsigned long)inputs1_size); smallest = new_start; largest = new_limit; @@ -587,7 +583,7 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, options_->level0_file_num_compaction_trigger; if 
((c = PickCompactionUniversalReadAmp( version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { - Log(options_->info_log, "Universal: compacting for file num\n"); + LogToBuffer(log_buffer, "Universal: compacting for file num\n"); } } } @@ -653,7 +649,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( FileMetaData* f = nullptr; bool done = false; int start_index = 0; - unsigned int candidate_count; + unsigned int candidate_count = 0; assert(file_by_time.size() == version->files_[level].size()); unsigned int max_files_to_compact = std::min(max_merge_width, diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 4793b1ba4..6527ef967 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -12,6 +12,7 @@ #include "db/compaction.h" #include "rocksdb/status.h" #include "rocksdb/options.h" +#include "rocksdb/env.h" #include #include @@ -118,6 +119,7 @@ class CompactionPicker { std::unique_ptr level_max_bytes_; const Options* const options_; + private: int num_levels_; diff --git a/db/db_bench.cc b/db/db_bench.cc index 14d886f5c..0314a3ea9 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -42,7 +42,6 @@ DEFINE_string(benchmarks, - "fillseq," "fillsync," "fillrandom," @@ -53,6 +52,7 @@ DEFINE_string(benchmarks, "readreverse," "compact," "readrandom," + "multireadrandom," "readseq," "readtocache," "readreverse," @@ -64,8 +64,7 @@ DEFINE_string(benchmarks, "crc32c," "compress," "uncompress," - "acquireload," - "fillfromstdin,", + "acquireload,", "Comma-separated list of operations to run in the specified order" "Actual benchmarks:\n" @@ -129,16 +128,8 @@ DEFINE_int64(merge_keys, -1, DEFINE_int64(reads, -1, "Number of read operations to do. 
" "If negative, do FLAGS_num reads."); -DEFINE_int64(read_range, 1, "When ==1 reads use ::Get, when >1 reads use" - " an iterator"); - -DEFINE_bool(use_prefix_blooms, false, "Whether to place prefixes in blooms"); - DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); -DEFINE_bool(use_prefix_api, false, "Whether to set ReadOptions.prefix for" - " prefixscanrandom. If true, use_prefix_blooms must also be true."); - DEFINE_int64(seed, 0, "Seed base for random number generators. " "When 0 it is deterministic."); @@ -278,12 +269,6 @@ DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); -DEFINE_bool(use_snapshot, false, "If true, create a snapshot per query when" - " randomread benchmark is used"); - -DEFINE_bool(get_approx, false, "If true, call GetApproximateSizes per query" - " when read_range is > 1 and randomread benchmark is used"); - DEFINE_int32(num_levels, 7, "The total number of levels"); DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); @@ -461,20 +446,9 @@ DEFINE_string(compaction_fadvice, "NORMAL", static auto FLAGS_compaction_fadvice_e = rocksdb::Options().access_hint_on_compaction_start; -DEFINE_bool(use_multiget, false, - "Use multiget to access a series of keys instead of get"); - DEFINE_bool(use_tailing_iterator, false, "Use tailing iterator to access a series of keys instead of get"); -DEFINE_int64(keys_per_multiget, 90, "If use_multiget is true, determines number" - " of keys to group per call Arbitrary default is good because it" - " agrees with readwritepercent"); - -// TODO: Apply this flag to generic Get calls too. 
Currently only with Multiget -DEFINE_bool(warn_missing_keys, true, "Print a message to user when a key is" - " missing in a Get/MultiGet call"); - DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex, "Use adaptive mutex"); @@ -798,7 +772,7 @@ class Duration { start_at_ = FLAGS_env->NowMicros(); } - bool Done(int increment) { + bool Done(int64_t increment) { if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops ops_ += increment; @@ -834,13 +808,12 @@ class Benchmark { int key_size_; int prefix_size_; int64_t keys_per_prefix_; - int entries_per_batch_; + int64_t entries_per_batch_; WriteOptions write_options_; int64_t reads_; int64_t writes_; int64_t readwrites_; int64_t merge_keys_; - int heap_counter_; void PrintHeader() { PrintEnvironment(); fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size); @@ -1037,8 +1010,7 @@ class Benchmark { readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) ), - merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys), - heap_counter_(0) { + merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) { if (FLAGS_prefix_size > FLAGS_key_size) { fprintf(stderr, "prefix size is larger than key size"); exit(1); @@ -1062,6 +1034,10 @@ class Benchmark { delete prefix_extractor_; } + Slice AllocateKey() { + return Slice(new char[key_size_], key_size_); + } + // Generate key according to the given specification and random number. // The resulting key will have the following format (if keys_per_prefix_ // is positive), extra trailing bytes are either cut off or paddd with '0'. 
@@ -1074,10 +1050,8 @@ class Benchmark { // ---------------------------- // | key 00000 | // ---------------------------- - std::string GenerateKeyFromInt(uint64_t v, int64_t num_keys) { - std::string key; - key.resize(key_size_); - char* start = &(key[0]); + void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) { + char* start = const_cast(key->data()); char* pos = start; if (keys_per_prefix_ > 0) { int64_t num_prefix = num_keys / keys_per_prefix_; @@ -1109,8 +1083,6 @@ class Benchmark { if (key_size_ > pos - start) { memset(pos, '0', key_size_ - (pos - start)); } - - return key; } void Run() { @@ -1155,15 +1127,12 @@ class Benchmark { } else if (name == Slice("fillrandom")) { fresh_db = true; method = &Benchmark::WriteRandom; - } else if (name == Slice("fillfromstdin")) { - fresh_db = true; - method = &Benchmark::WriteFromStdin; } else if (name == Slice("filluniquerandom")) { fresh_db = true; if (num_threads > 1) { fprintf(stderr, "filluniquerandom multithreaded not supported" - " set --threads=1"); - exit(1); + ", use 1 thread"); + num_threads = 1; } method = &Benchmark::WriteUniqueRandom; } else if (name == Slice("overwrite")) { @@ -1189,19 +1158,18 @@ class Benchmark { method = &Benchmark::ReadReverse; } else if (name == Slice("readrandom")) { method = &Benchmark::ReadRandom; + } else if (name == Slice("multireadrandom")) { + method = &Benchmark::MultiReadRandom; } else if (name == Slice("readmissing")) { - method = &Benchmark::ReadMissing; + ++key_size_; + method = &Benchmark::ReadRandom; } else if (name == Slice("newiterator")) { method = &Benchmark::IteratorCreation; } else if (name == Slice("seekrandom")) { method = &Benchmark::SeekRandom; - } else if (name == Slice("readhot")) { - method = &Benchmark::ReadHot; } else if (name == Slice("readrandomsmall")) { reads_ /= 1000; method = &Benchmark::ReadRandom; - } else if (name == Slice("prefixscanrandom")) { - method = &Benchmark::PrefixScanRandom; } else if (name == Slice("deleteseq")) { method = 
&Benchmark::DeleteSeq; } else if (name == Slice("deleterandom")) { @@ -1215,10 +1183,9 @@ class Benchmark { if (FLAGS_merge_operator.empty()) { fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", name.ToString().c_str()); - method = nullptr; - } else { - method = &Benchmark::ReadRandomMergeRandom; + exit(1); } + method = &Benchmark::ReadRandomMergeRandom; } else if (name == Slice("updaterandom")) { method = &Benchmark::UpdateRandom; } else if (name == Slice("appendrandom")) { @@ -1227,10 +1194,9 @@ class Benchmark { if (FLAGS_merge_operator.empty()) { fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", name.ToString().c_str()); - method = nullptr; - } else { - method = &Benchmark::MergeRandom; + exit(1); } + method = &Benchmark::MergeRandom; } else if (name == Slice("randomwithverify")) { method = &Benchmark::RandomWithVerify; } else if (name == Slice("compact")) { @@ -1243,8 +1209,6 @@ class Benchmark { method = &Benchmark::Compress; } else if (name == Slice("uncompress")) { method = &Benchmark::Uncompress; - } else if (name == Slice("heapprofile")) { - HeapProfile(); } else if (name == Slice("stats")) { PrintStats("rocksdb.stats"); } else if (name == Slice("levelstats")) { @@ -1254,6 +1218,7 @@ class Benchmark { } else { if (name != Slice()) { // No error message for empty name fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); + exit(1); } } @@ -1540,7 +1505,7 @@ class Benchmark { options.compaction_style = FLAGS_compaction_style_e; options.block_size = FLAGS_block_size; options.filter_policy = filter_policy_; - if (FLAGS_use_plain_table || FLAGS_use_prefix_blooms) { + if (FLAGS_use_plain_table) { options.prefix_extractor.reset( NewFixedPrefixTransform(FLAGS_prefix_size)); } @@ -1715,54 +1680,6 @@ class Benchmark { DoWrite(thread, UNIQUE_RANDOM); } - void writeOrFail(WriteBatch& batch) { - Status s = db_->Write(write_options_, &batch); - if (!s.ok()) { - fprintf(stderr, "put error: %s\n", s.ToString().c_str()); - 
exit(1); - } - } - - void WriteFromStdin(ThreadState* thread) { - size_t count = 0; - WriteBatch batch; - const size_t bufferLen = 32 << 20; - unique_ptr line = unique_ptr(new char[bufferLen]); - char* linep = line.get(); - const int batchSize = 100 << 10; - const char columnSeparator = '\t'; - const char lineSeparator = '\n'; - - while (fgets(linep, bufferLen, stdin) != nullptr) { - ++count; - char* tab = std::find(linep, linep + bufferLen, columnSeparator); - if (tab == linep + bufferLen) { - fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count); - continue; - } - Slice key(linep, tab - linep); - tab++; - char* endLine = std::find(tab, linep + bufferLen, lineSeparator); - if (endLine == linep + bufferLen) { - fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count); - continue; - } - Slice value(tab, endLine - tab); - thread->stats.FinishedSingleOp(db_); - thread->stats.AddBytes(endLine - linep - 1); - - if (batch.Count() < batchSize) { - batch.Put(key, value); - continue; - } - writeOrFail(batch); - batch.Clear(); - } - if (batch.Count() > 0) { - writeOrFail(batch); - } - } - void DoWrite(ThreadState* thread, WriteMode write_mode) { const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0; const int64_t num_ops = writes_ == 0 ? 
num_ : writes_; @@ -1783,10 +1700,13 @@ class Benchmark { WriteBatch batch; Status s; int64_t bytes = 0; - int i = 0; + int64_t i = 0; + + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); while (!duration.Done(entries_per_batch_)) { batch.Clear(); - for (int j = 0; j < entries_per_batch_; j++) { + for (int64_t j = 0; j < entries_per_batch_; j++) { int64_t k = 0; switch(write_mode) { case SEQUENTIAL: @@ -1825,9 +1745,9 @@ class Benchmark { break; } }; - std::string key = GenerateKeyFromInt(k, FLAGS_num); + GenerateKeyFromInt(k, FLAGS_num, &key); batch.Put(key, gen.Generate(value_size_)); - bytes += value_size_ + key.size(); + bytes += value_size_ + key_size_; thread->stats.FinishedSingleOp(db_); } s = db_->Write(write_options_, &batch); @@ -1866,135 +1786,22 @@ class Benchmark { thread->stats.AddBytes(bytes); } - // Calls MultiGet over a list of keys from a random distribution. - // Returns the total number of keys found. - long MultiGetRandom(ReadOptions& options, int num_keys, - Random64* rand, int64_t range, const char* suffix) { - assert(num_keys > 0); - std::vector keys(num_keys); - std::vector values(num_keys); - std::vector gen_keys(num_keys); - - int i; - int64_t k; - - // Fill the keys vector - for(i=0; iNext() % range; - gen_keys[i] = GenerateKeyFromInt(k, range) + suffix; - keys[i] = gen_keys[i]; - } - - if (FLAGS_use_snapshot) { - options.snapshot = db_->GetSnapshot(); - } - - // Apply the operation - std::vector statuses = db_->MultiGet(options, keys, &values); - assert((long)statuses.size() == num_keys); - assert((long)keys.size() == num_keys); // Should always be the case. 
- assert((long)values.size() == num_keys); - - if (FLAGS_use_snapshot) { - db_->ReleaseSnapshot(options.snapshot); - options.snapshot = nullptr; - } - - // Count number found - long found = 0; - for(i=0; i key_guard(key.data()); + std::string value; - // Recalculate number of keys per group, and call MultiGet until done - long num_keys; - while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) { - read += num_keys; - found += - MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, ""); - thread->stats.FinishedSingleOp(db_); - keys_left -= num_keys; + Duration duration(FLAGS_duration, reads_); + while (!duration.Done(1)) { + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + read++; + if (db_->Get(options, key, &value).ok()) { + found++; } - } else if (FLAGS_use_tailing_iterator) { // use tailing iterator for gets - options.tailing = true; - Iterator* iter = db_->NewIterator(options); - while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); - - iter->Seek(key); - read++; - if (iter->Valid() && iter->key().compare(Slice(key)) == 0) { - found++; - } - - thread->stats.FinishedSingleOp(db_); - } - delete iter; - } else { // Regular case. 
Do one "get" at a time Get - options.tailing = true; - options.prefix_seek = (FLAGS_prefix_size == 0); - Iterator* iter = db_->NewIterator(options); - std::string value; - while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); - if (FLAGS_use_snapshot) { - options.snapshot = db_->GetSnapshot(); - } - - if (FLAGS_read_range < 2) { - read++; - if (db_->Get(options, key, &value).ok()) { - found++; - } - } else { - int count = 1; - - if (FLAGS_get_approx) { - std::string key2 = - GenerateKeyFromInt(k + static_cast(FLAGS_read_range), - FLAGS_num + FLAGS_read_range); - Range range(key, key2); - uint64_t sizes; - db_->GetApproximateSizes(&range, 1, &sizes); - } - - read += FLAGS_read_range; - for (iter->Seek(key); - iter->Valid() && count <= FLAGS_read_range; - ++count, iter->Next()) { - found++; - } - } - - if (FLAGS_use_snapshot) { - db_->ReleaseSnapshot(options.snapshot); - options.snapshot = nullptr; - } - - thread->stats.FinishedSingleOp(db_); - } - - delete iter; + thread->stats.FinishedSingleOp(db_); } char msg[100]; @@ -2008,113 +1815,41 @@ class Benchmark { } } - void PrefixScanRandom(ThreadState* thread) { - if (FLAGS_use_prefix_api) { - assert(FLAGS_use_prefix_blooms); - assert(FLAGS_bloom_bits >= 1); + // Calls MultiGet over a list of keys from a random distribution. + // Returns the total number of keys found. 
+ void MultiReadRandom(ThreadState* thread) { + int64_t read = 0; + int64_t found = 0; + ReadOptions options(FLAGS_verify_checksum, true); + std::vector keys(entries_per_batch_); + std::vector values(entries_per_batch_); + while (keys.size() < entries_per_batch_) { + keys.push_back(AllocateKey()); } - ReadOptions options(FLAGS_verify_checksum, true); Duration duration(FLAGS_duration, reads_); - - int64_t found = 0; - while (!duration.Done(1)) { - std::string value; - const int k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); - Slice skey(key); - Slice prefix = prefix_extractor_->Transform(skey); - options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr; - - Iterator* iter = db_->NewIterator(options); - for (iter->Seek(skey); - iter->Valid() && iter->key().starts_with(prefix); - iter->Next()) { - found++; + for (int64_t i = 0; i < entries_per_batch_; ++i) { + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, + FLAGS_num, &keys[i]); } - delete iter; + std::vector statuses = db_->MultiGet(options, keys, &values); + assert(statuses.size() == entries_per_batch_); - thread->stats.FinishedSingleOp(db_); - } - - char msg[100]; - snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", - found, reads_); - thread->stats.AddMessage(msg); - } - - void ReadMissing(ThreadState* thread) { - FLAGS_warn_missing_keys = false; // Never warn about missing keys - - Duration duration(FLAGS_duration, reads_); - ReadOptions options(FLAGS_verify_checksum, true); - - if (FLAGS_use_multiget) { - const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group - long keys_left = reads_; - - // Recalculate number of keys per group, and call MultiGet until done - long num_keys; - long found; - while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) { - found = - MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, "."); - - // We should not find any key since the key we try to get has a - // different suffix - if 
(found) { - assert(false); - } - - thread->stats.FinishedSingleOp(db_); - keys_left -= num_keys; - } - } else { // Regular case (not MultiGet) - std::string value; - Status s; - while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num) + "."; - s = db_->Get(options, key, &value); - assert(!s.ok() && s.IsNotFound()); - thread->stats.FinishedSingleOp(db_); - } - } - } - - void ReadHot(ThreadState* thread) { - Duration duration(FLAGS_duration, reads_); - ReadOptions options(FLAGS_verify_checksum, true); - const int64_t range = (FLAGS_num + 99) / 100; - int64_t found = 0; - - if (FLAGS_use_multiget) { - const int64_t kpg = FLAGS_keys_per_multiget; // keys per multiget group - int64_t keys_left = reads_; - - // Recalculate number of keys per group, and call MultiGet until done - long num_keys; - while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) { - found += MultiGetRandom(options, num_keys, &thread->rand, range, ""); - thread->stats.FinishedSingleOp(db_); - keys_left -= num_keys; - } - } else { - std::string value; - while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % range; - std::string key = GenerateKeyFromInt(k, range); - if (db_->Get(options, key, &value).ok()) { + read += entries_per_batch_; + for (int64_t i = 0; i < entries_per_batch_; ++i) { + if (statuses[i].ok()) { ++found; } - thread->stats.FinishedSingleOp(db_); } } + for (auto& k : keys) { + delete k.data(); + } char msg[100]; snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", - found, reads_); + found, read); thread->stats.AddMessage(msg); } @@ -2129,44 +1864,53 @@ class Benchmark { } void SeekRandom(ThreadState* thread) { - Duration duration(FLAGS_duration, reads_); - ReadOptions options(FLAGS_verify_checksum, true); - std::string value; + int64_t read = 0; int64_t found = 0; + ReadOptions options(FLAGS_verify_checksum, true); + options.tailing = FLAGS_use_tailing_iterator; + 
auto* iter = db_->NewIterator(options); + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + + Duration duration(FLAGS_duration, reads_); while (!duration.Done(1)) { - Iterator* iter = db_->NewIterator(options); - const int64_t k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); iter->Seek(key); - if (iter->Valid() && iter->key() == Slice(key)) found++; - delete iter; + read++; + if (iter->Valid() && iter->key().compare(key) == 0) { + found++; + } thread->stats.FinishedSingleOp(db_); } + delete iter; + char msg[100]; snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", - found, num_); + found, read); thread->stats.AddMessage(msg); } void DoDelete(ThreadState* thread, bool seq) { WriteBatch batch; - Status s; Duration duration(seq ? 0 : FLAGS_duration, num_); - long i = 0; + int64_t i = 0; + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + while (!duration.Done(entries_per_batch_)) { batch.Clear(); - for (int j = 0; j < entries_per_batch_; j++) { - const int64_t k = seq ? i+j : (thread->rand.Next() % FLAGS_num); - std::string key = GenerateKeyFromInt(k, FLAGS_num); + for (int64_t j = 0; j < entries_per_batch_; ++j) { + const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num); + GenerateKeyFromInt(k, FLAGS_num, &key); batch.Delete(key); thread->stats.FinishedSingleOp(db_); } - s = db_->Write(write_options_, &batch); + auto s = db_->Write(write_options_, &batch); if (!s.ok()) { fprintf(stderr, "del error: %s\n", s.ToString().c_str()); exit(1); } - ++i; + i += entries_per_batch_; } } @@ -2197,6 +1941,9 @@ class Benchmark { // Don't merge stats from this thread with the readers. 
thread->stats.SetExcludeFromMerge(); + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + while (true) { { MutexLock l(&thread->shared->mu); @@ -2206,8 +1953,7 @@ class Benchmark { } } - const int64_t k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); if (!s.ok()) { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); @@ -2235,7 +1981,7 @@ class Benchmark { // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V) // in DB atomically i.e in a single batch. Also refer GetMany. Status PutMany(const WriteOptions& writeoptions, - const Slice& key, const Slice& value) { + const Slice& key, const Slice& value) { std::string suffixes[3] = {"2", "1", "0"}; std::string keys[3]; @@ -2273,7 +2019,7 @@ class Benchmark { // in the same snapshot, and verifies that all the values are identical. // ASSUMES that PutMany was used to put (K, V) into the DB. 
Status GetMany(const ReadOptions& readoptions, - const Slice& key, std::string* value) { + const Slice& key, std::string* value) { std::string suffixes[3] = {"0", "1", "2"}; std::string keys[3]; Slice key_slices[3]; @@ -2328,16 +2074,19 @@ class Benchmark { int64_t puts_done = 0; int64_t deletes_done = 0; + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + // the number of iterations is the larger of read_ or write_ for (int64_t i = 0; i < readwrites_; i++) { - const int64_t k = thread->rand.Next() % (FLAGS_numdistinct); - std::string key = GenerateKeyFromInt(k, FLAGS_numdistinct); if (get_weight == 0 && put_weight == 0 && delete_weight == 0) { // one batch completed, reinitialize for next batch get_weight = FLAGS_readwritepercent; delete_weight = FLAGS_deletepercent; put_weight = 100 - get_weight - delete_weight; } + GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct, + FLAGS_numdistinct, &key); if (get_weight > 0) { // do all the gets first Status s = GetMany(options, key, &value); @@ -2383,12 +2132,6 @@ class Benchmark { // This is different from ReadWhileWriting because it does not use // an extra thread. 
void ReadRandomWriteRandom(ThreadState* thread) { - if (FLAGS_use_multiget){ - // Separate function for multiget (for ease of reading) - ReadRandomWriteRandomMultiGet(thread); - return; - } - ReadOptions options(FLAGS_verify_checksum, true); RandomGenerator gen; std::string value; @@ -2399,28 +2142,18 @@ class Benchmark { int64_t writes_done = 0; Duration duration(FLAGS_duration, readwrites_); + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + // the number of iterations is the larger of read_ or write_ while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); if (get_weight == 0 && put_weight == 0) { // one batch completed, reinitialize for next batch get_weight = FLAGS_readwritepercent; put_weight = 100 - get_weight; } if (get_weight > 0) { - - if (FLAGS_use_snapshot) { - options.snapshot = db_->GetSnapshot(); - } - - if (FLAGS_get_approx) { - std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1); - Range range(key, key2); - uint64_t sizes; - db_->GetApproximateSizes(&range, 1, &sizes); - } - // do all the gets first Status s = db_->Get(options, key, &value); if (!s.ok() && !s.IsNotFound()) { @@ -2430,14 +2163,8 @@ class Benchmark { } else if (!s.IsNotFound()) { found++; } - get_weight--; reads_done++; - - if (FLAGS_use_snapshot) { - db_->ReleaseSnapshot(options.snapshot); - } - } else if (put_weight > 0) { // then do all the corresponding number of puts // for all the gets we have done earlier @@ -2458,82 +2185,6 @@ class Benchmark { thread->stats.AddMessage(msg); } - // ReadRandomWriteRandom (with multiget) - // Does FLAGS_keys_per_multiget reads (per multiget), followed by some puts. - // FLAGS_readwritepercent will specify the ratio of gets to puts. 
- // e.g.: If FLAGS_keys_per_multiget == 100 and FLAGS_readwritepercent == 75 - // Then each block will do 100 multigets and 33 puts - // So there are 133 operations in-total: 100 of them (75%) are gets, and 33 - // of them (25%) are puts. - void ReadRandomWriteRandomMultiGet(ThreadState* thread) { - ReadOptions options(FLAGS_verify_checksum, true); - RandomGenerator gen; - - // For multiget - const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group - - long keys_left = readwrites_; // number of keys still left to read - long num_keys; // number of keys to read in current group - long num_put_keys; // number of keys to put in current group - - int64_t found = 0; - int64_t reads_done = 0; - int64_t writes_done = 0; - int64_t multigets_done = 0; - - // the number of iterations is the larger of read_ or write_ - Duration duration(FLAGS_duration, readwrites_); - while(true) { - // Read num_keys keys, then write num_put_keys keys. - // The ratio of num_keys to num_put_keys is always FLAGS_readwritepercent - // And num_keys is set to be FLAGS_keys_per_multiget (kpg) - // num_put_keys is calculated accordingly (to maintain the ratio) - // Note: On the final iteration, num_keys and num_put_keys will be smaller - num_keys = std::min(keys_left*(FLAGS_readwritepercent + 99)/100, kpg); - num_put_keys = num_keys * (100-FLAGS_readwritepercent) - / FLAGS_readwritepercent; - - // This will break the loop when duration is complete - if (duration.Done(num_keys + num_put_keys)) { - break; - } - - // A quick check to make sure our formula doesn't break on edge cases - assert(num_keys >= 1); - assert(num_keys + num_put_keys <= keys_left); - - // Apply the MultiGet operations - found += MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, ""); - ++multigets_done; - reads_done+=num_keys; - thread->stats.FinishedSingleOp(db_); - - // Now do the puts - int i; - int64_t k; - for(i=0; irand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); - Status 
s = db_->Put(write_options_, key, - gen.Generate(value_size_)); - if (!s.ok()) { - fprintf(stderr, "put error: %s\n", s.ToString().c_str()); - exit(1); - } - writes_done++; - thread->stats.FinishedSingleOp(db_); - } - - keys_left -= (num_keys + num_put_keys); - } - char msg[100]; - snprintf(msg, sizeof(msg), - "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64 \ - " multiget_ops:%" PRIu64 " found:%" PRIu64 ")", - reads_done, writes_done, readwrites_, multigets_done, found); - thread->stats.AddMessage(msg); - } - // // Read-modify-write for random keys void UpdateRandom(ThreadState* thread) { @@ -2543,30 +2194,16 @@ class Benchmark { int64_t found = 0; Duration duration(FLAGS_duration, readwrites_); + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); // the number of iterations is the larger of read_ or write_ while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); - - if (FLAGS_use_snapshot) { - options.snapshot = db_->GetSnapshot(); - } - - if (FLAGS_get_approx) { - std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1); - Range range(key, key2); - uint64_t sizes; - db_->GetApproximateSizes(&range, 1, &sizes); - } + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); if (db_->Get(options, key, &value).ok()) { found++; } - if (FLAGS_use_snapshot) { - db_->ReleaseSnapshot(options.snapshot); - } - Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); if (!s.ok()) { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); @@ -2589,22 +2226,12 @@ class Benchmark { std::string value; int64_t found = 0; + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); // The number of iterations is the larger of read_ or write_ Duration duration(FLAGS_duration, readwrites_); while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % FLAGS_num; - std::string key = GenerateKeyFromInt(k, FLAGS_num); - - if 
(FLAGS_use_snapshot) { - options.snapshot = db_->GetSnapshot(); - } - - if (FLAGS_get_approx) { - std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1); - Range range(key, key2); - uint64_t sizes; - db_->GetApproximateSizes(&range, 1, &sizes); - } + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); // Get the existing value if (db_->Get(options, key, &value).ok()) { @@ -2614,10 +2241,6 @@ class Benchmark { value.clear(); } - if (FLAGS_use_snapshot) { - db_->ReleaseSnapshot(options.snapshot); - } - // Update the value (by appending data) Slice operand = gen.Generate(value_size_); if (value.size() > 0) { @@ -2634,6 +2257,7 @@ class Benchmark { } thread->stats.FinishedSingleOp(db_); } + char msg[100]; snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found); @@ -2653,11 +2277,12 @@ class Benchmark { void MergeRandom(ThreadState* thread) { RandomGenerator gen; + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); // The number of iterations is the larger of read_ or write_ Duration duration(FLAGS_duration, readwrites_); while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % merge_keys_; - std::string key = GenerateKeyFromInt(k, merge_keys_); + GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key); Status s = db_->Merge(write_options_, key, gen.Generate(value_size_)); @@ -2690,12 +2315,12 @@ class Benchmark { int64_t num_merges = 0; size_t max_length = 0; + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); // the number of iterations is the larger of read_ or write_ Duration duration(FLAGS_duration, readwrites_); - while (!duration.Done(1)) { - const int64_t k = thread->rand.Next() % merge_keys_; - std::string key = GenerateKeyFromInt(k, merge_keys_); + GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key); bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent; @@ -2727,6 +2352,7 @@ class Benchmark { 
thread->stats.FinishedSingleOp(db_); } + char msg[100]; snprintf(msg, sizeof(msg), "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \ @@ -2735,7 +2361,6 @@ class Benchmark { thread->stats.AddMessage(msg); } - void Compact(ThreadState* thread) { db_->CompactRange(nullptr, nullptr); } @@ -2747,28 +2372,6 @@ class Benchmark { } fprintf(stdout, "\n%s\n", stats.c_str()); } - - static void WriteToFile(void* arg, const char* buf, int n) { - reinterpret_cast(arg)->Append(Slice(buf, n)); - } - - void HeapProfile() { - char fname[100]; - EnvOptions soptions; - snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db.c_str(), - ++heap_counter_); - unique_ptr file; - Status s = FLAGS_env->NewWritableFile(fname, &file, soptions); - if (!s.ok()) { - fprintf(stderr, "%s\n", s.ToString().c_str()); - return; - } - bool ok = port::GetHeapProfile(WriteToFile, file.get()); - if (!ok) { - fprintf(stderr, "heap profiling not supported\n"); - FLAGS_env->DeleteFile(fname); - } - } }; } // namespace rocksdb diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 04d6d0e17..61a818465 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +#define __STDC_FORMAT_MACROS +#include #include #include #include @@ -17,6 +19,7 @@ #include "rocksdb/env.h" #include "port/port.h" #include "util/mutexlock.h" +#include "util/sync_point.h" namespace rocksdb { @@ -60,21 +63,36 @@ Status DBImpl::GetLiveFiles(std::vector& ret, *manifest_file_size = 0; + mutex_.Lock(); + if (flush_memtable) { // flush all dirty data to disk. 
- Status status = Flush(FlushOptions()); + Status status; + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->Ref(); + mutex_.Unlock(); + status = FlushMemTable(cfd, FlushOptions()); + mutex_.Lock(); + cfd->Unref(); + if (!status.ok()) { + break; + } + } + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + if (!status.ok()) { + mutex_.Unlock(); Log(options_.info_log, "Cannot Flush data %s\n", status.ToString().c_str()); return status; } } - MutexLock l(&mutex_); - // Make a set of all of the live *.sst files std::set live; - versions_->current()->AddLiveFiles(&live); + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->current()->AddLiveFiles(&live); + } ret.clear(); ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST @@ -91,24 +109,60 @@ Status DBImpl::GetLiveFiles(std::vector& ret, // find length of manifest file while holding the mutex lock *manifest_file_size = versions_->ManifestFileSize(); + mutex_.Unlock(); return Status::OK(); } Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { - // First get sorted files in archive dir, then append sorted files from main - // dir to maintain sorted order - - // list wal files in archive dir. + // First get sorted files in db dir, then get sorted files from archived + // dir, to avoid a race condition where a log file is moved to archived + // dir in between. Status s; + // list wal files in main db dir. + VectorLogPtr logs; + s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile); + if (!s.ok()) { + return s; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1"); + TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2"); + + files.clear(); + // list wal files in archive dir. 
std::string archivedir = ArchivalDirectory(options_.wal_dir); if (env_->FileExists(archivedir)) { - s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile); + s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); if (!s.ok()) { return s; } } - // list wal files in main db dir. - return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile); + + uint64_t latest_archived_log_number = 0; + if (!files.empty()) { + latest_archived_log_number = files.back()->LogNumber(); + Log(options_.info_log, "Latest Archived log: %" PRIu64, + latest_archived_log_number); + } + + files.reserve(files.size() + logs.size()); + for (auto& log : logs) { + if (log->LogNumber() > latest_archived_log_number) { + files.push_back(std::move(log)); + } else { + // When the race condition happens, we could see the + // same log in both db dir and archived dir. Simply + // ignore the one in db dir. Note that, if we read + // archived dir first, we would have missed the log file. + Log(options_.info_log, "%s already moved to archive", + log->PathName().c_str()); + } + } + + return s; } } diff --git a/db/db_impl.cc b/db/db_impl.cc index bb1f839a9..c0d8440dd 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "port/port.h" +#include "rocksdb/cache.h" #include "port/likely.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -64,12 +66,11 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/sync_point.h" namespace rocksdb { -int DBImpl::SuperVersion::dummy = 0; -void* const DBImpl::SuperVersion::kSVInUse = &DBImpl::SuperVersion::dummy; -void* const DBImpl::SuperVersion::kSVObsolete = nullptr; +const std::string default_column_family_name("default"); void DumpLeveldbBuildVersion(Logger * log); @@ -251,34 +252,31 @@ struct DBImpl::CompactionState { } 
}; +namespace { // Fix user-supplied options to be reasonable template static void ClipToRange(T* ptr, V minvalue, V maxvalue) { if (static_cast(*ptr) > maxvalue) *ptr = maxvalue; if (static_cast(*ptr) < minvalue) *ptr = minvalue; } +} // anonymous namespace + Options SanitizeOptions(const std::string& dbname, const InternalKeyComparator* icmp, const InternalFilterPolicy* ipolicy, const Options& src) { - Options result = src; - result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr; + auto db_options = SanitizeOptions(dbname, DBOptions(src)); + auto cf_options = SanitizeOptions(icmp, ipolicy, ColumnFamilyOptions(src)); + return Options(db_options, cf_options); +} + +DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { + DBOptions result = src; // result.max_open_files means an "infinite" open files. if (result.max_open_files != -1) { - ClipToRange(&result.max_open_files, 20, 1000000); - } - ClipToRange(&result.write_buffer_size, ((size_t)64)<<10, - ((size_t)64)<<30); - ClipToRange(&result.block_size, 1<<10, 4<<20); - - // if user sets arena_block_size, we trust user to use this value. 
Otherwise, - // calculate a proper value from writer_buffer_size; - if (result.arena_block_size <= 0) { - result.arena_block_size = result.write_buffer_size / 10; + ClipToRange(&result.max_open_files, 20, 1000000); } - result.min_write_buffer_number_to_merge = std::min( - result.min_write_buffer_number_to_merge, result.max_write_buffer_number-1); if (result.info_log == nullptr) { Status s = CreateLoggerFromOptions(dbname, result.db_log_dir, src.env, result, &result.info_log); @@ -287,57 +285,15 @@ Options SanitizeOptions(const std::string& dbname, result.info_log = nullptr; } } - if (result.block_cache == nullptr && !result.no_block_cache) { - result.block_cache = NewLRUCache(8 << 20); - } - result.compression_per_level = src.compression_per_level; - if (result.block_size_deviation < 0 || result.block_size_deviation > 100) { - result.block_size_deviation = 0; - } - if (result.max_mem_compaction_level >= result.num_levels) { - result.max_mem_compaction_level = result.num_levels - 1; - } - if (result.soft_rate_limit > result.hard_rate_limit) { - result.soft_rate_limit = result.hard_rate_limit; - } - if (result.compaction_filter) { - Log(result.info_log, "Compaction filter specified, ignore factory"); - } - if (result.prefix_extractor) { - Log(result.info_log, "prefix extractor %s in use.", - result.prefix_extractor->Name()); - } else { - assert(result.memtable_factory); - Slice name = result.memtable_factory->Name(); - if (name.compare("HashSkipListRepFactory") == 0 || - name.compare("HashLinkListRepFactory") == 0) { - Log(result.info_log, "prefix extractor is not provided while using %s. 
" - "fallback to skiplist", name.ToString().c_str()); - result.memtable_factory = std::make_shared(); - } - } if (result.wal_dir.empty()) { // Use dbname as default result.wal_dir = dbname; } - - // -- Sanitize the table properties collector - // All user defined properties collectors will be wrapped by - // UserKeyTablePropertiesCollector since for them they only have the - // knowledge of the user keys; internal keys are invisible to them. - auto& collectors = result.table_properties_collectors; - for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) { - assert(collectors[i]); - collectors[i] = - std::make_shared(collectors[i]); + if (result.wal_dir.back() == '/') { + result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); } - // Add collector to collect internal key statistics - collectors.push_back( - std::make_shared() - ); - return result; } @@ -384,24 +340,16 @@ CompressionType GetCompressionFlush(const Options& options) { } } -DBImpl::DBImpl(const Options& options, const std::string& dbname) +DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), dbname_(dbname), - internal_comparator_(options.comparator), - options_(SanitizeOptions(dbname, &internal_comparator_, - &internal_filter_policy_, options)), - internal_filter_policy_(options.filter_policy), - owns_info_log_(options_.info_log != options.info_log), + options_(SanitizeOptions(dbname, options)), db_lock_(nullptr), mutex_(options.use_adaptive_mutex), shutting_down_(nullptr), bg_cv_(&mutex_), - mem_(new MemTable(internal_comparator_, options_)), - imm_(options_.min_write_buffer_number_to_merge), logfile_number_(0), - super_version_(nullptr), - super_version_number_(0), - local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)), + default_cf_handle_(nullptr), tmp_batch_(), bg_schedule_needed_(false), bg_compaction_scheduled_(0), @@ -416,25 +364,26 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) 
last_stats_dump_time_microsec_(0), default_interval_to_delete_obsolete_WAL_(600), flush_on_destroy_(false), - internal_stats_(options.num_levels, options.env, - options.statistics.get()), delayed_writes_(0), storage_options_(options), bg_work_gate_closed_(false), refitting_level_(false), opened_successfully_(false) { - mem_->Ref(); env_->GetAbsolutePath(dbname, &db_absolute_path_); // Reserve ten files or so for other uses and give the rest to TableCache. // Give a large number for setting of "infinite" open files. const int table_cache_size = - (options_.max_open_files == -1) ? - 4194304 : options_.max_open_files - 10; - table_cache_.reset(new TableCache(dbname_, &options_, - storage_options_, table_cache_size)); - versions_.reset(new VersionSet(dbname_, &options_, storage_options_, - table_cache_.get(), &internal_comparator_)); + (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10; + // Reserve ten files or so for other uses and give the rest to TableCache. + table_cache_ = + NewLRUCache(table_cache_size, options_.table_cache_numshardbits, + options_.table_cache_remove_scan_count_limit); + + versions_.reset( + new VersionSet(dbname_, &options_, storage_options_, table_cache_.get())); + column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); DumpLeveldbBuildVersion(options_.info_log.get()); options_.Dump(options_.info_log.get()); @@ -453,27 +402,35 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) } DBImpl::~DBImpl() { - // Wait for background work to finish - if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) { - FlushMemTable(FlushOptions()); + mutex_.Lock(); + if (flush_on_destroy_) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->mem()->GetFirstSequenceNumber() != 0) { + cfd->Ref(); + mutex_.Unlock(); + FlushMemTable(cfd, FlushOptions()); + mutex_.Lock(); + cfd->Unref(); + } + } + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); } - 
mutex_.Lock(); + // Wait for background work to finish shutting_down_.Release_Store(this); // Any non-nullptr value is ok while (bg_compaction_scheduled_ || bg_flush_scheduled_ || bg_logstats_scheduled_) { bg_cv_.Wait(); } - mutex_.Unlock(); - // Release SuperVersion reference kept in ThreadLocalPtr. - // This must be done outside of mutex_ since unref handler can lock mutex. - // It also needs to be done after FlushMemTable, which can trigger local_sv_ - // access. - delete local_sv_; + if (default_cf_handle_ != nullptr) { + // we need to delete handle outside of lock because it does its own locking + mutex_.Unlock(); + delete default_cf_handle_; + mutex_.Lock(); + } - mutex_.Lock(); if (options_.allow_thread_local) { // Clean up obsolete files due to SuperVersion release. // (1) Need to delete to obsolete files before closing because RepairDB() @@ -495,79 +452,15 @@ DBImpl::~DBImpl() { } } - if (super_version_ != nullptr) { - bool is_last_reference __attribute__((unused)); - is_last_reference = super_version_->Unref(); - assert(is_last_reference); - super_version_->Cleanup(); - delete super_version_; - } - mutex_.Unlock(); - - if (db_lock_ != nullptr) { - env_->UnlockFile(db_lock_); - } - - if (mem_ != nullptr) { - delete mem_->Unref(); - } - - autovector to_delete; - imm_.current()->Unref(&to_delete); - for (MemTable* m: to_delete) { - delete m; - } - // versions need to be destroyed before table_cache since it can holds + // versions need to be destroyed before table_cache since it can hold // references to table_cache. versions_.reset(); - LogFlush(options_.info_log); -} - -// Do not flush and close database elegantly. Simulate a crash. -void DBImpl::TEST_Destroy_DBImpl() { - // ensure that no new memtable flushes can occur - flush_on_destroy_ = false; - - // wait till all background compactions are done. 
- mutex_.Lock(); - while (bg_compaction_scheduled_ || - bg_flush_scheduled_ || - bg_logstats_scheduled_) { - bg_cv_.Wait(); - } mutex_.Unlock(); - - // Release SuperVersion reference kept in ThreadLocalPtr. - // This must be done outside of mutex_ since unref handler can lock mutex. - // It also needs to be done after FlushMemTable, which can trigger local_sv_ - // access. - delete local_sv_; - - mutex_.Lock(); - if (super_version_ != nullptr) { - bool is_last_reference __attribute__((unused)); - is_last_reference = super_version_->Unref(); - assert(is_last_reference); - super_version_->Cleanup(); - delete super_version_; - } - - // Prevent new compactions from occuring. - bg_work_gate_closed_ = true; - const int LargeNumber = 10000000; - bg_compaction_scheduled_ += LargeNumber; - - mutex_.Unlock(); - LogFlush(options_.info_log); - - // force release the lock file. if (db_lock_ != nullptr) { env_->UnlockFile(db_lock_); } - log_.reset(); - versions_.reset(); - table_cache_.reset(); + LogFlush(options_.info_log); } uint64_t DBImpl::TEST_Current_Manifest_FileNo() { @@ -576,7 +469,6 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { Status DBImpl::NewDB() { VersionEdit new_db; - new_db.SetComparatorName(user_comparator()->Name()); new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -650,47 +542,6 @@ void DBImpl::MaybeDumpStats() { } } -// DBImpl::SuperVersion methods -DBImpl::SuperVersion::~SuperVersion() { - for (auto td : to_delete) { - delete td; - } -} - -DBImpl::SuperVersion* DBImpl::SuperVersion::Ref() { - refs.fetch_add(1, std::memory_order_relaxed); - return this; -} - -bool DBImpl::SuperVersion::Unref() { - assert(refs > 0); - // fetch_sub returns the previous value of yoeref - return refs.fetch_sub(1, std::memory_order_relaxed) == 1; -} - -void DBImpl::SuperVersion::Cleanup() { - db->mutex_.AssertHeld(); - assert(refs.load(std::memory_order_relaxed) == 0); - imm->Unref(&to_delete); - MemTable* m = mem->Unref(); - if (m != nullptr) { 
- to_delete.push_back(m); - } - current->Unref(); -} - -void DBImpl::SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm, - Version* new_current) { - db->mutex_.AssertHeld(); - mem = new_mem; - imm = new_imm; - current = new_current; - mem->Ref(); - imm->Ref(); - current->Ref(); - refs.store(1, std::memory_order_relaxed); -} - // Returns the list of live files in 'sst_live' and the list // of all files in the filesystem in 'candidate_files'. // no_full_scan = true -- never do the full scan using GetChildren() @@ -730,7 +581,7 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, deletion_state.manifest_file_number = versions_->ManifestFileNumber(); deletion_state.pending_manifest_file_number = versions_->PendingManifestFileNumber(); - deletion_state.log_number = versions_->LogNumber(); + deletion_state.log_number = versions_->MinLogNumber(); deletion_state.prev_log_number = versions_->PrevLogNumber(); if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) { @@ -864,7 +715,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (type == kTableFile) { // evict from cache - table_cache_->Evict(number); + TableCache::Evict(table_cache_.get(), number); } std::string fname = ((type == kLogFile) ? 
options_.wal_dir : dbname_) + @@ -872,7 +723,11 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (type == kLogFile && (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1"); Status s = env_->RenameFile(fname, archived_log_name); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2"); Log(options_.info_log, "Move log file %s to %s -- %s\n", fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); @@ -1020,7 +875,7 @@ void DBImpl::PurgeObsoleteWALFiles() { size_t files_del_num = log_files_num - files_keep_num; VectorLogPtr archived_logs; - AppendSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); if (files_del_num > archived_logs.size()) { Log(options_.info_log, "Trying to delete more archived log files than " @@ -1039,7 +894,9 @@ void DBImpl::PurgeObsoleteWALFiles() { } } -Status DBImpl::Recover(bool read_only, bool error_if_log_file_exist) { +Status DBImpl::Recover( + const std::vector& column_families, bool read_only, + bool error_if_log_file_exist) { mutex_.AssertHeld(); assert(db_lock_ == nullptr); @@ -1092,12 +949,14 @@ Status DBImpl::Recover(bool read_only, bool error_if_log_file_exist) { } } - Status s = versions_->Recover(); + Status s = versions_->Recover(column_families); if (options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } if (s.ok()) { SequenceNumber max_sequence(0); + default_cf_handle_ = new ColumnFamilyHandleImpl( + versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); // Recover from all newer log files than the ones named in the // descriptor (new log files may have been added by the previous @@ -1106,7 +965,7 @@ Status DBImpl::Recover(bool read_only, 
bool error_if_log_file_exist) { // Note that PrevLogNumber() is no longer used, but we pay // attention to it in case we are recovering a database // produced by an older version of rocksdb. - const uint64_t min_log = versions_->LogNumber(); + const uint64_t min_log = versions_->MinLogNumber(); const uint64_t prev_log = versions_->PrevLogNumber(); std::vector filenames; s = env_->GetChildren(options_.wal_dir, &filenames); @@ -1140,14 +999,8 @@ Status DBImpl::Recover(bool read_only, bool error_if_log_file_exist) { versions_->MarkFileNumberUsed(log); s = RecoverLogFile(log, &max_sequence, read_only); } - - if (s.ok()) { - if (versions_->LastSequence() < max_sequence) { - versions_->SetLastSequence(max_sequence); - } - SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, - versions_->LastSequence()); - } + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, + versions_->LastSequence()); } return s; @@ -1171,7 +1024,13 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, mutex_.AssertHeld(); - VersionEdit edit; + std::unordered_map version_edits; + // no need to refcount because iteration is under mutex + for (auto cfd : *versions_->GetColumnFamilySet()) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + version_edits.insert({cfd->GetID(), edit}); + } // Open the log file std::string fname = LogFileName(options_.wal_dir, log_number); @@ -1202,7 +1061,6 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, std::string scratch; Slice record; WriteBatch batch; - bool memtable_empty = true; while (reader.ReadRecord(&record, &scratch)) { if (record.size() < 12) { reporter.Corruption( @@ -1211,8 +1069,9 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, } WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, mem_, &options_); - memtable_empty = false; + status = WriteBatchInternal::InsertInto( + &batch, 
column_family_memtables_.get(), true, log_number); + MaybeIgnoreError(&status); if (!status.ok()) { return status; @@ -1224,52 +1083,86 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, *max_sequence = last_seq; } - if (!read_only && mem_->ShouldFlush()) { - status = WriteLevel0TableForRecovery(mem_, &edit); - // we still want to clear memtable, even if the recovery failed - delete mem_->Unref(); - mem_ = new MemTable(internal_comparator_, options_); - mem_->Ref(); - memtable_empty = true; + if (!read_only) { + // no need to refcount since client still doesn't have access + // to the DB and can not drop column families while we iterate + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->mem()->ShouldFlush()) { + // If this asserts, it means that InsertInto failed in + // filtering updates to already-flushed column families + assert(cfd->GetLogNumber() <= log_number); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); + // we still want to clear the memtable, even if the recovery failed + cfd->CreateNewMemtable(); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + return status; + } + } + } + } + } + + if (versions_->LastSequence() < *max_sequence) { + versions_->SetLastSequence(*max_sequence); + } + + if (!read_only) { + // no need to refcount since client still doesn't have access + // to the DB and can not drop column families while we iterate + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + + if (cfd->GetLogNumber() > log_number) { + // Column family cfd has already flushed the data + // from log_number. 
Memtable has to be empty because + // we filter the updates based on log_number + // (in WriteBatch::InsertInto) + assert(cfd->mem()->GetFirstSequenceNumber() == 0); + assert(edit->NumEntries() == 0); + continue; + } + + // flush the final memtable (if non-empty) + if (cfd->mem()->GetFirstSequenceNumber() != 0) { + status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); + } + // we still want to clear the memtable, even if the recovery failed + cfd->CreateNewMemtable(); + if (!status.ok()) { + return status; + } + + // write MANIFEST with update + // writing log number in the manifest means that any log file + // with number strongly less than (log_number + 1) is already + // recovered and should be ignored on next reincarnation. + // Since we already recovered log_number, we want all logs + // with numbers `<= log_number` (includes this one) to be ignored + edit->SetLogNumber(log_number + 1); + // we must mark the next log number as used, even though it's + // not actually used. that is because VersionSet assumes + // VersionSet::next_file_number_ always to be strictly greater than any + // log number + versions_->MarkFileNumberUsed(log_number + 1); + status = versions_->LogAndApply(cfd, edit, &mutex_); if (!status.ok()) { - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. return status; } } } - if (!memtable_empty && !read_only) { - status = WriteLevel0TableForRecovery(mem_, &edit); - delete mem_->Unref(); - mem_ = new MemTable(internal_comparator_, options_); - mem_->Ref(); - if (!status.ok()) { - return status; - } - } - - if (edit.NumEntries() > 0) { - // if read_only, NumEntries() will be 0 - assert(!read_only); - // writing log number in the manifest means that any log file - // with number strongly less than (log_number + 1) is already - // recovered and should be ignored on next reincarnation. 
- // Since we already recovered log_number, we want all logs - // with numbers `<= log_number` (includes this one) to be ignored - edit.SetLogNumber(log_number + 1); - // we must mark the next log number as used, even though it's - // not actually used. that is because VersionSet assumes - // VersionSet::next_file_number_ always to be strictly greater than any log - // number - versions_->MarkFileNumberUsed(log_number + 1); - status = versions_->LogAndApply(&edit, &mutex_); - } - return status; } -Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { +Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, + VersionEdit* edit) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; @@ -1285,10 +1178,10 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { Status s; { mutex_.Unlock(); - s = BuildTable(dbname_, env_, options_, storage_options_, - table_cache_.get(), iter, &meta, internal_comparator_, + s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, + cfd->table_cache(), iter, &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(options_)); + GetCompressionFlush(*cfd->options())); LogFlush(options_.info_log); mutex_.Lock(); } @@ -1314,15 +1207,14 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) { stats.micros = env_->NowMicros() - start_micros; stats.bytes_written = meta.file_size; stats.files_out_levelnp1 = 1; - internal_stats_.AddCompactionStats(level, stats); + cfd->internal_stats()->AddCompactionStats(level, stats); RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); return s; } - -Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, - uint64_t* filenumber, - LogBuffer* log_buffer) { +Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, + autovector& mems, VersionEdit* edit, + uint64_t* filenumber, LogBuffer* 
log_buffer) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; @@ -1333,7 +1225,7 @@ Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = mems[0]->GetFirstSequenceNumber(); - Version* base = versions_->current(); + Version* base = cfd->current(); base->Ref(); // it is likely that we do not need this reference Status s; { @@ -1342,20 +1234,19 @@ Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, std::vector memtables; for (MemTable* m : mems) { Log(options_.info_log, - "Flushing memtable with log file: %lu\n", - (unsigned long)m->GetLogNumber()); + "[CF %u] Flushing memtable with next log file: %lu\n", cfd->GetID(), + (unsigned long)m->GetNextLogNumber()); memtables.push_back(m->NewIterator()); } - Iterator* iter = NewMergingIterator( - env_, &internal_comparator_, &memtables[0], memtables.size()); - Log(options_.info_log, - "Level-0 flush table #%lu: started", + Iterator* iter = NewMergingIterator(&cfd->internal_comparator(), + &memtables[0], memtables.size()); + Log(options_.info_log, "Level-0 flush table #%lu: started", (unsigned long)meta.number); - s = BuildTable(dbname_, env_, options_, storage_options_, - table_cache_.get(), iter, &meta, internal_comparator_, + s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, + cfd->table_cache(), iter, &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(options_)); + GetCompressionFlush(*cfd->options())); LogFlush(options_.info_log); delete iter; Log(options_.info_log, "Level-0 flush table #%lu: %lu bytes %s", @@ -1370,7 +1261,7 @@ Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, base->Unref(); // re-acquire the most current version - base = versions_->current(); + base = cfd->current(); // There could be multiple threads writing to its own level-0 file. 
// The pending_outputs cannot be cleared here, otherwise this newly @@ -1392,7 +1283,7 @@ Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, // threads could be concurrently producing compacted files for // that key range. if (base != nullptr && options_.max_background_compactions <= 1 && - options_.compaction_style == kCompactionStyleLevel) { + cfd->options()->compaction_style == kCompactionStyleLevel) { level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } edit->AddFile(level, meta.number, meta.file_size, @@ -1403,22 +1294,23 @@ Status DBImpl::WriteLevel0Table(autovector& mems, VersionEdit* edit, InternalStats::CompactionStats stats; stats.micros = env_->NowMicros() - start_micros; stats.bytes_written = meta.file_size; - internal_stats_.AddCompactionStats(level, stats); + cfd->internal_stats()->AddCompactionStats(level, stats); RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); return s; } -Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, +Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, + bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer) { mutex_.AssertHeld(); - assert(imm_.size() != 0); - assert(imm_.IsFlushPending()); + assert(cfd->imm()->size() != 0); + assert(cfd->imm()->IsFlushPending()); // Save the contents of the earliest memtable as a new Table uint64_t file_number; autovector mems; - imm_.PickMemtablesToFlush(&mems); + cfd->imm()->PickMemtablesToFlush(&mems); if (mems.empty()) { LogToBuffer(log_buffer, "Nothing in memstore to flush"); return Status::OK(); @@ -1434,32 +1326,28 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, // SetLogNumber(log_num) indicates logs with number smaller than log_num // will no longer be picked up for recovery. 
edit->SetLogNumber(mems.back()->GetNextLogNumber()); - - std::vector logs_to_delete; - for (auto mem : mems) { - logs_to_delete.push_back(mem->GetLogNumber()); - } + edit->SetColumnFamily(cfd->GetID()); // This will release and re-acquire the mutex. - Status s = WriteLevel0Table(mems, edit, &file_number, log_buffer); + Status s = WriteLevel0Table(cfd, mems, edit, &file_number, log_buffer); - if (s.ok() && shutting_down_.Acquire_Load()) { + if (s.ok() && shutting_down_.Acquire_Load() && cfd->IsDropped()) { s = Status::ShutdownInProgress( - "Database shutdown started during memtable compaction"); + "Database shutdown or Column family drop during flush"); } if (!s.ok()) { - imm_.RollbackMemtableFlush(mems, file_number, &pending_outputs_); + cfd->imm()->RollbackMemtableFlush(mems, file_number, &pending_outputs_); } else { // Replace immutable memtable with the generated Table - s = imm_.InstallMemtableFlushResults( - mems, versions_.get(), &mutex_, options_.info_log.get(), file_number, - pending_outputs_, &deletion_state.memtables_to_free, - db_directory_.get()); + s = cfd->imm()->InstallMemtableFlushResults( + cfd, mems, versions_.get(), &mutex_, options_.info_log.get(), + file_number, pending_outputs_, &deletion_state.memtables_to_free, + db_directory_.get(), log_buffer); } if (s.ok()) { - InstallSuperVersion(deletion_state); + InstallSuperVersion(cfd, deletion_state); if (madeProgress) { *madeProgress = 1; } @@ -1468,10 +1356,11 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, if (disable_delete_obsolete_files_ == 0) { // add to deletion state - deletion_state.log_delete_files.insert( - deletion_state.log_delete_files.end(), - logs_to_delete.begin(), - logs_to_delete.end()); + while (alive_log_files_.size() && + *alive_log_files_.begin() < versions_->MinLogNumber()) { + deletion_state.log_delete_files.push_back(*alive_log_files_.begin()); + alive_log_files_.pop_front(); + } } } @@ -1484,11 +1373,13 @@ Status DBImpl::FlushMemTableToOutputFile(bool* 
madeProgress, return s; } -Status DBImpl::CompactRange(const Slice* begin, - const Slice* end, - bool reduce_level, - int target_level) { - Status s = FlushMemTable(FlushOptions()); +Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool reduce_level, int target_level) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + Status s = FlushMemTable(cfd, FlushOptions()); if (!s.ok()) { LogFlush(options_.info_log); return s; @@ -1497,8 +1388,8 @@ Status DBImpl::CompactRange(const Slice* begin, int max_level_with_files = 1; { MutexLock l(&mutex_); - Version* base = versions_->current(); - for (int level = 1; level < NumberLevels(); level++) { + Version* base = cfd->current(); + for (int level = 1; level < cfd->NumberLevels(); level++) { if (base->OverlapInLevel(level, begin, end)) { max_level_with_files = level; } @@ -1507,11 +1398,11 @@ Status DBImpl::CompactRange(const Slice* begin, for (int level = 0; level <= max_level_with_files; level++) { // in case the compaction is unversal or if we're compacting the // bottom-most level, the output level will be the same as input one - if (options_.compaction_style == kCompactionStyleUniversal || + if (cfd->options()->compaction_style == kCompactionStyleUniversal || level == max_level_with_files) { - s = RunManualCompaction(level, level, begin, end); + s = RunManualCompaction(cfd, level, level, begin, end); } else { - s = RunManualCompaction(level, level + 1, begin, end); + s = RunManualCompaction(cfd, level, level + 1, begin, end); } if (!s.ok()) { LogFlush(options_.info_log); @@ -1520,7 +1411,7 @@ Status DBImpl::CompactRange(const Slice* begin, } if (reduce_level) { - s = ReFitLevel(max_level_with_files, target_level); + s = ReFitLevel(cfd, max_level_with_files, target_level); } LogFlush(options_.info_log); @@ -1528,23 +1419,26 @@ Status DBImpl::CompactRange(const Slice* begin, } // return the same level if it cannot be moved -int 
DBImpl::FindMinimumEmptyLevelFitting(int level) { +int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) { mutex_.AssertHeld(); - Version* current = versions_->current(); + Version* current = cfd->current(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; + if (cfd->compaction_picker()->MaxBytesForLevel(i) < + current->NumLevelBytes(level)) { + break; + } minimum_level = i; } return minimum_level; } -Status DBImpl::ReFitLevel(int level, int target_level) { - assert(level < NumberLevels()); +Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { + assert(level < cfd->NumberLevels()); SuperVersion* superversion_to_free = nullptr; SuperVersion* new_superversion = new SuperVersion(); @@ -1572,7 +1466,7 @@ Status DBImpl::ReFitLevel(int level, int target_level) { // move to a smaller level int to_level = target_level; if (target_level < 0) { - to_level = FindMinimumEmptyLevelFitting(level); + to_level = FindMinimumEmptyLevelFitting(cfd, level); } assert(to_level <= level); @@ -1580,10 +1474,11 @@ Status DBImpl::ReFitLevel(int level, int target_level) { Status status; if (to_level < level) { Log(options_.info_log, "Before refitting:\n%s", - versions_->current()->DebugString().data()); + cfd->current()->DebugString().data()); VersionEdit edit; - for (const auto& f : versions_->current()->files_[level]) { + edit.SetColumnFamily(cfd->GetID()); + for (const auto& f : cfd->current()->files_[level]) { edit.DeleteFile(level, f->number); edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); @@ -1591,15 +1486,15 @@ Status DBImpl::ReFitLevel(int level, int target_level) { Log(options_.info_log, "Apply version edit:\n%s", edit.DebugString().data()); - 
status = versions_->LogAndApply(&edit, &mutex_, db_directory_.get()); - superversion_to_free = InstallSuperVersion(new_superversion); + status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); + superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); new_superversion = nullptr; Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data()); if (status.ok()) { Log(options_.info_log, "After refitting:\n%s", - versions_->current()->DebugString().data()); + cfd->current()->DebugString().data()); } } @@ -1612,24 +1507,25 @@ Status DBImpl::ReFitLevel(int level, int target_level) { return status; } -int DBImpl::NumberLevels() { - return options_.num_levels; +int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + return cfh->cfd()->NumberLevels(); } -int DBImpl::MaxMemCompactionLevel() { - return options_.max_mem_compaction_level; +int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + return cfh->cfd()->options()->max_mem_compaction_level; } -int DBImpl::Level0StopWriteTrigger() { - return options_.level0_stop_writes_trigger; +int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + return cfh->cfd()->options()->level0_stop_writes_trigger; } -uint64_t DBImpl::CurrentVersionNumber() const { - return super_version_number_.load(); -} - -Status DBImpl::Flush(const FlushOptions& options) { - return FlushMemTable(options); +Status DBImpl::Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + return FlushMemTable(cfh->cfd(), options); } SequenceNumber DBImpl::GetLatestSequenceNumber() const { @@ -1791,20 +1687,14 @@ struct CompareLogByPointer { } }; -Status DBImpl::AppendSortedWalsOfType(const std::string& path, +Status DBImpl::GetSortedWalsOfType(const std::string& path, VectorLogPtr& 
log_files, WalFileType log_type) { std::vector all_files; const Status status = env_->GetChildren(path, &all_files); if (!status.ok()) { return status; } - log_files.reserve(log_files.size() + all_files.size()); - VectorLogPtr::iterator pos_start; - if (!log_files.empty()) { - pos_start = log_files.end() - 1; - } else { - pos_start = log_files.begin(); - } + log_files.reserve(all_files.size()); for (const auto& f : all_files) { uint64_t number; FileType type; @@ -1830,19 +1720,19 @@ Status DBImpl::AppendSortedWalsOfType(const std::string& path, } } CompareLogByPointer compare_log_files; - std::sort(pos_start, log_files.end(), compare_log_files); + std::sort(log_files.begin(), log_files.end(), compare_log_files); return status; } -Status DBImpl::RunManualCompaction(int input_level, - int output_level, - const Slice* begin, +Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, + int output_level, const Slice* begin, const Slice* end) { assert(input_level >= 0); InternalKey begin_storage, end_storage; ManualCompaction manual; + manual.cfd = cfd; manual.input_level = input_level; manual.output_level = output_level; manual.done = false; @@ -1850,14 +1740,14 @@ Status DBImpl::RunManualCompaction(int input_level, // For universal compaction, we enforce every manual compaction to compact // all files. 
if (begin == nullptr || - options_.compaction_style == kCompactionStyleUniversal) { + cfd->options()->compaction_style == kCompactionStyleUniversal) { manual.begin = nullptr; } else { begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); manual.begin = &begin_storage; } if (end == nullptr || - options_.compaction_style == kCompactionStyleUniversal) { + cfd->options()->compaction_style == kCompactionStyleUniversal) { manual.end = nullptr; } else { end_storage = InternalKey(*end, 0, static_cast(0)); @@ -1907,33 +1797,42 @@ Status DBImpl::RunManualCompaction(int input_level, return manual.status; } -Status DBImpl::TEST_CompactRange(int level, - const Slice* begin, - const Slice* end) { - int output_level = (options_.compaction_style == kCompactionStyleUniversal) - ? level - : level + 1; - return RunManualCompaction(level, output_level, begin, end); +Status DBImpl::TEST_CompactRange(int level, const Slice* begin, + const Slice* end, + ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } + int output_level = + (cfd->options()->compaction_style == kCompactionStyleUniversal) + ? 
level + : level + 1; + return RunManualCompaction(cfd, level, output_level, begin, end); } -Status DBImpl::FlushMemTable(const FlushOptions& options) { +Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& options) { // nullptr batch means just wait for earlier writes to be done Status s = Write(WriteOptions(), nullptr); if (s.ok() && options.wait) { // Wait until the compaction completes - s = WaitForFlushMemTable(); + s = WaitForFlushMemTable(cfd); } return s; } -Status DBImpl::WaitForFlushMemTable() { +Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { Status s; // Wait until the compaction completes MutexLock l(&mutex_); - while (imm_.size() > 0 && bg_error_.ok()) { + while (cfd->imm()->size() > 0 && bg_error_.ok()) { bg_cv_.Wait(); } - if (imm_.size() != 0) { + if (!bg_error_.ok()) { s = bg_error_; } return s; @@ -1942,11 +1841,18 @@ Status DBImpl::WaitForFlushMemTable() { Status DBImpl::TEST_FlushMemTable(bool wait) { FlushOptions fo; fo.wait = wait; - return FlushMemTable(fo); + return FlushMemTable(default_cf_handle_->cfd(), fo); } -Status DBImpl::TEST_WaitForFlushMemTable() { - return WaitForFlushMemTable(); +Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } + return WaitForFlushMemTable(cfd); } Status DBImpl::TEST_WaitForCompact() { @@ -1972,24 +1878,37 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } else if (shutting_down_.Acquire_Load()) { // DB is being deleted; no more background compactions } else { - bool is_flush_pending = imm_.IsFlushPending(); + bool is_flush_pending = false; + // no need to refcount since we're under a mutex + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->imm()->IsFlushPending()) { + is_flush_pending = true; + } + } if (is_flush_pending) { + // memtable flush needed if 
(bg_flush_scheduled_ < options_.max_background_flushes) { - // memtable flush needed bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); } else if (options_.max_background_flushes > 0) { bg_schedule_needed_ = true; } } + bool is_compaction_needed = false; + // no need to refcount since we're under a mutex + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->current()->NeedsCompaction()) { + is_compaction_needed = true; + break; + } + } // Schedule BGWorkCompaction if there's a compaction pending (or a memtable - // flush, but the HIGH pool is not enabled). Do it only if - // max_background_compactions hasn't been reached and, in case + // flush, but the HIGH pool is not enabled) + // Do it only if max_background_compactions hasn't been reached and, in case // bg_manual_only_ > 0, if it's a manual compaction. - if ((manual_compaction_ || - versions_->current()->NeedsCompaction() || - (is_flush_pending && (options_.max_background_flushes <= 0))) && + if ((manual_compaction_ || is_compaction_needed || + (is_flush_pending && options_.max_background_flushes == 0)) && (!bg_manual_only_ || manual_compaction_)) { if (bg_compaction_scheduled_ < options_.max_background_compactions) { bg_compaction_scheduled_++; @@ -2012,14 +1931,31 @@ void DBImpl::BGWorkCompaction(void* db) { Status DBImpl::BackgroundFlush(bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer) { - Status stat; - while (stat.ok() && imm_.IsFlushPending()) { - Log(options_.info_log, - "BackgroundCallFlush doing FlushMemTableToOutputFile, flush slots available %d", - options_.max_background_flushes - bg_flush_scheduled_); - stat = FlushMemTableToOutputFile(madeProgress, deletion_state, log_buffer); + mutex_.AssertHeld(); + // call_status is failure if at least one flush was a failure. even if + // flushing one column family reports a failure, we will continue flushing + // other column families. 
however, call_status will be a failure in that case. + Status call_status; + // refcounting in iteration + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->Ref(); + Status flush_status; + while (flush_status.ok() && cfd->imm()->IsFlushPending()) { + LogToBuffer( + log_buffer, + "BackgroundCallFlush doing FlushMemTableToOutputFile with column " + "family %u, flush slots available %d", + cfd->GetID(), options_.max_background_flushes - bg_flush_scheduled_); + flush_status = FlushMemTableToOutputFile(cfd, madeProgress, + deletion_state, log_buffer); + } + if (call_status.ok() && !flush_status.ok()) { + call_status = flush_status; + } + cfd->Unref(); } - return stat; + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + return call_status; } void DBImpl::BackgroundCallFlush() { @@ -2039,7 +1975,9 @@ void DBImpl::BackgroundCallFlush() { // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of // the problem. - uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount(); + uint64_t error_cnt = default_cf_handle_->cfd() + ->internal_stats() + ->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); Log(options_.info_log, @@ -2094,7 +2032,7 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { uint64_t DBImpl::TEST_GetLevel0TotalSize() { MutexLock l(&mutex_); - return versions_->current()->NumLevelBytes(0); + return default_cf_handle_->cfd()->current()->NumLevelBytes(0); } void DBImpl::BackgroundCallCompaction() { @@ -2116,7 +2054,9 @@ void DBImpl::BackgroundCallCompaction() { // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of // the problem. 
- uint64_t error_cnt = internal_stats_.BumpAndGetBackgroundErrorCount(); + uint64_t error_cnt = default_cf_handle_->cfd() + ->internal_stats() + ->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); @@ -2155,6 +2095,8 @@ void DBImpl::BackgroundCallCompaction() { MaybeScheduleLogDBDeployStats(); + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + // Previous compaction may have produced too many files in a level, // So reschedule another compaction if we made progress in the // last compaction. @@ -2182,28 +2124,34 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); + if (is_manual) { // another thread cannot pick up the same work manual_compaction_->in_progress = true; } - // TODO: remove memtable flush from formal compaction - while (imm_.IsFlushPending()) { - LogToBuffer(log_buffer, - "BackgroundCompaction doing FlushMemTableToOutputFile, " - "compaction slots " - "available %d", - options_.max_background_compactions - bg_compaction_scheduled_); - Status stat = FlushMemTableToOutputFile(madeProgress, deletion_state, - log_buffer); - if (!stat.ok()) { - if (is_manual) { - manual_compaction_->status = stat; - manual_compaction_->done = true; - manual_compaction_->in_progress = false; - manual_compaction_ = nullptr; + // FLUSH preempts compaction + Status flush_stat; + for (auto cfd : *versions_->GetColumnFamilySet()) { + while (cfd->imm()->IsFlushPending()) { + LogToBuffer( + log_buffer, + "BackgroundCompaction doing FlushMemTableToOutputFile, " + "compaction slots available %d", + options_.max_background_compactions - bg_compaction_scheduled_); + cfd->Ref(); + flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, + log_buffer); + cfd->Unref(); + if (!flush_stat.ok()) { + if (is_manual) { + manual_compaction_->status = flush_stat; + 
manual_compaction_->done = true; + manual_compaction_->in_progress = false; + manual_compaction_ = nullptr; + } + return flush_stat; } - return stat; } } @@ -2213,8 +2161,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, if (is_manual) { ManualCompaction* m = manual_compaction_; assert(m->in_progress); - c.reset(versions_->CompactRange( - m->input_level, m->output_level, m->begin, m->end, &manual_end)); + c.reset(m->cfd->CompactRange(m->input_level, m->output_level, m->begin, + m->end, &manual_end)); if (!c) { m->done = true; } @@ -2228,8 +2176,19 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, ((m->done || manual_end == nullptr) ? "(end)" : manual_end->DebugString().c_str())); - } else if (!options_.disable_auto_compactions) { - c.reset(versions_->PickCompaction(log_buffer)); + } else { + // no need to refcount in iteration since it's always under a mutex + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->options()->disable_auto_compactions) { + c.reset(cfd->PickCompaction(log_buffer)); + if (c != nullptr) { + // update statistics + MeasureTime(options_.statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); + break; + } + } + } } Status status; @@ -2244,22 +2203,23 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, c->edit()->AddFile(c->level() + 1, f->number, f->file_size, f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); - status = versions_->LogAndApply(c->edit(), &mutex_, db_directory_.get()); - InstallSuperVersion(deletion_state); + status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, + db_directory_.get()); + InstallSuperVersion(c->column_family_data(), deletion_state); + Version::LevelSummaryStorage tmp; LogToBuffer(log_buffer, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), c->level() + 1, - static_cast(f->file_size), - status.ToString().c_str(), - versions_->current()->LevelSummary(&tmp)); - 
versions_->ReleaseCompactionFiles(c.get(), status); + static_cast(f->number), c->level() + 1, + static_cast(f->file_size), + status.ToString().c_str(), c->input_version()->LevelSummary(&tmp)); + c->ReleaseCompactionFiles(status); *madeProgress = true; } else { MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. CompactionState* compact = new CompactionState(c.get()); status = DoCompactionWork(compact, deletion_state, log_buffer); CleanupCompaction(compact, status); - versions_->ReleaseCompactionFiles(c.get(), status); + c->ReleaseCompactionFiles(status); c->ReleaseInputs(); *madeProgress = true; } @@ -2303,7 +2263,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. // Universal compaction should always compact the whole range - assert(options_.compaction_style != kCompactionStyleUniversal); + assert(m->cfd->options()->compaction_style != kCompactionStyleUniversal); m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } @@ -2329,7 +2289,7 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { // If this file was inserted into the table cache then remove // them here because this compaction was not committed. if (!status.ok()) { - table_cache_->Evict(out.number); + TableCache::Evict(table_cache_.get(), out.number); } } delete compact; @@ -2389,16 +2349,18 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { if (s.ok()) { // Over-estimate slightly so we don't end up just barely crossing // the threshold. 
+ ColumnFamilyData* cfd = compact->compaction->column_family_data(); compact->outfile->SetPreallocationBlockSize( - 1.1 * versions_->MaxFileSizeForLevel(compact->compaction->output_level())); + 1.1 * cfd->compaction_picker()->MaxFileSizeForLevel( + compact->compaction->output_level())); - CompressionType compression_type = GetCompressionType( - options_, compact->compaction->output_level(), - compact->compaction->enable_compression()); + CompressionType compression_type = + GetCompressionType(*cfd->options(), compact->compaction->output_level(), + compact->compaction->enable_compression()); - compact->builder.reset(NewTableBuilder(options_, internal_comparator_, - compact->outfile.get(), - compression_type)); + compact->builder.reset( + NewTableBuilder(*cfd->options(), cfd->internal_comparator(), + compact->outfile.get(), compression_type)); } LogFlush(options_.info_log); return s; @@ -2445,9 +2407,10 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && current_entries > 0) { // Verify that the table is usable + ColumnFamilyData* cfd = compact->compaction->column_family_data(); FileMetaData meta(output_number, current_bytes); - Iterator* iter = table_cache_->NewIterator(ReadOptions(), storage_options_, - internal_comparator_, meta); + Iterator* iter = cfd->table_cache()->NewIterator( + ReadOptions(), storage_options_, cfd->internal_comparator(), meta); s = iter->status(); delete iter; if (s.ok()) { @@ -2462,7 +2425,8 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, } -Status DBImpl::InstallCompactionResults(CompactionState* compact) { +Status DBImpl::InstallCompactionResults(CompactionState* compact, + LogBuffer* log_buffer) { mutex_.AssertHeld(); // paranoia: verify that the files that we started with @@ -2474,16 +2438,16 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - 
compact->compaction->level() + 1); + compact->compaction->output_level()); return Status::Corruption("Compaction input files inconsistent"); } - Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1, - static_cast(compact->total_bytes)); + LogToBuffer(log_buffer, "Compacted %d@%d + %d@%d files => %lld bytes", + compact->compaction->num_input_files(0), + compact->compaction->level(), + compact->compaction->num_input_files(1), + compact->compaction->output_level(), + static_cast(compact->total_bytes)); // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); @@ -2493,11 +2457,11 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { compact->compaction->output_level(), out.number, out.file_size, out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); } - return versions_->LogAndApply(compact->compaction->edit(), &mutex_, + return versions_->LogAndApply(compact->compaction->column_family_data(), + compact->compaction->edit(), &mutex_, db_directory_.get()); } -// // Given a sequence number, return the sequence number of the // earliest snapshot that this sequence number is visible in. 
// The snapshots themselves are arranged in ascending order of @@ -2525,6 +2489,25 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( return 0; } +uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, + DeletionState& deletion_state, + LogBuffer* log_buffer) { + if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) { + const uint64_t imm_start = env_->NowMicros(); + mutex_.Lock(); + if (cfd->imm()->IsFlushPending()) { + cfd->Ref(); + FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer); + cfd->Unref(); + bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + } + mutex_.Unlock(); + log_buffer->FlushBufferToLog(); + return env_->NowMicros() - imm_start; + } + return 0; +} + Status DBImpl::ProcessKeyValueCompaction( SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, @@ -2546,34 +2529,28 @@ Status DBImpl::ProcessKeyValueCompaction( SequenceNumber last_sequence_for_key __attribute__((unused)) = kMaxSequenceNumber; SequenceNumber visible_in_snapshot = kMaxSequenceNumber; - MergeHelper merge(user_comparator(), options_.merge_operator.get(), - options_.info_log.get(), - options_.min_partial_merge_operands, - false /* internal key corruption is expected */); - auto compaction_filter = options_.compaction_filter; + ColumnFamilyData* cfd = compact->compaction->column_family_data(); + MergeHelper merge( + cfd->user_comparator(), cfd->options()->merge_operator.get(), + options_.info_log.get(), cfd->options()->min_partial_merge_operands, + false /* internal key corruption is expected */); + auto compaction_filter = cfd->options()->compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; if (!compaction_filter) { auto context = compact->GetFilterContextV1(); compaction_filter_from_factory = - options_.compaction_filter_factory->CreateCompactionFilter(context); + cfd->options()->compaction_filter_factory->CreateCompactionFilter( + context); compaction_filter = 
compaction_filter_from_factory.get(); } - for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Prioritize immutable compaction work - // TODO: remove memtable flush from normal compaction work - if (imm_.imm_flush_needed.NoBarrier_Load() != nullptr) { - const uint64_t imm_start = env_->NowMicros(); - LogFlush(options_.info_log); - mutex_.Lock(); - if (imm_.IsFlushPending()) { - FlushMemTableToOutputFile(nullptr, deletion_state, log_buffer); - bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary - } - mutex_.Unlock(); - log_buffer->FlushBufferToLog(); - imm_micros += (env_->NowMicros() - imm_start); - } + while (input->Valid() && !shutting_down_.Acquire_Load() && + !cfd->IsDropped()) { + // FLUSH preempts compaction + // TODO(icanadi) this currently only checks if flush is necessary on + // compacting column family. we should also check if flush is necessary on + // other column families, too + imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer); Slice key; Slice value; @@ -2618,8 +2595,8 @@ Status DBImpl::ProcessKeyValueCompaction( visible_in_snapshot = kMaxSequenceNumber; } else { if (!has_current_user_key || - user_comparator()->Compare(ikey.user_key, - Slice(current_user_key)) != 0) { + cfd->user_comparator()->Compare(ikey.user_key, + Slice(current_user_key)) != 0) { // First occurrence of this user key current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); has_current_user_key = true; @@ -2636,11 +2613,9 @@ Status DBImpl::ProcessKeyValueCompaction( // the entry with a delete marker. 
bool value_changed = false; compaction_filter_value.clear(); - bool to_delete = - compaction_filter->Filter(compact->compaction->level(), - ikey.user_key, value, - &compaction_filter_value, - &value_changed); + bool to_delete = compaction_filter->Filter( + compact->compaction->level(), ikey.user_key, value, + &compaction_filter_value, &value_changed); if (to_delete) { // make a copy of the original key delete_key.assign(key.data(), key.data() + key.size()); @@ -2658,7 +2633,6 @@ Status DBImpl::ProcessKeyValueCompaction( value = compaction_filter_value; } } - } // If there are no snapshots, then this kv affect visibility at tip. @@ -2906,19 +2880,19 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, bool prefix_initialized = false; int64_t imm_micros = 0; // Micros spent doing imm_ compactions - Log(options_.info_log, - "Compacting %d@%d + %d@%d files, score %.2f slots available %d", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->output_level(), - compact->compaction->score(), + ColumnFamilyData* cfd = compact->compaction->column_family_data(); + LogToBuffer( + log_buffer, + "[CF %u] Compacting %d@%d + %d@%d files, score %.2f slots available %d", + cfd->GetID(), compact->compaction->num_input_files(0), + compact->compaction->level(), compact->compaction->num_input_files(1), + compact->compaction->output_level(), compact->compaction->score(), options_.max_background_compactions - bg_compaction_scheduled_); char scratch[2345]; compact->compaction->Summary(scratch, sizeof(scratch)); - Log(options_.info_log, "Compaction start summary: %s\n", scratch); + LogToBuffer(log_buffer, "Compaction start summary: %s\n", scratch); - assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); + assert(cfd->current()->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -2946,6 +2920,7 @@ Status 
DBImpl::DoCompactionWork(CompactionState* compact, // Release mutex while we're actually doing the compaction work mutex_.Unlock(); + log_buffer->FlushBufferToLog(); const uint64_t start_micros = env_->NowMicros(); unique_ptr input(versions_->MakeInputIterator(compact->compaction)); @@ -2960,7 +2935,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, = nullptr; auto context = compact->GetFilterContext(); compaction_filter_from_factory_v2 = - options_.compaction_filter_factory_v2->CreateCompactionFilterV2(context); + cfd->options()->compaction_filter_factory_v2->CreateCompactionFilterV2( + context); auto compaction_filter_v2 = compaction_filter_from_factory_v2.get(); @@ -2972,28 +2948,22 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // 3) merge value_buffer with ineligible_value_buffer; // 4) run the modified "compaction" using the old for loop. if (compaction_filter_v2) { - for (; backup_input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Prioritize immutable compaction work - if (imm_.imm_flush_needed.NoBarrier_Load() != nullptr) { - const uint64_t imm_start = env_->NowMicros(); - LogFlush(options_.info_log); - mutex_.Lock(); - if (imm_.IsFlushPending()) { - FlushMemTableToOutputFile(nullptr, deletion_state, log_buffer); - bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary - } - mutex_.Unlock(); - imm_micros += (env_->NowMicros() - imm_start); - } + while (backup_input->Valid() && !shutting_down_.Acquire_Load() && + !cfd->IsDropped()) { + // FLUSH preempts compaction + // TODO(icanadi) this currently only checks if flush is necessary on + // compacting column family. 
we should also check if flush is necessary on + // other column families, too + imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer); Slice key = backup_input->key(); Slice value = backup_input->value(); const SliceTransform* transformer = - options_.compaction_filter_factory_v2->GetPrefixExtractor(); - std::string key_prefix = transformer->Transform(key).ToString(); + cfd->options()->compaction_filter_factory_v2->GetPrefixExtractor(); + const auto key_prefix = transformer->Transform(key); if (!prefix_initialized) { - compact->cur_prefix_ = key_prefix; + compact->cur_prefix_ = key_prefix.ToString(); prefix_initialized = true; } if (!ParseInternalKey(key, &ikey)) { @@ -3003,7 +2973,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, continue; } else { // If the prefix remains the same, keep buffering - if (key_prefix == compact->cur_prefix_) { + if (key_prefix.compare(Slice(compact->cur_prefix_)) == 0) { // Apply the compaction filter V2 to all the kv pairs sharing // the same prefix if (ikey.type == kTypeValue && @@ -3024,12 +2994,12 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (compact->key_buf_.size() > 0) { CallCompactionFilterV2(compact, compaction_filter_v2); } - compact->cur_prefix_ = key_prefix; + compact->cur_prefix_ = key_prefix.ToString(); } } // Merge this batch of data (values + ineligible keys) - compact->MergeKeyValueSliceBuffer(&internal_comparator_); + compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); // Done buffering for the current prefix. 
Spit it out to disk // Now just iterate through all the kv-pairs @@ -3066,7 +3036,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (compact->key_buf_.size() > 0) { CallCompactionFilterV2(compact, compaction_filter_v2); } - compact->MergeKeyValueSliceBuffer(&internal_comparator_); + compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction( visible_at_tip, @@ -3088,7 +3058,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (compact->key_buf_.size() > 0) { CallCompactionFilterV2(compact, compaction_filter_v2); } - compact->MergeKeyValueSliceBuffer(&internal_comparator_); + compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction( visible_at_tip, earliest_snapshot, @@ -3116,9 +3086,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, log_buffer); } - if (status.ok() && shutting_down_.Acquire_Load()) { + if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { status = Status::ShutdownInProgress( - "Database shutdown started during compaction"); + "Database shutdown or Column family drop during compaction"); } if (status.ok() && compact->builder != nullptr) { status = FinishCompactionOutputFile(compact, input.get()); @@ -3166,23 +3136,24 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, LogFlush(options_.info_log); mutex_.Lock(); - internal_stats_.AddCompactionStats(compact->compaction->output_level(), - stats); + cfd->internal_stats()->AddCompactionStats(compact->compaction->output_level(), + stats); // if there were any unused file number (mostly in case of // compaction error), free up the entry from pending_putputs ReleaseCompactionUnusedFileNumbers(compact); if (status.ok()) { - status = InstallCompactionResults(compact); - InstallSuperVersion(deletion_state); + status = InstallCompactionResults(compact, log_buffer); + InstallSuperVersion(cfd, deletion_state); } Version::LevelSummaryStorage tmp; - 
Log(options_.info_log, + LogToBuffer( + log_buffer, "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", - versions_->current()->LevelSummary(&tmp), + cfd->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / (double)stats.micros, compact->compaction->output_level(), stats.files_in_leveln, @@ -3199,12 +3170,12 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, namespace { struct IterState { - IterState(DBImpl* db, port::Mutex* mu, DBImpl::SuperVersion* super_version) - : db(db), mu(mu), super_version(super_version) {} + IterState(DBImpl* db, port::Mutex* mu, SuperVersion* super_version) + : db(db), mu(mu), super_version(super_version) {} DBImpl* db; port::Mutex* mu; - DBImpl::SuperVersion* super_version; + SuperVersion* super_version; }; static void CleanupIteratorState(void* arg1, void* arg2) { @@ -3230,12 +3201,8 @@ static void CleanupIteratorState(void* arg1, void* arg2) { } // namespace Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, - SequenceNumber* latest_snapshot) { - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); - SuperVersion* super_version = super_version_->Ref(); - mutex_.Unlock(); - + ColumnFamilyData* cfd, + SuperVersion* super_version) { std::vector iterator_list; // Collect iterator for mutable mem iterator_list.push_back(super_version->mem->NewIterator(options)); @@ -3245,7 +3212,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, super_version->current->AddIterators(options, storage_options_, &iterator_list); Iterator* internal_iter = NewMergingIterator( - env_, &internal_comparator_, &iterator_list[0], iterator_list.size()); + &cfd->internal_comparator(), &iterator_list[0], iterator_list.size()); IterState* cleanup = new IterState(this, &mutex_, super_version); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); @@ 
-3253,39 +3220,54 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, return internal_iter; } -Iterator* DBImpl::TEST_NewInternalIterator() { - SequenceNumber ignored; - ReadOptions read_options; - // Use prefix_seek to make the test function more useful. - read_options.prefix_seek = true; - return NewInternalIterator(read_options, &ignored); +ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { + return default_cf_handle_; +} + +Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } + + mutex_.Lock(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + mutex_.Unlock(); + ReadOptions roptions; + roptions.prefix_seek = true; + return NewInternalIterator(roptions, cfd, super_version); } std::pair DBImpl::GetTailingIteratorPair( - const ReadOptions& options, + const ReadOptions& options, ColumnFamilyData* cfd, uint64_t* superversion_number) { mutex_.Lock(); - SuperVersion* super_version = super_version_->Ref(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); if (superversion_number != nullptr) { - *superversion_number = CurrentVersionNumber(); + *superversion_number = cfd->GetSuperVersionNumber(); } mutex_.Unlock(); Iterator* mutable_iter = super_version->mem->NewIterator(options); // create a DBIter that only uses memtable content; see NewIterator() - mutable_iter = NewDBIterator(&dbname_, env_, options_, user_comparator(), - mutable_iter, kMaxSequenceNumber); + mutable_iter = + NewDBIterator(&dbname_, env_, *cfd->options(), cfd->user_comparator(), + mutable_iter, kMaxSequenceNumber); std::vector list; super_version->imm->AddIterators(options, &list); super_version->current->AddIterators(options, storage_options_, &list); Iterator* immutable_iter = - NewMergingIterator(env_, &internal_comparator_, &list[0], list.size()); 
+ NewMergingIterator(&cfd->internal_comparator(), &list[0], list.size()); // create a DBIter that only uses memtable content; see NewIterator() - immutable_iter = NewDBIterator(&dbname_, env_, options_, user_comparator(), - immutable_iter, kMaxSequenceNumber); + immutable_iter = + NewDBIterator(&dbname_, env_, *cfd->options(), cfd->user_comparator(), + immutable_iter, kMaxSequenceNumber); // register cleanups mutable_iter->RegisterCleanup(CleanupIteratorState, @@ -3298,15 +3280,23 @@ std::pair DBImpl::GetTailingIteratorPair( return std::make_pair(mutable_iter, immutable_iter); } -int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { +int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } MutexLock l(&mutex_); - return versions_->current()->MaxNextLevelOverlappingBytes(); + return cfd->current()->MaxNextLevelOverlappingBytes(); } Status DBImpl::Get(const ReadOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { - return GetImpl(options, key, value); + return GetImpl(options, column_family, key, value); } // DeletionState gets created and destructed outside of the lock -- we @@ -3317,67 +3307,35 @@ Status DBImpl::Get(const ReadOptions& options, // However, if InstallSuperVersion() gets called twice with the same, // deletion_state, we can't reuse the SuperVersion() that got malloced because // first call already used it. In that rare case, we take a hit and create a -// new SuperVersion() inside of the mutex. -void DBImpl::InstallSuperVersion(DeletionState& deletion_state) { +// new SuperVersion() inside of the mutex. 
We do similar thing +// for superversion_to_free +void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd, + DeletionState& deletion_state) { mutex_.AssertHeld(); // if new_superversion == nullptr, it means somebody already used it SuperVersion* new_superversion = (deletion_state.new_superversion != nullptr) ? deletion_state.new_superversion : new SuperVersion(); - SuperVersion* old_superversion = InstallSuperVersion(new_superversion); + SuperVersion* old_superversion = + cfd->InstallSuperVersion(new_superversion, &mutex_); deletion_state.new_superversion = nullptr; deletion_state.superversions_to_free.push_back(old_superversion); // Reset SuperVersions cached in thread local storage if (options_.allow_thread_local) { - ResetThreadLocalSuperVersions(&deletion_state); - } -} - -DBImpl::SuperVersion* DBImpl::InstallSuperVersion( - SuperVersion* new_superversion) { - mutex_.AssertHeld(); - new_superversion->db = this; - new_superversion->Init(mem_, imm_.current(), versions_->current()); - SuperVersion* old_superversion = super_version_; - super_version_ = new_superversion; - ++super_version_number_; - super_version_->version_number = super_version_number_; - - if (old_superversion != nullptr && old_superversion->Unref()) { - old_superversion->Cleanup(); - return old_superversion; // will let caller delete outside of mutex - } - return nullptr; -} - -void DBImpl::ResetThreadLocalSuperVersions(DeletionState* deletion_state) { - mutex_.AssertHeld(); - autovector sv_ptrs; - local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete); - for (auto ptr : sv_ptrs) { - assert(ptr); - if (ptr == SuperVersion::kSVInUse) { - continue; - } - auto sv = static_cast(ptr); - if (static_cast(ptr)->Unref()) { - sv->Cleanup(); - deletion_state->superversions_to_free.push_back(sv); - } + cfd->ResetThreadLocalSuperVersions(); } } Status DBImpl::GetImpl(const ReadOptions& options, - const Slice& key, - std::string* value, - bool* value_found) { - Status s; - + ColumnFamilyHandle* column_family, 
const Slice& key, + std::string* value, bool* value_found) { StopWatch sw(env_, options_.statistics.get(), DB_GET, false); - StopWatchNano snapshot_timer(env_, false); - StartPerfTimer(&snapshot_timer); - SequenceNumber snapshot; + PERF_TIMER_AUTO(get_snapshot_time); + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + SequenceNumber snapshot; if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; } else { @@ -3386,6 +3344,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Acquire SuperVersion SuperVersion* sv = nullptr; + ThreadLocalPtr* thread_local_sv = nullptr; if (LIKELY(options_.allow_thread_local)) { // The SuperVersion is cached in thread local storage to avoid acquiring // mutex when SuperVersion does not change since the last use. When a new @@ -3398,7 +3357,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, // have swapped in kSVObsolete. We re-check the value at the end of // Get, with an atomic compare and swap. The superversion will be released // if detected to be stale. 
- void* ptr = local_sv_->Swap(SuperVersion::kSVInUse); + thread_local_sv = cfd->GetThreadLocalSuperVersion(); + void* ptr = thread_local_sv->Swap(SuperVersion::kSVInUse); // Invariant: // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage @@ -3406,8 +3366,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, assert(ptr != SuperVersion::kSVInUse); sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || - sv->version_number != super_version_number_.load( - std::memory_order_relaxed)) { + sv->version_number != cfd->GetSuperVersionNumber()) { RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; @@ -3421,14 +3380,14 @@ Status DBImpl::GetImpl(const ReadOptions& options, } else { mutex_.Lock(); } - sv = super_version_->Ref(); + sv = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); delete sv_to_delete; } } else { mutex_.Lock(); - sv = super_version_->Ref(); + sv = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); } @@ -3438,32 +3397,31 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Prepare to store a list of merge operations if merge occurs. MergeContext merge_context; + Status s; // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
LookupKey lkey(key, snapshot); - BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); - if (sv->mem->Get(lkey, value, &s, merge_context, options_)) { + PERF_TIMER_STOP(get_snapshot_time); + if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); - } else if (sv->imm->Get(lkey, value, &s, merge_context, options_)) { + } else if (sv->imm->Get(lkey, value, &s, merge_context, *cfd->options())) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); } else { - StopWatchNano from_files_timer(env_, false); - StartPerfTimer(&from_files_timer); + PERF_TIMER_START(get_from_output_files_time); sv->current->Get(options, lkey, value, &s, &merge_context, &stats, - options_, value_found); + *cfd->options(), value_found); have_stat_update = true; - BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); + PERF_TIMER_STOP(get_from_output_files_time); RecordTick(options_.statistics.get(), MEMTABLE_MISS); } - StopWatchNano post_process_timer(env_, false); - StartPerfTimer(&post_process_timer); + PERF_TIMER_START(get_post_process_time); - if (!options_.disable_seek_compaction && have_stat_update) { + if (!cfd->options()->disable_seek_compaction && have_stat_update) { mutex_.Lock(); if (sv->current->UpdateStats(stats)) { MaybeScheduleFlushOrCompaction(); @@ -3475,7 +3433,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, if (LIKELY(options_.allow_thread_local)) { // Put the SuperVersion back void* expected = SuperVersion::kSVInUse; - if (local_sv_->CompareAndSwap(static_cast(sv), expected)) { + if (thread_local_sv->CompareAndSwap(static_cast(sv), expected)) { // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal // storage has not been altered and no Scrape has happend. The // SuperVersion is still current. 
@@ -3500,118 +3458,219 @@ Status DBImpl::GetImpl(const ReadOptions& options, RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_RELEASES); } - // Note, tickers are atomic now - no lock protection needed any more. RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); RecordTick(options_.statistics.get(), BYTES_READ, value->size()); - BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); + PERF_TIMER_STOP(get_post_process_time); return s; } -std::vector DBImpl::MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values) { +std::vector DBImpl::MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false); - StopWatchNano snapshot_timer(env_, false); - StartPerfTimer(&snapshot_timer); + PERF_TIMER_AUTO(get_snapshot_time); SequenceNumber snapshot; + struct MultiGetColumnFamilyData { + ColumnFamilyData* cfd; + SuperVersion* super_version; + Version::GetStats stats; + bool have_stat_update = false; + }; + std::unordered_map multiget_cf_data; + // fill up and allocate outside of mutex + for (auto cf : column_family) { + auto cfh = reinterpret_cast(cf); + auto cfd = cfh->cfd(); + if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { + auto mgcfd = new MultiGetColumnFamilyData(); + mgcfd->cfd = cfd; + multiget_cf_data.insert({cfd->GetID(), mgcfd}); + } + } + mutex_.Lock(); if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; } else { snapshot = versions_->LastSequence(); } - - SuperVersion* get_version = super_version_->Ref(); + for (auto mgd_iter : multiget_cf_data) { + mgd_iter.second->super_version = + mgd_iter.second->cfd->GetSuperVersion()->Ref(); + } mutex_.Unlock(); - bool have_stat_update = false; - Version::GetStats stats; - // Contain a list of merge operations if merge occurs. 
MergeContext merge_context; // Note: this always resizes the values array - int numKeys = keys.size(); - std::vector statList(numKeys); - values->resize(numKeys); + size_t num_keys = keys.size(); + std::vector stat_list(num_keys); + values->resize(num_keys); // Keep track of bytes that we read for statistics-recording later - uint64_t bytesRead = 0; - BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); + uint64_t bytes_read = 0; + PERF_TIMER_STOP(get_snapshot_time); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. - for (int i=0; imem->Get(lkey, value, &s, merge_context, options_)) { + auto cfh = reinterpret_cast(column_family[i]); + auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); + assert(mgd_iter != multiget_cf_data.end()); + auto mgd = mgd_iter->second; + auto super_version = mgd->super_version; + auto cfd = mgd->cfd; + if (super_version->mem->Get(lkey, value, &s, merge_context, + *cfd->options())) { // Done - } else if (get_version->imm->Get(lkey, value, &s, merge_context, - options_)) { + } else if (super_version->imm->Get(lkey, value, &s, merge_context, + *cfd->options())) { // Done } else { - get_version->current->Get(options, lkey, value, &s, &merge_context, - &stats, options_); - have_stat_update = true; + super_version->current->Get(options, lkey, value, &s, &merge_context, + &mgd->stats, *cfd->options()); + mgd->have_stat_update = true; } if (s.ok()) { - bytesRead += value->size(); + bytes_read += value->size(); } } // Post processing (decrement reference counts and record statistics) - StopWatchNano post_process_timer(env_, false); - StartPerfTimer(&post_process_timer); - bool delete_get_version = false; - if (!options_.disable_seek_compaction && have_stat_update) { - mutex_.Lock(); - if 
(get_version->current->UpdateStats(stats)) { - MaybeScheduleFlushOrCompaction(); + PERF_TIMER_START(get_post_process_time); + autovector superversions_to_delete; + + bool schedule_flush_or_compaction = false; + mutex_.Lock(); + for (auto mgd_iter : multiget_cf_data) { + auto mgd = mgd_iter.second; + auto cfd = mgd->cfd; + if (!cfd->options()->disable_seek_compaction && mgd->have_stat_update) { + if (mgd->super_version->current->UpdateStats(mgd->stats)) { + schedule_flush_or_compaction = true; + } } - if (get_version->Unref()) { - get_version->Cleanup(); - delete_get_version = true; - } - mutex_.Unlock(); - } else { - if (get_version->Unref()) { - mutex_.Lock(); - get_version->Cleanup(); - mutex_.Unlock(); - delete_get_version = true; + if (mgd->super_version->Unref()) { + mgd->super_version->Cleanup(); + superversions_to_delete.push_back(mgd->super_version); } } - if (delete_get_version) { - delete get_version; + if (schedule_flush_or_compaction) { + MaybeScheduleFlushOrCompaction(); + } + mutex_.Unlock(); + + for (auto td : superversions_to_delete) { + delete td; + } + for (auto mgd : multiget_cf_data) { + delete mgd.second; } RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS); - RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys); - RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytesRead); - BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, num_keys); + RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytes_read); + PERF_TIMER_STOP(get_post_process_time); - return statList; + return stat_list; +} + +Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + *handle = nullptr; + MutexLock l(&mutex_); + + if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != + nullptr) { + return 
Status::InvalidArgument("Column family already exists"); + } + VersionEdit edit; + edit.AddColumnFamily(column_family_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + edit.SetColumnFamily(new_id); + edit.SetLogNumber(logfile_number_); + edit.SetComparatorName(options.comparator->Name()); + + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + Status s = versions_->LogAndApply(nullptr, &edit, &mutex_, + db_directory_.get(), false, &options); + if (s.ok()) { + auto cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); + *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); + Log(options_.info_log, "Created column family \"%s\" (ID %u)", + column_family_name.c_str(), (unsigned)cfd->GetID()); + } else { + Log(options_.info_log, "Creating column family \"%s\" FAILED -- %s", + column_family_name.c_str(), s.ToString().c_str()); + } + return s; +} + +Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + if (cfd->GetID() == 0) { + return Status::InvalidArgument("Can't drop default column family"); + } + + VersionEdit edit; + edit.DropColumnFamily(); + edit.SetColumnFamily(cfd->GetID()); + + Status s; + { + MutexLock l(&mutex_); + if (cfd->IsDropped()) { + s = Status::InvalidArgument("Column family already dropped!\n"); + } + if (s.ok()) { + s = versions_->LogAndApply(cfd, &edit, &mutex_); + } + } + + if (s.ok()) { + assert(cfd->IsDropped()); + Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID()); + // Flush the memtables. This will make all WAL files referencing dropped + // column family to be obsolete. 
They will be deleted once user deletes + // column family handle + Write(WriteOptions(), nullptr); // ignore error + } else { + Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n", + cfd->GetID(), s.ToString().c_str()); + } + + return s; } bool DBImpl::KeyMayExist(const ReadOptions& options, - const Slice& key, - std::string* value, - bool* value_found) { + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found) { if (value_found != nullptr) { // falsify later if key-may-exist but can't fetch value *value_found = true; } ReadOptions roptions = options; roptions.read_tier = kBlockCacheTier; // read from block cache only - auto s = GetImpl(roptions, key, value, value_found); + auto s = GetImpl(roptions, column_family, key, value, value_found); // If options.block_cache != nullptr and the index block of the table didn't // not present in block_cache, the return value will be Status::Incomplete. @@ -3619,31 +3678,93 @@ bool DBImpl::KeyMayExist(const ReadOptions& options, return s.ok() || s.IsIncomplete(); } -Iterator* DBImpl::NewIterator(const ReadOptions& options) { +Iterator* DBImpl::NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + SequenceNumber latest_snapshot = 0; + SuperVersion* super_version = nullptr; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + if (!options.tailing) { + mutex_.Lock(); + super_version = cfd->GetSuperVersion()->Ref(); + latest_snapshot = versions_->LastSequence(); + mutex_.Unlock(); + } + Iterator* iter; - if (options.tailing) { - iter = new TailingIterator(this, options, user_comparator()); + iter = new TailingIterator(this, options, cfd); } else { - SequenceNumber latest_snapshot; - iter = NewInternalIterator(options, &latest_snapshot); + iter = NewInternalIterator(options, cfd, super_version); - iter = NewDBIterator( - &dbname_, env_, options_, user_comparator(), iter, - (options.snapshot != nullptr - ? 
reinterpret_cast(options.snapshot)->number_ - : latest_snapshot)); + auto snapshot = + options.snapshot != nullptr + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot; + iter = NewDBIterator(&dbname_, env_, *cfd->options(), + cfd->user_comparator(), iter, snapshot); } if (options.prefix) { // use extra wrapper to exclude any keys from the results which // don't begin with the prefix iter = new PrefixFilterIterator(iter, *options.prefix, - options_.prefix_extractor.get()); + cfd->options()->prefix_extractor.get()); } return iter; } +Status DBImpl::NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) { + + if (options.prefix) { + return Status::NotSupported( + "NewIterators doesn't support ReadOptions::prefix"); + } + + iterators->clear(); + iterators->reserve(column_families.size()); + SequenceNumber latest_snapshot = 0; + std::vector super_versions; + super_versions.reserve(column_families.size()); + + if (!options.tailing) { + mutex_.Lock(); + latest_snapshot = versions_->LastSequence(); + for (auto cfh : column_families) { + auto cfd = reinterpret_cast(cfh)->cfd(); + super_versions.push_back(cfd->GetSuperVersion()->Ref()); + } + mutex_.Unlock(); + } + + if (options.tailing) { + for (auto cfh : column_families) { + auto cfd = reinterpret_cast(cfh)->cfd(); + iterators->push_back(new TailingIterator(this, options, cfd)); + } + } else { + for (size_t i = 0; i < column_families.size(); ++i) { + auto cfh = reinterpret_cast(column_families[i]); + auto cfd = cfh->cfd(); + + auto snapshot = + options.snapshot != nullptr + ? 
reinterpret_cast(options.snapshot)->number_ + : latest_snapshot; + + auto iter = NewInternalIterator(options, cfd, super_versions[i]); + iter = NewDBIterator(&dbname_, env_, *cfd->options(), + cfd->user_comparator(), iter, snapshot); + iterators->push_back(iter); + } + } + + return Status::OK(); +} + const Snapshot* DBImpl::GetSnapshot() { MutexLock l(&mutex_); return snapshots_.New(versions_->LastSequence()); @@ -3655,26 +3776,28 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { } // Convenience methods -Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) { - return DB::Put(o, key, val); +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + return DB::Put(o, column_family, key, val); } -Status DBImpl::Merge(const WriteOptions& o, const Slice& key, - const Slice& val) { - if (!options_.merge_operator) { +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + auto cfh = reinterpret_cast(column_family); + if (!cfh->cfd()->options()->merge_operator) { return Status::NotSupported("Provide a merge_operator when opening DB"); } else { - return DB::Merge(o, key, val); + return DB::Merge(o, column_family, key, val); } } -Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { - return DB::Delete(options, key); +Status DBImpl::Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key) { + return DB::Delete(options, column_family, key); } Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { - StopWatchNano pre_post_process_timer(env_, false); - StartPerfTimer(&pre_post_process_timer); + PERF_TIMER_AUTO(write_pre_and_post_process_time); Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; @@ -3700,9 +3823,24 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 
1); } - // May temporarily unlock and wait. - SuperVersion* superversion_to_free = nullptr; - Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free); + Status status; + // refcounting cfd in iteration + bool dead_cfd = false; + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->Ref(); + // May temporarily unlock and wait. + status = MakeRoomForWrite(cfd, my_batch == nullptr); + if (cfd->Unref()) { + dead_cfd = true; + } + if (!status.ok()) { + break; + } + } + if (dead_cfd) { + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + } + uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions @@ -3712,7 +3850,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging // and protects against concurrent loggers and concurrent writes - // into mem_. 
+ // into memtables { mutex_.Unlock(); WriteBatch* updates = nullptr; @@ -3738,12 +3876,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (options.disableWAL) { flush_on_destroy_ = true; } - BumpPerfTime(&perf_context.write_pre_and_post_process_time, - &pre_post_process_timer); + PERF_TIMER_STOP(write_pre_and_post_process_time); if (!options.disableWAL) { - StopWatchNano timer(env_); - StartPerfTimer(&timer); + PERF_TIMER_START(write_wal_time); Slice log_entry = WriteBatchInternal::Contents(updates); status = log_->AddRecord(log_entry); RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1); @@ -3757,25 +3893,27 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->file()->Sync(); } } - BumpPerfTime(&perf_context.write_wal_time, &timer); + PERF_TIMER_STOP(write_wal_time); } if (status.ok()) { - StopWatchNano write_memtable_timer(env_, false); - StartPerfTimer(&write_memtable_timer); - status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this, - options_.filter_deletes); - BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer); + PERF_TIMER_START(write_memtable_time); + status = WriteBatchInternal::InsertInto( + updates, column_family_memtables_.get(), false, 0, this, false); + PERF_TIMER_STOP(write_memtable_time); + if (!status.ok()) { - // Panic for in-memory corruptions + // Iteration failed (either in-memory writebatch corruption (very + // bad), or the client specified invalid column family). Return + // failure. // Note that existing logic was not sound. Any partial failure writing // into the memtable would result in a state that some write ops might // have succeeded in memtable but Status reports error for all writes. 
- throw std::runtime_error("In memory WriteBatch corruption!"); + return status; } SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, last_sequence); } - StartPerfTimer(&pre_post_process_timer); + PERF_TIMER_START(write_pre_and_post_process_time); if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { @@ -3803,9 +3941,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { writers_.front()->cv.Signal(); } mutex_.Unlock(); - delete superversion_to_free; - BumpPerfTime(&perf_context.write_pre_and_post_process_time, - &pre_post_process_timer); + PERF_TIMER_STOP(write_pre_and_post_process_time); return status; } @@ -3891,8 +4027,7 @@ uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(bool force, - SuperVersion** superversion_to_free) { +Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) { mutex_.AssertHeld(); assert(!writers_.empty()); bool allow_delay = !force; @@ -3901,14 +4036,13 @@ Status DBImpl::MakeRoomForWrite(bool force, uint64_t rate_limit_delay_millis = 0; Status s; double score; - *superversion_to_free = nullptr; while (true) { if (!bg_error_.ok()) { // Yield previous error s = bg_error_; break; - } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { + } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each @@ -3916,9 +4050,9 @@ Status DBImpl::MakeRoomForWrite(bool force, // this delay hands over some CPU to the compaction thread in // case it is sharing the same core as the writer. 
uint64_t slowdown = - SlowdownAmount(versions_->current()->NumLevelFiles(0), - options_.level0_slowdown_writes_trigger, - options_.level0_stop_writes_trigger); + SlowdownAmount(cfd->current()->NumLevelFiles(0), + cfd->options()->level0_slowdown_writes_trigger, + cfd->options()->level0_stop_writes_trigger); mutex_.Unlock(); uint64_t delayed; { @@ -3927,17 +4061,19 @@ Status DBImpl::MakeRoomForWrite(bool force, delayed = sw.ElapsedMicros(); } RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); - internal_stats_.RecordWriteStall(InternalStats::LEVEL0_SLOWDOWN, delayed); + cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_SLOWDOWN, + delayed); allow_delay = false; // Do not delay a single write more than once mutex_.Lock(); delayed_writes_++; - } else if (!force && !mem_->ShouldFlush()) { + } else if (!force && !cfd->mem()->ShouldFlush()) { // There is room in current memtable if (allow_delay) { DelayLoggingAndReset(); } break; - } else if (imm_.size() == options_.max_write_buffer_number - 1) { + } else if (cfd->imm()->size() == + cfd->options()->max_write_buffer_number - 1) { // We have filled up the current memtable, but the previous // ones are still being flushed, so we wait. DelayLoggingAndReset(); @@ -3946,16 +4082,16 @@ Status DBImpl::MakeRoomForWrite(bool force, uint64_t stall; { StopWatch sw(env_, options_.statistics.get(), - STALL_MEMTABLE_COMPACTION_COUNT); + STALL_MEMTABLE_COMPACTION_COUNT); bg_cv_.Wait(); stall = sw.ElapsedMicros(); } RecordTick(options_.statistics.get(), STALL_MEMTABLE_COMPACTION_MICROS, stall); - internal_stats_.RecordWriteStall(InternalStats::MEMTABLE_COMPACTION, - stall); - } else if (versions_->current()->NumLevelFiles(0) >= - options_.level0_stop_writes_trigger) { + cfd->internal_stats()->RecordWriteStall( + InternalStats::MEMTABLE_COMPACTION, stall); + } else if (cfd->current()->NumLevelFiles(0) >= + cfd->options()->level0_stop_writes_trigger) { // There are too many level-0 files. 
DelayLoggingAndReset(); Log(options_.info_log, "wait for fewer level0 files...\n"); @@ -3967,12 +4103,14 @@ Status DBImpl::MakeRoomForWrite(bool force, stall = sw.ElapsedMicros(); } RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall); - internal_stats_.RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, stall); - } else if (allow_hard_rate_limit_delay && options_.hard_rate_limit > 1.0 && - (score = versions_->current()->MaxCompactionScore()) > - options_.hard_rate_limit) { + cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, + stall); + } else if (allow_hard_rate_limit_delay && + cfd->options()->hard_rate_limit > 1.0 && + (score = cfd->current()->MaxCompactionScore()) > + cfd->options()->hard_rate_limit) { // Delay a write when the compaction score for any level is too large. - int max_level = versions_->current()->MaxCompactionScoreLevel(); + int max_level = cfd->current()->MaxCompactionScoreLevel(); mutex_.Unlock(); uint64_t delayed; { @@ -3981,32 +4119,31 @@ Status DBImpl::MakeRoomForWrite(bool force, env_->SleepForMicroseconds(1000); delayed = sw.ElapsedMicros(); } - internal_stats_.RecordLevelNSlowdown(max_level, delayed); + cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed); // Make sure the following value doesn't round to zero. 
uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); rate_limit_delay_millis += rate_limit; RecordTick(options_.statistics.get(), RATE_LIMIT_DELAY_MILLIS, rate_limit); - if (options_.rate_limit_delay_max_milliseconds > 0 && + if (cfd->options()->rate_limit_delay_max_milliseconds > 0 && rate_limit_delay_millis >= - (unsigned)options_.rate_limit_delay_max_milliseconds) { + (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) { allow_hard_rate_limit_delay = false; } mutex_.Lock(); - } else if (allow_soft_rate_limit_delay && options_.soft_rate_limit > 0.0 && - (score = versions_->current()->MaxCompactionScore()) > - options_.soft_rate_limit) { + } else if (allow_soft_rate_limit_delay && + cfd->options()->soft_rate_limit > 0.0 && + (score = cfd->current()->MaxCompactionScore()) > + cfd->options()->soft_rate_limit) { // Delay a write when the compaction score for any level is too large. // TODO: add statistics mutex_.Unlock(); { StopWatch sw(env_, options_.statistics.get(), SOFT_RATE_LIMIT_DELAY_COUNT); - env_->SleepForMicroseconds(SlowdownAmount( - score, - options_.soft_rate_limit, - options_.hard_rate_limit) - ); + env_->SleepForMicroseconds( + SlowdownAmount(score, cfd->options()->soft_rate_limit, + cfd->options()->hard_rate_limit)); rate_limit_delay_millis += sw.ElapsedMicros(); } allow_soft_rate_limit_delay = false; @@ -4014,6 +4151,7 @@ Status DBImpl::MakeRoomForWrite(bool force, } else { unique_ptr lfile; + log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; // Attempt to switch to a new memtable and trigger flush of old. @@ -4030,8 +4168,10 @@ Status DBImpl::MakeRoomForWrite(bool force, if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
- lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); - new_mem = new MemTable(internal_comparator_, options_); + lfile->SetPreallocationBlockSize(1.1 * + cfd->options()->write_buffer_size); + new_log = new log::Writer(std::move(lfile)); + new_mem = new MemTable(cfd->internal_comparator(), *cfd->options()); new_superversion = new SuperVersion(); } Log(options_.info_log, @@ -4042,31 +4182,53 @@ Status DBImpl::MakeRoomForWrite(bool force, if (!s.ok()) { // Avoid chewing through file number space in a tight loop. versions_->ReuseFileNumber(new_log_number); - assert (!new_mem); + assert(!new_mem); + assert(!new_log); break; } logfile_number_ = new_log_number; - log_.reset(new log::Writer(std::move(lfile))); - mem_->SetNextLogNumber(logfile_number_); - imm_.Add(mem_); + assert(new_log != nullptr); + // TODO(icanadi) delete outside of mutex + delete log_.release(); + log_.reset(new_log); + cfd->mem()->SetNextLogNumber(logfile_number_); + cfd->imm()->Add(cfd->mem()); if (force) { - imm_.FlushRequested(); + cfd->imm()->FlushRequested(); } - mem_ = new_mem; - mem_->Ref(); - mem_->SetLogNumber(logfile_number_); - force = false; // Do not force another compaction if have room + new_mem->Ref(); + alive_log_files_.push_back(logfile_number_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + // all this is just optimization to delete logs that + // are no longer needed -- if CF is empty, that means it + // doesn't need that particular log to stay alive, so we just + // advance the log number. 
no need to persist this in the manifest + if (cfd->mem()->GetFirstSequenceNumber() == 0 && + cfd->imm()->size() == 0) { + cfd->SetLogNumber(logfile_number_); + } + } + cfd->SetMemtable(new_mem); + Log(options_.info_log, + "[CF %" PRIu32 "] New memtable created with log file: #%lu\n", + cfd->GetID(), (unsigned long)logfile_number_); + force = false; // Do not force another compaction if have room MaybeScheduleFlushOrCompaction(); - *superversion_to_free = InstallSuperVersion(new_superversion); + // TODO(icanadi) delete outside of mutex + delete cfd->InstallSuperVersion(new_superversion, &mutex_); } } return s; } -Status DBImpl::GetPropertiesOfAllTables(TablePropertiesCollection* props) { +Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + // Increment the ref count mutex_.Lock(); - auto version = versions_->current(); + auto version = cfd->current(); version->Ref(); mutex_.Unlock(); @@ -4088,26 +4250,32 @@ Env* DBImpl::GetEnv() const { return env_; } -const Options& DBImpl::GetOptions() const { - return options_; +const Options& DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { + auto cfh = reinterpret_cast(column_family); + return *cfh->cfd()->options(); } -bool DBImpl::GetProperty(const Slice& property, std::string* value) { +bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) { value->clear(); + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); DBPropertyType property_type = GetPropertyType(property); MutexLock l(&mutex_); - return internal_stats_.GetProperty(property_type, property, value, this); + return cfd->internal_stats()->GetProperty(property_type, property, value, + cfd); } -void DBImpl::GetApproximateSizes( - const Range* range, int n, - uint64_t* sizes) { +void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* 
range, int n, uint64_t* sizes) { // TODO(opt): better implementation Version* v; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); { MutexLock l(&mutex_); - versions_->current()->Ref(); - v = versions_->current(); + v = cfd->current(); + v->Ref(); } for (int i = 0; i < n; i++) { @@ -4160,18 +4328,18 @@ Status DBImpl::DeleteFile(std::string name) { int level; FileMetaData* metadata; - int maxlevel = NumberLevels(); + ColumnFamilyData* cfd; VersionEdit edit; DeletionState deletion_state(true); { MutexLock l(&mutex_); - status = versions_->GetMetadataForFile(number, &level, &metadata); + status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); if (!status.ok()) { Log(options_.info_log, "DeleteFile %s failed. File not found\n", name.c_str()); return Status::InvalidArgument("File not found"); } - assert((level > 0) && (level < maxlevel)); + assert((level > 0) && (level < cfd->NumberLevels())); // If the file is being compacted no need to delete. if (metadata->being_compacted) { @@ -4183,17 +4351,17 @@ Status DBImpl::DeleteFile(std::string name) { // Only the files in the last level can be deleted externally. // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. - for (int i = level + 1; i < maxlevel; i++) { - if (versions_->current()->NumLevelFiles(i) != 0) { + for (int i = level + 1; i < cfd->NumberLevels(); i++) { + if (cfd->current()->NumLevelFiles(i) != 0) { Log(options_.info_log, "DeleteFile %s FAILED. 
File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); } } edit.DeleteFile(level, number); - status = versions_->LogAndApply(&edit, &mutex_, db_directory_.get()); + status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); if (status.ok()) { - InstallSuperVersion(deletion_state); + InstallSuperVersion(cfd, deletion_state); } FindObsoleteFiles(deletion_state, false); } // lock released here @@ -4213,7 +4381,7 @@ Status DBImpl::DeleteFile(std::string name) { void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { MutexLock l(&mutex_); - return versions_->GetLiveFilesMetaData(metadata); + versions_->GetLiveFilesMetaData(metadata); } Status DBImpl::CheckConsistency() { @@ -4244,12 +4412,14 @@ Status DBImpl::CheckConsistency() { } void DBImpl::TEST_GetFilesMetaData( + ColumnFamilyHandle* column_family, std::vector>* metadata) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); MutexLock l(&mutex_); metadata->resize(NumberLevels()); for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = - versions_->current()->files_[level]; + const std::vector& files = cfd->current()->files_[level]; (*metadata)[level].clear(); for (const auto& f : files) { @@ -4287,39 +4457,76 @@ Status DBImpl::GetDbIdentity(std::string& identity) { // Default implementations of convenience methods that subclasses of DB // can call if they wish -Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { +Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { // Pre-allocate size of write batch conservatively. // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, // and we allocate 11 extra bytes for key length, as well as value length. 
WriteBatch batch(key.size() + value.size() + 24); - batch.Put(key, value); + batch.Put(column_family, key, value); return Write(opt, &batch); } -Status DB::Delete(const WriteOptions& opt, const Slice& key) { +Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key) { WriteBatch batch; - batch.Delete(key); + batch.Delete(column_family, key); return Write(opt, &batch); } -Status DB::Merge(const WriteOptions& opt, const Slice& key, - const Slice& value) { +Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { WriteBatch batch; - batch.Merge(key, value); + batch.Merge(column_family, key, value); return Write(opt, &batch); } +// Default implementation -- returns not supported status +Status DB::CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + return Status::NotSupported(""); +} +Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) { + return Status::NotSupported(""); +} + DB::~DB() { } Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { - *dbptr = nullptr; + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(default_column_family_name, cf_options)); + std::vector handles; + Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; + } + return s; +} - if (options.block_cache != nullptr && options.no_block_cache) { - return Status::InvalidArgument( - "no_block_cache is true while block_cache is not nullptr"); +Status DB::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + 
*dbptr = nullptr; + handles->clear(); + + size_t max_write_buffer_size = 0; + for (auto cf : column_families) { + max_write_buffer_size = + std::max(max_write_buffer_size, cf.options.write_buffer_size); + if (cf.options.block_cache != nullptr && cf.options.no_block_cache) { + return Status::InvalidArgument( + "no_block_cache is true while block_cache is not nullptr"); + } } - DBImpl* impl = new DBImpl(options, dbname); + DBImpl* impl = new DBImpl(db_options, dbname); Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); if (!s.ok()) { delete impl; @@ -4332,26 +4539,37 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { return s; } impl->mutex_.Lock(); - s = impl->Recover(); // Handles create_if_missing, error_if_exists + // Handles create_if_missing, error_if_exists + s = impl->Recover(column_families); if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; - EnvOptions soptions(options); + EnvOptions soptions(db_options); s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, impl->options_.env->OptimizeForLogWrite(soptions)); if (s.ok()) { - lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); - VersionEdit edit; - edit.SetLogNumber(new_log_number); + lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); - s = impl->versions_->LogAndApply(&edit, &impl->mutex_, - impl->db_directory_.get()); + + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd == nullptr) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back( + new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } } if (s.ok()) { - delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); - 
impl->mem_->SetLogNumber(impl->logfile_number_); + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); + impl->alive_log_files_.push_back(impl->logfile_number_); + } impl->DeleteObsoleteFiles(); impl->MaybeScheduleFlushOrCompaction(); impl->MaybeScheduleLogDBDeployStats(); @@ -4359,13 +4577,20 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { } } - if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) { - Version* current = impl->versions_->current(); - for (int i = 1; i < impl->NumberLevels(); i++) { - int num_files = current->NumLevelFiles(i); - if (num_files > 0) { - s = Status::InvalidArgument("Not all files are at level 0. Cannot " - "open with universal compaction style."); + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + if (cfd->options()->compaction_style == kCompactionStyleUniversal) { + Version* current = cfd->current(); + for (int i = 1; i < current->NumberLevels(); ++i) { + int num_files = current->NumLevelFiles(i); + if (num_files > 0) { + s = Status::InvalidArgument("Not all files are at level 0. 
Cannot " + "open with universal compaction style."); + break; + } + } + } + if (!s.ok()) { break; } } @@ -4377,11 +4602,21 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { impl->opened_successfully_ = true; *dbptr = impl; } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); delete impl; } return s; } +Status DB::ListColumnFamilies(const DBOptions& db_options, + const std::string& name, + std::vector* column_families) { + return VersionSet::ListColumnFamilies(column_families, name, db_options.env); +} + Snapshot::~Snapshot() { } diff --git a/db/db_impl.h b/db/db_impl.h index 4cfb6ecaf..e16bf3bb4 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -13,10 +13,12 @@ #include #include #include +#include #include "db/dbformat.h" #include "db/log_writer.h" #include "db/snapshot.h" +#include "db/column_family.h" #include "db/version_edit.h" #include "memtable_list.h" #include "port/port.h" @@ -40,44 +42,79 @@ class CompactionFilterV2; class DBImpl : public DB { public: - DBImpl(const Options& options, const std::string& dbname); + DBImpl(const DBOptions& options, const std::string& dbname); virtual ~DBImpl(); // Implementations of the DB interface - virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); - virtual Status Merge(const WriteOptions&, const Slice& key, + using DB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + using DB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, const Slice& value); - virtual Status Delete(const WriteOptions&, const Slice& key); + using DB::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key); + using DB::Write; virtual Status Write(const WriteOptions& options, WriteBatch* updates); + using DB::Get; virtual Status Get(const ReadOptions& options, - const 
Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, std::string* value); - virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values); + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values); + + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family, + ColumnFamilyHandle** handle); + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family); // Returns false if key doesn't exist in the database and true if it may. // If value_found is not passed in as null, then return the value if found in // memory. On return, if value was found, then value_found will be set to true // , otherwise false. + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, - const Slice& key, - std::string* value, - bool* value_found = nullptr); - virtual Iterator* NewIterator(const ReadOptions&); + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr); + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family); + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators); virtual const Snapshot* GetSnapshot(); virtual void ReleaseSnapshot(const Snapshot* snapshot); - virtual bool GetProperty(const Slice& property, std::string* value); - virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); - virtual Status CompactRange(const Slice* begin, const Slice* end, + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value); + using DB::GetApproximateSizes; + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes); + using 
DB::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, bool reduce_level = false, int target_level = -1); - virtual int NumberLevels(); - virtual int MaxMemCompactionLevel(); - virtual int Level0StopWriteTrigger(); + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family); + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family); + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family); virtual const std::string& GetName() const; virtual Env* GetEnv() const; - virtual const Options& GetOptions() const; - virtual Status Flush(const FlushOptions& options); + using DB::GetOptions; + virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const; + using DB::Flush; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family); virtual Status DisableFileDeletions(); virtual Status EnableFileDeletions(bool force); // All the returned filenames start with "/" @@ -92,8 +129,7 @@ class DBImpl : public DB { read_options = TransactionLogIterator::ReadOptions()); virtual Status DeleteFile(std::string name); - virtual void GetLiveFilesMetaData( - std::vector *metadata); + virtual void GetLiveFilesMetaData(std::vector* metadata); // checks if all live files exist on file system and that their file sizes // match to our in-memory records @@ -101,23 +137,21 @@ class DBImpl : public DB { virtual Status GetDbIdentity(std::string& identity); - Status RunManualCompaction(int input_level, - int output_level, - const Slice* begin, + Status RunManualCompaction(ColumnFamilyData* cfd, int input_level, + int output_level, const Slice* begin, const Slice* end); // Extra methods (for testing) that are not in the public DB interface // Compact any files in the named level that overlap [*begin, *end] - Status TEST_CompactRange(int level, - const Slice* begin, 
- const Slice* end); + Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, + ColumnFamilyHandle* column_family = nullptr); // Force current memtable contents to be flushed. Status TEST_FlushMemTable(bool wait = true); // Wait for memtable compaction - Status TEST_WaitForFlushMemTable(); + Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); // Wait for any compaction Status TEST_WaitForCompact(); @@ -125,14 +159,13 @@ class DBImpl : public DB { // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(); + Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family = + nullptr); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes(); - - // Simulate a db crash, no elegant closing of database. - void TEST_Destroy_DBImpl(); + int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family = + nullptr); // Return the current manifest file no. uint64_t TEST_Current_Manifest_FileNo(); @@ -148,61 +181,8 @@ class DBImpl : public DB { default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; } - void TEST_GetFilesMetaData(std::vector>* metadata); - - // holds references to memtable, all immutable memtables and version - struct SuperVersion { - MemTable* mem; - MemTableListVersion* imm; - Version* current; - std::atomic refs; - // We need to_delete because during Cleanup(), imm->Unref() returns - // all memtables that we need to free through this vector. 
We then - // delete all those memtables outside of mutex, during destruction - autovector to_delete; - // Version number of the current SuperVersion - uint64_t version_number; - DBImpl* db; - - // should be called outside the mutex - SuperVersion() = default; - ~SuperVersion(); - SuperVersion* Ref(); - // Returns true if this was the last reference and caller should - // call Clenaup() and delete the object - bool Unref(); - - // call these two methods with db mutex held - // Cleanup unrefs mem, imm and current. Also, it stores all memtables - // that needs to be deleted in to_delete vector. Unrefing those - // objects needs to be done in the mutex - void Cleanup(); - void Init(MemTable* new_mem, MemTableListVersion* new_imm, - Version* new_current); - - // The value of dummy is not actually used. kSVInUse takes its address as a - // mark in the thread local storage to indicate the SuperVersion is in use - // by thread. This way, the value of kSVInUse is guaranteed to have no - // conflict with SuperVersion object address and portable on different - // platform. - static int dummy; - static void* const kSVInUse; - static void* const kSVObsolete; - }; - - static void SuperVersionUnrefHandle(void* ptr) { - // UnrefHandle is called when a thread exists or a ThreadLocalPtr gets - // destroyed. When former happens, the thread shouldn't see kSVInUse. - // When latter happens, we are in ~DBImpl(), no get should happen as well. 
- assert(ptr != SuperVersion::kSVInUse); - DBImpl::SuperVersion* sv = static_cast(ptr); - if (sv->Unref()) { - sv->db->mutex_.Lock(); - sv->Cleanup(); - sv->db->mutex_.Unlock(); - delete sv; - } - } + void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, + std::vector>* metadata); // needed for CleanupIteratorState struct DeletionState { @@ -231,7 +211,7 @@ class DBImpl : public DB { autovector superversions_to_free; - SuperVersion* new_superversion; // if nullptr no new superversion + SuperVersion* new_superversion; // if nullptr no new superversion // the current manifest_file_number, log_number and prev_log_number // that corresponds to the set of files in 'live'. @@ -243,8 +223,7 @@ class DBImpl : public DB { pending_manifest_file_number = 0; log_number = 0; prev_log_number = 0; - new_superversion = - create_superversion ? new SuperVersion() : nullptr; + new_superversion = create_superversion ? new SuperVersion() : nullptr; } ~DeletionState() { @@ -277,23 +256,16 @@ class DBImpl : public DB { // It is not necessary to hold the mutex when invoking this method. void PurgeObsoleteFiles(DeletionState& deletion_state); + ColumnFamilyHandle* DefaultColumnFamily() const; + protected: Env* const env_; const std::string dbname_; unique_ptr versions_; - const InternalKeyComparator internal_comparator_; - const Options options_; // options_.comparator == &internal_comparator_ + const DBOptions options_; - const Comparator* user_comparator() const { - return internal_comparator_.user_comparator(); - } - - SuperVersion* GetSuperVersion() { - return super_version_; - } - - Iterator* NewInternalIterator(const ReadOptions&, - SequenceNumber* latest_snapshot); + Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, + SuperVersion* super_version); private: friend class DB; @@ -306,8 +278,10 @@ class DBImpl : public DB { Status NewDB(); // Recover the descriptor from persistent storage. 
May do a significant - // amount of work to recover recently logged updates. - Status Recover(bool read_only = false, bool error_if_log_file_exist = false); + // amount of work to recover recently logged updates. Any changes to + // be made to the descriptor are added to *edit. + Status Recover(const std::vector& column_families, + bool read_only = false, bool error_if_log_file_exist = false); void MaybeIgnoreError(Status* s) const; @@ -318,7 +292,7 @@ class DBImpl : public DB { // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. - Status FlushMemTableToOutputFile(bool* madeProgress, + Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer); @@ -330,25 +304,26 @@ class DBImpl : public DB { // database is opened) and is heavyweight because it holds the mutex // for the entire period. The second method WriteLevel0Table supports // concurrent flush memtables to storage. - Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit); - Status WriteLevel0Table(autovector& mems, VersionEdit* edit, - uint64_t* filenumber, + Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, + VersionEdit* edit); + Status WriteLevel0Table(ColumnFamilyData* cfd, autovector& mems, + VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer); uint64_t SlowdownAmount(int n, double bottom, double top); - // MakeRoomForWrite will return superversion_to_free through an arugment, - // which the caller needs to delete. We do it because caller can delete - // the superversion outside of mutex - Status MakeRoomForWrite(bool force /* compact even if there is room? */, - SuperVersion** superversion_to_free); + + // TODO(icanadi) free superversion_to_free and old_log outside of mutex + Status MakeRoomForWrite(ColumnFamilyData* cfd, + bool force /* flush even if there is room? 
*/); + void BuildBatchGroup(Writer** last_writer, autovector* write_batch_group); // Force current memtable contents to be flushed. - Status FlushMemTable(const FlushOptions& options); + Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options); // Wait for memtable flushed - Status WaitForFlushMemTable(); + Status WaitForFlushMemTable(ColumnFamilyData* cfd); void MaybeScheduleLogDBDeployStats(); static void BGLogDBDeployStats(void* db); @@ -368,6 +343,13 @@ class DBImpl : public DB { DeletionState& deletion_state, LogBuffer* log_buffer); + // This function is called as part of compaction. It enables Flush process to + // preempt compaction, since it's higher prioirty + // Returns: micros spent executing + uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd, + DeletionState& deletion_state, + LogBuffer* log_buffer); + // Call compaction filter if is_compaction_v2 is not true. Then iterate // through input and compact the kv-pairs Status ProcessKeyValueCompaction( @@ -388,15 +370,16 @@ class DBImpl : public DB { Status OpenCompactionOutputFile(CompactionState* compact); Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); - Status InstallCompactionResults(CompactionState* compact); + Status InstallCompactionResults(CompactionState* compact, + LogBuffer* log_buffer); void AllocateCompactionOutputFileNumbers(CompactionState* compact); void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); void PurgeObsoleteWALFiles(); - Status AppendSortedWalsOfType(const std::string& path, - VectorLogPtr& log_files, - WalFileType type); + Status GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType type); // Requires: all_logs should be sorted with earliest log file first // Retains all log files in all_logs which contain updates with seq no. @@ -419,30 +402,23 @@ class DBImpl : public DB { // Return the minimum empty level that could hold the total data in the // input level. 
Return the input level, if such level could not be found. - int FindMinimumEmptyLevelFitting(int level); + int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level); // Move the files in the input level to the target level. // If target_level < 0, automatically calculate the minimum level that could // hold the data set. - Status ReFitLevel(int level, int target_level = -1); - - // Returns the current SuperVersion number. - uint64_t CurrentVersionNumber() const; + Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1); // Returns a pair of iterators (mutable-only and immutable-only) used - // internally by TailingIterator and stores CurrentVersionNumber() in + // internally by TailingIterator and stores cfd->GetSuperVersionNumber() in // *superversion_number. These iterators are always up-to-date, i.e. can // be used to read new data. std::pair GetTailingIteratorPair( - const ReadOptions& options, - uint64_t* superversion_number); - - // Constant after construction - const InternalFilterPolicy internal_filter_policy_; - bool owns_info_log_; + const ReadOptions& options, ColumnFamilyData* cfd, + uint64_t* superversion_number); // table_cache_ provides its own synchronization - unique_ptr table_cache_; + std::shared_ptr table_cache_; // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; @@ -451,20 +427,11 @@ class DBImpl : public DB { port::Mutex mutex_; port::AtomicPointer shutting_down_; port::CondVar bg_cv_; // Signalled when background work finishes - MemTable* mem_; - MemTableList imm_; // Memtable that are not changing uint64_t logfile_number_; unique_ptr log_; - - SuperVersion* super_version_; - - // An ordinal representing the current SuperVersion. Updated by - // InstallSuperVersion(), i.e. incremented every time super_version_ - // changes. 
- std::atomic super_version_number_; - // Thread's local copy of SuperVersion pointer - // This needs to be destructed after mutex_ - ThreadLocalPtr* local_sv_; + ColumnFamilyHandleImpl* default_cf_handle_; + unique_ptr column_family_memtables_; + std::deque alive_log_files_; std::string host_name_; @@ -500,6 +467,7 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { + ColumnFamilyData* cfd; int input_level; int output_level; bool done; @@ -541,8 +509,6 @@ class DBImpl : public DB { bool flush_on_destroy_; // Used when disableWAL is true. - InternalStats internal_stats_; - static const int KEEP_LOG_FILE_NUM = 1000; std::string db_absolute_path_; @@ -575,28 +541,21 @@ class DBImpl : public DB { std::vector& snapshots, SequenceNumber* prev_snapshot); - // will return a pointer to SuperVersion* if previous SuperVersion - // if its reference count is zero and needs deletion or nullptr if not - // As argument takes a pointer to allocated SuperVersion - // Foreground threads call this function directly (they don't carry - // deletion state and have to handle their own creation and deletion - // of SuperVersion) - SuperVersion* InstallSuperVersion(SuperVersion* new_superversion); // Background threads call this function, which is just a wrapper around - // the InstallSuperVersion() function above. Background threads carry + // the cfd->InstallSuperVersion() function. Background threads carry // deletion_state which can have new_superversion already allocated. 
- void InstallSuperVersion(DeletionState& deletion_state); + void InstallSuperVersion(ColumnFamilyData* cfd, + DeletionState& deletion_state); - void ResetThreadLocalSuperVersions(DeletionState* deletion_state); - - virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override; // Function that Get and KeyMayExist call with no_io true or false // Note: 'value_found' from KeyMayExist propagates here - Status GetImpl(const ReadOptions& options, - const Slice& key, - std::string* value, + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, std::string* value, bool* value_found = nullptr); }; @@ -606,7 +565,7 @@ extern Options SanitizeOptions(const std::string& db, const InternalKeyComparator* icmp, const InternalFilterPolicy* ipolicy, const Options& src); - +extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); // Determine compression type, based on user options, level of the output // file and whether compression is disabled. 
diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index d130783ea..6d519a07c 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -42,8 +42,8 @@ namespace rocksdb { -DBImplReadOnly::DBImplReadOnly(const Options& options, - const std::string& dbname) +DBImplReadOnly::DBImplReadOnly(const DBOptions& options, + const std::string& dbname) : DBImpl(options, dbname) { Log(options_.info_log, "Opening the db in read only mode"); } @@ -53,42 +53,57 @@ DBImplReadOnly::~DBImplReadOnly() { // Implementations of the DB interface Status DBImplReadOnly::Get(const ReadOptions& options, - const Slice& key, - std::string* value) { + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { Status s; SequenceNumber snapshot = versions_->LastSequence(); - SuperVersion* super_version = GetSuperVersion(); + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; LookupKey lkey(key, snapshot); - if (super_version->mem->Get(lkey, value, &s, merge_context, options_)) { + if (super_version->mem->Get(lkey, value, &s, merge_context, + *cfd->options())) { } else { Version::GetStats stats; super_version->current->Get(options, lkey, value, &s, &merge_context, - &stats, options_); + &stats, *cfd->options()); } return s; } -Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) { - SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + SequenceNumber latest_snapshot = versions_->LastSequence(); + Iterator* internal_iter = NewInternalIterator(options, cfd, super_version); return NewDBIterator( - &dbname_, env_, options_, user_comparator(),internal_iter, + 
&dbname_, env_, *cfd->options(), cfd->user_comparator(), internal_iter, (options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot)); + ? reinterpret_cast(options.snapshot)->number_ + : latest_snapshot)); } - Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, - DB** dbptr, bool error_if_log_file_exist) { + DB** dbptr, bool error_if_log_file_exist) { *dbptr = nullptr; - DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(default_column_family_name, cf_options)); + + DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); impl->mutex_.Lock(); - Status s = impl->Recover(true /* read only */, error_if_log_file_exist); + Status s = impl->Recover(column_families, true /* read only */, + error_if_log_file_exist); if (s.ok()) { - delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); + } } impl->mutex_.Unlock(); if (s.ok()) { diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 57eae0e26..c4703ba69 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -12,6 +12,8 @@ #include #include +#include +#include #include "db/dbformat.h" #include "db/log_writer.h" #include "db/snapshot.h" @@ -23,57 +25,79 @@ namespace rocksdb { class DBImplReadOnly : public DBImpl { -public: - DBImplReadOnly(const Options& options, const std::string& dbname); - virtual ~DBImplReadOnly(); + public: + DBImplReadOnly(const DBOptions& options, const std::string& dbname); + virtual ~DBImplReadOnly(); - // Implementations of the DB interface - virtual Status Get(const ReadOptions& options, - const Slice& key, - std::string* value); + // Implementations of the DB interface + using DB::Get; + virtual Status 
Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value); - // TODO: Implement ReadOnly MultiGet? + // TODO: Implement ReadOnly MultiGet? - virtual Iterator* NewIterator(const ReadOptions&); + using DBImpl::NewIterator; + virtual Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family); - virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) { - return Status::NotSupported("Not supported operation in read only mode."); - } - virtual Status Merge(const WriteOptions&, const Slice& key, - const Slice& value) { - return Status::NotSupported("Not supported operation in read only mode."); - } - virtual Status Delete(const WriteOptions&, const Slice& key) { - return Status::NotSupported("Not supported operation in read only mode."); - } - virtual Status Write(const WriteOptions& options, WriteBatch* updates) { - return Status::NotSupported("Not supported operation in read only mode."); - } - virtual Status CompactRange(const Slice* begin, const Slice* end, - bool reduce_level = false, int target_level = -1) { - return Status::NotSupported("Not supported operation in read only mode."); - } - virtual Status DisableFileDeletions() { - return Status::NotSupported("Not supported operation in read only mode."); - } - virtual Status EnableFileDeletions(bool force) { - return Status::NotSupported("Not supported operation in read only mode."); - } - virtual Status GetLiveFiles(std::vector&, - uint64_t* manifest_file_size, - bool flush_memtable = true) { - return Status::NotSupported("Not supported operation in read only mode."); - } - virtual Status Flush(const FlushOptions& options) { - return Status::NotSupported("Not supported operation in read only mode."); - } + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_family, + std::vector* iterators) { + // TODO + return Status::NotSupported("Not supported yet."); + } -private: - friend class DB; + 
using DBImpl::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& options, WriteBatch* updates) { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool reduce_level = false, + int target_level = -1) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status DisableFileDeletions() { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status EnableFileDeletions(bool force) { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) { + return Status::NotSupported("Not supported operation in read only mode."); + } - // No copying allowed - DBImplReadOnly(const DBImplReadOnly&); - void operator=(const DBImplReadOnly&); + private: + friend class DB; + + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&); + void operator=(const DBImplReadOnly&); }; - } 
diff --git a/db/db_iter.cc b/db/db_iter.cc index 5329e5297..47c07bfd9 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -39,71 +39,6 @@ static void DumpInternalIter(Iterator* iter) { namespace { -class IterLookupKey { - public: - IterLookupKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {} - - ~IterLookupKey() { Clear(); } - - Slice GetKey() const { - if (key_ != nullptr) { - return Slice(key_, key_size_); - } else { - return Slice(); - } - } - - bool Valid() const { return key_ != nullptr; } - - void Clear() { - if (key_ != nullptr && key_ != space_) { - delete[] key_; - } - key_ = space_; - buf_size_ = sizeof(buf_size_); - } - - // Enlarge the buffer size if needed based on key_size. - // By default, static allocated buffer is used. Once there is a key - // larger than the static allocated buffer, another buffer is dynamically - // allocated, until a larger key buffer is requested. In that case, we - // reallocate buffer and delete the old one. - void EnlargeBufferIfNeeded(size_t key_size) { - // If size is smaller than buffer size, continue using current buffer, - // or the static allocated one, as default - if (key_size > buf_size_) { - // Need to enlarge the buffer. 
- Clear(); - key_ = new char[key_size]; - buf_size_ = key_size; - } - key_size_ = key_size; - } - - void SetUserKey(const Slice& user_key) { - size_t size = user_key.size(); - EnlargeBufferIfNeeded(size); - memcpy(key_, user_key.data(), size); - } - - void SetInternalKey(const Slice& user_key, SequenceNumber s) { - size_t usize = user_key.size(); - EnlargeBufferIfNeeded(usize + sizeof(uint64_t)); - memcpy(key_, user_key.data(), usize); - EncodeFixed64(key_ + usize, PackSequenceAndType(s, kValueTypeForSeek)); - } - - private: - char* key_; - size_t buf_size_; - size_t key_size_; - char space_[32]; // Avoid allocation for short keys - - // No copying allowed - IterLookupKey(const IterLookupKey&) = delete; - void operator=(const LookupKey&) = delete; -}; - // Memtables and sstables that make the DB representation contain // (userkey,seq,type) => uservalue entries. DBIter // combines multiple entries for the same userkey found in the DB @@ -191,7 +126,7 @@ class DBIter: public Iterator { SequenceNumber const sequence_; Status status_; - IterLookupKey saved_key_; // == current key when direction_==kReverse + IterKey saved_key_; // == current key when direction_==kReverse std::string saved_value_; // == current raw value when direction_==kReverse std::string skip_key_; Direction direction_; @@ -254,10 +189,9 @@ void DBIter::Next() { // NOTE: In between, saved_key_ can point to a user key that has // a delete marker inline void DBIter::FindNextUserEntry(bool skipping) { - StopWatchNano timer(env_, false); - StartPerfTimer(&timer); + PERF_TIMER_AUTO(find_next_user_entry_time); FindNextUserEntryInternal(skipping); - BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); + PERF_TIMER_STOP(find_next_user_entry_time); } // Actual implementation of DBIter::FindNextUserEntry() @@ -273,7 +207,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { if (skipping && user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { num_skipped++; // skip this entry - 
BumpPerfCount(&perf_context.internal_key_skipped_count); + PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { skipping = false; switch (ikey.type) { @@ -283,7 +217,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { saved_key_.SetUserKey(ikey.user_key); skipping = true; num_skipped = 0; - BumpPerfCount(&perf_context.internal_delete_skipped_count); + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeValue: valid_ = true; @@ -488,10 +422,9 @@ void DBIter::Seek(const Slice& target) { saved_key_.Clear(); // now savved_key is used to store internal key. saved_key_.SetInternalKey(target, sequence_); - StopWatchNano internal_seek_timer(env_, false); - StartPerfTimer(&internal_seek_timer); + PERF_TIMER_AUTO(seek_internal_seek_time); iter_->Seek(saved_key_.GetKey()); - BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); + PERF_TIMER_STOP(seek_internal_seek_time); if (iter_->Valid()) { direction_ = kForward; ClearSavedValue(); @@ -504,10 +437,9 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { direction_ = kForward; ClearSavedValue(); - StopWatchNano internal_seek_timer(env_, false); - StartPerfTimer(&internal_seek_timer); + PERF_TIMER_AUTO(seek_internal_seek_time); iter_->SeekToFirst(); - BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); + PERF_TIMER_STOP(seek_internal_seek_time); if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -526,10 +458,9 @@ void DBIter::SeekToLast() { direction_ = kReverse; ClearSavedValue(); - StopWatchNano internal_seek_timer(env_, false); - StartPerfTimer(&internal_seek_timer); + PERF_TIMER_AUTO(seek_internal_seek_time); iter_->SeekToLast(); - BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); + PERF_TIMER_STOP(seek_internal_seek_time); FindPrevUserEntry(); } diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index db86865ca..46918d4e7 100644 --- a/db/db_stats_logger.cc +++ 
b/db/db_stats_logger.cc @@ -65,7 +65,7 @@ void DBImpl::LogDBDeployStats() { uint64_t file_total_size = 0; uint32_t file_total_num = 0; - Version* current = versions_->current(); + Version* current = default_cf_handle_->cfd()->current(); for (int i = 0; i < current->NumberLevels(); i++) { file_total_num += current->NumLevelFiles(i); file_total_size += current->NumLevelBytes(i); diff --git a/db/db_test.cc b/db/db_test.cc index 0695b5cc7..0c728184a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -37,6 +37,7 @@ #include "util/mutexlock.h" #include "util/statistics.h" #include "util/testharness.h" +#include "util/sync_point.h" #include "util/testutil.h" namespace rocksdb { @@ -291,6 +292,7 @@ class DBTest { std::string dbname_; SpecialEnv* env_; DB* db_; + std::vector handles_; Options last_options_; @@ -306,7 +308,6 @@ class DBTest { DBTest() : option_config_(kDefault), env_(new SpecialEnv(Env::Default())) { - last_options_.max_background_flushes = 0; filter_policy_ = NewBloomFilterPolicy(10); dbname_ = test::TmpDir() + "/db_test"; ASSERT_OK(DestroyDB(dbname_, Options())); @@ -315,7 +316,7 @@ class DBTest { } ~DBTest() { - delete db_; + Close(); ASSERT_OK(DestroyDB(dbname_, Options())); delete env_; delete filter_policy_; @@ -372,8 +373,6 @@ class DBTest { // Return the current option configuration. 
Options CurrentOptions() { Options options; - options.paranoid_checks = false; - options.max_background_flushes = 0; switch (option_config_) { case kHashSkipList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -450,11 +449,70 @@ class DBTest { return reinterpret_cast(db_); } + void CreateColumnFamilies(const std::vector& cfs, + const ColumnFamilyOptions* options = nullptr) { + ColumnFamilyOptions cf_opts; + if (options != nullptr) { + cf_opts = ColumnFamilyOptions(*options); + } else { + cf_opts = ColumnFamilyOptions(CurrentOptions()); + } + int cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + for (auto cf : cfs) { + ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + } + } + + void CreateAndReopenWithCF(const std::vector& cfs, + const Options* options = nullptr) { + CreateColumnFamilies(cfs, options); + std::vector cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), + default_column_family_name); + ReopenWithColumnFamilies(cfs_plus_default, options); + } + + void ReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + void ReopenWithColumnFamilies(const std::vector& cfs, + const Options* options = nullptr) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + Status TryReopenWithColumnFamilies( + const std::vector& cfs, + const std::vector& options) { + Close(); + ASSERT_EQ(cfs.size(), options.size()); + std::vector column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i])); + } + DBOptions db_opts = DBOptions(*options[0]); + return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + } + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const Options* options = nullptr) { + Close(); + Options opts = (options == nullptr) ? 
CurrentOptions() : *options; + std::vector v_opts(cfs.size(), &opts); + return TryReopenWithColumnFamilies(cfs, v_opts); + } + void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); } void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); delete db_; db_ = nullptr; } @@ -466,22 +524,16 @@ class DBTest { } void Destroy(Options* options) { - delete db_; - db_ = nullptr; + Close(); ASSERT_OK(DestroyDB(dbname_, *options)); } - Status PureReopen(Options* options, DB** db) { - return DB::Open(*options, dbname_, db); - } - Status ReadOnlyReopen(Options* options) { return DB::OpenForReadOnly(*options, dbname_, &db_); } Status TryReopen(Options* options = nullptr) { - delete db_; - db_ = nullptr; + Close(); Options opts; if (options != nullptr) { opts = *options; @@ -494,6 +546,14 @@ class DBTest { return DB::Open(opts, dbname_, &db_); } + Status Flush(int cf = 0) { + if (cf == 0) { + return db_->Flush(FlushOptions()); + } else { + return db_->Flush(FlushOptions(), handles_[cf]); + } + } + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) { if (kMergePut == option_config_ ) { return db_->Merge(wo, k, v); @@ -502,10 +562,23 @@ class DBTest { } } + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()) { + if (kMergePut == option_config_) { + return db_->Merge(wo, handles_[cf], k, v); + } else { + return db_->Put(wo, handles_[cf], k, v); + } + } + Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); } + Status Delete(int cf, const std::string& k) { + return db_->Delete(WriteOptions(), handles_[cf], k); + } + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { ReadOptions options; options.verify_checksums = true; @@ -520,12 +593,28 @@ class DBTest { return result; } + std::string Get(int cf, const std::string& k, + const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = 
snapshot; + std::string result; + Status s = db_->Get(options, handles_[cf], k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + // Return a string that contains all key,value pairs in order, // formatted like "(k1->v1)(k2->v2)". - std::string Contents() { + std::string Contents(int cf = 0) { std::vector forward; std::string result; - Iterator* iter = db_->NewIterator(ReadOptions()); + Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions()) + : db_->NewIterator(ReadOptions(), handles_[cf]); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { std::string s = IterStatus(iter); result.push_back('('); @@ -547,8 +636,13 @@ class DBTest { return result; } - std::string AllEntriesFor(const Slice& user_key) { - Iterator* iter = dbfull()->TEST_NewInternalIterator(); + std::string AllEntriesFor(const Slice& user_key, int cf = 0) { + Iterator* iter; + if (cf == 0) { + iter = dbfull()->TEST_NewInternalIterator(); + } else { + iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); std::string result; @@ -596,28 +690,39 @@ class DBTest { return result; } - int NumTableFilesAtLevel(int level) { + int NumTableFilesAtLevel(int level, int cf = 0) { std::string property; - ASSERT_TRUE( - db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), - &property)); + if (cf == 0) { + // default cfd + ASSERT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(level), &property)); + } else { + ASSERT_TRUE(db_->GetProperty( + handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), + &property)); + } return atoi(property.c_str()); } - int TotalTableFiles() { + int TotalTableFiles(int cf = 0, int levels = -1) { + if (levels == -1) { + levels = CurrentOptions().num_levels; + } int result = 0; - for (int level = 0; level < db_->NumberLevels(); level++) { - result 
+= NumTableFilesAtLevel(level); + for (int level = 0; level < levels; level++) { + result += NumTableFilesAtLevel(level, cf); } return result; } // Return spread of files per level - std::string FilesPerLevel() { + std::string FilesPerLevel(int cf = 0) { + int num_levels = + (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]); std::string result; int last_non_zero_offset = 0; - for (int level = 0; level < db_->NumberLevels(); level++) { - int f = NumTableFilesAtLevel(level); + for (int level = 0; level < num_levels; level++) { + int f = NumTableFilesAtLevel(level, cf); char buf[100]; snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); result += buf; @@ -642,37 +747,46 @@ class DBTest { } int CountLiveFiles() { - std::vector files; - uint64_t manifest_file_size; - db_->GetLiveFiles(files, &manifest_file_size); - return files.size(); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + return metadata.size(); } - uint64_t Size(const Slice& start, const Slice& limit) { + uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) { Range r(start, limit); uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); + if (cf == 0) { + db_->GetApproximateSizes(&r, 1, &size); + } else { + db_->GetApproximateSizes(handles_[1], &r, 1, &size); + } return size; } + void Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit)); + } + void Compact(const Slice& start, const Slice& limit) { - db_->CompactRange(&start, &limit); + ASSERT_OK(db_->CompactRange(&start, &limit)); } // Do n memtable compactions, each of which produces an sstable // covering the range [small,large]. 
- void MakeTables(int n, const std::string& small, const std::string& large) { + void MakeTables(int n, const std::string& small, const std::string& large, + int cf = 0) { for (int i = 0; i < n; i++) { - Put(small, "begin"); - Put(large, "end"); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(Flush(cf)); } } // Prevent pushing of new sstables into deeper levels by adding // tables that cover a specified range to all levels. - void FillLevels(const std::string& smallest, const std::string& largest) { - MakeTables(db_->NumberLevels(), smallest, largest); + void FillLevels(const std::string& smallest, const std::string& largest, + int cf) { + MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf); } void DumpFileCounts(const char* label) { @@ -724,8 +838,13 @@ class DBTest { return std::string(len, c); } - void VerifyIterLast(std::string expected_key) { - Iterator* iter = db_->NewIterator(ReadOptions()); + void VerifyIterLast(std::string expected_key, int cf = 0) { + Iterator* iter; + if (cf == 0) { + iter = db_->NewIterator(ReadOptions()); + } else { + iter = db_->NewIterator(ReadOptions(), handles_[cf]); + } iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), expected_key); delete iter; @@ -779,22 +898,27 @@ class DBTest { } // Utility method to test InplaceUpdate - void validateNumberOfEntries(int numValues) { - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); - int seq = numValues; - while (iter->Valid()) { - ParsedInternalKey ikey; - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + void validateNumberOfEntries(int numValues, int cf = 0) { + Iterator* iter; + if (cf != 0) { + iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + } else { + iter = dbfull()->TEST_NewInternalIterator(); + } + iter->SeekToFirst(); + ASSERT_EQ(iter->status().ok(), true); + int seq = numValues; + while 
(iter->Valid()) { + ParsedInternalKey ikey; + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - // checks sequence number for updates - ASSERT_EQ(ikey.sequence, (unsigned)seq--); - iter->Next(); - } - delete iter; - ASSERT_EQ(0, seq); + // checks sequence number for updates + ASSERT_EQ(ikey.sequence, (unsigned)seq--); + iter->Next(); + } + delete iter; + ASSERT_EQ(0, seq); } void CopyFile(const std::string& source, const std::string& destination, @@ -882,15 +1006,15 @@ TEST(DBTest, Empty) { Options options = CurrentOptions(); options.env = env_; options.write_buffer_size = 100000; // Small write buffer - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls - Put("k1", std::string(100000, 'x')); // Fill memtable - Put("k2", std::string(100000, 'y')); // Trigger compaction - ASSERT_EQ("v1", Get("foo")); + env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + Put(1, "k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_EQ("v1", Get(1, "foo")); env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls } while (ChangeOptions()); } @@ -926,11 +1050,11 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(new BlockBasedTableFactory(table_options)); - DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); - // Create a new talbe. - ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put(1, "key", "val")); + // Create a new table. + ASSERT_OK(Flush(1)); // index/filter blocks added to block cache right after table creation. 
ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); @@ -942,24 +1066,24 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { // Make sure filter block is in cache. std::string value; ReadOptions ropt; - db_->KeyMayExist(ReadOptions(), "key", &value); + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); // Miss count should remain the same. ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - db_->KeyMayExist(ReadOptions(), "key", &value); + db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); // Make sure index block is in cache. auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); - value = Get("key"); + value = Get(1, "key"); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(index_block_hit + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); - value = Get("key"); + value = Get(1, "key"); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(index_block_hit + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); @@ -1002,24 +1126,24 @@ TEST(DBTest, GetPropertiesOfAllTablesTest) { TEST(DBTest, LevelLimitReopen) { Options options = CurrentOptions(); - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); const std::string value(1024 * 1024, ' '); int i = 0; - while (NumTableFilesAtLevel(2) == 0) { - ASSERT_OK(Put(Key(i++), value)); + while (NumTableFilesAtLevel(2, 1) == 0) { + ASSERT_OK(Put(1, Key(i++), value)); } options.num_levels = 1; options.max_bytes_for_level_multiplier_additional.resize(1, 1); - Status s = TryReopen(&options); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options); ASSERT_EQ(s.IsInvalidArgument(), true); ASSERT_EQ(s.ToString(), "Invalid argument: db has more levels than 
options.num_levels"); options.num_levels = 10; options.max_bytes_for_level_multiplier_additional.resize(10, 1); - ASSERT_OK(TryReopen(&options)); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, &options)); } TEST(DBTest, Preallocation) { @@ -1054,12 +1178,13 @@ TEST(DBTest, Preallocation) { TEST(DBTest, PutDeleteGet) { do { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - ASSERT_OK(db_->Delete(WriteOptions(), "foo")); - ASSERT_EQ("NOT_FOUND", Get("foo")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_OK(Delete(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); } while (ChangeOptions()); } @@ -1069,40 +1194,44 @@ TEST(DBTest, GetFromImmutableLayer) { Options options = CurrentOptions(); options.env = env_; options.write_buffer_size = 100000; // Small write buffer - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls - Put("k1", std::string(100000, 'x')); // Fill memtable - Put("k2", std::string(100000, 'y')); // Trigger compaction - ASSERT_EQ("v1", Get("foo")); + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls } while (ChangeOptions()); } TEST(DBTest, GetFromVersions) { do { - ASSERT_OK(Put("foo", "v1")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v1", Get("foo")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v1", 
Get(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); } while (ChangeOptions()); } TEST(DBTest, GetSnapshot) { do { + CreateAndReopenWithCF({"pikachu"}); // Try with both a short key and a long key for (int i = 0; i < 2; i++) { std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); - ASSERT_OK(Put(key, "v1")); + ASSERT_OK(Put(1, key, "v1")); const Snapshot* s1 = db_->GetSnapshot(); - ASSERT_OK(Put(key, "v2")); - ASSERT_EQ("v2", Get(key)); - ASSERT_EQ("v1", Get(key, s1)); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get(key)); - ASSERT_EQ("v1", Get(key, s1)); + ASSERT_OK(Put(1, key, "v2")); + ASSERT_EQ("v2", Get(1, key)); + ASSERT_EQ("v1", Get(1, key, s1)); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, key)); + ASSERT_EQ("v1", Get(1, key, s1)); db_->ReleaseSnapshot(s1); } } while (ChangeOptions()); @@ -1110,48 +1239,52 @@ TEST(DBTest, GetSnapshot) { TEST(DBTest, GetLevel0Ordering) { do { + CreateAndReopenWithCF({"pikachu"}); // Check that we process level-0 files in correct order. The code // below generates two level-0 files where the earlier one comes // before the later one in the level-0 file list since the earlier // one has a smaller "smallest" key. 
- ASSERT_OK(Put("bar", "b")); - ASSERT_OK(Put("foo", "v1")); - dbfull()->TEST_FlushMemTable(); - ASSERT_OK(Put("foo", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("foo")); + ASSERT_OK(Put(1, "bar", "b")); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, "foo")); } while (ChangeOptions()); } TEST(DBTest, GetOrderedByLevels) { do { - ASSERT_OK(Put("foo", "v1")); - Compact("a", "z"); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("foo")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + Compact(1, "a", "z"); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, "foo")); } while (ChangeOptions()); } TEST(DBTest, GetPicksCorrectFile) { do { + CreateAndReopenWithCF({"pikachu"}); // Arrange to have multiple files in a non-level-0 level. 
- ASSERT_OK(Put("a", "va")); - Compact("a", "b"); - ASSERT_OK(Put("x", "vx")); - Compact("x", "y"); - ASSERT_OK(Put("f", "vf")); - Compact("f", "g"); - ASSERT_EQ("va", Get("a")); - ASSERT_EQ("vf", Get("f")); - ASSERT_EQ("vx", Get("x")); + ASSERT_OK(Put(1, "a", "va")); + Compact(1, "a", "b"); + ASSERT_OK(Put(1, "x", "vx")); + Compact(1, "x", "y"); + ASSERT_OK(Put(1, "f", "vf")); + Compact(1, "f", "g"); + ASSERT_EQ("va", Get(1, "a")); + ASSERT_EQ("vf", Get(1, "f")); + ASSERT_EQ("vx", Get(1, "x")); } while (ChangeOptions()); } TEST(DBTest, GetEncountersEmptyLevel) { do { + CreateAndReopenWithCF({"pikachu"}); // Arrange for the following to happen: // * sstable A in level 0 // * nothing in level 1 @@ -1162,30 +1295,29 @@ TEST(DBTest, GetEncountersEmptyLevel) { // Step 1: First place sstables in levels 0 and 2 int compaction_count = 0; - while (NumTableFilesAtLevel(0) == 0 || - NumTableFilesAtLevel(2) == 0) { + while (NumTableFilesAtLevel(0, 1) == 0 || NumTableFilesAtLevel(2, 1) == 0) { ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2"; compaction_count++; - Put("a", "begin"); - Put("z", "end"); - dbfull()->TEST_FlushMemTable(); + Put(1, "a", "begin"); + Put(1, "z", "end"); + ASSERT_OK(Flush(1)); } // Step 2: clear level 1 if necessary. 
- dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1); // Step 3: read a bunch of times for (int i = 0; i < 1000; i++) { - ASSERT_EQ("NOT_FOUND", Get("missing")); + ASSERT_EQ("NOT_FOUND", Get(1, "missing")); } // Step 4: Wait for compaction to finish env_->SleepForMicroseconds(1000000); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); // XXX + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX } while (ChangeOptions(kSkipUniversalCompaction)); } @@ -1199,50 +1331,52 @@ TEST(DBTest, KeyMayExist) { Options options = CurrentOptions(); options.filter_policy = NewBloomFilterPolicy(20); options.statistics = rocksdb::CreateDBStatistics(); - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); - ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + ASSERT_OK(Put(1, "a", "b")); bool value_found = false; - ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); ASSERT_TRUE(value_found); ASSERT_EQ("b", value); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Flush(1)); value.clear(); long numopen = TestGetTickerCount(options, NO_FILE_OPENS); long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); ASSERT_TRUE(!value_found); // assert that no new files were opened and no new blocks were // read into block cache. 
ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - ASSERT_OK(db_->Delete(WriteOptions(), "a")); + ASSERT_OK(Delete(1, "a")); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - dbfull()->Flush(FlushOptions()); - dbfull()->CompactRange(nullptr, nullptr); + ASSERT_OK(Flush(1)); + db_->CompactRange(handles_[1], nullptr, nullptr); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - ASSERT_OK(db_->Delete(WriteOptions(), "c")); + ASSERT_OK(Delete(1, "c")); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -1259,13 +1393,13 @@ TEST(DBTest, NonBlockingIteration) { Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); non_blocking_opts.read_tier = kBlockCacheTier; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // write one kv to the database. - ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + ASSERT_OK(Put(1, "a", "b")); // scan using non-blocking iterator. 
We should find it because // it is in memtable. - Iterator* iter = db_->NewIterator(non_blocking_opts); + Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); int count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(iter->status()); @@ -1276,13 +1410,13 @@ TEST(DBTest, NonBlockingIteration) { // flush memtable to storage. Now, the key should not be in the // memtable neither in the block cache. - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Flush(1)); // verify that a non-blocking iterator does not find any // kvs. Neither does it do any IOs to storage. long numopen = TestGetTickerCount(options, NO_FILE_OPENS); long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { count++; @@ -1294,12 +1428,12 @@ TEST(DBTest, NonBlockingIteration) { delete iter; // read in the specified block via a regular get - ASSERT_EQ(Get("a"), "b"); + ASSERT_EQ(Get(1, "a"), "b"); // verify that we can find it via a non-blocking scan numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts); + iter = db_->NewIterator(non_blocking_opts, handles_[1]); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(iter->status()); @@ -1322,33 +1456,33 @@ TEST(DBTest, FilterDeletes) { Options options = CurrentOptions(); options.filter_policy = NewBloomFilterPolicy(20); options.filter_deletes = true; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); WriteBatch batch; - batch.Delete("a"); + batch.Delete(handles_[1], "a"); dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(AllEntriesFor("a"), "[ ]"); // Delete skipped + ASSERT_EQ(AllEntriesFor("a", 1), "[ ]"); // Delete skipped batch.Clear(); - batch.Put("a", "b"); - batch.Delete("a"); + 
batch.Put(handles_[1], "a", "b"); + batch.Delete(handles_[1], "a"); dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(Get("a"), "NOT_FOUND"); - ASSERT_EQ(AllEntriesFor("a"), "[ DEL, b ]"); // Delete issued + ASSERT_EQ(Get(1, "a"), "NOT_FOUND"); + ASSERT_EQ(AllEntriesFor("a", 1), "[ DEL, b ]"); // Delete issued batch.Clear(); - batch.Delete("c"); - batch.Put("c", "d"); + batch.Delete(handles_[1], "c"); + batch.Put(handles_[1], "c", "d"); dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(Get("c"), "d"); - ASSERT_EQ(AllEntriesFor("c"), "[ d ]"); // Delete skipped + ASSERT_EQ(Get(1, "c"), "d"); + ASSERT_EQ(AllEntriesFor("c", 1), "[ d ]"); // Delete skipped batch.Clear(); - dbfull()->Flush(FlushOptions()); // A stray Flush + ASSERT_OK(Flush(1)); // A stray Flush - batch.Delete("c"); + batch.Delete(handles_[1], "c"); dbfull()->Write(WriteOptions(), &batch); - ASSERT_EQ(AllEntriesFor("c"), "[ DEL, d ]"); // Delete issued + ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]"); // Delete issued batch.Clear(); delete options.filter_policy; @@ -1481,7 +1615,8 @@ TEST(DBTest, IterPrevWithNewerSeq2) { TEST(DBTest, IterEmpty) { do { - Iterator* iter = db_->NewIterator(ReadOptions()); + CreateAndReopenWithCF({"pikachu"}); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "(invalid)"); @@ -1498,8 +1633,9 @@ TEST(DBTest, IterEmpty) { TEST(DBTest, IterSingle) { do { - ASSERT_OK(Put("a", "va")); - Iterator* iter = db_->NewIterator(ReadOptions()); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "a", "va")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->va"); @@ -1538,10 +1674,11 @@ TEST(DBTest, IterSingle) { TEST(DBTest, IterMulti) { do { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", "vb")); - ASSERT_OK(Put("c", "vc")); - Iterator* iter = db_->NewIterator(ReadOptions()); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "a", 
"va")); + ASSERT_OK(Put(1, "b", "vb")); + ASSERT_OK(Put(1, "c", "vc")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->va"); @@ -1596,11 +1733,11 @@ TEST(DBTest, IterMulti) { ASSERT_EQ(IterStatus(iter), "b->vb"); // Make sure iter stays at snapshot - ASSERT_OK(Put("a", "va2")); - ASSERT_OK(Put("a2", "va3")); - ASSERT_OK(Put("b", "vb2")); - ASSERT_OK(Put("c", "vc2")); - ASSERT_OK(Delete("b")); + ASSERT_OK(Put(1, "a", "va2")); + ASSERT_OK(Put(1, "a2", "va3")); + ASSERT_OK(Put(1, "b", "vb2")); + ASSERT_OK(Put(1, "c", "vc2")); + ASSERT_OK(Delete(1, "b")); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->va"); iter->Next(); @@ -1630,14 +1767,15 @@ TEST(DBTest, IterReseek) { options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // insert two keys with same userkey and verify that // reseek is not invoked. For each of these test cases, // verify that we can find the next key "b". - ASSERT_OK(Put("a", "one")); - ASSERT_OK(Put("a", "two")); - ASSERT_OK(Put("b", "bone")); - Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(Put(1, "a", "one")); + ASSERT_OK(Put(1, "a", "two")); + ASSERT_OK(Put(1, "b", "bone")); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "a->two"); @@ -1648,8 +1786,8 @@ TEST(DBTest, IterReseek) { // insert a total of three keys with same userkey and verify // that reseek is still not invoked. 
- ASSERT_OK(Put("a", "three")); - iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(Put(1, "a", "three")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->three"); iter->Next(); @@ -1659,8 +1797,8 @@ TEST(DBTest, IterReseek) { // insert a total of four keys with same userkey and verify // that reseek is invoked. - ASSERT_OK(Put("a", "four")); - iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(Put(1, "a", "four")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->four"); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); @@ -1676,8 +1814,8 @@ TEST(DBTest, IterReseek) { (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION); // Insert another version of b and assert that reseek is not invoked - ASSERT_OK(Put("b", "btwo")); - iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(Put(1, "b", "btwo")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->btwo"); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), @@ -1690,9 +1828,9 @@ TEST(DBTest, IterReseek) { // insert two more versions of b. This makes a total of 4 versions // of b and 4 versions of a. 
- ASSERT_OK(Put("b", "bthree")); - ASSERT_OK(Put("b", "bfour")); - iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(Put(1, "b", "bthree")); + ASSERT_OK(Put(1, "b", "bfour")); + iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->bfour"); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), @@ -1708,13 +1846,14 @@ TEST(DBTest, IterReseek) { TEST(DBTest, IterSmallAndLargeMix) { do { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", std::string(100000, 'b'))); - ASSERT_OK(Put("c", "vc")); - ASSERT_OK(Put("d", std::string(100000, 'd'))); - ASSERT_OK(Put("e", std::string(100000, 'e'))); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); + ASSERT_OK(Put(1, "c", "vc")); + ASSERT_OK(Put(1, "d", std::string(100000, 'd'))); + ASSERT_OK(Put(1, "e", std::string(100000, 'e'))); - Iterator* iter = db_->NewIterator(ReadOptions()); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->va"); @@ -1748,13 +1887,14 @@ TEST(DBTest, IterSmallAndLargeMix) { TEST(DBTest, IterMultiWithDelete) { do { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", "vb")); - ASSERT_OK(Put("c", "vc")); - ASSERT_OK(Delete("b")); - ASSERT_EQ("NOT_FOUND", Get("b")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", "vb")); + ASSERT_OK(Put(1, "c", "vc")); + ASSERT_OK(Delete(1, "b")); + ASSERT_EQ("NOT_FOUND", Get(1, "b")); - Iterator* iter = db_->NewIterator(ReadOptions()); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->Seek("c"); ASSERT_EQ(IterStatus(iter), "c->vc"); if (!CurrentOptions().merge_operator) { @@ -1768,49 +1908,51 @@ TEST(DBTest, IterMultiWithDelete) { TEST(DBTest, IterPrevMaxSkip) { do { + CreateAndReopenWithCF({"pikachu"}); for (int i = 0; i < 2; i++) { - db_->Put(WriteOptions(), "key1", "v1"); - db_->Put(WriteOptions(), 
"key2", "v2"); - db_->Put(WriteOptions(), "key3", "v3"); - db_->Put(WriteOptions(), "key4", "v4"); - db_->Put(WriteOptions(), "key5", "v5"); + ASSERT_OK(Put(1, "key1", "v1")); + ASSERT_OK(Put(1, "key2", "v2")); + ASSERT_OK(Put(1, "key3", "v3")); + ASSERT_OK(Put(1, "key4", "v4")); + ASSERT_OK(Put(1, "key5", "v5")); } - VerifyIterLast("key5->v5"); + VerifyIterLast("key5->v5", 1); - ASSERT_OK(db_->Delete(WriteOptions(), "key5")); - VerifyIterLast("key4->v4"); + ASSERT_OK(Delete(1, "key5")); + VerifyIterLast("key4->v4", 1); - ASSERT_OK(db_->Delete(WriteOptions(), "key4")); - VerifyIterLast("key3->v3"); + ASSERT_OK(Delete(1, "key4")); + VerifyIterLast("key3->v3", 1); - ASSERT_OK(db_->Delete(WriteOptions(), "key3")); - VerifyIterLast("key2->v2"); + ASSERT_OK(Delete(1, "key3")); + VerifyIterLast("key2->v2", 1); - ASSERT_OK(db_->Delete(WriteOptions(), "key2")); - VerifyIterLast("key1->v1"); + ASSERT_OK(Delete(1, "key2")); + VerifyIterLast("key1->v1", 1); - ASSERT_OK(db_->Delete(WriteOptions(), "key1")); - VerifyIterLast("(invalid)"); + ASSERT_OK(Delete(1, "key1")); + VerifyIterLast("(invalid)", 1); } while (ChangeOptions(kSkipMergePut)); } TEST(DBTest, IterWithSnapshot) { do { - ASSERT_OK(Put("key1", "val1")); - ASSERT_OK(Put("key2", "val2")); - ASSERT_OK(Put("key3", "val3")); - ASSERT_OK(Put("key4", "val4")); - ASSERT_OK(Put("key5", "val5")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "key1", "val1")); + ASSERT_OK(Put(1, "key2", "val2")); + ASSERT_OK(Put(1, "key3", "val3")); + ASSERT_OK(Put(1, "key4", "val4")); + ASSERT_OK(Put(1, "key5", "val5")); const Snapshot *snapshot = db_->GetSnapshot(); ReadOptions options; options.snapshot = snapshot; - Iterator* iter = db_->NewIterator(options); + Iterator* iter = db_->NewIterator(options, handles_[1]); // Put more values after the snapshot - ASSERT_OK(Put("key100", "val100")); - ASSERT_OK(Put("key101", "val101")); + ASSERT_OK(Put(1, "key100", "val100")); + ASSERT_OK(Put(1, "key101", "val101")); iter->Seek("key5"); 
ASSERT_EQ(IterStatus(iter), "key5->val5"); @@ -1835,23 +1977,24 @@ TEST(DBTest, IterWithSnapshot) { TEST(DBTest, Recover) { do { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("baz", "v5")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); - Reopen(); - ASSERT_EQ("v1", Get("foo")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v5", Get("baz")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "baz")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Put(1, "foo", "v3")); - Reopen(); - ASSERT_EQ("v3", Get("foo")); - ASSERT_OK(Put("foo", "v4")); - ASSERT_EQ("v4", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ("v5", Get("baz")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); } while (ChangeOptions()); } @@ -1862,18 +2005,19 @@ TEST(DBTest, RecoverWithTableHandle) { options.write_buffer_size = 100; options.disable_auto_compactions = true; DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("bar", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_OK(Put("foo", "v3")); - ASSERT_OK(Put("bar", "v4")); - dbfull()->TEST_FlushMemTable(); - ASSERT_OK(Put("big", std::string(100, 'a'))); - Reopen(); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Put(1, "bar", "v4")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "big", std::string(100, 'a'))); + ReopenWithColumnFamilies({"default", "pikachu"}); std::vector> files; - dbfull()->TEST_GetFilesMetaData(&files); + dbfull()->TEST_GetFilesMetaData(handles_[1], &files); int total_files = 0; for 
(const auto& level : files) { total_files += level.size(); @@ -1969,51 +2113,52 @@ TEST(DBTest, IgnoreRecoveredLog) { TEST(DBTest, RollLog) { do { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("baz", "v5")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); - Reopen(); + ReopenWithColumnFamilies({"default", "pikachu"}); for (int i = 0; i < 10; i++) { - Reopen(); + ReopenWithColumnFamilies({"default", "pikachu"}); } - ASSERT_OK(Put("foo", "v4")); + ASSERT_OK(Put(1, "foo", "v4")); for (int i = 0; i < 10; i++) { - Reopen(); + ReopenWithColumnFamilies({"default", "pikachu"}); } } while (ChangeOptions()); } TEST(DBTest, WAL) { do { - Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; - ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); - ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - Reopen(); - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v1", Get("bar")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); writeOpt.disableWAL = false; - ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); writeOpt.disableWAL = true; - ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); - Reopen(); + ReopenWithColumnFamilies({"default", "pikachu"}); // Both value's should be present. 
- ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ("v2", Get("foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v2", Get(1, "foo")); writeOpt.disableWAL = true; - ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); writeOpt.disableWAL = false; - ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); - Reopen(); + ReopenWithColumnFamilies({"default", "pikachu"}); // again both values should be present. - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v3", Get("bar")); + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); } while (ChangeCompactOptions()); } @@ -2024,7 +2169,7 @@ TEST(DBTest, CheckLock) { ASSERT_OK(TryReopen(&options)); // second open should fail - ASSERT_TRUE(!(PureReopen(&options, &localdb)).ok()); + ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); } while (ChangeCompactOptions()); } @@ -2035,14 +2180,14 @@ TEST(DBTest, FlushMultipleMemtable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - Reopen(&options); - ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + CreateAndReopenWithCF({"pikachu"}, &options); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v1", Get("bar")); - dbfull()->Flush(FlushOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + ASSERT_OK(Flush(1)); } while (ChangeCompactOptions()); } @@ -2054,49 +2199,53 @@ TEST(DBTest, NumImmutableMemTable) { options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; options.write_buffer_size = 1000000; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); std::string big_value(1000000 * 2, 'x'); 
std::string num; SetPerfLevel(kEnableTime);; - ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); perf_context.Reset(); - Get("k1"); + Get(1, "k1"); ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "1"); perf_context.Reset(); - Get("k1"); + Get(1, "k1"); ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); perf_context.Reset(); - Get("k2"); + Get(1, "k2"); ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cur-size-active-mem-table", - &num)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.cur-size-active-mem-table", &num)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "2"); perf_context.Reset(); - Get("k2"); + Get(1, "k2"); ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); perf_context.Reset(); - Get("k3"); + Get(1, "k3"); ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); perf_context.Reset(); - Get("k1"); + Get(1, "k1"); ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); - dbfull()->Flush(FlushOptions()); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_OK(Flush(1)); + 
ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cur-size-active-mem-table", - &num)); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.cur-size-active-mem-table", &num)); // "208" is the size of the metadata of an empty skiplist, this would // break if we change the default skiplist implementation ASSERT_EQ(num, "208"); @@ -2106,18 +2255,27 @@ TEST(DBTest, NumImmutableMemTable) { class SleepingBackgroundTask { public: - SleepingBackgroundTask() : bg_cv_(&mutex_), should_sleep_(true) {} + SleepingBackgroundTask() + : bg_cv_(&mutex_), should_sleep_(true), done_with_sleep_(false) {} void DoSleep() { MutexLock l(&mutex_); while (should_sleep_) { bg_cv_.Wait(); } + done_with_sleep_ = true; + bg_cv_.SignalAll(); } void WakeUp() { MutexLock l(&mutex_); should_sleep_ = false; bg_cv_.SignalAll(); } + void WaitUntilDone() { + MutexLock l(&mutex_); + while (!done_with_sleep_) { + bg_cv_.Wait(); + } + } static void DoSleepTask(void* arg) { reinterpret_cast(arg)->DoSleep(); @@ -2127,6 +2285,7 @@ class SleepingBackgroundTask { port::Mutex mutex_; port::CondVar bg_cv_; // Signalled when background work finishes bool should_sleep_; + bool done_with_sleep_; }; TEST(DBTest, GetProperty) { @@ -2178,6 +2337,7 @@ TEST(DBTest, GetProperty) { ASSERT_EQ(num, "0"); sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); dbfull()->TEST_WaitForFlushMemTable(); ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value)); @@ -2188,48 +2348,49 @@ TEST(DBTest, GetProperty) { ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); ASSERT_EQ(num, "1"); sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); } TEST(DBTest, FLUSH) { do { - Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; SetPerfLevel(kEnableTime);; - ASSERT_OK(dbfull()->Put(writeOpt, "foo", 
"v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); // this will now also flush the last 2 writes - dbfull()->Flush(FlushOptions()); - ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); perf_context.Reset(); - Get("foo"); + Get(1, "foo"); ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); - Reopen(); - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v1", Get("bar")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); writeOpt.disableWAL = true; - ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v2")); - ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v2")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + ASSERT_OK(Flush(1)); - Reopen(); - ASSERT_EQ("v2", Get("bar")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v2", Get(1, "bar")); perf_context.Reset(); - ASSERT_EQ("v2", Get("foo")); + ASSERT_EQ("v2", Get(1, "foo")); ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); writeOpt.disableWAL = false; - ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3")); - ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v3")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + ASSERT_OK(Flush(1)); - Reopen(); + ReopenWithColumnFamilies({"default", "pikachu"}); // 'foo' should be there because its put // has WAL enabled. 
- ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v3", Get("bar")); + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); SetPerfLevel(kDisable); } while (ChangeCompactOptions()); @@ -2237,13 +2398,14 @@ TEST(DBTest, FLUSH) { TEST(DBTest, RecoveryWithEmptyLog) { do { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("foo", "v2")); - Reopen(); - Reopen(); - ASSERT_OK(Put("foo", "v3")); - Reopen(); - ASSERT_EQ("v3", Get("foo")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "foo", "v2")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_OK(Put(1, "foo", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("v3", Get(1, "foo")); } while (ChangeOptions()); } @@ -2254,19 +2416,19 @@ TEST(DBTest, RecoverDuringMemtableCompaction) { Options options = CurrentOptions(); options.env = env_; options.write_buffer_size = 1000000; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Trigger a long memtable compaction and reopen the database during it - ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file - ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable - ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction - ASSERT_OK(Put("bar", "v2")); // Goes to new log file + ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file + ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable + ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction + ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file - Reopen(&options); - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ(std::string(10000000, 'x'), Get("big1")); - ASSERT_EQ(std::string(1000, 'y'), Get("big2")); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1")); + 
ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2")); } while (ChangeOptions()); } @@ -2274,25 +2436,25 @@ TEST(DBTest, MinorCompactionsHappen) { do { Options options = CurrentOptions(); options.write_buffer_size = 10000; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); const int N = 500; - int starting_num_tables = TotalTableFiles(); + int starting_num_tables = TotalTableFiles(1); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); + ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v'))); } - int ending_num_tables = TotalTableFiles(); + int ending_num_tables = TotalTableFiles(1); ASSERT_GT(ending_num_tables, starting_num_tables); for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); } - Reopen(); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); } } while (ChangeCompactOptions()); } @@ -2301,24 +2463,21 @@ TEST(DBTest, ManifestRollOver) { do { Options options = CurrentOptions(); options.max_manifest_file_size = 10 ; // 10 bytes - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); { - ASSERT_OK(Put("manifest_key1", std::string(1000, '1'))); - ASSERT_OK(Put("manifest_key2", std::string(1000, '2'))); - ASSERT_OK(Put("manifest_key3", std::string(1000, '3'))); - uint64_t manifest_before_flush = - dbfull()->TEST_Current_Manifest_FileNo(); - dbfull()->Flush(FlushOptions()); // This should trigger LogAndApply. 
- uint64_t manifest_after_flush = - dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); + ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); + ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3'))); + uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_OK(Flush(1)); // This should trigger LogAndApply. + uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); ASSERT_GT(manifest_after_flush, manifest_before_flush); - Reopen(&options); - ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), - manifest_after_flush); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); // check if a new manifest file got inserted or not. - ASSERT_EQ(std::string(1000, '1'), Get("manifest_key1")); - ASSERT_EQ(std::string(1000, '2'), Get("manifest_key2")); - ASSERT_EQ(std::string(1000, '3'), Get("manifest_key3")); + ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); + ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2")); + ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3")); } } while (ChangeCompactOptions()); } @@ -2349,51 +2508,51 @@ TEST(DBTest, RecoverWithLargeLog) { do { { Options options = CurrentOptions(); - Reopen(&options); - ASSERT_OK(Put("big1", std::string(200000, '1'))); - ASSERT_OK(Put("big2", std::string(200000, '2'))); - ASSERT_OK(Put("small3", std::string(10, '3'))); - ASSERT_OK(Put("small4", std::string(10, '4'))); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + CreateAndReopenWithCF({"pikachu"}, &options); + ASSERT_OK(Put(1, "big1", std::string(200000, '1'))); + ASSERT_OK(Put(1, "big2", std::string(200000, '2'))); + ASSERT_OK(Put(1, "small3", std::string(10, '3'))); + ASSERT_OK(Put(1, "small4", std::string(10, '4'))); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); } // Make sure that if we re-open with a small write buffer size that // we flush table files in 
the middle of a large log file. Options options = CurrentOptions(); options.write_buffer_size = 100000; - Reopen(&options); - ASSERT_EQ(NumTableFilesAtLevel(0), 3); - ASSERT_EQ(std::string(200000, '1'), Get("big1")); - ASSERT_EQ(std::string(200000, '2'), Get("big2")); - ASSERT_EQ(std::string(10, '3'), Get("small3")); - ASSERT_EQ(std::string(10, '4'), Get("small4")); - ASSERT_GT(NumTableFilesAtLevel(0), 1); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); + ASSERT_EQ(std::string(200000, '1'), Get(1, "big1")); + ASSERT_EQ(std::string(200000, '2'), Get(1, "big2")); + ASSERT_EQ(std::string(10, '3'), Get(1, "small3")); + ASSERT_EQ(std::string(10, '4'), Get(1, "small4")); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 1); } while (ChangeCompactOptions()); } TEST(DBTest, CompactionsGenerateMultipleFiles) { Options options = CurrentOptions(); options.write_buffer_size = 100000000; // Large write buffer - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; for (int i = 0; i < 80; i++) { values.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put(1, Key(i), values[i])); } // Reopening moves updates to level-0 - Reopen(&options); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); for (int i = 0; i < 80; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); + ASSERT_EQ(Get(1, Key(i)), values[i]); } } @@ -2403,33 +2562,32 @@ TEST(DBTest, CompactionTrigger) { options.num_levels = 3; options.max_mem_compaction_level = 0; 
options.level0_file_num_compaction_trigger = 3; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); - for (int num = 0; - num < options.level0_file_num_compaction_trigger - 1; + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; num++) { std::vector values; // Write 120KB (12 values, each 10K) for (int i = 0; i < 12; i++) { values.push_back(RandomString(&rnd, 10000)); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put(1, Key(i), values[i])); } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } //generate one more file in level-0, and should trigger level-0 compaction std::vector values; for (int i = 0; i < 12; i++) { values.push_back(RandomString(&rnd, 10000)); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put(1, Key(i), values[i])); } dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); } // This is a static filter used for filtering @@ -2537,7 +2695,7 @@ TEST(DBTest, UniversalCompactionTrigger) { filter->expect_manual_compaction_.store(false); options.compaction_filter_factory.reset(filter); - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); int key_idx = 0; @@ -2546,31 +2704,30 @@ TEST(DBTest, UniversalCompactionTrigger) { // Stage 1: // Generate a set of files at level 0, but don't trigger level-0 // compaction. 
- for (int num = 0; - num < options.level0_file_num_compaction_trigger-1; + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; num++) { // Write 110KB (11 values, each 10K) - for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } // Generate one more file at level-0, which should trigger level-0 // compaction. for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } dbfull()->TEST_WaitForCompact(); // Suppose each file flushed from mem table has size 1. Now we compact // (level0_file_num_compaction_trigger+1)=4 files and should have a big // file of size 4. - ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); for (int i = 1; i < options.num_levels ; i++) { - ASSERT_EQ(NumTableFilesAtLevel(i), 0); + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); } // Stage 2: @@ -2581,74 +2738,72 @@ TEST(DBTest, UniversalCompactionTrigger) { // a level-0 file, with size around 0.4 (according to previously written // data amount). 
filter->expect_full_compaction_.store(false); - dbfull()->Flush(FlushOptions()); - for (int num = 0; - num < options.level0_file_num_compaction_trigger-3; + ASSERT_OK(Flush(1)); + for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), num + 3); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 3); } // Generate one more file at level-0, which should trigger level-0 // compaction. for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } dbfull()->TEST_WaitForCompact(); // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. // After comapction, we should have 2 files, with size 4, 2.4. - ASSERT_EQ(NumTableFilesAtLevel(0), 2); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2); for (int i = 1; i < options.num_levels ; i++) { - ASSERT_EQ(NumTableFilesAtLevel(i), 0); + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); } // Stage 3: // Now we have 2 files at level 0, with size 4 and 2.4. Continue // generating new files at level 0. 
- for (int num = 0; - num < options.level0_file_num_compaction_trigger-3; + for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), num + 3); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 3); } // Generate one more file at level-0, which should trigger level-0 // compaction. for (int i = 0; i < 12; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } dbfull()->TEST_WaitForCompact(); // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1. // After comapction, we should have 3 files, with size 4, 2.4, 2. - ASSERT_EQ(NumTableFilesAtLevel(0), 3); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); for (int i = 1; i < options.num_levels ; i++) { - ASSERT_EQ(NumTableFilesAtLevel(i), 0); + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); } // Stage 4: // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a // new file of size 1. for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } dbfull()->TEST_WaitForCompact(); // Level-0 compaction is triggered, but no file will be picked up. - ASSERT_EQ(NumTableFilesAtLevel(0), 4); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 4); for (int i = 1; i < options.num_levels ; i++) { - ASSERT_EQ(NumTableFilesAtLevel(i), 0); + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); } // Stage 5: @@ -2656,14 +2811,14 @@ TEST(DBTest, UniversalCompactionTrigger) { // a new file of size 1. 
filter->expect_full_compaction_.store(true); for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } dbfull()->TEST_WaitForCompact(); // All files at level 0 will be compacted into a single one. - ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); for (int i = 1; i < options.num_levels ; i++) { - ASSERT_EQ(NumTableFilesAtLevel(i), 0); + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); } } @@ -2672,38 +2827,37 @@ TEST(DBTest, UniversalCompactionSizeAmplification) { options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 100<<10; //100KB options.level0_file_num_compaction_trigger = 3; + CreateAndReopenWithCF({"pikachu"}, &options); // Trigger compaction if size amplification exceeds 110% - options.compaction_options_universal. - max_size_amplification_percent = 110; - Reopen(&options); + options.compaction_options_universal.max_size_amplification_percent = 110; + ReopenWithColumnFamilies({"default", "pikachu"}, &options); Random rnd(301); int key_idx = 0; // Generate two files in Level 0. Both files are approx the same size. - for (int num = 0; - num < options.level0_file_num_compaction_trigger-1; + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } - ASSERT_EQ(NumTableFilesAtLevel(0), 2); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2); // Flush whatever is remaining in memtable. 
This is typically // small, which should not trigger size ratio based compaction // but will instead trigger size amplification. - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); // Verify that size amplification did occur - ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); } TEST(DBTest, UniversalCompactionOptions) { @@ -2713,30 +2867,28 @@ TEST(DBTest, UniversalCompactionOptions) { options.level0_file_num_compaction_trigger = 4; options.num_levels = 1; options.compaction_options_universal.compression_size_percent = -1; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); int key_idx = 0; - for (int num = 0; - num < options.level0_file_num_compaction_trigger; - num++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); if (num < options.level0_file_num_compaction_trigger - 1) { - ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } } dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); for (int i = 1; i < options.num_levels ; i++) { - ASSERT_EQ(NumTableFilesAtLevel(i), 0); + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); } } @@ -2856,12 +3008,18 @@ TEST(DBTest, CompressedCache) { default: ASSERT_TRUE(false); } - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); + // default column family doesn't have block cache + Options no_block_cache_opts; + no_block_cache_opts.no_block_cache = true; + no_block_cache_opts.statistics = options.statistics; + ReopenWithColumnFamilies({"default", "pikachu"}, + {&no_block_cache_opts, 
&options}); Random rnd(301); // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; std::string str; for (int i = 0; i < num_iter; i++) { @@ -2869,14 +3027,14 @@ TEST(DBTest, CompressedCache) { str = RandomString(&rnd, 1000); } values.push_back(str); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put(1, Key(i), values[i])); } // flush all data from memtable so that reads are from block cache - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Flush(1)); for (int i = 0; i < num_iter; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); + ASSERT_EQ(Get(1, Key(i)), values[i]); } // check that we triggered the appropriate code paths in the cache @@ -2899,6 +3057,9 @@ TEST(DBTest, CompressedCache) { default: ASSERT_TRUE(false); } + + options.create_if_missing = true; + DestroyAndReopen(&options); } } @@ -3014,26 +3175,26 @@ TEST(DBTest, ConvertCompactionStyle) { options.max_bytes_for_level_multiplier = 1; options.target_file_size_base = 200<<10; // 200KB options.target_file_size_multiplier = 1; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); for (int i = 0; i <= max_key_level_insert; i++) { // each value is 10K - ASSERT_OK(Put(Key(i), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); } - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); - ASSERT_GT(TotalTableFiles(), 1); + ASSERT_GT(TotalTableFiles(1, 4), 1); int non_level0_num_files = 0; - for (int i = 1; i < dbfull()->NumberLevels(); i++) { - non_level0_num_files += NumTableFilesAtLevel(i); + for (int i = 1; i < options.num_levels; i++) { + non_level0_num_files += NumTableFilesAtLevel(i, 1); } ASSERT_GT(non_level0_num_files, 0); // Stage 2: reopen with universal compaction - should fail options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; - Status s = TryReopen(&options); + Status s = TryReopenWithColumnFamilies({"default", 
"pikachu"}, &options); ASSERT_TRUE(s.IsInvalidArgument()); // Stage 3: compact into a single file and move the file to level 0 @@ -3043,14 +3204,13 @@ TEST(DBTest, ConvertCompactionStyle) { options.target_file_size_multiplier = 1; options.max_bytes_for_level_base = INT_MAX; options.max_bytes_for_level_multiplier = 1; - Reopen(&options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); - dbfull()->CompactRange(nullptr, nullptr, - true /* reduce level */, - 0 /* reduce to level 0 */); + dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */, + 0 /* reduce to level 0 */); - for (int i = 0; i < dbfull()->NumberLevels(); i++) { - int num = NumTableFilesAtLevel(i); + for (int i = 0; i < options.num_levels; i++) { + int num = NumTableFilesAtLevel(i, 1); if (i == 0) { ASSERT_EQ(num, 1); } else { @@ -3063,22 +3223,23 @@ TEST(DBTest, ConvertCompactionStyle) { options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 100<<10; //100KB options.level0_file_num_compaction_trigger = 3; - Reopen(&options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); } dbfull()->Flush(FlushOptions()); + ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); - for (int i = 1; i < dbfull()->NumberLevels(); i++) { - ASSERT_EQ(NumTableFilesAtLevel(i), 0); + for (int i = 1; i < options.num_levels; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); } // verify keys inserted in both level compaction style and universal // compaction style std::string keys_in_db; - Iterator* iter = dbfull()->NewIterator(ReadOptions()); + Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { keys_in_db.append(iter->key().ToString()); keys_in_db.push_back(','); @@ -3213,18 +3374,18 @@ TEST(DBTest, 
RepeatedWritesToSameKey) { Options options = CurrentOptions(); options.env = env_; options.write_buffer_size = 100000; // Small write buffer - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // We must have at most one file per level except for level-0, // which may have up to kL0_StopWritesTrigger files. - const int kMaxFiles = dbfull()->NumberLevels() + - dbfull()->Level0StopWriteTrigger(); + const int kMaxFiles = + options.num_levels + options.level0_stop_writes_trigger; Random rnd(301); std::string value = RandomString(&rnd, 2 * options.write_buffer_size); for (int i = 0; i < 5 * kMaxFiles; i++) { - Put("key", value); - ASSERT_LE(TotalTableFiles(), kMaxFiles); + ASSERT_OK(Put(1, "key", value)); + ASSERT_LE(TotalTableFiles(1), kMaxFiles); } } while (ChangeCompactOptions()); } @@ -3236,18 +3397,18 @@ TEST(DBTest, InPlaceUpdate) { options.inplace_update_support = true; options.env = env_; options.write_buffer_size = 100000; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Update key with values of smaller size int numValues = 10; for (int i = numValues; i > 0; i--) { std::string value = DummyString(i, 'a'); - ASSERT_OK(Put("key", value)); - ASSERT_EQ(value, Get("key")); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); } // Only 1 instance for that key. 
- validateNumberOfEntries(1); + validateNumberOfEntries(1, 1); } while (ChangeCompactOptions()); } @@ -3259,18 +3420,18 @@ TEST(DBTest, InPlaceUpdateLargeNewValue) { options.inplace_update_support = true; options.env = env_; options.write_buffer_size = 100000; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Update key with values of larger size int numValues = 10; for (int i = 0; i < numValues; i++) { std::string value = DummyString(i, 'a'); - ASSERT_OK(Put("key", value)); - ASSERT_EQ(value, Get("key")); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); } // All 10 updates exist in the internal iterator - validateNumberOfEntries(numValues); + validateNumberOfEntries(numValues, 1); } while (ChangeCompactOptions()); } @@ -3286,20 +3447,20 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerSize) { options.write_buffer_size = 100000; options.inplace_callback = rocksdb::DBTest::updateInPlaceSmallerSize; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Update key with values of smaller size int numValues = 10; - ASSERT_OK(Put("key", DummyString(numValues, 'a'))); - ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + ASSERT_OK(Put(1, "key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key")); for (int i = numValues; i > 0; i--) { - ASSERT_OK(Put("key", DummyString(i, 'a'))); - ASSERT_EQ(DummyString(i - 1, 'b'), Get("key")); + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key")); } // Only 1 instance for that key. 
- validateNumberOfEntries(1); + validateNumberOfEntries(1, 1); } while (ChangeCompactOptions()); } @@ -3314,20 +3475,20 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) { options.write_buffer_size = 100000; options.inplace_callback = rocksdb::DBTest::updateInPlaceSmallerVarintSize; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Update key with values of smaller varint size int numValues = 265; - ASSERT_OK(Put("key", DummyString(numValues, 'a'))); - ASSERT_EQ(DummyString(numValues, 'c'), Get("key")); + ASSERT_OK(Put(1, "key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key")); for (int i = numValues; i > 0; i--) { - ASSERT_OK(Put("key", DummyString(i, 'a'))); - ASSERT_EQ(DummyString(1, 'b'), Get("key")); + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(1, 'b'), Get(1, "key")); } // Only 1 instance for that key. - validateNumberOfEntries(1); + validateNumberOfEntries(1, 1); } while (ChangeCompactOptions()); } @@ -3342,18 +3503,18 @@ TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { options.write_buffer_size = 100000; options.inplace_callback = rocksdb::DBTest::updateInPlaceLargerSize; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Update key with values of larger size int numValues = 10; for (int i = 0; i < numValues; i++) { - ASSERT_OK(Put("key", DummyString(i, 'a'))); - ASSERT_EQ(DummyString(i, 'c'), Get("key")); + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i, 'c'), Get(1, "key")); } // No inplace updates. 
All updates are puts with new seq number // All 10 updates exist in the internal iterator - validateNumberOfEntries(numValues); + validateNumberOfEntries(numValues, 1); } while (ChangeCompactOptions()); } @@ -3368,11 +3529,11 @@ TEST(DBTest, InPlaceUpdateCallbackNoAction) { options.write_buffer_size = 100000; options.inplace_callback = rocksdb::DBTest::updateInPlaceNoAction; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Callback function requests no actions from db - ASSERT_OK(Put("key", DummyString(1, 'a'))); - ASSERT_EQ(Get("key"), "NOT_FOUND"); + ASSERT_OK(Put(1, "key", DummyString(1, 'a'))); + ASSERT_EQ(Get(1, "key"), "NOT_FOUND"); } while (ChangeCompactOptions()); } @@ -3383,30 +3544,30 @@ TEST(DBTest, CompactionFilter) { options.num_levels = 3; options.max_mem_compaction_level = 0; options.compaction_filter_factory = std::make_shared(); - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Write 100K keys, these are written to a few files in L0. const std::string value(10, 'x'); for (int i = 0; i < 100000; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); + Put(1, key, value); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Flush(1)); // Push all files to the highest level L2. Verify that // the compaction is each level invokes the filter for // all the keys in that level. 
cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); ASSERT_EQ(cfilter_count, 100000); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_NE(NumTableFilesAtLevel(2), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_NE(NumTableFilesAtLevel(2, 1), 0); cfilter_count = 0; // All the files are in the lowest level. @@ -3417,7 +3578,7 @@ TEST(DBTest, CompactionFilter) { // TODO: figure out sequence number squashtoo int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); + Iterator* iter = dbfull()->TEST_NewInternalIterator(handles_[1]); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { @@ -3438,55 +3599,56 @@ TEST(DBTest, CompactionFilter) { for (int i = 0; i < 100000; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); + ASSERT_OK(Put(1, key, value)); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Flush(1)); // push all files to the highest level L2. 
This // means that all keys should pass at least once // via the compaction filter cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); ASSERT_EQ(cfilter_count, 100000); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_NE(NumTableFilesAtLevel(2), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_NE(NumTableFilesAtLevel(2, 1), 0); // create a new database with the compaction // filter in such a way that it deletes all keys options.compaction_filter_factory = std::make_shared(); options.create_if_missing = true; DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // write all the keys once again. for (int i = 0; i < 100000; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); + ASSERT_OK(Put(1, key, value)); } - dbfull()->TEST_FlushMemTable(); - ASSERT_NE(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_OK(Flush(1)); + ASSERT_NE(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0); // Push all files to the highest level L2. This // triggers the compaction filter to delete all keys, // verify that at the end of the compaction process, // nothing is left. 
cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); ASSERT_EQ(cfilter_count, 0); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions()); + iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -3502,7 +3664,7 @@ TEST(DBTest, CompactionFilter) { // TODO: remove the following or design a different // test count = 0; - iter = dbfull()->TEST_NewInternalIterator(); + iter = dbfull()->TEST_NewInternalIterator(handles_[1]); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { @@ -3524,6 +3686,7 @@ TEST(DBTest, CompactionFilterWithValueChange) { options.compaction_filter_factory = std::make_shared(); Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Write 100K+1 keys, these are written to a few files // in L0. 
We do this so that the current snapshot points @@ -3534,33 +3697,33 @@ TEST(DBTest, CompactionFilterWithValueChange) { for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); + Put(1, key, value); } // push all files to lower levels - dbfull()->TEST_FlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_OK(Flush(1)); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); // re-write all data again for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); + Put(1, key, value); } // push all files to lower levels. This should // invoke the compaction filter for all 100000 keys. - dbfull()->TEST_FlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_OK(Flush(1)); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); // verify that all keys now have the new value that // was set by the compaction process. for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - std::string newvalue = Get(key); + std::string newvalue = Get(1, key); ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); } } while (ChangeCompactOptions()); @@ -3924,9 +4087,9 @@ TEST(DBTest, SparseMerge) { do { Options options = CurrentOptions(); options.compression = kNoCompression; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - FillLevels("A", "Z"); + FillLevels("A", "Z", 1); // Suppose there is: // small amount of data with prefix A @@ -3935,30 +4098,33 @@ TEST(DBTest, SparseMerge) { // and that recent updates have made small changes to all three prefixes. // Check that we do not do a compaction that merges all of B in one shot. 
const std::string value(1000, 'x'); - Put("A", "va"); + Put(1, "A", "va"); // Write approximately 100MB of "B" values for (int i = 0; i < 100000; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); + Put(1, key, value); } - Put("C", "vc"); - dbfull()->TEST_FlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + Put(1, "C", "vc"); + ASSERT_OK(Flush(1)); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); // Make sparse update - Put("A", "va2"); - Put("B100", "bvalue2"); - Put("C", "vc2"); - dbfull()->TEST_FlushMemTable(); + Put(1, "A", "va2"); + Put(1, "B100", "bvalue2"); + Put(1, "C", "vc2"); + ASSERT_OK(Flush(1)); // Compactions should not cause us to create a situation where // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); } while (ChangeCompactOptions()); } @@ -3979,46 +4145,49 @@ TEST(DBTest, ApproximateSizes) { options.write_buffer_size = 100000000; // Large write buffer options.compression = kNoCompression; DestroyAndReopen(); + CreateAndReopenWithCF({"pikachu"}, &options); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - Reopen(&options); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); + ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + 
ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); const int N = 80; static const int S1 = 100000; static const int S2 = 105000; // Allow some expansion from metadata Random rnd(301); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, S1))); + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1))); } // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0)); // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { - Reopen(&options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); for (int compact_start = 0; compact_start < N; compact_start += 10) { for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i)); - ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1))); - ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10)); + ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); + ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), + S2 * (i + 1))); + ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); } - ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50)); - ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50)); + ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); + ASSERT_TRUE( + Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); std::string cstart_str = Key(compact_start); std::string cend_str = Key(compact_start + 9); Slice cstart = cstart_str; Slice cend = cend_str; - dbfull()->TEST_CompactRange(0, &cstart, &cend); + dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]); } - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); } // ApproximateOffsetOf() is not yet implemented in plain table format. 
} while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); @@ -4028,36 +4197,36 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { do { Options options = CurrentOptions(); options.compression = kNoCompression; - Reopen(); + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(2), big1)); - ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(4), big1)); - ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(2), big1)); + ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(4), big1)); + ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000))); // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { - Reopen(&options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); - ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000)); + ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); + 
ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000)); - ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); + ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); } // ApproximateOffsetOf() is not yet implemented in plain table format. } while (ChangeOptions(kSkipPlainTable)); @@ -4065,17 +4234,19 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { TEST(DBTest, IteratorPinsRef) { do { - Put("foo", "hello"); + CreateAndReopenWithCF({"pikachu"}); + Put(1, "foo", "hello"); // Get iterator that will yield the current contents of the DB. - Iterator* iter = db_->NewIterator(ReadOptions()); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); // Write to force compactions - Put("foo", "newvalue1"); + Put(1, "foo", "newvalue1"); for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values + // 100K values + ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); } - Put("foo", "newvalue2"); + Put(1, "foo", "newvalue2"); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -4089,61 +4260,77 @@ TEST(DBTest, IteratorPinsRef) { TEST(DBTest, Snapshot) { do { - Put("foo", "v1"); + CreateAndReopenWithCF({"pikachu"}); + Put(0, "foo", "0v1"); + Put(1, "foo", "1v1"); const Snapshot* s1 = db_->GetSnapshot(); - Put("foo", "v2"); + Put(0, "foo", "0v2"); + Put(1, "foo", "1v2"); const Snapshot* s2 = db_->GetSnapshot(); - Put("foo", "v3"); + Put(0, "foo", "0v3"); + Put(1, "foo", "1v3"); const Snapshot* s3 = db_->GetSnapshot(); - Put("foo", "v4"); - ASSERT_EQ("v1", Get("foo", s1)); - 
ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v3", Get("foo", s3)); - ASSERT_EQ("v4", Get("foo")); + Put(0, "foo", "0v4"); + Put(1, "foo", "1v4"); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v3", Get(0, "foo", s3)); + ASSERT_EQ("1v3", Get(1, "foo", s3)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); db_->ReleaseSnapshot(s3); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); db_->ReleaseSnapshot(s1); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); db_->ReleaseSnapshot(s2); - ASSERT_EQ("v4", Get("foo")); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); } while (ChangeOptions()); } TEST(DBTest, HiddenValuesAreRemoved) { do { + CreateAndReopenWithCF({"pikachu"}); Random rnd(301); - FillLevels("a", "z"); + FillLevels("a", "z", 1); std::string big = RandomString(&rnd, 50000); - Put("foo", big); - Put("pastfoo", "v"); + Put(1, "foo", big); + Put(1, "pastfoo", "v"); const Snapshot* snapshot = db_->GetSnapshot(); - Put("foo", "tiny"); - Put("pastfoo2", "v2"); // Advance sequence number one more + Put(1, "foo", "tiny"); + Put(1, "pastfoo2", "v2"); // Advance sequence number one more - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_OK(Flush(1)); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); - ASSERT_EQ(big, Get("foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); + ASSERT_EQ(big, Get(1, "foo", snapshot)); 
+ ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); Slice x("x"); - dbfull()->TEST_CompactRange(0, nullptr, &x); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GE(NumTableFilesAtLevel(1), 1); - dbfull()->TEST_CompactRange(1, nullptr, &x); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); + dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); + dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); + ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); // ApproximateOffsetOf() is not yet implemented in plain table format, // which is used by Size(). } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable)); @@ -4151,202 +4338,208 @@ TEST(DBTest, HiddenValuesAreRemoved) { TEST(DBTest, CompactBetweenSnapshots) { do { + CreateAndReopenWithCF({"pikachu"}); Random rnd(301); - FillLevels("a", "z"); + FillLevels("a", "z", 1); - Put("foo", "first"); + Put(1, "foo", "first"); const Snapshot* snapshot1 = db_->GetSnapshot(); - Put("foo", "second"); - Put("foo", "third"); - Put("foo", "fourth"); + Put(1, "foo", "second"); + Put(1, "foo", "third"); + Put(1, "foo", "fourth"); const Snapshot* snapshot2 = db_->GetSnapshot(); - Put("foo", "fifth"); - Put("foo", "sixth"); + Put(1, "foo", "fifth"); + Put(1, "foo", "sixth"); // All entries (including duplicates) exist // before any compaction is triggered. 
- ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ("sixth", Get("foo")); - ASSERT_EQ("fourth", Get("foo", snapshot2)); - ASSERT_EQ("first", Get("foo", snapshot1)); - ASSERT_EQ(AllEntriesFor("foo"), + ASSERT_OK(Flush(1)); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ("first", Get(1, "foo", snapshot1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fifth, fourth, third, second, first ]"); // After a compaction, "second", "third" and "fifth" should // be removed - FillLevels("a", "z"); - dbfull()->CompactRange(nullptr, nullptr); - ASSERT_EQ("sixth", Get("foo")); - ASSERT_EQ("fourth", Get("foo", snapshot2)); - ASSERT_EQ("first", Get("foo", snapshot1)); - ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth, first ]"); + FillLevels("a", "z", 1); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ("first", Get(1, "foo", snapshot1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); // after we release the snapshot1, only two values left db_->ReleaseSnapshot(snapshot1); - FillLevels("a", "z"); - dbfull()->CompactRange(nullptr, nullptr); + FillLevels("a", "z", 1); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); // We have only one valid snapshot snapshot2. Since snapshot1 is // not valid anymore, "first" should be removed by a compaction. 
- ASSERT_EQ("sixth", Get("foo")); - ASSERT_EQ("fourth", Get("foo", snapshot2)); - ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth ]"); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); // after we release the snapshot2, only one value should be left db_->ReleaseSnapshot(snapshot2); - FillLevels("a", "z"); - dbfull()->CompactRange(nullptr, nullptr); - ASSERT_EQ("sixth", Get("foo")); - ASSERT_EQ(AllEntriesFor("foo"), "[ sixth ]"); + FillLevels("a", "z", 1); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); } while (ChangeOptions()); } TEST(DBTest, DeletionMarkers1) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - const int last = dbfull()->MaxMemCompactionLevel(); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + CreateAndReopenWithCF({"pikachu"}); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + const int last = CurrentOptions().max_mem_compaction_level; + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); // Place a table at level last-1 to prevent merging with preceding mutation - Put("a", "begin"); - Put("z", "end"); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); - ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - Delete("foo"); - Put("foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Moves to level last-2 + Delete(1, "foo"); + Put(1, "foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 if (CurrentOptions().purge_redundant_kvs_while_flush) { - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); 
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); } else { - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); } Slice z("z"); - dbfull()->TEST_CompactRange(last-2, nullptr, &z); + dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]); // DEL eliminated, but v1 remains because we aren't compacting that level // (DEL can be eliminated because v2 hides v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(last-1, nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); } TEST(DBTest, DeletionMarkers2) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - const int last = dbfull()->MaxMemCompactionLevel(); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level + CreateAndReopenWithCF({"pikachu"}); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + const int last = CurrentOptions().max_mem_compaction_level; + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); // Place a table at level last-1 to prevent merging with preceding mutation - Put("a", "begin"); - Put("z", "end"); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); - ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - Delete("foo"); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Moves to level last-2 - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-2, nullptr, nullptr); + Delete(1, "foo"); + 
ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]); // DEL kept: "last" file overlaps - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-1, nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); } TEST(DBTest, OverlapInLevel0) { do { - int tmp = dbfull()->MaxMemCompactionLevel(); + CreateAndReopenWithCF({"pikachu"}); + int tmp = CurrentOptions().max_mem_compaction_level; ASSERT_EQ(tmp, 2) << "Fix test to match config"; //Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. - ASSERT_OK(Put("100", "v100")); - ASSERT_OK(Put("999", "v999")); - dbfull()->TEST_FlushMemTable(); - ASSERT_OK(Delete("100")); - ASSERT_OK(Delete("999")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("0,1,1", FilesPerLevel()); + ASSERT_OK(Put(1, "100", "v100")); + ASSERT_OK(Put(1, "999", "v999")); + Flush(1); + ASSERT_OK(Delete(1, "100")); + ASSERT_OK(Delete(1, "999")); + Flush(1); + ASSERT_EQ("0,1,1", FilesPerLevel(1)); // Make files spanning the following ranges in level-0: // files[0] 200 .. 900 // files[1] 300 .. 500 // Note that files are sorted by smallest key. 
- ASSERT_OK(Put("300", "v300")); - ASSERT_OK(Put("500", "v500")); - dbfull()->TEST_FlushMemTable(); - ASSERT_OK(Put("200", "v200")); - ASSERT_OK(Put("600", "v600")); - ASSERT_OK(Put("900", "v900")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("2,1,1", FilesPerLevel()); + ASSERT_OK(Put(1, "300", "v300")); + ASSERT_OK(Put(1, "500", "v500")); + Flush(1); + ASSERT_OK(Put(1, "200", "v200")); + ASSERT_OK(Put(1, "600", "v600")); + ASSERT_OK(Put(1, "900", "v900")); + Flush(1); + ASSERT_EQ("2,1,1", FilesPerLevel(1)); // Compact away the placeholder files we created initially - dbfull()->TEST_CompactRange(1, nullptr, nullptr); - dbfull()->TEST_CompactRange(2, nullptr, nullptr); - ASSERT_EQ("2", FilesPerLevel()); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]); + ASSERT_EQ("2", FilesPerLevel(1)); // Do a memtable compaction. Before bug-fix, the compaction would // not detect the overlap with level-0 files and would incorrectly place // the deletion in a deeper level. 
- ASSERT_OK(Delete("600")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("3", FilesPerLevel()); - ASSERT_EQ("NOT_FOUND", Get("600")); + ASSERT_OK(Delete(1, "600")); + Flush(1); + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ("NOT_FOUND", Get(1, "600")); } while (ChangeOptions(kSkipUniversalCompaction)); } TEST(DBTest, L0_CompactionBug_Issue44_a) { do { - Reopen(); - ASSERT_OK(Put("b", "v")); - Reopen(); - ASSERT_OK(Delete("b")); - ASSERT_OK(Delete("a")); - Reopen(); - ASSERT_OK(Delete("a")); - Reopen(); - ASSERT_OK(Put("a", "v")); - Reopen(); - Reopen(); - ASSERT_EQ("(a->v)", Contents()); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "b", "v")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_OK(Delete(1, "b")); + ASSERT_OK(Delete(1, "a")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_OK(Delete(1, "a")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_OK(Put(1, "a", "v")); + ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("(a->v)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - ASSERT_EQ("(a->v)", Contents()); + ASSERT_EQ("(a->v)", Contents(1)); } while (ChangeCompactOptions()); } TEST(DBTest, L0_CompactionBug_Issue44_b) { do { - Reopen(); - Put("",""); - Reopen(); - Delete("e"); - Put("",""); - Reopen(); - Put("c", "cv"); - Reopen(); - Put("",""); - Reopen(); - Put("",""); + CreateAndReopenWithCF({"pikachu"}); + Put(1, "", ""); + ReopenWithColumnFamilies({"default", "pikachu"}); + Delete(1, "e"); + Put(1, "", ""); + ReopenWithColumnFamilies({"default", "pikachu"}); + Put(1, "c", "cv"); + ReopenWithColumnFamilies({"default", "pikachu"}); + Put(1, "", ""); + ReopenWithColumnFamilies({"default", "pikachu"}); + Put(1, "", ""); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - Reopen(); - Put("d","dv"); - Reopen(); - Put("",""); - Reopen(); - Delete("d"); - Delete("b"); - Reopen(); - 
ASSERT_EQ("(->)(c->cv)", Contents()); + ReopenWithColumnFamilies({"default", "pikachu"}); + Put(1, "d", "dv"); + ReopenWithColumnFamilies({"default", "pikachu"}); + Put(1, "", ""); + ReopenWithColumnFamilies({"default", "pikachu"}); + Delete(1, "d"); + Delete(1, "b"); + ReopenWithColumnFamilies({"default", "pikachu"}); + ASSERT_EQ("(->)(c->cv)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - ASSERT_EQ("(->)(c->cv)", Contents()); + ASSERT_EQ("(->)(c->cv)", Contents(1)); } while (ChangeCompactOptions()); } @@ -4364,12 +4557,16 @@ TEST(DBTest, ComparatorCheck) { BytewiseComparator()->FindShortSuccessor(key); } }; - Options new_options; + Options new_options, options; NewComparator cmp; do { + CreateAndReopenWithCF({"pikachu"}); + options = CurrentOptions(); new_options = CurrentOptions(); new_options.comparator = &cmp; - Status s = TryReopen(&new_options); + // only the non-default column family has non-matching comparator + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, + {&options, &new_options}); ASSERT_TRUE(!s.ok()); ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) << s.ToString(); @@ -4411,70 +4608,73 @@ TEST(DBTest, CustomComparator) { new_options.filter_policy = nullptr; // Cannot use bloom filters new_options.write_buffer_size = 1000; // Compact more often DestroyAndReopen(&new_options); - ASSERT_OK(Put("[10]", "ten")); - ASSERT_OK(Put("[0x14]", "twenty")); + CreateAndReopenWithCF({"pikachu"}, &new_options); + ASSERT_OK(Put(1, "[10]", "ten")); + ASSERT_OK(Put(1, "[0x14]", "twenty")); for (int i = 0; i < 2; i++) { - ASSERT_EQ("ten", Get("[10]")); - ASSERT_EQ("ten", Get("[0xa]")); - ASSERT_EQ("twenty", Get("[20]")); - ASSERT_EQ("twenty", Get("[0x14]")); - ASSERT_EQ("NOT_FOUND", Get("[15]")); - ASSERT_EQ("NOT_FOUND", Get("[0xf]")); - Compact("[0]", "[9999]"); + ASSERT_EQ("ten", Get(1, "[10]")); + ASSERT_EQ("ten", Get(1, "[0xa]")); + ASSERT_EQ("twenty", Get(1, "[20]")); + ASSERT_EQ("twenty", 
Get(1, "[0x14]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[15]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]")); + Compact(1, "[0]", "[9999]"); } for (int run = 0; run < 2; run++) { for (int i = 0; i < 1000; i++) { char buf[100]; snprintf(buf, sizeof(buf), "[%d]", i*10); - ASSERT_OK(Put(buf, buf)); + ASSERT_OK(Put(1, buf, buf)); } - Compact("[0]", "[1000000]"); + Compact(1, "[0]", "[1000000]"); } } while (ChangeCompactOptions(&new_options)); } TEST(DBTest, ManualCompaction) { + CreateAndReopenWithCF({"pikachu"}); ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; // iter - 0 with 7 levels // iter - 1 with 3 levels for (int iter = 0; iter < 2; ++iter) { - MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); // Compaction range falls before files - Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + Compact(1, "", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); // Compaction range falls after files - Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + Compact(1, "r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); // Compaction range overlaps files - Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); + Compact(1, "p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); // Populate a different range - MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); + MakeTables(3, "c", "e", 1); + ASSERT_EQ("1,1,2", FilesPerLevel(1)); // Compact just the new range - Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); + Compact(1, "b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel(1)); // Compact all - MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); - db_->CompactRange(nullptr, nullptr); - ASSERT_EQ("0,0,1", FilesPerLevel()); + MakeTables(1, "a", "z", 1); + ASSERT_EQ("0,1,2", FilesPerLevel(1)); + db_->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); if (iter == 0) { Options 
options = CurrentOptions(); options.num_levels = 3; options.create_if_missing = true; DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); } } @@ -4520,26 +4720,22 @@ TEST(DBTest, DBOpen_Options) { } TEST(DBTest, DBOpen_Change_NumLevels) { - std::string dbname = test::TmpDir() + "/db_change_num_levels"; - ASSERT_OK(DestroyDB(dbname, Options())); Options opts; - Status s; - DB* db = nullptr; opts.create_if_missing = true; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != nullptr); - db->Put(WriteOptions(), "a", "123"); - db->Put(WriteOptions(), "b", "234"); - db->CompactRange(nullptr, nullptr); - delete db; - db = nullptr; + DestroyAndReopen(&opts); + ASSERT_TRUE(db_ != nullptr); + CreateAndReopenWithCF({"pikachu"}, &opts); + + ASSERT_OK(Put(1, "a", "123")); + ASSERT_OK(Put(1, "b", "234")); + db_->CompactRange(handles_[1], nullptr, nullptr); + Close(); opts.create_if_missing = false; opts.num_levels = 2; - s = DB::Open(opts, dbname, &db); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &opts); ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); - ASSERT_TRUE(db == nullptr); + ASSERT_TRUE(db_ == nullptr); } TEST(DBTest, DestroyDBMetaDatabase) { @@ -4581,6 +4777,7 @@ TEST(DBTest, NoSpace) { do { Options options = CurrentOptions(); options.env = env_; + options.paranoid_checks = false; Reopen(&options); ASSERT_OK(Put("foo", "v1")); @@ -4689,7 +4886,7 @@ TEST(DBTest, ManifestWriteError) { ASSERT_EQ("bar", Get("foo")); // Memtable compaction (will succeed) - dbfull()->TEST_FlushMemTable(); + Flush(); ASSERT_EQ("bar", Get("foo")); const int last = dbfull()->MaxMemCompactionLevel(); ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level @@ -4718,45 +4915,48 @@ TEST(DBTest, PutFailsParanoid) { options.error_if_exists = false; options.paranoid_checks = true; DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); Status s; - ASSERT_OK(Put("foo", "bar")); 
- ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.Release_Store(env_); - s = Put("foo2", "bar2"); + s = Put(1, "foo2", "bar2"); ASSERT_TRUE(!s.ok()); env_->log_write_error_.Release_Store(nullptr); - s = Put("foo3", "bar3"); + s = Put(1, "foo3", "bar3"); // the next put should fail, too ASSERT_TRUE(!s.ok()); // but we're still able to read - ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ("bar", Get(1, "foo")); // do the same thing with paranoid checks off options.paranoid_checks = false; DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.Release_Store(env_); - s = Put("foo2", "bar2"); + s = Put(1, "foo2", "bar2"); ASSERT_TRUE(!s.ok()); env_->log_write_error_.Release_Store(nullptr); - s = Put("foo3", "bar3"); + s = Put(1, "foo3", "bar3"); // the next put should NOT fail ASSERT_TRUE(s.ok()); } TEST(DBTest, FilesDeletedAfterCompaction) { do { - ASSERT_OK(Put("foo", "v2")); - Compact("a", "z"); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "foo", "v2")); + Compact(1, "a", "z"); const int num_files = CountLiveFiles(); for (int i = 0; i < 10; i++) { - ASSERT_OK(Put("foo", "v2")); - Compact("a", "z"); + ASSERT_OK(Put(1, "foo", "v2")); + Compact(1, "a", "z"); } ASSERT_EQ(CountLiveFiles(), num_files); } while (ChangeCompactOptions()); @@ -4769,18 +4969,18 @@ TEST(DBTest, BloomFilter) { options.env = env_; options.no_block_cache = true; options.filter_policy = NewBloomFilterPolicy(10); - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Populate multiple layers const int N = 10000; for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), Key(i))); + ASSERT_OK(Put(1, Key(i), Key(i))); } - Compact("a", "z"); + Compact(1, "a", "z"); for (int i = 0; i < N; i += 
100) { - ASSERT_OK(Put(Key(i), Key(i))); + ASSERT_OK(Put(1, Key(i), Key(i))); } - dbfull()->TEST_FlushMemTable(); + Flush(1); // Prevent auto compactions triggered by seeks env_->delay_sstable_sync_.Release_Store(env_); @@ -4788,7 +4988,7 @@ TEST(DBTest, BloomFilter) { // Lookup present keys. Should rarely read from small sstable. env_->random_read_counter_.Reset(); for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i), Get(Key(i))); + ASSERT_EQ(Key(i), Get(1, Key(i))); } int reads = env_->random_read_counter_.Read(); fprintf(stderr, "%d present => %d reads\n", N, reads); @@ -4798,7 +4998,7 @@ TEST(DBTest, BloomFilter) { // Lookup present keys. Should rarely read from either sstable. env_->random_read_counter_.Reset(); for (int i = 0; i < N; i++) { - ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing")); + ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing")); } reads = env_->random_read_counter_.Read(); fprintf(stderr, "%d missing => %d reads\n", N, reads); @@ -4814,20 +5014,20 @@ TEST(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); options.write_buffer_size = 100000000; // Large write buffer - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; for (int i = 0; i < 80; i++) { values.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put((i < 40), Key(i), values[i])); } // assert that nothing makes it to disk yet. 
- ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); // get a file snapshot uint64_t manifest_number = 0; @@ -4836,8 +5036,8 @@ TEST(DBTest, SnapshotFiles) { dbfull()->DisableFileDeletions(); dbfull()->GetLiveFiles(files, &manifest_size); - // CURRENT, MANIFEST, *.sst files - ASSERT_EQ(files.size(), 3U); + // CURRENT, MANIFEST, *.sst files (one for each CF) + ASSERT_EQ(files.size(), 4U); uint64_t number = 0; FileType type; @@ -4878,22 +5078,30 @@ TEST(DBTest, SnapshotFiles) { std::vector extras; for (unsigned int i = 0; i < 1; i++) { extras.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(Key(i), extras[i])); + ASSERT_OK(Put(0, Key(i), extras[i])); } // verify that data in the snapshot are correct - Options opts; + std::vector column_families; + column_families.emplace_back("default", ColumnFamilyOptions()); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); + std::vector cf_handles; DB* snapdb; + DBOptions opts; opts.create_if_missing = false; - Status stat = DB::Open(opts, snapdir, &snapdb); + Status stat = + DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); ASSERT_OK(stat); ReadOptions roptions; std::string val; for (unsigned int i = 0; i < 80; i++) { - stat = snapdb->Get(roptions, Key(i), &val); + stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val); ASSERT_EQ(values[i].compare(val), 0); } + for (auto cfh : cf_handles) { + delete cfh; + } delete snapdb; // look at the new live files after we added an 'extra' key @@ -4936,83 +5144,83 @@ TEST(DBTest, CompactOnFlush) { Options options = CurrentOptions(); options.purge_redundant_kvs_while_flush = true; options.disable_auto_compactions = true; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ v1 ]"); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); // Write two new keys - 
Put("a", "begin"); - Put("z", "end"); - dbfull()->TEST_FlushMemTable(); + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); // Case1: Delete followed by a put - Delete("foo"); - Put("foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); + Delete(1, "foo"); + Put(1, "foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); // After the current memtable is flushed, the DEL should // have been removed - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - dbfull()->CompactRange(nullptr, nullptr); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); // Case 2: Delete followed by another delete - Delete("foo"); - Delete("foo"); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, DEL, v2 ]"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v2 ]"); - dbfull()->CompactRange(nullptr, nullptr); - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + Delete(1, "foo"); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 3: Put followed by a delete - Put("foo", "v3"); - Delete("foo"); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v3 ]"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL ]"); - dbfull()->CompactRange(nullptr, nullptr); - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + Put(1, "foo", "v3"); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 4: Put followed by another Put - 
Put("foo", "v4"); - Put("foo", "v5"); - ASSERT_EQ(AllEntriesFor("foo"), "[ v5, v4 ]"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ v5 ]"); - dbfull()->CompactRange(nullptr, nullptr); - ASSERT_EQ(AllEntriesFor("foo"), "[ v5 ]"); + Put(1, "foo", "v4"); + Put(1, "foo", "v5"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); // clear database - Delete("foo"); - dbfull()->CompactRange(nullptr, nullptr); - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + Delete(1, "foo"); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 5: Put followed by snapshot followed by another Put // Both puts should remain. - Put("foo", "v6"); + Put(1, "foo", "v6"); const Snapshot* snapshot = db_->GetSnapshot(); - Put("foo", "v7"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ v7, v6 ]"); + Put(1, "foo", "v7"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]"); db_->ReleaseSnapshot(snapshot); // clear database - Delete("foo"); - dbfull()->CompactRange(nullptr, nullptr); - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); + Delete(1, "foo"); + dbfull()->CompactRange(handles_[1], nullptr, nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 5: snapshot followed by a put followed by another Put // Only the last put should remain. 
const Snapshot* snapshot1 = db_->GetSnapshot(); - Put("foo", "v8"); - Put("foo", "v9"); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ(AllEntriesFor("foo"), "[ v9 ]"); + Put(1, "foo", "v8"); + Put(1, "foo", "v9"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]"); db_->ReleaseSnapshot(snapshot1); } while (ChangeCompactOptions()); } @@ -5168,19 +5376,21 @@ TEST(DBTest, TransactionLogIterator) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(&options); - Put("key1", DummyString(1024)); - Put("key2", DummyString(1024)); - Put("key2", DummyString(1024)); + CreateAndReopenWithCF({"pikachu"}, &options); + Put(0, "key1", DummyString(1024)); + Put(1, "key2", DummyString(1024)); + Put(1, "key2", DummyString(1024)); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U); { auto iter = OpenTransactionLogIter(0); ExpectRecords(3, iter); } - Reopen(&options); - env_->SleepForMicroseconds(2 * 1000 * 1000);{ - Put("key4", DummyString(1024)); - Put("key5", DummyString(1024)); - Put("key6", DummyString(1024)); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + env_->SleepForMicroseconds(2 * 1000 * 1000); + { + Put(0, "key4", DummyString(1024)); + Put(1, "key5", DummyString(1024)); + Put(0, "key6", DummyString(1024)); } { auto iter = OpenTransactionLogIter(0); @@ -5189,17 +5399,63 @@ TEST(DBTest, TransactionLogIterator) { } while (ChangeCompactOptions()); } +TEST(DBTest, TransactionLogIteratorRace) { + // Setup sync point dependency to reproduce the race condition of + // a log file moved to archived dir, in the middle of GetSortedWalFiles + rocksdb::SyncPoint::GetInstance()->LoadDependency( + { { "DBImpl::GetSortedWalFiles:1", "DBImpl::PurgeObsoleteFiles:1" }, + { "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalFiles:2" }, + }); + + do { + rocksdb::SyncPoint::GetInstance()->ClearTrace(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + Options options = OptionsForLogIterTest(); + DestroyAndReopen(&options); + 
Put("key1", DummyString(1024)); + dbfull()->Flush(FlushOptions()); + Put("key2", DummyString(1024)); + dbfull()->Flush(FlushOptions()); + Put("key3", DummyString(1024)); + dbfull()->Flush(FlushOptions()); + Put("key4", DummyString(1024)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U); + + { + auto iter = OpenTransactionLogIter(0); + ExpectRecords(4, iter); + } + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // trigger async flush, and log move. Well, log move will + // wait until the GetSortedWalFiles:1 to reproduce the race + // condition + FlushOptions flush_options; + flush_options.wait = false; + dbfull()->Flush(flush_options); + + // "key5" would be written in a new memtable and log + Put("key5", DummyString(1024)); + { + // this iter would miss "key4" if not fixed + auto iter = OpenTransactionLogIter(0); + ExpectRecords(5, iter); + } + } while (ChangeCompactOptions()); +} + TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Do a plain Reopen. - Put("key1", DummyString(1024)); + Put(1, "key1", DummyString(1024)); // Two reopens should create a zero record WAL file. 
- Reopen(&options); - Reopen(&options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); - Put("key2", DummyString(1024)); + Put(1, "key2", DummyString(1024)); auto iter = OpenTransactionLogIter(0); ExpectRecords(2, iter); @@ -5285,15 +5541,17 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); WriteBatch batch; - batch.Put("key1", DummyString(1024)); - batch.Put("key2", DummyString(1024)); - batch.Put("key3", DummyString(1024)); - batch.Delete("key2"); + batch.Put(handles_[1], "key1", DummyString(1024)); + batch.Put(handles_[0], "key2", DummyString(1024)); + batch.Put(handles_[1], "key3", DummyString(1024)); + batch.Delete(handles_[0], "key2"); dbfull()->Write(WriteOptions(), &batch); - dbfull()->Flush(FlushOptions()); - Reopen(&options); - Put("key4", DummyString(1024)); + Flush(1); + Flush(0); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + Put(1, "key4", DummyString(1024)); auto iter = OpenTransactionLogIter(3); ExpectRecords(2, iter); } while (ChangeCompactOptions()); @@ -5302,43 +5560,49 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) { TEST(DBTest, TransactionLogIteratorBlobs) { Options options = OptionsForLogIterTest(); DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); { WriteBatch batch; - batch.Put("key1", DummyString(1024)); - batch.Put("key2", DummyString(1024)); + batch.Put(handles_[1], "key1", DummyString(1024)); + batch.Put(handles_[0], "key2", DummyString(1024)); batch.PutLogData(Slice("blob1")); - batch.Put("key3", DummyString(1024)); + batch.Put(handles_[1], "key3", DummyString(1024)); batch.PutLogData(Slice("blob2")); - batch.Delete("key2"); + batch.Delete(handles_[0], "key2"); dbfull()->Write(WriteOptions(), &batch); - Reopen(&options); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); 
} auto res = OpenTransactionLogIter(0)->GetBatch(); struct Handler : public WriteBatch::Handler { std::string seen; - virtual void Put(const Slice& key, const Slice& value) { - seen += "Put(" + key.ToString() + ", " + std::to_string(value.size()) + - ")"; + virtual Status PutCF(uint32_t cf, const Slice& key, const Slice& value) { + seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " + + std::to_string(value.size()) + ")"; + return Status::OK(); } - virtual void Merge(const Slice& key, const Slice& value) { - seen += "Merge(" + key.ToString() + ", " + std::to_string(value.size()) + - ")"; + virtual Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) { + seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " + + std::to_string(value.size()) + ")"; + return Status::OK(); } virtual void LogData(const Slice& blob) { seen += "LogData(" + blob.ToString() + ")"; } - virtual void Delete(const Slice& key) { - seen += "Delete(" + key.ToString() + ")"; + virtual Status DeleteCF(uint32_t cf, const Slice& key) { + seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")"; + return Status::OK(); } } handler; res.writeBatchPtr->Iterate(&handler); - ASSERT_EQ("Put(key1, 1024)" - "Put(key2, 1024)" - "LogData(blob1)" - "Put(key3, 1024)" - "LogData(blob2)" - "Delete(key2)", handler.seen); + ASSERT_EQ( + "Put(1, key1, 1024)" + "Put(0, key2, 1024)" + "LogData(blob1)" + "Put(1, key3, 1024)" + "LogData(blob2)" + "Delete(0, key2)", + handler.seen); } TEST(DBTest, ReadCompaction) { @@ -5354,43 +5618,43 @@ TEST(DBTest, ReadCompaction) { options.no_block_cache = true; options.disable_seek_compaction = false; - Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); // Write 8MB (2000 values, each 4K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; for (int i = 0; i < 2000; i++) { - ASSERT_OK(Put(Key(i), value)); + ASSERT_OK(Put(1, Key(i), value)); } // clear level 0 and 1 
if necessary. - dbfull()->TEST_FlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); + Flush(1); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); // write some new keys into level 0 for (int i = 0; i < 2000; i = i + 16) { - ASSERT_OK(Put(Key(i), value)); + ASSERT_OK(Put(1, Key(i), value)); } - dbfull()->Flush(FlushOptions()); + Flush(1); // Wait for any write compaction to finish dbfull()->TEST_WaitForCompact(); // remember number of files in each level - int l1 = NumTableFilesAtLevel(0); - int l2 = NumTableFilesAtLevel(1); - int l3 = NumTableFilesAtLevel(3); - ASSERT_NE(NumTableFilesAtLevel(0), 0); - ASSERT_NE(NumTableFilesAtLevel(1), 0); - ASSERT_NE(NumTableFilesAtLevel(2), 0); + int l1 = NumTableFilesAtLevel(0, 1); + int l2 = NumTableFilesAtLevel(1, 1); + int l3 = NumTableFilesAtLevel(2, 1); + ASSERT_NE(NumTableFilesAtLevel(0, 1), 0); + ASSERT_NE(NumTableFilesAtLevel(1, 1), 0); + ASSERT_NE(NumTableFilesAtLevel(2, 1), 0); // read a bunch of times, trigger read compaction for (int j = 0; j < 100; j++) { for (int i = 0; i < 2000; i++) { - Get(Key(i)); + Get(1, Key(i)); } } // wait for read compaction to finish @@ -5398,16 +5662,17 @@ TEST(DBTest, ReadCompaction) { // verify that the number of files have decreased // in some level, indicating that there was a compaction - ASSERT_TRUE(NumTableFilesAtLevel(0) < l1 || - NumTableFilesAtLevel(1) < l2 || - NumTableFilesAtLevel(2) < l3); + ASSERT_TRUE(NumTableFilesAtLevel(0, 1) < l1 || + NumTableFilesAtLevel(1, 1) < l2 || + NumTableFilesAtLevel(2, 1) < l3); } } // Multi-threaded test: namespace { -static const int kNumThreads = 4; +static const int kColumnFamilies = 10; +static const int kNumThreads = 10; 
static const int kTestSeconds = 10; static const int kNumKeys = 1000; @@ -5430,7 +5695,6 @@ static void MTThreadBody(void* arg) { uintptr_t counter = 0; fprintf(stderr, "... starting thread %d\n", id); Random rnd(1000 + id); - std::string value; char valbuf[1500]; while (t->state->stop.Acquire_Load() == nullptr) { t->state->counter[id].Release_Store(reinterpret_cast(counter)); @@ -5440,26 +5704,57 @@ static void MTThreadBody(void* arg) { snprintf(keybuf, sizeof(keybuf), "%016d", key); if (rnd.OneIn(2)) { - // Write values of the form . + // Write values of the form . + // into each of the CFs // We add some padding for force compactions. - snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d", - key, id, static_cast(counter)); - ASSERT_OK(t->state->test->Put(Slice(keybuf), Slice(valbuf))); + int unique_id = rnd.Uniform(1000000); + WriteBatch batch; + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), &batch)); } else { - // Read a value and verify that it matches the pattern written above. 
- Status s = db->Get(ReadOptions(), Slice(keybuf), &value); + // Read a value and verify that it matches the pattern written above + // and that writes to all column families were atomic (unique_id is the + // same) + std::vector keys(kColumnFamilies, Slice(keybuf)); + std::vector values; + std::vector statuses = + db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values); + Status s = statuses[0]; + // all statuses have to be the same + for (size_t i = 1; i < statuses.size(); ++i) { + // they are either both ok or both not-found + ASSERT_TRUE((s.ok() && statuses[i].ok()) || + (s.IsNotFound() && statuses[i].IsNotFound())); + } if (s.IsNotFound()) { // Key has not yet been written } else { // Check that the writer thread counter is >= the counter in the value ASSERT_OK(s); - int k, w, c; - ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value; - ASSERT_EQ(k, key); - ASSERT_GE(w, 0); - ASSERT_LT(w, kNumThreads); - ASSERT_LE((unsigned int)c, reinterpret_cast( - t->state->counter[w].Acquire_Load())); + int unique_id = -1; + for (int i = 0; i < kColumnFamilies; ++i) { + int k, w, c, cf, u; + ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, + &c, &cf, &u)) + << values[i]; + ASSERT_EQ(k, key); + ASSERT_GE(w, 0); + ASSERT_LT(w, kNumThreads); + ASSERT_LE((unsigned int)c, reinterpret_cast( + t->state->counter[w].Acquire_Load())); + ASSERT_EQ(cf, i); + if (i == 0) { + unique_id = u; + } else { + // this checks that updates across column families happened + // atomically -- all unique ids are the same + ASSERT_EQ(u, unique_id); + } + } } } counter++; @@ -5472,6 +5767,11 @@ static void MTThreadBody(void* arg) { TEST(DBTest, MultiThreaded) { do { + std::vector cfs; + for (int i = 1; i < kColumnFamilies; ++i) { + cfs.push_back(std::to_string(i)); + } + CreateAndReopenWithCF(cfs); // Initialize state MTState mt; mt.test = this; @@ -5531,6 +5831,10 @@ static void GCThreadBody(void* arg) { TEST(DBTest, GroupCommitTest) { do { + Options 
options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + Reopen(&options); + // Start threads GCThread thread[kGCNumThreads]; for (int id = 0; id < kGCNumThreads; id++) { @@ -5545,6 +5849,7 @@ TEST(DBTest, GroupCommitTest) { env_->SleepForMicroseconds(100000); } } + ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0); std::vector expected_db; for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { @@ -5577,41 +5882,62 @@ class ModelDB: public DB { KVMap map_; }; - explicit ModelDB(const Options& options): options_(options) { } - virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { - return DB::Put(o, k, v); + explicit ModelDB(const Options& options) : options_(options) {} + using DB::Put; + virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& k, const Slice& v) { + WriteBatch batch; + batch.Put(cf, k, v); + return Write(o, &batch); } - virtual Status Merge(const WriteOptions& o, const Slice& k, const Slice& v) { - return DB::Merge(o, k, v); + using DB::Merge; + virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& k, const Slice& v) { + WriteBatch batch; + batch.Merge(cf, k, v); + return Write(o, &batch); } - virtual Status Delete(const WriteOptions& o, const Slice& key) { - return DB::Delete(o, key); + using DB::Delete; + virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) { + WriteBatch batch; + batch.Delete(cf, key); + return Write(o, &batch); } - virtual Status Get(const ReadOptions& options, + using DB::Get; + virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf, const Slice& key, std::string* value) { return Status::NotSupported(key); } - virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values) { + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, 
std::vector* values) { std::vector s(keys.size(), Status::NotSupported("Not implemented.")); return s; } - virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) { return Status(); } + + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, - const Slice& key, - std::string* value, - bool* value_found = nullptr) { + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) { if (value_found != nullptr) { *value_found = false; } return true; // Not Supported directly } - virtual Iterator* NewIterator(const ReadOptions& options) { + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) { if (options.snapshot == nullptr) { KVMap* saved = new KVMap; *saved = map_; @@ -5622,6 +5948,12 @@ class ModelDB: public DB { return new ModelIter(snapshot_state, false); } } + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_family, + std::vector* iterators) { + return Status::NotSupported("Not supported yet"); + } virtual const Snapshot* GetSnapshot() { ModelSnapshot* snapshot = new ModelSnapshot; snapshot->map_ = map_; @@ -5651,32 +5983,36 @@ class ModelDB: public DB { return batch->Iterate(&handler); } - virtual bool GetProperty(const Slice& property, std::string* value) { + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) { return false; } - virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { + using DB::GetApproximateSizes; + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) { for (int i = 0; i < n; i++) { sizes[i] = 0; } } - virtual Status CompactRange(const Slice* start, 
const Slice* end, + using DB::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* start, const Slice* end, bool reduce_level, int target_level) { return Status::NotSupported("Not supported operation."); } - virtual int NumberLevels() - { - return 1; + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) { return 1; } + + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { + return 1; } - virtual int MaxMemCompactionLevel() - { - return 1; - } - - virtual int Level0StopWriteTrigger() - { - return -1; + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { + return -1; } virtual const std::string& GetName() const { @@ -5687,11 +6023,14 @@ class ModelDB: public DB { return nullptr; } - virtual const Options& GetOptions() const { + using DB::GetOptions; + virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const { return options_; } - virtual Status Flush(const rocksdb::FlushOptions& options) { + using DB::Flush; + virtual Status Flush(const rocksdb::FlushOptions& options, + ColumnFamilyHandle* column_family) { Status ret; return ret; } @@ -5729,6 +6068,8 @@ class ModelDB: public DB { return Status::NotSupported("Not supported in Model DB"); } + virtual ColumnFamilyHandle* DefaultColumnFamily() const { return nullptr; } + private: class ModelIter: public Iterator { public: @@ -5898,26 +6239,22 @@ TEST(DBTest, Randomized) { TEST(DBTest, MultiGetSimple) { do { - ASSERT_OK(db_->Put(WriteOptions(),"k1","v1")); - ASSERT_OK(db_->Put(WriteOptions(),"k2","v2")); - ASSERT_OK(db_->Put(WriteOptions(),"k3","v3")); - ASSERT_OK(db_->Put(WriteOptions(),"k4","v4")); - ASSERT_OK(db_->Delete(WriteOptions(),"k4")); - ASSERT_OK(db_->Put(WriteOptions(),"k5","v5")); - ASSERT_OK(db_->Delete(WriteOptions(),"no_key")); + CreateAndReopenWithCF({"pikachu"}); + ASSERT_OK(Put(1, "k1", "v1")); + 
ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Put(1, "k3", "v3")); + ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Delete(1, "k4")); + ASSERT_OK(Put(1, "k5", "v5")); + ASSERT_OK(Delete(1, "no_key")); - std::vector keys(6); - keys[0] = "k1"; - keys[1] = "k2"; - keys[2] = "k3"; - keys[3] = "k4"; - keys[4] = "k5"; - keys[5] = "no_key"; + std::vector keys({"k1", "k2", "k3", "k4", "k5", "no_key"}); - std::vector values(20,"Temporary data to be overwritten"); + std::vector values(20, "Temporary data to be overwritten"); + std::vector cfs(keys.size(), handles_[1]); - std::vector s = db_->MultiGet(ReadOptions(),keys,&values); - ASSERT_EQ(values.size(),keys.size()); + std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(values.size(), keys.size()); ASSERT_EQ(values[0], "v1"); ASSERT_EQ(values[1], "v2"); ASSERT_EQ(values[2], "v3"); @@ -5934,22 +6271,27 @@ TEST(DBTest, MultiGetSimple) { TEST(DBTest, MultiGetEmpty) { do { + CreateAndReopenWithCF({"pikachu"}); // Empty Key Set std::vector keys; std::vector values; - std::vector s = db_->MultiGet(ReadOptions(),keys,&values); - ASSERT_EQ((int)s.size(),0); + std::vector cfs; + std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(s.size(), 0U); // Empty Database, Empty Key Set DestroyAndReopen(); - s = db_->MultiGet(ReadOptions(), keys, &values); - ASSERT_EQ((int)s.size(),0); + CreateAndReopenWithCF({"pikachu"}); + s = db_->MultiGet(ReadOptions(), cfs, keys, &values); + ASSERT_EQ(s.size(), 0U); // Empty Database, Search for Keys keys.resize(2); keys[0] = "a"; keys[1] = "b"; - s = db_->MultiGet(ReadOptions(),keys,&values); + cfs.push_back(handles_[0]); + cfs.push_back(handles_[1]); + s = db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ((int)s.size(), 2); ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound()); } while (ChangeCompactOptions()); @@ -5977,7 +6319,7 @@ void PrefixScanInit(DBTest *dbtest) { snprintf(buf, sizeof(buf), "%02d______:end", 10); keystr = std::string(buf); 
ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->dbfull()->TEST_FlushMemTable(); + dbtest->Flush(); dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1 // GROUP 1 @@ -5988,7 +6330,7 @@ void PrefixScanInit(DBTest *dbtest) { snprintf(buf, sizeof(buf), "%02d______:end", i+1); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->dbfull()->TEST_FlushMemTable(); + dbtest->Flush(); } // GROUP 2 @@ -6001,7 +6343,7 @@ void PrefixScanInit(DBTest *dbtest) { small_range_sstfiles+i+1); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->dbfull()->TEST_FlushMemTable(); + dbtest->Flush(); } } @@ -6110,11 +6452,13 @@ void BM_LogAndApply(int iters, int num_base_files) { port::Mutex mu; MutexLock l(&mu); - InternalKeyComparator cmp(BytewiseComparator()); Options options; EnvOptions sopt; - VersionSet vset(dbname, &options, sopt, nullptr, &cmp); - ASSERT_OK(vset.Recover()); + VersionSet vset(dbname, &options, sopt, nullptr); + std::vector dummy; + dummy.push_back(ColumnFamilyDescriptor()); + ASSERT_OK(vset.Recover(dummy)); + auto default_cfd = vset.GetColumnFamilySet()->GetDefault(); VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { @@ -6122,7 +6466,7 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); vbase.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1); } - ASSERT_OK(vset.LogAndApply(&vbase, &mu)); + ASSERT_OK(vset.LogAndApply(default_cfd, &vbase, &mu)); uint64_t start_micros = env->NowMicros(); @@ -6132,7 +6476,7 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); vedit.AddFile(2, fnum++, 1 /* file size */, start, limit, 1, 1); - vset.LogAndApply(&vedit, &mu); + vset.LogAndApply(default_cfd, &vedit, &mu); } uint64_t stop_micros = env->NowMicros(); unsigned int us = stop_micros - start_micros; @@ -6162,10 
+6506,11 @@ TEST(DBTest, TailingIteratorSingle) { } TEST(DBTest, TailingIteratorKeepAdding) { + CreateAndReopenWithCF({"pikachu"}); ReadOptions read_options; read_options.tailing = true; - std::unique_ptr iter(db_->NewIterator(read_options)); + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); std::string value(1024, 'a'); const int num_records = 10000; @@ -6174,7 +6519,7 @@ TEST(DBTest, TailingIteratorKeepAdding) { snprintf(buf, sizeof(buf), "%016d", i); Slice key(buf, 16); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); + ASSERT_OK(Put(1, key, value)); iter->Seek(key); ASSERT_TRUE(iter->Valid()); @@ -6183,17 +6528,18 @@ TEST(DBTest, TailingIteratorKeepAdding) { } TEST(DBTest, TailingIteratorDeletes) { + CreateAndReopenWithCF({"pikachu"}); ReadOptions read_options; read_options.tailing = true; - std::unique_ptr iter(db_->NewIterator(read_options)); + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); // write a single record, read it using the iterator, then delete it - ASSERT_OK(db_->Put(WriteOptions(), "0test", "test")); + ASSERT_OK(Put(1, "0test", "test")); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->key().ToString(), "0test"); - ASSERT_OK(db_->Delete(WriteOptions(), "0test")); + ASSERT_OK(Delete(1, "0test")); // write many more records const int num_records = 10000; @@ -6204,11 +6550,11 @@ TEST(DBTest, TailingIteratorDeletes) { snprintf(buf, sizeof(buf), "1%015d", i); Slice key(buf, 16); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); + ASSERT_OK(Put(1, key, value)); } // force a flush to make sure that no records are read from memtable - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Flush(1)); // skip "0test" iter->Next(); @@ -6232,13 +6578,14 @@ TEST(DBTest, TailingIteratorPrefixSeek) { options.prefix_extractor.reset(NewFixedPrefixTransform(2)); options.memtable_factory.reset(NewHashSkipListRepFactory()); DestroyAndReopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); - std::unique_ptr 
iter(db_->NewIterator(read_options)); - ASSERT_OK(db_->Put(WriteOptions(), "0101", "test")); + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(Put(1, "0101", "test")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Flush(1)); - ASSERT_OK(db_->Put(WriteOptions(), "0202", "test")); + ASSERT_OK(Put(1, "0202", "test")); // Seek(0102) shouldn't find any records since 0202 has a different prefix iter->Seek("0102"); diff --git a/db/dbformat.cc b/db/dbformat.cc index 2d35d0423..e53d16dc1 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -59,7 +59,7 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { // decreasing sequence number // decreasing type (though sequence# should be enough to disambiguate) int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - BumpPerfCount(&perf_context.user_key_comparison_count); + PERF_COUNTER_ADD(user_key_comparison_count, 1); if (r == 0) { const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); @@ -79,7 +79,7 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a, // decreasing sequence number // decreasing type (though sequence# should be enough to disambiguate) int r = user_comparator_->Compare(a.user_key, b.user_key); - BumpPerfCount(&perf_context.user_key_comparison_count); + PERF_COUNTER_ADD(user_key_comparison_count, 1); if (r == 0) { if (a.sequence > b.sequence) { r = -1; diff --git a/db/dbformat.h b/db/dbformat.h index 3c5ea6958..6ac53074a 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -32,6 +32,9 @@ enum ValueType : unsigned char { kTypeValue = 0x1, kTypeMerge = 0x2, kTypeLogData = 0x3, + kTypeColumnFamilyDeletion = 0x4, + kTypeColumnFamilyValue = 0x5, + kTypeColumnFamilyMerge = 0x6, kMaxValue = 0x7F }; @@ -235,4 +238,74 @@ inline LookupKey::~LookupKey() { if (start_ != space_) delete[] start_; } +class IterKey { + public: + IterKey() : key_(space_), 
buf_size_(sizeof(space_)), key_size_(0) {} + + ~IterKey() { Clear(); } + + Slice GetKey() const { + if (key_ != nullptr) { + return Slice(key_, key_size_); + } else { + return Slice(); + } + } + + bool Valid() const { return key_ != nullptr; } + + void Clear() { + if (key_ != nullptr && key_ != space_) { + delete[] key_; + } + key_ = space_; + buf_size_ = sizeof(buf_size_); + } + + // Enlarge the buffer size if needed based on key_size. + // By default, static allocated buffer is used. Once there is a key + // larger than the static allocated buffer, another buffer is dynamically + // allocated, until a larger key buffer is requested. In that case, we + // reallocate buffer and delete the old one. + void EnlargeBufferIfNeeded(size_t key_size) { + // If size is smaller than buffer size, continue using current buffer, + // or the static allocated one, as default + if (key_size > buf_size_) { + // Need to enlarge the buffer. + Clear(); + key_ = new char[key_size]; + buf_size_ = key_size; + } + key_size_ = key_size; + } + + void SetUserKey(const Slice& user_key) { + size_t size = user_key.size(); + EnlargeBufferIfNeeded(size); + memcpy(key_, user_key.data(), size); + } + + void SetInternalKey(const Slice& user_key, SequenceNumber s, + ValueType value_type = kValueTypeForSeek) { + size_t usize = user_key.size(); + EnlargeBufferIfNeeded(usize + sizeof(uint64_t)); + memcpy(key_, user_key.data(), usize); + EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type)); + } + + void SetInternalKey(const ParsedInternalKey& parsed_key) { + SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type); + } + + private: + char* key_; + size_t buf_size_; + size_t key_size_; + char space_[32]; // Avoid allocation for short keys + + // No copying allowed + IterKey(const IterKey&) = delete; + void operator=(const IterKey&) = delete; +}; + } // namespace rocksdb diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 4cc049965..fb5e9b229 100644 --- 
a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -7,8 +7,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/internal_stats.h" -#include "db/db_impl.h" -#include "db/memtable_list.h" +#include "db/column_family.h" #include @@ -44,10 +43,8 @@ DBPropertyType GetPropertyType(const Slice& property) { bool InternalStats::GetProperty(DBPropertyType property_type, const Slice& property, std::string* value, - DBImpl* db) { - VersionSet* version_set = db->versions_.get(); - Version* current = version_set->current(); - const MemTableList& imm = db->imm_; + ColumnFamilyData* cfd) { + Version* current = cfd->current(); Slice in = property; switch (property_type) { @@ -110,7 +107,6 @@ bool InternalStats::GetProperty(DBPropertyType property_type, write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL); } - // Pardon the long line but I think it is easier to read this way. snprintf( buf, sizeof(buf), " Compactions\n" @@ -159,7 +155,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type, "%9lu\n", level, files, current->NumLevelBytes(level) / 1048576.0, current->NumLevelBytes(level) / - version_set->MaxBytesForLevel(level), + cfd->compaction_picker()->MaxBytesForLevel(level), compaction_stats_[level].micros / 1e6, bytes_read / 1048576.0, compaction_stats_[level].bytes_written / 1048576.0, @@ -334,11 +330,11 @@ bool InternalStats::GetProperty(DBPropertyType property_type, *value = current->DebugString(); return true; case kNumImmutableMemTable: - *value = std::to_string(imm.size()); + *value = std::to_string(cfd->imm()->size()); return true; case kMemtableFlushPending: // Return number of mem tables that are ready to flush (made immutable) - *value = std::to_string(imm.IsFlushPending() ? 1 : 0); + *value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0); return true; case kCompactionPending: // 1 if the system already determines at least one compacdtion is needed. 
@@ -351,7 +347,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type, return true; case kCurSizeActiveMemTable: // Current size of the active memtable - *value = std::to_string(db->mem_->ApproximateMemoryUsage()); + *value = std::to_string(cfd->mem()->ApproximateMemoryUsage()); return true; default: return false; diff --git a/db/internal_stats.h b/db/internal_stats.h index e140e7280..616b6cc0d 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -16,6 +16,8 @@ #include #include +class ColumnFamilyData; + namespace rocksdb { class MemTableList; @@ -126,7 +128,7 @@ class InternalStats { uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } bool GetProperty(DBPropertyType property_type, const Slice& property, - std::string* value, DBImpl* db); + std::string* value, ColumnFamilyData* cfd); private: std::vector compaction_stats_; diff --git a/db/memtable.cc b/db/memtable.cc index b520fe25d..2d12708c3 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -29,7 +29,8 @@ namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) +MemTable::MemTable(const InternalKeyComparator& cmp, + const Options& options) : comparator_(cmp), refs_(0), kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), @@ -42,7 +43,6 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) file_number_(0), first_seqno_(0), mem_next_logfile_number_(0), - mem_logfile_number_(0), locks_(options.inplace_update_support ? options.inplace_update_num_locks : 0), prefix_extractor_(options.prefix_extractor.get()), @@ -142,6 +142,11 @@ Slice MemTableRep::UserKey(const char* key) const { return Slice(slice.data(), slice.size() - 8); } +KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { + *buf = arena_->Allocate(len); + return static_cast(*buf); +} + // Encode a suitable internal key target for "target" and return it. 
// Uses *scratch as scratch space, and the returned pointer will point // into this scratch space. @@ -243,7 +248,9 @@ void MemTable::Add(SequenceNumber s, ValueType type, const size_t encoded_len = VarintLength(internal_key_size) + internal_key_size + VarintLength(val_size) + val_size; - char* buf = arena_.Allocate(encoded_len); + char* buf = nullptr; + KeyHandle handle = table_->Allocate(encoded_len, &buf); + assert(buf != nullptr); char* p = EncodeVarint32(buf, internal_key_size); memcpy(p, key.data(), key_size); p += key_size; @@ -252,7 +259,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); - table_->Insert(buf); + table_->Insert(handle); if (prefix_bloom_) { assert(prefix_extractor_); @@ -370,8 +377,7 @@ static bool SaveValue(void* arg, const char* entry) { bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options) { - StopWatchNano memtable_get_timer(options.env, false); - StartPerfTimer(&memtable_get_timer); + PERF_TIMER_AUTO(get_from_memtable_time); Slice user_key = key.user_key(); bool found_final_value = false; @@ -401,8 +407,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (!found_final_value && merge_in_progress) { *s = Status::MergeInProgress(""); } - BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); - BumpPerfCount(&perf_context.get_from_memtable_count); + PERF_TIMER_STOP(get_from_memtable_time); + PERF_COUNTER_ADD(get_from_memtable_count, 1); return found_final_value; } diff --git a/db/memtable.h b/db/memtable.h index 451def38f..3d392820c 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -13,7 +13,7 @@ #include #include "db/dbformat.h" #include "db/skiplist.h" -#include "db/version_set.h" +#include "db/version_edit.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include 
"util/arena.h" @@ -39,7 +39,7 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator, - const Options& options = Options()); + const Options& options); ~MemTable(); @@ -147,14 +147,6 @@ class MemTable { // be flushed to storage void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } - // Returns the logfile number that can be safely deleted when this - // memstore is flushed to storage - uint64_t GetLogNumber() { return mem_logfile_number_; } - - // Sets the logfile number that can be safely deleted when this - // memstore is flushed to storage - void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; } - // Notify the underlying storage that no more items will be added void MarkImmutable() { table_->MarkReadOnly(); } @@ -197,10 +189,6 @@ class MemTable { // The log files earlier than this number can be deleted. uint64_t mem_next_logfile_number_; - // The log file that backs this memtable (to be deleted when - // memtable flush is done) - uint64_t mem_logfile_number_; - // rw locks for inplace updates std::vector locks_; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index ebda34802..655ded7f1 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -8,9 +8,11 @@ #include #include "rocksdb/db.h" #include "db/memtable.h" +#include "db/version_set.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "util/coding.h" +#include "util/log_buffer.h" namespace rocksdb { @@ -120,7 +122,8 @@ void MemTableList::PickMemtablesToFlush(autovector* ret) { } void MemTableList::RollbackMemtableFlush(const autovector& mems, - uint64_t file_number, std::set* pending_outputs) { + uint64_t file_number, + std::set* pending_outputs) { assert(!mems.empty()); // If the flush was not successful, then just reset state. 
@@ -140,10 +143,10 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, // Record a successful flush in the manifest file Status MemTableList::InstallMemtableFlushResults( - const autovector& mems, VersionSet* vset, + ColumnFamilyData* cfd, const autovector& mems, VersionSet* vset, port::Mutex* mu, Logger* info_log, uint64_t file_number, std::set& pending_outputs, autovector* to_delete, - Directory* db_directory) { + Directory* db_directory, LogBuffer* log_buffer) { mu->AssertHeld(); // flush was sucessful @@ -173,12 +176,11 @@ Status MemTableList::InstallMemtableFlushResults( break; } - Log(info_log, - "Level-0 commit table #%lu started", - (unsigned long)m->file_number_); + LogToBuffer(log_buffer, "Level-0 commit table #%lu started", + (unsigned long)m->file_number_); // this can release and reacquire the mutex. - s = vset->LogAndApply(&m->edit_, mu, db_directory); + s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory); // we will be changing the version in the next code path, // so we better create a new one, since versions are immutable @@ -189,10 +191,8 @@ Status MemTableList::InstallMemtableFlushResults( uint64_t mem_id = 1; // how many memtables has been flushed. 
do { if (s.ok()) { // commit new state - Log(info_log, - "Level-0 commit table #%lu: memtable #%lu done", - (unsigned long)m->file_number_, - (unsigned long)mem_id); + LogToBuffer(log_buffer, "Level-0 commit table #%lu: memtable #%lu done", + (unsigned long)m->file_number_, (unsigned long)mem_id); current_->Remove(m); assert(m->file_number_ > 0); diff --git a/db/memtable_list.h b/db/memtable_list.h index 3c87d4eee..903305779 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -7,19 +7,25 @@ #include #include +#include +#include #include +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" #include "db/dbformat.h" -#include "db/memtable.h" #include "db/skiplist.h" -#include "rocksdb/db.h" +#include "db/memtable.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "util/autovector.h" +#include "util/log_buffer.h" namespace rocksdb { +class ColumnFamilyData; class InternalKeyComparator; class Mutex; @@ -99,12 +105,14 @@ class MemTableList { std::set* pending_outputs); // Commit a successful flush in the manifest file - Status InstallMemtableFlushResults(const autovector& m, + Status InstallMemtableFlushResults(ColumnFamilyData* cfd, + const autovector& m, VersionSet* vset, port::Mutex* mu, Logger* info_log, uint64_t file_number, std::set& pending_outputs, autovector* to_delete, - Directory* db_directory); + Directory* db_directory, + LogBuffer* log_buffer); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). 
diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 4f1563b94..6a95a2585 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -429,6 +429,48 @@ TEST(PlainTableDBTest, Iterator) { } } +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} + +TEST(PlainTableDBTest, IteratorLargeKeys) { + Options options = CurrentOptions(); + options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16)); + options.create_if_missing = true; + options.prefix_extractor.reset(); + DestroyAndReopen(&options); + + std::string key_list[] = { + MakeLongKey(30, '0'), + MakeLongKey(16, '1'), + MakeLongKey(32, '2'), + MakeLongKey(60, '3'), + MakeLongKey(90, '4'), + MakeLongKey(50, '5'), + MakeLongKey(26, '6') + }; + + for (size_t i = 0; i < 7; i++) { + ASSERT_OK(Put(key_list[i], std::to_string(i))); + } + + dbfull()->TEST_FlushMemTable(); + + Iterator* iter = dbfull()->NewIterator(ro_); + iter->Seek(key_list[0]); + + for (size_t i = 0; i < 7; i++) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key_list[i], iter->key().ToString()); + ASSERT_EQ(std::to_string(i), iter->value().ToString()); + iter->Next(); + } + + ASSERT_TRUE(!iter->Valid()); + + delete iter; +} + // A test comparator which compare two strings in this way: // (1) first compare prefix of 8 bytes in alphabet order, // (2) if two strings share the same prefix, sort the other part of the string diff --git a/db/repair.cc b/db/repair.cc index f3b95f5e5..ab2850523 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -55,14 +55,20 @@ class Repairer { icmp_(options.comparator), ipolicy_(options.filter_policy), options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), + raw_table_cache_( + // TableCache can be small since we expect each table to be opened + // once. + NewLRUCache(10, options_.table_cache_numshardbits, + options_.table_cache_remove_scan_count_limit)), next_file_number_(1) { - // TableCache can be small since we expect each table to be opened once. 
- table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); + table_cache_ = new TableCache(dbname_, &options_, storage_options_, + raw_table_cache_.get()); edit_ = new VersionEdit(); } ~Repairer() { delete table_cache_; + raw_table_cache_.reset(); delete edit_; } @@ -102,6 +108,7 @@ class Repairer { InternalKeyComparator const icmp_; InternalFilterPolicy const ipolicy_; Options const options_; + std::shared_ptr raw_table_cache_; TableCache* table_cache_; VersionEdit* edit_; @@ -197,6 +204,7 @@ class Repairer { Slice record; WriteBatch batch; MemTable* mem = new MemTable(icmp_, options_); + auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { @@ -206,7 +214,7 @@ class Repairer { continue; } WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, mem, &options_); + status = WriteBatchInternal::InsertInto(&batch, cf_mems_default); if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { @@ -226,6 +234,7 @@ class Repairer { iter, &meta, icmp_, 0, 0, kNoCompression); delete iter; delete mem->Unref(); + delete cf_mems_default; mem = nullptr; if (status.ok()) { if (meta.file_size > 0) { diff --git a/db/table_cache.cc b/db/table_cache.cc index 7058221e0..36168d109 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -35,18 +35,13 @@ static Slice GetSliceForFileNumber(uint64_t* file_number) { sizeof(*file_number)); } -TableCache::TableCache(const std::string& dbname, - const Options* options, - const EnvOptions& storage_options, - int entries) +TableCache::TableCache(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, Cache* const cache) : env_(options->env), dbname_(dbname), options_(options), storage_options_(storage_options), - cache_( - NewLRUCache(entries, options->table_cache_numshardbits, - options->table_cache_remove_scan_count_limit)) { -} + cache_(cache) 
{} TableCache::~TableCache() { } @@ -124,7 +119,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, TableReader* table_reader = GetTableReaderFromHandle(handle); Iterator* result = table_reader->NewIterator(options); if (!file_meta.table_reader_handle) { - result->RegisterCleanup(&UnrefEntry, cache_.get(), handle); + result->RegisterCleanup(&UnrefEntry, cache_, handle); } if (table_reader_ptr != nullptr) { *table_reader_ptr = table_reader; @@ -216,8 +211,8 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options, return may_match; } -void TableCache::Evict(uint64_t file_number) { - cache_->Erase(GetSliceForFileNumber(&file_number)); +void TableCache::Evict(Cache* cache, uint64_t file_number) { + cache->Erase(GetSliceForFileNumber(&file_number)); } } // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h index 5f1c29ea5..97e0f6a27 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -30,7 +30,7 @@ struct FileMetaData; class TableCache { public: TableCache(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, int entries); + const EnvOptions& storage_options, Cache* cache); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -64,7 +64,7 @@ class TableCache { const Slice& internal_prefix, bool* table_io); // Evict any entry for the specified file number - void Evict(uint64_t file_number); + static void Evict(Cache* cache, uint64_t file_number); // Find table reader Status FindTable(const EnvOptions& toptions, @@ -95,7 +95,7 @@ class TableCache { const std::string dbname_; const Options* options_; const EnvOptions& storage_options_; - std::shared_ptr cache_; + Cache* const cache_; }; } // namespace rocksdb diff --git a/db/tailing_iter.cc b/db/tailing_iter.cc index 7264b43af..cd0335fef 100644 --- a/db/tailing_iter.cc +++ b/db/tailing_iter.cc @@ -8,15 +8,19 @@ #include #include #include "db/db_impl.h" +#include "db/column_family.h" #include "rocksdb/slice.h" 
#include "rocksdb/slice_transform.h" namespace rocksdb { TailingIterator::TailingIterator(DBImpl* db, const ReadOptions& options, - const Comparator* comparator) - : db_(db), options_(options), comparator_(comparator), - version_number_(0), current_(nullptr), + ColumnFamilyData* cfd) + : db_(db), + options_(options), + cfd_(cfd), + version_number_(0), + current_(nullptr), status_(Status::InvalidArgument("Seek() not called on this iterator")) {} bool TailingIterator::Valid() const { @@ -53,10 +57,9 @@ void TailingIterator::Seek(const Slice& target) { // 'target' -- in this case, prev_key_ is included in the interval, so // prev_inclusive_ has to be set. - if (!is_prev_set_ || - comparator_->Compare(prev_key_, target) >= !is_prev_inclusive_ || - (immutable_->Valid() && - comparator_->Compare(target, immutable_->key()) > 0) || + const Comparator* cmp = cfd_->user_comparator(); + if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ || + (immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) || (options_.prefix_seek && !IsSamePrefix(target))) { SeekImmutable(target); } @@ -121,7 +124,7 @@ void TailingIterator::SeekToLast() { void TailingIterator::CreateIterators() { std::pair iters = - db_->GetTailingIteratorPair(options_, &version_number_); + db_->GetTailingIteratorPair(options_, cfd_, &version_number_); assert(iters.first && iters.second); @@ -137,9 +140,10 @@ void TailingIterator::UpdateCurrent() { if (mutable_->Valid()) { current_ = mutable_.get(); } + const Comparator* cmp = cfd_->user_comparator(); if (immutable_->Valid() && (current_ == nullptr || - comparator_->Compare(immutable_->key(), current_->key()) < 0)) { + cmp->Compare(immutable_->key(), current_->key()) < 0)) { current_ = immutable_.get(); } @@ -151,11 +155,11 @@ void TailingIterator::UpdateCurrent() { bool TailingIterator::IsCurrentVersion() const { return mutable_ != nullptr && immutable_ != nullptr && - version_number_ == db_->CurrentVersionNumber(); + 
version_number_ == cfd_->GetSuperVersionNumber(); } bool TailingIterator::IsSamePrefix(const Slice& target) const { - const SliceTransform* extractor = db_->options_.prefix_extractor.get(); + const SliceTransform* extractor = cfd_->options()->prefix_extractor.get(); assert(extractor); assert(is_prev_set_); diff --git a/db/tailing_iter.h b/db/tailing_iter.h index 3b8343a28..2a5a02e24 100644 --- a/db/tailing_iter.h +++ b/db/tailing_iter.h @@ -13,6 +13,7 @@ namespace rocksdb { class DBImpl; +class ColumnFamilyData; /** * TailingIterator is a special type of iterator that doesn't use an (implicit) @@ -25,7 +26,7 @@ class DBImpl; class TailingIterator : public Iterator { public: TailingIterator(DBImpl* db, const ReadOptions& options, - const Comparator* comparator); + ColumnFamilyData* cfd); virtual ~TailingIterator() {} virtual bool Valid() const override; @@ -41,7 +42,7 @@ class TailingIterator : public Iterator { private: DBImpl* const db_; const ReadOptions options_; - const Comparator* const comparator_; + ColumnFamilyData* const cfd_; uint64_t version_number_; // TailingIterator merges the contents of the two iterators below (one using diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 36b8932a5..0394855c3 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -9,7 +9,7 @@ namespace rocksdb { TransactionLogIteratorImpl::TransactionLogIteratorImpl( - const std::string& dir, const Options* options, + const std::string& dir, const DBOptions* options, const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seq, std::unique_ptr files, DBImpl const* const dbimpl) diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 6454d89e7..98e4e26b4 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -67,7 +67,7 @@ class LogFileImpl : public LogFile { class TransactionLogIteratorImpl : public TransactionLogIterator { public: 
TransactionLogIteratorImpl( - const std::string& dir, const Options* options, + const std::string& dir, const DBOptions* options, const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seqNum, std::unique_ptr files, DBImpl const* const dbimpl); @@ -82,7 +82,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { private: const std::string& dir_; - const Options* options_; + const DBOptions* options_; const TransactionLogIterator::ReadOptions read_options_; const EnvOptions& soptions_; SequenceNumber startingSequenceNumber_; diff --git a/db/version_edit.cc b/db/version_edit.cc index 5c532b138..24d7f0d9f 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -11,6 +11,7 @@ #include "db/version_set.h" #include "util/coding.h" +#include "rocksdb/slice.h" namespace rocksdb { @@ -29,6 +30,11 @@ enum Tag { // these are new formats divergent from open source leveldb kNewFile2 = 100, // store smallest & largest seqno + + kColumnFamily = 200, // specify column family for version edit + kColumnFamilyAdd = 201, + kColumnFamilyDrop = 202, + kMaxColumnFamily = 203, }; void VersionEdit::Clear() { @@ -38,13 +44,19 @@ void VersionEdit::Clear() { prev_log_number_ = 0; last_sequence_ = 0; next_file_number_ = 0; + max_column_family_ = 0; has_comparator_ = false; has_log_number_ = false; has_prev_log_number_ = false; has_next_file_number_ = false; has_last_sequence_ = false; + has_max_column_family_ = false; deleted_files_.clear(); new_files_.clear(); + column_family_ = 0; + is_column_family_add_ = 0; + is_column_family_drop_ = 0; + column_family_name_.clear(); } void VersionEdit::EncodeTo(std::string* dst) const { @@ -68,6 +80,10 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, kLastSequence); PutVarint64(dst, last_sequence_); } + if (has_max_column_family_) { + PutVarint32(dst, kMaxColumnFamily); + PutVarint32(dst, max_column_family_); + } for (const auto& deleted : deleted_files_) { 
PutVarint32(dst, kDeletedFile); @@ -86,6 +102,21 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(dst, f.smallest_seqno); PutVarint64(dst, f.largest_seqno); } + + // 0 is default and does not need to be explicitly written + if (column_family_ != 0) { + PutVarint32(dst, kColumnFamily); + PutVarint32(dst, column_family_); + } + + if (is_column_family_add_) { + PutVarint32(dst, kColumnFamilyAdd); + PutLengthPrefixedSlice(dst, Slice(column_family_name_)); + } + + if (is_column_family_drop_) { + PutVarint32(dst, kColumnFamilyDrop); + } } static bool GetInternalKey(Slice* input, InternalKey* dst) { @@ -167,6 +198,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; + case kMaxColumnFamily: + if (GetVarint32(&input, &max_column_family_)) { + has_max_column_family_ = true; + } else { + msg = "max column family"; + } + break; + case kCompactPointer: if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) { @@ -221,6 +260,29 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; + case kColumnFamily: + if (!GetVarint32(&input, &column_family_)) { + if (!msg) { + msg = "set column family id"; + } + } + break; + + case kColumnFamilyAdd: + if (GetLengthPrefixedSlice(&input, &str)) { + is_column_family_add_ = true; + column_family_name_ = str.ToString(); + } else { + if (!msg) { + msg = "column family add"; + } + } + break; + + case kColumnFamilyDrop: + is_column_family_drop_ = true; + break; + default: msg = "unknown tag"; break; @@ -282,6 +344,19 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(" .. 
"); r.append(f.largest.DebugString(hex_key)); } + r.append("\n ColumnFamily: "); + AppendNumberTo(&r, column_family_); + if (is_column_family_add_) { + r.append("\n ColumnFamilyAdd: "); + r.append(column_family_name_); + } + if (is_column_family_drop_) { + r.append("\n ColumnFamilyDrop"); + } + if (has_max_column_family_) { + r.append("\n MaxColumnFamily: "); + AppendNumberTo(&r, max_column_family_); + } r.append("\n}\n"); return r; } diff --git a/db/version_edit.h b/db/version_edit.h index f54949fbf..98731cfb2 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "rocksdb/cache.h" #include "db/dbformat.h" @@ -32,11 +33,14 @@ struct FileMetaData { // Needs to be disposed when refs becomes 0. Cache::Handle* table_reader_handle; - FileMetaData(uint64_t number, uint64_t file_size) : - refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size), - being_compacted(false), table_reader_handle(nullptr) { - } - FileMetaData() : FileMetaData(0, 0) { } + FileMetaData(uint64_t number, uint64_t file_size) + : refs(0), + allowed_seeks(1 << 30), + number(number), + file_size(file_size), + being_compacted(false), + table_reader_handle(nullptr) {} + FileMetaData() : FileMetaData(0, 0) {} }; class VersionEdit { @@ -66,6 +70,10 @@ class VersionEdit { has_last_sequence_ = true; last_sequence_ = seq; } + void SetMaxColumnFamily(uint32_t max_column_family) { + has_max_column_family_ = true; + max_column_family_ = max_column_family; + } // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) @@ -97,6 +105,31 @@ class VersionEdit { return new_files_.size() + deleted_files_.size(); } + bool IsColumnFamilyManipulation() { + return is_column_family_add_ || is_column_family_drop_; + } + + void SetColumnFamily(uint32_t column_family_id) { + column_family_ = column_family_id; + } + + // set column family ID by calling SetColumnFamily() + void AddColumnFamily(const std::string& name) { + assert(!is_column_family_drop_); + assert(!is_column_family_add_); + assert(NumEntries() == 0); + is_column_family_add_ = true; + column_family_name_ = name; + } + + // set column family ID by calling SetColumnFamily() + void DropColumnFamily() { + assert(!is_column_family_drop_); + assert(!is_column_family_add_); + assert(NumEntries() == 0); + is_column_family_drop_ = true; + } + void EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); @@ -114,15 +147,27 @@ class VersionEdit { uint64_t log_number_; uint64_t prev_log_number_; uint64_t next_file_number_; + uint32_t max_column_family_; SequenceNumber last_sequence_; bool has_comparator_; bool has_log_number_; bool has_prev_log_number_; bool has_next_file_number_; bool has_last_sequence_; + bool has_max_column_family_; DeletedFileSet deleted_files_; - std::vector > new_files_; + std::vector> new_files_; + + // Each version edit record should have column_family_id set + // If it's not set, it is default (0) + uint32_t column_family_; + // a version edit can be either column_family add or + // column_family drop. If it's column family add, + // it also includes column family name. 
+ bool is_column_family_drop_; + bool is_column_family_add_; + std::string column_family_name_; }; } // namespace rocksdb diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 110b422f8..7842b3263 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -45,6 +45,19 @@ TEST(VersionEditTest, EncodeDecode) { TestEncodeDecode(edit); } +TEST(VersionEditTest, ColumnFamilyTest) { + VersionEdit edit; + edit.SetColumnFamily(2); + edit.AddColumnFamily("column_family"); + edit.SetMaxColumnFamily(5); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetColumnFamily(3); + edit.DropColumnFamily(); + TestEncodeDecode(edit); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/version_set.cc b/db/version_set.cc index 2057d6dd4..84361f5ff 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -12,7 +12,10 @@ #include #include +#include +#include #include +#include #include #include "db/filename.h" @@ -58,7 +61,7 @@ Version::~Version() { f->refs--; if (f->refs <= 0) { if (f->table_reader_handle) { - vset_->table_cache_->ReleaseHandle(f->table_reader_handle); + cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); f->table_reader_handle = nullptr; } vset_->obsolete_files_.push_back(f); @@ -255,15 +258,15 @@ bool Version::PrefixMayMatch(const ReadOptions& options, level_iter->value().data()); FileMetaData meta(encoded_meta->number, encoded_meta->file_size); meta.table_reader_handle = encoded_meta->table_reader_handle; - may_match = vset_->table_cache_->PrefixMayMatch(options, vset_->icmp_, meta, - internal_prefix, nullptr); + may_match = cfd_->table_cache()->PrefixMayMatch( + options, cfd_->internal_comparator(), meta, internal_prefix, nullptr); } return may_match; } Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { - auto table_cache = vset_->table_cache_; - auto options = vset_->options_; + auto table_cache = cfd_->table_cache(); + auto options = cfd_->options(); for (int level = 0; level < 
num_levels_; level++) { for (const auto& file_meta : files_[level]) { auto fname = TableFileName(vset_->dbname_, file_meta->number); @@ -271,8 +274,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { // properties from there. std::shared_ptr table_properties; Status s = table_cache->GetTableProperties( - vset_->storage_options_, vset_->icmp_, *file_meta, &table_properties, - true /* no io */); + vset_->storage_options_, cfd_->internal_comparator(), *file_meta, + &table_properties, true /* no io */); if (s.ok()) { props->insert({fname, table_properties}); continue; @@ -287,8 +290,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { // 2. Table is not present in table cache, we'll read the table properties // directly from the properties block in the file. std::unique_ptr file; - s = vset_->env_->NewRandomAccessFile(fname, &file, - vset_->storage_options_); + s = options->env->NewRandomAccessFile(fname, &file, + vset_->storage_options_); if (!s.ok()) { return s; } @@ -317,7 +320,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, const EnvOptions& soptions, int level) const { - Iterator* level_iter = new LevelFileNumIterator(vset_->icmp_, &files_[level]); + Iterator* level_iter = + new LevelFileNumIterator(cfd_->internal_comparator(), &files_[level]); if (options.prefix) { InternalKey internal_prefix(*options.prefix, 0, kTypeValue); if (!PrefixMayMatch(options, soptions, @@ -327,8 +331,8 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, return NewEmptyIterator(); } } - return NewTwoLevelIterator(level_iter, &GetFileIterator, vset_->table_cache_, - options, soptions, vset_->icmp_); + return NewTwoLevelIterator(level_iter, &GetFileIterator, cfd_->table_cache(), + options, soptions, cfd_->internal_comparator()); } void Version::AddIterators(const ReadOptions& options, @@ -336,8 +340,8 @@ 
void Version::AddIterators(const ReadOptions& options, std::vector* iters) { // Merge all level zero files together since they may overlap for (const FileMetaData* file : files_[0]) { - iters->push_back(vset_->table_cache_->NewIterator(options, soptions, - vset_->icmp_, *file)); + iters->push_back(cfd_->table_cache()->NewIterator( + options, soptions, cfd_->internal_comparator(), *file)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -476,12 +480,15 @@ bool BySmallestKey(FileMetaData* a, FileMetaData* b, } } // anonymous namespace -Version::Version(VersionSet* vset, uint64_t version_number) - : vset_(vset), +Version::Version(ColumnFamilyData* cfd, VersionSet* vset, + uint64_t version_number) + : cfd_(cfd), + vset_(vset), next_(this), prev_(this), refs_(0), - num_levels_(vset->num_levels_), + // cfd is nullptr if Version is dummy + num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()), files_(new std::vector[num_levels_]), files_by_size_(num_levels_), next_file_to_compact_by_size_(num_levels_), @@ -501,7 +508,7 @@ void Version::Get(const ReadOptions& options, bool* value_found) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); - const Comparator* ucmp = vset_->icmp_.user_comparator(); + const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); auto merge_operator = db_options.merge_operator.get(); auto logger = db_options.info_log.get(); @@ -548,7 +555,7 @@ void Version::Get(const ReadOptions& options, // On Level-n (n>=1), files are sorted. // Binary search to find earliest index whose largest key >= ikey. 
// We will also stop when the file no longer overlaps ikey - start_index = FindFile(vset_->icmp_, files_[level], ikey); + start_index = FindFile(cfd_->internal_comparator(), files_[level], ikey); } // Traverse each relevant file to find the desired key @@ -574,11 +581,12 @@ void Version::Get(const ReadOptions& options, // Sanity check to make sure that the files are correctly sorted if (prev_file) { if (level != 0) { - int comp_sign = vset_->icmp_.Compare(prev_file->largest, f->smallest); + int comp_sign = cfd_->internal_comparator().Compare( + prev_file->largest, f->smallest); assert(comp_sign < 0); } else { // level == 0, the current file cannot be newer than the previous one. - if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { assert(!NewestFirstBySeqNo(f, prev_file)); } else { assert(!NewestFirst(f, prev_file)); @@ -588,9 +596,9 @@ void Version::Get(const ReadOptions& options, prev_file = f; #endif bool tableIO = false; - *status = - vset_->table_cache_->Get(options, vset_->icmp_, *f, ikey, &saver, - SaveValue, &tableIO, MarkKeyMayExist); + *status = cfd_->table_cache()->Get(options, cfd_->internal_comparator(), + *f, ikey, &saver, SaveValue, &tableIO, + MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; @@ -668,7 +676,7 @@ void Version::ComputeCompactionScore( int max_score_level = 0; int num_levels_to_check = - (vset_->options_->compaction_style != kCompactionStyleUniversal) + (cfd_->options()->compaction_style != kCompactionStyleUniversal) ? 
NumberLevels() - 1 : 1; @@ -694,15 +702,15 @@ void Version::ComputeCompactionScore( } // If we are slowing down writes, then we better compact that first - if (numfiles >= vset_->options_->level0_stop_writes_trigger) { + if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { score = 1000000; // Log(options_->info_log, "XXX score l0 = 1000000000 max"); - } else if (numfiles >= vset_->options_->level0_slowdown_writes_trigger) { + } else if (numfiles >= cfd_->options()->level0_slowdown_writes_trigger) { score = 10000; // Log(options_->info_log, "XXX score l0 = 1000000 medium"); } else { score = static_cast(numfiles) / - vset_->options_->level0_file_num_compaction_trigger; + cfd_->options()->level0_file_num_compaction_trigger; if (score >= 1) { // Log(options_->info_log, "XXX score l0 = %d least", (int)score); } @@ -711,7 +719,8 @@ void Version::ComputeCompactionScore( // Compute the ratio of current size to size limit. const uint64_t level_bytes = TotalFileSize(files_[level]) - size_being_compacted[level]; - score = static_cast(level_bytes) / vset_->MaxBytesForLevel(level); + score = static_cast(level_bytes) / + cfd_->compaction_picker()->MaxBytesForLevel(level); if (score > 1) { // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); } @@ -769,7 +778,7 @@ bool CompareSeqnoDescending(const Version::Fsize& first, void Version::UpdateFilesBySize() { // No need to sort the highest level because it is never compacted. int max_level = - (vset_->options_->compaction_style == kCompactionStyleUniversal) + (cfd_->options()->compaction_style == kCompactionStyleUniversal) ? 
NumberLevels() : NumberLevels() - 1; @@ -786,7 +795,7 @@ void Version::UpdateFilesBySize() { } // sort the top number_of_files_to_sort_ based on file size - if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { int num = temp.size(); std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), CompareSeqnoDescending); @@ -814,7 +823,6 @@ void Version::Ref() { } bool Version::Unref() { - assert(this != &vset_->dummy_versions_); assert(refs_ >= 1); --refs_; if (refs_ == 0) { @@ -836,8 +844,9 @@ bool Version::NeedsCompaction() const { // TODO(sdong): improve this function to be accurate for universal // compactions. int num_levels_to_check = - (vset_->options_->compaction_style != kCompactionStyleUniversal) ? - NumberLevels() - 1 : 1; + (cfd_->options()->compaction_style != kCompactionStyleUniversal) + ? NumberLevels() - 1 + : 1; for (int i = 0; i < num_levels_to_check; i++) { if (compaction_score_[i] >= 1) { return true; @@ -849,8 +858,9 @@ bool Version::NeedsCompaction() const { bool Version::OverlapInLevel(int level, const Slice* smallest_user_key, const Slice* largest_user_key) { - return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], - smallest_user_key, largest_user_key); + return SomeFileOverlapsRange(cfd_->internal_comparator(), (level > 0), + files_[level], smallest_user_key, + largest_user_key); } int Version::PickLevelForMemTableOutput( @@ -863,7 +873,7 @@ int Version::PickLevelForMemTableOutput( InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); InternalKey limit(largest_user_key, 0, static_cast(0)); std::vector overlaps; - int max_mem_compact_level = vset_->options_->max_mem_compaction_level; + int max_mem_compact_level = cfd_->options()->max_mem_compaction_level; while (max_mem_compact_level > 0 && level < max_mem_compact_level) { if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; @@ -874,7 
+884,7 @@ int Version::PickLevelForMemTableOutput( } GetOverlappingInputs(level + 2, &start, &limit, &overlaps); const uint64_t sum = TotalFileSize(overlaps); - if (sum > vset_->compaction_picker_->MaxGrandParentOverlapBytes(level)) { + if (sum > cfd_->compaction_picker()->MaxGrandParentOverlapBytes(level)) { break; } level++; @@ -888,13 +898,12 @@ int Version::PickLevelForMemTableOutput( // If hint_index is specified, then it points to a file in the // overlapping range. // The file_index returns a pointer to any file in an overlapping range. -void Version::GetOverlappingInputs( - int level, - const InternalKey* begin, - const InternalKey* end, - std::vector* inputs, - int hint_index, - int* file_index) { +void Version::GetOverlappingInputs(int level, + const InternalKey* begin, + const InternalKey* end, + std::vector* inputs, + int hint_index, + int* file_index) { inputs->clear(); Slice user_begin, user_end; if (begin != nullptr) { @@ -906,7 +915,7 @@ void Version::GetOverlappingInputs( if (file_index) { *file_index = -1; } - const Comparator* user_cmp = vset_->icmp_.user_comparator(); + const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); if (begin != nullptr && end != nullptr && level > 0) { GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs, hint_index, file_index); @@ -958,7 +967,7 @@ void Version::GetOverlappingInputsBinarySearch( int mid = 0; int max = files_[level].size() -1; bool foundOverlap = false; - const Comparator* user_cmp = vset_->icmp_.user_comparator(); + const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); // if the caller already knows the index of a file that has overlap, // then we can skip the binary search. 
@@ -1004,7 +1013,7 @@ void Version::ExtendOverlappingInputs( std::vector* inputs, unsigned int midIndex) { - const Comparator* user_cmp = vset_->icmp_.user_comparator(); + const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); #ifndef NDEBUG { // assert that the file at midIndex overlaps with the range @@ -1068,12 +1077,12 @@ bool Version::HasOverlappingUserKey( return false; } - const Comparator* user_cmp = vset_->icmp_.user_comparator(); + const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); const std::vector& files = files_[level]; const size_t kNumFiles = files.size(); // Check the last file in inputs against the file after it - size_t last_file = FindFile(vset_->icmp_, files, + size_t last_file = FindFile(cfd_->internal_comparator(), files, inputs->back()->largest.Encode()); assert(0 <= last_file && last_file < kNumFiles); // File should exist! if (last_file < kNumFiles-1) { // If not the last file @@ -1086,7 +1095,7 @@ bool Version::HasOverlappingUserKey( } // Check the first file in inputs against the file just before it - size_t first_file = FindFile(vset_->icmp_, files, + size_t first_file = FindFile(cfd_->internal_comparator(), files, inputs->front()->smallest.Encode()); assert(0 <= first_file && first_file <= last_file); // File should exist! 
if (first_file > 0) { // If not first file @@ -1195,10 +1204,12 @@ struct VersionSet::ManifestWriter { Status status; bool done; port::CondVar cv; + ColumnFamilyData* cfd; VersionEdit* edit; - explicit ManifestWriter(port::Mutex* mu, VersionEdit* e) : - done(false), cv(mu), edit(e) {} + explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* cfd, + VersionEdit* e) + : done(false), cv(mu), cfd(cfd), edit(e) {} }; // A helper class so we can efficiently apply a whole sequence @@ -1238,26 +1249,25 @@ class VersionSet::Builder { FileSet* added_files; }; - VersionSet* vset_; + ColumnFamilyData* cfd_; Version* base_; LevelState* levels_; FileComparator level_zero_cmp_; FileComparator level_nonzero_cmp_; public: - // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { + Builder(ColumnFamilyData* cfd) : cfd_(cfd), base_(cfd->current()) { base_->Ref(); - levels_ = new LevelState[base->NumberLevels()]; + levels_ = new LevelState[base_->NumberLevels()]; level_zero_cmp_.sort_method = - (vset_->options_->compaction_style == kCompactionStyleUniversal) + (cfd_->options()->compaction_style == kCompactionStyleUniversal) ? 
FileComparator::kLevel0UniversalCompaction : FileComparator::kLevel0LevelCompaction; level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; - level_nonzero_cmp_.internal_comparator = &vset_->icmp_; + level_nonzero_cmp_.internal_comparator = &cfd->internal_comparator(); levels_[0].added_files = new FileSet(level_zero_cmp_); - for (int level = 1; level < base->NumberLevels(); level++) { + for (int level = 1; level < base_->NumberLevels(); level++) { levels_[level].added_files = new FileSet(level_nonzero_cmp_); } } @@ -1277,8 +1287,7 @@ class VersionSet::Builder { f->refs--; if (f->refs <= 0) { if (f->table_reader_handle) { - vset_->table_cache_->ReleaseHandle( - f->table_reader_handle); + cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); f->table_reader_handle = nullptr; } delete f; @@ -1299,14 +1308,15 @@ class VersionSet::Builder { auto f2 = v->files_[level][i]; if (level == 0) { assert(level_zero_cmp_(f1, f2)); - if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { assert(f1->largest_seqno > f2->largest_seqno); } } else { assert(level_nonzero_cmp_(f1, f2)); // Make sure there is no overlap in levels > 0 - if (vset_->icmp_.Compare(f1->largest, f2->smallest) >= 0) { + if (cfd_->internal_comparator().Compare(f1->largest, f2->smallest) >= + 0) { fprintf(stderr, "overlapping ranges in same level %s vs. %s\n", (f1->largest).DebugString().c_str(), (f2->smallest).DebugString().c_str()); @@ -1408,6 +1418,7 @@ class VersionSet::Builder { void SaveTo(Version* v) { CheckConsistency(base_); CheckConsistency(v); + for (int level = 0; level < base_->NumberLevels(); level++) { const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; // Merge the set of added files with the set of pre-existing files. 
@@ -1439,14 +1450,14 @@ class VersionSet::Builder { } void LoadTableHandlers() { - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < cfd_->NumberLevels(); level++) { for (auto& file_meta : *(levels_[level].added_files)) { assert (!file_meta->table_reader_handle); bool table_io; - vset_->table_cache_->FindTable(vset_->storage_options_, vset_->icmp_, - file_meta->number, file_meta->file_size, - &file_meta->table_reader_handle, - &table_io, false); + cfd_->table_cache()->FindTable( + base_->vset_->storage_options_, cfd_->internal_comparator(), + file_meta->number, file_meta->file_size, + &file_meta->table_reader_handle, &table_io, false); } } } @@ -1458,8 +1469,8 @@ class VersionSet::Builder { auto* files = &v->files_[level]; if (level > 0 && !files->empty()) { // Must not overlap - assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest, - f->smallest) < 0); + assert(cfd_->internal_comparator().Compare( + (*files)[files->size() - 1]->largest, f->smallest) < 0); } f->refs++; files->push_back(f); @@ -1467,74 +1478,68 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, - TableCache* table_cache, - const InternalKeyComparator* cmp) - : env_(options->env), +VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, + const EnvOptions& storage_options, Cache* table_cache) + : column_family_set_(new ColumnFamilySet(dbname, options, storage_options, + table_cache)), + env_(options->env), dbname_(dbname), options_(options), - table_cache_(table_cache), - icmp_(*cmp), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() pending_manifest_file_number_(0), last_sequence_(0), - log_number_(0), prev_log_number_(0), - num_levels_(options_->num_levels), - dummy_versions_(this), - current_(nullptr), - need_slowdown_for_num_level0_files_(false), current_version_number_(0), manifest_file_size_(0), 
storage_options_(storage_options), - storage_options_compactions_(storage_options_) { - if (options_->compaction_style == kCompactionStyleUniversal) { - compaction_picker_.reset(new UniversalCompactionPicker(options_, &icmp_)); - } else { - compaction_picker_.reset(new LevelCompactionPicker(options_, &icmp_)); - } - AppendVersion(new Version(this, current_version_number_++)); -} + storage_options_compactions_(storage_options_) {} VersionSet::~VersionSet() { - current_->Unref(); - assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty + // we need to delete column_family_set_ because its destructor depends on + // VersionSet + column_family_set_.reset(); for (auto file : obsolete_files_) { delete file; } obsolete_files_.clear(); } -void VersionSet::AppendVersion(Version* v) { +void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, + Version* v) { // Make "v" current assert(v->refs_ == 0); - assert(v != current_); - if (current_ != nullptr) { - assert(current_->refs_ > 0); - current_->Unref(); + Version* current = column_family_data->current(); + assert(v != current); + if (current != nullptr) { + assert(current->refs_ > 0); + current->Unref(); } - current_ = v; - need_slowdown_for_num_level0_files_ = - (options_->level0_slowdown_writes_trigger >= 0 && current_ != nullptr && - v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); + column_family_data->SetCurrent(v); v->Ref(); // Append to linked list - v->prev_ = dummy_versions_.prev_; - v->next_ = &dummy_versions_; + v->prev_ = column_family_data->dummy_versions()->prev_; + v->next_ = column_family_data->dummy_versions(); v->prev_->next_ = v; v->next_->prev_ = v; } -Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, - Directory* db_directory, - bool new_descriptor_log) { +Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, + VersionEdit* edit, port::Mutex* mu, + Directory* db_directory, bool new_descriptor_log, + const ColumnFamilyOptions* 
options) { mu->AssertHeld(); + // column_family_data can be nullptr only if this is column_family_add. + // in that case, we also need to specify ColumnFamilyOptions + if (column_family_data == nullptr) { + assert(edit->is_column_family_add_); + assert(options != nullptr); + } + // queue our request - ManifestWriter w(mu, edit); + ManifestWriter w(mu, column_family_data, edit); manifest_writers_.push_back(&w); while (!w.done && &w != manifest_writers_.front()) { w.cv.Wait(); @@ -1542,32 +1547,46 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, if (w.done) { return w.status; } + if (column_family_data != nullptr && column_family_data->IsDropped()) { + // if column family is dropped by the time we get here, no need to write + // anything to the manifest + manifest_writers_.pop_front(); + // Notify new head of write queue + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } + return Status::OK(); + } std::vector batch_edits; - Version* v = new Version(this, current_version_number_++); - Builder builder(this, current_); + Version* v = nullptr; + std::unique_ptr builder(nullptr); // process all requests in the queue ManifestWriter* last_writer = &w; assert(!manifest_writers_.empty()); assert(manifest_writers_.front() == &w); - - uint64_t max_log_number_in_batch = 0; - for (const auto& writer : manifest_writers_) { - last_writer = writer; - LogAndApplyHelper(&builder, v, writer->edit, mu); - if (writer->edit->has_log_number_) { - // When batch commit of manifest writes, we could have multiple flush and - // compaction edits. A flush edit has a bigger log number than what - // VersionSet has while a compaction edit does not have a log number. 
- // In this case, we want to make sure the largest log number is updated - // to VersionSet - max_log_number_in_batch = - std::max(max_log_number_in_batch, writer->edit->log_number_); + if (edit->IsColumnFamilyManipulation()) { + // no group commits for column family add or drop + LogAndApplyCFHelper(edit); + batch_edits.push_back(edit); + } else { + v = new Version(column_family_data, this, current_version_number_++); + builder.reset(new Builder(column_family_data)); + for (const auto& writer : manifest_writers_) { + if (writer->edit->IsColumnFamilyManipulation() || + writer->cfd->GetID() != column_family_data->GetID()) { + // no group commits for column family add or drop + // also, group commits across column families are not supported + break; + } + last_writer = writer; + LogAndApplyHelper(column_family_data, builder.get(), v, last_writer->edit, + mu); + batch_edits.push_back(last_writer->edit); } - batch_edits.push_back(writer->edit); + builder->SaveTo(v); } - builder.SaveTo(v); // Initialize new descriptor log file if necessary by creating // a temporary file that contains a snapshot of the current version. @@ -1584,19 +1603,30 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, pending_manifest_file_number_ = manifest_file_number_; } + if (new_descriptor_log) { + // if we're writing out new snapshot make sure to persist max column family + if (column_family_set_->GetMaxColumnFamily() > 0) { + edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); + } + } + // Unlock during expensive operations. New writes cannot get here // because &w is ensuring that all new writes get queued. 
{ - // calculate the amount of data being compacted at every level - std::vector size_being_compacted(v->NumberLevels() - 1); - compaction_picker_->SizeBeingCompacted(size_being_compacted); + std::vector size_being_compacted; + if (!edit->IsColumnFamilyManipulation()) { + size_being_compacted.resize(v->NumberLevels() - 1); + // calculate the amount of data being compacted at every level + column_family_data->compaction_picker()->SizeBeingCompacted( + size_being_compacted); + } mu->Unlock(); - if (options_->max_open_files == -1) { + if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. - builder.LoadTableHandlers(); + builder->LoadTableHandlers(); } // This is fine because everything inside of this block is serialized -- @@ -1614,10 +1644,12 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, } } - // The calls to ComputeCompactionScore and UpdateFilesBySize are cpu-heavy - // and is best called outside the mutex. - v->ComputeCompactionScore(size_being_compacted); - v->UpdateFilesBySize(); + if (!edit->IsColumnFamilyManipulation()) { + // The calls to ComputeCompactionScore and UpdateFilesBySize are cpu-heavy + // and is best called outside the mutex. 
+ v->ComputeCompactionScore(size_being_compacted); + v->UpdateFilesBySize(); + } // Write new record to MANIFEST log if (s.ok()) { @@ -1690,13 +1722,34 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // Install the new version if (s.ok()) { + if (edit->is_column_family_add_) { + // no group commit on column family add + assert(batch_edits.size() == 1); + assert(options != nullptr); + CreateColumnFamily(*options, edit); + } else if (edit->is_column_family_drop_) { + assert(batch_edits.size() == 1); + column_family_data->SetDropped(); + if (column_family_data->Unref()) { + delete column_family_data; + } + } else { + uint64_t max_log_number_in_batch = 0; + for (auto& e : batch_edits) { + if (e->has_log_number_) { + max_log_number_in_batch = + std::max(max_log_number_in_batch, e->log_number_); + } + } + if (max_log_number_in_batch != 0) { + assert(column_family_data->GetLogNumber() < max_log_number_in_batch); + column_family_data->SetLogNumber(max_log_number_in_batch); + } + AppendVersion(column_family_data, v); + } + manifest_file_number_ = pending_manifest_file_number_; manifest_file_size_ = new_manifest_file_size; - AppendVersion(v); - if (max_log_number_in_batch != 0) { - assert(log_number_ < max_log_number_in_batch); - log_number_ = max_log_number_in_batch; - } prev_log_number_ = edit->prev_log_number_; } else { Log(options_->info_log, "Error in committing version %lu", @@ -1728,34 +1781,47 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, return s; } -void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, - VersionEdit* edit, port::Mutex* mu) { +void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { + assert(edit->IsColumnFamilyManipulation()); + edit->SetNextFile(next_file_number_); + edit->SetLastSequence(last_sequence_); + if (edit->is_column_family_drop_) { + // if we drop column family, we have to make sure to save max column family, + // so that we don't reuse existing ID + 
edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); + } +} + +void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, Builder* builder, + Version* v, VersionEdit* edit, + port::Mutex* mu) { mu->AssertHeld(); + assert(!edit->IsColumnFamilyManipulation()); if (edit->has_log_number_) { - assert(edit->log_number_ >= log_number_); + assert(edit->log_number_ >= cfd->GetLogNumber()); assert(edit->log_number_ < next_file_number_); } - // If the edit does not have log number, it must be generated - // from a compaction if (!edit->has_prev_log_number_) { edit->SetPrevLogNumber(prev_log_number_); } - edit->SetNextFile(next_file_number_); edit->SetLastSequence(last_sequence_); builder->Apply(edit); } -Status VersionSet::Recover() { - struct LogReporter : public log::Reader::Reporter { - Status* status; - virtual void Corruption(size_t bytes, const Status& s) { - if (this->status->ok()) *this->status = s; - } - }; +Status VersionSet::Recover( + const std::vector& column_families) { + std::unordered_map cf_name_to_options; + for (auto cf : column_families) { + cf_name_to_options.insert({cf.name, cf.options}); + } + // keeps track of column families in manifest that were not found in + // column families parameters. 
if those column families are not dropped + // by subsequent manifest records, Recover() will return failure status + std::set column_families_not_found; // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_filename; @@ -1771,15 +1837,20 @@ Status VersionSet::Recover() { } // remove the trailing '\n' manifest_filename.resize(manifest_filename.size() - 1); + FileType type; + bool parse_ok = + ParseFileName(manifest_filename, &manifest_file_number_, &type); + if (!parse_ok || type != kDescriptorFile) { + return Status::Corruption("CURRENT file corrupted"); + } Log(options_->info_log, "Recovering from manifest file:%s\n", manifest_filename.c_str()); manifest_filename = dbname_ + "/" + manifest_filename; unique_ptr manifest_file; - s = env_->NewSequentialFile( - manifest_filename, &manifest_file, storage_options_ - ); + s = env_->NewSequentialFile(manifest_filename, &manifest_file, + storage_options_); if (!s.ok()) { return s; } @@ -1797,10 +1868,23 @@ Status VersionSet::Recover() { uint64_t last_sequence = 0; uint64_t log_number = 0; uint64_t prev_log_number = 0; - Builder builder(this, current_); + uint32_t max_column_family = 0; + std::unordered_map builders; + + // add default column family + auto default_cf_iter = cf_name_to_options.find(default_column_family_name); + if (default_cf_iter == cf_name_to_options.end()) { + return Status::InvalidArgument("Default column family not specified"); + } + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(default_column_family_name); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(default_cf_iter->second, &default_cf_edit); + builders.insert({0, new Builder(default_cfd)}); { - LogReporter reporter; + VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/, 0 /*initial_offset*/); @@ -1813,34 +1897,99 @@ Status VersionSet::Recover() { break; } - if 
(edit.max_level_ >= current_->NumberLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } + // Not found means that user didn't supply that column + // family option AND we encountered column family add + // record. Once we encounter column family drop record, + // we will delete the column family from + // column_families_not_found. + bool cf_in_not_found = + column_families_not_found.find(edit.column_family_) != + column_families_not_found.end(); + // in builders means that user supplied that column family + // option AND that we encountered column family add record + bool cf_in_builders = + builders.find(edit.column_family_) != builders.end(); - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument(icmp_.user_comparator()->Name(), - "does not match existing comparator " + - edit.comparator_); - break; - } + // they can't both be true + assert(!(cf_in_not_found && cf_in_builders)); - builder.Apply(&edit); + ColumnFamilyData* cfd = nullptr; - // Only a flush's edit or a new snapshot can write log number during - // LogAndApply. Since memtables are flushed and inserted into - // manifest_writers_ queue in order, the log number in MANIFEST file - // should be monotonically increasing. 
- if (edit.has_log_number_) { - if (have_log_number && log_number >= edit.log_number_) { - Log(options_->info_log, - "decreasing of log_number is detected " - "in MANIFEST\n"); + if (edit.is_column_family_add_) { + if (cf_in_builders || cf_in_not_found) { + s = Status::Corruption( + "Manifest adding the same column family twice"); + break; + } + auto cf_options = cf_name_to_options.find(edit.column_family_name_); + if (cf_options == cf_name_to_options.end()) { + column_families_not_found.insert(edit.column_family_); } else { - log_number = edit.log_number_; - have_log_number = true; + cfd = CreateColumnFamily(cf_options->second, &edit); + builders.insert({edit.column_family_, new Builder(cfd)}); + } + } else if (edit.is_column_family_drop_) { + if (cf_in_builders) { + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + delete builder->second; + builders.erase(builder); + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } else { + // who else can have reference to cfd!? 
+ assert(false); + } + } else if (cf_in_not_found) { + column_families_not_found.erase(edit.column_family_); + } else { + s = Status::Corruption( + "Manifest - dropping non-existing column family"); + break; + } + } else if (!cf_in_not_found) { + if (!cf_in_builders) { + s = Status::Corruption( + "Manifest record referencing unknown column family"); + break; + } + + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // this should never happen since cf_in_builders is true + assert(cfd != nullptr); + if (edit.max_level_ >= cfd->current()->NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + + // if it is not column family add or column family drop, + // then it's a file add/delete, which should be forwarded + // to builder + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + builder->second->Apply(&edit); + } + + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + Log(options_->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(edit.log_number_); + have_log_number = true; + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + s = Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + break; } } @@ -1854,6 +2003,10 @@ Status VersionSet::Recover() { have_next_file = true; } + if (edit.has_max_column_family_) { + max_column_family = edit.max_column_family_; + } + if (edit.has_last_sequence_) { last_sequence = edit.last_sequence_; have_last_sequence = true; @@ -1874,43 +2027,137 @@ Status VersionSet::Recover() { prev_log_number = 0; } + column_family_set_->UpdateMaxColumnFamily(max_column_family); + MarkFileNumberUsed(prev_log_number); MarkFileNumberUsed(log_number); } + // there were some column families in 
the MANIFEST that weren't specified + // in the argument + if (column_families_not_found.size() > 0) { + s = Status::InvalidArgument( + "Found unexpected column families. You have to specify all column " + "families when opening the DB"); + } + if (s.ok()) { - if (options_->max_open_files == -1) { + for (auto cfd : *column_family_set_) { + auto builders_iter = builders.find(cfd->GetID()); + assert(builders_iter != builders.end()); + auto builder = builders_iter->second; + + if (options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. - builder.LoadTableHandlers(); + builder->LoadTableHandlers(); + } + + Version* v = new Version(cfd, this, current_version_number_++); + builder->SaveTo(v); + + // Install recovered version + std::vector size_being_compacted(v->NumberLevels() - 1); + cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); + v->ComputeCompactionScore(size_being_compacted); + v->UpdateFilesBySize(); + AppendVersion(cfd, v); } - Version* v = new Version(this, current_version_number_++); - builder.SaveTo(v); - - // Install recovered version - std::vector size_being_compacted(v->NumberLevels() - 1); - compaction_picker_->SizeBeingCompacted(size_being_compacted); - v->ComputeCompactionScore(size_being_compacted); - manifest_file_size_ = manifest_file_size; - AppendVersion(v); - manifest_file_number_ = next_file; next_file_number_ = next_file + 1; last_sequence_ = last_sequence; - log_number_ = log_number; prev_log_number_ = prev_log_number; Log(options_->info_log, "Recovered from manifest file:%s succeeded," "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," - "prev_log_number is %lu\n", + "prev_log_number is %lu," + "max_column_family is %u\n", manifest_filename.c_str(), (unsigned long)manifest_file_number_, (unsigned long)next_file_number_, (unsigned long)last_sequence_, - (unsigned long)log_number_, - (unsigned 
long)prev_log_number_); + (unsigned long)log_number, + (unsigned long)prev_log_number_, + column_family_set_->GetMaxColumnFamily()); + + for (auto cfd : *column_family_set_) { + Log(options_->info_log, + "Column family \"%s\", log number is %" PRIu64 "\n", + cfd->GetName().c_str(), cfd->GetLogNumber()); + } + } + + for (auto builder : builders) { + delete builder.second; + } + + return s; +} + +Status VersionSet::ListColumnFamilies(std::vector* column_families, + const std::string& dbname, Env* env) { + // these are just for performance reasons, not correcntes, + // so we're fine using the defaults + EnvOptions soptions; + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string current; + Status s = ReadFileToString(env, CurrentFileName(dbname), ¤t); + if (!s.ok()) { + return s; + } + if (current.empty() || current[current.size()-1] != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + current.resize(current.size() - 1); + + std::string dscname = dbname + "/" + current; + unique_ptr file; + s = env->NewSequentialFile(dscname, &file, soptions); + if (!s.ok()) { + return s; + } + + std::map column_family_names; + // default column family is always implicitly there + column_family_names.insert({0, default_column_family_name}); + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(std::move(file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + if (edit.is_column_family_add_) { + if (column_family_names.find(edit.column_family_) != + column_family_names.end()) { + s = Status::Corruption("Manifest adding the same column family twice"); + break; + } + column_family_names.insert( + {edit.column_family_, edit.column_family_name_}); + } else if (edit.is_column_family_drop_) { + if 
(column_family_names.find(edit.column_family_) == + column_family_names.end()) { + s = Status::Corruption( + "Manifest - dropping non-existing column family"); + break; + } + column_family_names.erase(edit.column_family_); + } + } + + column_families->clear(); + if (s.ok()) { + for (const auto& iter : column_family_names) { + column_families->push_back(iter.second); + } } return s; @@ -1925,17 +2172,24 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, "Number of levels needs to be bigger than 1"); } - const InternalKeyComparator cmp(options->comparator); - TableCache tc(dbname, options, storage_options, 10); - VersionSet versions(dbname, options, storage_options, &tc, &cmp); + ColumnFamilyOptions cf_options(*options); + std::shared_ptr tc(NewLRUCache( + options->max_open_files - 10, options->table_cache_numshardbits, + options->table_cache_remove_scan_count_limit)); + VersionSet versions(dbname, options, storage_options, tc.get()); Status status; - status = versions.Recover(); + std::vector dummy; + ColumnFamilyDescriptor dummy_descriptor(default_column_family_name, + ColumnFamilyOptions(*options)); + dummy.push_back(dummy_descriptor); + status = versions.Recover(dummy); if (!status.ok()) { return status; } - Version* current_version = versions.current(); + Version* current_version = + versions.GetColumnFamilySet()->GetDefault()->current(); int current_levels = current_version->NumberLevels(); if (current_levels <= new_levels) { @@ -1985,18 +2239,12 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, VersionEdit ve; port::Mutex dummy_mutex; MutexLock l(&dummy_mutex); - return versions.LogAndApply(&ve, &dummy_mutex, nullptr, true); + return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), &ve, + &dummy_mutex, nullptr, true); } Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex) { - struct LogReporter : public log::Reader::Reporter { - Status* status; - virtual void 
Corruption(size_t bytes, const Status& s) { - if (this->status->ok()) *this->status = s; - } - }; - // Open the specified manifest file. unique_ptr file; Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); @@ -2004,19 +2252,26 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, return s; } - bool have_log_number = false; bool have_prev_log_number = false; bool have_next_file = false; bool have_last_sequence = false; uint64_t next_file = 0; uint64_t last_sequence = 0; - uint64_t log_number = 0; uint64_t prev_log_number = 0; int count = 0; - VersionSet::Builder builder(this, current_); + std::unordered_map comparators; + std::unordered_map builders; + + // add default column family + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(default_column_family_name); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); + builders.insert({0, new Builder(default_cfd)}); { - LogReporter reporter; + VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(std::move(file), &reporter, true/*checksum*/, 0/*initial_offset*/); @@ -2025,13 +2280,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, while (reader.ReadRecord(&record, &scratch) && s.ok()) { VersionEdit edit; s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument(icmp_.user_comparator()->Name(), - "does not match existing comparator " + - edit.comparator_); - } + if (!s.ok()) { + break; } // Write out each individual edit @@ -2041,13 +2291,59 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } count++; - if (s.ok()) { - builder.Apply(&edit); + bool cf_in_builders = + builders.find(edit.column_family_) != builders.end(); + + if (edit.has_comparator_) { + comparators.insert({edit.column_family_, 
edit.comparator_}); } - if (edit.has_log_number_) { - log_number = edit.log_number_; - have_log_number = true; + ColumnFamilyData* cfd = nullptr; + + if (edit.is_column_family_add_) { + if (cf_in_builders) { + s = Status::Corruption( + "Manifest adding the same column family twice"); + break; + } + cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); + builders.insert({edit.column_family_, new Builder(cfd)}); + } else if (edit.is_column_family_drop_) { + if (!cf_in_builders) { + s = Status::Corruption( + "Manifest - dropping non-existing column family"); + break; + } + auto builder_iter = builders.find(edit.column_family_); + delete builder_iter->second; + builders.erase(builder_iter); + comparators.erase(edit.column_family_); + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + assert(cfd != nullptr); + cfd->Unref(); + delete cfd; + cfd = nullptr; + } else { + if (!cf_in_builders) { + s = Status::Corruption( + "Manifest record referencing unknown column family"); + break; + } + + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // this should never happen since cf_in_builders is true + assert(cfd != nullptr); + + // if it is not column family add or column family drop, + // then it's a file add/delete, which should be forwarded + // to builder + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + builder->second->Apply(&edit); + } + + if (cfd != nullptr && edit.has_log_number_) { + cfd->SetLogNumber(edit.log_number_); } if (edit.has_prev_log_number_) { @@ -2064,6 +2360,10 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, last_sequence = edit.last_sequence_; have_last_sequence = true; } + + if (edit.has_max_column_family_) { + column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_); + } } } file.reset(); @@ -2072,9 +2372,6 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, if (!have_next_file) { s = Status::Corruption("no 
meta-nextfile entry in descriptor"); printf("no meta-nextfile entry in descriptor"); - } else if (!have_log_number) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - printf("no meta-lognumber entry in descriptor"); } else if (!have_last_sequence) { printf("no last-sequence-number entry in descriptor"); s = Status::Corruption("no last-sequence-number entry in descriptor"); @@ -2083,35 +2380,45 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, if (!have_prev_log_number) { prev_log_number = 0; } - - MarkFileNumberUsed(prev_log_number); - MarkFileNumberUsed(log_number); } if (s.ok()) { - Version* v = new Version(this, 0); - builder.SaveTo(v); + for (auto cfd : *column_family_set_) { + auto builders_iter = builders.find(cfd->GetID()); + assert(builders_iter != builders.end()); + auto builder = builders_iter->second; - // Install recovered version - std::vector size_being_compacted(v->NumberLevels() - 1); - compaction_picker_->SizeBeingCompacted(size_being_compacted); - v->ComputeCompactionScore(size_being_compacted); + Version* v = new Version(cfd, this, current_version_number_++); + builder->SaveTo(v); + std::vector size_being_compacted(v->NumberLevels() - 1); + cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); + v->ComputeCompactionScore(size_being_compacted); + v->UpdateFilesBySize(); + delete builder; + + printf("--------------- Column family \"%s\" (ID %u) --------------\n", + cfd->GetName().c_str(), (unsigned int)cfd->GetID()); + printf("log number: %lu\n", (unsigned long)cfd->GetLogNumber()); + auto comparator = comparators.find(cfd->GetID()); + if (comparator != comparators.end()) { + printf("comparator: %s\n", comparator->second.c_str()); + } else { + printf("comparator: \n"); + } + printf("%s \n", v->DebugString(hex).c_str()); + delete v; + } - AppendVersion(v); - manifest_file_number_ = next_file; next_file_number_ = next_file + 1; last_sequence_ = last_sequence; - log_number_ = log_number; 
prev_log_number_ = prev_log_number; - printf("manifest_file_number %lu next_file_number %lu last_sequence " - "%lu log_number %lu prev_log_number %lu\n", - (unsigned long)manifest_file_number_, - (unsigned long)next_file_number_, - (unsigned long)last_sequence, - (unsigned long)log_number, - (unsigned long)prev_log_number); - printf("%s \n", v->DebugString(hex).c_str()); + printf( + "next_file_number %lu last_sequence " + "%lu prev_log_number %lu max_column_family %u\n", + (unsigned long)next_file_number_, (unsigned long)last_sequence, + (unsigned long)prev_log_number, + column_family_set_->GetMaxColumnFamily()); } return s; @@ -2126,24 +2433,58 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? - // Save metadata - VersionEdit edit; - edit.SetComparatorName(icmp_.user_comparator()->Name()); + // WARNING: This method doesn't hold a mutex!! - // Save files - for (int level = 0; level < current_->NumberLevels(); level++) { - const auto& files = current_->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - const auto f = files[i]; - edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno); + // This is done without DB mutex lock held, but only within single-threaded + // LogAndApply. Column family manipulations can only happen within LogAndApply + // (the same single thread), so we're safe to iterate. 
+ for (auto cfd : *column_family_set_) { + { + // Store column family info + VersionEdit edit; + if (cfd->GetID() != 0) { + // default column family is always there, + // no need to explicitly write it + edit.AddColumnFamily(cfd->GetName()); + edit.SetColumnFamily(cfd->GetID()); + } + edit.SetComparatorName( + cfd->internal_comparator().user_comparator()->Name()); + std::string record; + edit.EncodeTo(&record); + Status s = log->AddRecord(record); + if (!s.ok()) { + return s; + } + } + + { + // Save files + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + + for (int level = 0; level < cfd->NumberLevels(); level++) { + for (const auto& f : cfd->current()->files_[level]) { + edit.AddFile(level, + f->number, + f->file_size, + f->smallest, + f->largest, + f->smallest_seqno, + f->largest_seqno); + } + } + edit.SetLogNumber(cfd->GetLogNumber()); + std::string record; + edit.EncodeTo(&record); + Status s = log->AddRecord(record); + if (!s.ok()) { + return s; + } } } - edit.SetLogNumber(log_number_); - std::string record; - edit.EncodeTo(&record); - return log->AddRecord(record); + return Status::OK(); } // Opens the mainfest file and reads all records @@ -2182,10 +2523,12 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { - if (icmp_.Compare(files[i]->largest, ikey) <= 0) { + if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <= + 0) { // Entire file is before "ikey", so just add the file size result += files[i]->file_size; - } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { + } else if (v->cfd_->internal_comparator().Compare(files[i]->smallest, + ikey) > 0) { // Entire file is after "ikey", so ignore if (level > 0) { // Files other than level 0 are sorted by meta->smallest, so @@ -2197,9 +2540,9 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const 
InternalKey& ikey) { // "ikey" falls in the range for this table. Add the // approximate offset of "ikey" within the table. TableReader* table_reader_ptr; - Iterator* iter = - table_cache_->NewIterator(ReadOptions(), storage_options_, icmp_, - *(files[i]), &table_reader_ptr); + Iterator* iter = v->cfd_->table_cache()->NewIterator( + ReadOptions(), storage_options_, v->cfd_->internal_comparator(), + *(files[i]), &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); } @@ -2213,43 +2556,36 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { void VersionSet::AddLiveFiles(std::vector* live_list) { // pre-calculate space requirement int64_t total_files = 0; - for (Version* v = dummy_versions_.next_; - v != &dummy_versions_; - v = v->next_) { - for (int level = 0; level < v->NumberLevels(); level++) { - total_files += v->files_[level].size(); + for (auto cfd : *column_family_set_) { + Version* dummy_versions = cfd->dummy_versions(); + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { + total_files += v->files_[level].size(); + } } } // just one time extension to the right size live_list->reserve(live_list->size() + total_files); - for (Version* v = dummy_versions_.next_; - v != &dummy_versions_; - v = v->next_) { - for (int level = 0; level < v->NumberLevels(); level++) { - for (const auto& f : v->files_[level]) { - live_list->push_back(f->number); + for (auto cfd : *column_family_set_) { + Version* dummy_versions = cfd->dummy_versions(); + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { + for (const auto& f : v->files_[level]) { + live_list->push_back(f->number); + } } } } } -Compaction* VersionSet::PickCompaction(LogBuffer* log_buffer) { - return compaction_picker_->PickCompaction(current_, 
log_buffer); -} - -Compaction* VersionSet::CompactRange(int input_level, int output_level, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end) { - return compaction_picker_->CompactRange(current_, input_level, output_level, - begin, end, compaction_end); -} - Iterator* VersionSet::MakeInputIterator(Compaction* c) { ReadOptions options; - options.verify_checksums = options_->verify_checksums_in_compaction; + options.verify_checksums = + c->column_family_data()->options()->verify_checksums_in_compaction; options.fill_cache = false; // Level-0 files have to be merged together. For other levels, @@ -2262,38 +2598,36 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { if (!c->inputs(which)->empty()) { if (c->level() + which == 0) { for (const auto& file : *c->inputs(which)) { - list[num++] = table_cache_->NewIterator( - options, storage_options_compactions_, icmp_, *file, nullptr, + list[num++] = c->column_family_data()->table_cache()->NewIterator( + options, storage_options_compactions_, + c->column_family_data()->internal_comparator(), *file, nullptr, true /* for compaction */); } } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator(icmp_, c->inputs(which)), - &GetFileIterator, table_cache_, options, storage_options_, icmp_, + new Version::LevelFileNumIterator( + c->column_family_data()->internal_comparator(), + c->inputs(which)), + &GetFileIterator, c->column_family_data()->table_cache(), options, + storage_options_, c->column_family_data()->internal_comparator(), true /* for compaction */); } } } assert(num <= space); - Iterator* result = NewMergingIterator(env_, &icmp_, list, num); + Iterator* result = NewMergingIterator( + &c->column_family_data()->internal_comparator(), list, num); delete[] list; return result; } -double VersionSet::MaxBytesForLevel(int level) { - return compaction_picker_->MaxBytesForLevel(level); -} - -uint64_t 
VersionSet::MaxFileSizeForLevel(int level) { - return compaction_picker_->MaxFileSizeForLevel(level); -} - // verify that the files listed in this compaction are present // in the current version bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { #ifndef NDEBUG - if (c->input_version() != current_) { + Version* version = c->column_family_data()->current(); + if (c->input_version() != version) { Log(options_->info_log, "VerifyCompactionFileConsistency version mismatch"); } @@ -2304,8 +2638,8 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { // look for this file in the current version bool found = false; - for (unsigned int j = 0; j < current_->files_[level].size(); j++) { - FileMetaData* f = current_->files_[level][j]; + for (unsigned int j = 0; j < version->files_[level].size(); j++) { + FileMetaData* f = version->files_[level][j]; if (f->number == number) { found = true; break; @@ -2322,8 +2656,8 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { // look for this file in the current version bool found = false; - for (unsigned int j = 0; j < current_->files_[level].size(); j++) { - FileMetaData* f = current_->files_[level][j]; + for (unsigned int j = 0; j < version->files_[level].size(); j++) { + FileMetaData* f = version->files_[level][j]; if (f->number == number) { found = true; break; @@ -2337,19 +2671,19 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { return true; // everything good } -void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { - compaction_picker_->ReleaseCompactionFiles(c, status); -} - Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, - FileMetaData** meta) { - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = current_->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - if (files[i]->number == number) { - *meta = files[i]; - *filelevel = level; - return Status::OK(); + FileMetaData** meta, + 
ColumnFamilyData** cfd) { + for (auto cfd_iter : *column_family_set_) { + Version* version = cfd_iter->current(); + for (int level = 0; level < version->NumberLevels(); level++) { + for (const auto& file : version->files_[level]) { + if (file->number == number) { + *meta = file; + *filelevel = level; + *cfd = cfd_iter; + return Status::OK(); + } } } } @@ -2357,27 +2691,42 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, } void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = current_->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - LiveFileMetaData filemetadata; - filemetadata.name = TableFileName("", files[i]->number); - filemetadata.level = level; - filemetadata.size = files[i]->file_size; - filemetadata.smallestkey = files[i]->smallest.user_key().ToString(); - filemetadata.largestkey = files[i]->largest.user_key().ToString(); - filemetadata.smallest_seqno = files[i]->smallest_seqno; - filemetadata.largest_seqno = files[i]->largest_seqno; - metadata->push_back(filemetadata); + for (auto cfd : *column_family_set_) { + for (int level = 0; level < cfd->NumberLevels(); level++) { + for (const auto& file : cfd->current()->files_[level]) { + LiveFileMetaData filemetadata; + filemetadata.name = TableFileName("", file->number); + filemetadata.level = level; + filemetadata.size = file->file_size; + filemetadata.smallestkey = file->smallest.user_key().ToString(); + filemetadata.largestkey = file->largest.user_key().ToString(); + filemetadata.smallest_seqno = file->smallest_seqno; + filemetadata.largest_seqno = file->largest_seqno; + metadata->push_back(filemetadata); + } } } } void VersionSet::GetObsoleteFiles(std::vector* files) { - files->insert(files->end(), - obsolete_files_.begin(), - obsolete_files_.end()); + files->insert(files->end(), obsolete_files_.begin(), obsolete_files_.end()); obsolete_files_.clear(); } +ColumnFamilyData* 
VersionSet::CreateColumnFamily( + const ColumnFamilyOptions& options, VersionEdit* edit) { + assert(edit->is_column_family_add_); + + Version* dummy_versions = new Version(nullptr, this); + auto new_cfd = column_family_set_->CreateColumnFamily( + edit->column_family_name_, edit->column_family_, dummy_versions, options); + + Version* v = new Version(new_cfd, this, current_version_number_++); + + AppendVersion(new_cfd, v); + new_cfd->CreateNewMemtable(); + new_cfd->SetLogNumber(edit->log_number_); + return new_cfd; +} + } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index 7d7cdf4fc..d3bd97f3f 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -24,12 +24,15 @@ #include #include #include +#include #include "db/dbformat.h" #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" #include "db/compaction.h" #include "db/compaction_picker.h" +#include "db/column_family.h" +#include "db/log_reader.h" namespace rocksdb { @@ -41,10 +44,12 @@ class Iterator; class LogBuffer; class LookupKey; class MemTable; -class MergeContext; -class TableCache; class Version; class VersionSet; +class MergeContext; +class ColumnFamilyData; +class ColumnFamilySet; +class TableCache; // Return the smallest index i such that files[i]->largest >= key. // Return files.size() if there is no such file. @@ -208,6 +213,7 @@ class Version { friend class Compaction; friend class VersionSet; friend class DBImpl; + friend class ColumnFamilyData; friend class CompactionPicker; friend class LevelCompactionPicker; friend class UniversalCompactionPicker; @@ -223,6 +229,7 @@ class Version { // record results in files_by_size_. The largest files are listed first. 
void UpdateFilesBySize(); + ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list @@ -268,7 +275,7 @@ class Version { // used for debugging and logging purposes only. uint64_t version_number_; - explicit Version(VersionSet* vset, uint64_t version_number = 0); + Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); ~Version(); @@ -285,22 +292,29 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, TableCache* table_cache, - const InternalKeyComparator*); + VersionSet(const std::string& dbname, const DBOptions* options, + const EnvOptions& storage_options, Cache* table_cache); ~VersionSet(); // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new // current version. Will release *mu while actually writing to the file. + // column_family_options has to be set if edit is column family add // REQUIRES: *mu is held on entry. // REQUIRES: no other thread concurrently calls LogAndApply() - Status LogAndApply(VersionEdit* edit, port::Mutex* mu, - Directory* db_directory = nullptr, - bool new_descriptor_log = false); + Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit, + port::Mutex* mu, Directory* db_directory = nullptr, + bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = + nullptr); // Recover the last saved descriptor from persistent storage. - Status Recover(); + Status Recover(const std::vector& column_families); + + // Reads a manifest file and returns a list of column families in + // column_families. 
+ static Status ListColumnFamilies(std::vector* column_families, + const std::string& dbname, Env* env); // Try to reduce the number of levels. This call is valid when // only one level from the new max level to the old @@ -316,15 +330,6 @@ class VersionSet { const EnvOptions& storage_options, int new_levels); - // Return the current version. - Version* current() const { return current_; } - - // A Flag indicating whether write needs to slowdown because of there are - // too many number of level0 files. - bool NeedSlowdownForNumLevel0Files() const { - return need_slowdown_for_num_level0_files_; - } - // Return the current manifest file number uint64_t ManifestFileNumber() const { return manifest_file_number_; } @@ -358,37 +363,21 @@ class VersionSet { // Mark the specified file number as used. void MarkFileNumberUsed(uint64_t number); - // Return the current log file number. - uint64_t LogNumber() const { return log_number_; } - // Return the log file number for the log file that is currently // being compacted, or zero if there is no such log file. uint64_t PrevLogNumber() const { return prev_log_number_; } - int NumberLevels() const { return num_levels_; } - - // Pick level and inputs for a new compaction. - // Returns nullptr if there is no compaction to be done. - // Otherwise returns a pointer to a heap-allocated object that - // describes the compaction. Caller should delete the result. - Compaction* PickCompaction(LogBuffer* log_buffer); - - // Return a compaction object for compacting the range [begin,end] in - // the specified level. Returns nullptr if there is nothing in that - // level that overlaps the specified range. Caller should delete - // the result. - // - // The returned Compaction might not include the whole requested range. - // In that case, compaction_end will be set to the next key that needs - // compacting. In case the compaction will compact the whole range, - // compaction_end will be set to nullptr. 
- // Client is responsible for compaction_end storage -- when called, - // *compaction_end should point to valid InternalKey! - Compaction* CompactRange(int input_level, - int output_level, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end); + // Returns the minimum log number such that all + // log numbers less than or equal to it can be deleted + uint64_t MinLogNumber() const { + uint64_t min_log_num = std::numeric_limits::max(); + for (auto cfd : *column_family_set_) { + if (min_log_num > cfd->GetLogNumber()) { + min_log_num = cfd->GetLogNumber(); + } + } + return min_log_num; + } // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. @@ -414,62 +403,53 @@ class VersionSet { // pick the same files to compact. bool VerifyCompactionFileConsistency(Compaction* c); - double MaxBytesForLevel(int level); - - // Get the max file size in a given level. - uint64_t MaxFileSizeForLevel(int level); - - void ReleaseCompactionFiles(Compaction* c, Status status); - - Status GetMetadataForFile( - uint64_t number, int *filelevel, FileMetaData **metadata); + Status GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData** metadata, ColumnFamilyData** cfd); void GetLiveFilesMetaData( std::vector *metadata); void GetObsoleteFiles(std::vector* files); + ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + private: class Builder; struct ManifestWriter; - friend class Compaction; friend class Version; + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t bytes, const Status& s) { + if (this->status->ok()) *this->status = s; + } + }; + // Save current contents to *log Status WriteSnapshot(log::Writer* log); - void AppendVersion(Version* v); + void AppendVersion(ColumnFamilyData* column_family_data, Version* v); bool ManifestContains(uint64_t manifest_file_number, const std::string& record) 
const; + ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options, + VersionEdit* edit); + + std::unique_ptr column_family_set_; + Env* const env_; const std::string dbname_; - const Options* const options_; - TableCache* const table_cache_; - const InternalKeyComparator icmp_; + const DBOptions* const options_; uint64_t next_file_number_; uint64_t manifest_file_number_; uint64_t pending_manifest_file_number_; std::atomic last_sequence_; - uint64_t log_number_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted - int num_levels_; - // Opened lazily unique_ptr descriptor_log_; - Version dummy_versions_; // Head of circular doubly-linked list of versions. - Version* current_; // == dummy_versions_.prev_ - - // A flag indicating whether we should delay writes because - // we have too many level 0 files - bool need_slowdown_for_num_level0_files_; - - // An object that keeps all the compaction stats - // and picks the next compaction - std::unique_ptr compaction_picker_; // generates a increasing version number for every new version uint64_t current_version_number_; @@ -493,8 +473,9 @@ class VersionSet { VersionSet(const VersionSet&); void operator=(const VersionSet&); - void LogAndApplyHelper(Builder*b, Version* v, - VersionEdit* edit, port::Mutex* mu); + void LogAndApplyCFHelper(VersionEdit* edit); + void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v, + VersionEdit* edit, port::Mutex* mu); }; } // namespace rocksdb diff --git a/db/write_batch.cc b/db/write_batch.cc index 352f57894..8fffdbfbd 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -15,6 +15,9 @@ // kTypeValue varstring varstring // kTypeMerge varstring varstring // kTypeDeletion varstring +// kTypeColumnFamilyValue varint32 varstring varstring +// kTypeColumnFamilyMerge varint32 varstring varstring +// kTypeColumnFamilyDeletion varint32 varstring varstring // varstring := // len: varint32 // data: uint8[len] @@ -45,10 +48,20 @@ 
WriteBatch::~WriteBatch() { } WriteBatch::Handler::~Handler() { } +void WriteBatch::Handler::Put(const Slice& key, const Slice& value) { + // you need to either implement Put or PutCF + throw std::runtime_error("Handler::Put not implemented!"); +} + void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { throw std::runtime_error("Handler::Merge not implemented!"); } +void WriteBatch::Handler::Delete(const Slice& key) { + // you need to either implement Delete or DeleteCF + throw std::runtime_error("Handler::Delete not implemented!"); +} + void WriteBatch::Handler::LogData(const Slice& blob) { // If the user has not specified something to do with blobs, then we ignore // them. @@ -76,31 +89,48 @@ Status WriteBatch::Iterate(Handler* handler) const { input.remove_prefix(kHeader); Slice key, value, blob; int found = 0; - while (!input.empty() && handler->Continue()) { + Status s; + while (s.ok() && !input.empty() && handler->Continue()) { char tag = input[0]; input.remove_prefix(1); + uint32_t column_family = 0; // default switch (tag) { + case kTypeColumnFamilyValue: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Put"); + } + // intentional fallthrough case kTypeValue: if (GetLengthPrefixedSlice(&input, &key) && GetLengthPrefixedSlice(&input, &value)) { - handler->Put(key, value); + s = handler->PutCF(column_family, key, value); found++; } else { return Status::Corruption("bad WriteBatch Put"); } break; + case kTypeColumnFamilyDeletion: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Delete"); + } + // intentional fallthrough case kTypeDeletion: if (GetLengthPrefixedSlice(&input, &key)) { - handler->Delete(key); + s = handler->DeleteCF(column_family, key); found++; } else { return Status::Corruption("bad WriteBatch Delete"); } break; + case kTypeColumnFamilyMerge: + if (!GetVarint32(&input, &column_family)) { + return Status::Corruption("bad WriteBatch Merge"); + } + 
// intentional fallthrough case kTypeMerge: if (GetLengthPrefixedSlice(&input, &key) && GetLengthPrefixedSlice(&input, &value)) { - handler->Merge(key, value); + s = handler->MergeCF(column_family, key, value); found++; } else { return Status::Corruption("bad WriteBatch Merge"); @@ -117,7 +147,10 @@ Status WriteBatch::Iterate(Handler* handler) const { return Status::Corruption("unknown WriteBatch tag"); } } - if (found != WriteBatchInternal::Count(this)) { + if (!s.ok()) { + return s; + } + if (found != WriteBatchInternal::Count(this)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); @@ -140,29 +173,76 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { EncodeFixed64(&b->rep_[0], seq); } -void WriteBatch::Put(const Slice& key, const Slice& value) { +void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + uint32_t column_family_id = 0; + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + column_family_id = cfh->GetID(); + } + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeValue)); + if (column_family_id == 0) { + rep_.push_back(static_cast(kTypeValue)); + } else { + rep_.push_back(static_cast(kTypeColumnFamilyValue)); + PutVarint32(&rep_, column_family_id); + } PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, value); } -void WriteBatch::Put(const SliceParts& key, const SliceParts& value) { +void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) { + uint32_t column_family_id = 0; + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + column_family_id = cfh->GetID(); + } + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeValue)); + if (column_family_id == 0) { + rep_.push_back(static_cast(kTypeValue)); + } else { + 
rep_.push_back(static_cast(kTypeColumnFamilyValue)); + PutVarint32(&rep_, column_family_id); + } PutLengthPrefixedSliceParts(&rep_, key); PutLengthPrefixedSliceParts(&rep_, value); } -void WriteBatch::Delete(const Slice& key) { +void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) { + uint32_t column_family_id = 0; + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + column_family_id = cfh->GetID(); + } + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeDeletion)); + if (column_family_id == 0) { + rep_.push_back(static_cast(kTypeDeletion)); + } else { + rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); + PutVarint32(&rep_, column_family_id); + } PutLengthPrefixedSlice(&rep_, key); } -void WriteBatch::Merge(const Slice& key, const Slice& value) { +void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + uint32_t column_family_id = 0; + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + column_family_id = cfh->GetID(); + } + WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast(kTypeMerge)); + if (column_family_id == 0) { + rep_.push_back(static_cast(kTypeMerge)); + } else { + rep_.push_back(static_cast(kTypeColumnFamilyMerge)); + PutVarint32(&rep_, column_family_id); + } PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, value); } @@ -176,33 +256,70 @@ namespace { class MemTableInserter : public WriteBatch::Handler { public: SequenceNumber sequence_; - MemTable* mem_; - const Options* options_; + ColumnFamilyMemTables* cf_mems_; + bool recovery_; + uint64_t log_number_; DBImpl* db_; - const bool filter_deletes_; + const bool dont_filter_deletes_; - MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts, - DB* db, const bool filter_deletes) - : sequence_(sequence), - mem_(mem), - options_(opts), 
- db_(reinterpret_cast(db)), - filter_deletes_(filter_deletes) { - assert(mem_); - if (filter_deletes_) { - assert(options_); + MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems, + bool recovery, uint64_t log_number, DB* db, + const bool dont_filter_deletes) + : sequence_(sequence), + cf_mems_(cf_mems), + recovery_(recovery), + log_number_(log_number), + db_(reinterpret_cast(db)), + dont_filter_deletes_(dont_filter_deletes) { + assert(cf_mems); + if (!dont_filter_deletes_) { assert(db_); } } - virtual void Put(const Slice& key, const Slice& value) { - if (!options_->inplace_update_support) { - mem_->Add(sequence_, kTypeValue, key, value); - } else if (options_->inplace_callback == nullptr) { - mem_->Update(sequence_, key, value); - RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); + bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { + bool found = cf_mems_->Seek(column_family_id); + if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) { + // if in recovery environment: + // * If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- we + // just ignore the update. + // * If log_number_ < cf_mems_->GetLogNumber(), this means that column + // family already contains updates from this log. We can't apply updates + // twice because of update-in-place or merge workloads -- ignore the + // update + *s = Status::OK(); + return false; + } + if (!found) { + assert(!recovery_); + // If the column family was not found in non-recovery environment + // (client's write code-path), we have to fail the write and return + // the failure status to the client.
+ *s = Status::InvalidArgument( + "Invalid column family specified in write batch"); + return false; + } + return true; + } + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + Status seek_status; + if (!SeekToColumnFamily(column_family_id, &seek_status)) { + ++sequence_; + return seek_status; + } + MemTable* mem = cf_mems_->GetMemTable(); + const Options* options = cf_mems_->GetOptions(); + if (!options->inplace_update_support) { + mem->Add(sequence_, kTypeValue, key, value); + } else if (options->inplace_callback == nullptr) { + mem->Update(sequence_, key, value); + RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); } else { - if (mem_->UpdateCallback(sequence_, key, value, *options_)) { + if (mem->UpdateCallback(sequence_, key, value, *options)) { } else { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; @@ -212,21 +329,26 @@ class MemTableInserter : public WriteBatch::Handler { std::string prev_value; std::string merged_value; - Status s = db_->Get(ropts, key, &prev_value); + + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + Status s = db_->Get(ropts, cf_handle, key, &prev_value); + char* prev_buffer = const_cast(prev_value.c_str()); uint32_t prev_size = prev_value.size(); - auto status = - options_->inplace_callback(s.ok() ? prev_buffer: nullptr, - s.ok() ? &prev_size: nullptr, - value, &merged_value); + auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. 
- mem_->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); - RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); + mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); } else if (status == UpdateStatus::UPDATED) { // merged_value contains the final value. - mem_->Add(sequence_, kTypeValue, key, Slice(merged_value)); - RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); + mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); + RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); } } } @@ -234,19 +356,28 @@ class MemTableInserter : public WriteBatch::Handler { // sequence number. Even if the update eventually fails and does not result // in memtable add/update. sequence_++; + return Status::OK(); } - virtual void Merge(const Slice& key, const Slice& value) { + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + Status seek_status; + if (!SeekToColumnFamily(column_family_id, &seek_status)) { + ++sequence_; + return seek_status; + } + MemTable* mem = cf_mems_->GetMemTable(); + const Options* options = cf_mems_->GetOptions(); bool perform_merge = false; - if (options_->max_successive_merges > 0 && db_ != nullptr) { + if (options->max_successive_merges > 0 && db_ != nullptr) { LookupKey lkey(key, sequence_); // Count the number of successive merges at the head // of the key in the memtable - size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey); + size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); - if (num_merges >= options_->max_successive_merges) { + if (num_merges >= options->max_successive_merges) { perform_merge = true; } } @@ -262,62 +393,78 @@ class MemTableInserter : public WriteBatch::Handler { ReadOptions read_options; read_options.snapshot = &read_from_snapshot; - db_->Get(read_options, key, &get_value); + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) 
{ + cf_handle = db_->DefaultColumnFamily(); + } + db_->Get(read_options, cf_handle, key, &get_value); Slice get_value_slice = Slice(get_value); // 2) Apply this merge - auto merge_operator = options_->merge_operator.get(); + auto merge_operator = options->merge_operator.get(); assert(merge_operator); std::deque operands; operands.push_front(value.ToString()); std::string new_value; - if (!merge_operator->FullMerge(key, - &get_value_slice, - operands, - &new_value, - options_->info_log.get())) { + if (!merge_operator->FullMerge(key, &get_value_slice, operands, + &new_value, options->info_log.get())) { // Failed to merge! - RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES); + RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES); // Store the delta in memtable perform_merge = false; } else { // 3) Add value to memtable - mem_->Add(sequence_, kTypeValue, key, new_value); + mem->Add(sequence_, kTypeValue, key, new_value); } } if (!perform_merge) { // Add merge operator to memtable - mem_->Add(sequence_, kTypeMerge, key, value); + mem->Add(sequence_, kTypeMerge, key, value); } sequence_++; + return Status::OK(); } - virtual void Delete(const Slice& key) { - if (filter_deletes_) { + + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + Status seek_status; + if (!SeekToColumnFamily(column_family_id, &seek_status)) { + ++sequence_; + return seek_status; + } + MemTable* mem = cf_mems_->GetMemTable(); + const Options* options = cf_mems_->GetOptions(); + if (!dont_filter_deletes_ && options->filter_deletes) { SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; ReadOptions ropts; ropts.snapshot = &read_from_snapshot; std::string value; - if (!db_->KeyMayExist(ropts, key, &value)) { - RecordTick(options_->statistics.get(), NUMBER_FILTERED_DELETES); - return; + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + if (!db_->KeyMayExist(ropts, 
cf_handle, key, &value)) { + RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES); + return Status::OK(); } } - mem_->Add(sequence_, kTypeDeletion, key, Slice()); + mem->Add(sequence_, kTypeDeletion, key, Slice()); sequence_++; + return Status::OK(); } }; } // namespace -Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem, - const Options* opts, DB* db, - const bool filter_deletes) { - MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db, - filter_deletes); +Status WriteBatchInternal::InsertInto(const WriteBatch* b, + ColumnFamilyMemTables* memtables, + bool recovery, uint64_t log_number, + DB* db, const bool dont_filter_deletes) { + MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables, + recovery, log_number, db, dont_filter_deletes); return b->Iterate(&inserter); } diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index b8991732f..793ee3e0e 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -17,6 +17,49 @@ namespace rocksdb { class MemTable; +class ColumnFamilyMemTables { + public: + virtual ~ColumnFamilyMemTables() {} + virtual bool Seek(uint32_t column_family_id) = 0; + // returns true if the update to memtable should be ignored + // (useful when recovering from log whose updates have already + // been processed) + virtual uint64_t GetLogNumber() const = 0; + virtual MemTable* GetMemTable() const = 0; + virtual const Options* GetOptions() const = 0; + virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; +}; + +class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { + public: + ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options) + : ok_(false), mem_(mem), options_(options) {} + + bool Seek(uint32_t column_family_id) override { + ok_ = (column_family_id == 0); + return ok_; + } + + uint64_t GetLogNumber() const override { return 0; } + + MemTable* GetMemTable() const override { + assert(ok_); + return mem_; + } + + const 
Options* GetOptions() const override { + assert(ok_); + return options_; + } + + ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } + + private: + bool ok_; + MemTable* mem_; + const Options* const options_; +}; + // WriteBatchInternal provides static methods for manipulating a // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { @@ -45,11 +88,21 @@ class WriteBatchInternal { static void SetContents(WriteBatch* batch, const Slice& contents); // Inserts batch entries into memtable - // Drops deletes in batch if filter_del is set to true and - // db->KeyMayExist returns false - static Status InsertInto(const WriteBatch* batch, MemTable* memtable, - const Options* opts, DB* db = nullptr, - const bool filter_del = false); + // If dont_filter_deletes is false AND options.filter_deletes is true, + // then --> Drops deletes in batch if db->KeyMayExist returns false + // If recovery == true, this means InsertInto is executed on a recovery + // code-path. WriteBatch referencing a dropped column family can be + // found on a recovery code-path and should be ignored (recovery should not + // fail). Additionally, the memtable will be updated only if + // memtables->GetLogNumber() >= log_number + // However, if recovery == false, any WriteBatch referencing + // non-existing column family will return a failure. 
Also, log_number is + // ignored in that case + static Status InsertInto(const WriteBatch* batch, + ColumnFamilyMemTables* memtables, + bool recovery = false, uint64_t log_number = 0, + DB* db = nullptr, + const bool dont_filter_deletes = true); static void Append(WriteBatch* dst, const WriteBatch* src); }; diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index d3454c343..c2f412c59 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -11,6 +11,7 @@ #include #include "db/memtable.h" +#include "db/column_family.h" #include "db/write_batch_internal.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" @@ -27,7 +28,8 @@ static std::string PrintContents(WriteBatch* b) { MemTable* mem = new MemTable(cmp, options); mem->Ref(); std::string state; - Status s = WriteBatchInternal::InsertInto(b, mem, &options); + ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); + Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; Iterator* iter = mem->NewIterator(); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -144,17 +146,37 @@ TEST(WriteBatchTest, Append) { namespace { struct TestHandler : public WriteBatch::Handler { std::string seen; - virtual void Put(const Slice& key, const Slice& value) { - seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "PutCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; + } + return Status::OK(); } - virtual void Merge(const Slice& key, const Slice& value) { - seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + seen += "Merge(" + key.ToString() + ", " + 
value.ToString() + ")"; + } else { + seen += "MergeCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; + } + return Status::OK(); } virtual void LogData(const Slice& blob) { seen += "LogData(" + blob.ToString() + ")"; } - virtual void Delete(const Slice& key) { - seen += "Delete(" + key.ToString() + ")"; + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + seen += "Delete(" + key.ToString() + ")"; + } else { + seen += "DeleteCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ")"; + } + return Status::OK(); } }; } @@ -194,21 +216,23 @@ TEST(WriteBatchTest, Continue) { struct Handler : public TestHandler { int num_seen = 0; - virtual void Put(const Slice& key, const Slice& value) { + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { ++num_seen; - TestHandler::Put(key, value); + return TestHandler::PutCF(column_family_id, key, value); } - virtual void Merge(const Slice& key, const Slice& value) { + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { ++num_seen; - TestHandler::Merge(key, value); + return TestHandler::MergeCF(column_family_id, key, value); } virtual void LogData(const Slice& blob) { ++num_seen; TestHandler::LogData(blob); } - virtual void Delete(const Slice& key) { + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { ++num_seen; - TestHandler::Delete(key); + return TestHandler::DeleteCF(column_family_id, key); } virtual bool Continue() override { return num_seen < 3; @@ -256,6 +280,42 @@ TEST(WriteBatchTest, PutGatherSlices) { ASSERT_EQ(3, batch.Count()); } +namespace { +class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { + public: + ColumnFamilyHandleImplDummy(int id) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} + uint32_t GetID() const override { return id_; } + + private: + uint32_t id_; +}; +} // 
namespace anonymous + +TEST(WriteBatchTest, ColumnFamiliesBatchTest) { + WriteBatch batch; + ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); + batch.Put(&zero, Slice("foo"), Slice("bar")); + batch.Put(&two, Slice("twofoo"), Slice("bar2")); + batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); + batch.Delete(&eight, Slice("eightfoo")); + batch.Merge(&three, Slice("threethree"), Slice("3three")); + batch.Put(&zero, Slice("foo"), Slice("bar")); + batch.Merge(Slice("omom"), Slice("nom")); + + TestHandler handler; + batch.Iterate(&handler); + ASSERT_EQ( + "Put(foo, bar)" + "PutCF(2, twofoo, bar2)" + "PutCF(8, eightfoo, bar8)" + "DeleteCF(8, eightfoo)" + "MergeCF(3, threethree, 3three)" + "Put(foo, bar)" + "Merge(omom, nom)", + handler.seen); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a6bc90085..7d4a374d9 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -243,6 +243,7 @@ extern void rocksdb_options_set_paranoid_checks( rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); +extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int); extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); @@ -275,6 +276,8 @@ extern void rocksdb_options_set_expanded_compaction_factor( rocksdb_options_t*, int); extern void rocksdb_options_set_max_grandparent_overlap_factor( rocksdb_options_t*, int); +extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t*, int* level_values, size_t num_levels); extern void rocksdb_options_enable_statistics(rocksdb_options_t*); extern void 
rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int); @@ -330,10 +333,14 @@ extern void rocksdb_options_set_block_size_deviation( rocksdb_options_t*, int); extern void rocksdb_options_set_advise_random_on_open( rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_access_hint_on_compaction_start( + rocksdb_options_t*, int); extern void rocksdb_options_set_use_adaptive_mutex( rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_bytes_per_sync( rocksdb_options_t*, uint64_t); +extern void rocksdb_options_set_verify_checksums_in_compaction( + rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_filter_deletes( rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_max_sequential_skip_in_iterations( @@ -348,6 +355,7 @@ extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*); extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*); extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t); extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t); +extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t); extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n); extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec); @@ -360,6 +368,16 @@ extern void rocksdb_options_set_memtable_prefix_bloom_probes( rocksdb_options_t*, uint32_t); extern void rocksdb_options_set_max_successive_merges( rocksdb_options_t*, size_t); +extern void rocksdb_options_set_min_partial_merge_operands( + rocksdb_options_t*, uint32_t); +extern void rocksdb_options_set_bloom_locality( + rocksdb_options_t*, uint32_t); +extern void rocksdb_options_set_allow_thread_local( + rocksdb_options_t*, unsigned char); +extern void rocksdb_options_set_inplace_update_support( + rocksdb_options_t*, unsigned char); +extern void 
rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t*, size_t); enum { rocksdb_no_compression = 0, diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 1f8d2f37e..2159d35ca 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "rocksdb/iterator.h" #include "rocksdb/options.h" @@ -23,8 +24,24 @@ namespace rocksdb { using std::unique_ptr; +class ColumnFamilyHandle { + public: + virtual ~ColumnFamilyHandle() {} +}; +extern const std::string default_column_family_name; + +struct ColumnFamilyDescriptor { + std::string name; + ColumnFamilyOptions options; + ColumnFamilyDescriptor() + : name(default_column_family_name), options(ColumnFamilyOptions()) {} + ColumnFamilyDescriptor(const std::string& name, + const ColumnFamilyOptions& options) + : name(name), options(options) {} +}; + // Update Makefile if you change these -static const int kMajorVersion = 2; +static const int kMajorVersion = 3; static const int kMinorVersion = 0; struct Options; @@ -87,33 +104,80 @@ class DB { // that modify data, like put/delete, will return error. // If the db is opened in read only mode, then no compactions // will happen. + // TODO(icanadi): implement OpenForReadOnly that specifies column families. + // User can open DB in read-only mode even if not specifying all column + // families static Status OpenForReadOnly(const Options& options, const std::string& name, DB** dbptr, bool error_if_log_file_exist = false); + // Open DB with column families. + // db_options specify database specific options + // column_families is the vector of all column families you'd like to open, + // containing column family name and options. The default column family name + // is 'default'. 
+ // If everything is OK, handles will on return be the same size + // as column_families --- handles[i] will be a handle that you + // will use to operate on column family column_family[i] + static Status Open(const DBOptions& db_options, const std::string& name, + const std::vector& column_families, + std::vector* handles, DB** dbptr); + + // ListColumnFamilies will open the DB specified by argument name + // and return the list of all column families in that DB + // through column_families argument. The ordering of + // column families in column_families is unspecified. + static Status ListColumnFamilies(const DBOptions& db_options, + const std::string& name, + std::vector* column_families); + DB() { } virtual ~DB(); + // Create a column_family and return the handle of column family + // through the argument handle. + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle); + + // Drop a column family specified by column_family handle. This call + // only records a drop record in the manifest and prevents the column + // family from flushing and compacting. + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family); + // Set the database entry for "key" to "value". // Returns OK on success, and a non-OK status on error. // Note: consider setting options.sync = true. virtual Status Put(const WriteOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) = 0; + Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) { + return Put(options, DefaultColumnFamily(), key, value); + } // Remove the database entry (if any) for "key". Returns OK on // success, and a non-OK status on error. It is not an error if "key" // did not exist in the database. // Note: consider setting options.sync = true. 
- virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) = 0; + Status Delete(const WriteOptions& options, const Slice& key) { + return Delete(options, DefaultColumnFamily(), key); + } // Merge the database entry for "key" with "value". Returns OK on success, // and a non-OK status on error. The semantics of this operation is // determined by the user provided merge_operator when opening DB. // Note: consider setting options.sync = true. virtual Status Merge(const WriteOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) = 0; + Status Merge(const WriteOptions& options, const Slice& key, + const Slice& value) { + return Merge(options, DefaultColumnFamily(), key, value); + } // Apply the specified updates to the database. // Returns OK on success, non-OK on failure. @@ -128,8 +192,11 @@ class DB { // // May return some other Status on an error. virtual Status Get(const ReadOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, std::string* value) = 0; + Status Get(const ReadOptions& options, const Slice& key, std::string* value) { + return Get(options, DefaultColumnFamily(), key, value); + } // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and @@ -141,9 +208,17 @@ class DB { // Similarly, the number of returned statuses will be the number of keys. // Note: keys will not be "de-duplicated". Duplicate keys will return // duplicate values in order. 
- virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values) = 0; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) = 0; + std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values) { + return MultiGet(options, std::vector( + keys.size(), DefaultColumnFamily()), + keys, values); + } // If the key definitely does not exist in the database, then this method // returns false, else true. If the caller wants to obtain value when the key @@ -153,14 +228,17 @@ class DB { // to make this lighter weight is to avoid doing any IOs. // Default implementation here returns true and sets 'value_found' to false virtual bool KeyMayExist(const ReadOptions& options, - const Slice& key, - std::string* value, - bool* value_found = nullptr) { + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) { if (value_found != nullptr) { *value_found = false; } return true; } + bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found); + } // Return a heap-allocated iterator over the contents of the database. // The result of NewIterator() is initially invalid (caller must @@ -168,7 +246,18 @@ class DB { // // Caller should delete the iterator when it is no longer needed. // The returned iterator should be deleted before this db is deleted. - virtual Iterator* NewIterator(const ReadOptions& options) = 0; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) = 0; + Iterator* NewIterator(const ReadOptions& options) { + return NewIterator(options, DefaultColumnFamily()); + } + // Returns iterators from a consistent database state across multiple + // column families. 
Iterators are heap allocated and need to be deleted + // before the db is deleted + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) = 0; // Return a handle to the current DB state. Iterators created with // this handle will all observe a stable snapshot of the current DB @@ -194,7 +283,11 @@ class DB { // about the internal operation of the DB. // "rocksdb.sstables" - returns a multi-line string that describes all // of the sstables that make up the db contents. - virtual bool GetProperty(const Slice& property, std::string* value) = 0; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) = 0; + bool GetProperty(const Slice& property, std::string* value) { + return GetProperty(DefaultColumnFamily(), property, value); + } // For each i in [0,n-1], store in "sizes[i]", the approximate // file system space used by keys in "[range[i].start .. range[i].limit)". @@ -204,8 +297,12 @@ class DB { // sizes will be one-tenth the size of the corresponding user data size. // // The results may not include the sizes of recently written data. - virtual void GetApproximateSizes(const Range* range, int n, + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) = 0; + void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) { + GetApproximateSizes(DefaultColumnFamily(), range, n, sizes); + } // Compact the underlying storage for the key range [*begin,*end]. // The actual compaction interval might be superset of [*begin, *end]. @@ -224,19 +321,32 @@ class DB { // hosting all the files. In this case, client could set reduce_level // to true, to move the files back to the minimum level capable of holding // the data set or a given level (specified by non-negative target_level). 
- virtual Status CompactRange(const Slice* begin, const Slice* end, + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, bool reduce_level = false, int target_level = -1) = 0; + Status CompactRange(const Slice* begin, const Slice* end, + bool reduce_level = false, int target_level = -1) { + return CompactRange(DefaultColumnFamily(), begin, end, reduce_level, + target_level); + } // Number of levels used for this DB. - virtual int NumberLevels() = 0; + virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; + int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } // Maximum level to which a new compacted memtable is pushed if it // does not create overlap. - virtual int MaxMemCompactionLevel() = 0; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0; + int MaxMemCompactionLevel() { + return MaxMemCompactionLevel(DefaultColumnFamily()); + } // Number of files in level-0 that would stop writes. - virtual int Level0StopWriteTrigger() = 0; + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0; + int Level0StopWriteTrigger() { + return Level0StopWriteTrigger(DefaultColumnFamily()); + } // Get DB name -- the exact same name that was provided as an argument to // DB::Open() @@ -246,10 +356,18 @@ class DB { virtual Env* GetEnv() const = 0; // Get DB Options that we use - virtual const Options& GetOptions() const = 0; + virtual const Options& GetOptions(ColumnFamilyHandle* column_family) + const = 0; + const Options& GetOptions() const { + return GetOptions(DefaultColumnFamily()); + } // Flush all mem-table data. - virtual Status Flush(const FlushOptions& options) = 0; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) = 0; + Status Flush(const FlushOptions& options) { + return Flush(options, DefaultColumnFamily()); + } // Prevent file deletions. 
Compactions will continue to occur,
   // but no obsolete files will be deleted. Calling this multiple
@@ -279,9 +397,12 @@
   // Setting flush_memtable to true does Flush before recording the live files.
   // Setting flush_memtable to false is useful when we don't want to wait for
   // flush which may have to wait for compaction to complete taking an
-  // indeterminate time. But this will have to use GetSortedWalFiles after
-  // GetLiveFiles to compensate for memtables missed in this snapshot due to the
-  // absence of Flush, by WAL files to recover the database consistently later
+  // indeterminate time.
+  //
+  // In case you have multiple column families, even if flush_memtable is true,
+  // you still need to call GetSortedWalFiles after GetLiveFiles to compensate
+  // for new data that arrived to already-flushed column families while other
+  // column families were flushing
   virtual Status GetLiveFiles(std::vector<std::string>&,
                               uint64_t* manifest_file_size,
                               bool flush_memtable = true) = 0;
@@ -319,7 +440,14 @@ class DB {
   // be set properly
   virtual Status GetDbIdentity(std::string& identity) = 0;

-  virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) = 0;
+  // Returns default column family handle
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) = 0;
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+    return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+  }

  private:
   // No copying allowed
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index a64425174..7a989d29c 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -34,7 +34,7 @@ class Slice;
 class WritableFile;
 class RandomRWFile;
 class Directory;
-struct Options;
+struct DBOptions;

 using std::unique_ptr;
 using std::shared_ptr;
@@ -47,7 +47,7 @@ struct EnvOptions {
   EnvOptions();

   // construct from Options
-  explicit
EnvOptions(const Options& options); + explicit EnvOptions(const DBOptions& options); // If true, then allow caching of data in environment buffers bool use_os_buffer = true; diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 6c65bdc3f..05f1aebca 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -45,6 +45,8 @@ class LookupKey; class Slice; class SliceTransform; +typedef void* KeyHandle; + class MemTableRep { public: // KeyComparator provides a means to compare keys, which are internal keys @@ -62,11 +64,19 @@ class MemTableRep { virtual ~KeyComparator() { } }; + explicit MemTableRep(Arena* arena) : arena_(arena) {} + + // Allocate a buf of len size for storing key. The idea is that a specific + // memtable representation knows its underlying data structure better. By + // allowing it to allocate memory, it can possibly put correlated stuff + // in consecutive memory area to make processor prefetching more efficient. + virtual KeyHandle Allocate(const size_t len, char** buf); + // Insert key into the collection. (The caller will pack key and value into a - // single buffer and pass that in as the parameter to Insert) + // single buffer and pass that in as the parameter to Insert). // REQUIRES: nothing that compares equal to key is currently in the // collection. - virtual void Insert(const char* key) = 0; + virtual void Insert(KeyHandle handle) = 0; // Returns true iff an entry that compares equal to key is in the collection. virtual bool Contains(const char* key) const = 0; @@ -153,6 +163,8 @@ class MemTableRep { // When *key is an internal key concatenated with the value, returns the // user key. 
virtual Slice UserKey(const char* key) const; + + Arena* arena_; }; // This is the base class for all factories that are used by RocksDB to create diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 54b4ef38f..9cfefb8dd 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -72,8 +72,9 @@ enum UpdateStatus { // Return status For inplace update callback UPDATED = 2, // No inplace update. Merged value set }; -// Options to control the behavior of a database (passed to DB::Open) -struct Options { +struct Options; + +struct ColumnFamilyOptions { // ------------------- // Parameters that affect behavior @@ -130,38 +131,6 @@ struct Options { // Default: a factory that doesn't provide any object std::shared_ptr compaction_filter_factory_v2; - // If true, the database will be created if it is missing. - // Default: false - bool create_if_missing; - - // If true, an error is raised if the database already exists. - // Default: false - bool error_if_exists; - - // If true, the implementation will do aggressive checking of the - // data it is processing and will stop early if it detects any - // errors. This may have unforeseen ramifications: for example, a - // corruption of one DB entry may cause a large number of entries to - // become unreadable or for the entire DB to become unopenable. - // If any of the writes to the database fails (Put, Delete, Merge, Write), - // the database will switch to read-only mode and fail all other - // Write operations. - // Default: true - bool paranoid_checks; - - // Use the specified object to interact with the environment, - // e.g. to read/write files, schedule background work, etc. - // Default: Env::Default() - Env* env; - - // Any internal progress/error information generated by the db will - // be written to info_log if it is non-nullptr, or to a file stored - // in the same directory as the DB contents if info_log is nullptr. 
- // Default: nullptr - shared_ptr info_log; - - InfoLogLevel info_log_level; - // ------------------- // Parameters that affect performance @@ -193,15 +162,6 @@ struct Options { // individual write buffers. Default: 1 int min_write_buffer_number_to_merge; - // Number of open files that can be used by the DB. You may need to - // increase this if your database has a large working set. Value -1 means - // files opened are always kept open. You can estimate number of files based - // on target_file_size_base and target_file_size_multiplier for level-based - // compaction. For universal-style compaction, you can usually set it to -1. - // - // Default: 5000 - int max_open_files; - // Control over blocks (user data is stored in a set of blocks, and // a block is the unit of reading from disk). @@ -369,93 +329,12 @@ struct Options { // stop building a single file in a level->level+1 compaction. int max_grandparent_overlap_factor; - // If non-null, then we should collect metrics about database operations - // Statistics objects should not be shared between DB instances as - // it does not use any locks to prevent concurrent updates. - shared_ptr statistics; - - // If true, then the contents of data files are not synced - // to stable storage. Their contents remain in the OS buffers till the - // OS decides to flush them. This option is good for bulk-loading - // of data. Once the bulk-loading is complete, please issue a - // sync to the OS to flush all dirty buffesrs to stable storage. - // Default: false - bool disableDataSync; - - // If true, then every store to stable storage will issue a fsync. - // If false, then every store to stable storage will issue a fdatasync. - // This parameter should be set to true while storing data to - // filesystem like ext3 that can lose files after a reboot. - // Default: false - bool use_fsync; - - // This number controls how often a new scribe log about - // db deploy stats is written out. - // -1 indicates no logging at all. 
- // Default value is 1800 (half an hour). - int db_stats_log_interval; - - // This specifies the info LOG dir. - // If it is empty, the log files will be in the same dir as data. - // If it is non empty, the log files will be in the specified dir, - // and the db data dir's absolute path will be used as the log file - // name's prefix. - std::string db_log_dir; - - // This specifies the absolute dir path for write-ahead logs (WAL). - // If it is empty, the log files will be in the same dir as data, - // dbname is used as the data dir by default - // If it is non empty, the log files will be in kept the specified dir. - // When destroying the db, - // all log files in wal_dir and the dir itself is deleted - std::string wal_dir; - // Disable compaction triggered by seek. // With bloomfilter and fast storage, a miss on one level // is very cheap if the file handle is cached in table cache // (which is true if max_open_files is large). bool disable_seek_compaction; - // The periodicity when obsolete files get deleted. The default - // value is 6 hours. The files that get out of scope by compaction - // process will still get automatically delete on every compaction, - // regardless of this setting - uint64_t delete_obsolete_files_period_micros; - - // Maximum number of concurrent background jobs, submitted to - // the default LOW priority thread pool - // Default: 1 - int max_background_compactions; - - // Maximum number of concurrent background memtable flush jobs, submitted to - // the HIGH priority thread pool. - // By default, all background jobs (major compaction and memtable flush) go - // to the LOW priority pool. If this option is set to a positive number, - // memtable flush jobs will be submitted to the HIGH priority pool. - // It is important when the same Env is shared by multiple db instances. 
- // Without a separate pool, long running major compaction jobs could - // potentially block memtable flush jobs of other db instances, leading to - // unnecessary Put stalls. - // Default: 1 - int max_background_flushes; - - // Specify the maximal size of the info log file. If the log file - // is larger than `max_log_file_size`, a new info log file will - // be created. - // If max_log_file_size == 0, all logs will be written to one - // log file. - size_t max_log_file_size; - - // Time for the info log file to roll (in seconds). - // If specified with non-zero value, log file will be rolled - // if it has been active longer than `log_file_time_to_roll`. - // Default: 0 (disabled) - size_t log_file_time_to_roll; - - // Maximal info log files to be kept. - // Default: 1000 - size_t keep_log_file_num; - // Puts are delayed 0-1 ms when any level has a compaction score that exceeds // soft_rate_limit. This is ignored when == 0.0. // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not @@ -473,32 +352,14 @@ struct Options { // Default: 1000 unsigned int rate_limit_delay_max_milliseconds; - // manifest file is rolled over on reaching this limit. - // The older manifest file be deleted. - // The default value is MAX_INT so that roll-over does not take place. - uint64_t max_manifest_file_size; - // Disable block cache. If this is set to true, // then no block cache should be used, and the block_cache should // point to a nullptr object. // Default: false bool no_block_cache; - // Number of shards used for table cache. - int table_cache_numshardbits; - - // During data eviction of table's LRU cache, it would be inefficient - // to strictly follow LRU because this piece of memory will not really - // be released unless its refcount falls to zero. 
Instead, make two - // passes: the first pass will release items with refcount = 1, - // and if not enough space releases after scanning the number of - // elements specified by this parameter, we will remove items in LRU - // order. - int table_cache_remove_scan_count_limit; - - // Size of one block in arena memory allocation. - // - // If <= 0, a proper value is automatically calculated (usually about 1/10 of + // size of one block in arena memory allocation. + // If <= 0, a proper value is automatically calculated (usually 1/10 of // writer_buffer_size). // // There are two additonal restriction of the The specified size: @@ -512,71 +373,14 @@ struct Options { // Default: 0 size_t arena_block_size; - // Create an Options object with default values for all fields. - Options(); - - void Dump(Logger* log) const; - - // Set appropriate parameters for bulk loading. - // The reason that this is a function that returns "this" instead of a - // constructor is to enable chaining of multiple similar calls in the future. - // - // All data will be in level 0 without any automatic compaction. - // It's recommended to manually call CompactRange(NULL, NULL) before reading - // from the database, because otherwise the read can be very slow. - Options* PrepareForBulkLoad(); - // Disable automatic compactions. Manual compactions can still - // be issued on this database. + // be issued on this column family bool disable_auto_compactions; - // The following two fields affect how archived logs will be deleted. - // 1. If both set to 0, logs will be deleted asap and will not get into - // the archive. - // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, - // WAL files will be checked every 10 min and if total size is greater - // then WAL_size_limit_MB, they will be deleted starting with the - // earliest until size_limit is met. All empty files will be deleted. - // 3. 
If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - // WAL files will be checked every WAL_ttl_secondsi / 2 and those that - // are older than WAL_ttl_seconds will be deleted. - // 4. If both are not 0, WAL files will be checked every 10 min and both - // checks will be performed with ttl being first. - uint64_t WAL_ttl_seconds; - uint64_t WAL_size_limit_MB; - - // Number of bytes to preallocate (via fallocate) the manifest - // files. Default is 4mb, which is reasonable to reduce random IO - // as well as prevent overallocation for mounts that preallocate - // large amounts of data (such as xfs's allocsize option). - size_t manifest_preallocation_size; - // Purge duplicate/deleted keys when a memtable is flushed to storage. // Default: true bool purge_redundant_kvs_while_flush; - // Data being read from file storage may be buffered in the OS - // Default: true - bool allow_os_buffer; - - // Allow the OS to mmap file for reading sst tables. Default: false - bool allow_mmap_reads; - - // Allow the OS to mmap file for writing. Default: false - bool allow_mmap_writes; - - // Disable child process inherit open files. Default: true - bool is_fd_close_on_exec; - - // Skip log corruption error on recovery (If client is ok with - // losing most recent changes) - // Default: false - bool skip_log_error_on_recovery; - - // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec - // Default: 3600 (1 hour) - unsigned int stats_dump_period_sec; - // This is used to close a block before it reaches the configured // 'block_size'. If the percentage of free space in the current block is less // than this specified number and adding a new record to the block will @@ -585,45 +389,17 @@ struct Options { // Default is 10. int block_size_deviation; - // If set true, will hint the underlying file system that the file - // access pattern is random, when a sst file is opened. 
- // Default: true - bool advise_random_on_open; - - // Specify the file access pattern once a compaction is started. - // It will be applied to all input files of a compaction. - // Default: NORMAL - enum { - NONE, - NORMAL, - SEQUENTIAL, - WILLNEED - } access_hint_on_compaction_start; - - // Use adaptive mutex, which spins in the user space before resorting - // to kernel. This could reduce context switch when the mutex is not - // heavily contended. However, if the mutex is hot, we could end up - // wasting spin time. - // Default: false - bool use_adaptive_mutex; - - // Allows OS to incrementally sync files to disk while they are being - // written, asynchronously, in the background. - // Issue one request for every bytes_per_sync written. 0 turns it off. - // Default: 0 - uint64_t bytes_per_sync; - // The compaction style. Default: kCompactionStyleLevel CompactionStyle compaction_style; - // The options needed to support Universal Style compactions - CompactionOptionsUniversal compaction_options_universal; - // If true, compaction will verify checksum on every read that happens // as part of compaction // Default: true bool verify_checksums_in_compaction; + // The options needed to support Universal Style compactions + CompactionOptionsUniversal compaction_options_universal; + // Use KeyMayExist API to filter deletes when this is true. // If KeyMayExist returns false, i.e. the key definitely does not exist, then // the delete is a noop. KeyMayExist only incurs in-memory look up. @@ -653,7 +429,7 @@ struct Options { // Default: emtpy vector -- no user-defined statistics collection will be // performed. typedef std::vector> - TablePropertiesCollectors; + TablePropertiesCollectors; TablePropertiesCollectors table_properties_collectors; // Allows thread-safe inplace updates. 
@@ -750,9 +526,266 @@ struct Options { // Default: 2 uint32_t min_partial_merge_operands; + // Create ColumnFamilyOptions with default values for all fields + ColumnFamilyOptions(); + // Create ColumnFamilyOptions from Options + explicit ColumnFamilyOptions(const Options& options); + + void Dump(Logger* log) const; +}; + +struct DBOptions { + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // If any of the writes to the database fails (Put, Delete, Merge, Write), + // the database will switch to read-only mode and fail all other + // Write operations. + // Default: true + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-nullptr, or to a file stored + // in the same directory as the DB contents if info_log is nullptr. + // Default: nullptr + shared_ptr info_log; + + InfoLogLevel info_log_level; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set. Value -1 means + // files opened are always kept open. You can estimate number of files based + // on target_file_size_base and target_file_size_multiplier for level-based + // compaction. For universal-style compaction, you can usually set it to -1. 
+ // Default: 5000 + int max_open_files; + + // If non-null, then we should collect metrics about database operations + // Statistics objects should not be shared between DB instances as + // it does not use any locks to prevent concurrent updates. + shared_ptr statistics; + + // If true, then the contents of data files are not synced + // to stable storage. Their contents remain in the OS buffers till the + // OS decides to flush them. This option is good for bulk-loading + // of data. Once the bulk-loading is complete, please issue a + // sync to the OS to flush all dirty buffesrs to stable storage. + // Default: false + bool disableDataSync; + + // If true, then every store to stable storage will issue a fsync. + // If false, then every store to stable storage will issue a fdatasync. + // This parameter should be set to true while storing data to + // filesystem like ext3 that can lose files after a reboot. + // Default: false + bool use_fsync; + + // This number controls how often a new scribe log about + // db deploy stats is written out. + // -1 indicates no logging at all. + // Default value is 1800 (half an hour). + int db_stats_log_interval; + + // This specifies the info LOG dir. + // If it is empty, the log files will be in the same dir as data. + // If it is non empty, the log files will be in the specified dir, + // and the db data dir's absolute path will be used as the log file + // name's prefix. + std::string db_log_dir; + + // This specifies the absolute dir path for write-ahead logs (WAL). + // If it is empty, the log files will be in the same dir as data, + // dbname is used as the data dir by default + // If it is non empty, the log files will be in kept the specified dir. + // When destroying the db, + // all log files in wal_dir and the dir itself is deleted + std::string wal_dir; + + // The periodicity when obsolete files get deleted. The default + // value is 6 hours. 
The files that get out of scope by compaction + // process will still get automatically delete on every compaction, + // regardless of this setting + uint64_t delete_obsolete_files_period_micros; + + // Maximum number of concurrent background compaction jobs, submitted to + // the default LOW priority thread pool. + // If you're increasing this, also consider increasing number of threads in + // LOW priority thread pool. For more information, see + // Env::SetBackgroundThreads + // Default: 1 + int max_background_compactions; + + // Maximum number of concurrent background memtable flush jobs, submitted to + // the HIGH priority thread pool. + // + // By default, all background jobs (major compaction and memtable flush) go + // to the LOW priority pool. If this option is set to a positive number, + // memtable flush jobs will be submitted to the HIGH priority pool. + // It is important when the same Env is shared by multiple db instances. + // Without a separate pool, long running major compaction jobs could + // potentially block memtable flush jobs of other db instances, leading to + // unnecessary Put stalls. + // + // If you're increasing this, also consider increasing number of threads in + // HIGH priority thread pool. For more information, see + // Env::SetBackgroundThreads + // Default: 1 + int max_background_flushes; + + // Specify the maximal size of the info log file. If the log file + // is larger than `max_log_file_size`, a new info log file will + // be created. + // If max_log_file_size == 0, all logs will be written to one + // log file. + size_t max_log_file_size; + + // Time for the info log file to roll (in seconds). + // If specified with non-zero value, log file will be rolled + // if it has been active longer than `log_file_time_to_roll`. + // Default: 0 (disabled) + size_t log_file_time_to_roll; + + // Maximal info log files to be kept. + // Default: 1000 + size_t keep_log_file_num; + + // manifest file is rolled over on reaching this limit. 
+ // The older manifest file be deleted. + // The default value is MAX_INT so that roll-over does not take place. + uint64_t max_manifest_file_size; + + // Number of shards used for table cache. + int table_cache_numshardbits; + + // During data eviction of table's LRU cache, it would be inefficient + // to strictly follow LRU because this piece of memory will not really + // be released unless its refcount falls to zero. Instead, make two + // passes: the first pass will release items with refcount = 1, + // and if not enough space releases after scanning the number of + // elements specified by this parameter, we will remove items in LRU + // order. + int table_cache_remove_scan_count_limit; + + // The following two fields affect how archived logs will be deleted. + // 1. If both set to 0, logs will be deleted asap and will not get into + // the archive. + // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + // WAL files will be checked every 10 min and if total size is greater + // then WAL_size_limit_MB, they will be deleted starting with the + // earliest until size_limit is met. All empty files will be deleted. + // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + // WAL files will be checked every WAL_ttl_secondsi / 2 and those that + // are older than WAL_ttl_seconds will be deleted. + // 4. If both are not 0, WAL files will be checked every 10 min and both + // checks will be performed with ttl being first. + uint64_t WAL_ttl_seconds; + uint64_t WAL_size_limit_MB; + + // Number of bytes to preallocate (via fallocate) the manifest + // files. Default is 4mb, which is reasonable to reduce random IO + // as well as prevent overallocation for mounts that preallocate + // large amounts of data (such as xfs's allocsize option). + size_t manifest_preallocation_size; + + // Data being read from file storage may be buffered in the OS + // Default: true + bool allow_os_buffer; + + // Allow the OS to mmap file for reading sst tables. 
Default: false + bool allow_mmap_reads; + + // Allow the OS to mmap file for writing. Default: false + bool allow_mmap_writes; + + // Disable child process inherit open files. Default: true + bool is_fd_close_on_exec; + + // Skip log corruption error on recovery (If client is ok with + // losing most recent changes) + // Default: false + bool skip_log_error_on_recovery; + + // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + // Default: 3600 (1 hour) + unsigned int stats_dump_period_sec; + + // If set true, will hint the underlying file system that the file + // access pattern is random, when a sst file is opened. + // Default: true + bool advise_random_on_open; + + // Specify the file access pattern once a compaction is started. + // It will be applied to all input files of a compaction. + // Default: NORMAL + enum { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + } access_hint_on_compaction_start; + + // Use adaptive mutex, which spins in the user space before resorting + // to kernel. This could reduce context switch when the mutex is not + // heavily contended. However, if the mutex is hot, we could end up + // wasting spin time. + // Default: false + bool use_adaptive_mutex; + + // Allows OS to incrementally sync files to disk while they are being + // written, asynchronously, in the background. + // Issue one request for every bytes_per_sync written. 0 turns it off. + // Default: 0 + uint64_t bytes_per_sync; + // Allow RocksDB to use thread local storage to optimize performance. // Default: true bool allow_thread_local; + + // Create DBOptions with default values for all fields + DBOptions(); + // Create DBOptions from Options + explicit DBOptions(const Options& options); + + void Dump(Logger* log) const; +}; + +// Options to control the behavior of a database (passed to DB::Open) +struct Options : public DBOptions, public ColumnFamilyOptions { + // Create an Options object with default values for all fields. 
+ Options() : + DBOptions(), + ColumnFamilyOptions() {} + + Options(const DBOptions& db_options, + const ColumnFamilyOptions& column_family_options) + : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} + + void Dump(Logger* log) const; + + // Set appropriate parameters for bulk loading. + // The reason that this is a function that returns "this" instead of a + // constructor is to enable chaining of multiple similar calls in the future. + // + + // All data will be in level 0 without any automatic compaction. + // It's recommended to manually call CompactRange(NULL, NULL) before reading + // from the database, because otherwise the read can be very slow. + Options* PrepareForBulkLoad(); }; // diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 61adad6b7..0704ea210 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -64,7 +64,11 @@ struct PerfContext { uint64_t write_memtable_time; }; +#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) +extern PerfContext perf_context; +#else extern __thread PerfContext perf_context; +#endif } diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 2cfb731f6..60817056f 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -31,6 +31,7 @@ namespace rocksdb { class Slice; +class ColumnFamilyHandle; struct SliceParts; class WriteBatch { @@ -39,19 +40,34 @@ class WriteBatch { ~WriteBatch(); // Store the mapping "key->value" in the database. - void Put(const Slice& key, const Slice& value); + void Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + void Put(const Slice& key, const Slice& value) { + Put(nullptr, key, value); + } // Variant of Put() that gathers output like writev(2). The key and value // that will be written to the database are concatentations of arrays of // slices. 
- void Put(const SliceParts& key, const SliceParts& value); + void Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value); + void Put(const SliceParts& key, const SliceParts& value) { + Put(nullptr, key, value); + } // Merge "value" with the existing value of "key" in the database. // "key->merge(existing, value)" - void Merge(const Slice& key, const Slice& value); + void Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); + void Merge(const Slice& key, const Slice& value) { + Merge(nullptr, key, value); + } // If the database contains a mapping for "key", erase it. Else do nothing. - void Delete(const Slice& key); + void Delete(ColumnFamilyHandle* column_family, const Slice& key); + void Delete(const Slice& key) { + Delete(nullptr, key); + } // Append a blob of arbitrary size to the records in this batch. The blob will // be stored in the transaction log but not in any other file. In particular, @@ -72,14 +88,46 @@ class WriteBatch { class Handler { public: virtual ~Handler(); - virtual void Put(const Slice& key, const Slice& value) = 0; + // default implementation will just call Put without column family for + // backwards compatibility. If the column family is not default, + // the function is noop + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + // Put() historically doesn't return status. We didn't want to be + // backwards incompatible so we didn't change the return status + // (this is a public API). We do an ordinary get and return Status::OK() + Put(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and PutCF not implemented"); + } + virtual void Put(const Slice& key, const Slice& value); // Merge and LogData are not pure virtual. Otherwise, we would break // existing clients of Handler on a source code level. 
The default // implementation of Merge simply throws a runtime exception. + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + if (column_family_id == 0) { + Merge(key, value); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and MergeCF not implemented"); + } virtual void Merge(const Slice& key, const Slice& value); // The default implementation of LogData does nothing. virtual void LogData(const Slice& blob); - virtual void Delete(const Slice& key) = 0; + virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { + if (column_family_id == 0) { + Delete(key); + return Status::OK(); + } + return Status::InvalidArgument( + "non-default column family and DeleteCF not implemented"); + } + virtual void Delete(const Slice& key); // Continue is called by WriteBatch::Iterate. If it returns false, // iteration is halted. Otherwise, it continues iterating. The default // implementation always returns true. 
diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h index 370c920ae..57f444802 100644 --- a/include/utilities/stackable_db.h +++ b/include/utilities/stackable_db.h @@ -21,40 +21,49 @@ class StackableDB : public DB { return db_; } + using DB::Put; virtual Status Put(const WriteOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) override { - return db_->Put(options, key, val); + return db_->Put(options, column_family, key, val); } + using DB::Get; virtual Status Get(const ReadOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, std::string* value) override { - return db_->Get(options, key, value); + return db_->Get(options, column_family, key, value); } - virtual std::vector MultiGet(const ReadOptions& options, - const std::vector& keys, - std::vector* values) - override { - return db_->MultiGet(options, keys, values); + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override { + return db_->MultiGet(options, column_family, keys, values); } + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found = nullptr) override { - return db_->KeyMayExist(options, key, value, value_found); + return db_->KeyMayExist(options, column_family, key, value, value_found); } - virtual Status Delete(const WriteOptions& wopts, const Slice& key) override { - return db_->Delete(wopts, key); + using DB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return db_->Delete(wopts, column_family, key); } + using DB::Merge; virtual Status Merge(const WriteOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& 
key, const Slice& value) override { - return db_->Merge(options, key, value); + return db_->Merge(options, column_family, key, value); } @@ -63,10 +72,20 @@ class StackableDB : public DB { return db_->Write(opts, updates); } - virtual Iterator* NewIterator(const ReadOptions& opts) override { - return db_->NewIterator(opts); + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override { + return db_->NewIterator(opts, column_family); } + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) { + return db_->NewIterators(options, column_families, iterators); + } + + virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } @@ -75,32 +94,43 @@ class StackableDB : public DB { return db_->ReleaseSnapshot(snapshot); } - virtual bool GetProperty(const Slice& property, std::string* value) - override { - return db_->GetProperty(property, value); + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override { + return db_->GetProperty(column_family, property, value); } - virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) - override { - return db_->GetApproximateSizes(r, n, sizes); + using DB::GetApproximateSizes; + virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* r, int n, + uint64_t* sizes) override { + return db_->GetApproximateSizes(column_family, r, n, sizes); } - virtual Status CompactRange(const Slice* begin, const Slice* end, + using DB::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, bool reduce_level = false, int target_level = -1) override { - return db_->CompactRange(begin, end, reduce_level, target_level); + return db_->CompactRange(column_family, begin, end, reduce_level, + target_level); } - virtual int 
NumberLevels() override { - return db_->NumberLevels(); + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override { + return db_->NumberLevels(column_family); } - virtual int MaxMemCompactionLevel() override { - return db_->MaxMemCompactionLevel(); + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) + override { + return db_->MaxMemCompactionLevel(column_family); } - virtual int Level0StopWriteTrigger() override { - return db_->Level0StopWriteTrigger(); + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) + override { + return db_->Level0StopWriteTrigger(column_family); } virtual const std::string& GetName() const override { @@ -111,12 +141,16 @@ class StackableDB : public DB { return db_->GetEnv(); } - virtual const Options& GetOptions() const override { - return db_->GetOptions(); + using DB::GetOptions; + virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const + override { + return db_->GetOptions(column_family); } - virtual Status Flush(const FlushOptions& fopts) override { - return db_->Flush(fopts); + using DB::Flush; + virtual Status Flush(const FlushOptions& fopts, + ColumnFamilyHandle* column_family) override { + return db_->Flush(fopts, column_family); } virtual Status DisableFileDeletions() override { @@ -148,8 +182,10 @@ class StackableDB : public DB { return db_->GetDbIdentity(identity); } - virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { - return db_->GetPropertiesOfAllTables(props); + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) { + return db_->GetPropertiesOfAllTables(column_family, props); } virtual Status GetUpdatesSince( @@ -158,6 +194,10 @@ class StackableDB : public DB { return db_->GetUpdatesSince(seq_number, iter, read_options); } + virtual 
ColumnFamilyHandle* DefaultColumnFamily() const override { + return db_->DefaultColumnFamily(); + } + protected: DB* db_; }; diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index f72c3ba6d..a3564d536 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -208,7 +208,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); mem->Ref(); std::string state; - rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, mem, &options); + rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); + rocksdb::Status s = + rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; rocksdb::Iterator* iter = mem->NewIterator(); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { diff --git a/port/port_example.h b/port/port_example.h index 64a579187..f124abb06 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -127,13 +127,6 @@ extern bool Snappy_GetUncompressedLength(const char* input, size_t length, extern bool Snappy_Uncompress(const char* input_data, size_t input_length, char* output); -// ------------------ Miscellaneous ------------------- - -// If heap profiling is not supported, returns false. -// Else repeatedly calls (*func)(arg, data, n) and then returns true. -// The concatenation of all "data[0,n-1]" fragments is the heap profile. 
-extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); - } // namespace port } // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h index 6a7382926..b2d162468 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -476,10 +476,6 @@ inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input, return false; } -inline bool GetHeapProfile(void (*func)(void *, const char *, int), void *arg) { - return false; -} - #define CACHE_LINE_SIZE 64U } // namespace port diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index c3adf3ac5..a8d8695b9 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -45,7 +45,9 @@ namespace { // The longest the prefix of the cache key used to identify blocks can be. // We are using the fact that we know for Posix files the unique ID is three // varints. -const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; +// For some reason, compiling for iOS complains that this variable is unused +const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) = + kMaxVarint64Length * 3 + 1; // Read the block identified by "handle" from "file". // The only relevant option is options.verify_checksums for now. 
@@ -105,7 +107,7 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, Statistics* statistics) { auto cache_handle = block_cache->Lookup(key); if (cache_handle != nullptr) { - BumpPerfCount(&perf_context.block_cache_hit_count); + PERF_COUNTER_ADD(block_cache_hit_count, 1); // overall cache hit RecordTick(statistics, BLOCK_CACHE_HIT); // block-type specific cache hit diff --git a/table/filter_block.h b/table/filter_block.h index da19d42e9..05c2bb943 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -46,6 +46,9 @@ class FilterBlockBuilder { bool SamePrefix(const Slice &key1, const Slice &key2) const; void GenerateFilter(); + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. const FilterPolicy* policy_; const SliceTransform* prefix_extractor_; bool whole_key_filtering_; diff --git a/table/format.cc b/table/format.cc index f1adf97da..0d93cb93f 100644 --- a/table/format.cc +++ b/table/format.cc @@ -125,12 +125,11 @@ Status ReadBlockContents(RandomAccessFile* file, char* buf = new char[n + kBlockTrailerSize]; Slice contents; - StopWatchNano timer(env); - StartPerfTimer(&timer); + PERF_TIMER_AUTO(block_read_time); Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); - BumpPerfCount(&perf_context.block_read_count); - BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize); - BumpPerfTime(&perf_context.block_read_time, &timer); + PERF_TIMER_MEASURE(block_read_time); + PERF_COUNTER_ADD(block_read_count, 1); + PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize); if (!s.ok()) { delete[] buf; @@ -151,7 +150,7 @@ Status ReadBlockContents(RandomAccessFile* file, s = Status::Corruption("block checksum mismatch"); return s; } - BumpPerfTime(&perf_context.block_checksum_time, &timer); + PERF_TIMER_MEASURE(block_checksum_time); } // If the caller has requested that the block not be uncompressed @@ 
-175,7 +174,7 @@ Status ReadBlockContents(RandomAccessFile* file, s = UncompressBlockContents(data, n, result); delete[] buf; } - BumpPerfTime(&perf_context.block_decompress_time, &timer); + PERF_TIMER_STOP(block_decompress_time); return s; } diff --git a/table/merger.cc b/table/merger.cc index c154e6e64..03d177a6a 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -25,16 +25,14 @@ namespace { class MergingIterator : public Iterator { public: - MergingIterator(Env* const env, const Comparator* comparator, - Iterator** children, int n) + MergingIterator(const Comparator* comparator, Iterator** children, int n) : comparator_(comparator), children_(n), current_(nullptr), use_heap_(true), - env_(env), direction_(kForward), maxHeap_(NewMaxIterHeap(comparator_)), - minHeap_ (NewMinIterHeap(comparator_)) { + minHeap_(NewMinIterHeap(comparator_)) { for (int i = 0; i < n; i++) { children_[i].Set(children[i]); } @@ -79,13 +77,13 @@ class MergingIterator : public Iterator { // Invalidate the heap. use_heap_ = false; IteratorWrapper* first_child = nullptr; - StopWatchNano child_seek_timer(env_, false); - StopWatchNano min_heap_timer(env_, false); + PERF_TIMER_DECLARE(); + for (auto& child : children_) { - StartPerfTimer(&child_seek_timer); + PERF_TIMER_START(seek_child_seek_time); child.Seek(target); - BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); - BumpPerfCount(&perf_context.seek_child_seek_count); + PERF_TIMER_STOP(seek_child_seek_time); + PERF_COUNTER_ADD(seek_child_seek_count, 1); if (child.Valid()) { // This child has valid key @@ -97,26 +95,24 @@ class MergingIterator : public Iterator { } else { // We have more than one children with valid keys. Initialize // the heap and put the first child into the heap. 
- StartPerfTimer(&min_heap_timer); + PERF_TIMER_START(seek_min_heap_time); ClearHeaps(); - BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); - StartPerfTimer(&min_heap_timer); minHeap_.push(first_child); - BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); + PERF_TIMER_STOP(seek_min_heap_time); } } if (use_heap_) { - StartPerfTimer(&min_heap_timer); + PERF_TIMER_START(seek_min_heap_time); minHeap_.push(&child); - BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); + PERF_TIMER_STOP(seek_min_heap_time); } } } if (use_heap_) { // If heap is valid, need to put the smallest key to curent_. - StartPerfTimer(&min_heap_timer); + PERF_TIMER_START(seek_min_heap_time); FindSmallest(); - BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); + PERF_TIMER_STOP(seek_min_heap_time); } else { // The heap is not valid, then the current_ iterator is the first // one, or null if there is no first child. @@ -232,7 +228,6 @@ class MergingIterator : public Iterator { // This flag is always true for reverse direction, as we always use heap for // the reverse iterating case. bool use_heap_; - Env* const env_; // Which direction is the iterator moving? enum Direction { kForward, @@ -272,15 +267,14 @@ void MergingIterator::ClearHeaps() { } } // namespace -Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, - Iterator** list, int n) { +Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) { assert(n >= 0); if (n == 0) { return NewEmptyIterator(); } else if (n == 1) { return list[0]; } else { - return new MergingIterator(env, cmp, list, n); + return new MergingIterator(cmp, list, n); } } diff --git a/table/merger.h b/table/merger.h index ea8daa770..3a1a4feb8 100644 --- a/table/merger.h +++ b/table/merger.h @@ -23,8 +23,7 @@ class Env; // key is present in K child iterators, it will be yielded K times. 
// // REQUIRES: n >= 0 -extern Iterator* NewMergingIterator(Env* const env, - const Comparator* comparator, +extern Iterator* NewMergingIterator(const Comparator* comparator, Iterator** children, int n); } // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index a6f8bed0e..436d13bf3 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -81,10 +81,9 @@ class PlainTableIterator : public Iterator { bool use_prefix_seek_; uint32_t offset_; uint32_t next_offset_; - Slice key_; + IterKey key_; Slice value_; Status status_; - std::string tmp_str_; // No copying allowed PlainTableIterator(const PlainTableIterator&) = delete; void operator=(const Iterator&) = delete; @@ -720,9 +719,7 @@ void PlainTableIterator::Next() { status_ = table_->Next(&next_offset_, &parsed_key, &value_); if (status_.ok()) { // Make a copy in this case. TODO optimize. - tmp_str_.clear(); - AppendInternalKey(&tmp_str_, parsed_key); - key_ = Slice(tmp_str_); + key_.SetInternalKey(parsed_key); } else { offset_ = next_offset_ = table_->data_end_offset_; } @@ -735,7 +732,7 @@ void PlainTableIterator::Prev() { Slice PlainTableIterator::key() const { assert(Valid()); - return key_; + return key_.GetKey(); } Slice PlainTableIterator::value() const { diff --git a/table/table_test.cc b/table/table_test.cc index b6b661e6b..18ae2a3aa 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1554,7 +1554,8 @@ TEST(MemTableTest, Simple) { batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("largekey"), std::string("vlarge")); - ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, &options).ok()); + ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); + ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); Iterator* iter = memtable->NewIterator(); iter->SeekToFirst(); diff --git a/tools/auto_sanity_test.sh b/tools/auto_sanity_test.sh 
new file mode 100755 index 000000000..2d63c0a85 --- /dev/null +++ b/tools/auto_sanity_test.sh @@ -0,0 +1,71 @@ +TMP_DIR="/tmp/rocksdb-sanity-test" + +if [ "$#" -lt 2 ]; then + echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]" + echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits." + recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'` + commit_new=`echo "$recent_commits" | head -n1` + commit_old=`echo "$recent_commits" | tail -n1` + echo "the most recent commits are:" + echo "$recent_commits" +else + commit_new=$1 + commit_old=$2 +fi + +if [ ! -d $TMP_DIR ]; then + mkdir $TMP_DIR +fi +dir_new="${TMP_DIR}/${commit_new}" +dir_old="${TMP_DIR}/${commit_old}" + +function makestuff() { + echo "make clean" + make clean > /dev/null + echo "make db_sanity_test -j32" + make db_sanity_test -j32 > /dev/null + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to perform 'make db_sanity_test'" + exit 1 + fi +} + +rm -r -f $dir_new +rm -r -f $dir_old + +echo "Running db sanity check with commits $commit_new and $commit_old." + +echo "=============================================================" +echo "Making build $commit_new" +makestuff +mv db_sanity_test new_db_sanity_test +echo "Creating db based on the new commit --- $commit_new" +./new_db_sanity_test $dir_new create + +echo "=============================================================" +echo "Making build $commit_old" +makestuff +mv db_sanity_test old_db_sanity_test +echo "Creating db based on the old commit --- $commit_old" +./old_db_sanity_test $dir_old create + +echo "=============================================================" +echo "Verifying new db $dir_new using the old commit --- $commit_old" +./old_db_sanity_test $dir_new verify +if [ $? -ne 0 ]; then + echo "[ERROR] Verification of $dir_new using commit $commit_old failed." 
+ exit 2 +fi + +echo "=============================================================" +echo "Verifying old db $dir_old using the new commit --- $commit_new" +./new_db_sanity_test $dir_old verify +if [ $? -ne 0 ]; then + echo "[ERROR] Verification of $dir_old using commit $commit_new failed." + exit 2 +fi + +rm old_db_sanity_test +rm new_db_sanity_test + +echo "Auto sanity test passed!" diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index d81fd0885..3c93eca36 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -88,6 +88,7 @@ def main(argv): --open_files=500000 --verify_checksum=1 --sync=0 + --progress_reports=0 --disable_wal=0 --disable_data_sync=1 --target_file_size_base=2097152 diff --git a/tools/db_crashtest2.py b/tools/db_crashtest2.py index 274fcde4e..0a12b5a60 100644 --- a/tools/db_crashtest2.py +++ b/tools/db_crashtest2.py @@ -101,6 +101,7 @@ def main(argv): --open_files=500000 --verify_checksum=1 --sync=0 + --progress_reports=0 --disable_wal=0 --disable_data_sync=1 --target_file_size_base=2097152 diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 32404c65d..a96ee3144 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -60,14 +60,16 @@ static bool ValidateUint32Range(const char* flagname, uint64_t value) { return true; } DEFINE_uint64(seed, 2341234, "Seed for PRNG"); -static const bool FLAGS_seed_dummy = - google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); +static const bool FLAGS_seed_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); -DEFINE_int64(max_key, 1 * KB * KB * KB, +DEFINE_int64(max_key, 1 * KB* KB, "Max number of key/values to place in database"); +DEFINE_int32(column_families, 10, "Number of column families"); + DEFINE_bool(test_batches_snapshots, false, - "If set, the test uses MultiGet(), MultiPut() and MultiDelete()" + "If set, the test uses MultiGet(), Multiut() and MultiDelete()" " which read/write/delete multiple keys in a batch. 
In this mode," " we do not verify db content by comparing the content with the " "pre-allocated array. Instead, we do partial verification inside" @@ -95,7 +97,10 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings"); DEFINE_bool(destroy_db_initially, true, "Destroys the database dir before start if this is true"); -DEFINE_bool (verbose, false, "Verbose"); +DEFINE_bool(verbose, false, "Verbose"); + +DEFINE_bool(progress_reports, true, + "If true, db_stress will report number of finished operations"); DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, "Number of bytes to buffer in memtable before compacting"); @@ -146,6 +151,10 @@ DEFINE_int32(max_background_compactions, "The maximum number of concurrent background compactions " "that can occur in parallel."); +DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes, + "The maximum number of concurrent background flushes " + "that can occur in parallel."); + DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger" " compaction in universal style"); @@ -158,6 +167,11 @@ DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact" DEFINE_int32(universal_max_size_amplification_percent, 0, "The max size amplification for universal style compaction"); +DEFINE_int32(clear_column_family_one_in, 1000000, + "With a chance of 1/N, delete a column family and then recreate " + "it again. If N == 0, never drop/create column families. 
" + "When test_batches_snapshots is true, this flag has no effect"); + DEFINE_int64(cache_size, 2 * KB * KB * KB, "Number of bytes to use as a cache of uncompressed data."); @@ -170,8 +184,8 @@ static bool ValidateInt32Positive(const char* flagname, int32_t value) { return true; } DEFINE_int32(reopen, 10, "Number of times database reopens"); -static const bool FLAGS_reopen_dummy = - google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive); +static const bool FLAGS_reopen_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive); DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. " "Negative means use default settings."); @@ -198,9 +212,9 @@ DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync"); DEFINE_int32(kill_random_test, 0, "If non-zero, kill at various points in source code with " "probability 1/this"); -static const bool FLAGS_kill_random_test_dummy = - google::RegisterFlagValidator(&FLAGS_kill_random_test, - &ValidateInt32Positive); +static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_kill_random_test, + &ValidateInt32Positive); extern int rocksdb_kill_odds; DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); @@ -226,42 +240,37 @@ static bool ValidateInt32Percent(const char* flagname, int32_t value) { } DEFINE_int32(readpercent, 10, "Ratio of reads to total workload (expressed as a percentage)"); -static const bool FLAGS_readpercent_dummy = - google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent); +static const bool FLAGS_readpercent_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent); DEFINE_int32(prefixpercent, 20, "Ratio of prefix iterators to total workload (expressed as a" " percentage)"); -static const bool FLAGS_prefixpercent_dummy = - google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent); +static 
const bool FLAGS_prefixpercent_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent); DEFINE_int32(writepercent, 45, " Ratio of deletes to total workload (expressed as a percentage)"); -static const bool FLAGS_writepercent_dummy = - google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent); +static const bool FLAGS_writepercent_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent); DEFINE_int32(delpercent, 15, "Ratio of deletes to total workload (expressed as a percentage)"); -static const bool FLAGS_delpercent_dummy = - google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent); +static const bool FLAGS_delpercent_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent); DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload" " (expressed as a percentage)"); -static const bool FLAGS_iterpercent_dummy = - google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent); +static const bool FLAGS_iterpercent_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent); DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run"); -static const bool FLAGS_num_iterations_dummy = - google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); +static const bool FLAGS_num_iterations_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); DEFINE_bool(disable_seek_compaction, false, "Option to disable compation triggered by read."); -DEFINE_uint64(delete_obsolete_files_period_micros, 0, - "Option to delete obsolete files periodically" - "0 means that obsolete files are " - " deleted after every compaction run."); - enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -290,21 +299,21 @@ 
DEFINE_string(hdfs, "", "Name of hdfs environment"); // posix or hdfs environment static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); -DEFINE_uint64(ops_per_thread, 600000, "Number of operations per thread."); -static const bool FLAGS_ops_per_thread_dummy = - google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range); +DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); +static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range); DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock"); -static const bool FLAGS_log2_keys_per_lock_dummy = - google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock, - &ValidateUint32Range); +static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock, + &ValidateUint32Range); DEFINE_int32(purge_redundant_percent, 50, "Percentage of times we want to purge redundant keys in memory " "before flushing"); -static const bool FLAGS_purge_redundant_percent_dummy = - google::RegisterFlagValidator(&FLAGS_purge_redundant_percent, - &ValidateInt32Percent); +static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_purge_redundant_percent, + &ValidateInt32Percent); DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop" " the delete if key not present"); @@ -438,16 +447,18 @@ class Stats { last_op_finish_ = now; } - done_++; - if (done_ >= next_report_) { - if (next_report_ < 1000) next_report_ += 100; - else if (next_report_ < 5000) next_report_ += 500; - else if (next_report_ < 10000) next_report_ += 1000; - else if (next_report_ < 50000) next_report_ += 5000; - else if (next_report_ < 100000) next_report_ += 10000; - else if (next_report_ < 500000) next_report_ += 50000; - else next_report_ += 100000; - fprintf(stdout, "... 
finished %ld ops%30s\r", done_, ""); + done_++; + if (FLAGS_progress_reports) { + if (done_ >= next_report_) { + if (next_report_ < 1000) next_report_ += 100; + else if (next_report_ < 5000) next_report_ += 500; + else if (next_report_ < 10000) next_report_ += 1000; + else if (next_report_ < 50000) next_report_ += 5000; + else if (next_report_ < 100000) next_report_ += 10000; + else if (next_report_ < 500000) next_report_ += 50000; + else next_report_ += 100000; + fprintf(stdout, "... finished %ld ops%30s\r", done_, ""); + } } } @@ -515,7 +526,7 @@ class Stats { // State shared by all concurrent executions of the same benchmark. class SharedState { public: - static const uint32_t SENTINEL = 0xffffffff; + static const uint32_t SENTINEL; explicit SharedState(StressTest* stress_test) : cv_(&mu_), @@ -531,28 +542,27 @@ class SharedState { start_verify_(false), stress_test_(stress_test) { if (FLAGS_test_batches_snapshots) { - key_locks_ = nullptr; - values_ = nullptr; fprintf(stdout, "No lock creation because test_batches_snapshots set\n"); return; } - values_ = new uint32_t[max_key_]; - for (long i = 0; i < max_key_; i++) { - values_[i] = SENTINEL; + values_.resize(FLAGS_column_families); + + for (int i = 0; i < FLAGS_column_families; ++i) { + values_[i] = std::vector(max_key_, SENTINEL); } long num_locks = (max_key_ >> log2_keys_per_lock_); if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) { - num_locks ++; + num_locks++; + } + fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families); + key_locks_.resize(FLAGS_column_families); + for (int i = 0; i < FLAGS_column_families; ++i) { + key_locks_[i] = std::vector(num_locks); } - fprintf(stdout, "Creating %ld locks\n", num_locks); - key_locks_ = new port::Mutex[num_locks]; } - ~SharedState() { - delete[] values_; - delete[] key_locks_; - } + ~SharedState() {} port::Mutex* GetMutex() { return &mu_; @@ -622,26 +632,36 @@ class SharedState { return start_verify_; } - port::Mutex* GetMutexForKey(long key) { - 
return &key_locks_[key >> log2_keys_per_lock_]; + port::Mutex* GetMutexForKey(int cf, long key) { + return &key_locks_[cf][key >> log2_keys_per_lock_]; } - void Put(long key, uint32_t value_base) { - values_[key] = value_base; + void LockColumnFamily(int cf) { + for (auto& mutex : key_locks_[cf]) { + mutex.Lock(); + } } - uint32_t Get(long key) const { - return values_[key]; + void UnlockColumnFamily(int cf) { + for (auto& mutex : key_locks_[cf]) { + mutex.Unlock(); + } } - void Delete(long key) const { - values_[key] = SENTINEL; + void ClearColumnFamily(int cf) { + std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL); } - uint32_t GetSeed() const { - return seed_; + void Put(int cf, long key, uint32_t value_base) { + values_[cf][key] = value_base; } + uint32_t Get(int cf, long key) const { return values_[cf][key]; } + + void Delete(int cf, long key) { values_[cf][key] = SENTINEL; } + + uint32_t GetSeed() const { return seed_; } + private: port::Mutex mu_; port::CondVar cv_; @@ -657,11 +677,12 @@ class SharedState { bool start_verify_; StressTest* stress_test_; - uint32_t *values_; - port::Mutex *key_locks_; - + std::vector> values_; + std::vector> key_locks_; }; +const uint32_t SharedState::SENTINEL = 0xffffffff; + // Per-thread state for concurrent executions of the same benchmark. struct ThreadState { uint32_t tid; // 0..n-1 @@ -682,13 +703,14 @@ class StressTest { public: StressTest() : cache_(NewLRUCache(FLAGS_cache_size)), - compressed_cache_(FLAGS_compressed_cache_size >= 0 ? - NewLRUCache(FLAGS_compressed_cache_size) : - nullptr), + compressed_cache_(FLAGS_compressed_cache_size >= 0 + ? NewLRUCache(FLAGS_compressed_cache_size) + : nullptr), filter_policy_(FLAGS_bloom_bits >= 0 - ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : nullptr), + ? 
NewBloomFilterPolicy(FLAGS_bloom_bits) + : nullptr), db_(nullptr), + new_column_family_name_(0), num_times_reopened_(0) { if (FLAGS_destroy_db_initially) { std::vector files; @@ -703,6 +725,10 @@ class StressTest { } ~StressTest() { + for (auto cf : column_families_) { + delete cf; + } + column_families_.clear(); delete db_; delete filter_policy_; } @@ -817,9 +843,9 @@ class StressTest { // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ... // ("9"+K, "9"+V) in DB atomically i.e in a single batch. // Also refer MultiGet. - Status MultiPut(ThreadState* thread, - const WriteOptions& writeoptions, - const Slice& key, const Slice& value, size_t sz) { + Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, size_t sz) { std::string keys[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; std::string values[10] = {"9", "8", "7", "6", "5", @@ -832,9 +858,9 @@ class StressTest { values[i] += value.ToString(); value_slices[i] = values[i]; if (FLAGS_use_merge) { - batch.Merge(keys[i], value_slices[i]); + batch.Merge(column_family, keys[i], value_slices[i]); } else { - batch.Put(keys[i], value_slices[i]); + batch.Put(column_family, keys[i], value_slices[i]); } } @@ -852,9 +878,8 @@ class StressTest { // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K) // in DB atomically i.e in a single batch. Also refer MultiGet. 
- Status MultiDelete(ThreadState* thread, - const WriteOptions& writeoptions, - const Slice& key) { + Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions, + ColumnFamilyHandle* column_family, const Slice& key) { std::string keys[10] = {"9", "7", "5", "3", "1", "8", "6", "4", "2", "0"}; @@ -862,7 +887,7 @@ class StressTest { Status s; for (int i = 0; i < 10; i++) { keys[i] += key.ToString(); - batch.Delete(keys[i]); + batch.Delete(column_family, keys[i]); } s = db_->Write(writeoptions, &batch); @@ -880,9 +905,9 @@ class StressTest { // in the same snapshot, and verifies that all the values are of the form // "0"+V, "1"+V,..."9"+V. // ASSUMES that MultiPut was used to put (K, V) into the DB. - Status MultiGet(ThreadState* thread, - const ReadOptions& readoptions, - const Slice& key, std::string* value) { + Status MultiGet(ThreadState* thread, const ReadOptions& readoptions, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; Slice key_slices[10]; std::string values[10]; @@ -892,7 +917,7 @@ class StressTest { for (int i = 0; i < 10; i++) { keys[i] += key.ToString(); key_slices[i] = keys[i]; - s = db_->Get(readoptionscopy, key_slices[i], value); + s = db_->Get(readoptionscopy, column_family, key_slices[i], value); if (!s.ok() && !s.IsNotFound()) { fprintf(stderr, "get error: %s\n", s.ToString().c_str()); values[i] = ""; @@ -937,8 +962,8 @@ class StressTest { // each series should be the same length, and it is verified for each // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V. 
// ASSUMES that MultiPut was used to put (K, V) - Status MultiPrefixScan(ThreadState* thread, - const ReadOptions& readoptions, + Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions, + ColumnFamilyHandle* column_family, const Slice& key) { std::string prefixes[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; @@ -954,7 +979,7 @@ class StressTest { readoptionscopy[i] = readoptions; readoptionscopy[i].prefix_seek = true; readoptionscopy[i].snapshot = snapshot; - iters[i] = db_->NewIterator(readoptionscopy[i]); + iters[i] = db_->NewIterator(readoptionscopy[i], column_family); iters[i]->Seek(prefix_slices[i]); } @@ -1012,15 +1037,14 @@ class StressTest { // Given a key K, this creates an iterator which scans to K and then // does a random sequence of Next/Prev operations. - Status MultiIterate(ThreadState* thread, - const ReadOptions& readoptions, - const Slice& key) { + Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions, + ColumnFamilyHandle* column_family, const Slice& key) { Status s; const Snapshot* snapshot = db_->GetSnapshot(); ReadOptions readoptionscopy = readoptions; readoptionscopy.snapshot = snapshot; readoptionscopy.prefix_seek = FLAGS_prefix_size > 0; - unique_ptr iter(db_->NewIterator(readoptionscopy)); + unique_ptr iter(db_->NewIterator(readoptionscopy, column_family)); iter->Seek(key); for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) { @@ -1075,15 +1099,50 @@ class StressTest { } } + if (!FLAGS_test_batches_snapshots && + FLAGS_clear_column_family_one_in != 0) { + if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) { + // drop column family and then create it again (can't drop default) + int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1; + std::string new_name = + std::to_string(new_column_family_name_.fetch_add(1)); + { + MutexLock l(thread->shared->GetMutex()); + fprintf( + stdout, + "[CF %d] Dropping and recreating column family. 
new name: %s\n", + cf, new_name.c_str()); + } + thread->shared->LockColumnFamily(cf); + Status s __attribute__((unused)); + s = db_->DropColumnFamily(column_families_[cf]); + delete column_families_[cf]; + assert(s.ok()); + s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name, + &column_families_[cf]); + column_family_names_[cf] = new_name; + thread->shared->ClearColumnFamily(cf); + assert(s.ok()); + thread->shared->UnlockColumnFamily(cf); + } + } + long rand_key = thread->rand.Next() % max_key; + int rand_column_family = thread->rand.Next() % FLAGS_column_families; std::string keystr = Key(rand_key); Slice key = keystr; int prob_op = thread->rand.Uniform(100); + std::unique_ptr l; + if (!FLAGS_test_batches_snapshots) { + l.reset(new MutexLock( + thread->shared->GetMutexForKey(rand_column_family, rand_key))); + } + auto column_family = column_families_[rand_column_family]; if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { // OPERATION read if (!FLAGS_test_batches_snapshots) { - Status s = db_->Get(read_opts, key, &from_db); + Status s = db_->Get(read_opts, column_family, key, &from_db); if (s.ok()) { // found case thread->stats.AddGets(1, 1); @@ -1095,7 +1154,7 @@ class StressTest { thread->stats.AddErrors(1); } } else { - MultiGet(thread, read_opts, key, &from_db); + MultiGet(thread, read_opts, column_family, key, &from_db); } } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) { // OPERATION prefix scan @@ -1106,7 +1165,7 @@ class StressTest { if (!FLAGS_test_batches_snapshots) { Slice prefix = Slice(key.data(), FLAGS_prefix_size); read_opts.prefix_seek = true; - Iterator* iter = db_->NewIterator(read_opts); + Iterator* iter = db_->NewIterator(read_opts, column_family); int64_t count = 0; for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { @@ -1121,7 +1180,7 @@ class StressTest { } delete iter; } else { - MultiPrefixScan(thread, read_opts, key); + MultiPrefixScan(thread, read_opts, 
column_family, key); } } else if (prefixBound <= prob_op && prob_op < writeBound) { // OPERATION write @@ -1129,42 +1188,36 @@ class StressTest { size_t sz = GenerateValue(value_base, value, sizeof(value)); Slice v(value, sz); if (!FLAGS_test_batches_snapshots) { - MutexLock l(thread->shared->GetMutexForKey(rand_key)); if (FLAGS_verify_before_write) { std::string keystr2 = Key(rand_key); Slice k = keystr2; - Status s = db_->Get(read_opts, k, &from_db); - VerifyValue(rand_key, - read_opts, - *(thread->shared), - from_db, - s, - true); + Status s = db_->Get(read_opts, column_family, k, &from_db); + VerifyValue(rand_column_family, rand_key, read_opts, + *(thread->shared), from_db, s, true); } - thread->shared->Put(rand_key, value_base); + thread->shared->Put(rand_column_family, rand_key, value_base); if (FLAGS_use_merge) { - db_->Merge(write_opts, key, v); + db_->Merge(write_opts, column_family, key, v); } else { - db_->Put(write_opts, key, v); + db_->Put(write_opts, column_family, key, v); } thread->stats.AddBytesForWrites(1, sz); } else { - MultiPut(thread, write_opts, key, v, sz); + MultiPut(thread, write_opts, column_family, key, v, sz); } - PrintKeyValue(rand_key, value, sz); + PrintKeyValue(rand_column_family, rand_key, value, sz); } else if (writeBound <= prob_op && prob_op < delBound) { // OPERATION delete if (!FLAGS_test_batches_snapshots) { - MutexLock l(thread->shared->GetMutexForKey(rand_key)); - thread->shared->Delete(rand_key); - db_->Delete(write_opts, key); + thread->shared->Delete(rand_column_family, rand_key); + db_->Delete(write_opts, column_family, key); thread->stats.AddDeletes(1); } else { - MultiDelete(thread, write_opts, key); + MultiDelete(thread, write_opts, column_family, key); } } else { // OPERATION iterate - MultiIterate(thread, read_opts, key); + MultiIterate(thread, read_opts, column_family, key); } thread->stats.FinishedSingleOp(); } @@ -1182,97 +1235,100 @@ class StressTest { if (thread->tid == shared.GetNumThreads() - 1) { end = 
max_key; } - - if (!thread->rand.OneIn(2)) { - options.prefix_seek = FLAGS_prefix_size > 0; - // Use iterator to verify this range - unique_ptr iter(db_->NewIterator(options)); - iter->Seek(Key(start)); - for (long i = start; i < end; i++) { - // TODO(ljin): update "long" to uint64_t - // Reseek when the prefix changes - if (i % (static_cast(1) << 8 * (8 - FLAGS_prefix_size)) == 0) { - iter->Seek(Key(i)); - } - std::string from_db; - std::string keystr = Key(i); - Slice k = keystr; - Status s = iter->status(); - if (iter->Valid()) { - if (iter->key().compare(k) > 0) { - s = Status::NotFound(Slice()); - } else if (iter->key().compare(k) == 0) { - from_db = iter->value().ToString(); - iter->Next(); - } else if (iter->key().compare(k) < 0) { - VerificationAbort("An out of range key was found", i); + for (size_t cf = 0; cf < column_families_.size(); ++cf) { + if (!thread->rand.OneIn(2)) { + // Use iterator to verify this range + options.prefix_seek = FLAGS_prefix_size > 0; + unique_ptr iter( + db_->NewIterator(options, column_families_[cf])); + iter->Seek(Key(start)); + for (long i = start; i < end; i++) { + // TODO(ljin): update "long" to uint64_t + // Reseek when the prefix changes + if (i % (static_cast(1) << 8 * (8 - FLAGS_prefix_size)) == + 0) { + iter->Seek(Key(i)); + } + std::string from_db; + std::string keystr = Key(i); + Slice k = keystr; + Status s = iter->status(); + if (iter->Valid()) { + if (iter->key().compare(k) > 0) { + s = Status::NotFound(Slice()); + } else if (iter->key().compare(k) == 0) { + from_db = iter->value().ToString(); + iter->Next(); + } else if (iter->key().compare(k) < 0) { + VerificationAbort("An out of range key was found", cf, i); + } + } else { + // The iterator found no value for the key in question, so do not + // move to the next item in the iterator + s = Status::NotFound(Slice()); + } + VerifyValue(cf, i, options, shared, from_db, s, true); + if (from_db.length()) { + PrintKeyValue(cf, i, from_db.data(), from_db.length()); } - } 
else { - // The iterator found no value for the key in question, so do not - // move to the next item in the iterator - s = Status::NotFound(Slice()); } - VerifyValue(i, options, shared, from_db, s, true); - if (from_db.length()) { - PrintKeyValue(i, from_db.data(), from_db.length()); - } - } - } else { - // Use Get to verify this range - for (long i = start; i < end; i++) { - std::string from_db; - std::string keystr = Key(i); - Slice k = keystr; - Status s = db_->Get(options, k, &from_db); - VerifyValue(i, options, shared, from_db, s, true); - if (from_db.length()) { - PrintKeyValue(i, from_db.data(), from_db.length()); + } else { + // Use Get to verify this range + for (long i = start; i < end; i++) { + std::string from_db; + std::string keystr = Key(i); + Slice k = keystr; + Status s = db_->Get(options, column_families_[cf], k, &from_db); + VerifyValue(cf, i, options, shared, from_db, s, true); + if (from_db.length()) { + PrintKeyValue(cf, i, from_db.data(), from_db.length()); + } } } } } - void VerificationAbort(std::string msg, long key) const { - fprintf(stderr, "Verification failed for key %ld: %s\n", - key, msg.c_str()); + void VerificationAbort(std::string msg, int cf, long key) const { + fprintf(stderr, "Verification failed for column family %d key %ld: %s\n", + cf, key, msg.c_str()); exit(1); } - void VerifyValue(long key, - const ReadOptions &opts, - const SharedState &shared, - const std::string &value_from_db, - Status s, - bool strict=false) const { + void VerifyValue(int cf, long key, const ReadOptions& opts, + const SharedState& shared, const std::string& value_from_db, + Status s, bool strict = false) const { // compare value_from_db with the value in the shared state char value[100]; - uint32_t value_base = shared.Get(key); + uint32_t value_base = shared.Get(cf, key); if (value_base == SharedState::SENTINEL && !strict) { return; } if (s.ok()) { if (value_base == SharedState::SENTINEL) { - VerificationAbort("Unexpected value found", key); + 
VerificationAbort("Unexpected value found", cf, key); } size_t sz = GenerateValue(value_base, value, sizeof(value)); if (value_from_db.length() != sz) { - VerificationAbort("Length of value read is not equal", key); + VerificationAbort("Length of value read is not equal", cf, key); } if (memcmp(value_from_db.data(), value, sz) != 0) { - VerificationAbort("Contents of value read don't match", key); + VerificationAbort("Contents of value read don't match", cf, key); } } else { if (value_base != SharedState::SENTINEL) { - VerificationAbort("Value not found", key); + VerificationAbort("Value not found", cf, key); } } } - static void PrintKeyValue(uint32_t key, const char *value, size_t sz) { - if (!FLAGS_verbose) return; - fprintf(stdout, "%u ==> (%u) ", key, (unsigned int)sz); - for (size_t i=0; i (%u) ", cf, key, (unsigned int)sz); + for (size_t i = 0; i < sz; i++) { fprintf(stdout, "%X", value[i]); } fprintf(stdout, "\n"); @@ -1290,8 +1346,13 @@ class StressTest { } void PrintEnv() const { - fprintf(stdout, "LevelDB version : %d.%d\n", - kMajorVersion, kMinorVersion); + fprintf(stdout, "RocksDB version : %d.%d\n", kMajorVersion, + kMinorVersion); + fprintf(stdout, "Column families : %d\n", FLAGS_column_families); + if (!FLAGS_test_batches_snapshots) { + fprintf(stdout, "Clear CFs one in : %d\n", + FLAGS_clear_column_family_one_in); + } fprintf(stdout, "Number of threads : %d\n", FLAGS_threads); fprintf(stdout, "Ops per thread : %lu\n", @@ -1368,43 +1429,41 @@ class StressTest { void Open() { assert(db_ == nullptr); - Options options; - options.block_cache = cache_; - options.block_cache_compressed = compressed_cache_; - options.write_buffer_size = FLAGS_write_buffer_size; - options.max_write_buffer_number = FLAGS_max_write_buffer_number; - options.min_write_buffer_number_to_merge = - FLAGS_min_write_buffer_number_to_merge; - options.max_background_compactions = FLAGS_max_background_compactions; - options.compaction_style = - static_cast(FLAGS_compaction_style); - 
options.block_size = FLAGS_block_size; - options.filter_policy = filter_policy_; - options.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size)); - options.max_open_files = FLAGS_open_files; - options.statistics = dbstats; - options.env = FLAGS_env; - options.disableDataSync = FLAGS_disable_data_sync; - options.use_fsync = FLAGS_use_fsync; - options.allow_mmap_reads = FLAGS_mmap_read; + options_.block_cache = cache_; + options_.block_cache_compressed = compressed_cache_; + options_.write_buffer_size = FLAGS_write_buffer_size; + options_.max_write_buffer_number = FLAGS_max_write_buffer_number; + options_.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + options_.max_background_compactions = FLAGS_max_background_compactions; + options_.max_background_flushes = FLAGS_max_background_flushes; + options_.compaction_style = + static_cast(FLAGS_compaction_style); + options_.block_size = FLAGS_block_size; + options_.filter_policy = filter_policy_; + options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size)); + options_.max_open_files = FLAGS_open_files; + options_.statistics = dbstats; + options_.env = FLAGS_env; + options_.disableDataSync = FLAGS_disable_data_sync; + options_.use_fsync = FLAGS_use_fsync; + options_.allow_mmap_reads = FLAGS_mmap_read; rocksdb_kill_odds = FLAGS_kill_random_test; - options.target_file_size_base = FLAGS_target_file_size_base; - options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; - options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; - options.max_bytes_for_level_multiplier = + options_.target_file_size_base = FLAGS_target_file_size_base; + options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier; + options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; + options_.max_bytes_for_level_multiplier = FLAGS_max_bytes_for_level_multiplier; - options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; - 
options.level0_slowdown_writes_trigger = - FLAGS_level0_slowdown_writes_trigger; - options.level0_file_num_compaction_trigger = - FLAGS_level0_file_num_compaction_trigger; - options.compression = FLAGS_compression_type_e; - options.create_if_missing = true; - options.disable_seek_compaction = FLAGS_disable_seek_compaction; - options.delete_obsolete_files_period_micros = - FLAGS_delete_obsolete_files_period_micros; - options.max_manifest_file_size = 1024; - options.filter_deletes = FLAGS_filter_deletes; + options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; + options_.level0_slowdown_writes_trigger = + FLAGS_level0_slowdown_writes_trigger; + options_.level0_file_num_compaction_trigger = + FLAGS_level0_file_num_compaction_trigger; + options_.compression = FLAGS_compression_type_e; + options_.create_if_missing = true; + options_.disable_seek_compaction = FLAGS_disable_seek_compaction; + options_.max_manifest_file_size = 10 * 1024; + options_.filter_deletes = FLAGS_filter_deletes; if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { fprintf(stderr, "prefix_size should be non-zero iff memtablerep == prefix_hash\n"); @@ -1412,51 +1471,107 @@ class StressTest { } switch (FLAGS_rep_factory) { case kHashSkipList: - options.memtable_factory.reset(NewHashSkipListRepFactory()); + options_.memtable_factory.reset(NewHashSkipListRepFactory()); break; case kSkipList: // no need to do anything break; case kVectorRep: - options.memtable_factory.reset(new VectorRepFactory()); + options_.memtable_factory.reset(new VectorRepFactory()); break; } static Random purge_percent(1000); // no benefit from non-determinism here if (static_cast(purge_percent.Uniform(100)) < FLAGS_purge_redundant_percent - 1) { - options.purge_redundant_kvs_while_flush = false; + options_.purge_redundant_kvs_while_flush = false; } if (FLAGS_use_merge) { - options.merge_operator = MergeOperators::CreatePutOperator(); + options_.merge_operator = 
MergeOperators::CreatePutOperator(); } // set universal style compaction configurations, if applicable if (FLAGS_universal_size_ratio != 0) { - options.compaction_options_universal.size_ratio = - FLAGS_universal_size_ratio; + options_.compaction_options_universal.size_ratio = + FLAGS_universal_size_ratio; } if (FLAGS_universal_min_merge_width != 0) { - options.compaction_options_universal.min_merge_width = - FLAGS_universal_min_merge_width; + options_.compaction_options_universal.min_merge_width = + FLAGS_universal_min_merge_width; } if (FLAGS_universal_max_merge_width != 0) { - options.compaction_options_universal.max_merge_width = - FLAGS_universal_max_merge_width; + options_.compaction_options_universal.max_merge_width = + FLAGS_universal_max_merge_width; } if (FLAGS_universal_max_size_amplification_percent != 0) { - options.compaction_options_universal.max_size_amplification_percent = - FLAGS_universal_max_size_amplification_percent; + options_.compaction_options_universal.max_size_amplification_percent = + FLAGS_universal_max_size_amplification_percent; } fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); Status s; if (FLAGS_ttl == -1) { - s = DB::Open(options, FLAGS_db, &db_); + std::vector existing_column_families; + s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db, + &existing_column_families); // ignore errors + if (!s.ok()) { + // DB doesn't exist + assert(existing_column_families.empty()); + assert(column_family_names_.empty()); + column_family_names_.push_back(default_column_family_name); + } else if (column_family_names_.empty()) { + // this is the first call to the function Open() + column_family_names_ = existing_column_families; + } else { + // this is a reopen. 
just assert that existing column_family_names are + // equivalent to what we remember + auto sorted_cfn = column_family_names_; + sort(sorted_cfn.begin(), sorted_cfn.end()); + sort(existing_column_families.begin(), existing_column_families.end()); + if (sorted_cfn != existing_column_families) { + fprintf(stderr, + "Expected column families differ from the existing:\n"); + printf("Expected: {"); + for (auto cf : sorted_cfn) { + printf("%s ", cf.c_str()); + } + printf("}\n"); + printf("Existing: {"); + for (auto cf : existing_column_families) { + printf("%s ", cf.c_str()); + } + printf("}\n"); + } + assert(sorted_cfn == existing_column_families); + } + std::vector cf_descriptors; + for (auto name : column_family_names_) { + if (name != default_column_family_name) { + new_column_family_name_ = + std::max(new_column_family_name_.load(), std::stoi(name) + 1); + } + cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_)); + } + s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, + &column_families_, &db_); + if (s.ok()) { + while (s.ok() && + column_families_.size() < (size_t)FLAGS_column_families) { + ColumnFamilyHandle* cf = nullptr; + std::string name = std::to_string(new_column_family_name_.load()); + new_column_family_name_++; + s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), name, &cf); + column_families_.push_back(cf); + column_family_names_.push_back(name); + } + } + assert(!s.ok() || column_families_.size() == + static_cast(FLAGS_column_families)); } else { - s = UtilityDB::OpenTtlDB(options, FLAGS_db, &sdb_, FLAGS_ttl); - db_ = sdb_; + StackableDB* sdb; + s = UtilityDB::OpenTtlDB(options_, FLAGS_db, &sdb, FLAGS_ttl); + db_ = sdb; } if (!s.ok()) { fprintf(stderr, "open error: %s\n", s.ToString().c_str()); @@ -1465,13 +1580,11 @@ class StressTest { } void Reopen() { - // do not close the db. Just delete the lock file. This - // simulates a crash-recovery kind of situation. 
- if (FLAGS_ttl != -1) { - ((DBWithTTL*) db_)->TEST_Destroy_DBWithTtl(); - } else { - ((DBImpl*) db_)->TEST_Destroy_DBImpl(); + for (auto cf : column_families_) { + delete cf; } + column_families_.clear(); + delete db_; db_ = nullptr; num_times_reopened_++; @@ -1493,14 +1606,15 @@ class StressTest { shared_ptr compressed_cache_; const FilterPolicy* filter_policy_; DB* db_; - StackableDB* sdb_; + Options options_; + std::vector column_families_; + std::vector column_family_names_; + std::atomic new_column_family_name_; int num_times_reopened_; }; } // namespace rocksdb - - int main(int argc, char** argv) { google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + " [OPTIONS]..."); diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc index 067404340..19c2b8ca3 100644 --- a/util/auto_roll_logger.cc +++ b/util/auto_roll_logger.cc @@ -81,7 +81,7 @@ Status CreateLoggerFromOptions( const std::string& dbname, const std::string& db_log_dir, Env* env, - const Options& options, + const DBOptions& options, std::shared_ptr* logger) { std::string db_absolute_path; env->GetAbsolutePath(dbname, &db_absolute_path); diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index 5fd7a1472..4c755f2ab 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -85,7 +85,7 @@ Status CreateLoggerFromOptions( const std::string& dbname, const std::string& db_log_dir, Env* env, - const Options& options, + const DBOptions& options, std::shared_ptr* logger); } // namespace rocksdb diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 7e30a2777..742713e9d 100755 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -197,7 +197,7 @@ TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { } TEST(AutoRollLoggerTest, CreateLoggerFromOptions) { - Options options; + DBOptions options; shared_ptr logger; // Normal logger diff --git a/util/crc32c.cc b/util/crc32c.cc index 50178ae71..d27fb4be9 100644 --- 
a/util/crc32c.cc +++ b/util/crc32c.cc @@ -314,24 +314,12 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { } static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { - #ifdef __SSE4_2__ +#ifdef __SSE4_2__ *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); *p += 8; - #else +#else Slow_CRC32(l, p); - #endif -} - -// Detect if SS42 or not. -static bool isSSE42() { - #ifdef __GNUC__ - uint32_t c_; - uint32_t d_; - __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx"); - return c_ & (1U << 20); // copied from CpuId.h in Folly. - #else - return false; - #endif +#endif } template @@ -377,6 +365,18 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { return l ^ 0xffffffffu; } +// Detect if SS42 or not. +static bool isSSE42() { +#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE) + uint32_t c_; + uint32_t d_; + __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx"); + return c_ & (1U << 20); // copied from CpuId.h in Folly. +#else + return false; +#endif +} + typedef uint32_t (*Function)(uint32_t, const char*, size_t); static inline Function Choose_Extend() { diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index e8bbc38e1..4a34d509a 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -3,6 +3,8 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
+#define __STDC_FORMAT_MACROS +#include #include #include @@ -74,11 +76,12 @@ TEST(DynamicBloomTest, VaryingLengths) { // Count number of filters that significantly exceed the false positive rate int mediocre_filters = 0; int good_filters = 0; + uint32_t num_probes = static_cast(FLAGS_num_probes); fprintf(stderr, "bits_per_key: %d num_probes: %d\n", - FLAGS_bits_per_key, FLAGS_num_probes); + FLAGS_bits_per_key, num_probes); - for (uint32_t cl_per_block = 0; cl_per_block < FLAGS_num_probes; + for (uint32_t cl_per_block = 0; cl_per_block < num_probes; ++cl_per_block) { for (uint32_t num = 1; num <= 10000; num = NextNum(num)) { uint32_t bloom_bits = 0; @@ -88,7 +91,7 @@ TEST(DynamicBloomTest, VaryingLengths) { bloom_bits = std::max(num * FLAGS_bits_per_key, cl_per_block * CACHE_LINE_SIZE * 8); } - DynamicBloom bloom(bloom_bits, cl_per_block, FLAGS_num_probes); + DynamicBloom bloom(bloom_bits, cl_per_block, num_probes); for (uint64_t i = 0; i < num; i++) { bloom.Add(Key(i, buffer)); ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); @@ -127,6 +130,7 @@ TEST(DynamicBloomTest, VaryingLengths) { TEST(DynamicBloomTest, perf) { StopWatchNano timer(Env::Default()); + uint32_t num_probes = static_cast(FLAGS_num_probes); if (!FLAGS_enable_perf) { return; @@ -134,9 +138,9 @@ TEST(DynamicBloomTest, perf) { for (uint64_t m = 1; m <= 8; ++m) { const uint64_t num_keys = m * 8 * 1024 * 1024; - fprintf(stderr, "testing %luM keys\n", m * 8); + fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8); - DynamicBloom std_bloom(num_keys * 10, 0, FLAGS_num_probes); + DynamicBloom std_bloom(num_keys * 10, 0, num_probes); timer.Start(); for (uint64_t i = 1; i <= num_keys; ++i) { @@ -144,7 +148,7 @@ TEST(DynamicBloomTest, perf) { } uint64_t elapsed = timer.ElapsedNanos(); - fprintf(stderr, "standard bloom, avg add latency %lu\n", + fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n", elapsed / num_keys); uint64_t count = 0; @@ -155,13 +159,13 @@ TEST(DynamicBloomTest, perf) { } } 
elapsed = timer.ElapsedNanos(); - fprintf(stderr, "standard bloom, avg query latency %lu\n", + fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n", elapsed / count); ASSERT_TRUE(count == num_keys); - for (int cl_per_block = 1; cl_per_block <= FLAGS_num_probes; + for (uint32_t cl_per_block = 1; cl_per_block <= num_probes; ++cl_per_block) { - DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, FLAGS_num_probes); + DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes); timer.Start(); for (uint64_t i = 1; i <= num_keys; ++i) { @@ -169,7 +173,7 @@ TEST(DynamicBloomTest, perf) { } uint64_t elapsed = timer.ElapsedNanos(); - fprintf(stderr, "blocked bloom(%d), avg add latency %lu\n", + fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n", cl_per_block, elapsed / num_keys); uint64_t count = 0; @@ -182,7 +186,7 @@ TEST(DynamicBloomTest, perf) { } elapsed = timer.ElapsedNanos(); - fprintf(stderr, "blocked bloom(%d), avg query latency %lu\n", + fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n", cl_per_block, elapsed / count); ASSERT_TRUE(count == num_keys); } diff --git a/util/env.cc b/util/env.cc index f2ebfcd59..5ae6fdf0b 100644 --- a/util/env.cc +++ b/util/env.cc @@ -231,7 +231,7 @@ EnvWrapper::~EnvWrapper() { namespace { // anonymous namespace -void AssignEnvOptions(EnvOptions* env_options, const Options& options) { +void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) { env_options->use_os_buffer = options.allow_os_buffer; env_options->use_mmap_reads = options.allow_mmap_reads; env_options->use_mmap_writes = options.allow_mmap_writes; @@ -249,12 +249,12 @@ EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const { return env_options; } -EnvOptions::EnvOptions(const Options& options) { +EnvOptions::EnvOptions(const DBOptions& options) { AssignEnvOptions(this, options); } EnvOptions::EnvOptions() { - Options options; + DBOptions options; AssignEnvOptions(this, options); 
} diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index f1f064fb3..441f5c993 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -22,12 +22,6 @@ namespace { typedef const char* Key; struct Node { - explicit Node(const Key& k) : - key(k) { - } - - Key const key; - // Accessors/mutators for links. Wrapped in methods so we can // add the appropriate barriers as necessary. Node* Next() { @@ -40,17 +34,19 @@ struct Node { // pointer observes a fully initialized version of the inserted node. next_.Release_Store(x); } - // No-barrier variants that can be safely used in a few locations. Node* NoBarrier_Next() { return reinterpret_cast(next_.NoBarrier_Load()); } + void NoBarrier_SetNext(Node* x) { next_.NoBarrier_Store(x); } -private: + private: port::AtomicPointer next_; + public: + char key[0]; }; class HashLinkListRep : public MemTableRep { @@ -58,7 +54,9 @@ class HashLinkListRep : public MemTableRep { HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, const SliceTransform* transform, size_t bucket_size); - virtual void Insert(const char* key) override; + virtual KeyHandle Allocate(const size_t len, char** buf) override; + + virtual void Insert(KeyHandle handle) override; virtual bool Contains(const char* key) const override; @@ -93,8 +91,6 @@ class HashLinkListRep : public MemTableRep { const SliceTransform* transform_; const MemTableRep::KeyComparator& compare_; - // immutable after construction - Arena* const arena_; bool BucketContains(Node* head, const Slice& key) const; @@ -114,11 +110,6 @@ class HashLinkListRep : public MemTableRep { return GetBucket(GetHash(slice)); } - Node* NewNode(const Key& key) { - char* mem = arena_->AllocateAligned(sizeof(Node)); - return new (mem) Node(key); - } - bool Equal(const Slice& a, const Key& b) const { return (compare_(b, a) == 0); } @@ -318,10 +309,10 @@ class HashLinkListRep : public MemTableRep { HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& 
compare, Arena* arena, const SliceTransform* transform, size_t bucket_size) - : bucket_size_(bucket_size), + : MemTableRep(arena), + bucket_size_(bucket_size), transform_(transform), - compare_(compare), - arena_(arena) { + compare_(compare) { char* mem = arena_->AllocateAligned( sizeof(port::AtomicPointer) * bucket_size); @@ -335,15 +326,22 @@ HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, HashLinkListRep::~HashLinkListRep() { } -void HashLinkListRep::Insert(const char* key) { - assert(!Contains(key)); - Slice internal_key = GetLengthPrefixedSlice(key); +KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { + char* mem = arena_->AllocateAligned(sizeof(Node) + len); + Node* x = new (mem) Node(); + *buf = x->key; + return static_cast(x); +} + +void HashLinkListRep::Insert(KeyHandle handle) { + Node* x = static_cast(handle); + assert(!Contains(x->key)); + Slice internal_key = GetLengthPrefixedSlice(x->key); auto transformed = GetPrefix(internal_key); auto& bucket = buckets_[GetHash(transformed)]; Node* head = static_cast(bucket.Acquire_Load()); if (!head) { - Node* x = NewNode(key); // NoBarrier_SetNext() suffices since we will add a barrier when // we publish a pointer to "x" in prev[i]. x->NoBarrier_SetNext(nullptr); @@ -372,9 +370,7 @@ void HashLinkListRep::Insert(const char* key) { } // Our data structure does not allow duplicate insertion - assert(cur == nullptr || !Equal(key, cur->key)); - - Node* x = NewNode(key); + assert(cur == nullptr || !Equal(x->key, cur->key)); // NoBarrier_SetNext() suffices since we will add a barrier when // we publish a pointer to "x" in prev[i]. 
diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index ee92e7952..230fae957 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -25,7 +25,7 @@ class HashSkipListRep : public MemTableRep { const SliceTransform* transform, size_t bucket_size, int32_t skiplist_height, int32_t skiplist_branching_factor); - virtual void Insert(const char* key) override; + virtual void Insert(KeyHandle handle) override; virtual bool Contains(const char* key) const override; @@ -225,7 +225,8 @@ HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena, const SliceTransform* transform, size_t bucket_size, int32_t skiplist_height, int32_t skiplist_branching_factor) - : bucket_size_(bucket_size), + : MemTableRep(arena), + bucket_size_(bucket_size), skiplist_height_(skiplist_height), skiplist_branching_factor_(skiplist_branching_factor), transform_(transform), @@ -255,7 +256,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( return bucket; } -void HashSkipListRep::Insert(const char* key) { +void HashSkipListRep::Insert(KeyHandle handle) { + auto* key = static_cast(handle); assert(!Contains(key)); auto transformed = transform_->Transform(UserKey(key)); auto bucket = GetInitializedBucket(transformed); diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 30288e721..698fab36e 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -11,6 +11,7 @@ #include "db/filename.h" #include "db/write_batch_internal.h" #include "rocksdb/write_batch.h" +#include "rocksdb/cache.h" #include "util/coding.h" #include @@ -152,6 +153,8 @@ LDBCommand* LDBCommand::SelectCommand( return new DBLoaderCommand(cmdParams, option_map, flags); } else if (cmd == ManifestDumpCommand::Name()) { return new ManifestDumpCommand(cmdParams, option_map, flags); + } else if (cmd == ListColumnFamiliesCommand::Name()) { + return new ListColumnFamiliesCommand(cmdParams, option_map, flags); } else if (cmd == InternalDumpCommand::Name()) { return new 
InternalDumpCommand(cmdParams, option_map, flags); } else if (cmd == CheckConsistencyCommand::Name()) { @@ -540,11 +543,10 @@ void ManifestDumpCommand::DoCommand() { EnvOptions sopt; std::string file(manifestfile); std::string dbname("dummy"); - TableCache* tc = new TableCache(dbname, &options, sopt, 10); - const InternalKeyComparator* cmp = - new InternalKeyComparator(options.comparator); - - VersionSet* versions = new VersionSet(dbname, &options, sopt, tc, cmp); + std::shared_ptr tc(NewLRUCache( + options.max_open_files - 10, options.table_cache_numshardbits, + options.table_cache_remove_scan_count_limit)); + VersionSet* versions = new VersionSet(dbname, &options, sopt, tc.get()); Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_); if (!s.ok()) { printf("Error in processing file %s %s\n", manifestfile.c_str(), @@ -557,6 +559,48 @@ void ManifestDumpCommand::DoCommand() { // ---------------------------------------------------------------------------- +void ListColumnFamiliesCommand::Help(string& ret) { + ret.append(" "); + ret.append(ListColumnFamiliesCommand::Name()); + ret.append(" full_path_to_db_directory "); + ret.append("\n"); +} + +ListColumnFamiliesCommand::ListColumnFamiliesCommand( + const vector& params, const map& options, + const vector& flags) + : LDBCommand(options, flags, false, {}) { + + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::FAILED( + "dbname must be specified for the list_column_families command"); + } else { + dbname_ = params[0]; + } +} + +void ListColumnFamiliesCommand::DoCommand() { + vector column_families; + Status s = DB::ListColumnFamilies(DBOptions(), dbname_, &column_families); + if (!s.ok()) { + printf("Error in processing db %s %s\n", dbname_.c_str(), + s.ToString().c_str()); + } else { + printf("Column families in %s: \n{", dbname_.c_str()); + bool first = true; + for (auto cf : column_families) { + if (!first) { + printf(", "); + } + first = false; + printf("%s", cf.c_str()); + } + 
printf("}\n"); + } +} + +// ---------------------------------------------------------------------------- + string ReadableTime(int unixtime) { char time_buffer [80]; time_t rawtime = unixtime; @@ -1018,19 +1062,26 @@ Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() { Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) { EnvOptions soptions; - TableCache tc(db_path_, &opt, soptions, 10); + std::shared_ptr tc( + NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits, + opt.table_cache_remove_scan_count_limit)); const InternalKeyComparator cmp(opt.comparator); - VersionSet versions(db_path_, &opt, soptions, &tc, &cmp); + VersionSet versions(db_path_, &opt, soptions, tc.get()); + std::vector dummy; + ColumnFamilyDescriptor dummy_descriptor(default_column_family_name, + ColumnFamilyOptions(opt)); + dummy.push_back(dummy_descriptor); // We rely the VersionSet::Recover to tell us the internal data structures // in the db. And the Recover() should never do any change // (like LogAndApply) to the manifest file. 
- Status st = versions.Recover(); + Status st = versions.Recover(dummy); if (!st.ok()) { return st; } int max = -1; - for (int i = 0; i < versions.NumberLevels(); i++) { - if (versions.current()->NumLevelFiles(i)) { + auto default_cfd = versions.GetColumnFamilySet()->GetDefault(); + for (int i = 0; i < default_cfd->NumberLevels(); i++) { + if (default_cfd->current()->NumLevelFiles(i)) { max = i; } } @@ -1075,7 +1126,6 @@ void ReduceDBLevelsCommand::DoCommand() { CloseDB(); EnvOptions soptions; - st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index 17343bb18..f0ac59158 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -484,6 +484,23 @@ private: static const string ARG_PATH; }; +class ListColumnFamiliesCommand : public LDBCommand { + public: + static string Name() { return "list_column_families"; } + + ListColumnFamiliesCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + virtual void DoCommand(); + + virtual bool NoDBOpen() { return true; } + + private: + string dbname_; +}; + class ReduceDBLevelsCommand : public LDBCommand { public: static string Name() { return "reduce_levels"; } diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index 10d5a1fa1..134547b19 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -64,6 +64,7 @@ public: DBDumperCommand::Help(ret); DBLoaderCommand::Help(ret); ManifestDumpCommand::Help(ret); + ListColumnFamiliesCommand::Help(ret); InternalDumpCommand::Help(ret); fprintf(stderr, "%s\n", ret.c_str()); diff --git a/util/options.cc b/util/options.cc index 5c5bab557..56a0d43f1 100644 --- a/util/options.cc +++ b/util/options.cc @@ -26,23 +26,17 @@ namespace rocksdb { -Options::Options() +ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), merge_operator(nullptr), compaction_filter(nullptr), 
compaction_filter_factory(std::shared_ptr( new DefaultCompactionFilterFactory())), - compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()), - create_if_missing(false), - error_if_exists(false), - paranoid_checks(true), - env(Env::Default()), - info_log(nullptr), - info_log_level(INFO), + compaction_filter_factory_v2( + new DefaultCompactionFilterFactoryV2()), write_buffer_size(4 << 20), max_write_buffer_number(2), min_write_buffer_number_to_merge(1), - max_open_files(5000), block_cache(nullptr), block_cache_compressed(nullptr), block_size(4096), @@ -64,42 +58,15 @@ Options::Options() expanded_compaction_factor(25), source_compaction_factor(1), max_grandparent_overlap_factor(10), - disableDataSync(false), - use_fsync(false), - db_stats_log_interval(1800), - db_log_dir(""), - wal_dir(""), disable_seek_compaction(true), - delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL), - max_background_compactions(1), - max_background_flushes(1), - max_log_file_size(0), - log_file_time_to_roll(0), - keep_log_file_num(1000), soft_rate_limit(0.0), hard_rate_limit(0.0), rate_limit_delay_max_milliseconds(1000), - max_manifest_file_size(std::numeric_limits::max()), no_block_cache(false), - table_cache_numshardbits(4), - table_cache_remove_scan_count_limit(16), arena_block_size(0), disable_auto_compactions(false), - WAL_ttl_seconds(0), - WAL_size_limit_MB(0), - manifest_preallocation_size(4 * 1024 * 1024), purge_redundant_kvs_while_flush(true), - allow_os_buffer(true), - allow_mmap_reads(false), - allow_mmap_writes(false), - is_fd_close_on_exec(true), - skip_log_error_on_recovery(false), - stats_dump_period_sec(3600), block_size_deviation(10), - advise_random_on_open(true), - access_hint_on_compaction_start(NORMAL), - use_adaptive_mutex(false), - bytes_per_sync(0), compaction_style(kCompactionStyleLevel), verify_checksums_in_compaction(true), filter_deletes(false), @@ -114,38 +81,229 @@ Options::Options() memtable_prefix_bloom_probes(6), bloom_locality(0), 
max_successive_merges(0), - min_partial_merge_operands(2), - allow_thread_local(true) { + min_partial_merge_operands(2) { assert(memtable_factory.get() != nullptr); } +ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) + : comparator(options.comparator), + merge_operator(options.merge_operator), + compaction_filter(options.compaction_filter), + compaction_filter_factory(options.compaction_filter_factory), + compaction_filter_factory_v2(options.compaction_filter_factory_v2), + write_buffer_size(options.write_buffer_size), + max_write_buffer_number(options.max_write_buffer_number), + min_write_buffer_number_to_merge( + options.min_write_buffer_number_to_merge), + block_cache(options.block_cache), + block_cache_compressed(options.block_cache_compressed), + block_size(options.block_size), + block_restart_interval(options.block_restart_interval), + compression(options.compression), + compression_per_level(options.compression_per_level), + compression_opts(options.compression_opts), + filter_policy(options.filter_policy), + prefix_extractor(options.prefix_extractor), + whole_key_filtering(options.whole_key_filtering), + num_levels(options.num_levels), + level0_file_num_compaction_trigger( + options.level0_file_num_compaction_trigger), + level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), + level0_stop_writes_trigger(options.level0_stop_writes_trigger), + max_mem_compaction_level(options.max_mem_compaction_level), + target_file_size_base(options.target_file_size_base), + target_file_size_multiplier(options.target_file_size_multiplier), + max_bytes_for_level_base(options.max_bytes_for_level_base), + max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), + max_bytes_for_level_multiplier_additional( + options.max_bytes_for_level_multiplier_additional), + expanded_compaction_factor(options.expanded_compaction_factor), + source_compaction_factor(options.source_compaction_factor), + 
max_grandparent_overlap_factor(options.max_grandparent_overlap_factor), + disable_seek_compaction(options.disable_seek_compaction), + soft_rate_limit(options.soft_rate_limit), + hard_rate_limit(options.hard_rate_limit), + rate_limit_delay_max_milliseconds( + options.rate_limit_delay_max_milliseconds), + no_block_cache(options.no_block_cache), + arena_block_size(options.arena_block_size), + disable_auto_compactions(options.disable_auto_compactions), + purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), + block_size_deviation(options.block_size_deviation), + compaction_style(options.compaction_style), + verify_checksums_in_compaction(options.verify_checksums_in_compaction), + compaction_options_universal(options.compaction_options_universal), + filter_deletes(options.filter_deletes), + max_sequential_skip_in_iterations( + options.max_sequential_skip_in_iterations), + memtable_factory(options.memtable_factory), + table_factory(options.table_factory), + table_properties_collectors(options.table_properties_collectors), + inplace_update_support(options.inplace_update_support), + inplace_update_num_locks(options.inplace_update_num_locks), + inplace_callback(options.inplace_callback), + memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), + bloom_locality(options.bloom_locality), + max_successive_merges(options.max_successive_merges), + min_partial_merge_operands(options.min_partial_merge_operands) { + assert(memtable_factory.get() != nullptr); +} + +DBOptions::DBOptions() + : create_if_missing(false), + error_if_exists(false), + paranoid_checks(true), + env(Env::Default()), + info_log(nullptr), + info_log_level(INFO), + max_open_files(5000), + statistics(nullptr), + disableDataSync(false), + use_fsync(false), + db_stats_log_interval(1800), + db_log_dir(""), + wal_dir(""), + delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL), + max_background_compactions(1), + 
max_background_flushes(1), + max_log_file_size(0), + log_file_time_to_roll(0), + keep_log_file_num(1000), + max_manifest_file_size(std::numeric_limits::max()), + table_cache_numshardbits(4), + table_cache_remove_scan_count_limit(16), + WAL_ttl_seconds(0), + WAL_size_limit_MB(0), + manifest_preallocation_size(4 * 1024 * 1024), + allow_os_buffer(true), + allow_mmap_reads(false), + allow_mmap_writes(false), + is_fd_close_on_exec(true), + skip_log_error_on_recovery(false), + stats_dump_period_sec(3600), + advise_random_on_open(true), + access_hint_on_compaction_start(NORMAL), + use_adaptive_mutex(false), + bytes_per_sync(0), + allow_thread_local(true) {} + +DBOptions::DBOptions(const Options& options) + : create_if_missing(options.create_if_missing), + error_if_exists(options.error_if_exists), + paranoid_checks(options.paranoid_checks), + env(options.env), + info_log(options.info_log), + info_log_level(options.info_log_level), + max_open_files(options.max_open_files), + statistics(options.statistics), + disableDataSync(options.disableDataSync), + use_fsync(options.use_fsync), + db_stats_log_interval(options.db_stats_log_interval), + db_log_dir(options.db_log_dir), + wal_dir(options.wal_dir), + delete_obsolete_files_period_micros( + options.delete_obsolete_files_period_micros), + max_background_compactions(options.max_background_compactions), + max_background_flushes(options.max_background_flushes), + max_log_file_size(options.max_log_file_size), + log_file_time_to_roll(options.log_file_time_to_roll), + keep_log_file_num(options.keep_log_file_num), + max_manifest_file_size(options.max_manifest_file_size), + table_cache_numshardbits(options.table_cache_numshardbits), + table_cache_remove_scan_count_limit( + options.table_cache_remove_scan_count_limit), + WAL_ttl_seconds(options.WAL_ttl_seconds), + WAL_size_limit_MB(options.WAL_size_limit_MB), + manifest_preallocation_size(options.manifest_preallocation_size), + allow_os_buffer(options.allow_os_buffer), + 
allow_mmap_reads(options.allow_mmap_reads), + allow_mmap_writes(options.allow_mmap_writes), + is_fd_close_on_exec(options.is_fd_close_on_exec), + skip_log_error_on_recovery(options.skip_log_error_on_recovery), + stats_dump_period_sec(options.stats_dump_period_sec), + advise_random_on_open(options.advise_random_on_open), + access_hint_on_compaction_start(options.access_hint_on_compaction_start), + use_adaptive_mutex(options.use_adaptive_mutex), + bytes_per_sync(options.bytes_per_sync), + allow_thread_local(options.allow_thread_local) {} + static const char* const access_hints[] = { "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" }; -void -Options::Dump(Logger* log) const -{ - Log(log," Options.comparator: %s", comparator->Name()); - Log(log," Options.merge_operator: %s", - merge_operator? merge_operator->Name() : "None"); - Log(log," Options.compaction_filter: %s", - compaction_filter? compaction_filter->Name() : "None"); - Log(log," Options.compaction_filter_factory: %s", - compaction_filter_factory->Name()); - Log(log, " Options.compaction_filter_factory_v2: %s", - compaction_filter_factory_v2->Name()); - Log(log," Options.memtable_factory: %s", - memtable_factory->Name()); - Log(log," Options.table_factory: %s", table_factory->Name()); +void DBOptions::Dump(Logger* log) const { Log(log," Options.error_if_exists: %d", error_if_exists); Log(log," Options.create_if_missing: %d", create_if_missing); Log(log," Options.paranoid_checks: %d", paranoid_checks); Log(log," Options.env: %p", env); Log(log," Options.info_log: %p", info_log.get()); - Log(log," Options.write_buffer_size: %zd", write_buffer_size); - Log(log," Options.max_write_buffer_number: %d", max_write_buffer_number); Log(log," Options.max_open_files: %d", max_open_files); + Log(log, " Options.disableDataSync: %d", disableDataSync); + Log(log, " Options.use_fsync: %d", use_fsync); + Log(log, " Options.max_log_file_size: %zu", max_log_file_size); + Log(log, "Options.max_manifest_file_size: %lu", + (unsigned 
long)max_manifest_file_size); + Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll); + Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num); + Log(log, " Options.db_stats_log_interval: %d", db_stats_log_interval); + Log(log, " Options.allow_os_buffer: %d", allow_os_buffer); + Log(log, " Options.allow_mmap_reads: %d", allow_mmap_reads); + Log(log, " Options.allow_mmap_writes: %d", allow_mmap_writes); + Log(log, " Options.db_log_dir: %s", + db_log_dir.c_str()); + Log(log, " Options.wal_dir: %s", + wal_dir.c_str()); + Log(log, " Options.table_cache_numshardbits: %d", + table_cache_numshardbits); + Log(log, " Options.table_cache_remove_scan_count_limit: %d", + table_cache_remove_scan_count_limit); + Log(log, " Options.delete_obsolete_files_period_micros: %lu", + (unsigned long)delete_obsolete_files_period_micros); + Log(log, " Options.max_background_compactions: %d", + max_background_compactions); + Log(log, " Options.max_background_flushes: %d", + max_background_flushes); + Log(log, " Options.WAL_ttl_seconds: %lu", + (unsigned long)WAL_ttl_seconds); + Log(log, " Options.WAL_size_limit_MB: %lu", + (unsigned long)WAL_size_limit_MB); + Log(log, " Options.manifest_preallocation_size: %zu", + manifest_preallocation_size); + Log(log, " Options.allow_os_buffer: %d", + allow_os_buffer); + Log(log, " Options.allow_mmap_reads: %d", + allow_mmap_reads); + Log(log, " Options.allow_mmap_writes: %d", + allow_mmap_writes); + Log(log, " Options.is_fd_close_on_exec: %d", + is_fd_close_on_exec); + Log(log, " Options.skip_log_error_on_recovery: %d", + skip_log_error_on_recovery); + Log(log, " Options.stats_dump_period_sec: %u", + stats_dump_period_sec); + Log(log, " Options.advise_random_on_open: %d", + advise_random_on_open); + Log(log, " Options.access_hint_on_compaction_start: %s", + access_hints[access_hint_on_compaction_start]); + Log(log, " Options.use_adaptive_mutex: %d", + use_adaptive_mutex); + Log(log, " Options.bytes_per_sync: %lu", + (unsigned 
long)bytes_per_sync); +} // DBOptions::Dump + +void ColumnFamilyOptions::Dump(Logger* log) const { + Log(log, " Options.comparator: %s", comparator->Name()); + Log(log, " Options.merge_operator: %s", + merge_operator ? merge_operator->Name() : "None"); + Log(log, " Options.compaction_filter_factory: %s", + compaction_filter_factory->Name()); + Log(log, " Options.compaction_filter_factory_v2: %s", + compaction_filter_factory_v2->Name()); + Log(log, " Options.memtable_factory: %s", memtable_factory->Name()); + Log(log, " Options.table_factory: %s", table_factory->Name()); + Log(log, " Options.write_buffer_size: %zd", write_buffer_size); + Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number); Log(log," Options.block_cache: %p", block_cache.get()); Log(log," Options.block_cache_compressed: %p", block_cache_compressed.get()); @@ -173,18 +331,6 @@ Options::Dump(Logger* log) const prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name()); Log(log," Options.whole_key_filtering: %d", whole_key_filtering); Log(log," Options.num_levels: %d", num_levels); - Log(log," Options.disableDataSync: %d", disableDataSync); - Log(log," Options.use_fsync: %d", use_fsync); - Log(log," Options.max_log_file_size: %zu", max_log_file_size); - Log(log,"Options.max_manifest_file_size: %lu", - (unsigned long)max_manifest_file_size); - Log(log," Options.log_file_time_to_roll: %zu", log_file_time_to_roll); - Log(log," Options.keep_log_file_num: %zu", keep_log_file_num); - Log(log," Options.db_stats_log_interval: %d", - db_stats_log_interval); - Log(log," Options.allow_os_buffer: %d", allow_os_buffer); - Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads); - Log(log," Options.allow_mmap_writes: %d", allow_mmap_writes); Log(log," Options.min_write_buffer_number_to_merge: %d", min_write_buffer_number_to_merge); Log(log," Options.purge_redundant_kvs_while_flush: %d", @@ -223,26 +369,12 @@ Options::Dump(Logger* log) const source_compaction_factor); Log(log," 
Options.max_grandparent_overlap_factor: %d", max_grandparent_overlap_factor); - Log(log," Options.db_log_dir: %s", - db_log_dir.c_str()); - Log(log," Options.wal_dir: %s", - wal_dir.c_str()); Log(log," Options.disable_seek_compaction: %d", disable_seek_compaction); Log(log," Options.no_block_cache: %d", no_block_cache); - Log(log," Options.table_cache_numshardbits: %d", - table_cache_numshardbits); - Log(log," Options.table_cache_remove_scan_count_limit: %d", - table_cache_remove_scan_count_limit); Log(log," Options.arena_block_size: %zu", arena_block_size); - Log(log," Options.delete_obsolete_files_period_micros: %lu", - (unsigned long)delete_obsolete_files_period_micros); - Log(log," Options.max_background_compactions: %d", - max_background_compactions); - Log(log," Options.max_background_flushes: %d", - max_background_flushes); Log(log," Options.soft_rate_limit: %.2f", soft_rate_limit); Log(log," Options.hard_rate_limit: %.2f", @@ -251,36 +383,10 @@ Options::Dump(Logger* log) const rate_limit_delay_max_milliseconds); Log(log," Options.disable_auto_compactions: %d", disable_auto_compactions); - Log(log," Options.WAL_ttl_seconds: %lu", - (unsigned long)WAL_ttl_seconds); - Log(log," Options.WAL_size_limit_MB: %lu", - (unsigned long)WAL_size_limit_MB); - Log(log," Options.manifest_preallocation_size: %zu", - manifest_preallocation_size); Log(log," Options.purge_redundant_kvs_while_flush: %d", purge_redundant_kvs_while_flush); - Log(log," Options.allow_os_buffer: %d", - allow_os_buffer); - Log(log," Options.allow_mmap_reads: %d", - allow_mmap_reads); - Log(log," Options.allow_mmap_writes: %d", - allow_mmap_writes); - Log(log," Options.is_fd_close_on_exec: %d", - is_fd_close_on_exec); - Log(log," Options.skip_log_error_on_recovery: %d", - skip_log_error_on_recovery); - Log(log," Options.stats_dump_period_sec: %u", - stats_dump_period_sec); Log(log," Options.block_size_deviation: %d", block_size_deviation); - Log(log," Options.advise_random_on_open: %d", - 
advise_random_on_open); - Log(log," Options.access_hint_on_compaction_start: %s", - access_hints[access_hint_on_compaction_start]); - Log(log," Options.use_adaptive_mutex: %d", - use_adaptive_mutex); - Log(log," Options.bytes_per_sync: %lu", - (unsigned long)bytes_per_sync); Log(log," Options.filter_deletes: %d", filter_deletes); Log(log, " Options.verify_checksums_in_compaction: %d", @@ -317,8 +423,15 @@ Options::Dump(Logger* log) const memtable_prefix_bloom_bits); Log(log, " Options.memtable_prefix_bloom_probes: %d", memtable_prefix_bloom_probes); + Log(log, " Options.bloom_locality: %d", + bloom_locality); Log(log, " Options.max_successive_merges: %zd", max_successive_merges); +} // ColumnFamilyOptions::Dump + +void Options::Dump(Logger* log) const { + DBOptions::Dump(log); + ColumnFamilyOptions::Dump(log); } // Options::Dump // diff --git a/util/perf_context.cc b/util/perf_context.cc index 650abebca..264b10d73 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -9,12 +9,21 @@ namespace rocksdb { -// by default, enable counts only +#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) PerfLevel perf_level = kEnableCount; +// This is a dummy variable since some place references it +PerfContext perf_context; +#else +__thread PerfLevel perf_level = kEnableCount; +__thread PerfContext perf_context; +#endif -void SetPerfLevel(PerfLevel level) { perf_level = level; } +void SetPerfLevel(PerfLevel level) { + perf_level = level; +} void PerfContext::Reset() { +#if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE) user_key_comparison_count = 0; block_cache_hit_count = 0; block_read_count = 0; @@ -38,11 +47,15 @@ void PerfContext::Reset() { find_next_user_entry_time = 0; write_pre_and_post_process_time = 0; write_memtable_time = 0; +#endif } #define OUTPUT(counter) #counter << " = " << counter << ", " std::string PerfContext::ToString() const { +#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) + return ""; +#else std::ostringstream ss; ss << 
OUTPUT(user_key_comparison_count) << OUTPUT(block_cache_hit_count) @@ -67,8 +80,7 @@ std::string PerfContext::ToString() const { << OUTPUT(write_pre_and_post_process_time) << OUTPUT(write_memtable_time); return ss.str(); +#endif } -__thread PerfContext perf_context; - } diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h index f7818e69c..dc4ae95e5 100644 --- a/util/perf_context_imp.h +++ b/util/perf_context_imp.h @@ -9,26 +9,80 @@ namespace rocksdb { -extern enum PerfLevel perf_level; +#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) -inline void StartPerfTimer(StopWatchNano* timer) { - if (perf_level >= PerfLevel::kEnableTime) { - timer->Start(); +#define PERF_TIMER_DECLARE() +#define PERF_TIMER_START(metric) +#define PERF_TIMER_AUTO(metric) +#define PERF_TIMER_MEASURE(metric) +#define PERF_TIMER_STOP(metric) +#define PERF_COUNTER_ADD(metric, value) + +#else + +extern __thread PerfLevel perf_level; + +class PerfStepTimer { + public: + PerfStepTimer() + : enabled_(perf_level >= PerfLevel::kEnableTime), + env_(enabled_ ? 
Env::Default() : nullptr), + start_(0) { } -} -inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) { - if (perf_level >= PerfLevel::kEnableCount) { - *count += delta; + void Start() { + if (enabled_) { + start_ = env_->NowNanos(); + } } -} -inline void BumpPerfTime(uint64_t* time, - StopWatchNano* timer, - bool reset = true) { - if (perf_level >= PerfLevel::kEnableTime) { - *time += timer->ElapsedNanos(reset); + void Measure(uint64_t* metric) { + if (start_) { + uint64_t now = env_->NowNanos(); + *metric += now - start_; + start_ = now; + } } -} + + void Stop(uint64_t* metric) { + if (start_) { + *metric += env_->NowNanos() - start_; + start_ = 0; + } + } + + private: + const bool enabled_; + Env* const env_; + uint64_t start_; +}; + +// Declare the local timer object to be used later on +#define PERF_TIMER_DECLARE() \ + PerfStepTimer perf_step_timer; + +// Set start time of the timer +#define PERF_TIMER_START(metric) \ + perf_step_timer.Start(); + +// Declare and set start time of the timer +#define PERF_TIMER_AUTO(metric) \ + PerfStepTimer perf_step_timer; \ + perf_step_timer.Start(); + +// Update metric with time elapsed since last START. start time is reset +// to current timestamp. +#define PERF_TIMER_MEASURE(metric) \ + perf_step_timer.Measure(&(perf_context.metric)); + +// Update metric with time elapsed since last START. But start time is not set. 
+#define PERF_TIMER_STOP(metric) \ + perf_step_timer.Stop(&(perf_context.metric)); + +// Increase metric value +#define PERF_COUNTER_ADD(metric, value) \ + perf_context.metric += value; + +#endif } diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index e78e760e9..93f7134c7 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -13,13 +13,13 @@ class SkipListRep : public MemTableRep { SkipList skip_list_; public: explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena) - : skip_list_(compare, arena) { + : MemTableRep(arena), skip_list_(compare, arena) { } // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. - virtual void Insert(const char* key) override { - skip_list_.Insert(key); + virtual void Insert(KeyHandle handle) override { + skip_list_.Insert(static_cast(handle)); } // Returns true iff an entry that compares equal to key is in the list. diff --git a/util/sync_point.cc b/util/sync_point.cc new file mode 100644 index 000000000..5d0ac2dd6 --- /dev/null +++ b/util/sync_point.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "util/sync_point.h" + +namespace rocksdb { + +SyncPoint* SyncPoint::GetInstance() { + static SyncPoint sync_point; + return &sync_point; +} + +void SyncPoint::LoadDependency(const std::vector& dependencies) { + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + for (const auto& dependency : dependencies) { + successors_[dependency.predecessor].push_back(dependency.successor); + predecessors_[dependency.successor].push_back(dependency.predecessor); + } +} + +bool SyncPoint::PredecessorsAllCleared(const std::string& point) { + for (const auto& pred : predecessors_[point]) { + if (cleared_points_.count(pred) == 0) { + return false; + } + } + return true; +} + +void SyncPoint::EnableProcessing() { + std::unique_lock lock(mutex_); + enabled_ = true; +} + +void SyncPoint::DisableProcessing() { + std::unique_lock lock(mutex_); + enabled_ = false; +} + +void SyncPoint::ClearTrace() { + std::unique_lock lock(mutex_); + cleared_points_.clear(); +} + +void SyncPoint::Process(const std::string& point) { + std::unique_lock lock(mutex_); + + if (!enabled_) return; + + while (!PredecessorsAllCleared(point)) { + cv_.wait(lock); + } + + cleared_points_.insert(point); + cv_.notify_all(); +} + +} // namespace rocksdb diff --git a/util/sync_point.h b/util/sync_point.h new file mode 100644 index 000000000..3cc892370 --- /dev/null +++ b/util/sync_point.h @@ -0,0 +1,79 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +// This class provides facility to reproduce race conditions deterministically +// in unit tests. +// Developer could specify sync points in the codebase via TEST_SYNC_POINT. 
+// Each sync point represents a position in the execution stream of a thread. +// In the unit test, 'Happens After' relationship among sync points could be +// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of +// threads execution. +// Refer to (DBTest,TransactionLogIteratorRace), for an exmaple use case. + +class SyncPoint { + public: + static SyncPoint* GetInstance(); + + struct Dependency { + std::string predecessor; + std::string successor; + }; + // call once at the beginning of a test to setup the dependency between + // sync points + void LoadDependency(const std::vector& dependencies); + + // enable sync point processing (disabled on startup) + void EnableProcessing(); + + // disable sync point processing + void DisableProcessing(); + + // remove the execution trace of all sync points + void ClearTrace(); + + // triggered by TEST_SYNC_POINT, blocking execution until all predecessors + // are executed. + void Process(const std::string& point); + + // TODO: it might be useful to provide a function that blocks until all + // sync points are cleared. + + private: + bool PredecessorsAllCleared(const std::string& point); + + // successor/predecessor map loaded from LoadDependency + std::unordered_map> successors_; + std::unordered_map> predecessors_; + + std::mutex mutex_; + std::condition_variable cv_; + // sync points that have been passed through + std::unordered_set cleared_points_; + bool enabled_ = false; +}; + +} // namespace rocksdb + +// Use TEST_SYNC_POINT to specify sync points inside code base. +// Sync points can have happens-after depedency on other sync points, +// configured at runtime via SyncPoint::LoadDependency. This could be +// utilized to re-produce race conditions between threads. +// See TransactionLogIteratorRace in db_test.cc for an example use case. +// TEST_SYNC_POINT is no op in release build. 
+#ifdef NDEBUG +#define TEST_SYNC_POINT(x) +#else +#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x) +#endif diff --git a/util/thread_local.h b/util/thread_local.h index d1434e3e5..a7728ed64 100644 --- a/util/thread_local.h +++ b/util/thread_local.h @@ -16,6 +16,7 @@ #include "util/autovector.h" #include "port/port_posix.h" +#include "util/thread_local.h" namespace rocksdb { diff --git a/util/vectorrep.cc b/util/vectorrep.cc index 3777f7ffe..14e7c9f91 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -30,7 +30,7 @@ class VectorRep : public MemTableRep { // single buffer and pass that in as the parameter to Insert) // REQUIRES: nothing that compares equal to key is currently in the // collection. - virtual void Insert(const char* key) override; + virtual void Insert(KeyHandle handle) override; // Returns true iff an entry that compares equal to key is in the collection. virtual bool Contains(const char* key) const override; @@ -106,7 +106,8 @@ class VectorRep : public MemTableRep { const KeyComparator& compare_; }; -void VectorRep::Insert(const char* key) { +void VectorRep::Insert(KeyHandle handle) { + auto* key = static_cast(handle); assert(!Contains(key)); WriteLock l(&rwlock_); assert(!immutable_); @@ -134,7 +135,8 @@ size_t VectorRep::ApproximateMemoryUsage() { } VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) - : bucket_(new Bucket()), + : MemTableRep(arena), + bucket_(new Bucket()), immutable_(false), sorted_(false), compare_(compare) { bucket_.get()->reserve(count); } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 5a0b6928b..f6ffd9487 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -44,7 +44,9 @@ class DummyDB : public StackableDB { return options_.env; } - virtual const Options& GetOptions() const override { + using DB::GetOptions; + virtual const Options& 
GetOptions(ColumnFamilyHandle* column_family) const + override { return options_; } @@ -68,6 +70,10 @@ class DummyDB : public StackableDB { return Status::OK(); } + virtual ColumnFamilyHandle* DefaultColumnFamily() const override { + return nullptr; + } + class DummyLogFile : public LogFile { public: /* implicit */ @@ -345,7 +351,7 @@ class BackupableDBTest { options_.wal_dir = dbname_; // set up backup db options CreateLoggerFromOptions(dbname_, backupdir_, env_, - Options(), &logger_); + DBOptions(), &logger_); backupable_options_.reset(new BackupableDBOptions( backupdir_, test_backup_env_.get(), true, logger_.get(), true)); @@ -425,6 +431,19 @@ class BackupableDBTest { } } + void DeleteLogFiles() { + std::vector delete_logs; + env_->GetChildren(dbname_, &delete_logs); + for (auto f : delete_logs) { + uint64_t number; + FileType type; + bool ok = ParseFileName(f, &number, &type); + if (ok && type == kLogFile) { + env_->DeleteFile(dbname_ + "/" + f); + } + } + } + // files std::string dbname_; std::string backupdir_; @@ -721,10 +740,11 @@ TEST(BackupableDBTest, FailOverwritingBackups) { // create backups 1, 2, 3, 4, 5 OpenBackupableDB(true); for (int i = 0; i < 5; ++i) { + CloseBackupableDB(); + DeleteLogFiles(); + OpenBackupableDB(false); FillDB(db_.get(), 100 * i, 100 * (i + 1)); ASSERT_OK(db_->CreateNewBackup(true)); - CloseBackupableDB(); - OpenBackupableDB(false); } CloseBackupableDB(); @@ -826,7 +846,7 @@ TEST(BackupableDBTest, RateLimiting) { auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / backupable_options_->backup_rate_limit; ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); - ASSERT_LT(backup_time, 1.5 * rate_limited_backup_time); + ASSERT_LT(backup_time, 2.5 * rate_limited_backup_time); CloseBackupableDB(); @@ -838,7 +858,7 @@ TEST(BackupableDBTest, RateLimiting) { auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / backupable_options_->restore_rate_limit; ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); 
- ASSERT_LT(restore_time, 1.5 * rate_limited_restore_time); + ASSERT_LT(restore_time, 2.5 * rate_limited_restore_time); AssertBackupConsistency(0, 0, 100000, 100010); } diff --git a/utilities/geodb/geodb_test.cc b/utilities/geodb/geodb_test.cc index d7af6c32b..1a42e3247 100644 --- a/utilities/geodb/geodb_test.cc +++ b/utilities/geodb/geodb_test.cc @@ -35,7 +35,7 @@ class GeoDBTest { } }; -const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault/"; +const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault"; Options GeoDBTest::options = Options(); // Insert, Get and Remove @@ -106,14 +106,14 @@ TEST(GeoDBTest, Search) { std::vector values; status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values); ASSERT_TRUE(status.ok()); - ASSERT_EQ(values.size(), 1); + ASSERT_EQ(values.size(), 1U); // search all objects centered at 46 degree latitude with // a radius of 2 kilometers. There should be none. values.clear(); status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values); ASSERT_TRUE(status.ok()); - ASSERT_EQ(values.size(), 0); + ASSERT_EQ(values.size(), 0U); } } // namespace rocksdb diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc index 5b704930b..21626bec2 100644 --- a/utilities/ttl/db_ttl.cc +++ b/utilities/ttl/db_ttl.cc @@ -119,15 +119,16 @@ Status DBWithTTL::StripTS(std::string* str) { return st; } -Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key, +Status DBWithTTL::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { WriteBatch batch; batch.Put(key, val); - return Write(opt, &batch); + return Write(options, &batch); } Status DBWithTTL::Get(const ReadOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { Status st = db_->Get(options, key, value); if (!st.ok()) { @@ -140,18 +141,18 @@ Status DBWithTTL::Get(const ReadOptions& options, return StripTS(value); } -std::vector DBWithTTL::MultiGet(const 
ReadOptions& options, - const std::vector& keys, - std::vector* values) { +std::vector DBWithTTL::MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { return std::vector(keys.size(), Status::NotSupported("MultiGet not\ supported with TTL")); } bool DBWithTTL::KeyMayExist(const ReadOptions& options, - const Slice& key, - std::string* value, - bool* value_found) { + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found) { bool ret = db_->KeyMayExist(options, key, value, value_found); if (ret && value != nullptr && value_found != nullptr && *value_found) { if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) { @@ -161,12 +162,12 @@ bool DBWithTTL::KeyMayExist(const ReadOptions& options, return ret; } -Status DBWithTTL::Merge(const WriteOptions& opt, - const Slice& key, +Status DBWithTTL::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { WriteBatch batch; batch.Merge(key, value); - return Write(opt, &batch); + return Write(options, &batch); } Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) { @@ -208,12 +209,9 @@ Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) { } } -Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) { - return new TtlIterator(db_->NewIterator(opts)); -} - -void DBWithTTL::TEST_Destroy_DBWithTtl() { - ((DBImpl*) db_)->TEST_Destroy_DBImpl(); +Iterator* DBWithTTL::NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) { + return new TtlIterator(db_->NewIterator(opts, column_family)); } } // namespace rocksdb diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h index 519ae32c7..90194d21f 100644 --- a/utilities/ttl/db_ttl.h +++ b/utilities/ttl/db_ttl.h @@ -23,30 +23,39 @@ class DBWithTTL : public StackableDB { virtual ~DBWithTTL(); - virtual Status Put(const WriteOptions& o, const Slice& key, + 
using StackableDB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) override; - virtual Status Get(const ReadOptions& options, const Slice& key, + using StackableDB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, std::string* value) override; + using StackableDB::MultiGet; virtual std::vector MultiGet( - const ReadOptions& options, const std::vector& keys, + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) override; + using StackableDB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, - const Slice& key, + ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found = nullptr) override; - virtual Status Merge(const WriteOptions& options, const Slice& key, + using StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) override; virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; - virtual Iterator* NewIterator(const ReadOptions& opts) override; - - // Simulate a db crash, no elegant closing of database. - void TEST_Destroy_DBWithTtl(); + using StackableDB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) override; virtual DB* GetBaseDB() { return db_;