Merge branch 'master' of github.com:facebook/rocksdb into gh-pages

commit 3a0a8a02be

.arcconfig  (Normal file, 10 lines)
@@ -0,0 +1,10 @@
{
  "project_id" : "leveldb",
  "conduit_uri" : "https://reviews.facebook.net/",
  "copyright_holder" : "",
  "load" : [
    "linters/src/"
  ],
  "lint.engine" : "FacebookFbcodeLintEngine",
  "lint.engine.single.linter" : "FbcodeCppLinter"
}

.gitignore  (Normal file, vendored, 23 lines)
@@ -0,0 +1,23 @@
build_config.mk

*.a
*.arc
*.d
*.dylib*
*.gcda
*.gcno
*.o
*.so
*.so.*
*_test
*_bench
*_stress

ldb
manifest_dump
sst_dump
util/build_version.cc
build_tools/VALGRIND_LOGS/
coverage/COVERAGE_REPORT
util/build_version.cc.tmp
.gdbhistory

LICENSE  (Normal file, 35 lines)
@@ -0,0 +1,35 @@
BSD License

For rocksdb software

Copyright (c) 2013, Facebook, Inc.
All rights reserved.
---------------------------------------------------------------------

Copyright (c) 2011 The LevelDB Authors. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

 * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.
 * Neither the name of Google Inc. nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Makefile  (Normal file, 380 lines)
@@ -0,0 +1,380 @@
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.

# Inherit some settings from environment variables, if available
INSTALL_PATH ?= $(CURDIR)

#-----------------------------------------------
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
# to switch between compilation modes.

# OPT ?= -DNDEBUG     # (A) Production use (optimized mode)
OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
#-----------------------------------------------

# detect what platform we're building on
$(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platform $(CURDIR)/build_config.mk))
# this file is generated by the previous line to set build flags and sources
include build_config.mk

WARNING_FLAGS = -Wall -Werror
CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -std=gnu++0x -Woverloaded-virtual

LDFLAGS += $(PLATFORM_LDFLAGS)

LIBOBJECTS = $(SOURCES:.cc=.o)
LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)

TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
VALGRIND_ERROR = 2
VALGRIND_DIR = build_tools/VALGRIND_LOGS
VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full

TESTS = \
	table_stats_collector_test \
	arena_test \
	auto_roll_logger_test \
	block_test \
	bloom_test \
	c_test \
	cache_test \
	coding_test \
	corruption_test \
	crc32c_test \
	db_test \
	dbformat_test \
	env_test \
	blob_store_test \
	filelock_test \
	filename_test \
	filter_block_test \
	histogram_test \
	log_test \
	manual_compaction_test \
	memenv_test \
	merge_test \
	redis_test \
	reduce_levels_test \
	simple_table_db_test \
	skiplist_test \
	stringappend_test \
	table_test \
	ttl_test \
	version_edit_test \
	version_set_test \
	write_batch_test \
	deletefile_test

TOOLS = \
	sst_dump \
	db_stress \
	ldb \
	db_repl_stress \
	blob_store_bench

PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS)
BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench

# The library name is configurable since we are maintaining libraries of both
# debug/release mode.
LIBNAME = librocksdb
LIBRARY = ${LIBNAME}.a
MEMENVLIBRARY = libmemenv.a

default: all

#-----------------------------------------------
# Create platform independent shared libraries.
#-----------------------------------------------
ifneq ($(PLATFORM_SHARED_EXT),)

ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1)
SHARED3 = $(SHARED1)
SHARED = $(SHARED1)
else
# Update db.h if you change these.
SHARED_MAJOR = 2
SHARED_MINOR = 0
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
$(SHARED1): $(SHARED3)
	ln -fs $(SHARED3) $(SHARED1)
$(SHARED2): $(SHARED3)
	ln -fs $(SHARED3) $(SHARED2)
endif

$(SHARED3):
	$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3)

endif  # PLATFORM_SHARED_EXT

all: $(LIBRARY) $(PROGRAMS)

.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
	release tags valgrind_check whitebox_crash_test

release:
	$(MAKE) clean
	OPT=-DNDEBUG $(MAKE) -j32

coverage:
	$(MAKE) clean
	COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check
	(cd coverage; ./coverage_test.sh)
	# Delete intermediate files
	find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;

check: all $(PROGRAMS) $(TESTS) $(TOOLS) ldb_tests
	for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done

ldb_tests: all $(PROGRAMS) $(TOOLS)
	python tools/ldb_test.py

crash_test: blackbox_crash_test whitebox_crash_test

blackbox_crash_test: db_stress
	python -u tools/db_crashtest.py

whitebox_crash_test: db_stress
	python -u tools/db_crashtest2.py

valgrind_check: all $(PROGRAMS) $(TESTS)
	mkdir -p $(VALGRIND_DIR)
	echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \
	echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
	for t in $(filter-out skiplist_test,$(TESTS)); do \
		stime=`date '+%s'`; \
		$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
		if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
			echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
		fi; \
		etime=`date '+%s'`; \
		echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
	done

clean:
	-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
	-rm -rf ios-x86/* ios-arm/*
	-find . -name "*.[od]" -exec rm {} \;
	-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
tags:
	ctags * -R
	cscope -b `find . -name '*.cc'` `find . -name '*.h'`

# ---------------------------------------------------------------------------
# Unit tests and tools
# ---------------------------------------------------------------------------
$(LIBRARY): $(LIBOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(LIBOBJECTS)

db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
	$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
	$(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
	$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
	$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL)
	$(CXX) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lsqlite3 $(COVERAGEFLAGS)

db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
	$(CXX) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lkyotocabinet $(COVERAGEFLAGS)

signal_test: util/signal_test.o $(LIBOBJECTS)
	$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

table_stats_collector_test: db/table_stats_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/table_stats_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
	$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)

stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)

corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)

prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)

ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)

$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
	rm -f $@
	$(AR) -rs $@ $(MEMENVOBJECTS)

memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
	$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
	$(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
	$(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

sst_dump: tools/sst_dump.o $(LIBOBJECTS)
	$(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

ldb: tools/ldb.o $(LIBOBJECTS)
	$(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

# ---------------------------------------------------------------------------
# Platform-specific compilation
# ---------------------------------------------------------------------------

ifeq ($(PLATFORM), IOS)
# For iOS, create universal object files to be used on both the simulator and
# a device.
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString)

.cc.o:
	mkdir -p ios-x86/$(dir $@)
	$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS)
	mkdir -p ios-arm/$(dir $@)
	$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS)
	lipo ios-x86/$@ ios-arm/$@ -create -output $@

.c.o:
	mkdir -p ios-x86/$(dir $@)
	$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
	mkdir -p ios-arm/$(dir $@)
	$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
	lipo ios-x86/$@ ios-arm/$@ -create -output $@

else
.cc.o:
	$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)

.c.o:
	$(CC) $(CFLAGS) -c $< -o $@
endif

# ---------------------------------------------------------------------------
# Source files dependencies detection
# ---------------------------------------------------------------------------

# Add proper dependency support so changing a .h file forces a .cc file to
# rebuild.

# The .d file indicates .cc file's dependencies on .h files. We generate such
# dependency by g++'s -MM option, whose output is a make dependency rule.
# The sed command makes sure the "target" file in the generated .d file has
# the correct path prefix.
%.d: %.cc
	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
	@sed -i -e 's|.*:|$*.o:|' $@

DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))

depend: $(DEPFILES)

ifneq ($(MAKECMDGOALS),clean)
-include $(DEPFILES)
endif

PATENTS  (Normal file, 23 lines)
@@ -0,0 +1,23 @@
Additional Grant of Patent Rights

“Software” means the rocksdb software distributed by Facebook, Inc.

Facebook hereby grants you a perpetual, worldwide, royalty-free,
non-exclusive, irrevocable (subject to the termination provision below)
license under any rights in any patent claims owned by Facebook, to make,
have made, use, sell, offer to sell, import, and otherwise transfer the
Software. For avoidance of doubt, no license is granted under Facebook’s
rights in any patent claims that are infringed by (i) modifications to the
Software made by you or a third party, or (ii) the Software in combination
with any software or other technology provided by you or a third party.

The license granted hereunder will terminate, automatically and without
notice, for anyone that makes any claim (including by filing any lawsuit,
assertion or other action) alleging (a) direct, indirect, or contributory
infringement or inducement to infringe any patent: (i) by Facebook or any
of its subsidiaries or affiliates, whether or not such claim is related
to the Software, (ii) by any party if such claim arises in whole or in
part from any software, product or service of Facebook or any of its
subsidiaries or affiliates, whether or not such claim is related to the
Software, or (iii) by any party relating to the Software; or (b) that
any right in any patent claim of Facebook is invalid or unenforceable.

README  (Normal file, 82 lines)
@@ -0,0 +1,82 @@
rocksdb: A persistent key-value store for flash storage
Authors: * The Facebook Database Engineering Team
         * Built on earlier work on leveldb by Sanjay Ghemawat
           (sanjay@google.com) and Jeff Dean (jeff@google.com)

This code is a library that forms the core building block for a fast
key-value server, especially suited for storing data on flash drives.
It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
making it especially suitable for storing multiple terabytes of data in a
single database.

The core of this code has been derived from open-source leveldb.

The code under this directory implements a system for maintaining a
persistent key/value store.

See doc/index.html for more explanation.
See doc/impl.html for a brief overview of the implementation.

The public interface is in include/*. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.

Guide to header files:

include/rocksdb/db.h
    Main interface to the DB: start here.

include/rocksdb/options.h
    Control over the behavior of an entire database, and also
    control over the behavior of individual reads and writes.

include/rocksdb/comparator.h
    Abstraction for a user-specified comparison function. If you want
    just bytewise comparison of keys, you can use the default comparator,
    but clients can write their own comparator implementations if they
    want custom ordering (e.g. to handle different character
    encodings, etc.).

include/rocksdb/iterator.h
    Interface for iterating over data. You can get an iterator
    from a DB object.

include/rocksdb/write_batch.h
    Interface for atomically applying multiple updates to a database.

include/rocksdb/slice.h
    A simple module for maintaining a pointer and a length into some
    other byte array.

include/rocksdb/status.h
    Status is returned from many of the public interfaces and is used
    to report success and various kinds of errors.

include/rocksdb/env.h
    Abstraction of the OS environment. A posix implementation of
    this interface is in util/env_posix.cc.

include/rocksdb/table_builder.h
    Lower-level modules that most clients probably won't use directly.

include/rocksdb/cache.h
    An API for the block cache.

include/rocksdb/compaction_filter.h
    An API for an application filter invoked on every compaction.

include/rocksdb/filter_policy.h
    An API for configuring a bloom filter.

include/rocksdb/memtablerep.h
    An API for implementing a memtable.

include/rocksdb/statistics.h
    An API to retrieve various database statistics.

include/rocksdb/transaction_log.h
    An API to retrieve transaction logs from a database.
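
To make the header guide above concrete, here is a minimal usage sketch of the public API it describes (db.h, options.h, write_batch.h, slice.h, status.h). This is an illustrative sketch, not part of the commit; it assumes the standard DB::Open / Put / Get / Write signatures declared in include/rocksdb/db.h, and the database path is arbitrary.

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/write_batch.h"

int main() {
  rocksdb::DB* db;
  rocksdb::Options options;
  options.create_if_missing = true;  // from include/rocksdb/options.h

  // Open (or create) a database directory; the path is an example only.
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_example", &db);
  assert(s.ok());

  // Single write and read (db.h, slice.h, status.h).
  s = db->Put(rocksdb::WriteOptions(), "key1", "value1");
  assert(s.ok());
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key1", &value);
  assert(s.ok() && value == "value1");

  // Atomically apply several updates (write_batch.h).
  rocksdb::WriteBatch batch;
  batch.Delete("key1");
  batch.Put("key2", value);
  s = db->Write(rocksdb::WriteOptions(), &batch);
  assert(s.ok());

  delete db;
  return 0;
}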

README.fb  (Normal file, 3 lines)
@@ -0,0 +1,3 @@
* Detailed instructions on how to compile using fbcode and jemalloc

* Latest release is 2.5.fb

build_tools/build_detect_platform  (Executable file, 252 lines)
@@ -0,0 +1,252 @@
#!/bin/sh
#
# Detects OS we're compiling on and outputs a file specified by the first
# argument, which in turn gets read while processing Makefile.
#
# The output will set the following variables:
#   CC                          C Compiler path
#   CXX                         C++ Compiler path
#   PLATFORM_LDFLAGS            Linker flags
#   PLATFORM_SHARED_EXT         Extension for shared libraries
#   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
#   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
#   PLATFORM_CCFLAGS            C compiler flags
#   PLATFORM_CXXFLAGS           C++ compiler flags. Will contain:
#   PLATFORM_SHARED_VERSIONED   Set to 'true' if platform supports versioned
#                               shared libraries, empty otherwise.
#
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
#
#   -DLEVELDB_PLATFORM_POSIX    if cstdatomic is present
#   -DLEVELDB_PLATFORM_NOATOMIC if it is not
#   -DSNAPPY                    if the Snappy library is present
#

OUTPUT=$1
if test -z "$OUTPUT"; then
  echo "usage: $0 <output-filename>" >&2
  exit 1
fi

# Default to fbcode gcc on internal fb machines
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
  if [ -z "$USE_CLANG" ]; then
    source $PWD/build_tools/fbcode.gcc471.sh
  else
    source $PWD/build_tools/fbcode.clang31.sh
  fi
fi

# Delete existing output, if it exists
rm -f $OUTPUT
touch $OUTPUT

if test -z "$CC"; then
  CC=cc
fi

if test -z "$CXX"; then
  CXX=g++
fi

# Detect OS
if test -z "$TARGET_OS"; then
  TARGET_OS=`uname -s`
fi

COMMON_FLAGS="${CFLAGS}"
CROSS_COMPILE=
PLATFORM_CCFLAGS=
PLATFORM_CXXFLAGS="${CXXFLAGS}"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true

# generic port files (working on all platforms by #ifdef) go directly in /port
GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "`

# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
case "$TARGET_OS" in
    Darwin)
        PLATFORM=OS_MACOSX
        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
        PLATFORM_SHARED_EXT=dylib
        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
        # PORT_FILES=port/darwin/darwin_specific.cc
        ;;
    Linux)
        PLATFORM=OS_LINUX
        COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
        if [ -z "$USE_CLANG" ]; then
            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
        fi
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
        # PORT_FILES=port/linux/linux_specific.cc
        ;;
    SunOS)
        PLATFORM=OS_SOLARIS
        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
        # PORT_FILES=port/sunos/sunos_specific.cc
        ;;
    FreeBSD)
        PLATFORM=OS_FREEBSD
        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
        # PORT_FILES=port/freebsd/freebsd_specific.cc
        ;;
    NetBSD)
        PLATFORM=OS_NETBSD
        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
        # PORT_FILES=port/netbsd/netbsd_specific.cc
        ;;
    OpenBSD)
        PLATFORM=OS_OPENBSD
        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
        # PORT_FILES=port/openbsd/openbsd_specific.cc
        ;;
    DragonFly)
        PLATFORM=OS_DRAGONFLYBSD
        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
        # PORT_FILES=port/dragonfly/dragonfly_specific.cc
        ;;
    OS_ANDROID_CROSSCOMPILE)
        PLATFORM=OS_ANDROID
        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS "  # All pthread features are in the Android C library
        # PORT_FILES=port/android/android.cc
        CROSS_COMPILE=true
        ;;
    *)
        echo "Unknown platform!" >&2
        exit 1
esac

$PWD/build_tools/build_detect_version

# We want to make a list of all cc files within util, db, table, and helpers
# except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the
# prune take effect.
DIRS="util db table utilities"

set -f # temporarily disable globbing so that our patterns aren't expanded
PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *_bench.cc -prune"
PORTABLE_FILES=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
PORTABLE_CPP=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
set +f # re-enable globbing

# The sources consist of the portable files, plus the platform-specific port
# file.
echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> $OUTPUT
echo "SOURCESCPP=$PORTABLE_CPP" >> $OUTPUT
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT

if [ "$CROSS_COMPILE" = "true" ]; then
    # Cross-compiling; do not try any compilation tests.
    true
else
    # If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
      #include <cstdatomic>
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
        PLATFORM_CXXFLAGS="-std=c++0x"
    else
        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
    fi

    # Test whether Snappy library is installed
    # http://code.google.com/p/snappy/
    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
      #include <snappy.h>
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS ${SNAPPY_LDFLAGS:-./snappy/libs/libsnappy.a}"
    fi


    # Test whether gflags library is installed
    # http://code.google.com/p/gflags/
    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
      #include <gflags/gflags.h>
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS"
    fi

    # Test whether zlib library is installed
    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
      #include <zlib.h>
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
    fi

    # Test whether bzip library is installed
    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
      #include <bzlib.h>
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
    fi

    # Test whether tcmalloc is available
    $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
      int main() {}
EOF
    if [ "$?" = 0 ]; then
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
    fi
fi

# shall we use HDFS?

if test "$USE_HDFS"; then
  if test -z "$JAVA_HOME"; then
    echo "JAVA_HOME has to be set for HDFS usage."
    exit 1
  fi
  HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
  HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64"
  HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
  HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
  COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
  PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
fi

# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"

PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"

VALGRIND_VER="$VALGRIND_VER"

echo "CC=$CC" >> $OUTPUT
echo "CXX=$CXX" >> $OUTPUT
echo "PLATFORM=$PLATFORM" >> $OUTPUT
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
echo "VALGRIND_VER=$VALGRIND_VER" >> $OUTPUT
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> $OUTPUT

build_tools/build_detect_version  (Executable file, 42 lines)
@@ -0,0 +1,42 @@
#!/bin/sh
#
# Record the version of the source that we are compiling.
# We keep a record of the git revision in util/build_version.cc. This source
# file is then built as a regular source file as part of the compilation
# process. One can run "strings executable_filename | grep _build_" to find
# the version of the source that we used to build the executable file.
#

# create git version file
VFILE=$ROCKSDB_ROOT/util/build_version.cc.tmp
trap "rm $VFILE" EXIT

# check to see if git is in the path
which git > /dev/null

if [ "$?" = 0 ]; then
  env -i git rev-parse HEAD |
  awk '
  BEGIN {
    print "#include \"build_version.h\"\n"
  }
  { print "const char* rocksdb_build_git_sha = \"rocksdb_build_git_sha:" $0"\";" }
  ' > ${VFILE}
else
  echo "git not found" |
  awk '
  BEGIN {
    print "#include \"build_version.h\""
  }
  { print "const char* rocksdb_build_git_sha = \"rocksdb_build_git_sha:git not found\";" }
  ' > ${VFILE}
fi

echo "const char* rocksdb_build_git_datetime = \"rocksdb_build_git_datetime:$(date)\";" >> ${VFILE}
echo "const char* rocksdb_build_compile_date = __DATE__;" >> ${VFILE}
echo "const char* rocksdb_build_compile_time = __TIME__;" >> ${VFILE}

OUTFILE=$ROCKSDB_ROOT/util/build_version.cc
if [ ! -e $OUTFILE ] || ! cmp -s $VFILE $OUTFILE; then
  cp $VFILE $OUTFILE
fi
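
For reference, the util/build_version.cc that the echo/awk pipeline above generates has roughly the following shape (a sketch only; the angle-bracket placeholders are filled in from `git rev-parse HEAD` and `date` at build time):

#include "build_version.h"

const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:<output of git rev-parse HEAD>";
const char* rocksdb_build_git_datetime = "rocksdb_build_git_datetime:<output of date>";
const char* rocksdb_build_compile_date = __DATE__;
const char* rocksdb_build_compile_time = __TIME__;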

build_tools/fbcode.clang31.sh  (Normal file, 49 lines)
@@ -0,0 +1,49 @@
#!/bin/sh
#
# Set environment variables so that we can compile leveldb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc

TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1

# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"

# location of libevent
LIBEVENT_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/include"
LIBEVENT_LIBS=" -L $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/lib"

# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"

# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "

CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.1/6ca8a59/bin/clang $CLANG_INCLUDES"
CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.1/6ca8a59/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $LIBEVENT_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib

CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.1/c8f7279/lib/clang/3.1/include -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
CXXFLAGS="$CFLAGS -nostdinc++ -std=gnu++0x"

CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"

EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $LIBEVENT_LIBS $GFLAGS_LIBS"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"

PLATFORM_LDFLAGS="-L$TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/lib -L$TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"

EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
SNAPPY_LDFLAGS="$SNAPPY_LIBS"

export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED SNAPPY_LDFLAGS

build_tools/fbcode.gcc471.sh  (Normal file, 60 lines)
@@ -0,0 +1,60 @@
#!/bin/sh
#
# Set environment variables so that we can compile leveldb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc

TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3

# location of libhdfs libraries
if test "$USE_HDFS"; then
  JAVA_HOME="/usr/local/jdk-6u22-64"
  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
fi

# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"

# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"

# location of libevent
LIBEVENT_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/include"
LIBEVENT_LIBS=" -L $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/lib"

# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"

# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "

CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $LIBEVENT_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib

CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/bin/gold -m64 -mtune=generic -fPIC"
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"

EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $LIBEVENT_LIBS $GFLAGS_LIBS"

PLATFORM_LDFLAGS="-L$TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/lib -L$TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"

EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
SNAPPY_LDFLAGS="$SNAPPY_LIBS $ZLIB_LIBS"

VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"

export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED SNAPPY_LDFLAGS VALGRIND_VER

build_tools/regression_build_test.sh  (Executable file, 100 lines)
@@ -0,0 +1,100 @@
#!/bin/bash

set -e

NUM=10000000

if [ $# -eq 1 ];then
  DATA_DIR=$1
elif [ $# -eq 2 ];then
  DATA_DIR=$1
  STAT_FILE=$2
fi

# On the production build servers, set data and stat
# files/directories not in /tmp or else the tempdir cleaning
# scripts will make you very unhappy.
DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}

function cleanup {
  rm -rf $DATA_DIR
  rm -f $STAT_FILE.fillseq
  rm -f $STAT_FILE.readrandom
  rm -f $STAT_FILE.overwrite
}

trap cleanup EXIT

function send_to_ods {
  key="$1"
  value="$2"

  if [ -z "$value" ];then
    echo >&2 "ERROR: Key $key doesn't have a value."
    return
  fi
  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
    --connect-timeout 60
}

make clean
make db_bench -j$(nproc)

./db_bench \
    --benchmarks=fillseq \
    --db=$DATA_DIR \
    --use_existing_db=0 \
    --bloom_bits=10 \
    --num=$NUM \
    --writes=$NUM \
    --cache_size=6442450944 \
    --cache_numshardbits=6 \
    --open_files=55000 \
    --statistics=1 \
    --histogram=1 \
    --disable_data_sync=1 \
    --disable_wal=1 \
    --sync=0 > ${STAT_FILE}.fillseq

./db_bench \
    --benchmarks=overwrite \
    --db=$DATA_DIR \
    --use_existing_db=1 \
    --bloom_bits=10 \
    --num=$NUM \
    --writes=$((NUM / 2)) \
    --cache_size=6442450944 \
    --cache_numshardbits=6 \
    --open_files=55000 \
    --statistics=1 \
    --histogram=1 \
    --disable_data_sync=1 \
    --disable_wal=1 \
    --sync=0 \
    --threads=8 > ${STAT_FILE}.overwrite

./db_bench \
    --benchmarks=readrandom \
    --db=$DATA_DIR \
    --use_existing_db=1 \
    --bloom_bits=10 \
    --num=$NUM \
    --reads=$((NUM / 100)) \
    --cache_size=6442450944 \
    --cache_numshardbits=6 \
    --open_files=55000 \
    --statistics=1 \
    --histogram=1 \
    --disable_data_sync=1 \
    --disable_wal=1 \
    --sync=0 \
    --threads=128 > ${STAT_FILE}.readrandom

OVERWRITE_OPS=$(awk '/overwrite/ {print $5}' $STAT_FILE.overwrite)
FILLSEQ_OPS=$(awk '/fillseq/ {print $5}' $STAT_FILE.fillseq)
READRANDOM_OPS=$(awk '/readrandom/ {print $5}' $STAT_FILE.readrandom)

send_to_ods rocksdb.build.overwrite.qps $OVERWRITE_OPS
send_to_ods rocksdb.build.fillseq.qps $FILLSEQ_OPS
send_to_ods rocksdb.build.readrandom.qps $READRANDOM_OPS

build_tools/valgrind_test.sh  (Executable file, 15 lines)
@@ -0,0 +1,15 @@
#!/bin/bash
# A shell script for Jenkins to run valgrind on rocksdb tests
# Returns 0 on success when there are no failed tests

VALGRIND_DIR=build_tools/VALGRIND_LOGS
make clean
make -j$(nproc) valgrind_check
NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1))
if [ $NUM_FAILED_TESTS -lt 1 ]; then
  echo No tests have valgrind errors
  exit 0
else
  cat $VALGRIND_DIR/valgrind_failed_tests
  exit 1
fi

coverage/coverage_test.sh  (Executable file, 73 lines)
@@ -0,0 +1,73 @@
#!/bin/bash

# Exit on error.
set -e

if [ -n "$USE_CLANG" ]; then
  echo "Error: Coverage test is supported only for gcc."
  exit 1
fi

ROOT=".."
# Fetch right version of gcov
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
  source $ROOT/build_tools/fbcode.gcc471.sh
  GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
else
  GCOV=$(which gcov)
fi

COVERAGE_DIR="$PWD/COVERAGE_REPORT"
mkdir -p $COVERAGE_DIR

# Find all gcno files to generate the coverage report

GCNO_FILES=`find $ROOT -name "*.gcno"`
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
  # Parse the raw gcov report into a more human-readable form.
  python $ROOT/coverage/parse_gcov_output.py |
  # Write the output to both stdout and the report file.
  tee $COVERAGE_DIR/coverage_report_all.txt &&
echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"

# TODO: we also need to get the files of the latest commits.
# Get the most recently committed files.
LATEST_FILES=`
  git show --pretty="format:" --name-only HEAD |
  grep -v "^$" |
  paste -s -d,`
RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt

echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
  python $ROOT/coverage/parse_gcov_output.py --interested-files $LATEST_FILES |
  tee -a $RECENT_REPORT &&
echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"

# Generate the html report. If we cannot find lcov on this machine, we'll
# simply skip this step.
echo "Generating the html coverage report..."

LCOV=$(which lcov || true 2>/dev/null)
if [ -z $LCOV ]
then
  echo "Skip: Cannot find lcov to generate the html report."
  exit 0
fi

LCOV_VERSION=$(lcov -v | grep 1.1 || true)
if [ $LCOV_VERSION ]
then
  echo "Not supported lcov version. Expect lcov 1.1."
  exit 0
fi

(cd $ROOT; lcov --no-external \
     --capture \
     --directory $PWD \
     --gcov-tool $GCOV \
     --output-file $COVERAGE_DIR/coverage.info)

genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR

echo "HTML Coverage report is generated in $COVERAGE_DIR"

coverage/parse_gcov_output.py  (Normal file, 118 lines)
@@ -0,0 +1,118 @@
import optparse
import re
import sys

from optparse import OptionParser

# The gcov report follows a certain pattern: each file has two lines of
# report, from which we can extract the file name, the total number of lines
# and the coverage percentage.
def parse_gcov_report(gcov_input):
    per_file_coverage = {}
    total_coverage = None
    current_file = None

    for line in gcov_input:
        line = line.strip()

        # -- First line of the coverage report (with file name in it)?
        match_obj = re.match("^File '(.*)'$", line)
        if match_obj:
            # fetch the file name from the first line of the report.
            current_file = match_obj.group(1)
            continue

        # -- Second line of the file report (with coverage percentage)
        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)

        if match_obj:
            coverage = float(match_obj.group(1))
            lines = int(match_obj.group(2))

            if current_file is not None:
                per_file_coverage[current_file] = (coverage, lines)
                current_file = None
            else:
                # If current_file is not set, we have reached the last line of
                # the report, which contains the summarized coverage percentage.
                total_coverage = (coverage, lines)
            continue

        # If the line doesn't match either pattern above, we can simply
        # ignore it: it is either an empty line or a note that no executable
        # lines were found for the given file.
        current_file = None

    return per_file_coverage, total_coverage

def get_option_parser():
    usage = "Parse the gcov output and generate a more human-readable code " +\
            "coverage report."
    parser = OptionParser(usage)

    parser.add_option(
        "--interested-files", "-i",
        dest="filenames",
        help="Comma-separated file names. If specified, we will display " +
             "the coverage report only for the interested source files. " +
             "Otherwise we will display the coverage report for all " +
             "source files."
    )
    return parser

def display_file_coverage(per_file_coverage, total_coverage):
    # To print out auto-adjustable columns, we need to know the longest
    # file name length.
    max_file_name_length = max(
        len(fname) for fname in per_file_coverage.keys()
    )

    # -- Print header
    # size of separator is determined by 3 column sizes:
    # file name, coverage percentage and lines.
    header_template = \
        "%" + str(max_file_name_length) + "s\t%s\t%s"
    separator = "-" * (max_file_name_length + 10 + 20)
    print header_template % ("Filename", "Coverage", "Lines")
    print separator

    # -- Print body
    # template for printing coverage report for each file.
    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"

    for fname, coverage_info in per_file_coverage.items():
        coverage, lines = coverage_info
        print record_template % (fname, coverage, lines)

    # -- Print footer
    if total_coverage:
        print separator
        print record_template % ("Total", total_coverage[0], total_coverage[1])

def report_coverage():
    parser = get_option_parser()
    (options, args) = parser.parse_args()

    interested_files = set()
    if options.filenames is not None:
        interested_files = set(f.strip() for f in options.filenames.split(','))

    # To make things simple, right now we only read the gcov report from stdin.
    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)

    # Check if we need to display coverage info only for interested files.
    if len(interested_files):
        per_file_coverage = dict(
            (fname, per_file_coverage[fname]) for fname in interested_files
            if fname in per_file_coverage
        )
        # If we are only interested in a few files, it makes no sense to
        # report the total_coverage.
        total_coverage = None

    if not len(per_file_coverage):
        print >> sys.stderr, "Cannot find coverage info for the given files."
        return
    display_file_coverage(per_file_coverage, total_coverage)

if __name__ == "__main__":
    report_coverage()
226
db/builder.cc
Normal file
226
db/builder.cc
Normal file
@ -0,0 +1,226 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/builder.h"
|
||||
|
||||
#include "db/filename.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/merge_helper.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "table/block_based_table_builder.h"
|
||||
#include "util/stop_watch.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class TableFactory;
|
||||
|
||||
TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
|
||||
CompressionType compression_type) {
|
||||
return options.table_factory->GetTableBuilder(options, file,
|
||||
compression_type);
|
||||
}
|
||||
|
||||
Status BuildTable(const std::string& dbname,
|
||||
Env* env,
|
||||
const Options& options,
|
||||
const EnvOptions& soptions,
|
||||
TableCache* table_cache,
|
||||
Iterator* iter,
|
||||
FileMetaData* meta,
|
||||
const Comparator* user_comparator,
|
||||
const SequenceNumber newest_snapshot,
|
||||
const SequenceNumber earliest_seqno_in_memtable,
|
||||
const bool enable_compression) {
|
||||
Status s;
|
||||
meta->file_size = 0;
|
||||
meta->smallest_seqno = meta->largest_seqno = 0;
|
||||
iter->SeekToFirst();
|
||||
|
||||
// If the sequence number of the smallest entry in the memtable is
|
||||
// not newer than the most recent snapshot, then we do not trigger
|
||||
// removal of duplicate/deleted keys as part of this builder.
|
||||
bool purge = options.purge_redundant_kvs_while_flush;
|
||||
if (earliest_seqno_in_memtable <= newest_snapshot) {
|
||||
purge = false;
|
||||
}
|
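// Illustrative note (not part of the original source): suppose the memtable's
// earliest entry has seqno 90 and the newest snapshot was taken at seqno 100.
// A reader holding that snapshot may still need older versions of a key, so
// purge stays disabled. If the earliest seqno were 105 instead, every entry
// would be newer than any snapshot and redundant versions could be dropped.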
||||
|
||||
std::string fname = TableFileName(dbname, meta->number);
|
||||
if (iter->Valid()) {
|
||||
unique_ptr<WritableFile> file;
|
||||
s = env->NewWritableFile(fname, &file, soptions);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
TableBuilder* builder = GetTableBuilder(options, file.get(),
|
||||
options.compression);
|
||||
|
||||
// the first key is the smallest key
|
||||
Slice key = iter->key();
|
||||
meta->smallest.DecodeFrom(key);
|
||||
meta->smallest_seqno = GetInternalKeySeqno(key);
|
||||
meta->largest_seqno = meta->smallest_seqno;
|
||||
|
||||
MergeHelper merge(user_comparator, options.merge_operator.get(),
|
||||
options.info_log.get(),
|
||||
true /* internal key corruption is not ok */);
|
||||
|
||||
if (purge) {
|
||||
// Ugly workaround to avoid a compiler error in release builds
|
||||
bool ok __attribute__((unused)) = true;
|
||||
|
||||
// Will write to builder if current key != prev key
|
||||
ParsedInternalKey prev_ikey;
|
||||
std::string prev_key;
|
||||
bool is_first_key = true; // Also write if this is the very first key
|
||||
|
||||
while (iter->Valid()) {
|
||||
bool iterator_at_next = false;
|
||||
|
||||
// Get current key
|
||||
ParsedInternalKey this_ikey;
|
||||
Slice key = iter->key();
|
||||
Slice value = iter->value();
|
||||
|
||||
// In-memory key corruption is not ok;
|
||||
// TODO: find a clean way to handle in-memory key corruption
|
||||
ok = ParseInternalKey(key, &this_ikey);
|
||||
assert(ok);
|
||||
assert(this_ikey.sequence >= earliest_seqno_in_memtable);
|
||||
|
||||
// If the key is the same as the previous key (and it is not the
|
||||
// first key), then we skip it, since it is an older version.
|
||||
// Otherwise we output the key and mark it as the "new" previous key.
|
||||
if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key,
|
||||
this_ikey.user_key)) {
|
||||
// seqnos within the same key are in decreasing order
|
||||
assert(this_ikey.sequence < prev_ikey.sequence);
|
||||
} else {
|
||||
is_first_key = false;
|
||||
|
||||
if (this_ikey.type == kTypeMerge) {
|
||||
// Handle merge-type keys using the MergeHelper
|
||||
merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
|
||||
iterator_at_next = true;
|
||||
if (merge.IsSuccess()) {
|
||||
// Merge completed correctly.
|
||||
// Add the resulting merge key/value and continue to next
|
||||
builder->Add(merge.key(), merge.value());
|
||||
prev_key.assign(merge.key().data(), merge.key().size());
|
||||
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
|
||||
assert(ok);
|
||||
} else {
|
||||
// Merge did not find a Put/Delete.
|
||||
// Cannot compact these merges into a kValueType.
|
||||
// Write them out one-by-one. (Proceed from back() to front().)
|
||||
const std::deque<std::string>& keys = merge.keys();
|
||||
const std::deque<std::string>& values = merge.values();
|
||||
assert(keys.size() == values.size() && keys.size() >= 1);
|
||||
std::deque<std::string>::const_reverse_iterator key_iter;
|
||||
std::deque<std::string>::const_reverse_iterator value_iter;
|
||||
for (key_iter=keys.rbegin(), value_iter = values.rbegin();
|
||||
key_iter != keys.rend() && value_iter != values.rend();
|
||||
++key_iter, ++value_iter) {
|
||||
|
||||
builder->Add(Slice(*key_iter), Slice(*value_iter));
|
||||
}
|
||||
|
||||
// Sanity check. Both iterators should end at the same time
|
||||
assert(key_iter == keys.rend() && value_iter == values.rend());
|
||||
|
||||
prev_key.assign(keys.front());
|
||||
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
|
||||
assert(ok);
|
||||
}
|
||||
} else {
|
||||
// Handle Put/Delete-type keys by simply writing them
|
||||
builder->Add(key, value);
|
||||
prev_key.assign(key.data(), key.size());
|
||||
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
|
||||
assert(ok);
|
||||
}
|
||||
}
|
||||
|
||||
if (!iterator_at_next) iter->Next();
|
||||
}
|
||||
|
||||
// The last key is the largest key
|
||||
meta->largest.DecodeFrom(Slice(prev_key));
|
||||
SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
|
||||
meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
|
||||
meta->largest_seqno = std::max(meta->largest_seqno, seqno);
|
||||
|
||||
} else {
|
||||
for (; iter->Valid(); iter->Next()) {
|
||||
Slice key = iter->key();
|
||||
meta->largest.DecodeFrom(key);
|
||||
builder->Add(key, iter->value());
|
||||
SequenceNumber seqno = GetInternalKeySeqno(key);
|
||||
meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
|
||||
meta->largest_seqno = std::max(meta->largest_seqno, seqno);
|
||||
}
|
||||
}
|
||||
|
||||
// Finish and check for builder errors
|
||||
if (s.ok()) {
|
||||
s = builder->Finish();
|
||||
if (s.ok()) {
|
||||
meta->file_size = builder->FileSize();
|
||||
assert(meta->file_size > 0);
|
||||
}
|
||||
} else {
|
||||
builder->Abandon();
|
||||
}
|
||||
delete builder;
|
||||
|
||||
// Finish and check for file errors
|
||||
if (s.ok() && !options.disableDataSync) {
|
||||
if (options.use_fsync) {
|
||||
StopWatch sw(env, options.statistics, TABLE_SYNC_MICROS);
|
||||
s = file->Fsync();
|
||||
} else {
|
||||
StopWatch sw(env, options.statistics, TABLE_SYNC_MICROS);
|
||||
s = file->Sync();
|
||||
}
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = file->Close();
|
||||
}
|
||||
|
||||
if (s.ok()) {
|
||||
// Verify that the table is usable
|
||||
Iterator* it = table_cache->NewIterator(ReadOptions(),
|
||||
soptions,
|
||||
meta->number,
|
||||
meta->file_size);
|
||||
s = it->status();
|
||||
delete it;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for input iterator errors
|
||||
if (!iter->status().ok()) {
|
||||
s = iter->status();
|
||||
}
|
||||
|
||||
if (s.ok() && meta->file_size > 0) {
|
||||
// Keep it
|
||||
} else {
|
||||
env->DeleteFile(fname);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
48
db/builder.h
Normal file
@ -0,0 +1,48 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#pragma once
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "rocksdb/options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct Options;
|
||||
struct FileMetaData;
|
||||
|
||||
class Env;
|
||||
struct EnvOptions;
|
||||
class Iterator;
|
||||
class TableCache;
|
||||
class VersionEdit;
|
||||
class TableBuilder;
|
||||
class WritableFile;
|
||||
|
||||
|
||||
extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
|
||||
CompressionType compression_type);
|
||||
|
||||
// Build a Table file from the contents of *iter. The generated file
|
||||
// will be named according to meta->number. On success, the rest of
|
||||
// *meta will be filled with metadata about the generated table.
|
||||
// If no data is present in *iter, meta->file_size will be set to
|
||||
// zero, and no Table file will be produced.
|
||||
extern Status BuildTable(const std::string& dbname,
|
||||
Env* env,
|
||||
const Options& options,
|
||||
const EnvOptions& soptions,
|
||||
TableCache* table_cache,
|
||||
Iterator* iter,
|
||||
FileMetaData* meta,
|
||||
const Comparator* user_comparator,
|
||||
const SequenceNumber newest_snapshot,
|
||||
const SequenceNumber earliest_seqno_in_memtable,
|
||||
const bool enable_compression);
|
||||
|
||||
} // namespace rocksdb
|
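For orientation, here is a minimal, hypothetical sketch of how a flush path might drive BuildTable(). The surrounding setup (where the file number, snapshot sequence numbers, and memtable iterator come from) is assumed for illustration and is not taken from this commit; mem, next_file_number, newest_snapshot and earliest_seqno_in_memtable are placeholder names.

// Sketch only: the placeholder variables above are assumed to exist.
FileMetaData meta;
meta.number = next_file_number;        // e.g. allocated by the version set
Iterator* iter = mem->NewIterator();   // contents of the memtable being flushed
Status s = BuildTable(dbname, env, options, soptions, table_cache,
                      iter, &meta, options.comparator,
                      newest_snapshot, earliest_seqno_in_memtable,
                      true /* enable_compression */);
delete iter;
if (s.ok() && meta.file_size > 0) {
  // Register the new table (meta.smallest .. meta.largest) in a VersionEdit.
}

If the iterator was empty, meta.file_size stays zero and no file is left behind, matching the contract documented in the header above.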
691
db/c.cc
Normal file
@ -0,0 +1,691 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "rocksdb/c.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/filter_policy.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/write_batch.h"
|
||||
|
||||
using rocksdb::Cache;
|
||||
using rocksdb::Comparator;
|
||||
using rocksdb::CompressionType;
|
||||
using rocksdb::DB;
|
||||
using rocksdb::Env;
|
||||
using rocksdb::FileLock;
|
||||
using rocksdb::FilterPolicy;
|
||||
using rocksdb::Iterator;
|
||||
using rocksdb::Logger;
|
||||
using rocksdb::NewBloomFilterPolicy;
|
||||
using rocksdb::NewLRUCache;
|
||||
using rocksdb::Options;
|
||||
using rocksdb::RandomAccessFile;
|
||||
using rocksdb::Range;
|
||||
using rocksdb::ReadOptions;
|
||||
using rocksdb::SequentialFile;
|
||||
using rocksdb::Slice;
|
||||
using rocksdb::Snapshot;
|
||||
using rocksdb::Status;
|
||||
using rocksdb::WritableFile;
|
||||
using rocksdb::WriteBatch;
|
||||
using rocksdb::WriteOptions;
|
||||
|
||||
using std::shared_ptr;
|
||||
|
||||
extern "C" {
|
||||
|
||||
struct leveldb_t { DB* rep; };
|
||||
struct leveldb_iterator_t { Iterator* rep; };
|
||||
struct leveldb_writebatch_t { WriteBatch rep; };
|
||||
struct leveldb_snapshot_t { const Snapshot* rep; };
|
||||
struct leveldb_readoptions_t { ReadOptions rep; };
|
||||
struct leveldb_writeoptions_t { WriteOptions rep; };
|
||||
struct leveldb_options_t { Options rep; };
|
||||
struct leveldb_seqfile_t { SequentialFile* rep; };
|
||||
struct leveldb_randomfile_t { RandomAccessFile* rep; };
|
||||
struct leveldb_writablefile_t { WritableFile* rep; };
|
||||
struct leveldb_filelock_t { FileLock* rep; };
|
||||
struct leveldb_logger_t { shared_ptr<Logger> rep; };
|
||||
struct leveldb_cache_t { shared_ptr<Cache> rep; };
|
||||
|
||||
struct leveldb_comparator_t : public Comparator {
|
||||
void* state_;
|
||||
void (*destructor_)(void*);
|
||||
int (*compare_)(
|
||||
void*,
|
||||
const char* a, size_t alen,
|
||||
const char* b, size_t blen);
|
||||
const char* (*name_)(void*);
|
||||
|
||||
virtual ~leveldb_comparator_t() {
|
||||
(*destructor_)(state_);
|
||||
}
|
||||
|
||||
virtual int Compare(const Slice& a, const Slice& b) const {
|
||||
return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
|
||||
}
|
||||
|
||||
virtual const char* Name() const {
|
||||
return (*name_)(state_);
|
||||
}
|
||||
|
||||
// No-ops since the C binding does not support key shortening methods.
|
||||
virtual void FindShortestSeparator(std::string*, const Slice&) const { }
|
||||
virtual void FindShortSuccessor(std::string* key) const { }
|
||||
};
|
||||
|
||||
struct leveldb_filterpolicy_t : public FilterPolicy {
|
||||
void* state_;
|
||||
void (*destructor_)(void*);
|
||||
const char* (*name_)(void*);
|
||||
char* (*create_)(
|
||||
void*,
|
||||
const char* const* key_array, const size_t* key_length_array,
|
||||
int num_keys,
|
||||
size_t* filter_length);
|
||||
unsigned char (*key_match_)(
|
||||
void*,
|
||||
const char* key, size_t length,
|
||||
const char* filter, size_t filter_length);
|
||||
|
||||
virtual ~leveldb_filterpolicy_t() {
|
||||
(*destructor_)(state_);
|
||||
}
|
||||
|
||||
virtual const char* Name() const {
|
||||
return (*name_)(state_);
|
||||
}
|
||||
|
||||
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
|
||||
std::vector<const char*> key_pointers(n);
|
||||
std::vector<size_t> key_sizes(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
key_pointers[i] = keys[i].data();
|
||||
key_sizes[i] = keys[i].size();
|
||||
}
|
||||
size_t len;
|
||||
char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
|
||||
dst->append(filter, len);
|
||||
free(filter);
|
||||
}
|
||||
|
||||
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
|
||||
return (*key_match_)(state_, key.data(), key.size(),
|
||||
filter.data(), filter.size());
|
||||
}
|
||||
};
|
||||
|
||||
struct leveldb_env_t {
|
||||
Env* rep;
|
||||
bool is_default;
|
||||
};
|
||||
|
||||
static bool SaveError(char** errptr, const Status& s) {
|
||||
assert(errptr != NULL);
|
||||
if (s.ok()) {
|
||||
return false;
|
||||
} else if (*errptr == NULL) {
|
||||
*errptr = strdup(s.ToString().c_str());
|
||||
} else {
|
||||
// TODO(sanjay): Merge with existing error?
|
||||
free(*errptr);
|
||||
*errptr = strdup(s.ToString().c_str());
|
||||
}
|
||||
return true;
|
||||
}
|
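// Illustrative note (not part of the original file): callers pass the address
// of a char* initialized to NULL, e.g.
//   char* err = NULL;
//   leveldb_put(db, woptions, "key", 3, "value", 5, &err);
//   if (err != NULL) { fprintf(stderr, "put failed: %s\n", err); free(err); }
// SaveError() strdup()s the status message, so ownership of the string passes
// to the caller, who releases it with free().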
||||
|
||||
static char* CopyString(const std::string& str) {
|
||||
char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
|
||||
memcpy(result, str.data(), sizeof(char) * str.size());
|
||||
return result;
|
||||
}
|
||||
|
||||
leveldb_t* leveldb_open(
|
||||
const leveldb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr) {
|
||||
DB* db;
|
||||
if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
|
||||
return NULL;
|
||||
}
|
||||
leveldb_t* result = new leveldb_t;
|
||||
result->rep = db;
|
||||
return result;
|
||||
}
|
||||
|
||||
void leveldb_close(leveldb_t* db) {
|
||||
delete db->rep;
|
||||
delete db;
|
||||
}
|
||||
|
||||
void leveldb_put(
|
||||
leveldb_t* db,
|
||||
const leveldb_writeoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
const char* val, size_t vallen,
|
||||
char** errptr) {
|
||||
SaveError(errptr,
|
||||
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
|
||||
}
|
||||
|
||||
void leveldb_delete(
|
||||
leveldb_t* db,
|
||||
const leveldb_writeoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
char** errptr) {
|
||||
SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
|
||||
}
|
||||
|
||||
|
||||
void leveldb_write(
|
||||
leveldb_t* db,
|
||||
const leveldb_writeoptions_t* options,
|
||||
leveldb_writebatch_t* batch,
|
||||
char** errptr) {
|
||||
SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
|
||||
}
|
||||
|
||||
char* leveldb_get(
|
||||
leveldb_t* db,
|
||||
const leveldb_readoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
size_t* vallen,
|
||||
char** errptr) {
|
||||
char* result = NULL;
|
||||
std::string tmp;
|
||||
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
|
||||
if (s.ok()) {
|
||||
*vallen = tmp.size();
|
||||
result = CopyString(tmp);
|
||||
} else {
|
||||
*vallen = 0;
|
||||
if (!s.IsNotFound()) {
|
||||
SaveError(errptr, s);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
leveldb_iterator_t* leveldb_create_iterator(
|
||||
leveldb_t* db,
|
||||
const leveldb_readoptions_t* options) {
|
||||
leveldb_iterator_t* result = new leveldb_iterator_t;
|
||||
result->rep = db->rep->NewIterator(options->rep);
|
||||
return result;
|
||||
}
|
||||
|
||||
const leveldb_snapshot_t* leveldb_create_snapshot(
|
||||
leveldb_t* db) {
|
||||
leveldb_snapshot_t* result = new leveldb_snapshot_t;
|
||||
result->rep = db->rep->GetSnapshot();
|
||||
return result;
|
||||
}
|
||||
|
||||
void leveldb_release_snapshot(
|
||||
leveldb_t* db,
|
||||
const leveldb_snapshot_t* snapshot) {
|
||||
db->rep->ReleaseSnapshot(snapshot->rep);
|
||||
delete snapshot;
|
||||
}
|
||||
|
||||
char* leveldb_property_value(
|
||||
leveldb_t* db,
|
||||
const char* propname) {
|
||||
std::string tmp;
|
||||
if (db->rep->GetProperty(Slice(propname), &tmp)) {
|
||||
// We use strdup() since we expect human readable output.
|
||||
return strdup(tmp.c_str());
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void leveldb_approximate_sizes(
|
||||
leveldb_t* db,
|
||||
int num_ranges,
|
||||
const char* const* range_start_key, const size_t* range_start_key_len,
|
||||
const char* const* range_limit_key, const size_t* range_limit_key_len,
|
||||
uint64_t* sizes) {
|
||||
Range* ranges = new Range[num_ranges];
|
||||
for (int i = 0; i < num_ranges; i++) {
|
||||
ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
|
||||
ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
|
||||
}
|
||||
db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
|
||||
delete[] ranges;
|
||||
}
|
||||
|
||||
void leveldb_compact_range(
|
||||
leveldb_t* db,
|
||||
const char* start_key, size_t start_key_len,
|
||||
const char* limit_key, size_t limit_key_len) {
|
||||
Slice a, b;
|
||||
db->rep->CompactRange(
|
||||
// Pass NULL Slice if corresponding "const char*" is NULL
|
||||
(start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
|
||||
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
|
||||
}
|
||||
|
||||
void leveldb_destroy_db(
|
||||
const leveldb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr) {
|
||||
SaveError(errptr, DestroyDB(name, options->rep));
|
||||
}
|
||||
|
||||
void leveldb_repair_db(
|
||||
const leveldb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr) {
|
||||
SaveError(errptr, RepairDB(name, options->rep));
|
||||
}
|
||||
|
||||
void leveldb_iter_destroy(leveldb_iterator_t* iter) {
|
||||
delete iter->rep;
|
||||
delete iter;
|
||||
}
|
||||
|
||||
unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) {
|
||||
return iter->rep->Valid();
|
||||
}
|
||||
|
||||
void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
|
||||
iter->rep->SeekToFirst();
|
||||
}
|
||||
|
||||
void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
|
||||
iter->rep->SeekToLast();
|
||||
}
|
||||
|
||||
void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
|
||||
iter->rep->Seek(Slice(k, klen));
|
||||
}
|
||||
|
||||
void leveldb_iter_next(leveldb_iterator_t* iter) {
|
||||
iter->rep->Next();
|
||||
}
|
||||
|
||||
void leveldb_iter_prev(leveldb_iterator_t* iter) {
|
||||
iter->rep->Prev();
|
||||
}
|
||||
|
||||
const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
|
||||
Slice s = iter->rep->key();
|
||||
*klen = s.size();
|
||||
return s.data();
|
||||
}
|
||||
|
||||
const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
|
||||
Slice s = iter->rep->value();
|
||||
*vlen = s.size();
|
||||
return s.data();
|
||||
}
|
||||
|
||||
void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
|
||||
SaveError(errptr, iter->rep->status());
|
||||
}
|
||||
|
||||
leveldb_writebatch_t* leveldb_writebatch_create() {
|
||||
return new leveldb_writebatch_t;
|
||||
}
|
||||
|
||||
void leveldb_writebatch_destroy(leveldb_writebatch_t* b) {
|
||||
delete b;
|
||||
}
|
||||
|
||||
void leveldb_writebatch_clear(leveldb_writebatch_t* b) {
|
||||
b->rep.Clear();
|
||||
}
|
||||
|
||||
void leveldb_writebatch_put(
|
||||
leveldb_writebatch_t* b,
|
||||
const char* key, size_t klen,
|
||||
const char* val, size_t vlen) {
|
||||
b->rep.Put(Slice(key, klen), Slice(val, vlen));
|
||||
}
|
||||
|
||||
void leveldb_writebatch_delete(
|
||||
leveldb_writebatch_t* b,
|
||||
const char* key, size_t klen) {
|
||||
b->rep.Delete(Slice(key, klen));
|
||||
}
|
||||
|
||||
void leveldb_writebatch_iterate(
|
||||
leveldb_writebatch_t* b,
|
||||
void* state,
|
||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
|
||||
void (*deleted)(void*, const char* k, size_t klen)) {
|
||||
class H : public WriteBatch::Handler {
|
||||
public:
|
||||
void* state_;
|
||||
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
|
||||
void (*deleted_)(void*, const char* k, size_t klen);
|
||||
virtual void Put(const Slice& key, const Slice& value) {
|
||||
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
|
||||
}
|
||||
virtual void Delete(const Slice& key) {
|
||||
(*deleted_)(state_, key.data(), key.size());
|
||||
}
|
||||
};
|
||||
H handler;
|
||||
handler.state_ = state;
|
||||
handler.put_ = put;
|
||||
handler.deleted_ = deleted;
|
||||
b->rep.Iterate(&handler);
|
||||
}
|
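// Illustrative note (not part of the original file): Iterate() replays the
// batch in insertion order, so for a batch built as
//   Put("bar","b"); Put("box","c"); Delete("bar");
// the callbacks fire as put(bar), put(box), deleted(bar), which is exactly
// the sequence the CheckPut/CheckDel helpers in db/c_test.c expect.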
||||
|
||||
leveldb_options_t* leveldb_options_create() {
|
||||
return new leveldb_options_t;
|
||||
}
|
||||
|
||||
void leveldb_options_destroy(leveldb_options_t* options) {
|
||||
delete options;
|
||||
}
|
||||
|
||||
void leveldb_options_set_comparator(
|
||||
leveldb_options_t* opt,
|
||||
leveldb_comparator_t* cmp) {
|
||||
opt->rep.comparator = cmp;
|
||||
}
|
||||
|
||||
void leveldb_options_set_filter_policy(
|
||||
leveldb_options_t* opt,
|
||||
leveldb_filterpolicy_t* policy) {
|
||||
opt->rep.filter_policy = policy;
|
||||
}
|
||||
|
||||
void leveldb_options_set_create_if_missing(
|
||||
leveldb_options_t* opt, unsigned char v) {
|
||||
opt->rep.create_if_missing = v;
|
||||
}
|
||||
|
||||
void leveldb_options_set_error_if_exists(
|
||||
leveldb_options_t* opt, unsigned char v) {
|
||||
opt->rep.error_if_exists = v;
|
||||
}
|
||||
|
||||
void leveldb_options_set_paranoid_checks(
|
||||
leveldb_options_t* opt, unsigned char v) {
|
||||
opt->rep.paranoid_checks = v;
|
||||
}
|
||||
|
||||
void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
|
||||
opt->rep.env = (env ? env->rep : NULL);
|
||||
}
|
||||
|
||||
void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
|
||||
if (l) {
|
||||
opt->rep.info_log = l->rep;
|
||||
}
|
||||
}
|
||||
|
||||
void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
|
||||
opt->rep.write_buffer_size = s;
|
||||
}
|
||||
|
||||
void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
|
||||
opt->rep.max_open_files = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
|
||||
if (c) {
|
||||
opt->rep.block_cache = c->rep;
|
||||
}
|
||||
}
|
||||
|
||||
void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
|
||||
opt->rep.block_size = s;
|
||||
}
|
||||
|
||||
void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
|
||||
opt->rep.block_restart_interval = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_target_file_size_base(
|
||||
leveldb_options_t* opt, uint64_t n) {
|
||||
opt->rep.target_file_size_base = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_target_file_size_multiplier(
|
||||
leveldb_options_t* opt, int n) {
|
||||
opt->rep.target_file_size_multiplier = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_max_bytes_for_level_base(
|
||||
leveldb_options_t* opt, uint64_t n) {
|
||||
opt->rep.max_bytes_for_level_base = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_max_bytes_for_level_multiplier(
|
||||
leveldb_options_t* opt, int n) {
|
||||
opt->rep.max_bytes_for_level_multiplier = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_expanded_compaction_factor(
|
||||
leveldb_options_t* opt, int n) {
|
||||
opt->rep.expanded_compaction_factor = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_max_grandparent_overlap_factor(
|
||||
leveldb_options_t* opt, int n) {
|
||||
opt->rep.max_grandparent_overlap_factor = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_num_levels(leveldb_options_t* opt, int n) {
|
||||
opt->rep.num_levels = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_level0_file_num_compaction_trigger(
|
||||
leveldb_options_t* opt, int n) {
|
||||
opt->rep.level0_file_num_compaction_trigger = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_level0_slowdown_writes_trigger(
|
||||
leveldb_options_t* opt, int n) {
|
||||
opt->rep.level0_slowdown_writes_trigger = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_level0_stop_writes_trigger(
|
||||
leveldb_options_t* opt, int n) {
|
||||
opt->rep.level0_stop_writes_trigger = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_max_mem_compaction_level(
|
||||
leveldb_options_t* opt, int n) {
|
||||
opt->rep.max_mem_compaction_level = n;
|
||||
}
|
||||
|
||||
void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
|
||||
opt->rep.compression = static_cast<CompressionType>(t);
|
||||
}
|
||||
|
||||
void leveldb_options_set_compression_per_level(leveldb_options_t* opt,
|
||||
int* level_values,
|
||||
size_t num_levels) {
|
||||
opt->rep.compression_per_level.resize(num_levels);
|
||||
for (size_t i = 0; i < num_levels; ++i) {
|
||||
opt->rep.compression_per_level[i] =
|
||||
static_cast<CompressionType>(level_values[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void leveldb_options_set_compression_options(
|
||||
leveldb_options_t* opt, int w_bits, int level, int strategy) {
|
||||
opt->rep.compression_opts.window_bits = w_bits;
|
||||
opt->rep.compression_opts.level = level;
|
||||
opt->rep.compression_opts.strategy = strategy;
|
||||
}
|
||||
|
||||
void leveldb_options_set_disable_data_sync(
|
||||
leveldb_options_t* opt, bool disable_data_sync) {
|
||||
opt->rep.disableDataSync = disable_data_sync;
|
||||
}
|
||||
|
||||
void leveldb_options_set_use_fsync(
|
||||
leveldb_options_t* opt, bool use_fsync) {
|
||||
opt->rep.use_fsync = use_fsync;
|
||||
}
|
||||
|
||||
void leveldb_options_set_db_stats_log_interval(
|
||||
leveldb_options_t* opt, int db_stats_log_interval) {
|
||||
opt->rep.db_stats_log_interval = db_stats_log_interval;
|
||||
}
|
||||
|
||||
void leveldb_options_set_db_log_dir(
|
||||
leveldb_options_t* opt, const char* db_log_dir) {
|
||||
opt->rep.db_log_dir = db_log_dir;
|
||||
}
|
||||
|
||||
void leveldb_options_set_WAL_ttl_seconds(leveldb_options_t* opt, uint64_t ttl) {
|
||||
opt->rep.WAL_ttl_seconds = ttl;
|
||||
}
|
||||
|
||||
void leveldb_options_set_WAL_size_limit_MB(
|
||||
leveldb_options_t* opt, uint64_t limit) {
|
||||
opt->rep.WAL_size_limit_MB = limit;
|
||||
}
|
||||
|
||||
leveldb_comparator_t* leveldb_comparator_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
int (*compare)(
|
||||
void*,
|
||||
const char* a, size_t alen,
|
||||
const char* b, size_t blen),
|
||||
const char* (*name)(void*)) {
|
||||
leveldb_comparator_t* result = new leveldb_comparator_t;
|
||||
result->state_ = state;
|
||||
result->destructor_ = destructor;
|
||||
result->compare_ = compare;
|
||||
result->name_ = name;
|
||||
return result;
|
||||
}
|
||||
|
||||
void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
|
||||
delete cmp;
|
||||
}
|
||||
|
||||
leveldb_filterpolicy_t* leveldb_filterpolicy_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
char* (*create_filter)(
|
||||
void*,
|
||||
const char* const* key_array, const size_t* key_length_array,
|
||||
int num_keys,
|
||||
size_t* filter_length),
|
||||
unsigned char (*key_may_match)(
|
||||
void*,
|
||||
const char* key, size_t length,
|
||||
const char* filter, size_t filter_length),
|
||||
const char* (*name)(void*)) {
|
||||
leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
|
||||
result->state_ = state;
|
||||
result->destructor_ = destructor;
|
||||
result->create_ = create_filter;
|
||||
result->key_match_ = key_may_match;
|
||||
result->name_ = name;
|
||||
return result;
|
||||
}
|
||||
|
||||
void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
|
||||
delete filter;
|
||||
}
|
||||
|
||||
leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
|
||||
// Make a leveldb_filterpolicy_t, but override all of its methods so
|
||||
// they delegate to a NewBloomFilterPolicy() instead of user
|
||||
// supplied C functions.
|
||||
struct Wrapper : public leveldb_filterpolicy_t {
|
||||
const FilterPolicy* rep_;
|
||||
~Wrapper() { delete rep_; }
|
||||
const char* Name() const { return rep_->Name(); }
|
||||
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
|
||||
return rep_->CreateFilter(keys, n, dst);
|
||||
}
|
||||
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
|
||||
return rep_->KeyMayMatch(key, filter);
|
||||
}
|
||||
static void DoNothing(void*) { }
|
||||
};
|
||||
Wrapper* wrapper = new Wrapper;
|
||||
wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
|
||||
wrapper->state_ = NULL;
|
||||
wrapper->destructor_ = &Wrapper::DoNothing;
|
||||
return wrapper;
|
||||
}
|
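// Illustrative note (not part of the original file): typical C-side usage is
//   leveldb_filterpolicy_t* fp = leveldb_filterpolicy_create_bloom(10);
//   leveldb_options_set_filter_policy(options, fp);
// The options keep a raw pointer to the policy, so it should outlive any
// database opened with those options and be released afterwards with
// leveldb_filterpolicy_destroy().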
||||
|
||||
leveldb_readoptions_t* leveldb_readoptions_create() {
|
||||
return new leveldb_readoptions_t;
|
||||
}
|
||||
|
||||
void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) {
|
||||
delete opt;
|
||||
}
|
||||
|
||||
void leveldb_readoptions_set_verify_checksums(
|
||||
leveldb_readoptions_t* opt,
|
||||
unsigned char v) {
|
||||
opt->rep.verify_checksums = v;
|
||||
}
|
||||
|
||||
void leveldb_readoptions_set_fill_cache(
|
||||
leveldb_readoptions_t* opt, unsigned char v) {
|
||||
opt->rep.fill_cache = v;
|
||||
}
|
||||
|
||||
void leveldb_readoptions_set_snapshot(
|
||||
leveldb_readoptions_t* opt,
|
||||
const leveldb_snapshot_t* snap) {
|
||||
opt->rep.snapshot = (snap ? snap->rep : NULL);
|
||||
}
|
||||
|
||||
leveldb_writeoptions_t* leveldb_writeoptions_create() {
|
||||
return new leveldb_writeoptions_t;
|
||||
}
|
||||
|
||||
void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) {
|
||||
delete opt;
|
||||
}
|
||||
|
||||
void leveldb_writeoptions_set_sync(
|
||||
leveldb_writeoptions_t* opt, unsigned char v) {
|
||||
opt->rep.sync = v;
|
||||
}
|
||||
|
||||
leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
|
||||
leveldb_cache_t* c = new leveldb_cache_t;
|
||||
c->rep = NewLRUCache(capacity);
|
||||
return c;
|
||||
}
|
||||
|
||||
void leveldb_cache_destroy(leveldb_cache_t* cache) {
|
||||
delete cache;
|
||||
}
|
||||
|
||||
leveldb_env_t* leveldb_create_default_env() {
|
||||
leveldb_env_t* result = new leveldb_env_t;
|
||||
result->rep = Env::Default();
|
||||
result->is_default = true;
|
||||
return result;
|
||||
}
|
||||
|
||||
void leveldb_env_destroy(leveldb_env_t* env) {
|
||||
if (!env->is_default) delete env->rep;
|
||||
delete env;
|
||||
}
|
||||
|
||||
} // end extern "C"
|
390
db/c_test.c
Normal file
@ -0,0 +1,390 @@
|
||||
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
Use of this source code is governed by a BSD-style license that can be
|
||||
found in the LICENSE file. See the AUTHORS file for names of contributors. */
|
||||
|
||||
#include "rocksdb/c.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
const char* phase = "";
|
||||
static char dbname[200];
|
||||
|
||||
static void StartPhase(const char* name) {
|
||||
fprintf(stderr, "=== Test %s\n", name);
|
||||
phase = name;
|
||||
}
|
||||
|
||||
static const char* GetTempDir(void) {
|
||||
const char* ret = getenv("TEST_TMPDIR");
|
||||
if (ret == NULL || ret[0] == '\0')
|
||||
ret = "/tmp";
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define CheckNoError(err) \
|
||||
if ((err) != NULL) { \
|
||||
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
|
||||
abort(); \
|
||||
}
|
||||
|
||||
#define CheckCondition(cond) \
|
||||
if (!(cond)) { \
|
||||
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
|
||||
abort(); \
|
||||
}
|
||||
|
||||
static void CheckEqual(const char* expected, const char* v, size_t n) {
|
||||
if (expected == NULL && v == NULL) {
|
||||
// ok
|
||||
} else if (expected != NULL && v != NULL && n == strlen(expected) &&
|
||||
memcmp(expected, v, n) == 0) {
|
||||
// ok
|
||||
return;
|
||||
} else {
|
||||
fprintf(stderr, "%s: expected '%s', got '%s'\n",
|
||||
phase,
|
||||
(expected ? expected : "(null)"),
|
||||
(v ? v : "(null"));
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
static void Free(char** ptr) {
|
||||
if (*ptr) {
|
||||
free(*ptr);
|
||||
*ptr = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void CheckGet(
|
||||
leveldb_t* db,
|
||||
const leveldb_readoptions_t* options,
|
||||
const char* key,
|
||||
const char* expected) {
|
||||
char* err = NULL;
|
||||
size_t val_len;
|
||||
char* val;
|
||||
val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
|
||||
CheckNoError(err);
|
||||
CheckEqual(expected, val, val_len);
|
||||
Free(&val);
|
||||
}
|
||||
|
||||
static void CheckIter(leveldb_iterator_t* iter,
|
||||
const char* key, const char* val) {
|
||||
size_t len;
|
||||
const char* str;
|
||||
str = leveldb_iter_key(iter, &len);
|
||||
CheckEqual(key, str, len);
|
||||
str = leveldb_iter_value(iter, &len);
|
||||
CheckEqual(val, str, len);
|
||||
}
|
||||
|
||||
// Callback from leveldb_writebatch_iterate()
|
||||
static void CheckPut(void* ptr,
|
||||
const char* k, size_t klen,
|
||||
const char* v, size_t vlen) {
|
||||
int* state = (int*) ptr;
|
||||
CheckCondition(*state < 2);
|
||||
switch (*state) {
|
||||
case 0:
|
||||
CheckEqual("bar", k, klen);
|
||||
CheckEqual("b", v, vlen);
|
||||
break;
|
||||
case 1:
|
||||
CheckEqual("box", k, klen);
|
||||
CheckEqual("c", v, vlen);
|
||||
break;
|
||||
}
|
||||
(*state)++;
|
||||
}
|
||||
|
||||
// Callback from leveldb_writebatch_iterate()
|
||||
static void CheckDel(void* ptr, const char* k, size_t klen) {
|
||||
int* state = (int*) ptr;
|
||||
CheckCondition(*state == 2);
|
||||
CheckEqual("bar", k, klen);
|
||||
(*state)++;
|
||||
}
|
||||
|
||||
static void CmpDestroy(void* arg) { }
|
||||
|
||||
static int CmpCompare(void* arg, const char* a, size_t alen,
|
||||
const char* b, size_t blen) {
|
||||
int n = (alen < blen) ? alen : blen;
|
||||
int r = memcmp(a, b, n);
|
||||
if (r == 0) {
|
||||
if (alen < blen) r = -1;
|
||||
else if (alen > blen) r = +1;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static const char* CmpName(void* arg) {
|
||||
return "foo";
|
||||
}
|
||||
|
||||
// Custom filter policy
|
||||
static unsigned char fake_filter_result = 1;
|
||||
static void FilterDestroy(void* arg) { }
|
||||
static const char* FilterName(void* arg) {
|
||||
return "TestFilter";
|
||||
}
|
||||
static char* FilterCreate(
|
||||
void* arg,
|
||||
const char* const* key_array, const size_t* key_length_array,
|
||||
int num_keys,
|
||||
size_t* filter_length) {
|
||||
*filter_length = 4;
|
||||
char* result = malloc(4);
|
||||
memcpy(result, "fake", 4);
|
||||
return result;
|
||||
}
|
||||
unsigned char FilterKeyMatch(
|
||||
void* arg,
|
||||
const char* key, size_t length,
|
||||
const char* filter, size_t filter_length) {
|
||||
CheckCondition(filter_length == 4);
|
||||
CheckCondition(memcmp(filter, "fake", 4) == 0);
|
||||
return fake_filter_result;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
leveldb_t* db;
|
||||
leveldb_comparator_t* cmp;
|
||||
leveldb_cache_t* cache;
|
||||
leveldb_env_t* env;
|
||||
leveldb_options_t* options;
|
||||
leveldb_readoptions_t* roptions;
|
||||
leveldb_writeoptions_t* woptions;
|
||||
char* err = NULL;
|
||||
int run = -1;
|
||||
|
||||
snprintf(dbname, sizeof(dbname),
|
||||
"%s/leveldb_c_test-%d",
|
||||
GetTempDir(),
|
||||
((int) geteuid()));
|
||||
|
||||
StartPhase("create_objects");
|
||||
cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
|
||||
env = leveldb_create_default_env();
|
||||
cache = leveldb_cache_create_lru(100000);
|
||||
|
||||
options = leveldb_options_create();
|
||||
leveldb_options_set_comparator(options, cmp);
|
||||
leveldb_options_set_error_if_exists(options, 1);
|
||||
leveldb_options_set_cache(options, cache);
|
||||
leveldb_options_set_env(options, env);
|
||||
leveldb_options_set_info_log(options, NULL);
|
||||
leveldb_options_set_write_buffer_size(options, 100000);
|
||||
leveldb_options_set_paranoid_checks(options, 1);
|
||||
leveldb_options_set_max_open_files(options, 10);
|
||||
leveldb_options_set_block_size(options, 1024);
|
||||
leveldb_options_set_block_restart_interval(options, 8);
|
||||
leveldb_options_set_compression(options, leveldb_no_compression);
|
||||
leveldb_options_set_compression_options(options, -14, -1, 0);
|
||||
int compression_levels[] = {leveldb_no_compression, leveldb_no_compression,
|
||||
leveldb_no_compression, leveldb_no_compression};
|
||||
leveldb_options_set_compression_per_level(options, compression_levels, 4);
|
||||
|
||||
roptions = leveldb_readoptions_create();
|
||||
leveldb_readoptions_set_verify_checksums(roptions, 1);
|
||||
leveldb_readoptions_set_fill_cache(roptions, 0);
|
||||
|
||||
woptions = leveldb_writeoptions_create();
|
||||
leveldb_writeoptions_set_sync(woptions, 1);
|
||||
|
||||
StartPhase("destroy");
|
||||
leveldb_destroy_db(options, dbname, &err);
|
||||
Free(&err);
|
||||
|
||||
StartPhase("open_error");
|
||||
db = leveldb_open(options, dbname, &err);
|
||||
CheckCondition(err != NULL);
|
||||
Free(&err);
|
||||
|
||||
StartPhase("open");
|
||||
leveldb_options_set_create_if_missing(options, 1);
|
||||
db = leveldb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", NULL);
|
||||
|
||||
StartPhase("put");
|
||||
leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
|
||||
StartPhase("compactall");
|
||||
leveldb_compact_range(db, NULL, 0, NULL, 0);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
|
||||
StartPhase("compactrange");
|
||||
leveldb_compact_range(db, "a", 1, "z", 1);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
|
||||
StartPhase("writebatch");
|
||||
{
|
||||
leveldb_writebatch_t* wb = leveldb_writebatch_create();
|
||||
leveldb_writebatch_put(wb, "foo", 3, "a", 1);
|
||||
leveldb_writebatch_clear(wb);
|
||||
leveldb_writebatch_put(wb, "bar", 3, "b", 1);
|
||||
leveldb_writebatch_put(wb, "box", 3, "c", 1);
|
||||
leveldb_writebatch_delete(wb, "bar", 3);
|
||||
leveldb_write(db, woptions, wb, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
CheckGet(db, roptions, "bar", NULL);
|
||||
CheckGet(db, roptions, "box", "c");
|
||||
int pos = 0;
|
||||
leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
|
||||
CheckCondition(pos == 3);
|
||||
leveldb_writebatch_destroy(wb);
|
||||
}
|
||||
|
||||
StartPhase("iter");
|
||||
{
|
||||
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
|
||||
CheckCondition(!leveldb_iter_valid(iter));
|
||||
leveldb_iter_seek_to_first(iter);
|
||||
CheckCondition(leveldb_iter_valid(iter));
|
||||
CheckIter(iter, "box", "c");
|
||||
leveldb_iter_next(iter);
|
||||
CheckIter(iter, "foo", "hello");
|
||||
leveldb_iter_prev(iter);
|
||||
CheckIter(iter, "box", "c");
|
||||
leveldb_iter_prev(iter);
|
||||
CheckCondition(!leveldb_iter_valid(iter));
|
||||
leveldb_iter_seek_to_last(iter);
|
||||
CheckIter(iter, "foo", "hello");
|
||||
leveldb_iter_seek(iter, "b", 1);
|
||||
CheckIter(iter, "box", "c");
|
||||
leveldb_iter_get_error(iter, &err);
|
||||
CheckNoError(err);
|
||||
leveldb_iter_destroy(iter);
|
||||
}
|
||||
|
||||
StartPhase("approximate_sizes");
|
||||
{
|
||||
int i;
|
||||
int n = 20000;
|
||||
char keybuf[100];
|
||||
char valbuf[100];
|
||||
uint64_t sizes[2];
|
||||
const char* start[2] = { "a", "k00000000000000010000" };
|
||||
size_t start_len[2] = { 1, 21 };
|
||||
const char* limit[2] = { "k00000000000000010000", "z" };
|
||||
size_t limit_len[2] = { 21, 1 };
|
||||
leveldb_writeoptions_set_sync(woptions, 0);
|
||||
for (i = 0; i < n; i++) {
|
||||
snprintf(keybuf, sizeof(keybuf), "k%020d", i);
|
||||
snprintf(valbuf, sizeof(valbuf), "v%020d", i);
|
||||
leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
|
||||
&err);
|
||||
CheckNoError(err);
|
||||
}
|
||||
leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
|
||||
CheckCondition(sizes[0] > 0);
|
||||
CheckCondition(sizes[1] > 0);
|
||||
}
|
||||
|
||||
StartPhase("property");
|
||||
{
|
||||
char* prop = leveldb_property_value(db, "nosuchprop");
|
||||
CheckCondition(prop == NULL);
|
||||
prop = leveldb_property_value(db, "rocksdb.stats");
|
||||
CheckCondition(prop != NULL);
|
||||
Free(&prop);
|
||||
}
|
||||
|
||||
StartPhase("snapshot");
|
||||
{
|
||||
const leveldb_snapshot_t* snap;
|
||||
snap = leveldb_create_snapshot(db);
|
||||
leveldb_delete(db, woptions, "foo", 3, &err);
|
||||
CheckNoError(err);
|
||||
leveldb_readoptions_set_snapshot(roptions, snap);
|
||||
CheckGet(db, roptions, "foo", "hello");
|
||||
leveldb_readoptions_set_snapshot(roptions, NULL);
|
||||
CheckGet(db, roptions, "foo", NULL);
|
||||
leveldb_release_snapshot(db, snap);
|
||||
}
|
||||
|
||||
StartPhase("repair");
|
||||
{
|
||||
// If we do not compact here, then the lazy deletion of
|
||||
// files (https://reviews.facebook.net/D6123) would leave
|
||||
// around deleted files and the repair process will find
|
||||
// those files and put them back into the database.
|
||||
leveldb_compact_range(db, NULL, 0, NULL, 0);
|
||||
leveldb_close(db);
|
||||
leveldb_options_set_create_if_missing(options, 0);
|
||||
leveldb_options_set_error_if_exists(options, 0);
|
||||
leveldb_repair_db(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
db = leveldb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
CheckGet(db, roptions, "foo", NULL);
|
||||
CheckGet(db, roptions, "bar", NULL);
|
||||
CheckGet(db, roptions, "box", "c");
|
||||
leveldb_options_set_create_if_missing(options, 1);
|
||||
leveldb_options_set_error_if_exists(options, 1);
|
||||
}
|
||||
|
||||
StartPhase("filter");
|
||||
for (run = 0; run < 2; run++) {
|
||||
// First run uses custom filter, second run uses bloom filter
|
||||
CheckNoError(err);
|
||||
leveldb_filterpolicy_t* policy;
|
||||
if (run == 0) {
|
||||
policy = leveldb_filterpolicy_create(
|
||||
NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
|
||||
} else {
|
||||
policy = leveldb_filterpolicy_create_bloom(10);
|
||||
}
|
||||
|
||||
// Create new database
|
||||
leveldb_close(db);
|
||||
leveldb_destroy_db(options, dbname, &err);
|
||||
leveldb_options_set_filter_policy(options, policy);
|
||||
db = leveldb_open(options, dbname, &err);
|
||||
CheckNoError(err);
|
||||
leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
|
||||
CheckNoError(err);
|
||||
leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
|
||||
CheckNoError(err);
|
||||
leveldb_compact_range(db, NULL, 0, NULL, 0);
|
||||
|
||||
fake_filter_result = 1;
|
||||
CheckGet(db, roptions, "foo", "foovalue");
|
||||
CheckGet(db, roptions, "bar", "barvalue");
|
||||
if (run == 0) {
|
||||
// Must not find value when custom filter returns false
|
||||
fake_filter_result = 0;
|
||||
CheckGet(db, roptions, "foo", NULL);
|
||||
CheckGet(db, roptions, "bar", NULL);
|
||||
fake_filter_result = 1;
|
||||
|
||||
CheckGet(db, roptions, "foo", "foovalue");
|
||||
CheckGet(db, roptions, "bar", "barvalue");
|
||||
}
|
||||
leveldb_options_set_filter_policy(options, NULL);
|
||||
leveldb_filterpolicy_destroy(policy);
|
||||
}
|
||||
|
||||
StartPhase("cleanup");
|
||||
leveldb_close(db);
|
||||
leveldb_options_destroy(options);
|
||||
leveldb_readoptions_destroy(roptions);
|
||||
leveldb_writeoptions_destroy(woptions);
|
||||
leveldb_cache_destroy(cache);
|
||||
leveldb_comparator_destroy(cmp);
|
||||
leveldb_env_destroy(env);
|
||||
|
||||
fprintf(stderr, "PASS\n");
|
||||
return 0;
|
||||
}
|
378
db/corruption_test.cc
Normal file
@ -0,0 +1,378 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/write_batch.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/log_format.h"
|
||||
#include "db/version_set.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static const int kValueSize = 1000;
|
||||
|
||||
class CorruptionTest {
|
||||
public:
|
||||
test::ErrorEnv env_;
|
||||
std::string dbname_;
|
||||
shared_ptr<Cache> tiny_cache_;
|
||||
Options options_;
|
||||
DB* db_;
|
||||
|
||||
CorruptionTest() {
|
||||
tiny_cache_ = NewLRUCache(100);
|
||||
options_.env = &env_;
|
||||
dbname_ = test::TmpDir() + "/db_test";
|
||||
DestroyDB(dbname_, options_);
|
||||
|
||||
db_ = nullptr;
|
||||
options_.create_if_missing = true;
|
||||
options_.block_size_deviation = 0; // make unit test pass for now
|
||||
Reopen();
|
||||
options_.create_if_missing = false;
|
||||
}
|
||||
|
||||
~CorruptionTest() {
|
||||
delete db_;
|
||||
DestroyDB(dbname_, Options());
|
||||
}
|
||||
|
||||
Status TryReopen(Options* options = nullptr) {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
Options opt = (options ? *options : options_);
|
||||
opt.env = &env_;
|
||||
opt.block_cache = tiny_cache_;
|
||||
opt.block_size_deviation = 0;
|
||||
opt.arena_block_size = 4096;
|
||||
return DB::Open(opt, dbname_, &db_);
|
||||
}
|
||||
|
||||
void Reopen(Options* options = nullptr) {
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void RepairDB() {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
|
||||
}
|
||||
|
||||
void Build(int n) {
|
||||
std::string key_space, value_space;
|
||||
WriteBatch batch;
|
||||
for (int i = 0; i < n; i++) {
|
||||
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
|
||||
Slice key = Key(i, &key_space);
|
||||
batch.Clear();
|
||||
batch.Put(key, Value(i, &value_space));
|
||||
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
||||
}
|
||||
}
|
||||
|
||||
void Check(int min_expected, int max_expected) {
|
||||
unsigned int next_expected = 0;
|
||||
int missed = 0;
|
||||
int bad_keys = 0;
|
||||
int bad_values = 0;
|
||||
int correct = 0;
|
||||
std::string value_space;
|
||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
uint64_t key;
|
||||
Slice in(iter->key());
|
||||
if (!ConsumeDecimalNumber(&in, &key) ||
|
||||
!in.empty() ||
|
||||
key < next_expected) {
|
||||
bad_keys++;
|
||||
continue;
|
||||
}
|
||||
missed += (key - next_expected);
|
||||
next_expected = key + 1;
|
||||
if (iter->value() != Value(key, &value_space)) {
|
||||
bad_values++;
|
||||
} else {
|
||||
correct++;
|
||||
}
|
||||
}
|
||||
delete iter;
|
||||
|
||||
fprintf(stderr,
|
||||
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
|
||||
min_expected, max_expected, correct, bad_keys, bad_values, missed);
|
||||
ASSERT_LE(min_expected, correct);
|
||||
ASSERT_GE(max_expected, correct);
|
||||
}
|
||||
|
||||
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
|
||||
// Pick file to corrupt
|
||||
std::vector<std::string> filenames;
|
||||
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
std::string fname;
|
||||
int picked_number = -1;
|
||||
for (unsigned int i = 0; i < filenames.size(); i++) {
|
||||
if (ParseFileName(filenames[i], &number, &type) &&
|
||||
type == filetype &&
|
||||
int(number) > picked_number) { // Pick latest file
|
||||
fname = dbname_ + "/" + filenames[i];
|
||||
picked_number = number;
|
||||
}
|
||||
}
|
||||
ASSERT_TRUE(!fname.empty()) << filetype;
|
||||
|
||||
struct stat sbuf;
|
||||
if (stat(fname.c_str(), &sbuf) != 0) {
|
||||
const char* msg = strerror(errno);
|
||||
ASSERT_TRUE(false) << fname << ": " << msg;
|
||||
}
|
||||
|
||||
if (offset < 0) {
|
||||
// Relative to end of file; make it absolute
|
||||
if (-offset > sbuf.st_size) {
|
||||
offset = 0;
|
||||
} else {
|
||||
offset = sbuf.st_size + offset;
|
||||
}
|
||||
}
|
||||
if (offset > sbuf.st_size) {
|
||||
offset = sbuf.st_size;
|
||||
}
|
||||
if (offset + bytes_to_corrupt > sbuf.st_size) {
|
||||
bytes_to_corrupt = sbuf.st_size - offset;
|
||||
}
|
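// Illustrative note (not part of the original test): a negative offset is
// interpreted relative to the end of the file, so Corrupt(kTableFile, -2000, 500)
// XORs 500 bytes starting 2000 bytes before EOF, which is how the
// TableFileIndexData test below reaches the table's index block.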
||||
|
||||
// Do it
|
||||
std::string contents;
|
||||
Status s = ReadFileToString(Env::Default(), fname, &contents);
|
||||
ASSERT_TRUE(s.ok()) << s.ToString();
|
||||
for (int i = 0; i < bytes_to_corrupt; i++) {
|
||||
contents[i + offset] ^= 0x80;
|
||||
}
|
||||
s = WriteStringToFile(Env::Default(), contents, fname);
|
||||
ASSERT_TRUE(s.ok()) << s.ToString();
|
||||
}
|
||||
|
||||
int Property(const std::string& name) {
|
||||
std::string property;
|
||||
int result;
|
||||
if (db_->GetProperty(name, &property) &&
|
||||
sscanf(property.c_str(), "%d", &result) == 1) {
|
||||
return result;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the ith key
|
||||
Slice Key(int i, std::string* storage) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%016d", i);
|
||||
storage->assign(buf, strlen(buf));
|
||||
return Slice(*storage);
|
||||
}
|
||||
|
||||
// Return the value to associate with the specified key
|
||||
Slice Value(int k, std::string* storage) {
|
||||
Random r(k);
|
||||
return test::RandomString(&r, kValueSize, storage);
|
||||
}
|
||||
};
|
||||
|
||||
TEST(CorruptionTest, Recovery) {
|
||||
Build(100);
|
||||
Check(100, 100);
|
||||
Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
|
||||
Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
|
||||
Reopen();
|
||||
|
||||
// The 64 records in the first two log blocks are completely lost.
|
||||
Check(36, 36);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, RecoverWriteError) {
|
||||
env_.writable_file_error_ = true;
|
||||
Status s = TryReopen();
|
||||
ASSERT_TRUE(!s.ok());
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, NewFileErrorDuringWrite) {
|
||||
// Do enough writing to force minor compaction
|
||||
env_.writable_file_error_ = true;
|
||||
const int num = 3 + (Options().write_buffer_size / kValueSize);
|
||||
std::string value_storage;
|
||||
Status s;
|
||||
bool failed = false;
|
||||
for (int i = 0; i < num; i++) {
|
||||
WriteBatch batch;
|
||||
batch.Put("a", Value(100, &value_storage));
|
||||
s = db_->Write(WriteOptions(), &batch);
|
||||
if (!s.ok()) {
|
||||
failed = true;
|
||||
}
|
||||
ASSERT_TRUE(!failed || !s.ok());
|
||||
}
|
||||
ASSERT_TRUE(!s.ok());
|
||||
ASSERT_GE(env_.num_writable_file_errors_, 1);
|
||||
env_.writable_file_error_ = false;
|
||||
Reopen();
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, TableFile) {
|
||||
Build(100);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_FlushMemTable();
|
||||
dbi->TEST_CompactRange(0, nullptr, nullptr);
|
||||
dbi->TEST_CompactRange(1, nullptr, nullptr);
|
||||
|
||||
Corrupt(kTableFile, 100, 1);
|
||||
Check(99, 99);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, TableFileIndexData) {
|
||||
Build(10000); // Enough to build multiple Tables
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_FlushMemTable();
|
||||
|
||||
Corrupt(kTableFile, -2000, 500);
|
||||
Reopen();
|
||||
Check(5000, 9999);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, MissingDescriptor) {
|
||||
Build(1000);
|
||||
RepairDB();
|
||||
Reopen();
|
||||
Check(1000, 1000);
|
||||
}
|
||||
|
||||
TEST(CorruptionTest, SequenceNumberRecovery) {
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
  RepairDB();
  Reopen();
  std::string v;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("v5", v);
  // Write something. If sequence number was not recovered properly,
  // it will be hidden by an earlier write.
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("v6", v);
  Reopen();
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("v6", v);
}

TEST(CorruptionTest, CorruptedDescriptor) {
  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_FlushMemTable();
  dbi->TEST_CompactRange(0, nullptr, nullptr);

  Corrupt(kDescriptorFile, 0, 1000);
  Status s = TryReopen();
  ASSERT_TRUE(!s.ok());

  RepairDB();
  Reopen();
  std::string v;
  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("hello", v);
}

TEST(CorruptionTest, CompactionInputError) {
  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_FlushMemTable();
  const int last = dbi->MaxMemCompactionLevel();
  ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));

  Corrupt(kTableFile, 100, 1);
  Check(9, 9);

  // Force compactions by writing lots of values
  Build(10000);
  Check(10000, 10000);
}

TEST(CorruptionTest, CompactionInputErrorParanoid) {
  Options options;
  options.paranoid_checks = true;
  options.write_buffer_size = 1048576;
  Reopen(&options);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

  // Fill levels >= 1 so memtable compaction outputs to level 1
  for (int level = 1; level < dbi->NumberLevels(); level++) {
    dbi->Put(WriteOptions(), "", "begin");
    dbi->Put(WriteOptions(), "~", "end");
    dbi->TEST_FlushMemTable();
  }

  Build(10);
  dbi->TEST_FlushMemTable();
  dbi->TEST_WaitForCompact();
  ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));

  Corrupt(kTableFile, 100, 1);
  Check(9, 9);

  // Write must eventually fail because of corrupted table
  Status s;
  std::string tmp1, tmp2;
  bool failed = false;
  for (int i = 0; i < 10000 && s.ok(); i++) {
    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
    if (!s.ok()) {
      failed = true;
    }
    // if one write failed, every subsequent write must fail, too
    ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
  }
  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}

TEST(CorruptionTest, UnrelatedKeys) {
  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_FlushMemTable();
  Corrupt(kTableFile, 100, 1);

  std::string tmp1, tmp2;
  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
  std::string v;
  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
  dbi->TEST_FlushMemTable();
  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
}

} // namespace rocksdb

int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
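
The repair tests above exercise the same recovery path that is available to applications. The sketch below is illustrative only: it assumes the public rocksdb::RepairDB() entry point (carried over from LevelDB) that the CorruptionTest harness presumably wraps, and the database path is hypothetical.

#include <cassert>
#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = false;
  const std::string dbname = "/tmp/corrupt_db";  // hypothetical path

  // Rebuild the descriptor/MANIFEST from whatever data files survive.
  rocksdb::Status s = rocksdb::RepairDB(dbname, options);
  assert(s.ok());

  // A repaired db must reopen cleanly and keep the latest sequence number;
  // otherwise new writes could be hidden by older ones (see the test above).
  rocksdb::DB* db = nullptr;
  s = rocksdb::DB::Open(options, dbname, &db);
  assert(s.ok());
  delete db;
  return 0;
}
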
2461
db/db_bench.cc
Normal file
File diff suppressed because it is too large
100
db/db_filesnapshot.cc
Normal file
@ -0,0 +1,100 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <algorithm>
#include <string>
#include <stdint.h>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/mutexlock.h"

namespace rocksdb {

Status DBImpl::DisableFileDeletions() {
  MutexLock l(&mutex_);
  disable_delete_obsolete_files_ = true;
  Log(options_.info_log, "File Deletions Disabled");
  return Status::OK();
}

Status DBImpl::EnableFileDeletions() {
  DeletionState deletion_state;
  {
    MutexLock l(&mutex_);
    disable_delete_obsolete_files_ = false;
    Log(options_.info_log, "File Deletions Enabled");
    FindObsoleteFiles(deletion_state, true);
  }
  PurgeObsoleteFiles(deletion_state);
  LogFlush(options_.info_log);
  return Status::OK();
}

Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
                            uint64_t* manifest_file_size,
                            bool flush_memtable) {

  *manifest_file_size = 0;

  if (flush_memtable) {
    // flush all dirty data to disk.
    Status status = Flush(FlushOptions());
    if (!status.ok()) {
      Log(options_.info_log, "Cannot Flush data %s\n",
          status.ToString().c_str());
      return status;
    }
  }

  MutexLock l(&mutex_);

  // Make a set of all of the live *.sst files
  std::set<uint64_t> live;
  versions_->AddLiveFilesCurrentVersion(&live);

  ret.clear();
  ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST

  // create names of the live files. The names are not absolute
  // paths, instead they are relative to dbname_;
  for (auto live_file : live) {
    ret.push_back(TableFileName("", live_file));
  }

  ret.push_back(CurrentFileName(""));
  ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));

  // find length of manifest file while holding the mutex lock
  *manifest_file_size = versions_->ManifestFileSize();

  return Status::OK();
}

Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
  // First get sorted files in archive dir, then append sorted files from main
  // dir to maintain sorted order

  // list wal files in archive dir.
  Status s;
  std::string archivedir = ArchivalDirectory(options_.wal_dir);
  if (env_->FileExists(archivedir)) {
    s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile);
    if (!s.ok()) {
      return s;
    }
  }
  // list wal files in main db dir.
  return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile);
}

}
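
DisableFileDeletions(), GetLiveFiles() and EnableFileDeletions() above are the hooks an online backup would use. A minimal sketch of that flow follows, assuming these methods are exposed on the public DB handle as declared here; CopyFile() and backup_dir are hypothetical helpers and error handling is trimmed.

#include <stdint.h>
#include <string>
#include <vector>
#include "rocksdb/db.h"

void BackupSketch(rocksdb::DB* db, const std::string& dbname) {
  // Stop obsolete-file deletion so the returned file list stays valid
  // while we copy.
  db->DisableFileDeletions();

  std::vector<std::string> files;  // names relative to the db directory
  uint64_t manifest_size = 0;
  rocksdb::Status s = db->GetLiveFiles(files, &manifest_size,
                                       true /* flush memtable first */);
  if (s.ok()) {
    for (size_t i = 0; i < files.size(); i++) {
      // Copy only the first manifest_size bytes of the MANIFEST file;
      // anything past that offset may still be appended to.
      // CopyFile(dbname + files[i], backup_dir + files[i]);  // hypothetical
    }
  }

  // Re-enable deletions once the copies are done.
  db->EnableFileDeletions();
}
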
3595
db/db_impl.cc
Normal file
File diff suppressed because it is too large
497
db/db_impl.h
Normal file
@ -0,0 +1,497 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#pragma once
|
||||
#include <atomic>
|
||||
#include <deque>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/snapshot.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
#include "port/port.h"
|
||||
#include "util/stats_logger.h"
|
||||
#include "memtablelist.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class MemTable;
|
||||
class TableCache;
|
||||
class Version;
|
||||
class VersionEdit;
|
||||
class VersionSet;
|
||||
|
||||
class DBImpl : public DB {
|
||||
public:
|
||||
DBImpl(const Options& options, const std::string& dbname);
|
||||
virtual ~DBImpl();
|
||||
|
||||
// Implementations of the DB interface
|
||||
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
|
||||
virtual Status Merge(const WriteOptions&, const Slice& key,
|
||||
const Slice& value);
|
||||
virtual Status Delete(const WriteOptions&, const Slice& key);
|
||||
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value);
|
||||
virtual std::vector<Status> MultiGet(const ReadOptions& options,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values);
|
||||
|
||||
// Returns false if key doesn't exist in the database and true if it may.
|
||||
// If value_found is not passed in as null, then return the value if found in
|
||||
// memory. On return, if value was found, then value_found will be set to true
|
||||
// otherwise false.
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found = nullptr);
|
||||
virtual Iterator* NewIterator(const ReadOptions&);
|
||||
virtual const Snapshot* GetSnapshot();
|
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
||||
virtual bool GetProperty(const Slice& property, std::string* value);
|
||||
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
|
||||
virtual void CompactRange(const Slice* begin, const Slice* end,
|
||||
bool reduce_level = false, int target_level = -1);
|
||||
virtual int NumberLevels();
|
||||
virtual int MaxMemCompactionLevel();
|
||||
virtual int Level0StopWriteTrigger();
|
||||
virtual Status Flush(const FlushOptions& options);
|
||||
virtual Status DisableFileDeletions();
|
||||
virtual Status EnableFileDeletions();
|
||||
// All the returned filenames start with "/"
|
||||
virtual Status GetLiveFiles(std::vector<std::string>&,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable = true);
|
||||
virtual Status GetSortedWalFiles(VectorLogPtr& files);
|
||||
virtual SequenceNumber GetLatestSequenceNumber() const;
|
||||
virtual Status GetUpdatesSince(SequenceNumber seq_number,
|
||||
unique_ptr<TransactionLogIterator>* iter);
|
||||
virtual Status DeleteFile(std::string name);
|
||||
|
||||
virtual void GetLiveFilesMetaData(
|
||||
std::vector<LiveFileMetaData> *metadata);
|
||||
|
||||
// Extra methods (for testing) that are not in the public DB interface
|
||||
|
||||
// Compact any files in the named level that overlap [*begin, *end]
|
||||
void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status TEST_FlushMemTable();
|
||||
|
||||
// Wait for memtable compaction
|
||||
Status TEST_WaitForFlushMemTable();
|
||||
|
||||
// Wait for any compaction
|
||||
Status TEST_WaitForCompact();
|
||||
|
||||
// Return an internal iterator over the current state of the database.
|
||||
// The keys of this iterator are internal keys (see format.h).
|
||||
// The returned iterator should be deleted when no longer needed.
|
||||
Iterator* TEST_NewInternalIterator();
|
||||
|
||||
// Return the maximum overlapping data (in bytes) at next level for any
|
||||
// file at a level >= 1.
|
||||
int64_t TEST_MaxNextLevelOverlappingBytes();
|
||||
|
||||
// Simulate a db crash, no elegant closing of database.
|
||||
void TEST_Destroy_DBImpl();
|
||||
|
||||
// Return the current manifest file no.
|
||||
uint64_t TEST_Current_Manifest_FileNo();
|
||||
|
||||
// Triggers a background call for testing.
|
||||
void TEST_PurgeObsoleteteWAL();
|
||||
|
||||
// get total level0 file size. Only for testing.
|
||||
uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);}
|
||||
|
||||
void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
|
||||
{
|
||||
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
|
||||
}
|
||||
|
||||
// needed for CleanupIteratorState
|
||||
|
||||
struct DeletionState {
|
||||
inline bool HaveSomethingToDelete() const {
|
||||
return all_files.size() ||
|
||||
sst_delete_files.size() ||
|
||||
log_delete_files.size();
|
||||
}
|
||||
// a list of all files that we'll consider deleting
|
||||
// (every once in a while this is filled up with all files
|
||||
// in the DB directory)
|
||||
std::vector<std::string> all_files;
|
||||
|
||||
// the list of all live sst files that cannot be deleted
|
||||
std::vector<uint64_t> sst_live;
|
||||
|
||||
// a list of sst files that we need to delete
|
||||
std::vector<FileMetaData*> sst_delete_files;
|
||||
|
||||
// a list of log files that we need to delete
|
||||
std::vector<uint64_t> log_delete_files;
|
||||
|
||||
// the current manifest_file_number, log_number and prev_log_number
|
||||
// that corresponds to the set of files in 'live'.
|
||||
uint64_t manifest_file_number, log_number, prev_log_number;
|
||||
|
||||
DeletionState() {
|
||||
manifest_file_number = 0;
|
||||
log_number = 0;
|
||||
prev_log_number = 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Returns the list of live files in 'live' and the list
|
||||
// of all files in the filesystem in 'all_files'.
|
||||
// If force == false and the last call was less than
|
||||
// options_.delete_obsolete_files_period_micros microseconds ago,
|
||||
// it will not fill up the deletion_state
|
||||
void FindObsoleteFiles(DeletionState& deletion_state,
|
||||
bool force,
|
||||
bool no_full_scan = false);
|
||||
|
||||
// Diffs the files listed in filenames and those that do not
|
||||
// belong to live files are possibly removed. Also, removes all the
|
||||
// files in sst_delete_files and log_delete_files.
|
||||
// It is not necessary to hold the mutex when invoking this method.
|
||||
void PurgeObsoleteFiles(DeletionState& deletion_state);
|
||||
|
||||
protected:
|
||||
Env* const env_;
|
||||
const std::string dbname_;
|
||||
unique_ptr<VersionSet> versions_;
|
||||
const InternalKeyComparator internal_comparator_;
|
||||
const Options options_; // options_.comparator == &internal_comparator_
|
||||
|
||||
const Comparator* user_comparator() const {
|
||||
return internal_comparator_.user_comparator();
|
||||
}
|
||||
|
||||
MemTable* GetMemTable() {
|
||||
return mem_;
|
||||
}
|
||||
|
||||
Iterator* NewInternalIterator(const ReadOptions&,
|
||||
SequenceNumber* latest_snapshot);
|
||||
|
||||
private:
|
||||
friend class DB;
|
||||
struct CompactionState;
|
||||
struct Writer;
|
||||
|
||||
Status NewDB();
|
||||
|
||||
// Recover the descriptor from persistent storage. May do a significant
|
||||
// amount of work to recover recently logged updates. Any changes to
|
||||
// be made to the descriptor are added to *edit.
|
||||
Status Recover(VersionEdit* edit, MemTable* external_table = nullptr,
|
||||
bool error_if_log_file_exist = false);
|
||||
|
||||
void MaybeIgnoreError(Status* s) const;
|
||||
|
||||
const Status CreateArchivalDirectory();
|
||||
|
||||
// Delete any unneeded files and stale in-memory entries.
|
||||
void DeleteObsoleteFiles();
|
||||
|
||||
// Flush the in-memory write buffer to storage. Switches to a new
|
||||
// log-file/memtable and writes a new descriptor iff successful.
|
||||
Status FlushMemTableToOutputFile(bool* madeProgress,
|
||||
DeletionState& deletion_state);
|
||||
|
||||
Status RecoverLogFile(uint64_t log_number,
|
||||
VersionEdit* edit,
|
||||
SequenceNumber* max_sequence,
|
||||
MemTable* external_table);
|
||||
|
||||
// The following two methods are used to flush a memtable to
|
||||
// storage. The first one is used at database recovery time (when the
|
||||
// database is opened) and is heavyweight because it holds the mutex
|
||||
// for the entire period. The second method WriteLevel0Table supports
|
||||
// concurrent flush memtables to storage.
|
||||
Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
|
||||
Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
|
||||
uint64_t* filenumber);
|
||||
|
||||
uint64_t SlowdownAmount(int n, int top, int bottom);
|
||||
Status MakeRoomForWrite(bool force /* compact even if there is room? */);
|
||||
WriteBatch* BuildBatchGroup(Writer** last_writer);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status FlushMemTable(const FlushOptions& options);
|
||||
|
||||
// Wait for memtable flushed
|
||||
Status WaitForFlushMemTable();
|
||||
|
||||
void MaybeScheduleLogDBDeployStats();
|
||||
static void BGLogDBDeployStats(void* db);
|
||||
void LogDBDeployStats();
|
||||
|
||||
void MaybeScheduleFlushOrCompaction();
|
||||
static void BGWorkCompaction(void* db);
|
||||
static void BGWorkFlush(void* db);
|
||||
void BackgroundCallCompaction();
|
||||
void BackgroundCallFlush();
|
||||
Status BackgroundCompaction(bool* madeProgress,DeletionState& deletion_state);
|
||||
Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state);
|
||||
void CleanupCompaction(CompactionState* compact, Status status);
|
||||
Status DoCompactionWork(CompactionState* compact,
|
||||
DeletionState& deletion_state);
|
||||
|
||||
Status OpenCompactionOutputFile(CompactionState* compact);
|
||||
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
|
||||
Status InstallCompactionResults(CompactionState* compact);
|
||||
void AllocateCompactionOutputFileNumbers(CompactionState* compact);
|
||||
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
|
||||
|
||||
void PurgeObsoleteWALFiles();
|
||||
|
||||
Status AppendSortedWalsOfType(const std::string& path,
|
||||
VectorLogPtr& log_files,
|
||||
WalFileType type);
|
||||
|
||||
// Requires: all_logs should be sorted with earliest log file first
|
||||
// Retains all log files in all_logs which contain updates with seq no.
|
||||
// greater than or equal to the requested SequenceNumber.
|
||||
Status RetainProbableWalFiles(VectorLogPtr& all_logs,
|
||||
const SequenceNumber target);
|
||||
// return true if
|
||||
bool CheckWalFileExistsAndEmpty(const WalFileType type,
|
||||
const uint64_t number);
|
||||
|
||||
Status ReadFirstRecord(const WalFileType type, const uint64_t number,
|
||||
WriteBatch* const result);
|
||||
|
||||
Status ReadFirstLine(const std::string& fname, WriteBatch* const batch);
|
||||
|
||||
void PrintStatistics();
|
||||
|
||||
// dump rocksdb.stats to LOG
|
||||
void MaybeDumpStats();
|
||||
|
||||
// Return the minimum empty level that could hold the total data in the
|
||||
// input level. Return the input level, if such level could not be found.
|
||||
int FindMinimumEmptyLevelFitting(int level);
|
||||
|
||||
// Move the files in the input level to the target level.
|
||||
// If target_level < 0, automatically calculate the minimum level that could
|
||||
// hold the data set.
|
||||
void ReFitLevel(int level, int target_level = -1);
|
||||
|
||||
// Constant after construction
|
||||
const InternalFilterPolicy internal_filter_policy_;
|
||||
bool owns_info_log_;
|
||||
|
||||
// table_cache_ provides its own synchronization
|
||||
unique_ptr<TableCache> table_cache_;
|
||||
|
||||
// Lock over the persistent DB state. Non-nullptr iff successfully acquired.
|
||||
FileLock* db_lock_;
|
||||
|
||||
// State below is protected by mutex_
|
||||
port::Mutex mutex_;
|
||||
port::AtomicPointer shutting_down_;
|
||||
port::CondVar bg_cv_; // Signalled when background work finishes
|
||||
std::shared_ptr<MemTableRepFactory> mem_rep_factory_;
|
||||
MemTable* mem_;
|
||||
MemTableList imm_; // Memtables that are not changing
|
||||
uint64_t logfile_number_;
|
||||
unique_ptr<log::Writer> log_;
|
||||
|
||||
std::string host_name_;
|
||||
|
||||
// Queue of writers.
|
||||
std::deque<Writer*> writers_;
|
||||
WriteBatch tmp_batch_;
|
||||
|
||||
SnapshotList snapshots_;
|
||||
|
||||
// Set of table files to protect from deletion because they are
|
||||
// part of ongoing compactions.
|
||||
std::set<uint64_t> pending_outputs_;
|
||||
|
||||
// count of how many background compactions have been scheduled or are running
|
||||
int bg_compaction_scheduled_;
|
||||
|
||||
// number of background memtable flush jobs, submitted to the HIGH pool
|
||||
int bg_flush_scheduled_;
|
||||
|
||||
// Has a background stats log thread scheduled?
|
||||
bool bg_logstats_scheduled_;
|
||||
|
||||
// Information for a manual compaction
|
||||
struct ManualCompaction {
|
||||
int level;
|
||||
bool done;
|
||||
bool in_progress; // compaction request being processed?
|
||||
const InternalKey* begin; // nullptr means beginning of key range
|
||||
const InternalKey* end; // nullptr means end of key range
|
||||
InternalKey tmp_storage; // Used to keep track of compaction progress
|
||||
};
|
||||
ManualCompaction* manual_compaction_;
|
||||
|
||||
// Have we encountered a background error in paranoid mode?
|
||||
Status bg_error_;
|
||||
|
||||
std::unique_ptr<StatsLogger> logger_;
|
||||
|
||||
int64_t volatile last_log_ts;
|
||||
|
||||
// shall we disable deletion of obsolete files
|
||||
bool disable_delete_obsolete_files_;
|
||||
|
||||
// last time when DeleteObsoleteFiles was invoked
|
||||
uint64_t delete_obsolete_files_last_run_;
|
||||
|
||||
// last time when PurgeObsoleteWALFiles ran.
|
||||
uint64_t purge_wal_files_last_run_;
|
||||
|
||||
// last time stats were dumped to LOG
|
||||
std::atomic<uint64_t> last_stats_dump_time_microsec_;
|
||||
|
||||
// obsolete files will be deleted every this seconds if ttl deletion is
|
||||
// enabled and archive size_limit is disabled.
|
||||
uint64_t default_interval_to_delete_obsolete_WAL_;
|
||||
|
||||
// These count the number of microseconds for which MakeRoomForWrite stalls.
|
||||
uint64_t stall_level0_slowdown_;
|
||||
uint64_t stall_memtable_compaction_;
|
||||
uint64_t stall_level0_num_files_;
|
||||
std::vector<uint64_t> stall_leveln_slowdown_;
|
||||
uint64_t stall_level0_slowdown_count_;
|
||||
uint64_t stall_memtable_compaction_count_;
|
||||
uint64_t stall_level0_num_files_count_;
|
||||
std::vector<uint64_t> stall_leveln_slowdown_count_;
|
||||
|
||||
// Time at which this instance was started.
|
||||
const uint64_t started_at_;
|
||||
|
||||
bool flush_on_destroy_; // Used when disableWAL is true.
|
||||
|
||||
// Per level compaction stats. stats_[level] stores the stats for
|
||||
// compactions that produced data for the specified "level".
|
||||
struct CompactionStats {
|
||||
uint64_t micros;
|
||||
|
||||
// Bytes read from level N during compaction between levels N and N+1
|
||||
int64_t bytes_readn;
|
||||
|
||||
// Bytes read from level N+1 during compaction between levels N and N+1
|
||||
int64_t bytes_readnp1;
|
||||
|
||||
// Total bytes written during compaction between levels N and N+1
|
||||
int64_t bytes_written;
|
||||
|
||||
// Files read from level N during compaction between levels N and N+1
|
||||
int files_in_leveln;
|
||||
|
||||
// Files read from level N+1 during compaction between levels N and N+1
|
||||
int files_in_levelnp1;
|
||||
|
||||
// Files written during compaction between levels N and N+1
|
||||
int files_out_levelnp1;
|
||||
|
||||
// Number of compactions done
|
||||
int count;
|
||||
|
||||
CompactionStats() : micros(0), bytes_readn(0), bytes_readnp1(0),
|
||||
bytes_written(0), files_in_leveln(0),
|
||||
files_in_levelnp1(0), files_out_levelnp1(0),
|
||||
count(0) { }
|
||||
|
||||
void Add(const CompactionStats& c) {
|
||||
this->micros += c.micros;
|
||||
this->bytes_readn += c.bytes_readn;
|
||||
this->bytes_readnp1 += c.bytes_readnp1;
|
||||
this->bytes_written += c.bytes_written;
|
||||
this->files_in_leveln += c.files_in_leveln;
|
||||
this->files_in_levelnp1 += c.files_in_levelnp1;
|
||||
this->files_out_levelnp1 += c.files_out_levelnp1;
|
||||
this->count += 1;
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<CompactionStats> stats_;
|
||||
|
||||
// Used to compute per-interval statistics
|
||||
struct StatsSnapshot {
|
||||
uint64_t bytes_read_;
|
||||
uint64_t bytes_written_;
|
||||
uint64_t bytes_new_;
|
||||
double seconds_up_;
|
||||
|
||||
StatsSnapshot() : bytes_read_(0), bytes_written_(0),
|
||||
bytes_new_(0), seconds_up_(0) {}
|
||||
};
|
||||
|
||||
StatsSnapshot last_stats_;
|
||||
|
||||
static const int KEEP_LOG_FILE_NUM = 1000;
|
||||
std::string db_absolute_path_;
|
||||
|
||||
// count of the number of contiguous delaying writes
|
||||
int delayed_writes_;
|
||||
|
||||
// The options to access storage files
|
||||
const EnvOptions storage_options_;
|
||||
|
||||
// A value of true temporarily disables scheduling of background work
|
||||
bool bg_work_gate_closed_;
|
||||
|
||||
// Guard against multiple concurrent refitting
|
||||
bool refitting_level_;
|
||||
|
||||
// No copying allowed
|
||||
DBImpl(const DBImpl&);
|
||||
void operator=(const DBImpl&);
|
||||
|
||||
// dump the delayed_writes_ to the log file and reset counter.
|
||||
void DelayLoggingAndReset();
|
||||
|
||||
// Return the earliest snapshot where seqno is visible.
|
||||
// Store the snapshot right before that, if any, in prev_snapshot
|
||||
inline SequenceNumber findEarliestVisibleSnapshot(
|
||||
SequenceNumber in,
|
||||
std::vector<SequenceNumber>& snapshots,
|
||||
SequenceNumber* prev_snapshot);
|
||||
|
||||
// Function that Get and KeyMayExist call with no_io true or false
|
||||
// Note: 'value_found' from KeyMayExist propagates here
|
||||
Status GetImpl(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found = nullptr);
|
||||
};
|
||||
|
||||
// Sanitize db options. The caller should delete result.info_log if
|
||||
// it is not equal to src.info_log.
|
||||
extern Options SanitizeOptions(const std::string& db,
|
||||
const InternalKeyComparator* icmp,
|
||||
const InternalFilterPolicy* ipolicy,
|
||||
const Options& src);
|
||||
|
||||
|
||||
// Determine compression type, based on user options, level of the output
|
||||
// file and whether compression is disabled.
|
||||
// If enable_compression is false, then compression is always disabled no
|
||||
// matter what the values of the other two parameters are.
|
||||
// Otherwise, the compression type is determined based on options and level.
|
||||
CompressionType GetCompressionType(const Options& options, int level,
|
||||
const bool enable_compression);
|
||||
|
||||
} // namespace rocksdb
|
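
The KeyMayExist() comment above describes a three-way outcome: a false return is definitive, a true return is only "maybe", and value_found reports whether the value was actually fetched from memory. A small sketch of how a caller might act on that, assuming the method is reachable through the public DB handle as declared here:

#include <string>
#include "rocksdb/db.h"

bool ProbeKey(rocksdb::DB* db, const rocksdb::Slice& key, std::string* value) {
  bool value_found = false;
  if (!db->KeyMayExist(rocksdb::ReadOptions(), key, value, &value_found)) {
    return false;  // definitely not present, no I/O was needed
  }
  if (value_found) {
    return true;   // *value was filled from the memtable, still no I/O
  }
  // "Maybe": fall back to a real Get(), which may touch the sst files.
  return db->Get(rocksdb::ReadOptions(), key, value).ok();
}
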
99
db/db_impl_readonly.cc
Normal file
@ -0,0 +1,99 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2012 Facebook. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "db/db_impl_readonly.h"
|
||||
#include "db/db_impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "db/db_iter.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/log_reader.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "port/port.h"
|
||||
#include "table/block.h"
|
||||
#include "table/merger.h"
|
||||
#include "table/two_level_iterator.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/build_version.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
DBImplReadOnly::DBImplReadOnly(const Options& options,
|
||||
const std::string& dbname)
|
||||
: DBImpl(options, dbname) {
|
||||
Log(options_.info_log, "Opening the db in read only mode");
|
||||
}
|
||||
|
||||
DBImplReadOnly::~DBImplReadOnly() {
|
||||
}
|
||||
|
||||
// Implementations of the DB interface
|
||||
Status DBImplReadOnly::Get(const ReadOptions& options,
|
||||
const Slice& key,
|
||||
std::string* value) {
|
||||
Status s;
|
||||
MemTable* mem = GetMemTable();
|
||||
Version* current = versions_->current();
|
||||
SequenceNumber snapshot = versions_->LastSequence();
|
||||
std::deque<std::string> merge_operands;
|
||||
LookupKey lkey(key, snapshot);
|
||||
if (mem->Get(lkey, value, &s, &merge_operands, options_)) {
|
||||
} else {
|
||||
Version::GetStats stats;
|
||||
current->Get(options, lkey, value, &s, &merge_operands, &stats, options_);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) {
|
||||
SequenceNumber latest_snapshot;
|
||||
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
|
||||
return NewDBIterator(
|
||||
&dbname_, env_, options_, user_comparator(), internal_iter,
|
||||
(options.snapshot != nullptr
|
||||
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
|
||||
: latest_snapshot));
|
||||
}
|
||||
|
||||
|
||||
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
|
||||
DB** dbptr, bool error_if_log_file_exist) {
|
||||
*dbptr = nullptr;
|
||||
|
||||
DBImplReadOnly* impl = new DBImplReadOnly(options, dbname);
|
||||
impl->mutex_.Lock();
|
||||
VersionEdit edit(impl->NumberLevels());
|
||||
Status s = impl->Recover(&edit, impl->GetMemTable(),
|
||||
error_if_log_file_exist);
|
||||
impl->mutex_.Unlock();
|
||||
if (s.ok()) {
|
||||
*dbptr = impl;
|
||||
} else {
|
||||
delete impl;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
}
|
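
A short usage sketch for the read-only open path defined above: the database path is hypothetical and error handling is reduced to status checks.

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"

void ReadOnlyScan() {
  rocksdb::Options options;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenForReadOnly(
      options, "/tmp/existing_db", &db, false /* error_if_log_file_exist */);
  if (!s.ok()) {
    return;
  }

  // Point lookups and scans go through DBImplReadOnly::Get/NewIterator.
  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
  delete it;
  delete db;
}
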
78
db/db_impl_readonly.h
Normal file
@ -0,0 +1,78 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#pragma once
#include "db/db_impl.h"

#include <deque>
#include <set>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/stats_logger.h"

namespace rocksdb {

class DBImplReadOnly : public DBImpl {
 public:
  DBImplReadOnly(const Options& options, const std::string& dbname);
  virtual ~DBImplReadOnly();

  // Implementations of the DB interface
  virtual Status Get(const ReadOptions& options,
                     const Slice& key,
                     std::string* value);

  // TODO: Implement ReadOnly MultiGet?

  virtual Iterator* NewIterator(const ReadOptions&);

  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
  virtual Status Merge(const WriteOptions&, const Slice& key,
                       const Slice& value) {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
  virtual Status Delete(const WriteOptions&, const Slice& key) {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
  virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
  virtual void CompactRange(const Slice* begin, const Slice* end,
                            bool reduce_level = false, int target_level = -1) {
  }
  virtual Status DisableFileDeletions() {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
  virtual Status EnableFileDeletions() {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
  virtual Status GetLiveFiles(std::vector<std::string>&,
                              uint64_t* manifest_file_size,
                              bool flush_memtable = true) {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
  virtual Status Flush(const FlushOptions& options) {
    return Status::NotSupported("Not supported operation in read only mode.");
  }

 private:
  friend class DB;

  // No copying allowed
  DBImplReadOnly(const DBImplReadOnly&);
  void operator=(const DBImplReadOnly&);
};

}
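
The stubs above turn every mutating call into Status::NotSupported, so a caller can detect the open mode from the returned status alone. A minimal sketch, assuming db came from DB::OpenForReadOnly():

#include "rocksdb/db.h"

bool TryWrite(rocksdb::DB* db) {
  rocksdb::Status s = db->Put(rocksdb::WriteOptions(), "key", "value");
  // For a read-only handle this fails with "Not supported operation in
  // read only mode." rather than touching the database.
  return s.ok();
}
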
481
db/db_iter.cc
Normal file
@ -0,0 +1,481 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/db_iter.h"
|
||||
#include <stdexcept>
|
||||
#include <deque>
|
||||
|
||||
#include "db/filename.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "port/port.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/perf_context_imp.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
#if 0
|
||||
static void DumpInternalIter(Iterator* iter) {
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
ParsedInternalKey k;
|
||||
if (!ParseInternalKey(iter->key(), &k)) {
|
||||
fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
|
||||
} else {
|
||||
fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
// Memtables and sstables that make the DB representation contain
|
||||
// (userkey,seq,type) => uservalue entries. DBIter
|
||||
// combines multiple entries for the same userkey found in the DB
|
||||
// representation into a single entry while accounting for sequence
|
||||
// numbers, deletion markers, overwrites, etc.
|
||||
class DBIter: public Iterator {
|
||||
public:
|
||||
// The following is grossly complicated. TODO: clean it up
|
||||
// Which direction is the iterator currently moving?
|
||||
// (1) When moving forward, the internal iterator is positioned at
|
||||
// the exact entry that yields this->key(), this->value()
|
||||
// (2) When moving backwards, the internal iterator is positioned
|
||||
// just before all entries whose user key == this->key().
|
||||
enum Direction {
|
||||
kForward,
|
||||
kReverse
|
||||
};
|
||||
|
||||
DBIter(const std::string* dbname, Env* env, const Options& options,
|
||||
const Comparator* cmp, Iterator* iter, SequenceNumber s)
|
||||
: dbname_(dbname),
|
||||
env_(env),
|
||||
logger_(options.info_log),
|
||||
user_comparator_(cmp),
|
||||
user_merge_operator_(options.merge_operator.get()),
|
||||
iter_(iter),
|
||||
sequence_(s),
|
||||
direction_(kForward),
|
||||
valid_(false),
|
||||
current_entry_is_merged_(false),
|
||||
statistics_(options.statistics) {
|
||||
RecordTick(statistics_, NO_ITERATORS, 1);
|
||||
max_skip_ = options.max_sequential_skip_in_iterations;
|
||||
}
|
||||
virtual ~DBIter() {
|
||||
RecordTick(statistics_, NO_ITERATORS, -1);
|
||||
delete iter_;
|
||||
}
|
||||
virtual bool Valid() const { return valid_; }
|
||||
virtual Slice key() const {
|
||||
assert(valid_);
|
||||
return saved_key_;
|
||||
}
|
||||
virtual Slice value() const {
|
||||
assert(valid_);
|
||||
return (direction_ == kForward && !current_entry_is_merged_) ?
|
||||
iter_->value() : saved_value_;
|
||||
}
|
||||
virtual Status status() const {
|
||||
if (status_.ok()) {
|
||||
return iter_->status();
|
||||
} else {
|
||||
return status_;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Next();
|
||||
virtual void Prev();
|
||||
virtual void Seek(const Slice& target);
|
||||
virtual void SeekToFirst();
|
||||
virtual void SeekToLast();
|
||||
|
||||
private:
|
||||
void FindNextUserEntry(bool skipping);
|
||||
void FindPrevUserEntry();
|
||||
bool ParseKey(ParsedInternalKey* key);
|
||||
void MergeValuesNewToOld();
|
||||
|
||||
inline void SaveKey(const Slice& k, std::string* dst) {
|
||||
dst->assign(k.data(), k.size());
|
||||
}
|
||||
|
||||
inline void ClearSavedValue() {
|
||||
if (saved_value_.capacity() > 1048576) {
|
||||
std::string empty;
|
||||
swap(empty, saved_value_);
|
||||
} else {
|
||||
saved_value_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
const std::string* const dbname_;
|
||||
Env* const env_;
|
||||
shared_ptr<Logger> logger_;
|
||||
const Comparator* const user_comparator_;
|
||||
const MergeOperator* const user_merge_operator_;
|
||||
Iterator* const iter_;
|
||||
SequenceNumber const sequence_;
|
||||
|
||||
Status status_;
|
||||
std::string saved_key_; // == current key when direction_==kReverse
|
||||
std::string saved_value_; // == current raw value when direction_==kReverse
|
||||
std::string skip_key_;
|
||||
Direction direction_;
|
||||
bool valid_;
|
||||
bool current_entry_is_merged_;
|
||||
std::shared_ptr<Statistics> statistics_;
|
||||
uint64_t max_skip_;
|
||||
|
||||
// No copying allowed
|
||||
DBIter(const DBIter&);
|
||||
void operator=(const DBIter&);
|
||||
};
|
||||
|
||||
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
|
||||
if (!ParseInternalKey(iter_->key(), ikey)) {
|
||||
status_ = Status::Corruption("corrupted internal key in DBIter");
|
||||
Log(logger_, "corrupted internal key in DBIter: %s",
|
||||
iter_->key().ToString(true).c_str());
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::Next() {
|
||||
assert(valid_);
|
||||
|
||||
if (direction_ == kReverse) { // Switch directions?
|
||||
direction_ = kForward;
|
||||
// iter_ is pointing just before the entries for this->key(),
|
||||
// so advance into the range of entries for this->key() and then
|
||||
// use the normal skipping code below.
|
||||
if (!iter_->Valid()) {
|
||||
iter_->SeekToFirst();
|
||||
} else {
|
||||
iter_->Next();
|
||||
}
|
||||
if (!iter_->Valid()) {
|
||||
valid_ = false;
|
||||
saved_key_.clear();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// If the current value is merged, we might already hit end of iter_
|
||||
if (!iter_->Valid()) {
|
||||
valid_ = false;
|
||||
return;
|
||||
}
|
||||
FindNextUserEntry(true /* skipping the current user key */);
|
||||
}
|
||||
|
||||
|
||||
// PRE: saved_key_ has the current user key if skipping
|
||||
// POST: saved_key_ should have the next user key if valid_,
|
||||
// if the current entry is a result of merge
|
||||
// current_entry_is_merged_ => true
|
||||
// saved_value_ => the merged value
|
||||
//
|
||||
// NOTE: In between, saved_key_ can point to a user key that has
|
||||
// a delete marker
|
||||
void DBIter::FindNextUserEntry(bool skipping) {
|
||||
// Loop until we hit an acceptable entry to yield
|
||||
assert(iter_->Valid());
|
||||
assert(direction_ == kForward);
|
||||
current_entry_is_merged_ = false;
|
||||
uint64_t num_skipped = 0;
|
||||
do {
|
||||
ParsedInternalKey ikey;
|
||||
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
|
||||
if (skipping &&
|
||||
user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
|
||||
num_skipped++; // skip this entry
|
||||
BumpPerfCount(&perf_context.internal_key_skipped_count);
|
||||
} else {
|
||||
skipping = false;
|
||||
switch (ikey.type) {
|
||||
case kTypeDeletion:
|
||||
// Arrange to skip all upcoming entries for this key since
|
||||
// they are hidden by this deletion.
|
||||
SaveKey(ikey.user_key, &saved_key_);
|
||||
skipping = true;
|
||||
num_skipped = 0;
|
||||
BumpPerfCount(&perf_context.internal_delete_skipped_count);
|
||||
break;
|
||||
case kTypeValue:
|
||||
valid_ = true;
|
||||
SaveKey(ikey.user_key, &saved_key_);
|
||||
return;
|
||||
case kTypeMerge:
|
||||
// By now, we are sure the current ikey is going to yield a value
|
||||
SaveKey(ikey.user_key, &saved_key_);
|
||||
current_entry_is_merged_ = true;
|
||||
valid_ = true;
|
||||
MergeValuesNewToOld(); // Go to a different state machine
|
||||
return;
|
||||
case kTypeLogData:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// If we have sequentially iterated via numerous keys and still not
|
||||
// found the next user-key, then it is better to seek so that we can
|
||||
// avoid too many key comparisons. We seek to the last occurrence of
|
||||
// our current key by looking for sequence number 0.
|
||||
if (skipping && num_skipped > max_skip_) {
|
||||
num_skipped = 0;
|
||||
std::string last_key;
|
||||
AppendInternalKey(&last_key,
|
||||
ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
|
||||
iter_->Seek(last_key);
|
||||
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
|
||||
} else {
|
||||
iter_->Next();
|
||||
}
|
||||
} while (iter_->Valid());
|
||||
valid_ = false;
|
||||
}
|
||||
|
||||
// Merge values of the same user key starting from the current iter_ position
|
||||
// Scan from the newer entries to older entries.
|
||||
// PRE: iter_->key() points to the first merge type entry
|
||||
// saved_key_ stores the user key
|
||||
// POST: saved_value_ has the merged value for the user key
|
||||
// iter_ points to the next entry (or invalid)
|
||||
void DBIter::MergeValuesNewToOld() {
|
||||
if (!user_merge_operator_) {
|
||||
Log(logger_, "Options::merge_operator is null.");
|
||||
throw std::logic_error("DBIter::MergeValuesNewToOld() with"
|
||||
" Options::merge_operator null");
|
||||
}
|
||||
|
||||
// Start the merge process by pushing the first operand
|
||||
std::deque<std::string> operands;
|
||||
operands.push_front(iter_->value().ToString());
|
||||
|
||||
std::string merge_result; // Temporary string to hold merge result later
|
||||
ParsedInternalKey ikey;
|
||||
for (iter_->Next(); iter_->Valid(); iter_->Next()) {
|
||||
if (!ParseKey(&ikey)) {
|
||||
// skip corrupted key
|
||||
continue;
|
||||
}
|
||||
|
||||
if (user_comparator_->Compare(ikey.user_key, saved_key_) != 0) {
|
||||
// hit the next user key, stop right here
|
||||
break;
|
||||
}
|
||||
|
||||
if (kTypeDeletion == ikey.type) {
|
||||
// hit a delete with the same user key, stop right here
|
||||
// iter_ is positioned after delete
|
||||
iter_->Next();
|
||||
break;
|
||||
}
|
||||
|
||||
if (kTypeValue == ikey.type) {
|
||||
// hit a put, merge the put value with operands and store the
|
||||
// final result in saved_value_. We are done!
|
||||
// ignore corruption if there is any.
|
||||
const Slice value = iter_->value();
|
||||
user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
|
||||
&saved_value_, logger_.get());
|
||||
// iter_ is positioned after put
|
||||
iter_->Next();
|
||||
return;
|
||||
}
|
||||
|
||||
if (kTypeMerge == ikey.type) {
|
||||
// hit a merge, add the value as an operand and run associative merge.
|
||||
// when complete, add result to operands and continue.
|
||||
const Slice& value = iter_->value();
|
||||
operands.push_front(value.ToString());
|
||||
while(operands.size() >= 2) {
|
||||
// Call user associative-merge until it returns false
|
||||
if (user_merge_operator_->PartialMerge(ikey.user_key,
|
||||
Slice(operands[0]),
|
||||
Slice(operands[1]),
|
||||
&merge_result,
|
||||
logger_.get())) {
|
||||
operands.pop_front();
|
||||
swap(operands.front(), merge_result);
|
||||
} else {
|
||||
// Associative merge returns false ==> stack the operands
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// we either exhausted all internal keys under this user key, or hit
|
||||
// a deletion marker.
|
||||
// feed null as the existing value to the merge operator, such that
|
||||
// client can differentiate this scenario and do things accordingly.
|
||||
user_merge_operator_->FullMerge(saved_key_, nullptr, operands,
|
||||
&saved_value_, logger_.get());
|
||||
}
|
||||
|
||||
void DBIter::Prev() {
|
||||
assert(valid_);
|
||||
|
||||
// Throw an exception now if merge_operator is provided
|
||||
// TODO: support backward iteration
|
||||
if (user_merge_operator_) {
|
||||
Log(logger_, "Prev not supported yet if merge_operator is provided");
|
||||
throw std::logic_error("DBIter::Prev backward iteration not supported"
|
||||
" if merge_operator is provided");
|
||||
}
|
||||
|
||||
if (direction_ == kForward) { // Switch directions?
|
||||
// iter_ is pointing at the current entry. Scan backwards until
|
||||
// the key changes so we can use the normal reverse scanning code.
|
||||
assert(iter_->Valid()); // Otherwise valid_ would have been false
|
||||
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
|
||||
while (true) {
|
||||
iter_->Prev();
|
||||
if (!iter_->Valid()) {
|
||||
valid_ = false;
|
||||
saved_key_.clear();
|
||||
ClearSavedValue();
|
||||
return;
|
||||
}
|
||||
if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
|
||||
saved_key_) < 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
direction_ = kReverse;
|
||||
}
|
||||
|
||||
FindPrevUserEntry();
|
||||
}
|
||||
|
||||
void DBIter::FindPrevUserEntry() {
|
||||
assert(direction_ == kReverse);
|
||||
uint64_t num_skipped = 0;
|
||||
|
||||
ValueType value_type = kTypeDeletion;
|
||||
if (iter_->Valid()) {
|
||||
do {
|
||||
ParsedInternalKey ikey;
|
||||
bool saved_key_cleared = false;
|
||||
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
|
||||
if ((value_type != kTypeDeletion) &&
|
||||
user_comparator_->Compare(ikey.user_key, saved_key_) < 0) {
|
||||
// We encountered a non-deleted value in entries for previous keys,
|
||||
break;
|
||||
}
|
||||
value_type = ikey.type;
|
||||
if (value_type == kTypeDeletion) {
|
||||
saved_key_.clear();
|
||||
ClearSavedValue();
|
||||
saved_key_cleared = true;
|
||||
} else {
|
||||
Slice raw_value = iter_->value();
|
||||
if (saved_value_.capacity() > raw_value.size() + 1048576) {
|
||||
std::string empty;
|
||||
swap(empty, saved_value_);
|
||||
}
|
||||
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
|
||||
saved_value_.assign(raw_value.data(), raw_value.size());
|
||||
}
|
||||
}
|
||||
num_skipped++;
|
||||
// If we have sequentially iterated via numerous keys and still not
|
||||
// found the prev user-key, then it is better to seek so that we can
|
||||
// avoid too many key comparisons. We seek to the first occurrence of
|
||||
// our current key by looking for max sequence number.
|
||||
if (!saved_key_cleared && num_skipped > max_skip_) {
|
||||
num_skipped = 0;
|
||||
std::string last_key;
|
||||
AppendInternalKey(&last_key,
|
||||
ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
|
||||
kValueTypeForSeek));
|
||||
iter_->Seek(last_key);
|
||||
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
|
||||
} else {
|
||||
iter_->Prev();
|
||||
}
|
||||
} while (iter_->Valid());
|
||||
}
|
||||
|
||||
if (value_type == kTypeDeletion) {
|
||||
// End
|
||||
valid_ = false;
|
||||
saved_key_.clear();
|
||||
ClearSavedValue();
|
||||
direction_ = kForward;
|
||||
} else {
|
||||
valid_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::Seek(const Slice& target) {
|
||||
direction_ = kForward;
|
||||
ClearSavedValue();
|
||||
saved_key_.clear();
|
||||
AppendInternalKey(
|
||||
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
|
||||
iter_->Seek(saved_key_);
|
||||
if (iter_->Valid()) {
|
||||
FindNextUserEntry(false /*not skipping */);
|
||||
} else {
|
||||
valid_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::SeekToFirst() {
|
||||
direction_ = kForward;
|
||||
ClearSavedValue();
|
||||
iter_->SeekToFirst();
|
||||
if (iter_->Valid()) {
|
||||
FindNextUserEntry(false /* not skipping */);
|
||||
} else {
|
||||
valid_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::SeekToLast() {
|
||||
// Throw an exception for now if merge_operator is provided
|
||||
// TODO: support backward iteration
|
||||
if (user_merge_operator_) {
|
||||
Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
|
||||
throw std::logic_error("DBIter::SeekToLast: backward iteration not"
|
||||
" supported if merge_operator is provided");
|
||||
}
|
||||
|
||||
direction_ = kReverse;
|
||||
ClearSavedValue();
|
||||
iter_->SeekToLast();
|
||||
FindPrevUserEntry();
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
Iterator* NewDBIterator(
|
||||
const std::string* dbname,
|
||||
Env* env,
|
||||
const Options& options,
|
||||
const Comparator *user_key_comparator,
|
||||
Iterator* internal_iter,
|
||||
const SequenceNumber& sequence) {
|
||||
return new DBIter(dbname, env, options, user_key_comparator,
|
||||
internal_iter, sequence);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
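
The reseek logic in FindNextUserEntry()/FindPrevUserEntry() above is bounded by a single tunable that the DBIter constructor reads from Options. A sketch of lowering it for workloads with heavily overwritten keys; the value 8 is purely illustrative.

#include "rocksdb/options.h"

rocksdb::Options TightSkipOptions() {
  rocksdb::Options options;
  // After this many consecutive internal entries for the same user key,
  // DBIter stops stepping with Next()/Prev() and issues a Seek() instead
  // (see the num_skipped > max_skip_ branches above).
  options.max_sequential_skip_in_iterations = 8;  // illustrative value
  return options;
}
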
28
db/db_iter.h
Normal file
@ -0,0 +1,28 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <stdint.h>
#include "rocksdb/db.h"
#include "db/dbformat.h"

namespace rocksdb {

// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
    const std::string* dbname,
    Env* env,
    const Options& options,
    const Comparator *user_key_comparator,
    Iterator* internal_iter,
    const SequenceNumber& sequence);

} // namespace rocksdb
65
db/db_statistics.h
Normal file
@ -0,0 +1,65 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <cassert>
#include <stdlib.h>
#include <vector>
#include <memory>

#include "rocksdb/statistics.h"
#include "util/histogram.h"
#include "port/port.h"
#include "util/mutexlock.h"


namespace rocksdb {

class DBStatistics: public Statistics {
 public:
  DBStatistics() : allTickers_(TICKER_ENUM_MAX),
                   allHistograms_(HISTOGRAM_ENUM_MAX) { }

  virtual ~DBStatistics() {}

  virtual long getTickerCount(Tickers tickerType) {
    assert(tickerType < TICKER_ENUM_MAX);
    return allTickers_[tickerType].getCount();
  }

  virtual void setTickerCount(Tickers tickerType, uint64_t count) {
    assert(tickerType < TICKER_ENUM_MAX);
    allTickers_[tickerType].setTickerCount(count);
  }

  virtual void recordTick(Tickers tickerType, uint64_t count) {
    assert(tickerType < TICKER_ENUM_MAX);
    allTickers_[tickerType].recordTick(count);
  }

  virtual void measureTime(Histograms histogramType, uint64_t value) {
    assert(histogramType < HISTOGRAM_ENUM_MAX);
    allHistograms_[histogramType].Add(value);
  }

  virtual void histogramData(Histograms histogramType,
                             HistogramData * const data) {
    assert(histogramType < HISTOGRAM_ENUM_MAX);
    allHistograms_[histogramType].Data(data);
  }

  std::vector<Ticker> allTickers_;
  std::vector<HistogramImpl> allHistograms_;
};

std::shared_ptr<Statistics> CreateDBStatistics() {
  return std::make_shared<DBStatistics>();
}

} // namespace rocksdb
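
A small sketch of wiring a statistics object like the one above into a database and reading a ticker back. It assumes CreateDBStatistics() is visible to the caller through the public statistics header; NO_ITERATORS is the ticker the iterator code in this commit bumps, and the database path is hypothetical.

#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/statistics.h"

void StatsSketch() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();  // the factory above

  rocksdb::DB* db = nullptr;
  if (!rocksdb::DB::Open(options, "/tmp/stats_db", &db).ok()) {
    return;
  }

  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
  delete it;

  // DBIter records +1 on NO_ITERATORS at construction and -1 at destruction,
  // so the ticker should be back to its previous value here.
  long open_iterators = options.statistics->getTickerCount(rocksdb::NO_ITERATORS);
  (void)open_iterators;
  delete db;
}
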
93
db/db_stats_logger.cc
Normal file
@ -0,0 +1,93 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include "db/version_set.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "port/port.h"
|
||||
#include "util/mutexlock.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
void DBImpl::MaybeScheduleLogDBDeployStats() {
|
||||
|
||||
// There is a lock in the actual logger.
|
||||
if (!logger_ || options_.db_stats_log_interval < 0
|
||||
|| host_name_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
|
||||
// Already scheduled
|
||||
} else {
|
||||
int64_t current_ts = 0;
|
||||
Status st = env_->GetCurrentTime(&current_ts);
|
||||
if (!st.ok()) {
|
||||
return;
|
||||
}
|
||||
if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
|
||||
return;
|
||||
}
|
||||
last_log_ts = current_ts;
|
||||
bg_logstats_scheduled_ = true;
|
||||
env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
|
||||
}
|
||||
}
|
||||
|
||||
void DBImpl::BGLogDBDeployStats(void* db) {
|
||||
DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
|
||||
db_inst->LogDBDeployStats();
|
||||
}
|
||||
|
||||
void DBImpl::LogDBDeployStats() {
|
||||
mutex_.Lock();
|
||||
|
||||
if (shutting_down_.Acquire_Load()) {
|
||||
bg_logstats_scheduled_ = false;
|
||||
bg_cv_.SignalAll();
|
||||
mutex_.Unlock();
|
||||
return;
|
||||
}
|
||||
|
||||
char tmp_ver[100];
|
||||
sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
|
||||
std::string version_info(tmp_ver);
|
||||
|
||||
uint64_t file_total_size = 0;
|
||||
uint32_t file_total_num = 0;
|
||||
for (int i = 0; i < versions_->NumberLevels(); i++) {
|
||||
file_total_num += versions_->NumLevelFiles(i);
|
||||
file_total_size += versions_->NumLevelBytes(i);
|
||||
}
|
||||
|
||||
VersionSet::LevelSummaryStorage scratch;
|
||||
const char* file_num_summary = versions_->LevelSummary(&scratch);
|
||||
std::string file_num_per_level(file_num_summary);
|
||||
std::string data_size_per_level(file_num_summary);
|
||||
|
||||
mutex_.Unlock();
|
||||
|
||||
int64_t unix_ts;
|
||||
env_->GetCurrentTime(&unix_ts);
|
||||
|
||||
logger_->Log_Deploy_Stats(version_info, host_name_,
|
||||
db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
|
||||
data_size_per_level, unix_ts);
|
||||
|
||||
mutex_.Lock();
|
||||
bg_logstats_scheduled_ = false;
|
||||
bg_cv_.SignalAll();
|
||||
mutex_.Unlock();
|
||||
}
|
||||
|
||||
}
|
4914
db/db_test.cc
Normal file
File diff suppressed because it is too large
147
db/dbformat.cc
Normal file
@ -0,0 +1,147 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <stdio.h>
|
||||
#include "db/dbformat.h"
|
||||
#include "port/port.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/perf_context_imp.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
|
||||
assert(seq <= kMaxSequenceNumber);
|
||||
assert(t <= kValueTypeForSeek);
|
||||
return (seq << 8) | t;
|
||||
}
|
||||
|
||||
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
|
||||
result->append(key.user_key.data(), key.user_key.size());
|
||||
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
|
||||
}
|
||||
|
||||
std::string ParsedInternalKey::DebugString(bool hex) const {
|
||||
char buf[50];
|
||||
snprintf(buf, sizeof(buf), "' @ %llu : %d",
|
||||
(unsigned long long) sequence,
|
||||
int(type));
|
||||
std::string result = "'";
|
||||
result += user_key.ToString(hex);
|
||||
result += buf;
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string InternalKey::DebugString(bool hex) const {
|
||||
std::string result;
|
||||
ParsedInternalKey parsed;
|
||||
if (ParseInternalKey(rep_, &parsed)) {
|
||||
result = parsed.DebugString(hex);
|
||||
} else {
|
||||
result = "(bad)";
|
||||
result.append(EscapeString(rep_));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
const char* InternalKeyComparator::Name() const {
|
||||
return name_.c_str();
|
||||
}
|
||||
|
||||
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
|
||||
// Order by:
|
||||
// increasing user key (according to user-supplied comparator)
|
||||
// decreasing sequence number
|
||||
// decreasing type (though sequence# should be enough to disambiguate)
|
||||
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
|
||||
BumpPerfCount(&perf_context.user_key_comparison_count);
|
||||
if (r == 0) {
|
||||
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
|
||||
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
|
||||
if (anum > bnum) {
|
||||
r = -1;
|
||||
} else if (anum < bnum) {
|
||||
r = +1;
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
void InternalKeyComparator::FindShortestSeparator(
|
||||
std::string* start,
|
||||
const Slice& limit) const {
|
||||
// Attempt to shorten the user portion of the key
|
||||
Slice user_start = ExtractUserKey(*start);
|
||||
Slice user_limit = ExtractUserKey(limit);
|
||||
std::string tmp(user_start.data(), user_start.size());
|
||||
user_comparator_->FindShortestSeparator(&tmp, user_limit);
|
||||
if (tmp.size() < user_start.size() &&
|
||||
user_comparator_->Compare(user_start, tmp) < 0) {
|
||||
// User key has become shorter physically, but larger logically.
|
||||
// Tack on the earliest possible number to the shortened user key.
|
||||
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
|
||||
assert(this->Compare(*start, tmp) < 0);
|
||||
assert(this->Compare(tmp, limit) < 0);
|
||||
start->swap(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
|
||||
Slice user_key = ExtractUserKey(*key);
|
||||
std::string tmp(user_key.data(), user_key.size());
|
||||
user_comparator_->FindShortSuccessor(&tmp);
|
||||
if (tmp.size() < user_key.size() &&
|
||||
user_comparator_->Compare(user_key, tmp) < 0) {
|
||||
// User key has become shorter physically, but larger logically.
|
||||
// Tack on the earliest possible number to the shortened user key.
|
||||
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
|
||||
assert(this->Compare(*key, tmp) < 0);
|
||||
key->swap(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
const char* InternalFilterPolicy::Name() const {
|
||||
return user_policy_->Name();
|
||||
}
|
||||
|
||||
void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
|
||||
std::string* dst) const {
|
||||
// We rely on the fact that the code in table.cc does not mind us
|
||||
// adjusting keys[].
|
||||
Slice* mkey = const_cast<Slice*>(keys);
|
||||
for (int i = 0; i < n; i++) {
|
||||
mkey[i] = ExtractUserKey(keys[i]);
|
||||
// TODO(sanjay): Suppress dups?
|
||||
}
|
||||
user_policy_->CreateFilter(keys, n, dst);
|
||||
}
|
||||
|
||||
bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
|
||||
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
|
||||
}
|
||||
|
||||
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
|
||||
size_t usize = user_key.size();
|
||||
size_t needed = usize + 13; // A conservative estimate
|
||||
char* dst;
|
||||
if (needed <= sizeof(space_)) {
|
||||
dst = space_;
|
||||
} else {
|
||||
dst = new char[needed];
|
||||
}
|
||||
start_ = dst;
|
||||
dst = EncodeVarint32(dst, usize + 8);
|
||||
kstart_ = dst;
|
||||
memcpy(dst, user_key.data(), usize);
|
||||
dst += usize;
|
||||
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
|
||||
dst += 8;
|
||||
end_ = dst;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
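The tag arithmetic above can be tried in isolation: PackSequenceAndType stores the sequence number in the upper 56 bits and the value type in the low 8 bits. The following is a minimal standalone sketch, not part of this commit; the main() driver and the printed values are illustrative assumptions, and only the shift-and-mask arithmetic mirrors dbformat.cc.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // Mirrors PackSequenceAndType: sequence in the upper 56 bits, type in the low 8.
  const uint64_t kMaxSequenceNumber = (0x1ull << 56) - 1;
  const uint64_t seq = 100;   // hypothetical sequence number
  const uint8_t type = 0x1;   // kTypeValue in dbformat.h

  assert(seq <= kMaxSequenceNumber);
  const uint64_t packed = (seq << 8) | type;

  // Mirrors GetInternalKeySeqno / ExtractValueType.
  const uint64_t seq_back = packed >> 8;
  const uint8_t type_back = packed & 0xff;

  printf("packed=0x%llx seq=%llu type=%u\n",
         (unsigned long long)packed,
         (unsigned long long)seq_back,
         (unsigned)type_back);
  return 0;
}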
db/dbformat.h (new file, 229 lines)
@@ -0,0 +1,229 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <stdio.h>
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/filter_policy.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class InternalKey;
|
||||
|
||||
// Value types encoded as the last component of internal keys.
|
||||
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
|
||||
// data structures.
|
||||
enum ValueType {
|
||||
kTypeDeletion = 0x0,
|
||||
kTypeValue = 0x1,
|
||||
kTypeMerge = 0x2,
|
||||
kTypeLogData = 0x3
|
||||
};
|
||||
// kValueTypeForSeek defines the ValueType that should be passed when
|
||||
// constructing a ParsedInternalKey object for seeking to a particular
|
||||
// sequence number (since we sort sequence numbers in decreasing order
|
||||
// and the value type is embedded as the low 8 bits in the sequence
|
||||
// number in internal keys, we need to use the highest-numbered
|
||||
// ValueType, not the lowest).
|
||||
static const ValueType kValueTypeForSeek = kTypeMerge;
|
||||
|
||||
// We leave eight bits empty at the bottom so a type and sequence#
|
||||
// can be packed together into 64-bits.
|
||||
static const SequenceNumber kMaxSequenceNumber =
|
||||
((0x1ull << 56) - 1);
|
||||
|
||||
struct ParsedInternalKey {
|
||||
Slice user_key;
|
||||
SequenceNumber sequence;
|
||||
ValueType type;
|
||||
|
||||
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
|
||||
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
|
||||
: user_key(u), sequence(seq), type(t) { }
|
||||
std::string DebugString(bool hex = false) const;
|
||||
};
|
||||
|
||||
// Return the length of the encoding of "key".
|
||||
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
|
||||
return key.user_key.size() + 8;
|
||||
}
|
||||
|
||||
// Append the serialization of "key" to *result.
|
||||
extern void AppendInternalKey(std::string* result,
|
||||
const ParsedInternalKey& key);
|
||||
|
||||
// Attempt to parse an internal key from "internal_key". On success,
|
||||
// stores the parsed data in "*result", and returns true.
|
||||
//
|
||||
// On error, returns false, leaves "*result" in an undefined state.
|
||||
extern bool ParseInternalKey(const Slice& internal_key,
|
||||
ParsedInternalKey* result);
|
||||
|
||||
// Returns the user key portion of an internal key.
|
||||
inline Slice ExtractUserKey(const Slice& internal_key) {
|
||||
assert(internal_key.size() >= 8);
|
||||
return Slice(internal_key.data(), internal_key.size() - 8);
|
||||
}
|
||||
|
||||
inline ValueType ExtractValueType(const Slice& internal_key) {
|
||||
assert(internal_key.size() >= 8);
|
||||
const size_t n = internal_key.size();
|
||||
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
|
||||
unsigned char c = num & 0xff;
|
||||
return static_cast<ValueType>(c);
|
||||
}
|
||||
|
||||
// A comparator for internal keys that uses a specified comparator for
|
||||
// the user key portion and breaks ties by decreasing sequence number.
|
||||
class InternalKeyComparator : public Comparator {
|
||||
private:
|
||||
const Comparator* user_comparator_;
|
||||
std::string name_;
|
||||
public:
|
||||
explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c),
|
||||
name_("rocksdb.InternalKeyComparator:" +
|
||||
std::string(user_comparator_->Name())) {
|
||||
}
|
||||
|
||||
virtual const char* Name() const;
|
||||
virtual int Compare(const Slice& a, const Slice& b) const;
|
||||
virtual void FindShortestSeparator(
|
||||
std::string* start,
|
||||
const Slice& limit) const;
|
||||
virtual void FindShortSuccessor(std::string* key) const;
|
||||
|
||||
const Comparator* user_comparator() const { return user_comparator_; }
|
||||
|
||||
int Compare(const InternalKey& a, const InternalKey& b) const;
|
||||
};
|
||||
|
||||
// Filter policy wrapper that converts from internal keys to user keys
|
||||
class InternalFilterPolicy : public FilterPolicy {
|
||||
private:
|
||||
const FilterPolicy* const user_policy_;
|
||||
public:
|
||||
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
|
||||
virtual const char* Name() const;
|
||||
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
|
||||
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
|
||||
};
|
||||
|
||||
// Modules in this directory should keep internal keys wrapped inside
|
||||
// the following class instead of plain strings so that we do not
|
||||
// incorrectly use string comparisons instead of an InternalKeyComparator.
|
||||
class InternalKey {
|
||||
private:
|
||||
std::string rep_;
|
||||
public:
|
||||
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
|
||||
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
|
||||
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
|
||||
}
|
||||
|
||||
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
|
||||
Slice Encode() const {
|
||||
assert(!rep_.empty());
|
||||
return rep_;
|
||||
}
|
||||
|
||||
Slice user_key() const { return ExtractUserKey(rep_); }
|
||||
|
||||
void SetFrom(const ParsedInternalKey& p) {
|
||||
rep_.clear();
|
||||
AppendInternalKey(&rep_, p);
|
||||
}
|
||||
|
||||
void Clear() { rep_.clear(); }
|
||||
|
||||
std::string DebugString(bool hex = false) const;
|
||||
};
|
||||
|
||||
inline int InternalKeyComparator::Compare(
|
||||
const InternalKey& a, const InternalKey& b) const {
|
||||
return Compare(a.Encode(), b.Encode());
|
||||
}
|
||||
|
||||
inline bool ParseInternalKey(const Slice& internal_key,
|
||||
ParsedInternalKey* result) {
|
||||
const size_t n = internal_key.size();
|
||||
if (n < 8) return false;
|
||||
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
|
||||
unsigned char c = num & 0xff;
|
||||
result->sequence = num >> 8;
|
||||
result->type = static_cast<ValueType>(c);
|
||||
result->user_key = Slice(internal_key.data(), n - 8);
|
||||
return (c <= static_cast<unsigned char>(kValueTypeForSeek));
|
||||
}
|
||||
|
||||
// Update the sequence number in the internal key
|
||||
inline void UpdateInternalKey(char* internal_key,
|
||||
const size_t internal_key_size,
|
||||
uint64_t seq, ValueType t) {
|
||||
assert(internal_key_size >= 8);
|
||||
char* seqtype = internal_key + internal_key_size - 8;
|
||||
uint64_t newval = (seq << 8) | t;
|
||||
EncodeFixed64(seqtype, newval);
|
||||
}
|
||||
|
||||
// Get the sequence number from the internal key
|
||||
inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
|
||||
const size_t n = internal_key.size();
|
||||
assert(n >= 8);
|
||||
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
|
||||
return num >> 8;
|
||||
}
|
||||
|
||||
|
||||
// A helper class useful for DBImpl::Get()
|
||||
class LookupKey {
|
||||
public:
|
||||
// Initialize *this for looking up user_key at a snapshot with
|
||||
// the specified sequence number.
|
||||
LookupKey(const Slice& user_key, SequenceNumber sequence);
|
||||
|
||||
~LookupKey();
|
||||
|
||||
// Return a key suitable for lookup in a MemTable.
|
||||
Slice memtable_key() const { return Slice(start_, end_ - start_); }
|
||||
|
||||
// Return an internal key (suitable for passing to an internal iterator)
|
||||
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
|
||||
|
||||
// Return the user key
|
||||
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
|
||||
|
||||
private:
|
||||
// We construct a char array of the form:
|
||||
// klength varint32 <-- start_
|
||||
// userkey char[klength] <-- kstart_
|
||||
// tag uint64
|
||||
// <-- end_
|
||||
// The array is a suitable MemTable key.
|
||||
// The suffix starting with "userkey" can be used as an InternalKey.
|
||||
const char* start_;
|
||||
const char* kstart_;
|
||||
const char* end_;
|
||||
char space_[200]; // Avoid allocation for short keys
|
||||
|
||||
// No copying allowed
|
||||
LookupKey(const LookupKey&);
|
||||
void operator=(const LookupKey&);
|
||||
};
|
||||
|
||||
inline LookupKey::~LookupKey() {
|
||||
if (start_ != space_) delete[] start_;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
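The comparator comment above says internal keys order by increasing user key and then by decreasing sequence number. Here is a standalone sketch of that ordering, assuming a bytewise user comparator and the little-endian fixed-64 encoding used by util/coding.h; EncodeFixed64, DecodeFixed64 and CompareInternal below are local stand-ins written for this example, not the library's own functions.

#include <cstdint>
#include <cstdio>
#include <string>

// Local stand-in for PutFixed64 (little-endian, as in util/coding.h).
static void EncodeFixed64(std::string* dst, uint64_t v) {
  char buf[8];
  for (int i = 0; i < 8; i++) buf[i] = static_cast<char>((v >> (8 * i)) & 0xff);
  dst->append(buf, 8);
}

static uint64_t DecodeFixed64(const char* p) {
  uint64_t v = 0;
  for (int i = 7; i >= 0; i--) v = (v << 8) | static_cast<uint8_t>(p[i]);
  return v;
}

// Follows the shape of InternalKeyComparator::Compare for a bytewise user comparator.
static int CompareInternal(const std::string& a, const std::string& b) {
  const std::string ua = a.substr(0, a.size() - 8);
  const std::string ub = b.substr(0, b.size() - 8);
  int r = ua.compare(ub);
  if (r != 0) return r < 0 ? -1 : +1;
  const uint64_t anum = DecodeFixed64(a.data() + a.size() - 8);
  const uint64_t bnum = DecodeFixed64(b.data() + b.size() - 8);
  if (anum > bnum) return -1;   // higher sequence number sorts first
  if (anum < bnum) return +1;
  return 0;
}

int main() {
  std::string newer = "foo", older = "foo";
  EncodeFixed64(&newer, (200ull << 8) | 0x1);  // seq 200, kTypeValue
  EncodeFixed64(&older, (100ull << 8) | 0x1);  // seq 100, kTypeValue
  // Prints -1: the newer entry for the same user key comes first.
  printf("compare(newer, older) = %d\n", CompareInternal(newer, older));
  return 0;
}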
db/dbformat_test.cc (new file, 117 lines)
@@ -0,0 +1,117 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static std::string IKey(const std::string& user_key,
|
||||
uint64_t seq,
|
||||
ValueType vt) {
|
||||
std::string encoded;
|
||||
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
|
||||
return encoded;
|
||||
}
|
||||
|
||||
static std::string Shorten(const std::string& s, const std::string& l) {
|
||||
std::string result = s;
|
||||
InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string ShortSuccessor(const std::string& s) {
|
||||
std::string result = s;
|
||||
InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
|
||||
return result;
|
||||
}
|
||||
|
||||
static void TestKey(const std::string& key,
|
||||
uint64_t seq,
|
||||
ValueType vt) {
|
||||
std::string encoded = IKey(key, seq, vt);
|
||||
|
||||
Slice in(encoded);
|
||||
ParsedInternalKey decoded("", 0, kTypeValue);
|
||||
|
||||
ASSERT_TRUE(ParseInternalKey(in, &decoded));
|
||||
ASSERT_EQ(key, decoded.user_key.ToString());
|
||||
ASSERT_EQ(seq, decoded.sequence);
|
||||
ASSERT_EQ(vt, decoded.type);
|
||||
|
||||
ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
|
||||
}
|
||||
|
||||
class FormatTest { };
|
||||
|
||||
TEST(FormatTest, InternalKey_EncodeDecode) {
|
||||
const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
|
||||
const uint64_t seq[] = {
|
||||
1, 2, 3,
|
||||
(1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
|
||||
(1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
|
||||
(1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
|
||||
};
|
||||
for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
|
||||
for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
|
||||
TestKey(keys[k], seq[s], kTypeValue);
|
||||
TestKey("hello", 1, kTypeDeletion);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(FormatTest, InternalKeyShortSeparator) {
|
||||
// When user keys are same
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 99, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 101, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foo", 100, kTypeDeletion)));
|
||||
|
||||
// When user keys are misordered
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("bar", 99, kTypeValue)));
|
||||
|
||||
// When user keys are different, but correctly ordered
|
||||
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("hello", 200, kTypeValue)));
|
||||
|
||||
// When start user key is prefix of limit user key
|
||||
ASSERT_EQ(IKey("foo", 100, kTypeValue),
|
||||
Shorten(IKey("foo", 100, kTypeValue),
|
||||
IKey("foobar", 200, kTypeValue)));
|
||||
|
||||
// When limit user key is prefix of start user key
|
||||
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
|
||||
Shorten(IKey("foobar", 100, kTypeValue),
|
||||
IKey("foo", 200, kTypeValue)));
|
||||
}
|
||||
|
||||
TEST(FormatTest, InternalKeyShortestSuccessor) {
|
||||
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
|
||||
ShortSuccessor(IKey("foo", 100, kTypeValue)));
|
||||
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
|
||||
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
db/deletefile_test.cc (new file, 283 lines)
@@ -0,0 +1,283 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
#include <vector>
|
||||
#include <stdlib.h>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class DeleteFileTest {
|
||||
public:
|
||||
std::string dbname_;
|
||||
Options options_;
|
||||
DB* db_;
|
||||
Env* env_;
|
||||
int numlevels_;
|
||||
|
||||
DeleteFileTest() {
|
||||
db_ = nullptr;
|
||||
env_ = Env::Default();
|
||||
options_.write_buffer_size = 1024*1024*1000;
|
||||
options_.target_file_size_base = 1024*1024*1000;
|
||||
options_.max_bytes_for_level_base = 1024*1024*1000;
|
||||
options_.WAL_ttl_seconds = 300; // Used to test log files
|
||||
options_.WAL_size_limit_MB = 1024; // Used to test log files
|
||||
dbname_ = test::TmpDir() + "/deletefile_test";
|
||||
options_.wal_dir = dbname_ + "/wal_files";
|
||||
DestroyDB(dbname_, options_);
|
||||
numlevels_ = 7;
|
||||
ASSERT_OK(ReopenDB(true));
|
||||
}
|
||||
|
||||
Status ReopenDB(bool create) {
|
||||
delete db_;
|
||||
if (create) {
|
||||
DestroyDB(dbname_, options_);
|
||||
}
|
||||
db_ = nullptr;
|
||||
options_.create_if_missing = create;
|
||||
return DB::Open(options_, dbname_, &db_);
|
||||
}
|
||||
|
||||
void CloseDB() {
|
||||
delete db_;
|
||||
}
|
||||
|
||||
void AddKeys(int numkeys, int startkey = 0) {
|
||||
WriteOptions options;
|
||||
options.sync = false;
|
||||
ReadOptions roptions;
|
||||
for (int i = startkey; i < (numkeys + startkey) ; i++) {
|
||||
std::string temp = std::to_string(i);
|
||||
Slice key(temp);
|
||||
Slice value(temp);
|
||||
ASSERT_OK(db_->Put(options, key, value));
|
||||
}
|
||||
}
|
||||
|
||||
int numKeysInLevels(
|
||||
std::vector<LiveFileMetaData> &metadata,
|
||||
std::vector<int> *keysperlevel = nullptr) {
|
||||
|
||||
if (keysperlevel != nullptr) {
|
||||
keysperlevel->resize(numlevels_);
|
||||
}
|
||||
|
||||
int numKeys = 0;
|
||||
for (size_t i = 0; i < metadata.size(); i++) {
|
||||
int startkey = atoi(metadata[i].smallestkey.c_str());
|
||||
int endkey = atoi(metadata[i].largestkey.c_str());
|
||||
int numkeysinfile = (endkey - startkey + 1);
|
||||
numKeys += numkeysinfile;
|
||||
if (keysperlevel != nullptr) {
|
||||
(*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
|
||||
}
|
||||
fprintf(stderr, "level %d name %s smallest %s largest %s\n",
|
||||
metadata[i].level, metadata[i].name.c_str(),
|
||||
metadata[i].smallestkey.c_str(),
|
||||
metadata[i].largestkey.c_str());
|
||||
}
|
||||
return numKeys;
|
||||
}
|
||||
|
||||
void CreateTwoLevels() {
|
||||
AddKeys(50000, 10000);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
ASSERT_OK(dbi->TEST_FlushMemTable());
|
||||
ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
|
||||
|
||||
AddKeys(50000, 10000);
|
||||
ASSERT_OK(dbi->TEST_FlushMemTable());
|
||||
ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
|
||||
}
|
||||
|
||||
void CheckFileTypeCounts(std::string& dir,
|
||||
int required_log,
|
||||
int required_sst,
|
||||
int required_manifest) {
|
||||
std::vector<std::string> filenames;
|
||||
env_->GetChildren(dir, &filenames);
|
||||
|
||||
int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
|
||||
for (auto file : filenames) {
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
if (ParseFileName(file, &number, &type)) {
|
||||
log_cnt += (type == kLogFile);
|
||||
sst_cnt += (type == kTableFile);
|
||||
manifest_cnt += (type == kDescriptorFile);
|
||||
}
|
||||
}
|
||||
ASSERT_EQ(required_log, log_cnt);
|
||||
ASSERT_EQ(required_sst, sst_cnt);
|
||||
ASSERT_EQ(required_manifest, manifest_cnt);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
TEST(DeleteFileTest, AddKeysAndQueryLevels) {
|
||||
CreateTwoLevels();
|
||||
std::vector<LiveFileMetaData> metadata;
|
||||
std::vector<int> keysinlevel;
|
||||
db_->GetLiveFilesMetaData(&metadata);
|
||||
|
||||
std::string level1file = "";
|
||||
int level1keycount = 0;
|
||||
std::string level2file = "";
|
||||
int level2keycount = 0;
|
||||
int level1index = 0;
|
||||
int level2index = 1;
|
||||
|
||||
ASSERT_EQ((int)metadata.size(), 2);
|
||||
if (metadata[0].level == 2) {
|
||||
level1index = 1;
|
||||
level2index = 0;
|
||||
}
|
||||
|
||||
level1file = metadata[level1index].name;
|
||||
int startkey = atoi(metadata[level1index].smallestkey.c_str());
|
||||
int endkey = atoi(metadata[level1index].largestkey.c_str());
|
||||
level1keycount = (endkey - startkey + 1);
|
||||
level2file = metadata[level2index].name;
|
||||
startkey = atoi(metadata[level2index].smallestkey.c_str());
|
||||
endkey = atoi(metadata[level2index].largestkey.c_str());
|
||||
level2keycount = (endkey - startkey + 1);
|
||||
|
||||
// Controlled setup. Levels 1 and 2 should both have 50K keys.
|
||||
// This is a little fragile as it depends on the current
|
||||
// compaction heuristics.
|
||||
ASSERT_EQ(level1keycount, 50000);
|
||||
ASSERT_EQ(level2keycount, 50000);
|
||||
|
||||
Status status = db_->DeleteFile("0.sst");
|
||||
ASSERT_TRUE(status.IsInvalidArgument());
|
||||
|
||||
// intermediate level files cannot be deleted.
|
||||
status = db_->DeleteFile(level1file);
|
||||
ASSERT_TRUE(status.IsInvalidArgument());
|
||||
|
||||
// Lowest level file deletion should succeed.
|
||||
ASSERT_OK(db_->DeleteFile(level2file));
|
||||
|
||||
CloseDB();
|
||||
}
|
||||
|
||||
TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
|
||||
CreateTwoLevels();
|
||||
// there should be only one (empty) log file because CreateTwoLevels()
|
||||
// flushes the memtables to disk
|
||||
CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
|
||||
// 2 ssts, 1 manifest
|
||||
CheckFileTypeCounts(dbname_, 0, 2, 1);
|
||||
std::string first("0"), last("999999");
|
||||
Slice first_slice(first), last_slice(last);
|
||||
db_->CompactRange(&first_slice, &last_slice, true, 2);
|
||||
// 1 sst after compaction
|
||||
CheckFileTypeCounts(dbname_, 0, 1, 1);
|
||||
|
||||
// this time, we keep an iterator alive
|
||||
ReopenDB(true);
|
||||
Iterator *itr = 0;
|
||||
CreateTwoLevels();
|
||||
itr = db_->NewIterator(ReadOptions());
|
||||
db_->CompactRange(&first_slice, &last_slice, true, 2);
|
||||
// 3 sst after compaction with live iterator
|
||||
CheckFileTypeCounts(dbname_, 0, 3, 1);
|
||||
delete itr;
|
||||
// 1 sst after iterator deletion
|
||||
CheckFileTypeCounts(dbname_, 0, 1, 1);
|
||||
|
||||
CloseDB();
|
||||
}
|
||||
|
||||
TEST(DeleteFileTest, DeleteFileWithIterator) {
|
||||
CreateTwoLevels();
|
||||
ReadOptions options;
|
||||
Iterator* it = db_->NewIterator(options);
|
||||
std::vector<LiveFileMetaData> metadata;
|
||||
db_->GetLiveFilesMetaData(&metadata);
|
||||
|
||||
std::string level2file = "";
|
||||
|
||||
ASSERT_EQ((int)metadata.size(), 2);
|
||||
if (metadata[0].level == 1) {
|
||||
level2file = metadata[1].name;
|
||||
} else {
|
||||
level2file = metadata[0].name;
|
||||
}
|
||||
|
||||
Status status = db_->DeleteFile(level2file);
|
||||
fprintf(stdout, "Deletion status %s: %s\n",
|
||||
level2file.c_str(), status.ToString().c_str());
|
||||
ASSERT_TRUE(status.ok());
|
||||
it->SeekToFirst();
|
||||
int numKeysIterated = 0;
|
||||
while(it->Valid()) {
|
||||
numKeysIterated++;
|
||||
it->Next();
|
||||
}
|
||||
ASSERT_EQ(numKeysIterated, 50000);
|
||||
delete it;
|
||||
CloseDB();
|
||||
}
|
||||
|
||||
TEST(DeleteFileTest, DeleteLogFiles) {
|
||||
AddKeys(10, 0);
|
||||
VectorLogPtr logfiles;
|
||||
db_->GetSortedWalFiles(logfiles);
|
||||
ASSERT_GT(logfiles.size(), 0UL);
|
||||
// Take the last log file which is expected to be alive and try to delete it
|
||||
// Should not succeed because live logs are not allowed to be deleted
|
||||
std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
|
||||
ASSERT_EQ(alive_log->Type(), kAliveLogFile);
|
||||
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
|
||||
fprintf(stdout, "Deleting alive log file %s\n",
|
||||
alive_log->PathName().c_str());
|
||||
ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
|
||||
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
|
||||
logfiles.clear();
|
||||
|
||||
// Call Flush to bring about a new working log file and add more keys
|
||||
// Call Flush again to flush out memtable and move alive log to archived log
|
||||
// and try to delete the archived log file
|
||||
FlushOptions fopts;
|
||||
db_->Flush(fopts);
|
||||
AddKeys(10, 0);
|
||||
db_->Flush(fopts);
|
||||
db_->GetSortedWalFiles(logfiles);
|
||||
ASSERT_GT(logfiles.size(), 0UL);
|
||||
std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
|
||||
ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
|
||||
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
|
||||
archived_log->PathName()));
|
||||
fprintf(stdout, "Deleting archived log file %s\n",
|
||||
archived_log->PathName().c_str());
|
||||
ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
|
||||
ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
|
||||
archived_log->PathName()));
|
||||
CloseDB();
|
||||
}
|
||||
|
||||
} //namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
||||
|
db/filename.cc (new file, 266 lines)
@@ -0,0 +1,266 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/filename.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Given a path, flatten the path name by replacing all chars not in
|
||||
// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
|
||||
// Return the number of chars stored in dest not including the trailing '\0'.
|
||||
static int FlattenPath(const std::string& path, char* dest, int len) {
|
||||
int write_idx = 0;
|
||||
int i = 0;
|
||||
int src_len = path.size();
|
||||
|
||||
while (i < src_len && write_idx < len - 1) {
|
||||
if ((path[i] >= 'a' && path[i] <= 'z') ||
|
||||
(path[i] >= '0' && path[i] <= '9') ||
|
||||
(path[i] >= 'A' && path[i] <= 'Z') ||
|
||||
path[i] == '-' ||
|
||||
path[i] == '.' ||
|
||||
path[i] == '_'){
|
||||
dest[write_idx++] = path[i];
|
||||
} else {
|
||||
if (i > 0)
|
||||
dest[write_idx++] = '_';
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
dest[write_idx] = '\0';
|
||||
return write_idx;
|
||||
}
|
||||
|
||||
// A utility routine: write "data" to the named file and Sync() it.
|
||||
extern Status WriteStringToFileSync(Env* env, const Slice& data,
|
||||
const std::string& fname);
|
||||
|
||||
static std::string MakeFileName(const std::string& name, uint64_t number,
|
||||
const char* suffix) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "/%06llu.%s",
|
||||
static_cast<unsigned long long>(number),
|
||||
suffix);
|
||||
return name + buf;
|
||||
}
|
||||
|
||||
std::string LogFileName(const std::string& name, uint64_t number) {
|
||||
assert(number > 0);
|
||||
return MakeFileName(name, number, "log");
|
||||
}
|
||||
|
||||
std::string ArchivalDirectory(const std::string& dir) {
|
||||
return dir + "/" + ARCHIVAL_DIR;
|
||||
}
|
||||
std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
|
||||
assert(number > 0);
|
||||
return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
|
||||
}
|
||||
|
||||
std::string TableFileName(const std::string& name, uint64_t number) {
|
||||
assert(number > 0);
|
||||
return MakeFileName(name, number, "sst");
|
||||
}
|
||||
|
||||
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
|
||||
assert(number > 0);
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
|
||||
static_cast<unsigned long long>(number));
|
||||
return dbname + buf;
|
||||
}
|
||||
|
||||
std::string CurrentFileName(const std::string& dbname) {
|
||||
return dbname + "/CURRENT";
|
||||
}
|
||||
|
||||
std::string LockFileName(const std::string& dbname) {
|
||||
return dbname + "/LOCK";
|
||||
}
|
||||
|
||||
std::string TempFileName(const std::string& dbname, uint64_t number) {
|
||||
assert(number >= 0);
|
||||
return MakeFileName(dbname, number, "dbtmp");
|
||||
}
|
||||
|
||||
std::string InfoLogFileName(const std::string& dbname,
|
||||
const std::string& db_path, const std::string& log_dir) {
|
||||
if (log_dir.empty())
|
||||
return dbname + "/LOG";
|
||||
|
||||
char flatten_db_path[256];
|
||||
FlattenPath(db_path, flatten_db_path, 256);
|
||||
return log_dir + "/" + flatten_db_path + "_LOG";
|
||||
}
|
||||
|
||||
// Return the name of the old info log file for "dbname".
|
||||
std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
|
||||
const std::string& db_path, const std::string& log_dir) {
|
||||
char buf[50];
|
||||
snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
|
||||
|
||||
if (log_dir.empty())
|
||||
return dbname + "/LOG.old." + buf;
|
||||
|
||||
char flatten_db_path[256];
|
||||
FlattenPath(db_path, flatten_db_path, 256);
|
||||
return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
|
||||
}
|
||||
|
||||
std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "/METADB-%llu",
|
||||
static_cast<unsigned long long>(number));
|
||||
return dbname + buf;
|
||||
}
|
||||
|
||||
std::string IdentityFileName(const std::string& dbname) {
|
||||
return dbname + "/IDENTITY";
|
||||
}
|
||||
|
||||
// Owned filenames have the form:
|
||||
// dbname/IDENTITY
|
||||
// dbname/CURRENT
|
||||
// dbname/LOCK
|
||||
// dbname/LOG
|
||||
// dbname/LOG.old.[0-9]+
|
||||
// dbname/MANIFEST-[0-9]+
|
||||
// dbname/[0-9]+.(log|sst)
|
||||
// dbname/METADB-[0-9]+
|
||||
// Disregards / at the beginning
|
||||
bool ParseFileName(const std::string& fname,
|
||||
uint64_t* number,
|
||||
FileType* type,
|
||||
WalFileType* log_type) {
|
||||
Slice rest(fname);
|
||||
if (fname.length() > 1 && fname[0] == '/') {
|
||||
rest.remove_prefix(1);
|
||||
}
|
||||
if (rest == "IDENTITY") {
|
||||
*number = 0;
|
||||
*type = kIdentityFile;
|
||||
} else if (rest == "CURRENT") {
|
||||
*number = 0;
|
||||
*type = kCurrentFile;
|
||||
} else if (rest == "LOCK") {
|
||||
*number = 0;
|
||||
*type = kDBLockFile;
|
||||
} else if (rest == "LOG" || rest == "LOG.old") {
|
||||
*number = 0;
|
||||
*type = kInfoLogFile;
|
||||
} else if (rest.starts_with("LOG.old.")) {
|
||||
uint64_t ts_suffix;
|
||||
// sizeof also counts the trailing '\0'.
|
||||
rest.remove_prefix(sizeof("LOG.old.") - 1);
|
||||
if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
|
||||
return false;
|
||||
}
|
||||
*number = ts_suffix;
|
||||
*type = kInfoLogFile;
|
||||
} else if (rest.starts_with("MANIFEST-")) {
|
||||
rest.remove_prefix(strlen("MANIFEST-"));
|
||||
uint64_t num;
|
||||
if (!ConsumeDecimalNumber(&rest, &num)) {
|
||||
return false;
|
||||
}
|
||||
if (!rest.empty()) {
|
||||
return false;
|
||||
}
|
||||
*type = kDescriptorFile;
|
||||
*number = num;
|
||||
} else if (rest.starts_with("METADB-")) {
|
||||
rest.remove_prefix(strlen("METADB-"));
|
||||
uint64_t num;
|
||||
if (!ConsumeDecimalNumber(&rest, &num)) {
|
||||
return false;
|
||||
}
|
||||
if (!rest.empty()) {
|
||||
return false;
|
||||
}
|
||||
*type = kMetaDatabase;
|
||||
*number = num;
|
||||
} else {
|
||||
// Avoid strtoull() to keep filename format independent of the
|
||||
// current locale
|
||||
bool archive_dir_found = false;
|
||||
if (rest.starts_with(ARCHIVAL_DIR)) {
|
||||
if (rest.size() <= ARCHIVAL_DIR.size()) {
|
||||
return false;
|
||||
}
|
||||
rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
|
||||
if (log_type) {
|
||||
*log_type = kArchivedLogFile;
|
||||
}
|
||||
archive_dir_found = true;
|
||||
}
|
||||
uint64_t num;
|
||||
if (!ConsumeDecimalNumber(&rest, &num)) {
|
||||
return false;
|
||||
}
|
||||
Slice suffix = rest;
|
||||
if (suffix == Slice(".log")) {
|
||||
*type = kLogFile;
|
||||
if (log_type && !archive_dir_found) {
|
||||
*log_type = kAliveLogFile;
|
||||
}
|
||||
} else if (archive_dir_found) {
|
||||
return false; // Archive dir can contain only log files
|
||||
} else if (suffix == Slice(".sst")) {
|
||||
*type = kTableFile;
|
||||
} else if (suffix == Slice(".dbtmp")) {
|
||||
*type = kTempFile;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
*number = num;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
Status SetCurrentFile(Env* env, const std::string& dbname,
|
||||
uint64_t descriptor_number) {
|
||||
// Remove leading "dbname/" and add newline to manifest file name
|
||||
std::string manifest = DescriptorFileName(dbname, descriptor_number);
|
||||
Slice contents = manifest;
|
||||
assert(contents.starts_with(dbname + "/"));
|
||||
contents.remove_prefix(dbname.size() + 1);
|
||||
std::string tmp = TempFileName(dbname, descriptor_number);
|
||||
Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp);
|
||||
if (s.ok()) {
|
||||
s = env->RenameFile(tmp, CurrentFileName(dbname));
|
||||
}
|
||||
if (!s.ok()) {
|
||||
env->DeleteFile(tmp);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status SetIdentityFile(Env* env, const std::string& dbname) {
|
||||
std::string id = env->GenerateUniqueId();
|
||||
assert(!id.empty());
|
||||
// Reserve the filename dbname/000000.dbtmp for the temporary identity file
|
||||
std::string tmp = TempFileName(dbname, 0);
|
||||
Status s = WriteStringToFileSync(env, id, tmp);
|
||||
if (s.ok()) {
|
||||
s = env->RenameFile(tmp, IdentityFileName(dbname));
|
||||
}
|
||||
if (!s.ok()) {
|
||||
env->DeleteFile(tmp);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
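As a quick illustration of the naming scheme the helpers above produce (zero-padded six-digit numbers for .log/.sst files, a MANIFEST- prefix for descriptors), here is a standalone sketch; the "mydb" path and the numbers are made up for the example, and only the format strings mirror filename.cc.

#include <cstdio>
#include <string>

// Same formatting as the static MakeFileName helper above.
static std::string MakeFileName(const std::string& name, uint64_t number,
                                const char* suffix) {
  char buf[100];
  snprintf(buf, sizeof(buf), "/%06llu.%s",
           static_cast<unsigned long long>(number), suffix);
  return name + buf;
}

int main() {
  printf("%s\n", MakeFileName("mydb", 7, "sst").c_str());    // mydb/000007.sst
  printf("%s\n", MakeFileName("mydb", 12, "log").c_str());   // mydb/000012.log
  printf("mydb/MANIFEST-%06llu\n", 2ull);                    // mydb/MANIFEST-000002
  return 0;
}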
db/filename.h (new file, 108 lines)
@@ -0,0 +1,108 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// File names used by DB code
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
#include "port/port.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Env;
|
||||
|
||||
enum FileType {
|
||||
kLogFile,
|
||||
kDBLockFile,
|
||||
kTableFile,
|
||||
kDescriptorFile,
|
||||
kCurrentFile,
|
||||
kTempFile,
|
||||
kInfoLogFile, // Either the current one, or an old one
|
||||
kMetaDatabase,
|
||||
kIdentityFile
|
||||
};
|
||||
|
||||
// Return the name of the log file with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string LogFileName(const std::string& dbname, uint64_t number);
|
||||
|
||||
static const std::string ARCHIVAL_DIR = "archive";
|
||||
|
||||
extern std::string ArchivalDirectory(const std::string& dbname);
|
||||
|
||||
// Return the name of the archived log file with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with "dbname".
|
||||
extern std::string ArchivedLogFileName(const std::string& dbname,
|
||||
uint64_t num);
|
||||
|
||||
// Return the name of the sstable with the specified number
|
||||
// in the db named by "dbname". The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string TableFileName(const std::string& dbname, uint64_t number);
|
||||
|
||||
// Return the name of the descriptor file for the db named by
|
||||
// "dbname" and the specified incarnation number. The result will be
|
||||
// prefixed with "dbname".
|
||||
extern std::string DescriptorFileName(const std::string& dbname,
|
||||
uint64_t number);
|
||||
|
||||
// Return the name of the current file. This file contains the name
|
||||
// of the current manifest file. The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string CurrentFileName(const std::string& dbname);
|
||||
|
||||
// Return the name of the lock file for the db named by
|
||||
// "dbname". The result will be prefixed with "dbname".
|
||||
extern std::string LockFileName(const std::string& dbname);
|
||||
|
||||
// Return the name of a temporary file owned by the db named "dbname".
|
||||
// The result will be prefixed with "dbname".
|
||||
extern std::string TempFileName(const std::string& dbname, uint64_t number);
|
||||
|
||||
// Return the name of the info log file for "dbname".
|
||||
extern std::string InfoLogFileName(const std::string& dbname,
|
||||
const std::string& db_path="", const std::string& log_dir="");
|
||||
|
||||
// Return the name of the old info log file for "dbname".
|
||||
extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
|
||||
const std::string& db_path="", const std::string& log_dir="");
|
||||
|
||||
// Return the name to use for a metadatabase. The result will be prefixed with
|
||||
// "dbname".
|
||||
extern std::string MetaDatabaseName(const std::string& dbname,
|
||||
uint64_t number);
|
||||
|
||||
// Return the name of the Identity file which stores a unique number for the db
|
||||
// that will get regenerated if the db loses all its data and is recreated fresh
|
||||
// either from a backup-image or empty
|
||||
extern std::string IdentityFileName(const std::string& dbname);
|
||||
|
||||
// If filename is a rocksdb file, store the type of the file in *type.
|
||||
// The number encoded in the filename is stored in *number. If the
|
||||
// filename was successfully parsed, returns true. Else return false.
|
||||
extern bool ParseFileName(const std::string& filename,
|
||||
uint64_t* number,
|
||||
FileType* type,
|
||||
WalFileType* log_type = nullptr);
|
||||
|
||||
// Make the CURRENT file point to the descriptor file with the
|
||||
// specified number.
|
||||
extern Status SetCurrentFile(Env* env, const std::string& dbname,
|
||||
uint64_t descriptor_number);
|
||||
|
||||
// Make the IDENTITY file for the db
|
||||
extern Status SetIdentityFile(Env* env, const std::string& dbname);
|
||||
|
||||
} // namespace rocksdb
|
db/filename_test.cc (new file, 140 lines)
@@ -0,0 +1,140 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/filename.h"
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "port/port.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class FileNameTest { };
|
||||
|
||||
TEST(FileNameTest, Parse) {
|
||||
Slice db;
|
||||
FileType type;
|
||||
uint64_t number;
|
||||
|
||||
// Successful parses
|
||||
static struct {
|
||||
const char* fname;
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
} cases[] = {
|
||||
{ "100.log", 100, kLogFile },
|
||||
{ "0.log", 0, kLogFile },
|
||||
{ "0.sst", 0, kTableFile },
|
||||
{ "CURRENT", 0, kCurrentFile },
|
||||
{ "LOCK", 0, kDBLockFile },
|
||||
{ "MANIFEST-2", 2, kDescriptorFile },
|
||||
{ "MANIFEST-7", 7, kDescriptorFile },
|
||||
{ "METADB-2", 2, kMetaDatabase },
|
||||
{ "METADB-7", 7, kMetaDatabase },
|
||||
{ "LOG", 0, kInfoLogFile },
|
||||
{ "LOG.old", 0, kInfoLogFile },
|
||||
{ "18446744073709551615.log", 18446744073709551615ull, kLogFile },
|
||||
};
|
||||
for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
|
||||
std::string f = cases[i].fname;
|
||||
ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
|
||||
ASSERT_EQ(cases[i].type, type) << f;
|
||||
ASSERT_EQ(cases[i].number, number) << f;
|
||||
}
|
||||
|
||||
// Errors
|
||||
static const char* errors[] = {
|
||||
"",
|
||||
"foo",
|
||||
"foo-dx-100.log",
|
||||
".log",
|
||||
"",
|
||||
"manifest",
|
||||
"CURREN",
|
||||
"CURRENTX",
|
||||
"MANIFES",
|
||||
"MANIFEST",
|
||||
"MANIFEST-",
|
||||
"XMANIFEST-3",
|
||||
"MANIFEST-3x",
|
||||
"META",
|
||||
"METADB",
|
||||
"METADB-",
|
||||
"XMETADB-3",
|
||||
"METADB-3x",
|
||||
"LOC",
|
||||
"LOCKx",
|
||||
"LO",
|
||||
"LOGx",
|
||||
"18446744073709551616.log",
|
||||
"184467440737095516150.log",
|
||||
"100",
|
||||
"100.",
|
||||
"100.lop"
|
||||
};
|
||||
for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
|
||||
std::string f = errors[i];
|
||||
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
|
||||
};
|
||||
}
|
||||
|
||||
TEST(FileNameTest, Construction) {
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
std::string fname;
|
||||
|
||||
fname = CurrentFileName("foo");
|
||||
ASSERT_EQ("foo/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ(0U, number);
|
||||
ASSERT_EQ(kCurrentFile, type);
|
||||
|
||||
fname = LockFileName("foo");
|
||||
ASSERT_EQ("foo/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ(0U, number);
|
||||
ASSERT_EQ(kDBLockFile, type);
|
||||
|
||||
fname = LogFileName("foo", 192);
|
||||
ASSERT_EQ("foo/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ(192U, number);
|
||||
ASSERT_EQ(kLogFile, type);
|
||||
|
||||
fname = TableFileName("bar", 200);
|
||||
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ(200U, number);
|
||||
ASSERT_EQ(kTableFile, type);
|
||||
|
||||
fname = DescriptorFileName("bar", 100);
|
||||
ASSERT_EQ("bar/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ(100U, number);
|
||||
ASSERT_EQ(kDescriptorFile, type);
|
||||
|
||||
fname = TempFileName("tmp", 999);
|
||||
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ(999U, number);
|
||||
ASSERT_EQ(kTempFile, type);
|
||||
|
||||
fname = MetaDatabaseName("met", 100);
|
||||
ASSERT_EQ("met/", std::string(fname.data(), 4));
|
||||
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
|
||||
ASSERT_EQ(100U, number);
|
||||
ASSERT_EQ(kMetaDatabase, type);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
db/log_format.h (new file, 36 lines)
@@ -0,0 +1,36 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Log format information shared by reader and writer.
|
||||
// See ../doc/log_format.txt for more detail.
|
||||
|
||||
#pragma once
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
enum RecordType {
|
||||
// Zero is reserved for preallocated files
|
||||
kZeroType = 0,
|
||||
|
||||
kFullType = 1,
|
||||
|
||||
// For fragments
|
||||
kFirstType = 2,
|
||||
kMiddleType = 3,
|
||||
kLastType = 4
|
||||
};
|
||||
static const int kMaxRecordType = kLastType;
|
||||
|
||||
static const unsigned int kBlockSize = 32768;
|
||||
|
||||
// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
|
||||
static const int kHeaderSize = 4 + 1 + 2;
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
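The seven header bytes described above can be laid out by hand to see how the reader below recovers them. This is a standalone sketch with the crc zeroed out as a placeholder (real records carry a masked crc32c); the payload and values are made up for illustration.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const char payload[] = "hello";
  const uint16_t length = sizeof(payload) - 1;  // 5
  const uint8_t type = 1;                       // kFullType

  // On-disk layout, as parsed by Reader::ReadPhysicalRecord:
  // bytes 0-3 crc, bytes 4-5 little-endian length, byte 6 type.
  char header[7];
  memset(header, 0, 4);                         // crc placeholder only
  header[4] = static_cast<char>(length & 0xff);
  header[5] = static_cast<char>((length >> 8) & 0xff);
  header[6] = static_cast<char>(type);

  // Decode the same way the reader does.
  const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
  const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
  printf("type=%u length=%u\n",
         static_cast<unsigned>(header[6]) & 0xff, a | (b << 8));
  return 0;
}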
db/log_reader.cc (new file, 264 lines)
@@ -0,0 +1,264 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_reader.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
Reader::Reporter::~Reporter() {
|
||||
}
|
||||
|
||||
Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
|
||||
bool checksum, uint64_t initial_offset)
|
||||
: file_(std::move(file)),
|
||||
reporter_(reporter),
|
||||
checksum_(checksum),
|
||||
backing_store_(new char[kBlockSize]),
|
||||
buffer_(),
|
||||
eof_(false),
|
||||
last_record_offset_(0),
|
||||
end_of_buffer_offset_(0),
|
||||
initial_offset_(initial_offset) {
|
||||
}
|
||||
|
||||
Reader::~Reader() {
|
||||
delete[] backing_store_;
|
||||
}
|
||||
|
||||
bool Reader::SkipToInitialBlock() {
|
||||
size_t offset_in_block = initial_offset_ % kBlockSize;
|
||||
uint64_t block_start_location = initial_offset_ - offset_in_block;
|
||||
|
||||
// Don't search a block if we'd be in the trailer
|
||||
if (offset_in_block > kBlockSize - 6) {
|
||||
offset_in_block = 0;
|
||||
block_start_location += kBlockSize;
|
||||
}
|
||||
|
||||
end_of_buffer_offset_ = block_start_location;
|
||||
|
||||
// Skip to start of first block that can contain the initial record
|
||||
if (block_start_location > 0) {
|
||||
Status skip_status = file_->Skip(block_start_location);
|
||||
if (!skip_status.ok()) {
|
||||
ReportDrop(block_start_location, skip_status);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
|
||||
if (last_record_offset_ < initial_offset_) {
|
||||
if (!SkipToInitialBlock()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
scratch->clear();
|
||||
record->clear();
|
||||
bool in_fragmented_record = false;
|
||||
// Record offset of the logical record that we're reading
|
||||
// 0 is a dummy value to make compilers happy
|
||||
uint64_t prospective_record_offset = 0;
|
||||
|
||||
Slice fragment;
|
||||
while (true) {
|
||||
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
|
||||
const unsigned int record_type = ReadPhysicalRecord(&fragment);
|
||||
switch (record_type) {
|
||||
case kFullType:
|
||||
if (in_fragmented_record) {
|
||||
// Handle bug in earlier versions of log::Writer where
|
||||
// it could emit an empty kFirstType record at the tail end
|
||||
// of a block followed by a kFullType or kFirstType record
|
||||
// at the beginning of the next block.
|
||||
if (scratch->empty()) {
|
||||
in_fragmented_record = false;
|
||||
} else {
|
||||
ReportCorruption(scratch->size(), "partial record without end(1)");
|
||||
}
|
||||
}
|
||||
prospective_record_offset = physical_record_offset;
|
||||
scratch->clear();
|
||||
*record = fragment;
|
||||
last_record_offset_ = prospective_record_offset;
|
||||
return true;
|
||||
|
||||
case kFirstType:
|
||||
if (in_fragmented_record) {
|
||||
// Handle bug in earlier versions of log::Writer where
|
||||
// it could emit an empty kFirstType record at the tail end
|
||||
// of a block followed by a kFullType or kFirstType record
|
||||
// at the beginning of the next block.
|
||||
if (scratch->empty()) {
|
||||
in_fragmented_record = false;
|
||||
} else {
|
||||
ReportCorruption(scratch->size(), "partial record without end(2)");
|
||||
}
|
||||
}
|
||||
prospective_record_offset = physical_record_offset;
|
||||
scratch->assign(fragment.data(), fragment.size());
|
||||
in_fragmented_record = true;
|
||||
break;
|
||||
|
||||
case kMiddleType:
|
||||
if (!in_fragmented_record) {
|
||||
ReportCorruption(fragment.size(),
|
||||
"missing start of fragmented record(1)");
|
||||
} else {
|
||||
scratch->append(fragment.data(), fragment.size());
|
||||
}
|
||||
break;
|
||||
|
||||
case kLastType:
|
||||
if (!in_fragmented_record) {
|
||||
ReportCorruption(fragment.size(),
|
||||
"missing start of fragmented record(2)");
|
||||
} else {
|
||||
scratch->append(fragment.data(), fragment.size());
|
||||
*record = Slice(*scratch);
|
||||
last_record_offset_ = prospective_record_offset;
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
|
||||
case kEof:
|
||||
if (in_fragmented_record) {
|
||||
ReportCorruption(scratch->size(), "partial record without end(3)");
|
||||
scratch->clear();
|
||||
}
|
||||
return false;
|
||||
|
||||
case kBadRecord:
|
||||
if (in_fragmented_record) {
|
||||
ReportCorruption(scratch->size(), "error in middle of record");
|
||||
in_fragmented_record = false;
|
||||
scratch->clear();
|
||||
}
|
||||
break;
|
||||
|
||||
default: {
|
||||
char buf[40];
|
||||
snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
|
||||
ReportCorruption(
|
||||
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
|
||||
buf);
|
||||
in_fragmented_record = false;
|
||||
scratch->clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
uint64_t Reader::LastRecordOffset() {
|
||||
return last_record_offset_;
|
||||
}
|
||||
|
||||
void Reader::ReportCorruption(size_t bytes, const char* reason) {
|
||||
ReportDrop(bytes, Status::Corruption(reason));
|
||||
}
|
||||
|
||||
void Reader::ReportDrop(size_t bytes, const Status& reason) {
|
||||
if (reporter_ != nullptr &&
|
||||
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
|
||||
reporter_->Corruption(bytes, reason);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
|
||||
while (true) {
|
||||
if (buffer_.size() < (size_t)kHeaderSize) {
|
||||
if (!eof_) {
|
||||
// Last read was a full read, so this is a trailer to skip
|
||||
buffer_.clear();
|
||||
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
|
||||
end_of_buffer_offset_ += buffer_.size();
|
||||
if (!status.ok()) {
|
||||
buffer_.clear();
|
||||
ReportDrop(kBlockSize, status);
|
||||
eof_ = true;
|
||||
return kEof;
|
||||
} else if (buffer_.size() < (size_t)kBlockSize) {
|
||||
eof_ = true;
|
||||
}
|
||||
continue;
|
||||
} else if (buffer_.size() == 0) {
|
||||
// End of file
|
||||
return kEof;
|
||||
} else {
|
||||
size_t drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
ReportCorruption(drop_size, "truncated record at end of file");
|
||||
return kEof;
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the header
|
||||
const char* header = buffer_.data();
|
||||
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
|
||||
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
|
||||
const unsigned int type = header[6];
|
||||
const uint32_t length = a | (b << 8);
|
||||
if (kHeaderSize + length > buffer_.size()) {
|
||||
size_t drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
ReportCorruption(drop_size, "bad record length");
|
||||
return kBadRecord;
|
||||
}
|
||||
|
||||
if (type == kZeroType && length == 0) {
|
||||
// Skip zero length record without reporting any drops since
|
||||
// such records are produced by the mmap based writing code in
|
||||
// env_posix.cc that preallocates file regions.
|
||||
buffer_.clear();
|
||||
return kBadRecord;
|
||||
}
|
||||
|
||||
// Check crc
|
||||
if (checksum_) {
|
||||
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
|
||||
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
|
||||
if (actual_crc != expected_crc) {
|
||||
// Drop the rest of the buffer since "length" itself may have
|
||||
// been corrupted and if we trust it, we could find some
|
||||
// fragment of a real log record that just happens to look
|
||||
// like a valid log record.
|
||||
size_t drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
ReportCorruption(drop_size, "checksum mismatch");
|
||||
return kBadRecord;
|
||||
}
|
||||
}
|
||||
|
||||
buffer_.remove_prefix(kHeaderSize + length);
|
||||
|
||||
// Skip physical record that started before initial_offset_
|
||||
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
|
||||
initial_offset_) {
|
||||
result->clear();
|
||||
return kBadRecord;
|
||||
}
|
||||
|
||||
*result = Slice(header + kHeaderSize, length);
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
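ReadRecord() above reassembles logical records from kFirstType/kMiddleType/kLastType fragments. The following standalone sketch walks that state machine over a made-up fragment stream; the enum values mirror log_format.h, everything else is illustrative.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

enum RecordType { kFullType = 1, kFirstType = 2, kMiddleType = 3, kLastType = 4 };

int main() {
  // A made-up sequence of physical records, as a writer might have emitted them.
  const std::vector<std::pair<RecordType, std::string>> physical = {
      {kFirstType, "a long reco"},
      {kMiddleType, "rd split ac"},
      {kLastType, "ross blocks"},
      {kFullType, "a small record"}};

  std::string scratch;
  bool in_fragmented_record = false;
  for (const auto& rec : physical) {
    switch (rec.first) {
      case kFullType:                    // complete record in one fragment
        printf("record: %s\n", rec.second.c_str());
        break;
      case kFirstType:                   // start buffering a fragmented record
        scratch = rec.second;
        in_fragmented_record = true;
        break;
      case kMiddleType:                  // keep appending
        if (in_fragmented_record) scratch += rec.second;
        break;
      case kLastType:                    // final fragment: emit the whole record
        if (in_fragmented_record) {
          scratch += rec.second;
          printf("record: %s\n", scratch.c_str());
          in_fragmented_record = false;
        }
        break;
    }
  }
  return 0;
}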
db/log_reader.h (new file, 124 lines)
@@ -0,0 +1,124 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <memory>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "db/log_format.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class SequentialFile;
|
||||
using std::unique_ptr;
|
||||
|
||||
namespace log {
|
||||
|
||||
class Reader {
|
||||
public:
|
||||
// Interface for reporting errors.
|
||||
class Reporter {
|
||||
public:
|
||||
virtual ~Reporter();
|
||||
|
||||
// Some corruption was detected. "size" is the approximate number
|
||||
// of bytes dropped due to the corruption.
|
||||
virtual void Corruption(size_t bytes, const Status& status) = 0;
|
||||
};
|
||||
|
||||
// Create a reader that will return log records from "*file".
|
||||
// "*file" must remain live while this Reader is in use.
|
||||
//
|
||||
// If "reporter" is non-nullptr, it is notified whenever some data is
|
||||
// dropped due to a detected corruption. "*reporter" must remain
|
||||
// live while this Reader is in use.
|
||||
//
|
||||
// If "checksum" is true, verify checksums if available.
|
||||
//
|
||||
// The Reader will start reading at the first record located at physical
|
||||
// position >= initial_offset within the file.
|
||||
Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
|
||||
bool checksum, uint64_t initial_offset);
|
||||
|
||||
~Reader();
|
||||
|
||||
// Read the next record into *record. Returns true if read
|
||||
// successfully, false if we hit end of the input. May use
|
||||
// "*scratch" as temporary storage. The contents filled in *record
|
||||
// will only be valid until the next mutating operation on this
|
||||
// reader or the next mutation to *scratch.
|
||||
bool ReadRecord(Slice* record, std::string* scratch);
|
||||
|
||||
// Returns the physical offset of the last record returned by ReadRecord.
|
||||
//
|
||||
// Undefined before the first call to ReadRecord.
|
||||
uint64_t LastRecordOffset();
|
||||
|
||||
// Returns true if the reader has encountered an eof condition.
|
||||
bool IsEOF() {
|
||||
return eof_;
|
||||
}
|
||||
|
||||
// When we know more data has been written to the file, we can use this
|
||||
// function to force the reader to look again in the file.
|
||||
void UnmarkEOF() {
|
||||
eof_ = false;
|
||||
}
|
||||
|
||||
SequentialFile* file() { return file_.get(); }
|
||||
|
||||
private:
|
||||
const unique_ptr<SequentialFile> file_;
|
||||
Reporter* const reporter_;
|
||||
bool const checksum_;
|
||||
char* const backing_store_;
|
||||
Slice buffer_;
|
||||
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
|
||||
|
||||
// Offset of the last record returned by ReadRecord.
|
||||
uint64_t last_record_offset_;
|
||||
// Offset of the first location past the end of buffer_.
|
||||
uint64_t end_of_buffer_offset_;
|
||||
|
||||
// Offset at which to start looking for the first record to return
|
||||
uint64_t const initial_offset_;
|
||||
|
||||
// Extend record types with the following special values
|
||||
enum {
|
||||
kEof = kMaxRecordType + 1,
|
||||
// Returned whenever we find an invalid physical record.
|
||||
// Currently there are three situations in which this happens:
|
||||
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
|
||||
// * The record is a 0-length record (No drop is reported)
|
||||
// * The record is below constructor's initial_offset (No drop is reported)
|
||||
kBadRecord = kMaxRecordType + 2
|
||||
};
|
||||
|
||||
// Skips all blocks that are completely before "initial_offset_".
|
||||
//
|
||||
// Returns true on success. Handles reporting.
|
||||
bool SkipToInitialBlock();
|
||||
|
||||
// Return type, or one of the preceding special values
|
||||
unsigned int ReadPhysicalRecord(Slice* result);
|
||||
|
||||
// Reports dropped bytes to the reporter.
|
||||
// buffer_ must be updated to remove the dropped bytes prior to invocation.
|
||||
void ReportCorruption(size_t bytes, const char* reason);
|
||||
void ReportDrop(size_t bytes, const Status& reason);
|
||||
|
||||
// No copying allowed
|
||||
Reader(const Reader&);
|
||||
void operator=(const Reader&);
|
||||
};
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
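For orientation, a minimal sketch of how this Reader is typically driven once a SequentialFile has been opened; the helper name and the counting loop are illustrative assumptions, not part of this change.

#include <memory>
#include <string>

#include "db/log_reader.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"

// Visits every logical record in an already-opened log file and returns the
// number of records read. Corruptions are routed to the supplied reporter.
size_t CountLogRecords(std::unique_ptr<rocksdb::SequentialFile>&& file,
                       rocksdb::log::Reader::Reporter* reporter) {
  // checksum=true verifies CRCs; initial_offset=0 starts at the front.
  rocksdb::log::Reader reader(std::move(file), reporter,
                              true /*checksum*/, 0 /*initial_offset*/);
  rocksdb::Slice record;
  std::string scratch;
  size_t n = 0;
  while (reader.ReadRecord(&record, &scratch)) {
    // "record" is only valid until the next ReadRecord() call or the next
    // mutation of "scratch", as documented above.
    ++n;
  }
  return n;
}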
|
528
db/log_test.cc
Normal file
@ -0,0 +1,528 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_reader.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
#include "util/random.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
// Construct a string of the specified length made out of the supplied
|
||||
// partial string.
|
||||
static std::string BigString(const std::string& partial_string, size_t n) {
|
||||
std::string result;
|
||||
while (result.size() < n) {
|
||||
result.append(partial_string);
|
||||
}
|
||||
result.resize(n);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Construct a string from a number
|
||||
static std::string NumberString(int n) {
|
||||
char buf[50];
|
||||
snprintf(buf, sizeof(buf), "%d.", n);
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
// Return a skewed potentially long string
|
||||
static std::string RandomSkewedString(int i, Random* rnd) {
|
||||
return BigString(NumberString(i), rnd->Skewed(17));
|
||||
}
|
||||
|
||||
class LogTest {
|
||||
private:
|
||||
class StringDest : public WritableFile {
|
||||
public:
|
||||
std::string contents_;
|
||||
|
||||
virtual Status Close() { return Status::OK(); }
|
||||
virtual Status Flush() { return Status::OK(); }
|
||||
virtual Status Sync() { return Status::OK(); }
|
||||
virtual Status Append(const Slice& slice) {
|
||||
contents_.append(slice.data(), slice.size());
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
class StringSource : public SequentialFile {
|
||||
public:
|
||||
Slice contents_;
|
||||
bool force_error_;
|
||||
bool returned_partial_;
|
||||
StringSource() : force_error_(false), returned_partial_(false) { }
|
||||
|
||||
virtual Status Read(size_t n, Slice* result, char* scratch) {
|
||||
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
|
||||
|
||||
if (force_error_) {
|
||||
force_error_ = false;
|
||||
returned_partial_ = true;
|
||||
return Status::Corruption("read error");
|
||||
}
|
||||
|
||||
if (contents_.size() < n) {
|
||||
n = contents_.size();
|
||||
returned_partial_ = true;
|
||||
}
|
||||
*result = Slice(contents_.data(), n);
|
||||
contents_.remove_prefix(n);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status Skip(uint64_t n) {
|
||||
if (n > contents_.size()) {
|
||||
contents_.clear();
|
||||
return Status::NotFound("in-memory file skipped past end");
|
||||
}
|
||||
|
||||
contents_.remove_prefix(n);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
class ReportCollector : public Reader::Reporter {
|
||||
public:
|
||||
size_t dropped_bytes_;
|
||||
std::string message_;
|
||||
|
||||
ReportCollector() : dropped_bytes_(0) { }
|
||||
virtual void Corruption(size_t bytes, const Status& status) {
|
||||
dropped_bytes_ += bytes;
|
||||
message_.append(status.ToString());
|
||||
}
|
||||
};
|
||||
|
||||
std::string& dest_contents() {
|
||||
auto dest = dynamic_cast<StringDest*>(writer_.file());
|
||||
assert(dest);
|
||||
return dest->contents_;
|
||||
}
|
||||
|
||||
const std::string& dest_contents() const {
|
||||
auto dest = dynamic_cast<const StringDest*>(writer_.file());
|
||||
assert(dest);
|
||||
return dest->contents_;
|
||||
}
|
||||
|
||||
void reset_source_contents() {
|
||||
auto src = dynamic_cast<StringSource*>(reader_.file());
|
||||
assert(src);
|
||||
src->contents_ = dest_contents();
|
||||
}
|
||||
|
||||
unique_ptr<StringDest> dest_holder_;
|
||||
unique_ptr<StringSource> source_holder_;
|
||||
ReportCollector report_;
|
||||
bool reading_;
|
||||
Writer writer_;
|
||||
Reader reader_;
|
||||
|
||||
// Record metadata for testing initial offset functionality
|
||||
static size_t initial_offset_record_sizes_[];
|
||||
static uint64_t initial_offset_last_record_offsets_[];
|
||||
|
||||
public:
|
||||
LogTest() : dest_holder_(new StringDest),
|
||||
source_holder_(new StringSource),
|
||||
reading_(false),
|
||||
writer_(std::move(dest_holder_)),
|
||||
reader_(std::move(source_holder_), &report_, true/*checksum*/,
|
||||
0/*initial_offset*/) {
|
||||
}
|
||||
|
||||
void Write(const std::string& msg) {
|
||||
ASSERT_TRUE(!reading_) << "Write() after starting to read";
|
||||
writer_.AddRecord(Slice(msg));
|
||||
}
|
||||
|
||||
size_t WrittenBytes() const {
|
||||
return dest_contents().size();
|
||||
}
|
||||
|
||||
std::string Read() {
|
||||
if (!reading_) {
|
||||
reading_ = true;
|
||||
reset_source_contents();
|
||||
}
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
if (reader_.ReadRecord(&record, &scratch)) {
|
||||
return record.ToString();
|
||||
} else {
|
||||
return "EOF";
|
||||
}
|
||||
}
|
||||
|
||||
void IncrementByte(int offset, int delta) {
|
||||
dest_contents()[offset] += delta;
|
||||
}
|
||||
|
||||
void SetByte(int offset, char new_byte) {
|
||||
dest_contents()[offset] = new_byte;
|
||||
}
|
||||
|
||||
void ShrinkSize(int bytes) {
|
||||
dest_contents().resize(dest_contents().size() - bytes);
|
||||
}
|
||||
|
||||
void FixChecksum(int header_offset, int len) {
|
||||
// Compute crc of type/len/data
|
||||
uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
|
||||
crc = crc32c::Mask(crc);
|
||||
EncodeFixed32(&dest_contents()[header_offset], crc);
|
||||
}
|
||||
|
||||
void ForceError() {
|
||||
auto src = dynamic_cast<StringSource*>(reader_.file());
|
||||
src->force_error_ = true;
|
||||
}
|
||||
|
||||
size_t DroppedBytes() const {
|
||||
return report_.dropped_bytes_;
|
||||
}
|
||||
|
||||
std::string ReportMessage() const {
|
||||
return report_.message_;
|
||||
}
|
||||
|
||||
// Returns OK iff recorded error message contains "msg"
|
||||
std::string MatchError(const std::string& msg) const {
|
||||
if (report_.message_.find(msg) == std::string::npos) {
|
||||
return report_.message_;
|
||||
} else {
|
||||
return "OK";
|
||||
}
|
||||
}
|
||||
|
||||
void WriteInitialOffsetLog() {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
std::string record(initial_offset_record_sizes_[i],
|
||||
static_cast<char>('a' + i));
|
||||
Write(record);
|
||||
}
|
||||
}
|
||||
|
||||
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
|
||||
WriteInitialOffsetLog();
|
||||
reading_ = true;
|
||||
unique_ptr<StringSource> source(new StringSource);
|
||||
source->contents_ = dest_contents();
|
||||
unique_ptr<Reader> offset_reader(
|
||||
new Reader(std::move(source), &report_, true/*checksum*/,
|
||||
WrittenBytes() + offset_past_end));
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
|
||||
}
|
||||
|
||||
void CheckInitialOffsetRecord(uint64_t initial_offset,
|
||||
int expected_record_offset) {
|
||||
WriteInitialOffsetLog();
|
||||
reading_ = true;
|
||||
unique_ptr<StringSource> source(new StringSource);
|
||||
source->contents_ = dest_contents();
|
||||
unique_ptr<Reader> offset_reader(
|
||||
new Reader(std::move(source), &report_, true/*checksum*/,
|
||||
initial_offset));
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
|
||||
ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
|
||||
record.size());
|
||||
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
|
||||
offset_reader->LastRecordOffset());
|
||||
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
size_t LogTest::initial_offset_record_sizes_[] =
|
||||
{10000, // Two sizable records in first block
|
||||
10000,
|
||||
2 * log::kBlockSize - 1000, // Span three blocks
|
||||
1};
|
||||
|
||||
uint64_t LogTest::initial_offset_last_record_offsets_[] =
|
||||
{0,
|
||||
kHeaderSize + 10000,
|
||||
2 * (kHeaderSize + 10000),
|
||||
2 * (kHeaderSize + 10000) +
|
||||
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
|
||||
|
||||
|
||||
TEST(LogTest, Empty) {
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadWrite) {
|
||||
Write("foo");
|
||||
Write("bar");
|
||||
Write("");
|
||||
Write("xxxx");
|
||||
ASSERT_EQ("foo", Read());
|
||||
ASSERT_EQ("bar", Read());
|
||||
ASSERT_EQ("", Read());
|
||||
ASSERT_EQ("xxxx", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
|
||||
}
|
||||
|
||||
TEST(LogTest, ManyBlocks) {
|
||||
for (int i = 0; i < 100000; i++) {
|
||||
Write(NumberString(i));
|
||||
}
|
||||
for (int i = 0; i < 100000; i++) {
|
||||
ASSERT_EQ(NumberString(i), Read());
|
||||
}
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, Fragmentation) {
|
||||
Write("small");
|
||||
Write(BigString("medium", 50000));
|
||||
Write(BigString("large", 100000));
|
||||
ASSERT_EQ("small", Read());
|
||||
ASSERT_EQ(BigString("medium", 50000), Read());
|
||||
ASSERT_EQ(BigString("large", 100000), Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, MarginalTrailer) {
|
||||
// Make a trailer that is exactly the same length as an empty record.
|
||||
const int n = kBlockSize - 2*kHeaderSize;
|
||||
Write(BigString("foo", n));
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
|
||||
Write("");
|
||||
Write("bar");
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
ASSERT_EQ("", Read());
|
||||
ASSERT_EQ("bar", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, MarginalTrailer2) {
|
||||
// Make a trailer that is exactly the same length as an empty record.
|
||||
const int n = kBlockSize - 2*kHeaderSize;
|
||||
Write(BigString("foo", n));
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
|
||||
Write("bar");
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
ASSERT_EQ("bar", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(0U, DroppedBytes());
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
}
|
||||
|
||||
TEST(LogTest, ShortTrailer) {
|
||||
const int n = kBlockSize - 2*kHeaderSize + 4;
|
||||
Write(BigString("foo", n));
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
|
||||
Write("");
|
||||
Write("bar");
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
ASSERT_EQ("", Read());
|
||||
ASSERT_EQ("bar", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, AlignedEof) {
|
||||
const int n = kBlockSize - 2*kHeaderSize + 4;
|
||||
Write(BigString("foo", n));
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST(LogTest, RandomRead) {
|
||||
const int N = 500;
|
||||
Random write_rnd(301);
|
||||
for (int i = 0; i < N; i++) {
|
||||
Write(RandomSkewedString(i, &write_rnd));
|
||||
}
|
||||
Random read_rnd(301);
|
||||
for (int i = 0; i < N; i++) {
|
||||
ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
|
||||
}
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
// Tests of all the error paths in log_reader.cc follow:
|
||||
|
||||
TEST(LogTest, ReadError) {
|
||||
Write("foo");
|
||||
ForceError();
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("read error"));
|
||||
}
|
||||
|
||||
TEST(LogTest, BadRecordType) {
|
||||
Write("foo");
|
||||
// Type is stored in header[6]
|
||||
IncrementByte(6, 100);
|
||||
FixChecksum(0, 3);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("unknown record type"));
|
||||
}
|
||||
|
||||
TEST(LogTest, TruncatedTrailingRecord) {
|
||||
Write("foo");
|
||||
ShrinkSize(4); // Drop all payload as well as a header byte
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ((unsigned int)(kHeaderSize - 1), DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
|
||||
}
|
||||
|
||||
TEST(LogTest, BadLength) {
|
||||
Write("foo");
|
||||
ShrinkSize(1);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ((unsigned int)(kHeaderSize + 2), DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("bad record length"));
|
||||
}
|
||||
|
||||
TEST(LogTest, ChecksumMismatch) {
|
||||
Write("foo");
|
||||
IncrementByte(0, 10);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(10U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("checksum mismatch"));
|
||||
}
|
||||
|
||||
TEST(LogTest, UnexpectedMiddleType) {
|
||||
Write("foo");
|
||||
SetByte(6, kMiddleType);
|
||||
FixChecksum(0, 3);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("missing start"));
|
||||
}
|
||||
|
||||
TEST(LogTest, UnexpectedLastType) {
|
||||
Write("foo");
|
||||
SetByte(6, kLastType);
|
||||
FixChecksum(0, 3);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("missing start"));
|
||||
}
|
||||
|
||||
TEST(LogTest, UnexpectedFullType) {
|
||||
Write("foo");
|
||||
Write("bar");
|
||||
SetByte(6, kFirstType);
|
||||
FixChecksum(0, 3);
|
||||
ASSERT_EQ("bar", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("partial record without end"));
|
||||
}
|
||||
|
||||
TEST(LogTest, UnexpectedFirstType) {
|
||||
Write("foo");
|
||||
Write(BigString("bar", 100000));
|
||||
SetByte(6, kFirstType);
|
||||
FixChecksum(0, 3);
|
||||
ASSERT_EQ(BigString("bar", 100000), Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("partial record without end"));
|
||||
}
|
||||
|
||||
TEST(LogTest, ErrorJoinsRecords) {
|
||||
// Consider two fragmented records:
|
||||
// first(R1) last(R1) first(R2) last(R2)
|
||||
// where the middle two fragments disappear. We do not want
|
||||
// first(R1),last(R2) to get joined and returned as a valid record.
|
||||
|
||||
// Write records that span two blocks
|
||||
Write(BigString("foo", kBlockSize));
|
||||
Write(BigString("bar", kBlockSize));
|
||||
Write("correct");
|
||||
|
||||
// Wipe the middle block
|
||||
for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
|
||||
SetByte(offset, 'x');
|
||||
}
|
||||
|
||||
ASSERT_EQ("correct", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
const unsigned int dropped = DroppedBytes();
|
||||
ASSERT_LE(dropped, 2*kBlockSize + 100);
|
||||
ASSERT_GE(dropped, 2*kBlockSize);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadStart) {
|
||||
CheckInitialOffsetRecord(0, 0);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadSecondOneOff) {
|
||||
CheckInitialOffsetRecord(1, 1);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadSecondTenThousand) {
|
||||
CheckInitialOffsetRecord(10000, 1);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadSecondStart) {
|
||||
CheckInitialOffsetRecord(10007, 1);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadThirdOneOff) {
|
||||
CheckInitialOffsetRecord(10008, 2);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadThirdStart) {
|
||||
CheckInitialOffsetRecord(20014, 2);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadFourthOneOff) {
|
||||
CheckInitialOffsetRecord(20015, 3);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadFourthFirstBlockTrailer) {
|
||||
CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadFourthMiddleBlock) {
|
||||
CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadFourthLastBlock) {
|
||||
CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadFourthStart) {
|
||||
CheckInitialOffsetRecord(
|
||||
2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
|
||||
3);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadEnd) {
|
||||
CheckOffsetPastEndReturnsNoRecords(0);
|
||||
}
|
||||
|
||||
TEST(LogTest, ReadPastEnd) {
|
||||
CheckOffsetPastEndReturnsNoRecords(5);
|
||||
}
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
108
db/log_writer.cc
Normal file
@ -0,0 +1,108 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/log_writer.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
Writer::Writer(unique_ptr<WritableFile>&& dest)
|
||||
: dest_(std::move(dest)),
|
||||
block_offset_(0) {
|
||||
for (int i = 0; i <= kMaxRecordType; i++) {
|
||||
char t = static_cast<char>(i);
|
||||
type_crc_[i] = crc32c::Value(&t, 1);
|
||||
}
|
||||
}
|
||||
|
||||
Writer::~Writer() {
|
||||
}
|
||||
|
||||
Status Writer::AddRecord(const Slice& slice) {
|
||||
const char* ptr = slice.data();
|
||||
size_t left = slice.size();
|
||||
|
||||
// Fragment the record if necessary and emit it. Note that if slice
|
||||
// is empty, we still want to iterate once to emit a single
|
||||
// zero-length record
|
||||
Status s;
|
||||
bool begin = true;
|
||||
do {
|
||||
const int leftover = kBlockSize - block_offset_;
|
||||
assert(leftover >= 0);
|
||||
if (leftover < kHeaderSize) {
|
||||
// Switch to a new block
|
||||
if (leftover > 0) {
|
||||
// Fill the trailer (literal below relies on kHeaderSize being 7)
|
||||
assert(kHeaderSize == 7);
|
||||
dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
|
||||
}
|
||||
block_offset_ = 0;
|
||||
}
|
||||
|
||||
// Invariant: we never leave < kHeaderSize bytes in a block.
|
||||
assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
|
||||
|
||||
const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
|
||||
const size_t fragment_length = (left < avail) ? left : avail;
|
||||
|
||||
RecordType type;
|
||||
const bool end = (left == fragment_length);
|
||||
if (begin && end) {
|
||||
type = kFullType;
|
||||
} else if (begin) {
|
||||
type = kFirstType;
|
||||
} else if (end) {
|
||||
type = kLastType;
|
||||
} else {
|
||||
type = kMiddleType;
|
||||
}
|
||||
|
||||
s = EmitPhysicalRecord(type, ptr, fragment_length);
|
||||
ptr += fragment_length;
|
||||
left -= fragment_length;
|
||||
begin = false;
|
||||
} while (s.ok() && left > 0);
|
||||
return s;
|
||||
}
|
||||
|
||||
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
|
||||
assert(n <= 0xffff); // Must fit in two bytes
|
||||
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
|
||||
|
||||
// Format the header
|
||||
char buf[kHeaderSize];
|
||||
buf[4] = static_cast<char>(n & 0xff);
|
||||
buf[5] = static_cast<char>(n >> 8);
|
||||
buf[6] = static_cast<char>(t);
|
||||
|
||||
// Compute the crc of the record type and the payload.
|
||||
uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
|
||||
crc = crc32c::Mask(crc); // Adjust for storage
|
||||
EncodeFixed32(buf, crc);
|
||||
|
||||
// Write the header and the payload
|
||||
Status s = dest_->Append(Slice(buf, kHeaderSize));
|
||||
if (s.ok()) {
|
||||
s = dest_->Append(Slice(ptr, n));
|
||||
if (s.ok()) {
|
||||
s = dest_->Flush();
|
||||
}
|
||||
}
|
||||
block_offset_ += kHeaderSize + n;
|
||||
return s;
|
||||
}
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
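As a worked illustration of the framing above: EmitPhysicalRecord stores a 7-byte header per physical record -- a 4-byte masked CRC over the type byte and payload, a 2-byte little-endian length, and the 1-byte type -- and ReadPhysicalRecord reassembles the length as a | (b << 8). A tiny self-contained sketch of just that length field (standard library only; not RocksDB code):

#include <cassert>
#include <cstdint>

// Length is stored little-endian in header bytes [4] and [5] (see buf[4]/buf[5]
// in EmitPhysicalRecord and a | (b << 8) in ReadPhysicalRecord).
inline void EncodeLength(char* header, uint16_t n) {
  header[4] = static_cast<char>(n & 0xff);
  header[5] = static_cast<char>(n >> 8);
}

inline uint32_t DecodeLength(const char* header) {
  const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
  const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
  return a | (b << 8);
}

int main() {
  char header[7] = {};
  EncodeLength(header, 3);          // e.g. a 3-byte payload such as "foo"
  assert(DecodeLength(header) == 3);
  return 0;
}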
|
55
db/log_writer.h
Normal file
@ -0,0 +1,55 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <memory>
|
||||
#include <stdint.h>
|
||||
#include "db/log_format.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class WritableFile;
|
||||
|
||||
using std::unique_ptr;
|
||||
|
||||
namespace log {
|
||||
|
||||
class Writer {
|
||||
public:
|
||||
// Create a writer that will append data to "*dest".
|
||||
// "*dest" must be initially empty.
|
||||
// "*dest" must remain live while this Writer is in use.
|
||||
explicit Writer(unique_ptr<WritableFile>&& dest);
|
||||
~Writer();
|
||||
|
||||
Status AddRecord(const Slice& slice);
|
||||
|
||||
WritableFile* file() { return dest_.get(); }
|
||||
const WritableFile* file() const { return dest_.get(); }
|
||||
|
||||
private:
|
||||
unique_ptr<WritableFile> dest_;
|
||||
int block_offset_; // Current offset in block
|
||||
|
||||
// crc32c values for all supported record types. These are
|
||||
// pre-computed to reduce the overhead of computing the crc of the
|
||||
// record type stored in the header.
|
||||
uint32_t type_crc_[kMaxRecordType + 1];
|
||||
|
||||
Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
|
||||
|
||||
// No copying allowed
|
||||
Writer(const Writer&);
|
||||
void operator=(const Writer&);
|
||||
};
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
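For context, a minimal writer-side sketch mirroring how LogTest drives Writer above; the function name is hypothetical and error handling beyond status propagation is elided.

#include <memory>

#include "db/log_writer.h"
#include "rocksdb/env.h"

// Appends two logical records to a freshly-opened, initially empty file.
rocksdb::Status AppendTwoRecords(std::unique_ptr<rocksdb::WritableFile>&& dest) {
  rocksdb::log::Writer writer(std::move(dest));
  // Each AddRecord() call becomes one logical record; records larger than a
  // block are fragmented into kFirst/kMiddle/kLast physical records.
  rocksdb::Status s = writer.AddRecord(rocksdb::Slice("foo"));
  if (s.ok()) {
    s = writer.AddRecord(rocksdb::Slice("bar"));
  }
  return s;
}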
|
330
db/memtable.cc
Normal file
@ -0,0 +1,330 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/memtable.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/murmurhash.h"
|
||||
|
||||
namespace std {
|
||||
template <>
|
||||
struct hash<rocksdb::Slice> {
|
||||
size_t operator()(const rocksdb::Slice& slice) const {
|
||||
return MurmurHash(slice.data(), slice.size(), 0);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
MemTable::MemTable(const InternalKeyComparator& cmp,
|
||||
std::shared_ptr<MemTableRepFactory> table_factory,
|
||||
int numlevel,
|
||||
const Options& options)
|
||||
: comparator_(cmp),
|
||||
refs_(0),
|
||||
arena_impl_(options.arena_block_size),
|
||||
table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)),
|
||||
flush_in_progress_(false),
|
||||
flush_completed_(false),
|
||||
file_number_(0),
|
||||
edit_(numlevel),
|
||||
first_seqno_(0),
|
||||
mem_next_logfile_number_(0),
|
||||
mem_logfile_number_(0),
|
||||
locks_(options.inplace_update_support
|
||||
? options.inplace_update_num_locks
|
||||
: 0) { }
|
||||
|
||||
MemTable::~MemTable() {
|
||||
assert(refs_ == 0);
|
||||
}
|
||||
|
||||
size_t MemTable::ApproximateMemoryUsage() {
|
||||
return arena_impl_.ApproximateMemoryUsage() +
|
||||
table_->ApproximateMemoryUsage();
|
||||
}
|
||||
|
||||
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
|
||||
const {
|
||||
// Internal keys are encoded as length-prefixed strings.
|
||||
Slice a = GetLengthPrefixedSlice(aptr);
|
||||
Slice b = GetLengthPrefixedSlice(bptr);
|
||||
return comparator.Compare(a, b);
|
||||
}
|
||||
|
||||
Slice MemTableRep::UserKey(const char* key) const {
|
||||
Slice slice = GetLengthPrefixedSlice(key);
|
||||
return Slice(slice.data(), slice.size() - 8);
|
||||
}
|
||||
|
||||
// Encode a suitable internal key target for "target" and return it.
|
||||
// Uses *scratch as scratch space, and the returned pointer will point
|
||||
// into this scratch space.
|
||||
static const char* EncodeKey(std::string* scratch, const Slice& target) {
|
||||
scratch->clear();
|
||||
PutVarint32(scratch, target.size());
|
||||
scratch->append(target.data(), target.size());
|
||||
return scratch->data();
|
||||
}
|
||||
|
||||
class MemTableIterator: public Iterator {
|
||||
public:
|
||||
MemTableIterator(MemTableRep* table, const ReadOptions& options)
|
||||
: iter_() {
|
||||
if (options.prefix) {
|
||||
iter_ = table->GetPrefixIterator(*options.prefix);
|
||||
} else if (options.prefix_seek) {
|
||||
iter_ = table->GetDynamicPrefixIterator();
|
||||
} else {
|
||||
iter_ = table->GetIterator();
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool Valid() const { return iter_->Valid(); }
|
||||
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
|
||||
virtual void SeekToFirst() { iter_->SeekToFirst(); }
|
||||
virtual void SeekToLast() { iter_->SeekToLast(); }
|
||||
virtual void Next() { iter_->Next(); }
|
||||
virtual void Prev() { iter_->Prev(); }
|
||||
virtual Slice key() const {
|
||||
return GetLengthPrefixedSlice(iter_->key());
|
||||
}
|
||||
virtual Slice value() const {
|
||||
Slice key_slice = GetLengthPrefixedSlice(iter_->key());
|
||||
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
|
||||
}
|
||||
|
||||
virtual Status status() const { return Status::OK(); }
|
||||
|
||||
private:
|
||||
std::shared_ptr<MemTableRep::Iterator> iter_;
|
||||
std::string tmp_; // For passing to EncodeKey
|
||||
|
||||
// No copying allowed
|
||||
MemTableIterator(const MemTableIterator&);
|
||||
void operator=(const MemTableIterator&);
|
||||
};
|
||||
|
||||
Iterator* MemTable::NewIterator(const ReadOptions& options) {
|
||||
return new MemTableIterator(table_.get(), options);
|
||||
}
|
||||
|
||||
port::RWMutex* MemTable::GetLock(const Slice& key) {
|
||||
return &locks_[std::hash<Slice>()(key) % locks_.size()];
|
||||
}
|
||||
|
||||
void MemTable::Add(SequenceNumber s, ValueType type,
|
||||
const Slice& key,
|
||||
const Slice& value) {
|
||||
// Format of an entry is concatenation of:
|
||||
// key_size : varint32 of internal_key.size()
|
||||
// key bytes : char[internal_key.size()]
|
||||
// value_size : varint32 of value.size()
|
||||
// value bytes : char[value.size()]
|
||||
size_t key_size = key.size();
|
||||
size_t val_size = value.size();
|
||||
size_t internal_key_size = key_size + 8;
|
||||
const size_t encoded_len =
|
||||
VarintLength(internal_key_size) + internal_key_size +
|
||||
VarintLength(val_size) + val_size;
|
||||
char* buf = arena_impl_.Allocate(encoded_len);
|
||||
char* p = EncodeVarint32(buf, internal_key_size);
|
||||
memcpy(p, key.data(), key_size);
|
||||
p += key_size;
|
||||
EncodeFixed64(p, (s << 8) | type);
|
||||
p += 8;
|
||||
p = EncodeVarint32(p, val_size);
|
||||
memcpy(p, value.data(), val_size);
|
||||
assert((p + val_size) - buf == (unsigned)encoded_len);
|
||||
table_->Insert(buf);
|
||||
|
||||
// The first sequence number inserted into the memtable
|
||||
assert(first_seqno_ == 0 || s > first_seqno_);
|
||||
if (first_seqno_ == 0) {
|
||||
first_seqno_ = s;
|
||||
}
|
||||
}
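To make the entry layout built by Add() concrete: for a one-byte user key, sequence number 5, type kTypeValue, and a one-byte value, the encoding is varint32(9), the key byte, the 8-byte fixed tag (5 << 8 | kTypeValue), varint32(1), and the value byte -- 12 bytes in total. A small standalone check of that size arithmetic (assumes the varint32 prefixes fit in one byte, i.e. lengths below 128, as VarintLength would compute):

#include <cassert>
#include <cstddef>

// Mirrors the encoded_len computation in MemTable::Add for small keys/values
// whose varint32 length prefixes fit in a single byte (< 128).
inline size_t SmallEntrySize(size_t key_size, size_t val_size) {
  const size_t internal_key_size = key_size + 8;   // user key + 8-byte tag
  return 1 /*varint32(internal_key_size)*/ + internal_key_size +
         1 /*varint32(val_size)*/ + val_size;
}

int main() {
  assert(SmallEntrySize(1, 1) == 12);   // the one-byte key/value example above
  return 0;
}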
|
||||
|
||||
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
|
||||
std::deque<std::string>* operands, const Options& options) {
|
||||
Slice memkey = key.memtable_key();
|
||||
std::shared_ptr<MemTableRep::Iterator> iter(
|
||||
table_->GetIterator(key.user_key()));
|
||||
iter->Seek(memkey.data());
|
||||
|
||||
// It is the caller's responsibility to allocate/delete operands list
|
||||
assert(operands != nullptr);
|
||||
|
||||
bool merge_in_progress = s->IsMergeInProgress();
|
||||
auto merge_operator = options.merge_operator.get();
|
||||
auto logger = options.info_log;
|
||||
std::string merge_result;
|
||||
|
||||
for (; iter->Valid(); iter->Next()) {
|
||||
// entry format is:
|
||||
// klength varint32
|
||||
// userkey char[klength-8]
|
||||
// tag uint64
|
||||
// vlength varint32
|
||||
// value char[vlength]
|
||||
// Check that it belongs to same user key. We do not check the
|
||||
// sequence number since the Seek() call above should have skipped
|
||||
// all entries with overly large sequence numbers.
|
||||
const char* entry = iter->key();
|
||||
uint32_t key_length;
|
||||
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
||||
if (comparator_.comparator.user_comparator()->Compare(
|
||||
Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
|
||||
// Correct user key
|
||||
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
||||
switch (static_cast<ValueType>(tag & 0xff)) {
|
||||
case kTypeValue: {
|
||||
if (options.inplace_update_support) {
|
||||
GetLock(key.user_key())->ReadLock();
|
||||
}
|
||||
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
||||
*s = Status::OK();
|
||||
if (merge_in_progress) {
|
||||
assert(merge_operator);
|
||||
if (!merge_operator->FullMerge(key.user_key(), &v, *operands,
|
||||
value, logger.get())) {
|
||||
RecordTick(options.statistics, NUMBER_MERGE_FAILURES);
|
||||
*s = Status::Corruption("Error: Could not perform merge.");
|
||||
}
|
||||
} else {
|
||||
value->assign(v.data(), v.size());
|
||||
}
|
||||
if (options.inplace_update_support) {
|
||||
GetLock(key.user_key())->Unlock();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case kTypeDeletion: {
|
||||
if (merge_in_progress) {
|
||||
assert(merge_operator);
|
||||
*s = Status::OK();
|
||||
if (!merge_operator->FullMerge(key.user_key(), nullptr, *operands,
|
||||
value, logger.get())) {
|
||||
RecordTick(options.statistics, NUMBER_MERGE_FAILURES);
|
||||
*s = Status::Corruption("Error: Could not perform merge.");
|
||||
}
|
||||
} else {
|
||||
*s = Status::NotFound(Slice());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case kTypeMerge: {
|
||||
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
|
||||
merge_in_progress = true;
|
||||
operands->push_front(v.ToString());
|
||||
while(operands->size() >= 2) {
|
||||
// Attempt an associative merge. (Returns true if successful)
|
||||
if (merge_operator->PartialMerge(key.user_key(),
|
||||
Slice((*operands)[0]),
|
||||
Slice((*operands)[1]),
|
||||
&merge_result,
|
||||
logger.get())) {
|
||||
operands->pop_front();
|
||||
swap(operands->front(), merge_result);
|
||||
} else {
|
||||
// Stack them because the user's operator can't merge them associatively
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
case kTypeLogData:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// exit loop if user key does not match
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// No change to value, since we have not yet found a Put/Delete
|
||||
|
||||
if (merge_in_progress) {
|
||||
*s = Status::MergeInProgress("");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool MemTable::Update(SequenceNumber seq, ValueType type,
|
||||
const Slice& key,
|
||||
const Slice& value) {
|
||||
LookupKey lkey(key, seq);
|
||||
Slice memkey = lkey.memtable_key();
|
||||
|
||||
std::shared_ptr<MemTableRep::Iterator> iter(
|
||||
table_.get()->GetIterator(lkey.user_key()));
|
||||
iter->Seek(memkey.data());
|
||||
|
||||
if (iter->Valid()) {
|
||||
// entry format is:
|
||||
// klength varint32
|
||||
// userkey char[klength-8]
|
||||
// tag uint64
|
||||
// vlength varint32
|
||||
// value char[vlength]
|
||||
// Check that it belongs to same user key. We do not check the
|
||||
// sequence number since the Seek() call above should have skipped
|
||||
// all entries with overly large sequence numbers.
|
||||
const char* entry = iter->key();
|
||||
uint32_t key_length;
|
||||
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
|
||||
if (comparator_.comparator.user_comparator()->Compare(
|
||||
Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
|
||||
// Correct user key
|
||||
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
|
||||
switch (static_cast<ValueType>(tag & 0xff)) {
|
||||
case kTypeValue: {
|
||||
uint32_t vlength;
|
||||
GetVarint32Ptr(key_ptr + key_length,
|
||||
key_ptr + key_length+5, &vlength);
|
||||
// Update value, if newValue size <= curValue size
|
||||
if (value.size() <= vlength) {
|
||||
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
|
||||
value.size());
|
||||
WriteLock wl(GetLock(lkey.user_key()));
|
||||
memcpy(p, value.data(), value.size());
|
||||
assert(
|
||||
(p + value.size()) - entry ==
|
||||
(unsigned) (VarintLength(key_length) +
|
||||
key_length +
|
||||
VarintLength(value.size()) +
|
||||
value.size())
|
||||
);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
default:
|
||||
// If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
|
||||
// then we probably don't have enough space to update in-place
|
||||
// Maybe do something later
|
||||
// Return false, and do normal Add()
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Key doesn't exist
|
||||
return false;
|
||||
}
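A plausible caller-side pattern for Update() (an assumption about intended use, not code from this change): attempt the in-place update first and fall back to a regular Add() when it returns false, e.g. because the new value is longer than the existing slot or the newest entry for the key is not a kTypeValue.

#include "db/memtable.h"

// Hypothetical helper, for illustration only.
void PutPossiblyInPlace(rocksdb::MemTable* mem, rocksdb::SequenceNumber seq,
                        const rocksdb::Slice& key, const rocksdb::Slice& value) {
  if (!mem->Update(seq, rocksdb::kTypeValue, key, value)) {
    mem->Add(seq, rocksdb::kTypeValue, key, value);
  }
}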
|
||||
} // namespace rocksdb
|
172
db/memtable.h
Normal file
@ -0,0 +1,172 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <deque>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/skiplist.h"
|
||||
#include "db/version_set.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "util/arena_impl.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Mutex;
|
||||
class MemTableIterator;
|
||||
|
||||
class MemTable {
|
||||
public:
|
||||
struct KeyComparator : public MemTableRep::KeyComparator {
|
||||
const InternalKeyComparator comparator;
|
||||
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
|
||||
virtual int operator()(const char* a, const char* b) const;
|
||||
};
|
||||
|
||||
// MemTables are reference counted. The initial reference count
|
||||
// is zero and the caller must call Ref() at least once.
|
||||
explicit MemTable(
|
||||
const InternalKeyComparator& comparator,
|
||||
std::shared_ptr<MemTableRepFactory> table_factory,
|
||||
int numlevel = 7,
|
||||
const Options& options = Options());
|
||||
|
||||
// Increase reference count.
|
||||
void Ref() { ++refs_; }
|
||||
|
||||
// Drop reference count. Delete if no more references exist.
|
||||
void Unref() {
|
||||
--refs_;
|
||||
assert(refs_ >= 0);
|
||||
if (refs_ <= 0) {
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns an estimate of the number of bytes of data in use by this
|
||||
// data structure.
|
||||
//
|
||||
// REQUIRES: external synchronization to prevent simultaneous
|
||||
// operations on the same MemTable.
|
||||
size_t ApproximateMemoryUsage();
|
||||
|
||||
// Return an iterator that yields the contents of the memtable.
|
||||
//
|
||||
// The caller must ensure that the underlying MemTable remains live
|
||||
// while the returned iterator is live. The keys returned by this
|
||||
// iterator are internal keys encoded by AppendInternalKey in the
|
||||
// db/dbformat.{h,cc} module.
|
||||
//
|
||||
// If options.prefix is supplied, it is passed to the underlying MemTableRep
|
||||
// as a hint that the iterator only needs to support access to keys with that
|
||||
// specific prefix.
|
||||
// If options.prefix is not supplied and options.prefix_seek is set, the
|
||||
// iterator is not bound to a specific prefix. However, the semantics of
|
||||
// Seek is changed - the result might only include keys with the same prefix
|
||||
// as the seek-key.
|
||||
Iterator* NewIterator(const ReadOptions& options = ReadOptions());
|
||||
|
||||
// Add an entry into memtable that maps key to value at the
|
||||
// specified sequence number and with the specified type.
|
||||
// Typically value will be empty if type==kTypeDeletion.
|
||||
void Add(SequenceNumber seq, ValueType type,
|
||||
const Slice& key,
|
||||
const Slice& value);
|
||||
|
||||
// If memtable contains a value for key, store it in *value and return true.
|
||||
// If memtable contains a deletion for key, store a NotFound() error
|
||||
// in *status and return true.
|
||||
// If memtable contains Merge operation as the most recent entry for a key,
|
||||
// and the merge process does not stop (not reaching a value or delete),
|
||||
// prepend the current merge operand to *operands.
|
||||
// store MergeInProgress in s, and return false.
|
||||
// Else, return false.
|
||||
bool Get(const LookupKey& key, std::string* value, Status* s,
|
||||
std::deque<std::string>* operands, const Options& options);
|
||||
|
||||
// Update the value and return status ok,
|
||||
// if key exists in current memtable
|
||||
// if sizeof(new_value) <= sizeof(old_value) &&
|
||||
// old_value for that key is a put i.e. kTypeValue
|
||||
// else return false, and status - NotUpdatable()
|
||||
// else return false, and status - NotFound()
|
||||
bool Update(SequenceNumber seq, ValueType type,
|
||||
const Slice& key,
|
||||
const Slice& value);
|
||||
|
||||
// Returns the edits area that is needed for flushing the memtable
|
||||
VersionEdit* GetEdits() { return &edit_; }
|
||||
|
||||
// Returns the sequence number of the first element that was inserted
|
||||
// into the memtable
|
||||
SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
|
||||
|
||||
// Returns the next active logfile number when this memtable is about to
|
||||
// be flushed to storage
|
||||
uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
|
||||
|
||||
// Sets the next active logfile number when this memtable is about to
|
||||
// be flushed to storage
|
||||
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
|
||||
|
||||
// Returns the logfile number that can be safely deleted when this
|
||||
// memstore is flushed to storage
|
||||
uint64_t GetLogNumber() { return mem_logfile_number_; }
|
||||
|
||||
// Sets the logfile number that can be safely deleted when this
|
||||
// memstore is flushed to storage
|
||||
void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; }
|
||||
|
||||
// Notify the underlying storage that no more items will be added
|
||||
void MarkImmutable() { table_->MarkReadOnly(); }
|
||||
|
||||
private:
|
||||
~MemTable(); // Private since only Unref() should be used to delete it
|
||||
friend class MemTableIterator;
|
||||
friend class MemTableBackwardIterator;
|
||||
friend class MemTableList;
|
||||
|
||||
KeyComparator comparator_;
|
||||
int refs_;
|
||||
ArenaImpl arena_impl_;
|
||||
shared_ptr<MemTableRep> table_;
|
||||
|
||||
// These are used to manage memtable flushes to storage
|
||||
bool flush_in_progress_; // started the flush
|
||||
bool flush_completed_; // finished the flush
|
||||
uint64_t file_number_; // filled up after flush is complete
|
||||
|
||||
// The updates to be applied to the transaction log when this
|
||||
// memtable is flushed to storage.
|
||||
VersionEdit edit_;
|
||||
|
||||
// The sequence number of the kv that was inserted first
|
||||
SequenceNumber first_seqno_;
|
||||
|
||||
// The log files earlier than this number can be deleted.
|
||||
uint64_t mem_next_logfile_number_;
|
||||
|
||||
// The log file that backs this memtable (to be deleted when
|
||||
// memtable flush is done)
|
||||
uint64_t mem_logfile_number_;
|
||||
|
||||
// rw locks for inplace updates
|
||||
std::vector<port::RWMutex> locks_;
|
||||
|
||||
// No copying allowed
|
||||
MemTable(const MemTable&);
|
||||
void operator=(const MemTable&);
|
||||
|
||||
// Get the lock associated for the key
|
||||
port::RWMutex* GetLock(const Slice& key);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
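A minimal sketch of how a caller interprets the Get() contract documented above; the helper name is hypothetical, and the LookupKey and Options come from whatever the caller already holds.

#include <deque>
#include <string>

#include "db/memtable.h"

// true  + s->ok()        : a value was found and copied into *value
// true  + s->IsNotFound(): the key was deleted in this memtable
// false                  : not resolved here; consult older memtables / files
//                          (s may carry MergeInProgress with operands filled)
bool LookupInMemTable(rocksdb::MemTable* mem, const rocksdb::LookupKey& lkey,
                      const rocksdb::Options& options, std::string* value,
                      rocksdb::Status* s) {
  std::deque<std::string> merge_operands;  // caller allocates the operand list
  return mem->Get(lkey, value, s, &merge_operands, options);
}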
|
215
db/memtablelist.cc
Normal file
@ -0,0 +1,215 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include "db/memtablelist.h"
|
||||
|
||||
#include <string>
|
||||
#include "rocksdb/db.h"
|
||||
#include "db/memtable.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "util/coding.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class InternalKeyComparator;
|
||||
class Mutex;
|
||||
class MemTableListIterator;
|
||||
class VersionSet;
|
||||
|
||||
using std::list;
|
||||
|
||||
// Increase reference count on all underlying memtables
|
||||
void MemTableList::RefAll() {
|
||||
for (auto &memtable : memlist_) {
|
||||
memtable->Ref();
|
||||
}
|
||||
}
|
||||
|
||||
// Drop reference count on all underlying memtables
|
||||
void MemTableList::UnrefAll() {
|
||||
for (auto &memtable : memlist_) {
|
||||
memtable->Unref();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the total number of memtables in the list
|
||||
int MemTableList::size() {
|
||||
assert(num_flush_not_started_ <= size_);
|
||||
return size_;
|
||||
}
|
||||
|
||||
// Returns true if there is at least one memtable on which flush has
|
||||
// not yet started.
|
||||
bool MemTableList::IsFlushPending(int min_write_buffer_number_to_merge) {
|
||||
if ((flush_requested_ && num_flush_not_started_ >= 1) ||
|
||||
(num_flush_not_started_ >= min_write_buffer_number_to_merge)) {
|
||||
assert(imm_flush_needed.NoBarrier_Load() != nullptr);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns the memtables that need to be flushed.
|
||||
void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
|
||||
for (auto it = memlist_.rbegin(); it != memlist_.rend(); it++) {
|
||||
MemTable* m = *it;
|
||||
if (!m->flush_in_progress_) {
|
||||
assert(!m->flush_completed_);
|
||||
num_flush_not_started_--;
|
||||
if (num_flush_not_started_ == 0) {
|
||||
imm_flush_needed.Release_Store(nullptr);
|
||||
}
|
||||
m->flush_in_progress_ = true; // flushing will start very soon
|
||||
ret->push_back(m);
|
||||
}
|
||||
}
|
||||
flush_requested_ = false; // start-flush request is complete
|
||||
}
|
||||
|
||||
// Record a successful flush in the manifest file
|
||||
Status MemTableList::InstallMemtableFlushResults(
|
||||
const std::vector<MemTable*> &mems,
|
||||
VersionSet* vset, Status flushStatus,
|
||||
port::Mutex* mu, Logger* info_log,
|
||||
uint64_t file_number,
|
||||
std::set<uint64_t>& pending_outputs) {
|
||||
mu->AssertHeld();
|
||||
|
||||
// If the flush was not successful, then just reset state.
|
||||
// Maybe a succeeding attempt to flush will be successful.
|
||||
if (!flushStatus.ok()) {
|
||||
for (MemTable* m : mems) {
|
||||
assert(m->flush_in_progress_);
|
||||
assert(m->file_number_ == 0);
|
||||
|
||||
m->flush_in_progress_ = false;
|
||||
m->flush_completed_ = false;
|
||||
m->edit_.Clear();
|
||||
num_flush_not_started_++;
|
||||
imm_flush_needed.Release_Store((void *)1);
|
||||
pending_outputs.erase(file_number);
|
||||
}
|
||||
return flushStatus;
|
||||
}
|
||||
|
||||
// flush was successful
|
||||
for (size_t i = 0; i < mems.size(); ++i) {
|
||||
// All the edits are associated with the first memtable of this batch.
|
||||
assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
|
||||
|
||||
mems[i]->flush_completed_ = true;
|
||||
mems[i]->file_number_ = file_number;
|
||||
}
|
||||
|
||||
// if some other thread is already committing, then return
|
||||
Status s;
|
||||
if (commit_in_progress_) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// Only a single thread can be executing this piece of code
|
||||
commit_in_progress_ = true;
|
||||
|
||||
// scan all memtables from the earliest, and commit those
|
||||
// (in that order) that have finished flushing. Memtables
|
||||
// are always committed in the order that they were created.
|
||||
while (!memlist_.empty() && s.ok()) {
|
||||
MemTable* m = memlist_.back(); // get the last element
|
||||
if (!m->flush_completed_) {
|
||||
break;
|
||||
}
|
||||
|
||||
Log(info_log,
|
||||
"Level-0 commit table #%lu started",
|
||||
(unsigned long)m->file_number_);
|
||||
|
||||
// this can release and reacquire the mutex.
|
||||
s = vset->LogAndApply(&m->edit_, mu);
|
||||
|
||||
// All the later memtables that have the same filenum
|
||||
// are part of the same batch. They can be committed now.
|
||||
uint64_t mem_id = 1; // how many memtables have been flushed.
|
||||
do {
|
||||
if (s.ok()) { // commit new state
|
||||
Log(info_log,
|
||||
"Level-0 commit table #%lu: memtable #%lu done",
|
||||
(unsigned long)m->file_number_,
|
||||
(unsigned long)mem_id);
|
||||
memlist_.remove(m);
|
||||
assert(m->file_number_ > 0);
|
||||
|
||||
// pending_outputs can be cleared only after the newly created file
|
||||
// has been written to a committed version so that other concurrently
|
||||
// executing compaction threads do not mistakenly assume that this
|
||||
// file is not live.
|
||||
pending_outputs.erase(m->file_number_);
|
||||
m->Unref();
|
||||
size_--;
|
||||
} else {
|
||||
// commit failed. Set up state so that we can flush again.
|
||||
Log(info_log,
|
||||
"Level-0 commit table #%lu: memtable #%lu failed",
|
||||
(unsigned long)m->file_number_,
|
||||
(unsigned long)mem_id);
|
||||
m->flush_completed_ = false;
|
||||
m->flush_in_progress_ = false;
|
||||
m->edit_.Clear();
|
||||
num_flush_not_started_++;
|
||||
pending_outputs.erase(m->file_number_);
|
||||
m->file_number_ = 0;
|
||||
imm_flush_needed.Release_Store((void *)1);
|
||||
s = Status::IOError("Unable to commit flushed memtable");
|
||||
}
|
||||
++mem_id;
|
||||
} while (!memlist_.empty() && (m = memlist_.back()) &&
|
||||
m->file_number_ == file_number);
|
||||
}
|
||||
commit_in_progress_ = false;
|
||||
return s;
|
||||
}
|
||||
|
||||
// New memtables are inserted at the front of the list.
|
||||
void MemTableList::Add(MemTable* m) {
|
||||
assert(size_ >= num_flush_not_started_);
|
||||
size_++;
|
||||
memlist_.push_front(m);
|
||||
m->MarkImmutable();
|
||||
num_flush_not_started_++;
|
||||
if (num_flush_not_started_ == 1) {
|
||||
imm_flush_needed.Release_Store((void *)1);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns an estimate of the number of bytes of data in use.
|
||||
size_t MemTableList::ApproximateMemoryUsage() {
|
||||
size_t size = 0;
|
||||
for (auto &memtable : memlist_) {
|
||||
size += memtable->ApproximateMemoryUsage();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
// Search all the memtables starting from the most recent one.
|
||||
// Return the most recent value found, if any.
|
||||
// Operands stores the list of merge operations to apply, so far.
|
||||
bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s,
|
||||
std::deque<std::string>* operands,
|
||||
const Options& options) {
|
||||
for (auto &memtable : memlist_) {
|
||||
if (memtable->Get(key, value, s, operands, options)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void MemTableList::GetMemTables(std::vector<MemTable*>* output) {
|
||||
for (auto &memtable : memlist_) {
|
||||
output->push_back(memtable);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
105
db/memtablelist.h
Normal file
@ -0,0 +1,105 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <deque>
|
||||
#include "rocksdb/db.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/skiplist.h"
|
||||
#include "memtable.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class InternalKeyComparator;
|
||||
class Mutex;
|
||||
class MemTableListIterator;
|
||||
|
||||
//
|
||||
// This class stores references to all the immutable memtables.
|
||||
// The memtables are flushed to L0 as soon as possible and in
|
||||
// any order. If there is more than one immutable memtable, their
|
||||
// flushes can occur concurrently. However, they are 'committed'
|
||||
// to the manifest in FIFO order to maintain correctness and
|
||||
// recoverability from a crash.
|
||||
//
|
||||
class MemTableList {
|
||||
public:
|
||||
// A list of memtables.
|
||||
MemTableList() : size_(0), num_flush_not_started_(0),
|
||||
commit_in_progress_(false),
|
||||
flush_requested_(false) {
|
||||
imm_flush_needed.Release_Store(nullptr);
|
||||
}
|
||||
~MemTableList() {};
|
||||
|
||||
// so that background threads can detect a non-nullptr pointer to
|
||||
// determine whether there is anything more to start flushing.
|
||||
port::AtomicPointer imm_flush_needed;
|
||||
|
||||
// Increase reference count on all underlying memtables
|
||||
void RefAll();
|
||||
|
||||
// Drop reference count on all underlying memtables
|
||||
void UnrefAll();
|
||||
|
||||
// Returns the total number of memtables in the list
|
||||
int size();
|
||||
|
||||
// Returns true if there is at least one memtable on which flush has
|
||||
// not yet started.
|
||||
bool IsFlushPending(int min_write_buffer_number_to_merge);
|
||||
|
||||
// Returns the earliest memtables that need to be flushed. The returned
|
||||
// memtables are guaranteed to be in the ascending order of created time.
|
||||
void PickMemtablesToFlush(std::vector<MemTable*>* mems);
|
||||
|
||||
// Commit a successful flush in the manifest file
|
||||
Status InstallMemtableFlushResults(const std::vector<MemTable*> &m,
|
||||
VersionSet* vset, Status flushStatus,
|
||||
port::Mutex* mu, Logger* info_log,
|
||||
uint64_t file_number,
|
||||
std::set<uint64_t>& pending_outputs);
|
||||
|
||||
// New memtables are inserted at the front of the list.
|
||||
// Takes ownership of the reference held on *m by the caller of Add().
|
||||
void Add(MemTable* m);
|
||||
|
||||
// Returns an estimate of the number of bytes of data in use.
|
||||
size_t ApproximateMemoryUsage();
|
||||
|
||||
// Search all the memtables starting from the most recent one.
|
||||
// Return the most recent value found, if any.
|
||||
bool Get(const LookupKey& key, std::string* value, Status* s,
|
||||
std::deque<std::string>* operands, const Options& options);
|
||||
|
||||
// Returns the list of underlying memtables.
|
||||
void GetMemTables(std::vector<MemTable*>* list);
|
||||
|
||||
// Request a flush of all existing memtables to storage
|
||||
void FlushRequested() { flush_requested_ = true; }
|
||||
|
||||
// Copying allowed
|
||||
// MemTableList(const MemTableList&);
|
||||
// void operator=(const MemTableList&);
|
||||
|
||||
private:
|
||||
std::list<MemTable*> memlist_;
|
||||
int size_;
|
||||
|
||||
// the number of elements that still need flushing
|
||||
int num_flush_not_started_;
|
||||
|
||||
// committing in progress
|
||||
bool commit_in_progress_;
|
||||
|
||||
// Requested a flush of all memtables to storage
|
||||
bool flush_requested_;
|
||||
|
||||
};
|
||||
|
||||
} // namespace rocksdb
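For orientation, a hypothetical flush-driver skeleton showing the order in which the methods above are meant to be used (Pick, write a level-0 file, Install); the real driver lives elsewhere in the tree and also manages file numbers and mutex release around the file write, so treat this purely as a sketch under those assumptions.

#include <stdint.h>

#include <set>
#include <vector>

#include "db/memtablelist.h"
#include "db/version_set.h"
#include "rocksdb/env.h"

// Hypothetical skeleton; flush_status stands in for the result of actually
// writing the picked memtables out to a level-0 table file.
rocksdb::Status FlushOneBatch(rocksdb::MemTableList* imm,
                              rocksdb::VersionSet* versions,
                              rocksdb::port::Mutex* mu,
                              rocksdb::Logger* info_log,
                              uint64_t file_number,
                              std::set<uint64_t>* pending_outputs,
                              rocksdb::Status flush_status) {
  mu->AssertHeld();
  std::vector<rocksdb::MemTable*> mems;
  imm->PickMemtablesToFlush(&mems);  // marks them flush_in_progress_
  if (mems.empty()) {
    return rocksdb::Status::OK();
  }
  // ... build the level-0 file here, producing flush_status ...
  return imm->InstallMemtableFlushResults(mems, versions, flush_status, mu,
                                          info_log, file_number,
                                          *pending_outputs);
}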
|
197
db/merge_helper.cc
Normal file
@ -0,0 +1,197 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include "merge_helper.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// PRE: iter points to the first merge type entry
|
||||
// POST: iter points to the first entry beyond the merge process (or the end)
|
||||
// keys_, operands_ are updated to reflect the merge result.
|
||||
// keys_ stores the list of keys encountered while merging.
|
||||
// operands_ stores the list of merge operands encountered while merging.
|
||||
// keys_[i] corresponds to operands_[i] for each i.
|
||||
void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
|
||||
bool at_bottom, shared_ptr<Statistics> stats) {
|
||||
// Get a copy of the internal key, before it's invalidated by iter->Next()
|
||||
// Also maintain the list of merge operands seen.
|
||||
keys_.clear();
|
||||
operands_.clear();
|
||||
keys_.push_front(iter->key().ToString());
|
||||
operands_.push_front(iter->value().ToString());
|
||||
|
||||
success_ = false; // Will become true if we hit Put/Delete or bottom
|
||||
|
||||
// We need to parse the internal key again as the parsed key is
|
||||
// backed by the internal key!
|
||||
// Assume no internal key corruption as it has been successfully parsed
|
||||
// by the caller.
|
||||
// Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
|
||||
ParsedInternalKey orig_ikey;
|
||||
ParseInternalKey(keys_.back(), &orig_ikey);
|
||||
|
||||
bool hit_the_next_user_key = false;
|
||||
ParsedInternalKey ikey;
|
||||
std::string merge_result; // Temporary value for merge results
|
||||
for (iter->Next(); iter->Valid(); iter->Next()) {
|
||||
assert(operands_.size() >= 1); // Should be invariants!
|
||||
assert(keys_.size() == operands_.size());
|
||||
|
||||
if (!ParseInternalKey(iter->key(), &ikey)) {
|
||||
// stop at corrupted key
|
||||
if (assert_valid_internal_key_) {
|
||||
assert(!"corrupted internal key is not expected");
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
|
||||
// hit a different user key, stop right here
|
||||
hit_the_next_user_key = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (stop_before && ikey.sequence <= stop_before) {
|
||||
// hit an entry that's visible by the previous snapshot, can't touch that
|
||||
break;
|
||||
}
|
||||
|
||||
// At this point we are guaranteed that we need to process this key.
|
||||
|
||||
if (kTypeDeletion == ikey.type) {
|
||||
// hit a delete
|
||||
// => merge nullptr with operands_
|
||||
// => store result in operands_.back() (and update keys_.back())
|
||||
// => change the entry type to kTypeValue for keys_.back()
|
||||
// We are done! Return a success if the merge passes.
|
||||
success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
|
||||
operands_, &merge_result,
|
||||
logger_);
|
||||
|
||||
// We store the result in keys_.back() and operands_.back()
|
||||
// if nothing went wrong (i.e.: no operand corruption on disk)
|
||||
if (success_) {
|
||||
std::string& key = keys_.back(); // The original key encountered
|
||||
orig_ikey.type = kTypeValue;
|
||||
UpdateInternalKey(&key[0], key.size(),
|
||||
orig_ikey.sequence, orig_ikey.type);
|
||||
swap(operands_.back(), merge_result);
|
||||
} else {
|
||||
RecordTick(stats, NUMBER_MERGE_FAILURES);
|
||||
}
|
||||
|
||||
// move iter to the next entry (before doing anything else)
|
||||
iter->Next();
|
||||
return;
|
||||
}
|
||||
|
||||
if (kTypeValue == ikey.type) {
|
||||
// hit a put
|
||||
// => merge the put value with operands_
|
||||
// => store result in operands_.back() (and update keys_.back())
|
||||
// => change the entry type to kTypeValue for keys_.back()
|
||||
// We are done! Success!
|
||||
const Slice value = iter->value();
|
||||
success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
|
||||
operands_, &merge_result,
|
||||
logger_);
|
||||
|
||||
// We store the result in keys_.back() and operands_.back()
|
||||
// if nothing went wrong (i.e.: no operand corruption on disk)
|
||||
if (success_) {
|
||||
std::string& key = keys_.back(); // The original key encountered
|
||||
orig_ikey.type = kTypeValue;
|
||||
UpdateInternalKey(&key[0], key.size(),
|
||||
orig_ikey.sequence, orig_ikey.type);
|
||||
swap(operands_.back(), merge_result);
|
||||
} else {
|
||||
RecordTick(stats, NUMBER_MERGE_FAILURES);
|
||||
}
|
||||
|
||||
// move iter to the next entry
|
||||
iter->Next();
|
||||
return;
|
||||
}
|
||||
|
||||
if (kTypeMerge == ikey.type) {
|
||||
// hit a merge
|
||||
// => merge the operand into the front of the operands_ list
|
||||
// => use the user's associative merge function to determine how.
|
||||
// => then continue because we haven't yet seen a Put/Delete.
|
||||
assert(!operands_.empty()); // Should have at least one element in it
|
||||
|
||||
keys_.push_front(iter->key().ToString());
|
||||
operands_.push_front(iter->value().ToString());
|
||||
while (operands_.size() >= 2) {
|
||||
// Returns false when the merge_operator can no longer process it
|
||||
if (user_merge_operator_->PartialMerge(ikey.user_key,
|
||||
Slice(operands_[0]),
|
||||
Slice(operands_[1]),
|
||||
&merge_result,
|
||||
logger_)) {
|
||||
// Merging of operands (associative merge) was successful.
|
||||
// Replace these frontmost two operands with the merge result
|
||||
keys_.pop_front();
|
||||
operands_.pop_front();
|
||||
swap(operands_.front(), merge_result);
|
||||
} else {
|
||||
// Merging of operands (associative merge) returned false.
|
||||
// The user merge_operator does not know how to merge these operands.
|
||||
// So we just stack them up until we find a Put/Delete or end of key.
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// We are sure we have seen this key's entire history if we are at the
|
||||
// last level and exhausted all internal keys of this user key.
|
||||
// NOTE: !iter->Valid() does not necessarily mean we hit the
|
||||
// beginning of a user key, as versions of a user key might be
|
||||
// split into multiple files (even files on the same level)
|
||||
// and some files might not be included in the compaction/merge.
|
||||
//
|
||||
// There are also cases where we have seen the root of history of this
|
||||
// key without being sure of it. Then, we simply miss the opportunity
|
||||
// to combine the keys. Since VersionSet::SetupOtherInputs() always makes
|
||||
// sure that all merge-operands on the same level get compacted together,
|
||||
// this will simply lead to these merge operands moving to the next level.
|
||||
//
|
||||
// So, we only perform the following logic (to merge all operands together
|
||||
// without a Put/Delete) if we are certain that we have seen the end of key.
|
||||
bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
|
||||
if (surely_seen_the_beginning) {
|
||||
// do a final merge with nullptr as the existing value and say
|
||||
// bye to the merge type (it's now converted to a Put)
|
||||
assert(kTypeMerge == orig_ikey.type);
|
||||
assert(operands_.size() >= 1);
|
||||
assert(operands_.size() == keys_.size());
|
||||
success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
|
||||
operands_, &merge_result,
|
||||
logger_);
|
||||
|
||||
if (success_) {
|
||||
std::string& key = keys_.back(); // The original key encountered
|
||||
orig_ikey.type = kTypeValue;
|
||||
UpdateInternalKey(&key[0], key.size(),
|
||||
orig_ikey.sequence, orig_ikey.type);
|
||||
|
||||
// The final value() is always stored in operands_.back()
|
||||
swap(operands_.back(), merge_result);
|
||||
} else {
|
||||
RecordTick(stats, NUMBER_MERGE_FAILURES);
|
||||
// Do nothing if not success_. Leave keys() and operands() as they are.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
102
db/merge_helper.h
Normal file
102
db/merge_helper.h
Normal file
@ -0,0 +1,102 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#ifndef MERGE_HELPER_H
|
||||
#define MERGE_HELPER_H
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/statistics.h"
|
||||
#include <string>
|
||||
#include <deque>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Comparator;
|
||||
class Iterator;
|
||||
class Logger;
|
||||
class MergeOperator;
|
||||
|
||||
class MergeHelper {
|
||||
public:
|
||||
MergeHelper(const Comparator* user_comparator,
|
||||
const MergeOperator* user_merge_operator,
|
||||
Logger* logger,
|
||||
bool assert_valid_internal_key)
|
||||
: user_comparator_(user_comparator),
|
||||
user_merge_operator_(user_merge_operator),
|
||||
logger_(logger),
|
||||
assert_valid_internal_key_(assert_valid_internal_key),
|
||||
keys_(),
|
||||
operands_(),
|
||||
success_(false) {}
|
||||
|
||||
// Merge entries until we hit
|
||||
// - a corrupted key
|
||||
// - a Put/Delete,
|
||||
// - a different user key,
|
||||
// - a specific sequence number (snapshot boundary),
|
||||
// or - the end of iteration
|
||||
// iter: (IN) points to the first merge type entry
|
||||
// (OUT) points to the first entry not included in the merge process
|
||||
// stop_before: (IN) a sequence number that merge should not cross.
|
||||
// 0 means no restriction
|
||||
// at_bottom: (IN) true if the iterator covers the bottom level, which means
|
||||
// we could reach the start of the history of this user key.
|
||||
void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
|
||||
bool at_bottom = false, shared_ptr<Statistics> stats = nullptr);
|
||||
|
||||
// Query the merge result
|
||||
// These are valid until the next MergeUntil call
|
||||
// If the merging was successful:
|
||||
// - IsSuccess() will be true
|
||||
// - key() will have the latest sequence number of the merges.
|
||||
// The type will be Put or Merge. See IMPORTANT 1 note, below.
|
||||
// - value() will be the result of merging all the operands together
|
||||
// - The user should ignore keys() and values().
|
||||
//
|
||||
// IMPORTANT 1: the key type could change after the MergeUntil call.
|
||||
// Put/Delete + Merge + ... + Merge => Put
|
||||
// Merge + ... + Merge => Merge
|
||||
//
|
||||
// If the merge operator is not associative, and if a Put/Delete is not found
|
||||
// then the merging will be unsuccessful. In this case:
|
||||
// - IsSuccess() will be false
|
||||
// - keys() contains the list of internal keys seen in order of iteration.
|
||||
// - values() contains the list of values (merges) seen in the same order.
|
||||
// values() is parallel to keys() so that the first entry in
|
||||
// keys() is the key associated with the first entry in values()
|
||||
// and so on. These lists will be the same length.
|
||||
// All of these pairs will be merges over the same user key.
|
||||
// See IMPORTANT 2 note below.
|
||||
// - The user should ignore key() and value().
|
||||
//
|
||||
// IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
|
||||
// So keys().back() was the first key seen by iterator.
|
||||
// TODO: Re-style this comment to be like the first one
|
||||
bool IsSuccess() { return success_; }
|
||||
Slice key() { assert(success_); return Slice(keys_.back()); }
|
||||
Slice value() { assert(success_); return Slice(operands_.back()); }
|
||||
const std::deque<std::string>& keys() { assert(!success_); return keys_; }
|
||||
const std::deque<std::string>& values() {
|
||||
assert(!success_); return operands_;
|
||||
}
|
||||
|
||||
private:
|
||||
const Comparator* user_comparator_;
|
||||
const MergeOperator* user_merge_operator_;
|
||||
Logger* logger_;
|
||||
bool assert_valid_internal_key_; // enforce no internal key corruption?
|
||||
|
||||
// the scratch area that holds the result of MergeUntil
|
||||
// valid up to the next MergeUntil call
|
||||
std::deque<std::string> keys_; // Keeps track of the sequence of keys seen
|
||||
std::deque<std::string> operands_; // Parallel with keys_; stores the values
|
||||
bool success_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif
|
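As a reading aid, a rough sketch of the intended calling pattern for MergeHelper follows. This is not the actual compaction code; the iterator, comparator, merge operator, logger, snapshot boundary and stats handle are all assumed to be supplied by the caller.

// Sketch only; not the actual compaction loop. `iter` is assumed to be
// positioned on a kTypeMerge entry for the key being compacted.
void MergeOneKey(Iterator* iter, const Comparator* user_comparator,
                 const MergeOperator* merge_operator, Logger* info_log,
                 SequenceNumber prev_snapshot, bool bottommost_level,
                 shared_ptr<Statistics> stats) {
  MergeHelper merge(user_comparator, merge_operator, info_log,
                    false /* assert_valid_internal_key */);
  merge.MergeUntil(iter, prev_snapshot /* stop_before */, bottommost_level, stats);

  if (merge.IsSuccess()) {
    // Fully merged into a single entry; note the type may have become a Put.
    Slice merged_key = merge.key();
    Slice merged_value = merge.value();
    // ... emit merged_key / merged_value to the compaction output ...
  } else {
    // The operands could not be collapsed; emit them unchanged.
    const std::deque<std::string>& keys = merge.keys();
    const std::deque<std::string>& values = merge.values();
    for (size_t i = 0; i < keys.size(); i++) {
      // keys[i] pairs with values[i]; keys.back() was the first entry the
      // iterator saw (see IMPORTANT 2 above).
    }
  }
}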
53
db/merge_operator.cc
Normal file
53
db/merge_operator.cc
Normal file
@ -0,0 +1,53 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
/**
|
||||
* Back-end implementation details specific to the Merge Operator.
|
||||
*/
|
||||
|
||||
#include "rocksdb/merge_operator.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Given a "real" merge from the library, call the user's
|
||||
// associative merge function one-by-one on each of the operands.
|
||||
// NOTE: It is assumed that the client's merge-operator will handle any errors.
|
||||
bool AssociativeMergeOperator::FullMerge(
|
||||
const Slice& key,
|
||||
const Slice* existing_value,
|
||||
const std::deque<std::string>& operand_list,
|
||||
std::string* new_value,
|
||||
Logger* logger) const {
|
||||
|
||||
// Simply loop through the operands
|
||||
Slice temp_existing;
|
||||
std::string temp_value;
|
||||
for (const auto& operand : operand_list) {
|
||||
Slice value(operand);
|
||||
if (!Merge(key, existing_value, value, &temp_value, logger)) {
|
||||
return false;
|
||||
}
|
||||
swap(temp_value, *new_value);
|
||||
temp_existing = Slice(*new_value);
|
||||
existing_value = &temp_existing;
|
||||
}
|
||||
|
||||
// The result will be in *new_value. All merges succeeded.
|
||||
return true;
|
||||
}
|
||||
|
||||
// Call the user defined simple merge on the operands;
|
||||
// NOTE: It is assumed that the client's merge-operator will handle any errors.
|
||||
bool AssociativeMergeOperator::PartialMerge(
|
||||
const Slice& key,
|
||||
const Slice& left_operand,
|
||||
const Slice& right_operand,
|
||||
std::string* new_value,
|
||||
Logger* logger) const {
|
||||
|
||||
return Merge(key, &left_operand, right_operand, new_value, logger);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
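To make the contract above concrete, here is a sketch of a user-defined associative operator in the spirit of the uint64-add operator exercised by db/merge_test.cc. The Merge() signature is inferred from the FullMerge/PartialMerge forwarding calls above; the util/coding.h include and the treat-missing-value-as-zero policy are assumptions.

// Sketch of a custom AssociativeMergeOperator: adds little-endian uint64 operands.
#include <stdint.h>
#include <string>
#include "rocksdb/merge_operator.h"
#include "rocksdb/slice.h"
#include "util/coding.h"  // assumed location of EncodeFixed64/DecodeFixed64

namespace {

class UInt64AddOperator : public rocksdb::AssociativeMergeOperator {
 public:
  virtual bool Merge(const rocksdb::Slice& key,
                     const rocksdb::Slice* existing_value,
                     const rocksdb::Slice& value,
                     std::string* new_value,
                     rocksdb::Logger* logger) const override {
    uint64_t existing = 0;  // assumption: a missing existing value counts as 0
    if (existing_value != nullptr) {
      if (existing_value->size() != sizeof(uint64_t)) {
        return false;  // corrupted existing value; signal merge failure
      }
      existing = rocksdb::DecodeFixed64(existing_value->data());
    }
    if (value.size() != sizeof(uint64_t)) {
      return false;  // corrupted operand
    }
    uint64_t operand = rocksdb::DecodeFixed64(value.data());
    char buf[sizeof(uint64_t)];
    rocksdb::EncodeFixed64(buf, existing + operand);
    new_value->assign(buf, sizeof(buf));
    return true;
  }

  virtual const char* Name() const override { return "UInt64AddOperator"; }
};

}  // namespace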
275
db/merge_test.cc
Normal file
275
db/merge_test.cc
Normal file
@ -0,0 +1,275 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include <assert.h>
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
#include "util/testharness.h"
|
||||
#include "utilities/utility_db.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace rocksdb;
|
||||
|
||||
|
||||
std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false) {
|
||||
DB* db;
|
||||
StackableDB* sdb;
|
||||
Options options;
|
||||
options.create_if_missing = true;
|
||||
options.merge_operator = MergeOperators::CreateUInt64AddOperator();
|
||||
Status s;
|
||||
DestroyDB(dbname, Options());
|
||||
if (ttl) {
|
||||
cout << "Opening database with TTL\n";
|
||||
s = UtilityDB::OpenTtlDB(options, dbname, &sdb);
|
||||
db = sdb;
|
||||
} else {
|
||||
s = DB::Open(options, dbname, &db);
|
||||
}
|
||||
if (!s.ok()) {
|
||||
cerr << s.ToString() << endl;
|
||||
assert(false);
|
||||
}
|
||||
return std::shared_ptr<DB>(db);
|
||||
}
|
||||
|
||||
// Imagine we are maintaining a set of uint64 counters.
|
||||
// Each counter has a distinct name. And we would like
|
||||
// to support four high level operations:
|
||||
// set, add, get and remove
|
||||
// This is a quick implementation without a Merge operation.
|
||||
class Counters {
|
||||
|
||||
protected:
|
||||
std::shared_ptr<DB> db_;
|
||||
|
||||
WriteOptions put_option_;
|
||||
ReadOptions get_option_;
|
||||
WriteOptions delete_option_;
|
||||
|
||||
uint64_t default_;
|
||||
|
||||
public:
|
||||
explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
|
||||
: db_(db),
|
||||
put_option_(),
|
||||
get_option_(),
|
||||
delete_option_(),
|
||||
default_(defaultCount) {
|
||||
assert(db_);
|
||||
}
|
||||
|
||||
virtual ~Counters() {}
|
||||
|
||||
// public interface of Counters.
|
||||
// All four functions return false
|
||||
// if the underlying rocksdb operation failed.
|
||||
|
||||
// mapped to a rocksdb Put
|
||||
bool set(const string& key, uint64_t value) {
|
||||
// just treat the internal rep of the uint64 as the string
|
||||
Slice slice((char *)&value, sizeof(value));
|
||||
auto s = db_->Put(put_option_, key, slice);
|
||||
|
||||
if (s.ok()) {
|
||||
return true;
|
||||
} else {
|
||||
cerr << s.ToString() << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// mapped to a rocksdb Delete
|
||||
bool remove(const string& key) {
|
||||
auto s = db_->Delete(delete_option_, key);
|
||||
|
||||
if (s.ok()) {
|
||||
return true;
|
||||
} else {
|
||||
cerr << s.ToString() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// mapped to a rocksdb Get
|
||||
bool get(const string& key, uint64_t *value) {
|
||||
string str;
|
||||
auto s = db_->Get(get_option_, key, &str);
|
||||
|
||||
if (s.IsNotFound()) {
|
||||
// return default value if not found;
|
||||
*value = default_;
|
||||
return true;
|
||||
} else if (s.ok()) {
|
||||
// deserialization
|
||||
if (str.size() != sizeof(uint64_t)) {
|
||||
cerr << "value corruption\n";
|
||||
return false;
|
||||
}
|
||||
*value = DecodeFixed64(&str[0]);
|
||||
return true;
|
||||
} else {
|
||||
cerr << s.ToString() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// 'add' is implemented as get -> modify -> set
|
||||
// An alternative is a single merge operation, see MergeBasedCounters
|
||||
virtual bool add(const string& key, uint64_t value) {
|
||||
uint64_t base = default_;
|
||||
return get(key, &base) && set(key, base + value);
|
||||
}
|
||||
|
||||
|
||||
// convenience functions for testing
|
||||
void assert_set(const string& key, uint64_t value) {
|
||||
assert(set(key, value));
|
||||
}
|
||||
|
||||
void assert_remove(const string& key) {
|
||||
assert(remove(key));
|
||||
}
|
||||
|
||||
uint64_t assert_get(const string& key) {
|
||||
uint64_t value = default_;
|
||||
assert(get(key, &value));
|
||||
return value;
|
||||
}
|
||||
|
||||
void assert_add(const string& key, uint64_t value) {
|
||||
assert(add(key, value));
|
||||
}
|
||||
};
|
||||
|
||||
// Implement 'add' directly with the new Merge operation
|
||||
class MergeBasedCounters : public Counters {
|
||||
private:
|
||||
WriteOptions merge_option_; // for merge
|
||||
|
||||
public:
|
||||
explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
|
||||
: Counters(db, defaultCount),
|
||||
merge_option_() {
|
||||
}
|
||||
|
||||
// mapped to a rocksdb Merge operation
|
||||
virtual bool add(const string& key, uint64_t value) override {
|
||||
char encoded[sizeof(uint64_t)];
|
||||
EncodeFixed64(encoded, value);
|
||||
Slice slice(encoded, sizeof(uint64_t));
|
||||
auto s = db_->Merge(merge_option_, key, slice);
|
||||
|
||||
if (s.ok()) {
|
||||
return true;
|
||||
} else {
|
||||
cerr << s.ToString() << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void dumpDb(DB* db) {
|
||||
auto it = unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
|
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||
uint64_t value = DecodeFixed64(it->value().data());
|
||||
cout << it->key().ToString() << ": " << value << endl;
|
||||
}
|
||||
assert(it->status().ok()); // Check for any errors found during the scan
|
||||
}
|
||||
|
||||
void testCounters(Counters& counters, DB* db, bool test_compaction) {
|
||||
|
||||
FlushOptions o;
|
||||
o.wait = true;
|
||||
|
||||
counters.assert_set("a", 1);
|
||||
|
||||
if (test_compaction) db->Flush(o);
|
||||
|
||||
assert(counters.assert_get("a") == 1);
|
||||
|
||||
counters.assert_remove("b");
|
||||
|
||||
// default value is 0 if non-existent
|
||||
assert(counters.assert_get("b") == 0);
|
||||
|
||||
counters.assert_add("a", 2);
|
||||
|
||||
if (test_compaction) db->Flush(o);
|
||||
|
||||
// 1+2 = 3
|
||||
assert(counters.assert_get("a")== 3);
|
||||
|
||||
dumpDb(db);
|
||||
|
||||
std::cout << "1\n";
|
||||
|
||||
// 1+...+49 = ?
|
||||
uint64_t sum = 0;
|
||||
for (int i = 1; i < 50; i++) {
|
||||
counters.assert_add("b", i);
|
||||
sum += i;
|
||||
}
|
||||
assert(counters.assert_get("b") == sum);
|
||||
|
||||
std::cout << "2\n";
|
||||
dumpDb(db);
|
||||
|
||||
std::cout << "3\n";
|
||||
|
||||
if (test_compaction) {
|
||||
db->Flush(o);
|
||||
|
||||
cout << "Compaction started ...\n";
|
||||
db->CompactRange(nullptr, nullptr);
|
||||
cout << "Compaction ended\n";
|
||||
|
||||
dumpDb(db);
|
||||
|
||||
assert(counters.assert_get("a")== 3);
|
||||
assert(counters.assert_get("b") == sum);
|
||||
}
|
||||
}
|
||||
|
||||
void runTest(int argc, const string& dbname, const bool use_ttl = false) {
|
||||
auto db = OpenDb(dbname, use_ttl);
|
||||
|
||||
{
|
||||
cout << "Test read-modify-write counters... \n";
|
||||
Counters counters(db, 0);
|
||||
testCounters(counters, db.get(), true);
|
||||
}
|
||||
|
||||
bool compact = false;
|
||||
if (argc > 1) {
|
||||
compact = true;
|
||||
cout << "Turn on Compaction\n";
|
||||
}
|
||||
|
||||
{
|
||||
cout << "Test merge-based counters... \n";
|
||||
MergeBasedCounters counters(db, 0);
|
||||
testCounters(counters, db.get(), compact);
|
||||
}
|
||||
|
||||
DestroyDB(dbname, Options());
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
//TODO: Make this test like a general rocksdb unit-test
|
||||
runTest(argc, test::TmpDir() + "/merge_testdb");
|
||||
runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
|
||||
return 0;
|
||||
}
|
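For readers skimming past the test harness, the merge workflow it exercises boils down to the following sketch. The database path is illustrative, and it is assumed that MergeOperators::CreateUInt64AddOperator lives in the rocksdb namespace of utilities/merge_operators.h, as included above.

// Minimal usage sketch of the merge path exercised by the test above.
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "utilities/merge_operators.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.merge_operator = rocksdb::MergeOperators::CreateUInt64AddOperator();

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/merge_example", &db);
  if (!s.ok()) return 1;

  // Each Merge records an add operand; FullMerge/PartialMerge combine the
  // operands lazily on read or during compaction.
  uint64_t one = 1;
  rocksdb::Slice operand(reinterpret_cast<const char*>(&one), sizeof(one));
  db->Merge(rocksdb::WriteOptions(), "counter", operand);
  db->Merge(rocksdb::WriteOptions(), "counter", operand);

  std::string value;
  db->Get(rocksdb::ReadOptions(), "counter", &value);  // 8-byte encoded sum (2)
  delete db;
  return 0;
}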
328
db/perf_context_test.cc
Normal file
328
db/perf_context_test.cc
Normal file
@ -0,0 +1,328 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "/usr/include/valgrind/callgrind.h"
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/perf_context.h"
|
||||
#include "util/histogram.h"
|
||||
#include "util/stop_watch.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
|
||||
bool FLAGS_random_key = false;
|
||||
bool FLAGS_use_set_based_memetable = false;
|
||||
int FLAGS_total_keys = 100;
|
||||
int FLAGS_write_buffer_size = 1000000000;
|
||||
int FLAGS_max_write_buffer_number = 8;
|
||||
int FLAGS_min_write_buffer_number_to_merge = 7;
|
||||
|
||||
// Path to the database on file system
|
||||
const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
std::shared_ptr<DB> OpenDb() {
|
||||
DB* db;
|
||||
Options options;
|
||||
options.create_if_missing = true;
|
||||
options.write_buffer_size = FLAGS_write_buffer_size;
|
||||
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
|
||||
options.min_write_buffer_number_to_merge =
|
||||
FLAGS_min_write_buffer_number_to_merge;
|
||||
|
||||
if (FLAGS_use_set_based_memetable) {
|
||||
auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
|
||||
options.memtable_factory =
|
||||
std::make_shared<rocksdb::PrefixHashRepFactory>(prefix_extractor);
|
||||
}
|
||||
|
||||
Status s = DB::Open(options, kDbName, &db);
|
||||
ASSERT_OK(s);
|
||||
return std::shared_ptr<DB>(db);
|
||||
}
|
||||
|
||||
class PerfContextTest { };
|
||||
|
||||
TEST(PerfContextTest, SeekIntoDeletion) {
|
||||
DestroyDB(kDbName, Options());
|
||||
auto db = OpenDb();
|
||||
WriteOptions write_options;
|
||||
ReadOptions read_options;
|
||||
|
||||
for (int i = 0; i < FLAGS_total_keys; ++i) {
|
||||
std::string key = "k" + std::to_string(i);
|
||||
std::string value = "v" + std::to_string(i);
|
||||
|
||||
db->Put(write_options, key, value);
|
||||
}
|
||||
|
||||
for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
|
||||
std::string key = "k" + std::to_string(i);
|
||||
db->Delete(write_options, key);
|
||||
}
|
||||
|
||||
HistogramImpl hist_get;
|
||||
HistogramImpl hist_get_time;
|
||||
for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
|
||||
std::string key = "k" + std::to_string(i);
|
||||
std::string value;
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
auto status = db->Get(read_options, key, &value);
|
||||
auto elapsed_nanos = timer.ElapsedNanos();
|
||||
ASSERT_TRUE(status.IsNotFound());
|
||||
hist_get.Add(perf_context.user_key_comparison_count);
|
||||
hist_get_time.Add(elapsed_nanos);
|
||||
}
|
||||
|
||||
std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
|
||||
<< "Get time: \n" << hist_get_time.ToString();
|
||||
|
||||
HistogramImpl hist_seek_to_first;
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
iter->SeekToFirst();
|
||||
hist_seek_to_first.Add(perf_context.user_key_comparison_count);
|
||||
auto elapsed_nanos = timer.ElapsedNanos();
|
||||
|
||||
std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
|
||||
<< "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
|
||||
<< "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
|
||||
<< "elapsed: " << elapsed_nanos << "\n";
|
||||
|
||||
HistogramImpl hist_seek;
|
||||
for (int i = 0; i < FLAGS_total_keys; ++i) {
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
std::string key = "k" + std::to_string(i);
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
iter->Seek(key);
|
||||
auto elapsed_nanos = timer.ElapsedNanos();
|
||||
hist_seek.Add(perf_context.user_key_comparison_count);
|
||||
std::cout << "seek cmp: " << perf_context.user_key_comparison_count
|
||||
<< " ikey skipped " << perf_context.internal_key_skipped_count
|
||||
<< " idelete skipped " << perf_context.internal_delete_skipped_count
|
||||
<< " elapsed: " << elapsed_nanos << "ns\n";
|
||||
|
||||
perf_context.Reset();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
StopWatchNano timer2(Env::Default(), true);
|
||||
iter->Next();
|
||||
auto elapsed_nanos2 = timer2.ElapsedNanos();
|
||||
std::cout << "next cmp: " << perf_context.user_key_comparison_count
|
||||
<< "elapsed: " << elapsed_nanos2 << "ns\n";
|
||||
}
|
||||
|
||||
std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
|
||||
}
|
||||
|
||||
TEST(PerfContextTest, StopWatchNanoOverhead) {
|
||||
// profile the timer cost by itself!
|
||||
const int kTotalIterations = 1000000;
|
||||
std::vector<uint64_t> timings(kTotalIterations);
|
||||
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
for (auto& timing : timings) {
|
||||
timing = timer.ElapsedNanos(true /* reset */);
|
||||
}
|
||||
|
||||
HistogramImpl histogram;
|
||||
for (const auto timing : timings) {
|
||||
histogram.Add(timing);
|
||||
}
|
||||
|
||||
std::cout << histogram.ToString();
|
||||
}
|
||||
|
||||
TEST(PerfContextTest, StopWatchOverhead) {
|
||||
// profile the timer cost by itself!
|
||||
const int kTotalIterations = 1000000;
|
||||
std::vector<uint64_t> timings(kTotalIterations);
|
||||
|
||||
StopWatch timer(Env::Default());
|
||||
for (auto& timing : timings) {
|
||||
timing = timer.ElapsedMicros();
|
||||
}
|
||||
|
||||
HistogramImpl histogram;
|
||||
uint64_t prev_timing = 0;
|
||||
for (const auto timing : timings) {
|
||||
histogram.Add(timing - prev_timing);
|
||||
prev_timing = timing;
|
||||
}
|
||||
|
||||
std::cout << histogram.ToString();
|
||||
}
|
||||
|
||||
void ProfileKeyComparison() {
|
||||
DestroyDB(kDbName, Options()); // Start this test with a fresh DB
|
||||
|
||||
auto db = OpenDb();
|
||||
|
||||
WriteOptions write_options;
|
||||
ReadOptions read_options;
|
||||
|
||||
HistogramImpl hist_put;
|
||||
HistogramImpl hist_get;
|
||||
|
||||
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
|
||||
|
||||
std::vector<int> keys;
|
||||
for (int i = 0; i < FLAGS_total_keys; ++i) {
|
||||
keys.push_back(i);
|
||||
}
|
||||
|
||||
if (FLAGS_random_key) {
|
||||
std::random_shuffle(keys.begin(), keys.end());
|
||||
}
|
||||
|
||||
for (const int i : keys) {
|
||||
std::string key = "k" + std::to_string(i);
|
||||
std::string value = "v" + std::to_string(i);
|
||||
|
||||
perf_context.Reset();
|
||||
db->Put(write_options, key, value);
|
||||
hist_put.Add(perf_context.user_key_comparison_count);
|
||||
|
||||
perf_context.Reset();
|
||||
db->Get(read_options, key, &value);
|
||||
hist_get.Add(perf_context.user_key_comparison_count);
|
||||
}
|
||||
|
||||
std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
|
||||
<< "Get uesr key comparison: \n" << hist_get.ToString();
|
||||
|
||||
}
|
||||
|
||||
TEST(PerfContextTest, KeyComparisonCount) {
|
||||
SetPerfLevel(kEnableCount);
|
||||
ProfileKeyComparison();
|
||||
|
||||
SetPerfLevel(kDisable);
|
||||
ProfileKeyComparison();
|
||||
|
||||
SetPerfLevel(kEnableTime);
|
||||
ProfileKeyComparison();
|
||||
}
|
||||
|
||||
// make perf_context_test
|
||||
// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
|
||||
// For one memtable:
|
||||
// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
|
||||
// For two memtables:
|
||||
// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
|
||||
// Specify --random_key=1 to shuffle the key before insertion
|
||||
// Results show that, for sequential insertion, worst-case Seek Key comparison
|
||||
// is close to the total number of keys (linear), when there is only one
|
||||
// memtable. When there are two memtables, even the avg Seek Key comparison
|
||||
// starts to become linear to the input size.
|
||||
|
||||
TEST(PerfContextTest, SeekKeyComparison) {
|
||||
DestroyDB(kDbName, Options());
|
||||
auto db = OpenDb();
|
||||
WriteOptions write_options;
|
||||
ReadOptions read_options;
|
||||
|
||||
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
|
||||
|
||||
std::vector<int> keys;
|
||||
for (int i = 0; i < FLAGS_total_keys; ++i) {
|
||||
keys.push_back(i);
|
||||
}
|
||||
|
||||
if (FLAGS_random_key) {
|
||||
std::random_shuffle(keys.begin(), keys.end());
|
||||
}
|
||||
|
||||
HistogramImpl hist_put_time;
|
||||
HistogramImpl hist_wal_time;
|
||||
HistogramImpl hist_time_diff;
|
||||
|
||||
SetPerfLevel(kEnableTime);
|
||||
StopWatchNano timer(Env::Default());
|
||||
for (const int i : keys) {
|
||||
std::string key = "k" + std::to_string(i);
|
||||
std::string value = "v" + std::to_string(i);
|
||||
|
||||
perf_context.Reset();
|
||||
timer.Start();
|
||||
db->Put(write_options, key, value);
|
||||
auto put_time = timer.ElapsedNanos();
|
||||
hist_put_time.Add(put_time);
|
||||
hist_wal_time.Add(perf_context.wal_write_time);
|
||||
hist_time_diff.Add(put_time - perf_context.wal_write_time);
|
||||
}
|
||||
|
||||
std::cout << "Put time:\n" << hist_put_time.ToString()
|
||||
<< "WAL time:\n" << hist_wal_time.ToString()
|
||||
<< "time diff:\n" << hist_time_diff.ToString();
|
||||
|
||||
HistogramImpl hist_seek;
|
||||
HistogramImpl hist_next;
|
||||
|
||||
for (int i = 0; i < FLAGS_total_keys; ++i) {
|
||||
std::string key = "k" + std::to_string(i);
|
||||
std::string value = "v" + std::to_string(i);
|
||||
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
perf_context.Reset();
|
||||
iter->Seek(key);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_EQ(iter->value().ToString(), value);
|
||||
hist_seek.Add(perf_context.user_key_comparison_count);
|
||||
}
|
||||
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
for (iter->SeekToFirst(); iter->Valid();) {
|
||||
perf_context.Reset();
|
||||
iter->Next();
|
||||
hist_next.Add(perf_context.user_key_comparison_count);
|
||||
}
|
||||
|
||||
std::cout << "Seek:\n" << hist_seek.ToString()
|
||||
<< "Next:\n" << hist_next.ToString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
int n;
|
||||
char junk;
|
||||
|
||||
if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_write_buffer_size = n;
|
||||
}
|
||||
|
||||
if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
|
||||
FLAGS_total_keys = n;
|
||||
}
|
||||
|
||||
if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
|
||||
(n == 0 || n == 1)) {
|
||||
FLAGS_random_key = n;
|
||||
}
|
||||
|
||||
if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
|
||||
(n == 0 || n == 1)) {
|
||||
FLAGS_use_set_based_memetable = n;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
std::cout << kDbName << "\n";
|
||||
|
||||
rocksdb::test::RunAllTests();
|
||||
return 0;
|
||||
}
|
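Outside of the benchmark scaffolding, the perf_context pattern this file relies on is small enough to show on its own. A sketch follows, assuming perf_context, SetPerfLevel and kEnableCount are exposed by rocksdb/perf_context.h in the rocksdb namespace, as the usage above suggests.

// Sketch: enable counting, Reset() before the operation, read counters afterwards.
#include <iostream>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"

void TimeOneGet(rocksdb::DB* db, const std::string& key) {
  rocksdb::SetPerfLevel(rocksdb::kEnableCount);  // count, but skip timing
  rocksdb::perf_context.Reset();                 // clear the thread-local counters

  std::string value;
  db->Get(rocksdb::ReadOptions(), key, &value);

  std::cout << "user key comparisons: "
            << rocksdb::perf_context.user_key_comparison_count << "\n"
            << "internal keys skipped: "
            << rocksdb::perf_context.internal_key_skipped_count << "\n";
}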
73
db/prefix_filter_iterator.h
Normal file
73
db/prefix_filter_iterator.h
Normal file
@ -0,0 +1,73 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Wrap an underlying iterator, but exclude any results not starting
|
||||
// with a given prefix. Seeking to keys not beginning with the prefix
|
||||
// is invalid, and SeekToLast is not implemented (that would be
|
||||
// non-trivial), but otherwise this iterator will behave just like the
|
||||
// underlying iterator would if there happened to be no non-matching
|
||||
// keys in the dataset.
|
||||
|
||||
#pragma once
|
||||
#include "rocksdb/iterator.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class PrefixFilterIterator : public Iterator {
|
||||
private:
|
||||
Iterator* iter_;
|
||||
const Slice &prefix_;
|
||||
const SliceTransform *prefix_extractor_;
|
||||
Status status_;
|
||||
|
||||
public:
|
||||
PrefixFilterIterator(Iterator* iter,
|
||||
const Slice &prefix,
|
||||
const SliceTransform* prefix_extractor)
|
||||
: iter_(iter), prefix_(prefix),
|
||||
prefix_extractor_(prefix_extractor),
|
||||
status_(Status::OK()) {
|
||||
if (prefix_extractor == nullptr) {
|
||||
status_ = Status::InvalidArgument("A prefix filter may not be used "
|
||||
"unless a function is also defined "
|
||||
"for extracting prefixes");
|
||||
} else if (!prefix_extractor_->InRange(prefix)) {
|
||||
status_ = Status::InvalidArgument("Must provide a slice for prefix which"
|
||||
"is a prefix for some key");
|
||||
}
|
||||
}
|
||||
~PrefixFilterIterator() {
|
||||
delete iter_;
|
||||
}
|
||||
Slice key() const { return iter_->key(); }
|
||||
Slice value() const { return iter_->value(); }
|
||||
Status status() const {
|
||||
if (!status_.ok()) {
|
||||
return status_;
|
||||
}
|
||||
return iter_->status();
|
||||
}
|
||||
void Next() { iter_->Next(); }
|
||||
void Prev() { iter_->Prev(); }
|
||||
void Seek(const Slice& k) {
|
||||
if (prefix_extractor_->Transform(k) == prefix_) {
|
||||
iter_->Seek(k);
|
||||
} else {
|
||||
status_ = Status::InvalidArgument("Seek must begin with target prefix");
|
||||
}
|
||||
}
|
||||
void SeekToFirst() {
|
||||
Seek(prefix_);
|
||||
}
|
||||
void SeekToLast() {
|
||||
status_ = Status::NotSupported("SeekToLast is incompatible with prefixes");
|
||||
}
|
||||
bool Valid() const {
|
||||
return (status_.ok() && iter_->Valid() &&
|
||||
prefix_extractor_->Transform(iter_->key()) == prefix_);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
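A short usage sketch for the wrapper above: the 8-byte fixed prefix extractor mirrors db/prefix_test.cc, everything else is illustrative. The wrapper takes ownership of the inner iterator and deletes it in its destructor.

// Sketch: wrapping a plain DB iterator so a scan stays within one prefix.
#include "db/prefix_filter_iterator.h"
#include "rocksdb/db.h"

void ScanOnePrefix(rocksdb::DB* db, const rocksdb::Options& options,
                   const rocksdb::Slice& prefix) {
  // options.prefix_extractor is assumed to be set, e.g. NewFixedPrefixTransform(8).
  rocksdb::PrefixFilterIterator iter(db->NewIterator(rocksdb::ReadOptions()),
                                     prefix, options.prefix_extractor);
  for (iter.SeekToFirst(); iter.Valid(); iter.Next()) {
    // Only keys whose extracted prefix equals `prefix` are visible here;
    // iter.key() and iter.value() behave like the underlying iterator.
  }
  // iter.status() surfaces InvalidArgument if the prefix or extractor was unusable.
}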
319
db/prefix_test.cc
Normal file
319
db/prefix_test.cc
Normal file
@ -0,0 +1,319 @@
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include <gflags/gflags.h>
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/perf_context.h"
|
||||
#include "util/histogram.h"
|
||||
#include "util/stop_watch.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
DEFINE_bool(use_prefix_hash_memtable, true, "");
|
||||
DEFINE_bool(use_nolock_version, true, "");
|
||||
DEFINE_bool(trigger_deadlock, false,
|
||||
"issue delete in range scan to trigger PrefixHashMap deadlock");
|
||||
DEFINE_uint64(bucket_count, 100000, "number of buckets");
|
||||
DEFINE_uint64(num_locks, 10001, "number of locks");
|
||||
DEFINE_bool(random_prefix, false, "randomize prefix");
|
||||
DEFINE_uint64(total_prefixes, 1000, "total number of prefixes");
|
||||
DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix");
|
||||
DEFINE_int64(write_buffer_size, 1000000000, "");
|
||||
DEFINE_int64(max_write_buffer_number, 8, "");
|
||||
DEFINE_int64(min_write_buffer_number_to_merge, 7, "");
|
||||
|
||||
// Path to the database on file system
|
||||
const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct TestKey {
|
||||
uint64_t prefix;
|
||||
uint64_t sorted;
|
||||
|
||||
TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
|
||||
};
|
||||
|
||||
// return a slice backed by test_key
|
||||
inline Slice TestKeyToSlice(const TestKey& test_key) {
|
||||
return Slice((const char*)&test_key, sizeof(test_key));
|
||||
}
|
||||
|
||||
inline const TestKey* SliceToTestKey(const Slice& slice) {
|
||||
assert(slice.size() == sizeof(TestKey));
|
||||
return (const TestKey*)slice.data();
|
||||
}
|
||||
|
||||
class TestKeyComparator : public Comparator {
|
||||
public:
|
||||
virtual int Compare(const Slice& a, const Slice& b) const {
|
||||
const TestKey* key_a = SliceToTestKey(a);
|
||||
const TestKey* key_b = SliceToTestKey(b);
|
||||
if (key_a->prefix != key_b->prefix) {
|
||||
if (key_a->prefix < key_b->prefix) return -1;
|
||||
if (key_a->prefix > key_b->prefix) return 1;
|
||||
assert(false);
|
||||
} else {
|
||||
if (key_a->sorted < key_b->sorted) return -1;
|
||||
if (key_a->sorted > key_b->sorted) return 1;
|
||||
if (key_a->sorted == key_b->sorted) return 0;
|
||||
assert(false);
|
||||
}
|
||||
assert(false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "TestKeyComparator";
|
||||
}
|
||||
|
||||
virtual void FindShortestSeparator(
|
||||
std::string* start,
|
||||
const Slice& limit) const {
|
||||
}
|
||||
|
||||
virtual void FindShortSuccessor(std::string* key) const {}
|
||||
|
||||
};
|
||||
|
||||
class PrefixTest {
|
||||
public:
|
||||
std::shared_ptr<DB> OpenDb() {
|
||||
DB* db;
|
||||
|
||||
options.create_if_missing = true;
|
||||
options.write_buffer_size = FLAGS_write_buffer_size;
|
||||
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
|
||||
options.min_write_buffer_number_to_merge =
|
||||
FLAGS_min_write_buffer_number_to_merge;
|
||||
|
||||
options.comparator = new TestKeyComparator();
|
||||
if (FLAGS_use_prefix_hash_memtable) {
|
||||
auto prefix_extractor = NewFixedPrefixTransform(8);
|
||||
options.prefix_extractor = prefix_extractor;
|
||||
if (FLAGS_use_nolock_version) {
|
||||
options.memtable_factory.reset(NewHashSkipListRepFactory(
|
||||
prefix_extractor, FLAGS_bucket_count));
|
||||
} else {
|
||||
options.memtable_factory =
|
||||
std::make_shared<rocksdb::PrefixHashRepFactory>(
|
||||
prefix_extractor, FLAGS_bucket_count, FLAGS_num_locks);
|
||||
}
|
||||
}
|
||||
|
||||
Status s = DB::Open(options, kDbName, &db);
|
||||
ASSERT_OK(s);
|
||||
return std::shared_ptr<DB>(db);
|
||||
}
|
||||
~PrefixTest() {
|
||||
delete options.comparator;
|
||||
}
|
||||
protected:
|
||||
Options options;
|
||||
};
|
||||
|
||||
TEST(PrefixTest, DynamicPrefixIterator) {
|
||||
|
||||
DestroyDB(kDbName, Options());
|
||||
auto db = OpenDb();
|
||||
WriteOptions write_options;
|
||||
ReadOptions read_options;
|
||||
|
||||
std::vector<uint64_t> prefixes;
|
||||
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
|
||||
prefixes.push_back(i);
|
||||
}
|
||||
|
||||
if (FLAGS_random_prefix) {
|
||||
std::random_shuffle(prefixes.begin(), prefixes.end());
|
||||
}
|
||||
|
||||
// insert x random prefixes, each with y contiguous elements.
|
||||
for (auto prefix : prefixes) {
|
||||
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
|
||||
TestKey test_key(prefix, sorted);
|
||||
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
std::string value = "v" + std::to_string(sorted);
|
||||
|
||||
ASSERT_OK(db->Put(write_options, key, value));
|
||||
}
|
||||
}
|
||||
|
||||
// test seek existing keys
|
||||
HistogramImpl hist_seek_time;
|
||||
HistogramImpl hist_seek_comparison;
|
||||
|
||||
if (FLAGS_use_prefix_hash_memtable) {
|
||||
read_options.prefix_seek = true;
|
||||
}
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
|
||||
for (auto prefix : prefixes) {
|
||||
TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
std::string value = "v" + std::to_string(0);
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
uint64_t total_keys = 0;
|
||||
for (iter->Seek(key); iter->Valid(); iter->Next()) {
|
||||
if (FLAGS_trigger_deadlock) {
|
||||
std::cout << "Behold the deadlock!\n";
|
||||
db->Delete(write_options, iter->key());
|
||||
}
|
||||
auto test_key = SliceToTestKey(iter->key());
|
||||
if (test_key->prefix != prefix) break;
|
||||
total_keys++;
|
||||
}
|
||||
hist_seek_time.Add(timer.ElapsedNanos());
|
||||
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||
ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
|
||||
}
|
||||
|
||||
std::cout << "Seek key comparison: \n"
|
||||
<< hist_seek_comparison.ToString()
|
||||
<< "Seek time: \n"
|
||||
<< hist_seek_time.ToString();
|
||||
|
||||
// test non-existing keys
|
||||
HistogramImpl hist_no_seek_time;
|
||||
HistogramImpl hist_no_seek_comparison;
|
||||
|
||||
for (auto prefix = FLAGS_total_prefixes;
|
||||
prefix < FLAGS_total_prefixes + 100;
|
||||
prefix++) {
|
||||
TestKey test_key(prefix, 0);
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
iter->Seek(key);
|
||||
hist_no_seek_time.Add(timer.ElapsedNanos());
|
||||
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
}
|
||||
|
||||
std::cout << "non-existing Seek key comparison: \n"
|
||||
<< hist_no_seek_comparison.ToString()
|
||||
<< "non-existing Seek time: \n"
|
||||
<< hist_no_seek_time.ToString();
|
||||
}
|
||||
|
||||
TEST(PrefixTest, PrefixHash) {
|
||||
|
||||
DestroyDB(kDbName, Options());
|
||||
auto db = OpenDb();
|
||||
WriteOptions write_options;
|
||||
ReadOptions read_options;
|
||||
|
||||
std::vector<uint64_t> prefixes;
|
||||
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
|
||||
prefixes.push_back(i);
|
||||
}
|
||||
|
||||
if (FLAGS_random_prefix) {
|
||||
std::random_shuffle(prefixes.begin(), prefixes.end());
|
||||
}
|
||||
|
||||
// insert x random prefixes, each with y contiguous elements.
|
||||
HistogramImpl hist_put_time;
|
||||
HistogramImpl hist_put_comparison;
|
||||
|
||||
for (auto prefix : prefixes) {
|
||||
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
|
||||
TestKey test_key(prefix, sorted);
|
||||
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
std::string value = "v" + std::to_string(sorted);
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
ASSERT_OK(db->Put(write_options, key, value));
|
||||
hist_put_time.Add(timer.ElapsedNanos());
|
||||
hist_put_comparison.Add(perf_context.user_key_comparison_count);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
|
||||
<< "Put time: \n" << hist_put_time.ToString();
|
||||
|
||||
|
||||
// test seek existing keys
|
||||
HistogramImpl hist_seek_time;
|
||||
HistogramImpl hist_seek_comparison;
|
||||
|
||||
for (auto prefix : prefixes) {
|
||||
TestKey test_key(prefix, 0);
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
std::string value = "v" + std::to_string(0);
|
||||
|
||||
Slice key_prefix;
|
||||
if (FLAGS_use_prefix_hash_memtable) {
|
||||
key_prefix = options.prefix_extractor->Transform(key);
|
||||
read_options.prefix = &key_prefix;
|
||||
}
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
uint64_t total_keys = 0;
|
||||
for (iter->Seek(key); iter->Valid(); iter->Next()) {
|
||||
if (FLAGS_trigger_deadlock) {
|
||||
std::cout << "Behold the deadlock!\n";
|
||||
db->Delete(write_options, iter->key());
|
||||
}
|
||||
auto test_key = SliceToTestKey(iter->key());
|
||||
if (test_key->prefix != prefix) break;
|
||||
total_keys++;
|
||||
}
|
||||
hist_seek_time.Add(timer.ElapsedNanos());
|
||||
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||
ASSERT_EQ(total_keys, FLAGS_items_per_prefix);
|
||||
}
|
||||
|
||||
std::cout << "Seek key comparison: \n"
|
||||
<< hist_seek_comparison.ToString()
|
||||
<< "Seek time: \n"
|
||||
<< hist_seek_time.ToString();
|
||||
|
||||
// test non-existing keys
|
||||
HistogramImpl hist_no_seek_time;
|
||||
HistogramImpl hist_no_seek_comparison;
|
||||
|
||||
for (auto prefix = FLAGS_total_prefixes;
|
||||
prefix < FLAGS_total_prefixes + 100;
|
||||
prefix++) {
|
||||
TestKey test_key(prefix, 0);
|
||||
Slice key = TestKeyToSlice(test_key);
|
||||
|
||||
if (FLAGS_use_prefix_hash_memtable) {
|
||||
Slice key_prefix = options.prefix_extractor->Transform(key);
|
||||
read_options.prefix = &key_prefix;
|
||||
}
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
|
||||
perf_context.Reset();
|
||||
StopWatchNano timer(Env::Default(), true);
|
||||
iter->Seek(key);
|
||||
hist_no_seek_time.Add(timer.ElapsedNanos());
|
||||
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||
ASSERT_TRUE(!iter->Valid());
|
||||
}
|
||||
|
||||
std::cout << "non-existing Seek key comparison: \n"
|
||||
<< hist_no_seek_comparison.ToString()
|
||||
<< "non-existing Seek time: \n"
|
||||
<< hist_no_seek_time.ToString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
google::ParseCommandLineFlags(&argc, &argv, true);
|
||||
std::cout << kDbName << "\n";
|
||||
|
||||
rocksdb::test::RunAllTests();
|
||||
return 0;
|
||||
}
|
390
db/repair.cc
Normal file
390
db/repair.cc
Normal file
@ -0,0 +1,390 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// We recover the contents of the descriptor from the other files we find.
|
||||
// (1) Any log files are first converted to tables
|
||||
// (2) We scan every table to compute
|
||||
// (a) smallest/largest for the table
|
||||
// (b) largest sequence number in the table
|
||||
// (3) We generate descriptor contents:
|
||||
// - log number is set to zero
|
||||
// - next-file-number is set to 1 + largest file number we found
|
||||
// - last-sequence-number is set to largest sequence# found across
|
||||
// all tables (see 2b)
|
||||
// - compaction pointers are cleared
|
||||
// - every table file is added at level 0
|
||||
//
|
||||
// Possible optimization 1:
|
||||
// (a) Compute total size and use to pick appropriate max-level M
|
||||
// (b) Sort tables by largest sequence# in the table
|
||||
// (c) For each table: if it overlaps earlier table, place in level-0,
|
||||
// else place in level-M.
|
||||
// Possible optimization 2:
|
||||
// Store per-table metadata (smallest, largest, largest-seq#, ...)
|
||||
// in the table's meta section to speed up ScanTable.
|
||||
|
||||
#include "db/builder.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/log_reader.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
|
||||
class Repairer {
|
||||
public:
|
||||
Repairer(const std::string& dbname, const Options& options)
|
||||
: dbname_(dbname),
|
||||
env_(options.env),
|
||||
icmp_(options.comparator),
|
||||
ipolicy_(options.filter_policy),
|
||||
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
|
||||
next_file_number_(1) {
|
||||
// TableCache can be small since we expect each table to be opened once.
|
||||
table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10);
|
||||
edit_ = new VersionEdit(options.num_levels);
|
||||
}
|
||||
|
||||
~Repairer() {
|
||||
delete table_cache_;
|
||||
delete edit_;
|
||||
}
|
||||
|
||||
Status Run() {
|
||||
Status status = FindFiles();
|
||||
if (status.ok()) {
|
||||
ConvertLogFilesToTables();
|
||||
ExtractMetaData();
|
||||
status = WriteDescriptor();
|
||||
}
|
||||
if (status.ok()) {
|
||||
unsigned long long bytes = 0;
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
bytes += tables_[i].meta.file_size;
|
||||
}
|
||||
Log(options_.info_log,
|
||||
"**** Repaired rocksdb %s; "
|
||||
"recovered %d files; %llu bytes. "
|
||||
"Some data may have been lost. "
|
||||
"****",
|
||||
dbname_.c_str(),
|
||||
static_cast<int>(tables_.size()),
|
||||
bytes);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
private:
|
||||
struct TableInfo {
|
||||
FileMetaData meta;
|
||||
SequenceNumber min_sequence;
|
||||
SequenceNumber max_sequence;
|
||||
};
|
||||
|
||||
std::string const dbname_;
|
||||
Env* const env_;
|
||||
InternalKeyComparator const icmp_;
|
||||
InternalFilterPolicy const ipolicy_;
|
||||
Options const options_;
|
||||
TableCache* table_cache_;
|
||||
VersionEdit* edit_;
|
||||
|
||||
std::vector<std::string> manifests_;
|
||||
std::vector<uint64_t> table_numbers_;
|
||||
std::vector<uint64_t> logs_;
|
||||
std::vector<TableInfo> tables_;
|
||||
uint64_t next_file_number_;
|
||||
const EnvOptions storage_options_;
|
||||
|
||||
Status FindFiles() {
|
||||
std::vector<std::string> filenames;
|
||||
Status status = env_->GetChildren(dbname_, &filenames);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
if (filenames.empty()) {
|
||||
return Status::IOError(dbname_, "repair found no files");
|
||||
}
|
||||
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
for (size_t i = 0; i < filenames.size(); i++) {
|
||||
if (ParseFileName(filenames[i], &number, &type)) {
|
||||
if (type == kDescriptorFile) {
|
||||
manifests_.push_back(filenames[i]);
|
||||
} else {
|
||||
if (number + 1 > next_file_number_) {
|
||||
next_file_number_ = number + 1;
|
||||
}
|
||||
if (type == kLogFile) {
|
||||
logs_.push_back(number);
|
||||
} else if (type == kTableFile) {
|
||||
table_numbers_.push_back(number);
|
||||
} else {
|
||||
// Ignore other files
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
void ConvertLogFilesToTables() {
|
||||
for (size_t i = 0; i < logs_.size(); i++) {
|
||||
std::string logname = LogFileName(dbname_, logs_[i]);
|
||||
Status status = ConvertLogToTable(logs_[i]);
|
||||
if (!status.ok()) {
|
||||
Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
|
||||
(unsigned long long) logs_[i],
|
||||
status.ToString().c_str());
|
||||
}
|
||||
ArchiveFile(logname);
|
||||
}
|
||||
}
|
||||
|
||||
Status ConvertLogToTable(uint64_t log) {
|
||||
struct LogReporter : public log::Reader::Reporter {
|
||||
Env* env;
|
||||
std::shared_ptr<Logger> info_log;
|
||||
uint64_t lognum;
|
||||
virtual void Corruption(size_t bytes, const Status& s) {
|
||||
// We print error messages for corruption, but continue repairing.
|
||||
Log(info_log, "Log #%llu: dropping %d bytes; %s",
|
||||
(unsigned long long) lognum,
|
||||
static_cast<int>(bytes),
|
||||
s.ToString().c_str());
|
||||
}
|
||||
};
|
||||
|
||||
// Open the log file
|
||||
std::string logname = LogFileName(dbname_, log);
|
||||
unique_ptr<SequentialFile> lfile;
|
||||
Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Create the log reader.
|
||||
LogReporter reporter;
|
||||
reporter.env = env_;
|
||||
reporter.info_log = options_.info_log;
|
||||
reporter.lognum = log;
|
||||
// We intentionally make log::Reader do checksumming so that
|
||||
// corruptions cause entire commits to be skipped instead of
|
||||
// propagating bad information (like overly large sequence
|
||||
// numbers).
|
||||
log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
|
||||
0/*initial_offset*/);
|
||||
|
||||
// Read all the records and add to a memtable
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
WriteBatch batch;
|
||||
MemTable* mem = new MemTable(icmp_, options_.memtable_factory,
|
||||
options_.num_levels);
|
||||
mem->Ref();
|
||||
int counter = 0;
|
||||
while (reader.ReadRecord(&record, &scratch)) {
|
||||
if (record.size() < 12) {
|
||||
reporter.Corruption(
|
||||
record.size(), Status::Corruption("log record too small"));
|
||||
continue;
|
||||
}
|
||||
WriteBatchInternal::SetContents(&batch, record);
|
||||
status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
|
||||
if (status.ok()) {
|
||||
counter += WriteBatchInternal::Count(&batch);
|
||||
} else {
|
||||
Log(options_.info_log, "Log #%llu: ignoring %s",
|
||||
(unsigned long long) log,
|
||||
status.ToString().c_str());
|
||||
status = Status::OK(); // Keep going with rest of file
|
||||
}
|
||||
}
|
||||
|
||||
// Do not record a version edit for this conversion to a Table
|
||||
// since ExtractMetaData() will also generate edits.
|
||||
FileMetaData meta;
|
||||
meta.number = next_file_number_++;
|
||||
Iterator* iter = mem->NewIterator();
|
||||
status = BuildTable(dbname_, env_, options_, storage_options_,
|
||||
table_cache_, iter, &meta,
|
||||
icmp_.user_comparator(), 0, 0, true);
|
||||
delete iter;
|
||||
mem->Unref();
|
||||
mem = nullptr;
|
||||
if (status.ok()) {
|
||||
if (meta.file_size > 0) {
|
||||
table_numbers_.push_back(meta.number);
|
||||
}
|
||||
}
|
||||
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
|
||||
(unsigned long long) log,
|
||||
counter,
|
||||
(unsigned long long) meta.number,
|
||||
status.ToString().c_str());
|
||||
return status;
|
||||
}
|
||||
|
||||
void ExtractMetaData() {
|
||||
std::vector<TableInfo> kept;
|
||||
for (size_t i = 0; i < table_numbers_.size(); i++) {
|
||||
TableInfo t;
|
||||
t.meta.number = table_numbers_[i];
|
||||
Status status = ScanTable(&t);
|
||||
if (!status.ok()) {
|
||||
std::string fname = TableFileName(dbname_, table_numbers_[i]);
|
||||
Log(options_.info_log, "Table #%llu: ignoring %s",
|
||||
(unsigned long long) table_numbers_[i],
|
||||
status.ToString().c_str());
|
||||
ArchiveFile(fname);
|
||||
} else {
|
||||
tables_.push_back(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status ScanTable(TableInfo* t) {
|
||||
std::string fname = TableFileName(dbname_, t->meta.number);
|
||||
int counter = 0;
|
||||
Status status = env_->GetFileSize(fname, &t->meta.file_size);
|
||||
if (status.ok()) {
|
||||
Iterator* iter = table_cache_->NewIterator(
|
||||
ReadOptions(), storage_options_, t->meta.number, t->meta.file_size);
|
||||
bool empty = true;
|
||||
ParsedInternalKey parsed;
|
||||
t->min_sequence = 0;
|
||||
t->max_sequence = 0;
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
Slice key = iter->key();
|
||||
if (!ParseInternalKey(key, &parsed)) {
|
||||
Log(options_.info_log, "Table #%llu: unparsable key %s",
|
||||
(unsigned long long) t->meta.number,
|
||||
EscapeString(key).c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
counter++;
|
||||
if (empty) {
|
||||
empty = false;
|
||||
t->meta.smallest.DecodeFrom(key);
|
||||
}
|
||||
t->meta.largest.DecodeFrom(key);
|
||||
if (parsed.sequence < t->min_sequence) {
|
||||
t->min_sequence = parsed.sequence;
|
||||
}
|
||||
if (parsed.sequence > t->max_sequence) {
|
||||
t->max_sequence = parsed.sequence;
|
||||
}
|
||||
}
|
||||
if (!iter->status().ok()) {
|
||||
status = iter->status();
|
||||
}
|
||||
delete iter;
|
||||
}
|
||||
Log(options_.info_log, "Table #%llu: %d entries %s",
|
||||
(unsigned long long) t->meta.number,
|
||||
counter,
|
||||
status.ToString().c_str());
|
||||
return status;
|
||||
}
|
||||
|
||||
Status WriteDescriptor() {
|
||||
std::string tmp = TempFileName(dbname_, 1);
|
||||
unique_ptr<WritableFile> file;
|
||||
Status status = env_->NewWritableFile(tmp, &file, storage_options_);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
SequenceNumber max_sequence = 0;
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
if (max_sequence < tables_[i].max_sequence) {
|
||||
max_sequence = tables_[i].max_sequence;
|
||||
}
|
||||
}
|
||||
|
||||
edit_->SetComparatorName(icmp_.user_comparator()->Name());
|
||||
edit_->SetLogNumber(0);
|
||||
edit_->SetNextFile(next_file_number_);
|
||||
edit_->SetLastSequence(max_sequence);
|
||||
|
||||
for (size_t i = 0; i < tables_.size(); i++) {
|
||||
// TODO(opt): separate out into multiple levels
|
||||
const TableInfo& t = tables_[i];
|
||||
edit_->AddFile(0, t.meta.number, t.meta.file_size,
|
||||
t.meta.smallest, t.meta.largest,
|
||||
t.min_sequence, t.max_sequence);
|
||||
}
|
||||
|
||||
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
|
||||
{
|
||||
log::Writer log(std::move(file));
|
||||
std::string record;
|
||||
edit_->EncodeTo(&record);
|
||||
status = log.AddRecord(record);
|
||||
}
|
||||
|
||||
if (!status.ok()) {
|
||||
env_->DeleteFile(tmp);
|
||||
} else {
|
||||
// Discard older manifests
|
||||
for (size_t i = 0; i < manifests_.size(); i++) {
|
||||
ArchiveFile(dbname_ + "/" + manifests_[i]);
|
||||
}
|
||||
|
||||
// Install new manifest
|
||||
status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
|
||||
if (status.ok()) {
|
||||
status = SetCurrentFile(env_, dbname_, 1);
|
||||
} else {
|
||||
env_->DeleteFile(tmp);
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
void ArchiveFile(const std::string& fname) {
|
||||
// Move into another directory. E.g., for
|
||||
// dir/foo
|
||||
// rename to
|
||||
// dir/lost/foo
|
||||
const char* slash = strrchr(fname.c_str(), '/');
|
||||
std::string new_dir;
|
||||
if (slash != nullptr) {
|
||||
new_dir.assign(fname.data(), slash - fname.data());
|
||||
}
|
||||
new_dir.append("/lost");
|
||||
env_->CreateDir(new_dir); // Ignore error
|
||||
std::string new_file = new_dir;
|
||||
new_file.append("/");
|
||||
new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
|
||||
Status s = env_->RenameFile(fname, new_file);
|
||||
Log(options_.info_log, "Archiving %s: %s\n",
|
||||
fname.c_str(), s.ToString().c_str());
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
Status RepairDB(const std::string& dbname, const Options& options) {
|
||||
Repairer repairer(dbname, options);
|
||||
return repairer.Run();
|
||||
}
|
||||
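// A minimal usage sketch for the entry point above; the path is a placeholder
// and leaving options.info_log unset sends repair output to the default logger.
int RepairExample() {
  Options options;
  Status s = RepairDB("/path/to/db", options);
  return s.ok() ? 0 : 1;
}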
|
||||
} // namespace rocksdb
|
793
db/simple_table_db_test.cc
Normal file
@ -0,0 +1,793 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/filter_policy.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/version_set.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "db/db_statistics.h"
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/compaction_filter.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
|
||||
using std::unique_ptr;
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
|
||||
// as production quality.
|
||||
// SimpleTable requires the input key size to be fixed at 16 bytes; a value cannot
|
||||
// be longer than 150000 bytes. Data is stored on disk in this format:
|
||||
// +--------------------------------------------+ <= key1 offset
|
||||
// | key1 | value_size (4 bytes) | |
|
||||
// +----------------------------------------+ |
|
||||
// | value1 |
|
||||
// | |
|
||||
// +----------------------------------------+---+ <= key2 offset
|
||||
// | key2 | value_size (4 bytes) | |
|
||||
// +----------------------------------------+ |
|
||||
// | value2 |
|
||||
// | |
|
||||
// | ...... |
|
||||
// +-----------------+--------------------------+ <= index_block_offset
|
||||
// | key1 | key1 offset (8 bytes) |
|
||||
// +-----------------+--------------------------+
|
||||
// | key2 | key2 offset (8 bytes) |
|
||||
// +-----------------+--------------------------+
|
||||
// | key3 | key3 offset (8 bytes) |
|
||||
// +-----------------+--------------------------+
|
||||
// | ...... |
|
||||
// +-----------------+------------+-------------+
|
||||
// | index_block_offset (8 bytes) |
|
||||
// +------------------------------+
|
||||
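// A minimal sketch of the layout drawn above, using the same PutFixed32 /
// PutFixed64 helpers the builder below uses: each record is the fixed-size
// internal key, a 4-byte value length, then the value bytes; the index repeats
// each key followed by the 8-byte offset of its record; the file ends with the
// 8-byte offset of the index block. The helper names here are illustrative only.
static void AppendSimpleTableRecord(std::string* file, std::string* index,
                                    const Slice& internal_key,
                                    const Slice& value) {
  index->append(internal_key.data(), internal_key.size());
  PutFixed64(index, file->size());                        // key offset
  file->append(internal_key.data(), internal_key.size());
  PutFixed32(file, static_cast<uint32_t>(value.size()));  // value_size
  file->append(value.data(), value.size());
}

static void FinishSimpleTableFile(std::string* file, const std::string& index) {
  uint64_t index_block_offset = file->size();
  file->append(index);                   // index block
  PutFixed64(file, index_block_offset);  // footer
}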
|
||||
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
|
||||
// as production quality.
|
||||
class SimpleTableReader: public TableReader {
|
||||
public:
|
||||
// Attempt to open the table that is stored in bytes [0..file_size)
|
||||
// of "file", and read the metadata entries necessary to allow
|
||||
// retrieving data from the table.
|
||||
//
|
||||
// If successful, returns ok and sets "*table_reader" to the newly opened
|
||||
// table; the returned unique_ptr owns it, so no explicit delete is needed.
|
||||
// If there was an error while initializing the table, returns a non-ok
|
||||
// status and leaves "*table_reader" untouched.
|
||||
//
|
||||
// On success the reader takes ownership of "*file", so the caller must
|
||||
// not use "file" afterwards.
|
||||
static Status Open(const Options& options, const EnvOptions& soptions,
|
||||
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
|
||||
unique_ptr<TableReader>* table_reader);
|
||||
|
||||
bool PrefixMayMatch(const Slice& internal_prefix) override;
|
||||
|
||||
Iterator* NewIterator(const ReadOptions&) override;
|
||||
|
||||
Status Get(
|
||||
const ReadOptions&, const Slice& key, void* arg,
|
||||
bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
|
||||
void (*mark_key_may_exist)(void*) = nullptr) override;
|
||||
|
||||
uint64_t ApproximateOffsetOf(const Slice& key) override;
|
||||
|
||||
bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
|
||||
|
||||
void SetupForCompaction() override;
|
||||
|
||||
TableStats& GetTableStats() override;
|
||||
|
||||
~SimpleTableReader();
|
||||
|
||||
private:
|
||||
struct Rep;
|
||||
Rep* rep_;
|
||||
|
||||
explicit SimpleTableReader(Rep* rep) {
|
||||
rep_ = rep;
|
||||
}
|
||||
friend class TableCache;
|
||||
friend class SimpleTableIterator;
|
||||
|
||||
Status GetOffset(const Slice& target, uint64_t* offset);
|
||||
|
||||
// No copying allowed
|
||||
SimpleTableReader(const SimpleTableReader&) = delete;
|
||||
void operator=(const SimpleTableReader&) = delete;
|
||||
};
|
||||
|
||||
// Iterator to iterate SimpleTable
|
||||
class SimpleTableIterator: public Iterator {
|
||||
public:
|
||||
explicit SimpleTableIterator(SimpleTableReader* table);
|
||||
~SimpleTableIterator();
|
||||
|
||||
bool Valid() const;
|
||||
|
||||
void SeekToFirst();
|
||||
|
||||
void SeekToLast();
|
||||
|
||||
void Seek(const Slice& target);
|
||||
|
||||
void Next();
|
||||
|
||||
void Prev();
|
||||
|
||||
Slice key() const;
|
||||
|
||||
Slice value() const;
|
||||
|
||||
Status status() const;
|
||||
|
||||
private:
|
||||
SimpleTableReader* table_;
|
||||
uint64_t offset_;
|
||||
uint64_t next_offset_;
|
||||
Slice key_;
|
||||
Slice value_;
|
||||
char tmp_str_[4];
|
||||
char* key_str_;
|
||||
char* value_str_;
|
||||
int value_str_len_;
|
||||
Status status_;
|
||||
// No copying allowed
|
||||
SimpleTableIterator(const SimpleTableIterator&) = delete;
|
||||
void operator=(const SimpleTableIterator&) = delete;
|
||||
};
|
||||
|
||||
struct SimpleTableReader::Rep {
|
||||
~Rep() {
|
||||
}
|
||||
Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
|
||||
int num_entries) :
|
||||
soptions(storage_options), index_start_offset(index_start_offset),
|
||||
num_entries(num_entries) {
|
||||
}
|
||||
|
||||
Options options;
|
||||
const EnvOptions& soptions;
|
||||
Status status;
|
||||
unique_ptr<RandomAccessFile> file;
|
||||
uint64_t index_start_offset;
|
||||
int num_entries;
|
||||
TableStats table_stats;
|
||||
|
||||
const static int user_key_size = 16;
|
||||
const static int offset_length = 8;
|
||||
const static int key_footer_len = 8;
|
||||
|
||||
static int GetInternalKeyLength() {
|
||||
return user_key_size + key_footer_len;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleTableReader::~SimpleTableReader() {
|
||||
delete rep_;
|
||||
}
|
||||
|
||||
Status SimpleTableReader::Open(const Options& options,
|
||||
const EnvOptions& soptions,
|
||||
unique_ptr<RandomAccessFile> && file,
|
||||
uint64_t size,
|
||||
unique_ptr<TableReader>* table_reader) {
|
||||
char footer_space[Rep::offset_length];
|
||||
Slice footer_input;
|
||||
Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
|
||||
&footer_input, footer_space);
|
||||
if (s.ok()) {
|
||||
uint64_t index_start_offset = DecodeFixed64(footer_space);
|
||||
|
||||
int num_entries = (size - Rep::offset_length - index_start_offset)
|
||||
/ (Rep::GetInternalKeyLength() + Rep::offset_length);
|
||||
SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
|
||||
index_start_offset,
|
||||
num_entries);
|
||||
|
||||
rep->file = std::move(file);
|
||||
rep->options = options;
|
||||
table_reader->reset(new SimpleTableReader(rep));
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
void SimpleTableReader::SetupForCompaction() {
|
||||
}
|
||||
|
||||
TableStats& SimpleTableReader::GetTableStats() {
|
||||
return rep_->table_stats;
|
||||
}
|
||||
|
||||
bool SimpleTableReader::PrefixMayMatch(const Slice& internal_prefix) {
|
||||
return true;
|
||||
}
|
||||
|
||||
Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) {
|
||||
return new SimpleTableIterator(this);
|
||||
}
|
||||
|
||||
Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
|
||||
uint32_t left = 0;
|
||||
uint32_t right = rep_->num_entries - 1;
|
||||
char key_chars[Rep::GetInternalKeyLength()];
|
||||
Slice tmp_slice;
|
||||
|
||||
uint32_t target_offset = 0;
|
||||
while (left <= right) {
|
||||
uint32_t mid = (left + right + 1) / 2;
|
||||
|
||||
uint64_t offset_to_read = rep_->index_start_offset
|
||||
+ (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
|
||||
Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
|
||||
&tmp_slice, key_chars);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
int compare_result = rep_->options.comparator->Compare(tmp_slice, target);
|
||||
|
||||
if (compare_result < 0) {
|
||||
if (left == right) {
|
||||
target_offset = right + 1;
|
||||
break;
|
||||
}
|
||||
left = mid;
|
||||
} else {
|
||||
if (left == right) {
|
||||
target_offset = left;
|
||||
break;
|
||||
}
|
||||
right = mid - 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (target_offset >= (uint32_t) rep_->num_entries) {
|
||||
*offset = rep_->index_start_offset;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
char value_offset_chars[Rep::offset_length];
|
||||
|
||||
int64_t offset_for_value_offset = rep_->index_start_offset
|
||||
+ (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
|
||||
+ Rep::GetInternalKeyLength();
|
||||
Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
|
||||
&tmp_slice, value_offset_chars);
|
||||
if (s.ok()) {
|
||||
*offset = DecodeFixed64(value_offset_chars);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
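// GetOffset() above is a lower-bound binary search over fixed-width index
// entries read straight from the file. The same idea over an in-memory,
// sorted key list (purely illustrative; the reader never materializes one):
static size_t LowerBoundIndex(const std::vector<std::string>& keys,
                              const Slice& target) {
  size_t left = 0;
  size_t right = keys.size();
  while (left < right) {
    size_t mid = left + (right - left) / 2;
    if (Slice(keys[mid]).compare(target) < 0) {
      left = mid + 1;  // keys[mid] is too small; answer is to the right
    } else {
      right = mid;     // keys[mid] could be the first key >= target
    }
  }
  return left;  // == keys.size() when every key is smaller than target
}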
|
||||
Status SimpleTableReader::Get(
|
||||
const ReadOptions& options, const Slice& k, void* arg,
|
||||
bool (*saver)(void*, const Slice&, const Slice&, bool),
|
||||
void (*mark_key_may_exist)(void*)) {
|
||||
Status s;
|
||||
SimpleTableIterator* iter = new SimpleTableIterator(this);
|
||||
for (iter->Seek(k); iter->Valid(); iter->Next()) {
|
||||
if (!(*saver)(arg, iter->key(), iter->value(), true)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
s = iter->status();
|
||||
delete iter;
|
||||
return s;
|
||||
}
|
||||
|
||||
bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options,
|
||||
const Slice& key) {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
|
||||
table_(table) {
|
||||
key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
|
||||
value_str_len_ = -1;
|
||||
SeekToFirst();
|
||||
}
|
||||
|
||||
SimpleTableIterator::~SimpleTableIterator() {
|
||||
delete[] key_str_;
|
||||
if (value_str_len_ >= 0) {
|
||||
delete[] value_str_;
|
||||
}
|
||||
}
|
||||
|
||||
bool SimpleTableIterator::Valid() const {
|
||||
return offset_ < table_->rep_->index_start_offset;
|
||||
}
|
||||
|
||||
void SimpleTableIterator::SeekToFirst() {
|
||||
next_offset_ = 0;
|
||||
Next();
|
||||
}
|
||||
|
||||
void SimpleTableIterator::SeekToLast() {
|
||||
assert(false);
|
||||
}
|
||||
|
||||
void SimpleTableIterator::Seek(const Slice& target) {
|
||||
Status s = table_->GetOffset(target, &next_offset_);
|
||||
if (!s.ok()) {
|
||||
status_ = s;
|
||||
}
|
||||
Next();
|
||||
}
|
||||
|
||||
void SimpleTableIterator::Next() {
|
||||
offset_ = next_offset_;
|
||||
if (offset_ >= table_->rep_->index_start_offset) {
|
||||
return;
|
||||
}
|
||||
Slice result;
|
||||
int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
|
||||
|
||||
Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
|
||||
key_str_);
|
||||
next_offset_ += internal_key_size;
|
||||
key_ = result;
|
||||
|
||||
Slice value_size_slice;
|
||||
s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
|
||||
next_offset_ += 4;
|
||||
uint32_t value_size = DecodeFixed32(tmp_str_);
|
||||
|
||||
Slice value_slice;
|
||||
if ((int) value_size > value_str_len_) {
|
||||
if (value_str_len_ >= 0) {
|
||||
delete[] value_str_;
|
||||
}
|
||||
value_str_ = new char[value_size];
|
||||
value_str_len_ = value_size;
|
||||
}
|
||||
s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
|
||||
value_str_);
|
||||
next_offset_ += value_size;
|
||||
value_ = value_slice;
|
||||
}
|
||||
|
||||
void SimpleTableIterator::Prev() {
|
||||
assert(false);
|
||||
}
|
||||
|
||||
Slice SimpleTableIterator::key() const {
|
||||
Log(table_->rep_->options.info_log, "key!!!!");
|
||||
return key_;
|
||||
}
|
||||
|
||||
Slice SimpleTableIterator::value() const {
|
||||
return value_;
|
||||
}
|
||||
|
||||
Status SimpleTableIterator::status() const {
|
||||
return status_;
|
||||
}
|
||||
|
||||
class SimpleTableBuilder: public TableBuilder {
|
||||
public:
|
||||
// Create a builder that will store the contents of the table it is
|
||||
// building in *file. Does not close the file. It is up to the
|
||||
// caller to close the file after calling Finish(). The output file
|
||||
// will be part of level specified by 'level'. A value of -1 means
|
||||
// that the caller does not know which level the output file will reside in.
|
||||
SimpleTableBuilder(const Options& options, WritableFile* file,
|
||||
CompressionType compression_type);
|
||||
|
||||
// REQUIRES: Either Finish() or Abandon() has been called.
|
||||
~SimpleTableBuilder();
|
||||
|
||||
// Add key,value to the table being constructed.
|
||||
// REQUIRES: key is after any previously added key according to comparator.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
void Add(const Slice& key, const Slice& value) override;
|
||||
|
||||
// Return non-ok iff some error has been detected.
|
||||
Status status() const override;
|
||||
|
||||
// Finish building the table. Stops using the file passed to the
|
||||
// constructor after this function returns.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
Status Finish() override;
|
||||
|
||||
// Indicate that the contents of this builder should be abandoned. Stops
|
||||
// using the file passed to the constructor after this function returns.
|
||||
// If the caller is not going to call Finish(), it must call Abandon()
|
||||
// before destroying this builder.
|
||||
// REQUIRES: Finish(), Abandon() have not been called
|
||||
void Abandon() override;
|
||||
|
||||
// Number of calls to Add() so far.
|
||||
uint64_t NumEntries() const override;
|
||||
|
||||
// Size of the file generated so far. If invoked after a successful
|
||||
// Finish() call, returns the size of the final generated file.
|
||||
uint64_t FileSize() const override;
|
||||
|
||||
private:
|
||||
struct Rep;
|
||||
Rep* rep_;
|
||||
|
||||
// No copying allowed
|
||||
SimpleTableBuilder(const SimpleTableBuilder&) = delete;
|
||||
void operator=(const SimpleTableBuilder&) = delete;
|
||||
};
|
||||
|
||||
struct SimpleTableBuilder::Rep {
|
||||
Options options;
|
||||
WritableFile* file;
|
||||
uint64_t offset = 0;
|
||||
Status status;
|
||||
|
||||
uint64_t num_entries = 0;
|
||||
|
||||
bool closed = false; // Either Finish() or Abandon() has been called.
|
||||
|
||||
const static int user_key_size = 16;
|
||||
const static int offset_length = 8;
|
||||
const static int key_footer_len = 8;
|
||||
|
||||
static int GetInternalKeyLength() {
|
||||
return user_key_size + key_footer_len;
|
||||
}
|
||||
|
||||
std::string index;
|
||||
|
||||
Rep(const Options& opt, WritableFile* f) :
|
||||
options(opt), file(f) {
|
||||
}
|
||||
~Rep() {
|
||||
}
|
||||
};
|
||||
|
||||
SimpleTableBuilder::SimpleTableBuilder(const Options& options,
|
||||
WritableFile* file,
|
||||
CompressionType compression_type) :
|
||||
rep_(new SimpleTableBuilder::Rep(options, file)) {
|
||||
}
|
||||
|
||||
SimpleTableBuilder::~SimpleTableBuilder() {
|
||||
delete (rep_);
|
||||
}
|
||||
|
||||
void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
|
||||
assert((int) key.size() == Rep::GetInternalKeyLength());
|
||||
|
||||
// Update index
|
||||
rep_->index.append(key.data(), key.size());
|
||||
PutFixed64(&(rep_->index), rep_->offset);
|
||||
|
||||
// Write key-value pair
|
||||
rep_->file->Append(key);
|
||||
rep_->offset += Rep::GetInternalKeyLength();
|
||||
|
||||
std::string size;
|
||||
int value_size = value.size();
|
||||
PutFixed32(&size, value_size);
|
||||
Slice sizeSlice(size);
|
||||
rep_->file->Append(sizeSlice);
|
||||
rep_->file->Append(value);
|
||||
rep_->offset += value_size + 4;
|
||||
|
||||
rep_->num_entries++;
|
||||
}
|
||||
|
||||
Status SimpleTableBuilder::status() const {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status SimpleTableBuilder::Finish() {
|
||||
Rep* r = rep_;
|
||||
assert(!r->closed);
|
||||
r->closed = true;
|
||||
|
||||
uint64_t index_offset = rep_->offset;
|
||||
Slice index_slice(rep_->index);
|
||||
rep_->file->Append(index_slice);
|
||||
rep_->offset += index_slice.size();
|
||||
|
||||
std::string index_offset_str;
|
||||
PutFixed64(&index_offset_str, index_offset);
|
||||
Slice foot_slice(index_offset_str);
|
||||
rep_->file->Append(foot_slice);
|
||||
rep_->offset += foot_slice.size();
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void SimpleTableBuilder::Abandon() {
|
||||
rep_->closed = true;
|
||||
}
|
||||
|
||||
uint64_t SimpleTableBuilder::NumEntries() const {
|
||||
return rep_->num_entries;
|
||||
}
|
||||
|
||||
uint64_t SimpleTableBuilder::FileSize() const {
|
||||
return rep_->offset;
|
||||
}
|
||||
|
||||
class SimpleTableFactory: public TableFactory {
|
||||
public:
|
||||
~SimpleTableFactory() {
|
||||
}
|
||||
SimpleTableFactory() {
|
||||
}
|
||||
const char* Name() const override {
|
||||
return "SimpleTable";
|
||||
}
|
||||
Status GetTableReader(const Options& options, const EnvOptions& soptions,
|
||||
unique_ptr<RandomAccessFile> && file,
|
||||
uint64_t file_size,
|
||||
unique_ptr<TableReader>* table_reader) const;
|
||||
|
||||
TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
|
||||
CompressionType compression_type) const;
|
||||
};
|
||||
|
||||
Status SimpleTableFactory::GetTableReader(
|
||||
const Options& options, const EnvOptions& soptions,
|
||||
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
|
||||
unique_ptr<TableReader>* table_reader) const {
|
||||
|
||||
return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
|
||||
table_reader);
|
||||
}
|
||||
|
||||
TableBuilder* SimpleTableFactory::GetTableBuilder(
|
||||
const Options& options, WritableFile* file,
|
||||
CompressionType compression_type) const {
|
||||
return new SimpleTableBuilder(options, file, compression_type);
|
||||
}
|
||||
|
||||
class SimpleTableDBTest {
|
||||
protected:
|
||||
public:
|
||||
std::string dbname_;
|
||||
Env* env_;
|
||||
DB* db_;
|
||||
|
||||
Options last_options_;
|
||||
|
||||
SimpleTableDBTest() :
|
||||
env_(Env::Default()) {
|
||||
dbname_ = test::TmpDir() + "/simple_table_db_test";
|
||||
ASSERT_OK(DestroyDB(dbname_, Options()));
|
||||
db_ = nullptr;
|
||||
Reopen();
|
||||
}
|
||||
|
||||
~SimpleTableDBTest() {
|
||||
delete db_;
|
||||
ASSERT_OK(DestroyDB(dbname_, Options()));
|
||||
}
|
||||
|
||||
// Return the current option configuration.
|
||||
Options CurrentOptions() {
|
||||
Options options;
|
||||
options.table_factory.reset(new SimpleTableFactory());
|
||||
return options;
|
||||
}
|
||||
|
||||
DBImpl* dbfull() {
|
||||
return reinterpret_cast<DBImpl*>(db_);
|
||||
}
|
||||
|
||||
void Reopen(Options* options = nullptr) {
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void Close() {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
}
|
||||
|
||||
void DestroyAndReopen(Options* options = nullptr) {
|
||||
// Destroy using last options
|
||||
Destroy(&last_options_);
|
||||
ASSERT_OK(TryReopen(options));
|
||||
}
|
||||
|
||||
void Destroy(Options* options) {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
ASSERT_OK(DestroyDB(dbname_, *options));
|
||||
}
|
||||
|
||||
Status PureReopen(Options* options, DB** db) {
|
||||
return DB::Open(*options, dbname_, db);
|
||||
}
|
||||
|
||||
Status TryReopen(Options* options = nullptr) {
|
||||
delete db_;
|
||||
db_ = nullptr;
|
||||
Options opts;
|
||||
if (options != nullptr) {
|
||||
opts = *options;
|
||||
} else {
|
||||
opts = CurrentOptions();
|
||||
opts.create_if_missing = true;
|
||||
}
|
||||
last_options_ = opts;
|
||||
|
||||
return DB::Open(opts, dbname_, &db_);
|
||||
}
|
||||
|
||||
Status Put(const Slice& k, const Slice& v) {
|
||||
return db_->Put(WriteOptions(), k, v);
|
||||
}
|
||||
|
||||
Status Delete(const std::string& k) {
|
||||
return db_->Delete(WriteOptions(), k);
|
||||
}
|
||||
|
||||
std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
|
||||
ReadOptions options;
|
||||
options.snapshot = snapshot;
|
||||
std::string result;
|
||||
Status s = db_->Get(options, k, &result);
|
||||
if (s.IsNotFound()) {
|
||||
result = "NOT_FOUND";
|
||||
} else if (!s.ok()) {
|
||||
result = s.ToString();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
int NumTableFilesAtLevel(int level) {
|
||||
std::string property;
|
||||
ASSERT_TRUE(
|
||||
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
|
||||
&property));
|
||||
return atoi(property.c_str());
|
||||
}
|
||||
|
||||
// Return spread of files per level
|
||||
std::string FilesPerLevel() {
|
||||
std::string result;
|
||||
int last_non_zero_offset = 0;
|
||||
for (int level = 0; level < db_->NumberLevels(); level++) {
|
||||
int f = NumTableFilesAtLevel(level);
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
|
||||
result += buf;
|
||||
if (f > 0) {
|
||||
last_non_zero_offset = result.size();
|
||||
}
|
||||
}
|
||||
result.resize(last_non_zero_offset);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string IterStatus(Iterator* iter) {
|
||||
std::string result;
|
||||
if (iter->Valid()) {
|
||||
result = iter->key().ToString() + "->" + iter->value().ToString();
|
||||
} else {
|
||||
result = "(invalid)";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
TEST(SimpleTableDBTest, Empty) {
|
||||
ASSERT_TRUE(db_ != nullptr);
|
||||
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
|
||||
}
|
||||
|
||||
TEST(SimpleTableDBTest, ReadWrite) {
|
||||
ASSERT_OK(Put("0000000000000foo", "v1"));
|
||||
ASSERT_EQ("v1", Get("0000000000000foo"));
|
||||
ASSERT_OK(Put("0000000000000bar", "v2"));
|
||||
ASSERT_OK(Put("0000000000000foo", "v3"));
|
||||
ASSERT_EQ("v3", Get("0000000000000foo"));
|
||||
ASSERT_EQ("v2", Get("0000000000000bar"));
|
||||
}
|
||||
|
||||
TEST(SimpleTableDBTest, Flush) {
|
||||
ASSERT_OK(Put("0000000000000foo", "v1"));
|
||||
ASSERT_OK(Put("0000000000000bar", "v2"));
|
||||
ASSERT_OK(Put("0000000000000foo", "v3"));
|
||||
dbfull()->TEST_FlushMemTable();
|
||||
ASSERT_EQ("v3", Get("0000000000000foo"));
|
||||
ASSERT_EQ("v2", Get("0000000000000bar"));
|
||||
}
|
||||
|
||||
TEST(SimpleTableDBTest, Flush2) {
|
||||
ASSERT_OK(Put("0000000000000bar", "b"));
|
||||
ASSERT_OK(Put("0000000000000foo", "v1"));
|
||||
dbfull()->TEST_FlushMemTable();
|
||||
|
||||
ASSERT_OK(Put("0000000000000foo", "v2"));
|
||||
dbfull()->TEST_FlushMemTable();
|
||||
ASSERT_EQ("v2", Get("0000000000000foo"));
|
||||
|
||||
ASSERT_OK(Put("0000000000000eee", "v3"));
|
||||
dbfull()->TEST_FlushMemTable();
|
||||
ASSERT_EQ("v3", Get("0000000000000eee"));
|
||||
|
||||
ASSERT_OK(Delete("0000000000000bar"));
|
||||
dbfull()->TEST_FlushMemTable();
|
||||
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
|
||||
|
||||
ASSERT_OK(Put("0000000000000eee", "v5"));
|
||||
dbfull()->TEST_FlushMemTable();
|
||||
ASSERT_EQ("v5", Get("0000000000000eee"));
|
||||
}
|
||||
|
||||
static std::string Key(int i) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof(buf), "key_______%06d", i);
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
static std::string RandomString(Random* rnd, int len) {
|
||||
std::string r;
|
||||
test::RandomString(rnd, len, &r);
|
||||
return r;
|
||||
}
|
||||
|
||||
TEST(SimpleTableDBTest, CompactionTrigger) {
|
||||
Options options = CurrentOptions();
|
||||
options.write_buffer_size = 100 << 10;  // 100KB
|
||||
options.num_levels = 3;
|
||||
options.max_mem_compaction_level = 0;
|
||||
options.level0_file_num_compaction_trigger = 3;
|
||||
Reopen(&options);
|
||||
|
||||
Random rnd(301);
|
||||
|
||||
for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
|
||||
num++) {
|
||||
std::vector<std::string> values;
|
||||
// Write 120KB (12 values, each 10K)
|
||||
for (int i = 0; i < 12; i++) {
|
||||
values.push_back(RandomString(&rnd, 10000));
|
||||
ASSERT_OK(Put(Key(i), values[i]));
|
||||
}
|
||||
dbfull()->TEST_WaitForFlushMemTable();
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
|
||||
}
|
||||
|
||||
// Generate one more file in level-0, which should trigger level-0 compaction
|
||||
std::vector<std::string> values;
|
||||
for (int i = 0; i < 12; i++) {
|
||||
values.push_back(RandomString(&rnd, 10000));
|
||||
ASSERT_OK(Put(Key(i), values[i]));
|
||||
}
|
||||
dbfull()->TEST_WaitForCompact();
|
||||
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(1), 1);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
416
db/skiplist.h
Normal file
@ -0,0 +1,416 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Thread safety
|
||||
// -------------
|
||||
//
|
||||
// Writes require external synchronization, most likely a mutex.
|
||||
// Reads require a guarantee that the SkipList will not be destroyed
|
||||
// while the read is in progress. Apart from that, reads progress
|
||||
// without any internal locking or synchronization.
|
||||
//
|
||||
// Invariants:
|
||||
//
|
||||
// (1) Allocated nodes are never deleted until the SkipList is
|
||||
// destroyed. This is trivially guaranteed by the code since we
|
||||
// never delete any skip list nodes.
|
||||
//
|
||||
// (2) The contents of a Node except for the next/prev pointers are
|
||||
// immutable after the Node has been linked into the SkipList.
|
||||
// Only Insert() modifies the list, and it is careful to initialize
|
||||
// a node and use release-stores to publish the nodes in one or
|
||||
// more lists.
|
||||
//
|
||||
// ... prev vs. next pointer ordering ...
|
||||
//
|
||||
|
||||
#pragma once
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include "port/port.h"
|
||||
#include "util/random.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
class SkipList {
|
||||
private:
|
||||
struct Node;
|
||||
|
||||
public:
|
||||
// Create a new SkipList object that will use "cmp" for comparing keys,
|
||||
// and will allocate memory using "*arena". Objects allocated in the arena
|
||||
// must remain allocated for the lifetime of the skiplist object.
|
||||
explicit SkipList(Comparator cmp, Arena* arena);
|
||||
|
||||
// Insert key into the list.
|
||||
// REQUIRES: nothing that compares equal to key is currently in the list.
|
||||
void Insert(const Key& key);
|
||||
|
||||
// Returns true iff an entry that compares equal to key is in the list.
|
||||
bool Contains(const Key& key) const;
|
||||
|
||||
// Iteration over the contents of a skip list
|
||||
class Iterator {
|
||||
public:
|
||||
// Initialize an iterator over the specified list.
|
||||
// The returned iterator is not valid.
|
||||
explicit Iterator(const SkipList* list);
|
||||
|
||||
// Change the underlying skiplist used for this iterator
|
||||
// This lets us reuse an iterator object without deallocating
|
||||
// an old one and then allocating a new one
|
||||
void SetList(const SkipList* list);
|
||||
|
||||
// Returns true iff the iterator is positioned at a valid node.
|
||||
bool Valid() const;
|
||||
|
||||
// Returns the key at the current position.
|
||||
// REQUIRES: Valid()
|
||||
const Key& key() const;
|
||||
|
||||
// Advances to the next position.
|
||||
// REQUIRES: Valid()
|
||||
void Next();
|
||||
|
||||
// Advances to the previous position.
|
||||
// REQUIRES: Valid()
|
||||
void Prev();
|
||||
|
||||
// Advance to the first entry with a key >= target
|
||||
void Seek(const Key& target);
|
||||
|
||||
// Position at the first entry in list.
|
||||
// Final state of iterator is Valid() iff list is not empty.
|
||||
void SeekToFirst();
|
||||
|
||||
// Position at the last entry in list.
|
||||
// Final state of iterator is Valid() iff list is not empty.
|
||||
void SeekToLast();
|
||||
|
||||
private:
|
||||
const SkipList* list_;
|
||||
Node* node_;
|
||||
// Intentionally copyable
|
||||
};
|
||||
|
||||
private:
|
||||
enum { kMaxHeight = 12 };
|
||||
|
||||
// Immutable after construction
|
||||
Comparator const compare_;
|
||||
Arena* const arena_; // Arena used for allocations of nodes
|
||||
|
||||
Node* const head_;
|
||||
|
||||
// Modified only by Insert(). Read racily by readers, but stale
|
||||
// values are ok.
|
||||
port::AtomicPointer max_height_; // Height of the entire list
|
||||
|
||||
// Used for optimizing sequential insert patterns
|
||||
Node* prev_[kMaxHeight];
|
||||
int prev_height_;
|
||||
|
||||
inline int GetMaxHeight() const {
|
||||
return static_cast<int>(
|
||||
reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
|
||||
}
|
||||
|
||||
// Read/written only by Insert().
|
||||
Random rnd_;
|
||||
|
||||
Node* NewNode(const Key& key, int height);
|
||||
int RandomHeight();
|
||||
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
|
||||
|
||||
// Return true if key is greater than the data stored in "n"
|
||||
bool KeyIsAfterNode(const Key& key, Node* n) const;
|
||||
|
||||
// Return the earliest node that comes at or after key.
|
||||
// Return nullptr if there is no such node.
|
||||
//
|
||||
// If prev is non-nullptr, fills prev[level] with pointer to previous
|
||||
// node at "level" for every level in [0..max_height_-1].
|
||||
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
|
||||
|
||||
// Return the latest node with a key < key.
|
||||
// Return head_ if there is no such node.
|
||||
Node* FindLessThan(const Key& key) const;
|
||||
|
||||
// Return the last node in the list.
|
||||
// Return head_ if list is empty.
|
||||
Node* FindLast() const;
|
||||
|
||||
// No copying allowed
|
||||
SkipList(const SkipList&);
|
||||
void operator=(const SkipList&);
|
||||
};
|
||||
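// A minimal sketch of the contract stated above: all writers serialize on an
// external mutex, while readers touch the list with no lock at all and only
// require that the SkipList (and its arena) outlive the read. The comparator
// and key type here are illustrative, not part of the skiplist itself.
struct SketchComparator {
  int operator()(uint64_t a, uint64_t b) const {
    return (a < b) ? -1 : (a > b) ? +1 : 0;
  }
};

// Called from however many writer threads exist; "mu" is the external mutex.
inline void SynchronizedInsert(SkipList<uint64_t, SketchComparator>* list,
                               port::Mutex* mu, uint64_t key) {
  mu->Lock();
  list->Insert(key);  // REQUIRES: key not already present
  mu->Unlock();
}

// Called from reader threads with no locking; sees every key published
// before the iterator was created, and possibly newer ones.
inline uint64_t CountAtLeast(const SkipList<uint64_t, SketchComparator>& list,
                             uint64_t lower_bound) {
  uint64_t n = 0;
  SkipList<uint64_t, SketchComparator>::Iterator iter(&list);
  for (iter.Seek(lower_bound); iter.Valid(); iter.Next()) {
    n++;
  }
  return n;
}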
|
||||
// Implementation details follow
|
||||
template<typename Key, class Comparator>
|
||||
struct SkipList<Key,Comparator>::Node {
|
||||
explicit Node(const Key& k) : key(k) { }
|
||||
|
||||
Key const key;
|
||||
|
||||
// Accessors/mutators for links. Wrapped in methods so we can
|
||||
// add the appropriate barriers as necessary.
|
||||
Node* Next(int n) {
|
||||
assert(n >= 0);
|
||||
// Use an 'acquire load' so that we observe a fully initialized
|
||||
// version of the returned Node.
|
||||
return reinterpret_cast<Node*>(next_[n].Acquire_Load());
|
||||
}
|
||||
void SetNext(int n, Node* x) {
|
||||
assert(n >= 0);
|
||||
// Use a 'release store' so that anybody who reads through this
|
||||
// pointer observes a fully initialized version of the inserted node.
|
||||
next_[n].Release_Store(x);
|
||||
}
|
||||
|
||||
// No-barrier variants that can be safely used in a few locations.
|
||||
Node* NoBarrier_Next(int n) {
|
||||
assert(n >= 0);
|
||||
return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
|
||||
}
|
||||
void NoBarrier_SetNext(int n, Node* x) {
|
||||
assert(n >= 0);
|
||||
next_[n].NoBarrier_Store(x);
|
||||
}
|
||||
|
||||
private:
|
||||
// Array of length equal to the node height. next_[0] is lowest level link.
|
||||
port::AtomicPointer next_[1];
|
||||
};
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
typename SkipList<Key,Comparator>::Node*
|
||||
SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
|
||||
char* mem = arena_->AllocateAligned(
|
||||
sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
|
||||
return new (mem) Node(key);
|
||||
}
|
||||
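// NewNode() above relies on next_[1] being the trailing member of Node: one
// aligned arena allocation holds the struct plus (height - 1) extra link
// slots, so next_[0..height-1] are all usable. The same trick with a plain
// struct and malloc, purely for illustration:
struct VarHeightNode {
  int key;
  VarHeightNode* next[1];  // really 'height' slots; extras follow the struct
};

inline VarHeightNode* NewVarHeightNode(int key, int height) {
  assert(height >= 1);
  size_t bytes = sizeof(VarHeightNode) + sizeof(VarHeightNode*) * (height - 1);
  VarHeightNode* n = static_cast<VarHeightNode*>(malloc(bytes));
  n->key = key;
  for (int i = 0; i < height; i++) {
    n->next[i] = nullptr;
  }
  return n;
}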
|
||||
template<typename Key, class Comparator>
|
||||
inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
|
||||
SetList(list);
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
inline void SkipList<Key,Comparator>::Iterator::SetList(const SkipList* list) {
|
||||
list_ = list;
|
||||
node_ = nullptr;
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
|
||||
return node_ != nullptr;
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
|
||||
assert(Valid());
|
||||
return node_->key;
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
inline void SkipList<Key,Comparator>::Iterator::Next() {
|
||||
assert(Valid());
|
||||
node_ = node_->Next(0);
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
inline void SkipList<Key,Comparator>::Iterator::Prev() {
|
||||
// Instead of using explicit "prev" links, we just search for the
|
||||
// last node that falls before key.
|
||||
assert(Valid());
|
||||
node_ = list_->FindLessThan(node_->key);
|
||||
if (node_ == list_->head_) {
|
||||
node_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
|
||||
node_ = list_->FindGreaterOrEqual(target, nullptr);
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
|
||||
node_ = list_->head_->Next(0);
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
|
||||
node_ = list_->FindLast();
|
||||
if (node_ == list_->head_) {
|
||||
node_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
int SkipList<Key,Comparator>::RandomHeight() {
|
||||
// Increase height with probability 1 in kBranching
|
||||
static const unsigned int kBranching = 4;
|
||||
int height = 1;
|
||||
while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
|
||||
height++;
|
||||
}
|
||||
assert(height > 0);
|
||||
assert(height <= kMaxHeight);
|
||||
return height;
|
||||
}
|
||||
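// With kBranching = 4, RandomHeight() returns h with probability
// (3/4) * (1/4)^(h-1), with all remaining mass landing on kMaxHeight, so
// level i holds roughly N / 4^i nodes. A direct computation of that
// distribution (illustrative only; the skiplist does not use it):
inline double HeightProbability(int h, int max_height = 12 /* kMaxHeight */) {
  const double p = 1.0 / 4.0;  // 1 / kBranching
  double reach = 1.0;          // P(height >= h)
  for (int i = 1; i < h; i++) {
    reach *= p;
  }
  return (h == max_height) ? reach : reach * (1.0 - p);
}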
|
||||
template<typename Key, class Comparator>
|
||||
bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
|
||||
// nullptr n is considered infinite
|
||||
return (n != nullptr) && (compare_(n->key, key) < 0);
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
|
||||
const {
|
||||
// Use prev as an optimization hint and fallback to slow path
|
||||
if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
|
||||
Node* x = prev[0];
|
||||
Node* next = x->Next(0);
|
||||
if ((x == head_) || KeyIsAfterNode(key, x)) {
|
||||
// Adjust all relevant insertion points to the previous entry
|
||||
for (int i = 1; i < prev_height_; i++) {
|
||||
prev[i] = x;
|
||||
}
|
||||
return next;
|
||||
}
|
||||
}
|
||||
// Normal lookup
|
||||
Node* x = head_;
|
||||
int level = GetMaxHeight() - 1;
|
||||
while (true) {
|
||||
Node* next = x->Next(level);
|
||||
// Make sure the lists are sorted.
|
||||
// If x is head_ or next is nullptr, it is trivially satisfied.
|
||||
assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
|
||||
if (KeyIsAfterNode(key, next)) {
|
||||
// Keep searching in this list
|
||||
x = next;
|
||||
} else {
|
||||
if (prev != nullptr) prev[level] = x;
|
||||
if (level == 0) {
|
||||
return next;
|
||||
} else {
|
||||
// Switch to next list
|
||||
level--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
typename SkipList<Key,Comparator>::Node*
|
||||
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
|
||||
Node* x = head_;
|
||||
int level = GetMaxHeight() - 1;
|
||||
while (true) {
|
||||
assert(x == head_ || compare_(x->key, key) < 0);
|
||||
Node* next = x->Next(level);
|
||||
if (next == nullptr || compare_(next->key, key) >= 0) {
|
||||
if (level == 0) {
|
||||
return x;
|
||||
} else {
|
||||
// Switch to next list
|
||||
level--;
|
||||
}
|
||||
} else {
|
||||
x = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
|
||||
const {
|
||||
Node* x = head_;
|
||||
int level = GetMaxHeight() - 1;
|
||||
while (true) {
|
||||
Node* next = x->Next(level);
|
||||
if (next == nullptr) {
|
||||
if (level == 0) {
|
||||
return x;
|
||||
} else {
|
||||
// Switch to next list
|
||||
level--;
|
||||
}
|
||||
} else {
|
||||
x = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
|
||||
: compare_(cmp),
|
||||
arena_(arena),
|
||||
head_(NewNode(0 /* any key will do */, kMaxHeight)),
|
||||
max_height_(reinterpret_cast<void*>(1)),
|
||||
prev_height_(1),
|
||||
rnd_(0xdeadbeef) {
|
||||
for (int i = 0; i < kMaxHeight; i++) {
|
||||
head_->SetNext(i, nullptr);
|
||||
prev_[i] = head_;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
void SkipList<Key,Comparator>::Insert(const Key& key) {
|
||||
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
|
||||
// here since Insert() is externally synchronized.
|
||||
Node* x = FindGreaterOrEqual(key, prev_);
|
||||
|
||||
// Our data structure does not allow duplicate insertion
|
||||
assert(x == nullptr || !Equal(key, x->key));
|
||||
|
||||
int height = RandomHeight();
|
||||
if (height > GetMaxHeight()) {
|
||||
for (int i = GetMaxHeight(); i < height; i++) {
|
||||
prev_[i] = head_;
|
||||
}
|
||||
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
|
||||
|
||||
// It is ok to mutate max_height_ without any synchronization
|
||||
// with concurrent readers. A concurrent reader that observes
|
||||
// the new value of max_height_ will see either the old value of
|
||||
// new level pointers from head_ (nullptr), or a new value set in
|
||||
// the loop below. In the former case the reader will
|
||||
// immediately drop to the next level since nullptr sorts after all
|
||||
// keys. In the latter case the reader will use the new node.
|
||||
max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
|
||||
}
|
||||
|
||||
x = NewNode(key, height);
|
||||
for (int i = 0; i < height; i++) {
|
||||
// NoBarrier_SetNext() suffices since we will add a barrier when
|
||||
// we publish a pointer to "x" in prev[i].
|
||||
x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
|
||||
prev_[i]->SetNext(i, x);
|
||||
}
|
||||
prev_[0] = x;
|
||||
prev_height_ = height;
|
||||
}
|
||||
|
||||
template<typename Key, class Comparator>
|
||||
bool SkipList<Key,Comparator>::Contains(const Key& key) const {
|
||||
Node* x = FindGreaterOrEqual(key, nullptr);
|
||||
if (x != nullptr && Equal(key, x->key)) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
383
db/skiplist_test.cc
Normal file
@ -0,0 +1,383 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/skiplist.h"
|
||||
#include <set>
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/arena_impl.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/random.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
typedef uint64_t Key;
|
||||
|
||||
struct TestComparator {
|
||||
int operator()(const Key& a, const Key& b) const {
|
||||
if (a < b) {
|
||||
return -1;
|
||||
} else if (a > b) {
|
||||
return +1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class SkipTest { };
|
||||
|
||||
TEST(SkipTest, Empty) {
|
||||
ArenaImpl arena_impl;
|
||||
TestComparator cmp;
|
||||
SkipList<Key, TestComparator> list(cmp, &arena_impl);
|
||||
ASSERT_TRUE(!list.Contains(10));
|
||||
|
||||
SkipList<Key, TestComparator>::Iterator iter(&list);
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
iter.SeekToFirst();
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
iter.Seek(100);
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
iter.SeekToLast();
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
}
|
||||
|
||||
TEST(SkipTest, InsertAndLookup) {
|
||||
const int N = 2000;
|
||||
const int R = 5000;
|
||||
Random rnd(1000);
|
||||
std::set<Key> keys;
|
||||
ArenaImpl arena_impl;
|
||||
TestComparator cmp;
|
||||
SkipList<Key, TestComparator> list(cmp, &arena_impl);
|
||||
for (int i = 0; i < N; i++) {
|
||||
Key key = rnd.Next() % R;
|
||||
if (keys.insert(key).second) {
|
||||
list.Insert(key);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < R; i++) {
|
||||
if (list.Contains(i)) {
|
||||
ASSERT_EQ(keys.count(i), 1U);
|
||||
} else {
|
||||
ASSERT_EQ(keys.count(i), 0U);
|
||||
}
|
||||
}
|
||||
|
||||
// Simple iterator tests
|
||||
{
|
||||
SkipList<Key, TestComparator>::Iterator iter(&list);
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
|
||||
iter.Seek(0);
|
||||
ASSERT_TRUE(iter.Valid());
|
||||
ASSERT_EQ(*(keys.begin()), iter.key());
|
||||
|
||||
iter.SeekToFirst();
|
||||
ASSERT_TRUE(iter.Valid());
|
||||
ASSERT_EQ(*(keys.begin()), iter.key());
|
||||
|
||||
iter.SeekToLast();
|
||||
ASSERT_TRUE(iter.Valid());
|
||||
ASSERT_EQ(*(keys.rbegin()), iter.key());
|
||||
}
|
||||
|
||||
// Forward iteration test
|
||||
for (int i = 0; i < R; i++) {
|
||||
SkipList<Key, TestComparator>::Iterator iter(&list);
|
||||
iter.Seek(i);
|
||||
|
||||
// Compare against model iterator
|
||||
std::set<Key>::iterator model_iter = keys.lower_bound(i);
|
||||
for (int j = 0; j < 3; j++) {
|
||||
if (model_iter == keys.end()) {
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
break;
|
||||
} else {
|
||||
ASSERT_TRUE(iter.Valid());
|
||||
ASSERT_EQ(*model_iter, iter.key());
|
||||
++model_iter;
|
||||
iter.Next();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Backward iteration test
|
||||
{
|
||||
SkipList<Key, TestComparator>::Iterator iter(&list);
|
||||
iter.SeekToLast();
|
||||
|
||||
// Compare against model iterator
|
||||
for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
|
||||
model_iter != keys.rend();
|
||||
++model_iter) {
|
||||
ASSERT_TRUE(iter.Valid());
|
||||
ASSERT_EQ(*model_iter, iter.key());
|
||||
iter.Prev();
|
||||
}
|
||||
ASSERT_TRUE(!iter.Valid());
|
||||
}
|
||||
}
|
||||
|
||||
// We want to make sure that with a single writer and multiple
|
||||
// concurrent readers (with no synchronization other than when a
|
||||
// reader's iterator is created), the reader always observes all the
|
||||
// data that was present in the skip list when the iterator was
|
||||
// constructed. Because insertions are happening concurrently, we may
|
||||
// also observe new values that were inserted since the iterator was
|
||||
// constructed, but we should never miss any values that were present
|
||||
// at iterator construction time.
|
||||
//
|
||||
// We generate multi-part keys:
|
||||
// <key,gen,hash>
|
||||
// where:
|
||||
// key is in range [0..K-1]
|
||||
// gen is a generation number for key
|
||||
// hash is hash(key,gen)
|
||||
//
|
||||
// The insertion code picks a random key, sets gen to be 1 + the last
|
||||
// generation number inserted for that key, and sets hash to Hash(key,gen).
|
||||
//
|
||||
// At the beginning of a read, we snapshot the last inserted
|
||||
// generation number for each key. We then iterate, including random
|
||||
// calls to Next() and Seek(). For every key we encounter, we
|
||||
// check that it is either expected given the initial snapshot or has
|
||||
// been concurrently added since the iterator started.
|
||||
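// The packing described above, written out directly; the class below
// implements the same scheme using util/hash.h's Hash as the mixer. Bits
// 40..63 hold the key, bits 8..39 the generation, and the low byte is an
// 8-bit hash of (key, gen) used to detect torn or invented values. These
// helper names are illustrative only.
static Key PackTestKey(uint64_t k, uint64_t g, uint64_t hash8) {
  assert(g <= 0xffffffffu);
  return (k << 40) | (g << 8) | (hash8 & 0xff);
}

static void UnpackTestKey(Key packed, uint64_t* k, uint64_t* g, uint64_t* h) {
  *k = packed >> 40;
  *g = (packed >> 8) & 0xffffffffu;
  *h = packed & 0xff;
}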
class ConcurrentTest {
|
||||
private:
|
||||
static const uint32_t K = 4;
|
||||
|
||||
static uint64_t key(Key key) { return (key >> 40); }
|
||||
static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
|
||||
static uint64_t hash(Key key) { return key & 0xff; }
|
||||
|
||||
static uint64_t HashNumbers(uint64_t k, uint64_t g) {
|
||||
uint64_t data[2] = { k, g };
|
||||
return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
|
||||
}
|
||||
|
||||
static Key MakeKey(uint64_t k, uint64_t g) {
|
||||
assert(sizeof(Key) == sizeof(uint64_t));
|
||||
assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
|
||||
assert(g <= 0xffffffffu);
|
||||
return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
|
||||
}
|
||||
|
||||
static bool IsValidKey(Key k) {
|
||||
return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
|
||||
}
|
||||
|
||||
static Key RandomTarget(Random* rnd) {
|
||||
switch (rnd->Next() % 10) {
|
||||
case 0:
|
||||
// Seek to beginning
|
||||
return MakeKey(0, 0);
|
||||
case 1:
|
||||
// Seek to end
|
||||
return MakeKey(K, 0);
|
||||
default:
|
||||
// Seek to middle
|
||||
return MakeKey(rnd->Next() % K, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Per-key generation
|
||||
struct State {
|
||||
port::AtomicPointer generation[K];
|
||||
void Set(int k, intptr_t v) {
|
||||
generation[k].Release_Store(reinterpret_cast<void*>(v));
|
||||
}
|
||||
intptr_t Get(int k) {
|
||||
return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
|
||||
}
|
||||
|
||||
State() {
|
||||
for (unsigned int k = 0; k < K; k++) {
|
||||
Set(k, 0);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Current state of the test
|
||||
State current_;
|
||||
|
||||
ArenaImpl arena_impl_;
|
||||
|
||||
// SkipList is not protected by mu_. We just use a single writer
|
||||
// thread to modify it.
|
||||
SkipList<Key, TestComparator> list_;
|
||||
|
||||
public:
|
||||
ConcurrentTest() : list_(TestComparator(), &arena_impl_) { }
|
||||
|
||||
// REQUIRES: External synchronization
|
||||
void WriteStep(Random* rnd) {
|
||||
const uint32_t k = rnd->Next() % K;
|
||||
const intptr_t g = current_.Get(k) + 1;
|
||||
const Key key = MakeKey(k, g);
|
||||
list_.Insert(key);
|
||||
current_.Set(k, g);
|
||||
}
|
||||
|
||||
void ReadStep(Random* rnd) {
|
||||
// Remember the initial committed state of the skiplist.
|
||||
State initial_state;
|
||||
for (unsigned int k = 0; k < K; k++) {
|
||||
initial_state.Set(k, current_.Get(k));
|
||||
}
|
||||
|
||||
Key pos = RandomTarget(rnd);
|
||||
SkipList<Key, TestComparator>::Iterator iter(&list_);
|
||||
iter.Seek(pos);
|
||||
while (true) {
|
||||
Key current;
|
||||
if (!iter.Valid()) {
|
||||
current = MakeKey(K, 0);
|
||||
} else {
|
||||
current = iter.key();
|
||||
ASSERT_TRUE(IsValidKey(current)) << current;
|
||||
}
|
||||
ASSERT_LE(pos, current) << "should not go backwards";
|
||||
|
||||
// Verify that everything in [pos,current) was not present in
|
||||
// initial_state.
|
||||
while (pos < current) {
|
||||
ASSERT_LT(key(pos), K) << pos;
|
||||
|
||||
// Note that generation 0 is never inserted, so it is ok if
|
||||
// <*,0,*> is missing.
|
||||
ASSERT_TRUE((gen(pos) == 0U) ||
|
||||
(gen(pos) > (uint64_t)initial_state.Get(key(pos)))
|
||||
) << "key: " << key(pos)
|
||||
<< "; gen: " << gen(pos)
|
||||
<< "; initgen: "
|
||||
<< initial_state.Get(key(pos));
|
||||
|
||||
// Advance to next key in the valid key space
|
||||
if (key(pos) < key(current)) {
|
||||
pos = MakeKey(key(pos) + 1, 0);
|
||||
} else {
|
||||
pos = MakeKey(key(pos), gen(pos) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!iter.Valid()) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (rnd->Next() % 2) {
|
||||
iter.Next();
|
||||
pos = MakeKey(key(pos), gen(pos) + 1);
|
||||
} else {
|
||||
Key new_target = RandomTarget(rnd);
|
||||
if (new_target > pos) {
|
||||
pos = new_target;
|
||||
iter.Seek(new_target);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
const uint32_t ConcurrentTest::K;
|
||||
|
||||
// Simple test that does single-threaded testing of the ConcurrentTest
|
||||
// scaffolding.
|
||||
TEST(SkipTest, ConcurrentWithoutThreads) {
|
||||
ConcurrentTest test;
|
||||
Random rnd(test::RandomSeed());
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
test.ReadStep(&rnd);
|
||||
test.WriteStep(&rnd);
|
||||
}
|
||||
}
|
||||
|
||||
class TestState {
|
||||
public:
|
||||
ConcurrentTest t_;
|
||||
int seed_;
|
||||
port::AtomicPointer quit_flag_;
|
||||
|
||||
enum ReaderState {
|
||||
STARTING,
|
||||
RUNNING,
|
||||
DONE
|
||||
};
|
||||
|
||||
explicit TestState(int s)
|
||||
: seed_(s),
|
||||
quit_flag_(nullptr),
|
||||
state_(STARTING),
|
||||
state_cv_(&mu_) {}
|
||||
|
||||
void Wait(ReaderState s) {
|
||||
mu_.Lock();
|
||||
while (state_ != s) {
|
||||
state_cv_.Wait();
|
||||
}
|
||||
mu_.Unlock();
|
||||
}
|
||||
|
||||
void Change(ReaderState s) {
|
||||
mu_.Lock();
|
||||
state_ = s;
|
||||
state_cv_.Signal();
|
||||
mu_.Unlock();
|
||||
}
|
||||
|
||||
private:
|
||||
port::Mutex mu_;
|
||||
ReaderState state_;
|
||||
port::CondVar state_cv_;
|
||||
};
|
||||
|
||||
static void ConcurrentReader(void* arg) {
|
||||
TestState* state = reinterpret_cast<TestState*>(arg);
|
||||
Random rnd(state->seed_);
|
||||
int64_t reads = 0;
|
||||
state->Change(TestState::RUNNING);
|
||||
while (!state->quit_flag_.Acquire_Load()) {
|
||||
state->t_.ReadStep(&rnd);
|
||||
++reads;
|
||||
}
|
||||
state->Change(TestState::DONE);
|
||||
}
|
||||
|
||||
static void RunConcurrent(int run) {
|
||||
const int seed = test::RandomSeed() + (run * 100);
|
||||
Random rnd(seed);
|
||||
const int N = 1000;
|
||||
const int kSize = 1000;
|
||||
for (int i = 0; i < N; i++) {
|
||||
if ((i % 100) == 0) {
|
||||
fprintf(stderr, "Run %d of %d\n", i, N);
|
||||
}
|
||||
TestState state(seed + 1);
|
||||
Env::Default()->Schedule(ConcurrentReader, &state);
|
||||
state.Wait(TestState::RUNNING);
|
||||
for (int i = 0; i < kSize; i++) {
|
||||
state.t_.WriteStep(&rnd);
|
||||
}
|
||||
state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do
|
||||
state.Wait(TestState::DONE);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
|
||||
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
|
||||
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
|
||||
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
|
||||
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
86
db/snapshot.h
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class SnapshotList;
|
||||
|
||||
// Snapshots are kept in a doubly-linked list in the DB.
|
||||
// Each SnapshotImpl corresponds to a particular sequence number.
|
||||
class SnapshotImpl : public Snapshot {
|
||||
public:
|
||||
SequenceNumber number_; // const after creation
|
||||
|
||||
private:
|
||||
friend class SnapshotList;
|
||||
|
||||
// SnapshotImpl is kept in a doubly-linked circular list
|
||||
SnapshotImpl* prev_;
|
||||
SnapshotImpl* next_;
|
||||
|
||||
SnapshotList* list_; // just for sanity checks
|
||||
};
|
||||
|
||||
class SnapshotList {
|
||||
public:
|
||||
SnapshotList() {
|
||||
list_.prev_ = &list_;
|
||||
list_.next_ = &list_;
|
||||
list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
|
||||
}
|
||||
|
||||
bool empty() const { return list_.next_ == &list_; }
|
||||
SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
|
||||
SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
|
||||
|
||||
const SnapshotImpl* New(SequenceNumber seq) {
|
||||
SnapshotImpl* s = new SnapshotImpl;
|
||||
s->number_ = seq;
|
||||
s->list_ = this;
|
||||
s->next_ = &list_;
|
||||
s->prev_ = list_.prev_;
|
||||
s->prev_->next_ = s;
|
||||
s->next_->prev_ = s;
|
||||
return s;
|
||||
}
|
||||
|
||||
void Delete(const SnapshotImpl* s) {
|
||||
assert(s->list_ == this);
|
||||
s->prev_->next_ = s->next_;
|
||||
s->next_->prev_ = s->prev_;
|
||||
delete s;
|
||||
}
|
||||
|
||||
// retrieve all snapshot numbers. They are sorted in ascending order.
|
||||
void getAll(std::vector<SequenceNumber>& ret) {
|
||||
if (empty()) return;
|
||||
SnapshotImpl* s = &list_;
|
||||
while (s->next_ != &list_) {
|
||||
ret.push_back(s->next_->number_);
|
||||
s = s->next_;
|
||||
}
|
||||
}
|
||||
|
||||
// get the sequence number of the most recent snapshot
|
||||
const SequenceNumber GetNewest() {
|
||||
if (empty()) {
|
||||
return 0;
|
||||
}
|
||||
return newest()->number_;
|
||||
}
|
||||
|
||||
private:
|
||||
// Dummy head of doubly-linked list of snapshots
|
||||
SnapshotImpl list_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
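SnapshotList keeps live snapshots in a circular doubly-linked list with a dummy head, so New() and Delete() are O(1) and getAll() walks the list oldest-to-newest (ascending sequence numbers). Below is a minimal standalone sketch of the same dummy-head pattern; the names (Node, RingList) are invented for illustration and nothing here is part of the header above.
#include <cstdint>
#include <vector>

struct Node {
  uint64_t number;
  Node* prev;
  Node* next;
};

class RingList {
 public:
  RingList() { head_.prev = &head_; head_.next = &head_; head_.number = ~0ull; }
  bool empty() const { return head_.next == &head_; }
  Node* New(uint64_t seq) {                 // append just before the dummy head -> newest
    Node* n = new Node{seq, head_.prev, &head_};
    n->prev->next = n;
    n->next->prev = n;
    return n;
  }
  void Delete(Node* n) {                    // unlink in O(1), no search needed
    n->prev->next = n->next;
    n->next->prev = n->prev;
    delete n;
  }
  std::vector<uint64_t> GetAll() const {    // oldest first, i.e. ascending sequence numbers
    std::vector<uint64_t> out;
    for (const Node* n = head_.next; n != &head_; n = n->next) {
      out.push_back(n->number);
    }
    return out;
  }

 private:
  Node head_;  // dummy head, never holds a real snapshot
};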
|
173
db/table_cache.cc
Normal file
173
db/table_cache.cc
Normal file
@ -0,0 +1,173 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/table_cache.h"
|
||||
|
||||
#include "db/filename.h"
|
||||
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/stop_watch.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static void DeleteEntry(const Slice& key, void* value) {
|
||||
TableReader* table_reader = reinterpret_cast<TableReader*>(value);
|
||||
delete table_reader;
|
||||
}
|
||||
|
||||
static void UnrefEntry(void* arg1, void* arg2) {
|
||||
Cache* cache = reinterpret_cast<Cache*>(arg1);
|
||||
Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
|
||||
cache->Release(h);
|
||||
}
|
||||
|
||||
TableCache::TableCache(const std::string& dbname,
|
||||
const Options* options,
|
||||
const EnvOptions& storage_options,
|
||||
int entries)
|
||||
: env_(options->env),
|
||||
dbname_(dbname),
|
||||
options_(options),
|
||||
storage_options_(storage_options),
|
||||
cache_(
|
||||
NewLRUCache(entries, options->table_cache_numshardbits,
|
||||
options->table_cache_remove_scan_count_limit)) {
|
||||
}
|
||||
|
||||
TableCache::~TableCache() {
|
||||
}
|
||||
|
||||
Status TableCache::FindTable(const EnvOptions& toptions,
|
||||
uint64_t file_number, uint64_t file_size,
|
||||
Cache::Handle** handle, bool* table_io,
|
||||
const bool no_io) {
|
||||
Status s;
|
||||
char buf[sizeof(file_number)];
|
||||
EncodeFixed64(buf, file_number);
|
||||
Slice key(buf, sizeof(buf));
|
||||
*handle = cache_->Lookup(key);
|
||||
if (*handle == nullptr) {
|
||||
if (no_io) { // Don't do IO and return a not-found status
|
||||
return Status::Incomplete("Table not found in table_cache, no_io is set");
|
||||
}
|
||||
if (table_io != nullptr) {
|
||||
*table_io = true; // we had to do IO from storage
|
||||
}
|
||||
std::string fname = TableFileName(dbname_, file_number);
|
||||
unique_ptr<RandomAccessFile> file;
|
||||
unique_ptr<TableReader> table_reader;
|
||||
s = env_->NewRandomAccessFile(fname, &file, toptions);
|
||||
RecordTick(options_->statistics, NO_FILE_OPENS);
|
||||
if (s.ok()) {
|
||||
if (options_->advise_random_on_open) {
|
||||
file->Hint(RandomAccessFile::RANDOM);
|
||||
}
|
||||
StopWatch sw(env_, options_->statistics, TABLE_OPEN_IO_MICROS);
|
||||
s = options_->table_factory->GetTableReader(*options_, toptions,
|
||||
std::move(file), file_size,
|
||||
&table_reader);
|
||||
}
|
||||
|
||||
if (!s.ok()) {
|
||||
assert(table_reader == nullptr);
|
||||
RecordTick(options_->statistics, NO_FILE_ERRORS);
|
||||
// We do not cache error results so that if the error is transient,
|
||||
// or somebody repairs the file, we recover automatically.
|
||||
} else {
|
||||
assert(file.get() == nullptr);
|
||||
*handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Iterator* TableCache::NewIterator(const ReadOptions& options,
|
||||
const EnvOptions& toptions,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
TableReader** table_reader_ptr,
|
||||
bool for_compaction) {
|
||||
if (table_reader_ptr != nullptr) {
|
||||
*table_reader_ptr = nullptr;
|
||||
}
|
||||
|
||||
Cache::Handle* handle = nullptr;
|
||||
Status s = FindTable(toptions, file_number, file_size, &handle,
|
||||
nullptr, options.read_tier == kBlockCacheTier);
|
||||
if (!s.ok()) {
|
||||
return NewErrorIterator(s);
|
||||
}
|
||||
|
||||
TableReader* table_reader =
|
||||
reinterpret_cast<TableReader*>(cache_->Value(handle));
|
||||
Iterator* result = table_reader->NewIterator(options);
|
||||
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
|
||||
if (table_reader_ptr != nullptr) {
|
||||
*table_reader_ptr = table_reader;
|
||||
}
|
||||
|
||||
if (for_compaction) {
|
||||
table_reader->SetupForCompaction();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
Status TableCache::Get(const ReadOptions& options,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
const Slice& k,
|
||||
void* arg,
|
||||
bool (*saver)(void*, const Slice&, const Slice&, bool),
|
||||
bool* table_io,
|
||||
void (*mark_key_may_exist)(void*)) {
|
||||
Cache::Handle* handle = nullptr;
|
||||
Status s = FindTable(storage_options_, file_number, file_size,
|
||||
&handle, table_io,
|
||||
options.read_tier == kBlockCacheTier);
|
||||
if (s.ok()) {
|
||||
TableReader* t =
|
||||
reinterpret_cast<TableReader*>(cache_->Value(handle));
|
||||
s = t->Get(options, k, arg, saver, mark_key_may_exist);
|
||||
cache_->Release(handle);
|
||||
} else if (options.read_tier && s.IsIncomplete()) {
|
||||
// Couldn't find the table in cache, but treat as kFound if no_io is set
|
||||
(*mark_key_may_exist)(arg);
|
||||
return Status::OK();
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
bool TableCache::PrefixMayMatch(const ReadOptions& options,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
const Slice& internal_prefix,
|
||||
bool* table_io) {
|
||||
Cache::Handle* handle = nullptr;
|
||||
Status s = FindTable(storage_options_, file_number,
|
||||
file_size, &handle, table_io);
|
||||
bool may_match = true;
|
||||
if (s.ok()) {
|
||||
TableReader* t =
|
||||
reinterpret_cast<TableReader*>(cache_->Value(handle));
|
||||
may_match = t->PrefixMayMatch(internal_prefix);
|
||||
cache_->Release(handle);
|
||||
}
|
||||
return may_match;
|
||||
}
|
||||
|
||||
void TableCache::Evict(uint64_t file_number) {
|
||||
char buf[sizeof(file_number)];
|
||||
EncodeFixed64(buf, file_number);
|
||||
cache_->Erase(Slice(buf, sizeof(buf)));
|
||||
}
|
||||
|
||||
} // namespace rocksdb
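FindTable() keys the shared LRU cache with the fixed 8-byte encoding of the file number, which is why Evict() can rebuild exactly the same key and erase the entry without holding a handle. A minimal sketch of that key construction follows; it assumes only the EncodeFixed64 and Slice helpers already used in this file, and the function name is invented for illustration.
#include <cstdint>
#include "rocksdb/slice.h"
#include "util/coding.h"

// Builds the byte-identical key used by Lookup(), Insert() and Erase() above.
// 'buf' must outlive the returned Slice, which points into it.
rocksdb::Slice MakeTableCacheKey(uint64_t file_number,
                                 char (&buf)[sizeof(uint64_t)]) {
  rocksdb::EncodeFixed64(buf, file_number);  // fixed 8-byte encoding of the file number
  return rocksdb::Slice(buf, sizeof(buf));
}
Keeping the key at a fixed eight bytes avoids any allocation on the lookup path and makes eviction a straightforward re-encode of the file number.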
|
78
db/table_cache.h
Normal file
78
db/table_cache.h
Normal file
@ -0,0 +1,78 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Thread-safe (provides internal synchronization)
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/cache.h"
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/table.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Env;
|
||||
|
||||
class TableCache {
|
||||
public:
|
||||
TableCache(const std::string& dbname, const Options* options,
|
||||
const EnvOptions& storage_options, int entries);
|
||||
~TableCache();
|
||||
|
||||
// Return an iterator for the specified file number (the corresponding
|
||||
// file length must be exactly "file_size" bytes). If "tableptr" is
|
||||
// non-nullptr, also sets "*tableptr" to point to the Table object
|
||||
// underlying the returned iterator, or nullptr if no Table object underlies
|
||||
// the returned iterator. The returned "*tableptr" object is owned by
|
||||
// the cache and should not be deleted, and is valid for as long as the
|
||||
// returned iterator is live.
|
||||
Iterator* NewIterator(const ReadOptions& options,
|
||||
const EnvOptions& toptions,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
TableReader** table_reader_ptr = nullptr,
|
||||
bool for_compaction = false);
|
||||
|
||||
// If a seek to internal key "k" in specified file finds an entry,
|
||||
// call (*handle_result)(arg, found_key, found_value) repeatedly until
|
||||
// it returns false.
|
||||
Status Get(const ReadOptions& options,
|
||||
uint64_t file_number,
|
||||
uint64_t file_size,
|
||||
const Slice& k,
|
||||
void* arg,
|
||||
bool (*handle_result)(void*, const Slice&, const Slice&, bool),
|
||||
bool* table_io,
|
||||
void (*mark_key_may_exist)(void*) = nullptr);
|
||||
|
||||
// Determine whether the table may contain the specified prefix. If
|
||||
// the table index or blooms are not in memory, this may cause an I/O
|
||||
bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
|
||||
uint64_t file_size, const Slice& internal_prefix,
|
||||
bool* table_io);
|
||||
|
||||
// Evict any entry for the specified file number
|
||||
void Evict(uint64_t file_number);
|
||||
|
||||
private:
|
||||
Env* const env_;
|
||||
const std::string dbname_;
|
||||
const Options* options_;
|
||||
const EnvOptions& storage_options_;
|
||||
std::shared_ptr<Cache> cache_;
|
||||
|
||||
Status FindTable(const EnvOptions& toptions, uint64_t file_number,
|
||||
uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
|
||||
const bool no_io = false);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
55
db/table_stats_collector.cc
Normal file
55
db/table_stats_collector.cc
Normal file
@ -0,0 +1,55 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "db/table_stats_collector.h"
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "util/coding.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
Status InternalKeyStatsCollector::Add(const Slice& key, const Slice& value) {
|
||||
ParsedInternalKey ikey;
|
||||
if (!ParseInternalKey(key, &ikey)) {
|
||||
return Status::InvalidArgument("Invalid internal key");
|
||||
}
|
||||
|
||||
if (ikey.type == ValueType::kTypeDeletion) {
|
||||
++deleted_keys_;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status InternalKeyStatsCollector::Finish(
|
||||
TableStats::UserCollectedStats* stats) {
|
||||
assert(stats);
|
||||
assert(stats->find(InternalKeyTableStatsNames::kDeletedKeys) == stats->end());
|
||||
std::string val;
|
||||
|
||||
PutVarint64(&val, deleted_keys_);
|
||||
stats->insert(std::make_pair(InternalKeyTableStatsNames::kDeletedKeys, val));
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status UserKeyTableStatsCollector::Add(const Slice& key, const Slice& value) {
|
||||
ParsedInternalKey ikey;
|
||||
if (!ParseInternalKey(key, &ikey)) {
|
||||
return Status::InvalidArgument("Invalid internal key");
|
||||
}
|
||||
|
||||
return collector_->Add(ikey.user_key, value);
|
||||
}
|
||||
|
||||
Status UserKeyTableStatsCollector::Finish(
|
||||
TableStats::UserCollectedStats* stats) {
|
||||
return collector_->Finish(stats);
|
||||
}
|
||||
|
||||
const std::string InternalKeyTableStatsNames::kDeletedKeys
|
||||
= "rocksdb.deleted.keys";
|
||||
|
||||
} // namespace rocksdb
|
58
db/table_stats_collector.h
Normal file
58
db/table_stats_collector.h
Normal file
@ -0,0 +1,58 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// This file defines a collection of statistics collectors.
|
||||
#pragma once
|
||||
|
||||
#include "rocksdb/table_stats.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct InternalKeyTableStatsNames {
|
||||
static const std::string kDeletedKeys;
|
||||
};
|
||||
|
||||
// Collecting the statistics for internal keys. Visible only to internal
|
||||
// rocksdb modules.
|
||||
class InternalKeyStatsCollector : public TableStatsCollector {
|
||||
public:
|
||||
virtual Status Add(const Slice& key, const Slice& value);
|
||||
virtual Status Finish(TableStats::UserCollectedStats* stats);
|
||||
virtual const char* Name() const { return "InternalKeyStatsCollector"; }
|
||||
|
||||
private:
|
||||
uint64_t deleted_keys_ = 0;
|
||||
};
|
||||
|
||||
// When rocksdb creates a new table, it will encode all "user keys" into
|
||||
// "internal keys", which contains meta information of a given entry.
|
||||
//
|
||||
// This class extracts user key from the encoded internal key when Add() is
|
||||
// invoked.
|
||||
class UserKeyTableStatsCollector : public TableStatsCollector {
|
||||
public:
|
||||
explicit UserKeyTableStatsCollector(TableStatsCollector* collector):
|
||||
UserKeyTableStatsCollector(
|
||||
std::shared_ptr<TableStatsCollector>(collector)
|
||||
) {
|
||||
}
|
||||
|
||||
explicit UserKeyTableStatsCollector(
|
||||
std::shared_ptr<TableStatsCollector> collector) : collector_(collector) {
|
||||
}
|
||||
virtual ~UserKeyTableStatsCollector() { }
|
||||
virtual Status Add(const Slice& key, const Slice& value);
|
||||
virtual Status Finish(TableStats::UserCollectedStats* stats);
|
||||
virtual const char* Name() const { return collector_->Name(); }
|
||||
|
||||
protected:
|
||||
std::shared_ptr<TableStatsCollector> collector_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
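The interface above keeps collectors small: Add() sees every key/value pair while a table is built, and Finish() publishes name/value pairs into TableStats::UserCollectedStats. Below is a hedged sketch of a user-defined collector that counts all entries; the class name and the "rocksdb.example.entry.count" stat name are invented for illustration, and the varint encoding mirrors what table_stats_collector.cc does for the deleted-key stat.
#include <cstdint>
#include <string>
#include <utility>
#include "rocksdb/table_stats.h"
#include "util/coding.h"

namespace rocksdb {

// Hypothetical collector, for illustration only.
class EntryCountCollector : public TableStatsCollector {
 public:
  virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) {
    ++count_;  // called once per entry added to the table
    return Status::OK();
  }
  virtual Status Finish(TableStats::UserCollectedStats* stats) {
    std::string encoded;
    PutVarint64(&encoded, count_);  // same encoding the deleted-key stat uses
    stats->insert(std::make_pair("rocksdb.example.entry.count", encoded));
    return Status::OK();
  }
  virtual const char* Name() const { return "EntryCountCollector"; }

 private:
  uint64_t count_ = 0;
};

}  // namespace rocksdb

// Wiring it up would follow the pattern used in the test further below:
//   options.table_stats_collectors = { std::make_shared<rocksdb::EntryCountCollector>() };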
|
261
db/table_stats_collector_test.cc
Normal file
261
db/table_stats_collector_test.cc
Normal file
@ -0,0 +1,261 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/table_stats_collector.h"
|
||||
#include "rocksdb/table_stats.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class TableStatsTest {
|
||||
private:
|
||||
unique_ptr<TableReader> table_reader_;
|
||||
};
|
||||
|
||||
// TODO(kailiu) the following classes should be moved to some more general
|
||||
// places, so that other tests can also make use of them.
|
||||
// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
|
||||
// and therefore enable us to quickly set up the tests.
|
||||
class FakeWritableFile : public WritableFile {
|
||||
public:
|
||||
~FakeWritableFile() { }
|
||||
|
||||
const std::string& contents() const { return contents_; }
|
||||
|
||||
virtual Status Close() { return Status::OK(); }
|
||||
virtual Status Flush() { return Status::OK(); }
|
||||
virtual Status Sync() { return Status::OK(); }
|
||||
|
||||
virtual Status Append(const Slice& data) {
|
||||
contents_.append(data.data(), data.size());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string contents_;
|
||||
};
|
||||
|
||||
|
||||
class FakeRandomeAccessFile : public RandomAccessFile {
|
||||
public:
|
||||
explicit FakeRandomeAccessFile(const Slice& contents)
|
||||
: contents_(contents.data(), contents.size()) {
|
||||
}
|
||||
|
||||
virtual ~FakeRandomeAccessFile() { }
|
||||
|
||||
uint64_t Size() const { return contents_.size(); }
|
||||
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const {
|
||||
if (offset > contents_.size()) {
|
||||
return Status::InvalidArgument("invalid Read offset");
|
||||
}
|
||||
if (offset + n > contents_.size()) {
|
||||
n = contents_.size() - offset;
|
||||
}
|
||||
memcpy(scratch, &contents_[offset], n);
|
||||
*result = Slice(scratch, n);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string contents_;
|
||||
};
|
||||
|
||||
|
||||
class DumbLogger : public Logger {
|
||||
public:
|
||||
virtual void Logv(const char* format, va_list ap) { }
|
||||
virtual size_t GetLogFileSize() const { return 0; }
|
||||
};
|
||||
|
||||
// Utility test functions
|
||||
void MakeBuilder(
|
||||
Options options,
|
||||
std::unique_ptr<FakeWritableFile>* writable,
|
||||
std::unique_ptr<TableBuilder>* builder) {
|
||||
writable->reset(new FakeWritableFile);
|
||||
if (!options.flush_block_policy_factory) {
|
||||
options.SetUpDefaultFlushBlockPolicyFactory();
|
||||
}
|
||||
builder->reset(
|
||||
options.table_factory->GetTableBuilder(options, writable->get(),
|
||||
options.compression));
|
||||
}
|
||||
|
||||
void OpenTable(
|
||||
const Options& options,
|
||||
const std::string& contents,
|
||||
std::unique_ptr<TableReader>* table_reader) {
|
||||
std::unique_ptr<RandomAccessFile> file(new FakeRandomeAccessFile(contents));
|
||||
auto s = options.table_factory->GetTableReader(
|
||||
options,
|
||||
EnvOptions(),
|
||||
std::move(file),
|
||||
contents.size(),
|
||||
table_reader
|
||||
);
|
||||
ASSERT_OK(s);
|
||||
}
|
||||
|
||||
// Collects keys that start with "A" in a table.
|
||||
class RegularKeysStartWithA: public TableStatsCollector {
|
||||
public:
|
||||
const char* Name() const { return "RegularKeysStartWithA"; }
|
||||
|
||||
Status Finish(TableStats::UserCollectedStats* stats) {
|
||||
std::string encoded;
|
||||
PutVarint32(&encoded, count_);
|
||||
*stats = TableStats::UserCollectedStats {
|
||||
{ "TableStatsTest", "Rocksdb" },
|
||||
{ "Count", encoded }
|
||||
};
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Add(const Slice& user_key, const Slice& value) {
|
||||
// simply assume all user keys are not empty.
|
||||
if (user_key.data()[0] == 'A') {
|
||||
++count_;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t count_ = 0;
|
||||
};
|
||||
|
||||
TEST(TableStatsTest, CustomizedTableStatsCollector) {
|
||||
Options options;
|
||||
|
||||
// make sure the entries will be inserted in order.
|
||||
std::map<std::string, std::string> kvs = {
|
||||
{"About", "val5"}, // starts with 'A'
|
||||
{"Abstract", "val2"}, // starts with 'A'
|
||||
{"Around", "val7"}, // starts with 'A'
|
||||
{"Beyond", "val3"},
|
||||
{"Builder", "val1"},
|
||||
{"Cancel", "val4"},
|
||||
{"Find", "val6"},
|
||||
};
|
||||
|
||||
// Test stats collectors with internal keys or regular keys
|
||||
for (bool encode_as_internal : { true, false }) {
|
||||
// -- Step 1: build table
|
||||
auto collector = new RegularKeysStartWithA();
|
||||
if (encode_as_internal) {
|
||||
options.table_stats_collectors = {
|
||||
std::make_shared<UserKeyTableStatsCollector>(collector)
|
||||
};
|
||||
} else {
|
||||
options.table_stats_collectors.resize(1);
|
||||
options.table_stats_collectors[0].reset(collector);
|
||||
}
|
||||
std::unique_ptr<TableBuilder> builder;
|
||||
std::unique_ptr<FakeWritableFile> writable;
|
||||
MakeBuilder(options, &writable, &builder);
|
||||
|
||||
for (const auto& kv : kvs) {
|
||||
if (encode_as_internal) {
|
||||
InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
|
||||
builder->Add(ikey.Encode(), kv.second);
|
||||
} else {
|
||||
builder->Add(kv.first, kv.second);
|
||||
}
|
||||
}
|
||||
ASSERT_OK(builder->Finish());
|
||||
|
||||
// -- Step 2: Open table
|
||||
std::unique_ptr<TableReader> table_reader;
|
||||
OpenTable(options, writable->contents(), &table_reader);
|
||||
const auto& stats = table_reader->GetTableStats().user_collected_stats;
|
||||
|
||||
ASSERT_EQ("Rocksdb", stats.at("TableStatsTest"));
|
||||
|
||||
uint32_t starts_with_A = 0;
|
||||
Slice key(stats.at("Count"));
|
||||
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
|
||||
ASSERT_EQ(3u, starts_with_A);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TableStatsTest, InternalKeyStatsCollector) {
|
||||
InternalKey keys[] = {
|
||||
InternalKey("A", 0, ValueType::kTypeValue),
|
||||
InternalKey("B", 0, ValueType::kTypeValue),
|
||||
InternalKey("C", 0, ValueType::kTypeValue),
|
||||
InternalKey("W", 0, ValueType::kTypeDeletion),
|
||||
InternalKey("X", 0, ValueType::kTypeDeletion),
|
||||
InternalKey("Y", 0, ValueType::kTypeDeletion),
|
||||
InternalKey("Z", 0, ValueType::kTypeDeletion),
|
||||
};
|
||||
|
||||
for (bool sanitized : { false, true }) {
|
||||
std::unique_ptr<TableBuilder> builder;
|
||||
std::unique_ptr<FakeWritableFile> writable;
|
||||
Options options;
|
||||
if (sanitized) {
|
||||
options.table_stats_collectors = {
|
||||
std::make_shared<RegularKeysStartWithA>()
|
||||
};
|
||||
// with sanitization, even a regular stats collector will be able to
|
||||
// handle internal keys.
|
||||
auto comparator = options.comparator;
|
||||
// HACK: Set options.info_log to avoid writing log in
|
||||
// SanitizeOptions().
|
||||
options.info_log = std::make_shared<DumbLogger>();
|
||||
options = SanitizeOptions(
|
||||
"db", // just a place holder
|
||||
nullptr, // with skip internal key comparator
|
||||
nullptr, // don't care filter policy
|
||||
options
|
||||
);
|
||||
options.comparator = comparator;
|
||||
} else {
|
||||
options.table_stats_collectors = {
|
||||
std::make_shared<InternalKeyStatsCollector>()
|
||||
};
|
||||
}
|
||||
|
||||
MakeBuilder(options, &writable, &builder);
|
||||
for (const auto& k : keys) {
|
||||
builder->Add(k.Encode(), "val");
|
||||
}
|
||||
|
||||
ASSERT_OK(builder->Finish());
|
||||
|
||||
std::unique_ptr<TableReader> table_reader;
|
||||
OpenTable(options, writable->contents(), &table_reader);
|
||||
const auto& stats = table_reader->GetTableStats().user_collected_stats;
|
||||
|
||||
uint64_t deleted = 0;
|
||||
Slice key(stats.at(InternalKeyTableStatsNames::kDeletedKeys));
|
||||
ASSERT_TRUE(GetVarint64(&key, &deleted));
|
||||
ASSERT_EQ(4u, deleted);
|
||||
|
||||
if (sanitized) {
|
||||
uint32_t starts_with_A = 0;
|
||||
Slice key(stats.at("Count"));
|
||||
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
|
||||
ASSERT_EQ(1u, starts_with_A);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
264
db/transaction_log_impl.cc
Normal file
264
db/transaction_log_impl.cc
Normal file
@ -0,0 +1,264 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#include "db/transaction_log_impl.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
TransactionLogIteratorImpl::TransactionLogIteratorImpl(
|
||||
const std::string& dir,
|
||||
const Options* options,
|
||||
const EnvOptions& soptions,
|
||||
const SequenceNumber seq,
|
||||
std::unique_ptr<VectorLogPtr> files,
|
||||
DBImpl const * const dbimpl) :
|
||||
dir_(dir),
|
||||
options_(options),
|
||||
soptions_(soptions),
|
||||
startingSequenceNumber_(seq),
|
||||
files_(std::move(files)),
|
||||
started_(false),
|
||||
isValid_(false),
|
||||
currentFileIndex_(0),
|
||||
currentBatchSeq_(0),
|
||||
currentLastSeq_(0),
|
||||
dbimpl_(dbimpl) {
|
||||
assert(files_ != nullptr);
|
||||
assert(dbimpl_ != nullptr);
|
||||
|
||||
reporter_.env = options_->env;
|
||||
reporter_.info_log = options_->info_log.get();
|
||||
SeekToStartSequence(); // Seek till starting sequence
|
||||
}
|
||||
|
||||
Status TransactionLogIteratorImpl::OpenLogFile(
|
||||
const LogFile* logFile,
|
||||
unique_ptr<SequentialFile>* file) {
|
||||
Env* env = options_->env;
|
||||
if (logFile->Type() == kArchivedLogFile) {
|
||||
std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
|
||||
return env->NewSequentialFile(fname, file, soptions_);
|
||||
} else {
|
||||
std::string fname = LogFileName(dir_, logFile->LogNumber());
|
||||
Status status = env->NewSequentialFile(fname, file, soptions_);
|
||||
if (!status.ok()) {
|
||||
// If we cannot open the file in the DB directory,
|
||||
// try the archive dir, as it could have moved in the meanwhile.
|
||||
fname = ArchivedLogFileName(dir_, logFile->LogNumber());
|
||||
status = env->NewSequentialFile(fname, file, soptions_);
|
||||
if (!status.ok()) {
|
||||
return Status::IOError("Requested file not present in the dir");
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
BatchResult TransactionLogIteratorImpl::GetBatch() {
|
||||
assert(isValid_); // cannot call in a non valid state.
|
||||
BatchResult result;
|
||||
result.sequence = currentBatchSeq_;
|
||||
result.writeBatchPtr = std::move(currentBatch_);
|
||||
return result;
|
||||
}
|
||||
|
||||
Status TransactionLogIteratorImpl::status() {
|
||||
return currentStatus_;
|
||||
}
|
||||
|
||||
bool TransactionLogIteratorImpl::Valid() {
|
||||
return started_ && isValid_;
|
||||
}
|
||||
|
||||
bool TransactionLogIteratorImpl::RestrictedRead(
|
||||
Slice* record,
|
||||
std::string* scratch) {
|
||||
// Don't read if no more complete entries to read from logs
|
||||
if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
|
||||
return false;
|
||||
}
|
||||
return currentLogReader_->ReadRecord(record, scratch);
|
||||
}
|
||||
|
||||
void TransactionLogIteratorImpl::SeekToStartSequence(
|
||||
uint64_t startFileIndex,
|
||||
bool strict) {
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
started_ = false;
|
||||
isValid_ = false;
|
||||
if (files_->size() <= startFileIndex) {
|
||||
return;
|
||||
}
|
||||
Status s = OpenLogReader(files_->at(startFileIndex).get());
|
||||
if (!s.ok()) {
|
||||
currentStatus_ = s;
|
||||
return;
|
||||
}
|
||||
while (RestrictedRead(&record, &scratch)) {
|
||||
if (record.size() < 12) {
|
||||
reporter_.Corruption(
|
||||
record.size(), Status::Corruption("very small log record"));
|
||||
continue;
|
||||
}
|
||||
UpdateCurrentWriteBatch(record);
|
||||
if (currentLastSeq_ >= startingSequenceNumber_) {
|
||||
if (strict && currentBatchSeq_ != startingSequenceNumber_) {
|
||||
currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
|
||||
"seek to required sequence number");
|
||||
reporter_.Info(currentStatus_.ToString().c_str());
|
||||
return;
|
||||
} else if (strict) {
|
||||
reporter_.Info("Could seek required sequence number. Iterator will "
|
||||
"continue.");
|
||||
}
|
||||
isValid_ = true;
|
||||
started_ = true; // set started_ as we could seek till starting sequence
|
||||
return;
|
||||
} else {
|
||||
isValid_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Could not find start sequence in first file. Normally this must be the
|
||||
// only file. Otherwise log the error and let the iterator return next entry
|
||||
// If strict is set, we want to seek exactly till the start sequence and it
|
||||
// should have been present in the file we scanned above
|
||||
if (strict) {
|
||||
currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
|
||||
"seek to required sequence number");
|
||||
reporter_.Info(currentStatus_.ToString().c_str());
|
||||
} else if (files_->size() != 1) {
|
||||
currentStatus_ = Status::Corruption("Start sequence was not found, "
|
||||
"skipping to the next available");
|
||||
reporter_.Info(currentStatus_.ToString().c_str());
|
||||
// Let NextImpl find the next available entry. started_ remains false
|
||||
// because we don't want to check for gaps while moving to start sequence
|
||||
NextImpl(true);
|
||||
}
|
||||
}
|
||||
|
||||
void TransactionLogIteratorImpl::Next() {
|
||||
return NextImpl(false);
|
||||
}
|
||||
|
||||
void TransactionLogIteratorImpl::NextImpl(bool internal) {
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
isValid_ = false;
|
||||
if (!internal && !started_) {
|
||||
// Runs every time until we can seek to the start sequence
|
||||
return SeekToStartSequence();
|
||||
}
|
||||
while(true) {
|
||||
assert(currentLogReader_);
|
||||
if (currentLogReader_->IsEOF()) {
|
||||
currentLogReader_->UnmarkEOF();
|
||||
}
|
||||
while (RestrictedRead(&record, &scratch)) {
|
||||
if (record.size() < 12) {
|
||||
reporter_.Corruption(
|
||||
record.size(), Status::Corruption("very small log record"));
|
||||
continue;
|
||||
} else {
|
||||
// started_ should be true if called by application
|
||||
assert(internal || started_);
|
||||
// started_ should be false if called internally
|
||||
assert(!internal || !started_);
|
||||
UpdateCurrentWriteBatch(record);
|
||||
if (internal && !started_) {
|
||||
started_ = true;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Open the next file
|
||||
if (currentFileIndex_ < files_->size() - 1) {
|
||||
++currentFileIndex_;
|
||||
Status status = OpenLogReader(files_->at(currentFileIndex_).get());
|
||||
if (!status.ok()) {
|
||||
isValid_ = false;
|
||||
currentStatus_ = status;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
isValid_ = false;
|
||||
if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
|
||||
currentStatus_ = Status::OK();
|
||||
} else {
|
||||
currentStatus_ = Status::IOError("NO MORE DATA LEFT");
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool TransactionLogIteratorImpl::IsBatchExpected(
|
||||
const WriteBatch* batch,
|
||||
const SequenceNumber expectedSeq) {
|
||||
assert(batch);
|
||||
SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
|
||||
if (batchSeq != expectedSeq) {
|
||||
char buf[200];
|
||||
snprintf(buf, sizeof(buf),
|
||||
"Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
|
||||
"Last flushed seq=%lu.Log iterator will reseek the correct "
|
||||
"batch.",
|
||||
(unsigned long)batchSeq,
|
||||
(unsigned long)expectedSeq,
|
||||
(unsigned long)dbimpl_->GetLatestSequenceNumber());
|
||||
reporter_.Info(buf);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
|
||||
std::unique_ptr<WriteBatch> batch(new WriteBatch());
|
||||
WriteBatchInternal::SetContents(batch.get(), record);
|
||||
|
||||
SequenceNumber expectedSeq = currentLastSeq_ + 1;
|
||||
// If the iterator has started, then confirm that we get continuous batches
|
||||
if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) {
|
||||
// Seek to the batch having expected sequence number
|
||||
if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) {
|
||||
// Expected batch must lie in the previous log file
|
||||
// Avoid underflow.
|
||||
if (currentFileIndex_ != 0) {
|
||||
currentFileIndex_--;
|
||||
}
|
||||
}
|
||||
startingSequenceNumber_ = expectedSeq;
|
||||
// currentStatus_ will be set to Ok if reseek succeeds
|
||||
currentStatus_ = Status::NotFound("Gap in sequence numbers");
|
||||
return SeekToStartSequence(currentFileIndex_, true);
|
||||
}
|
||||
|
||||
currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
|
||||
currentLastSeq_ = currentBatchSeq_ +
|
||||
WriteBatchInternal::Count(batch.get()) - 1;
|
||||
// currentBatchSeq_ can only change here
|
||||
assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
|
||||
|
||||
currentBatch_ = move(batch);
|
||||
isValid_ = true;
|
||||
currentStatus_ = Status::OK();
|
||||
}
|
||||
|
||||
Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
|
||||
unique_ptr<SequentialFile> file;
|
||||
Status status = OpenLogFile(logFile, &file);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
assert(file);
|
||||
currentLogReader_.reset(
|
||||
new log::Reader(std::move(file), &reporter_, true, 0)
|
||||
);
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace rocksdb
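The continuity check above rests on simple arithmetic: a batch's last sequence is its start sequence plus its record count minus one, and the next batch must start exactly one past that. A tiny worked example with made-up numbers, for illustration only:
#include <cassert>
#include <cstdint>

int main() {
  // Mirrors the UpdateCurrentWriteBatch()/IsBatchExpected() bookkeeping; numbers invented.
  uint64_t batch_seq = 10;                          // WriteBatchInternal::Sequence(batch)
  uint64_t batch_count = 3;                         // WriteBatchInternal::Count(batch)
  uint64_t last_seq = batch_seq + batch_count - 1;  // the batch covers 10, 11, 12
  uint64_t expected_next = last_seq + 1;            // a gap is reported unless the
  assert(last_seq == 12 && expected_next == 13);    // next batch starts at 13
  return 0;
}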
|
118
db/transaction_log_impl.h
Normal file
118
db/transaction_log_impl.h
Normal file
@ -0,0 +1,118 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#pragma once
|
||||
#include <vector>
|
||||
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "rocksdb/transaction_log.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/log_reader.h"
|
||||
#include "db/filename.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct LogReporter : public log::Reader::Reporter {
|
||||
Env* env;
|
||||
Logger* info_log;
|
||||
virtual void Corruption(size_t bytes, const Status& s) {
|
||||
Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
|
||||
}
|
||||
virtual void Info(const char* s) {
|
||||
Log(info_log, "%s", s);
|
||||
}
|
||||
};
|
||||
|
||||
class LogFileImpl : public LogFile {
|
||||
public:
|
||||
LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
|
||||
uint64_t sizeBytes) :
|
||||
logNumber_(logNum),
|
||||
type_(logType),
|
||||
startSequence_(startSeq),
|
||||
sizeFileBytes_(sizeBytes) {
|
||||
}
|
||||
|
||||
std::string PathName() const {
|
||||
if (type_ == kArchivedLogFile) {
|
||||
return ArchivedLogFileName("", logNumber_);
|
||||
}
|
||||
return LogFileName("", logNumber_);
|
||||
}
|
||||
|
||||
uint64_t LogNumber() const { return logNumber_; }
|
||||
|
||||
WalFileType Type() const { return type_; }
|
||||
|
||||
SequenceNumber StartSequence() const { return startSequence_; }
|
||||
|
||||
uint64_t SizeFileBytes() const { return sizeFileBytes_; }
|
||||
|
||||
bool operator < (const LogFile& that) const {
|
||||
return LogNumber() < that.LogNumber();
|
||||
}
|
||||
|
||||
private:
|
||||
uint64_t logNumber_;
|
||||
WalFileType type_;
|
||||
SequenceNumber startSequence_;
|
||||
uint64_t sizeFileBytes_;
|
||||
|
||||
};
|
||||
|
||||
class TransactionLogIteratorImpl : public TransactionLogIterator {
|
||||
public:
|
||||
TransactionLogIteratorImpl(const std::string& dir,
|
||||
const Options* options,
|
||||
const EnvOptions& soptions,
|
||||
const SequenceNumber seqNum,
|
||||
std::unique_ptr<VectorLogPtr> files,
|
||||
DBImpl const * const dbimpl);
|
||||
|
||||
virtual bool Valid();
|
||||
|
||||
virtual void Next();
|
||||
|
||||
virtual Status status();
|
||||
|
||||
virtual BatchResult GetBatch();
|
||||
|
||||
private:
|
||||
const std::string& dir_;
|
||||
const Options* options_;
|
||||
const EnvOptions& soptions_;
|
||||
SequenceNumber startingSequenceNumber_;
|
||||
std::unique_ptr<VectorLogPtr> files_;
|
||||
bool started_;
|
||||
bool isValid_; // not valid when it starts off.
|
||||
Status currentStatus_;
|
||||
size_t currentFileIndex_;
|
||||
std::unique_ptr<WriteBatch> currentBatch_;
|
||||
unique_ptr<log::Reader> currentLogReader_;
|
||||
Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
|
||||
LogReporter reporter_;
|
||||
SequenceNumber currentBatchSeq_; // sequence number at start of current batch
|
||||
SequenceNumber currentLastSeq_; // last sequence in the current batch
|
||||
DBImpl const * const dbimpl_; // The db on whose log files this iterates
|
||||
|
||||
// Reads from transaction log only if the writebatch record has been written
|
||||
bool RestrictedRead(Slice* record, std::string* scratch);
|
||||
// Seeks to startingSequenceNumber reading from startFileIndex in files_.
|
||||
// If strict is set, then we must get a batch starting with startingSequenceNumber
|
||||
void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
|
||||
// Implementation of Next. SeekToStartSequence calls it internally with
|
||||
// internal=true to let it find next entry even if it has to jump gaps because
|
||||
// the iterator may start off from the first available entry but promises to
|
||||
// be continuous after that
|
||||
void NextImpl(bool internal = false);
|
||||
// Check if batch is expected, else return false
|
||||
bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq);
|
||||
// Update the current batch if a continuous batch is found; otherwise reseek to the expected sequence
|
||||
void UpdateCurrentWriteBatch(const Slice& record);
|
||||
Status OpenLogReader(const LogFile* file);
|
||||
};
|
||||
} // namespace rocksdb
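For context, the implementation above sits behind the public TransactionLogIterator handle. The sketch below shows how a caller would typically drain one; DB::GetUpdatesSince is assumed to be the factory that returns the iterator (it is not part of this diff), and only Valid()/GetBatch()/Next()/status() from the interface above are exercised.
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

// Illustrative sketch, not part of this commit.
rocksdb::Status DrainUpdates(rocksdb::DB* db, rocksdb::SequenceNumber from) {
  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(from, &iter);  // assumed factory method
  if (!s.ok()) {
    return s;
  }
  while (iter->Valid()) {
    rocksdb::BatchResult batch = iter->GetBatch();  // sequence + owned WriteBatch
    // ... replay or ship batch.writeBatchPtr, starting at batch.sequence ...
    iter->Next();
  }
  return iter->status();  // distinguishes a clean end-of-log from an error
}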
|
301
db/version_edit.cc
Normal file
301
db/version_edit.cc
Normal file
@ -0,0 +1,301 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/version_edit.h"
|
||||
|
||||
#include "db/version_set.h"
|
||||
#include "util/coding.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Tag numbers for serialized VersionEdit. These numbers are written to
|
||||
// disk and should not be changed.
|
||||
enum Tag {
|
||||
kComparator = 1,
|
||||
kLogNumber = 2,
|
||||
kNextFileNumber = 3,
|
||||
kLastSequence = 4,
|
||||
kCompactPointer = 5,
|
||||
kDeletedFile = 6,
|
||||
kNewFile = 7,
|
||||
// 8 was used for large value refs
|
||||
kPrevLogNumber = 9,
|
||||
|
||||
// these are new formats divergent from open source leveldb
|
||||
kNewFile2 = 100 // store smallest & largest seqno
|
||||
};
|
||||
|
||||
void VersionEdit::Clear() {
|
||||
comparator_.clear();
|
||||
log_number_ = 0;
|
||||
prev_log_number_ = 0;
|
||||
last_sequence_ = 0;
|
||||
next_file_number_ = 0;
|
||||
has_comparator_ = false;
|
||||
has_log_number_ = false;
|
||||
has_prev_log_number_ = false;
|
||||
has_next_file_number_ = false;
|
||||
has_last_sequence_ = false;
|
||||
deleted_files_.clear();
|
||||
new_files_.clear();
|
||||
}
|
||||
|
||||
void VersionEdit::EncodeTo(std::string* dst) const {
|
||||
if (has_comparator_) {
|
||||
PutVarint32(dst, kComparator);
|
||||
PutLengthPrefixedSlice(dst, comparator_);
|
||||
}
|
||||
if (has_log_number_) {
|
||||
PutVarint32(dst, kLogNumber);
|
||||
PutVarint64(dst, log_number_);
|
||||
}
|
||||
if (has_prev_log_number_) {
|
||||
PutVarint32(dst, kPrevLogNumber);
|
||||
PutVarint64(dst, prev_log_number_);
|
||||
}
|
||||
if (has_next_file_number_) {
|
||||
PutVarint32(dst, kNextFileNumber);
|
||||
PutVarint64(dst, next_file_number_);
|
||||
}
|
||||
if (has_last_sequence_) {
|
||||
PutVarint32(dst, kLastSequence);
|
||||
PutVarint64(dst, last_sequence_);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < compact_pointers_.size(); i++) {
|
||||
PutVarint32(dst, kCompactPointer);
|
||||
PutVarint32(dst, compact_pointers_[i].first); // level
|
||||
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
|
||||
}
|
||||
|
||||
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
|
||||
iter != deleted_files_.end();
|
||||
++iter) {
|
||||
PutVarint32(dst, kDeletedFile);
|
||||
PutVarint32(dst, iter->first); // level
|
||||
PutVarint64(dst, iter->second); // file number
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < new_files_.size(); i++) {
|
||||
const FileMetaData& f = new_files_[i].second;
|
||||
PutVarint32(dst, kNewFile2);
|
||||
PutVarint32(dst, new_files_[i].first); // level
|
||||
PutVarint64(dst, f.number);
|
||||
PutVarint64(dst, f.file_size);
|
||||
PutLengthPrefixedSlice(dst, f.smallest.Encode());
|
||||
PutLengthPrefixedSlice(dst, f.largest.Encode());
|
||||
PutVarint64(dst, f.smallest_seqno);
|
||||
PutVarint64(dst, f.largest_seqno);
|
||||
}
|
||||
}
|
||||
|
||||
static bool GetInternalKey(Slice* input, InternalKey* dst) {
|
||||
Slice str;
|
||||
if (GetLengthPrefixedSlice(input, &str)) {
|
||||
dst->DecodeFrom(str);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
|
||||
uint32_t v;
|
||||
if (GetVarint32(input, &v) &&
|
||||
(int)v < number_levels_) {
|
||||
*level = v;
|
||||
return true;
|
||||
} else {
|
||||
if ((int)v >= number_levels_) {
|
||||
*msg = "db already has more levels than options.num_levels";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Status VersionEdit::DecodeFrom(const Slice& src) {
|
||||
Clear();
|
||||
Slice input = src;
|
||||
const char* msg = nullptr;
|
||||
uint32_t tag;
|
||||
|
||||
// Temporary storage for parsing
|
||||
int level;
|
||||
uint64_t number;
|
||||
FileMetaData f;
|
||||
Slice str;
|
||||
InternalKey key;
|
||||
|
||||
while (msg == nullptr && GetVarint32(&input, &tag)) {
|
||||
switch (tag) {
|
||||
case kComparator:
|
||||
if (GetLengthPrefixedSlice(&input, &str)) {
|
||||
comparator_ = str.ToString();
|
||||
has_comparator_ = true;
|
||||
} else {
|
||||
msg = "comparator name";
|
||||
}
|
||||
break;
|
||||
|
||||
case kLogNumber:
|
||||
if (GetVarint64(&input, &log_number_)) {
|
||||
has_log_number_ = true;
|
||||
} else {
|
||||
msg = "log number";
|
||||
}
|
||||
break;
|
||||
|
||||
case kPrevLogNumber:
|
||||
if (GetVarint64(&input, &prev_log_number_)) {
|
||||
has_prev_log_number_ = true;
|
||||
} else {
|
||||
msg = "previous log number";
|
||||
}
|
||||
break;
|
||||
|
||||
case kNextFileNumber:
|
||||
if (GetVarint64(&input, &next_file_number_)) {
|
||||
has_next_file_number_ = true;
|
||||
} else {
|
||||
msg = "next file number";
|
||||
}
|
||||
break;
|
||||
|
||||
case kLastSequence:
|
||||
if (GetVarint64(&input, &last_sequence_)) {
|
||||
has_last_sequence_ = true;
|
||||
} else {
|
||||
msg = "last sequence number";
|
||||
}
|
||||
break;
|
||||
|
||||
case kCompactPointer:
|
||||
if (GetLevel(&input, &level, &msg) &&
|
||||
GetInternalKey(&input, &key)) {
|
||||
compact_pointers_.push_back(std::make_pair(level, key));
|
||||
} else {
|
||||
if (!msg) {
|
||||
msg = "compaction pointer";
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case kDeletedFile:
|
||||
if (GetLevel(&input, &level, &msg) &&
|
||||
GetVarint64(&input, &number)) {
|
||||
deleted_files_.insert(std::make_pair(level, number));
|
||||
} else {
|
||||
if (!msg) {
|
||||
msg = "deleted file";
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case kNewFile:
|
||||
if (GetLevel(&input, &level, &msg) &&
|
||||
GetVarint64(&input, &f.number) &&
|
||||
GetVarint64(&input, &f.file_size) &&
|
||||
GetInternalKey(&input, &f.smallest) &&
|
||||
GetInternalKey(&input, &f.largest)) {
|
||||
new_files_.push_back(std::make_pair(level, f));
|
||||
} else {
|
||||
if (!msg) {
|
||||
msg = "new-file entry";
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case kNewFile2:
|
||||
if (GetLevel(&input, &level, &msg) &&
|
||||
GetVarint64(&input, &f.number) &&
|
||||
GetVarint64(&input, &f.file_size) &&
|
||||
GetInternalKey(&input, &f.smallest) &&
|
||||
GetInternalKey(&input, &f.largest) &&
|
||||
GetVarint64(&input, &f.smallest_seqno) &&
|
||||
GetVarint64(&input, &f.largest_seqno) ) {
|
||||
new_files_.push_back(std::make_pair(level, f));
|
||||
} else {
|
||||
if (!msg) {
|
||||
msg = "new-file2 entry";
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
msg = "unknown tag";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (msg == nullptr && !input.empty()) {
|
||||
msg = "invalid tag";
|
||||
}
|
||||
|
||||
Status result;
|
||||
if (msg != nullptr) {
|
||||
result = Status::Corruption("VersionEdit", msg);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string VersionEdit::DebugString(bool hex_key) const {
|
||||
std::string r;
|
||||
r.append("VersionEdit {");
|
||||
if (has_comparator_) {
|
||||
r.append("\n Comparator: ");
|
||||
r.append(comparator_);
|
||||
}
|
||||
if (has_log_number_) {
|
||||
r.append("\n LogNumber: ");
|
||||
AppendNumberTo(&r, log_number_);
|
||||
}
|
||||
if (has_prev_log_number_) {
|
||||
r.append("\n PrevLogNumber: ");
|
||||
AppendNumberTo(&r, prev_log_number_);
|
||||
}
|
||||
if (has_next_file_number_) {
|
||||
r.append("\n NextFile: ");
|
||||
AppendNumberTo(&r, next_file_number_);
|
||||
}
|
||||
if (has_last_sequence_) {
|
||||
r.append("\n LastSeq: ");
|
||||
AppendNumberTo(&r, last_sequence_);
|
||||
}
|
||||
for (size_t i = 0; i < compact_pointers_.size(); i++) {
|
||||
r.append("\n CompactPointer: ");
|
||||
AppendNumberTo(&r, compact_pointers_[i].first);
|
||||
r.append(" ");
|
||||
r.append(compact_pointers_[i].second.DebugString(hex_key));
|
||||
}
|
||||
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
|
||||
iter != deleted_files_.end();
|
||||
++iter) {
|
||||
r.append("\n DeleteFile: ");
|
||||
AppendNumberTo(&r, iter->first);
|
||||
r.append(" ");
|
||||
AppendNumberTo(&r, iter->second);
|
||||
}
|
||||
for (size_t i = 0; i < new_files_.size(); i++) {
|
||||
const FileMetaData& f = new_files_[i].second;
|
||||
r.append("\n AddFile: ");
|
||||
AppendNumberTo(&r, new_files_[i].first);
|
||||
r.append(" ");
|
||||
AppendNumberTo(&r, f.number);
|
||||
r.append(" ");
|
||||
AppendNumberTo(&r, f.file_size);
|
||||
r.append(" ");
|
||||
r.append(f.smallest.DebugString(hex_key));
|
||||
r.append(" .. ");
|
||||
r.append(f.largest.DebugString(hex_key));
|
||||
}
|
||||
r.append("\n}\n");
|
||||
return r;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
128
db/version_edit.h
Normal file
128
db/version_edit.h
Normal file
@ -0,0 +1,128 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "db/dbformat.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class VersionSet;
|
||||
|
||||
struct FileMetaData {
|
||||
int refs;
|
||||
int allowed_seeks; // Seeks allowed until compaction
|
||||
uint64_t number;
|
||||
uint64_t file_size; // File size in bytes
|
||||
InternalKey smallest; // Smallest internal key served by table
|
||||
InternalKey largest; // Largest internal key served by table
|
||||
bool being_compacted; // Is this file undergoing compaction?
|
||||
SequenceNumber smallest_seqno;// The smallest seqno in this file
|
||||
SequenceNumber largest_seqno; // The largest seqno in this file
|
||||
|
||||
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0),
|
||||
being_compacted(false) { }
|
||||
};
|
||||
|
||||
class VersionEdit {
|
||||
public:
|
||||
explicit VersionEdit(int number_levels) :
|
||||
number_levels_(number_levels) {
|
||||
Clear();
|
||||
}
|
||||
~VersionEdit() { }
|
||||
|
||||
void Clear();
|
||||
|
||||
void SetComparatorName(const Slice& name) {
|
||||
has_comparator_ = true;
|
||||
comparator_ = name.ToString();
|
||||
}
|
||||
void SetLogNumber(uint64_t num) {
|
||||
has_log_number_ = true;
|
||||
log_number_ = num;
|
||||
}
|
||||
void SetPrevLogNumber(uint64_t num) {
|
||||
has_prev_log_number_ = true;
|
||||
prev_log_number_ = num;
|
||||
}
|
||||
void SetNextFile(uint64_t num) {
|
||||
has_next_file_number_ = true;
|
||||
next_file_number_ = num;
|
||||
}
|
||||
void SetLastSequence(SequenceNumber seq) {
|
||||
has_last_sequence_ = true;
|
||||
last_sequence_ = seq;
|
||||
}
|
||||
void SetCompactPointer(int level, const InternalKey& key) {
|
||||
compact_pointers_.push_back(std::make_pair(level, key));
|
||||
}
|
||||
|
||||
// Add the specified file at the specified number.
|
||||
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
|
||||
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
|
||||
void AddFile(int level, uint64_t file,
|
||||
uint64_t file_size,
|
||||
const InternalKey& smallest,
|
||||
const InternalKey& largest,
|
||||
const SequenceNumber& smallest_seqno,
|
||||
const SequenceNumber& largest_seqno) {
|
||||
FileMetaData f;
|
||||
f.number = file;
|
||||
f.file_size = file_size;
|
||||
f.smallest = smallest;
|
||||
f.largest = largest;
|
||||
f.smallest_seqno = smallest_seqno;
|
||||
f.largest_seqno = largest_seqno;
|
||||
assert(smallest_seqno <= largest_seqno);
|
||||
new_files_.push_back(std::make_pair(level, f));
|
||||
}
|
||||
|
||||
// Delete the specified "file" from the specified "level".
|
||||
void DeleteFile(int level, uint64_t file) {
|
||||
deleted_files_.insert(std::make_pair(level, file));
|
||||
}
|
||||
|
||||
// Number of edits
|
||||
int NumEntries() {
|
||||
return new_files_.size() + deleted_files_.size();
|
||||
}
|
||||
|
||||
void EncodeTo(std::string* dst) const;
|
||||
Status DecodeFrom(const Slice& src);
|
||||
|
||||
std::string DebugString(bool hex_key = false) const;
|
||||
|
||||
private:
|
||||
friend class VersionSet;
|
||||
|
||||
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
|
||||
|
||||
bool GetLevel(Slice* input, int* level, const char** msg);
|
||||
|
||||
int number_levels_;
|
||||
std::string comparator_;
|
||||
uint64_t log_number_;
|
||||
uint64_t prev_log_number_;
|
||||
uint64_t next_file_number_;
|
||||
SequenceNumber last_sequence_;
|
||||
bool has_comparator_;
|
||||
bool has_log_number_;
|
||||
bool has_prev_log_number_;
|
||||
bool has_next_file_number_;
|
||||
bool has_last_sequence_;
|
||||
|
||||
std::vector< std::pair<int, InternalKey> > compact_pointers_;
|
||||
DeletedFileSet deleted_files_;
|
||||
std::vector< std::pair<int, FileMetaData> > new_files_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
53
db/version_edit_test.cc
Normal file
53
db/version_edit_test.cc
Normal file
@ -0,0 +1,53 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/version_edit.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static void TestEncodeDecode(const VersionEdit& edit) {
|
||||
std::string encoded, encoded2;
|
||||
edit.EncodeTo(&encoded);
|
||||
VersionEdit parsed(7);
|
||||
Status s = parsed.DecodeFrom(encoded);
|
||||
ASSERT_TRUE(s.ok()) << s.ToString();
|
||||
parsed.EncodeTo(&encoded2);
|
||||
ASSERT_EQ(encoded, encoded2);
|
||||
}
|
||||
|
||||
class VersionEditTest { };
|
||||
|
||||
TEST(VersionEditTest, EncodeDecode) {
|
||||
static const uint64_t kBig = 1ull << 50;
|
||||
|
||||
VersionEdit edit(7);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
TestEncodeDecode(edit);
|
||||
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
|
||||
InternalKey("foo", kBig + 500 + i, kTypeValue),
|
||||
InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
|
||||
kBig + 500 + i,
|
||||
kBig + 600 + i);
|
||||
edit.DeleteFile(4, kBig + 700 + i);
|
||||
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
|
||||
}
|
||||
|
||||
edit.SetComparatorName("foo");
|
||||
edit.SetLogNumber(kBig + 100);
|
||||
edit.SetNextFile(kBig + 200);
|
||||
edit.SetLastSequence(kBig + 1000);
|
||||
TestEncodeDecode(edit);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
3152
db/version_set.cc
Normal file
3152
db/version_set.cc
Normal file
File diff suppressed because it is too large
654
db/version_set.h
Normal file
654
db/version_set.h
Normal file
@ -0,0 +1,654 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// The representation of a DBImpl consists of a set of Versions. The
|
||||
// newest version is called "current". Older versions may be kept
|
||||
// around to provide a consistent view to live iterators.
|
||||
//
|
||||
// Each Version keeps track of a set of Table files per level. The
|
||||
// entire set of versions is maintained in a VersionSet.
|
||||
//
|
||||
// Version,VersionSet are thread-compatible, but require external
|
||||
// synchronization on all accesses.
|
||||
|
||||
#pragma once
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <deque>
|
||||
#include "db/dbformat.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "port/port.h"
|
||||
#include "db/table_cache.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace log { class Writer; }
|
||||
|
||||
class Compaction;
|
||||
class Iterator;
|
||||
class MemTable;
|
||||
class TableCache;
|
||||
class Version;
|
||||
class VersionSet;
|
||||
|
||||
// Return the smallest index i such that files[i]->largest >= key.
|
||||
// Return files.size() if there is no such file.
|
||||
// REQUIRES: "files" contains a sorted list of non-overlapping files.
|
||||
extern int FindFile(const InternalKeyComparator& icmp,
|
||||
const std::vector<FileMetaData*>& files,
|
||||
const Slice& key);
|
||||
|
||||
// Returns true iff some file in "files" overlaps the user key range
|
||||
// [*smallest,*largest].
|
||||
// smallest==nullptr represents a key smaller than all keys in the DB.
|
||||
// largest==nullptr represents a key larger than all keys in the DB.
|
||||
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
|
||||
// in sorted order.
|
||||
extern bool SomeFileOverlapsRange(
|
||||
const InternalKeyComparator& icmp,
|
||||
bool disjoint_sorted_files,
|
||||
const std::vector<FileMetaData*>& files,
|
||||
const Slice* smallest_user_key,
|
||||
const Slice* largest_user_key);
|
||||
|
||||
class Version {
|
||||
public:
|
||||
// Append to *iters a sequence of iterators that will
|
||||
// yield the contents of this Version when merged together.
|
||||
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
|
||||
void AddIterators(const ReadOptions&, const EnvOptions& soptions,
|
||||
std::vector<Iterator*>* iters);
|
||||
|
||||
// Lookup the value for key. If found, store it in *val and
|
||||
// return OK. Else return a non-OK status. Fills *stats.
|
||||
// Uses *operands to store merge_operator operations to apply later
|
||||
// REQUIRES: lock is not held
|
||||
struct GetStats {
|
||||
FileMetaData* seek_file;
|
||||
int seek_file_level;
|
||||
};
|
||||
void Get(const ReadOptions&, const LookupKey& key, std::string* val,
|
||||
Status* status, std::deque<std::string>* operands, GetStats* stats,
|
||||
const Options& db_option,
|
||||
bool* value_found = nullptr);
|
||||
|
||||
// Adds "stats" into the current state. Returns true if a new
|
||||
// compaction may need to be triggered, false otherwise.
|
||||
// REQUIRES: lock is held
|
||||
bool UpdateStats(const GetStats& stats);
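A minimal sketch (editorial, not part of this diff) of the seek-stats contract described above: Get() fills a GetStats, and UpdateStats() reports whether a seek-triggered compaction may be warranted. The names `current`, `lkey`, `read_options`, and `db_options` are hypothetical.
  Version::GetStats stats;
  std::deque<std::string> merge_operands;
  std::string value;
  Status s;
  // per the REQUIRES above, Get() is called without the DB lock held
  current->Get(read_options, lkey, &value, &s, &merge_operands, &stats, db_options);
  // per the REQUIRES above, UpdateStats() is called with the DB lock held
  if (current->UpdateStats(stats)) {
    // a new compaction may need to be triggered; schedule background work here
  }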
|
||||
|
||||
// Reference count management (so Versions do not disappear out from
|
||||
// under live iterators)
|
||||
void Ref();
|
||||
void Unref();
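A minimal sketch (editorial) of the pinning pattern these reference counts enable; `versions` is a hypothetical VersionSet*.
  Version* v = versions->current();
  v->Ref();     // pin: v stays alive even if a newer version is installed
  // ... read from v, e.g. via AddIterators(), while newer versions appear ...
  v->Unref();   // release; the last Unref() deletes the version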
|
||||
|
||||
void GetOverlappingInputs(
|
||||
int level,
|
||||
const InternalKey* begin, // nullptr means before all keys
|
||||
const InternalKey* end, // nullptr means after all keys
|
||||
std::vector<FileMetaData*>* inputs,
|
||||
int hint_index = -1, // index of overlap file
|
||||
int* file_index = nullptr); // return index of overlap file
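A short usage sketch (editorial); `v`, `smallest_ikey`, and `largest_ikey` are hypothetical names.
  std::vector<FileMetaData*> overlapping;
  // passing nullptr for begin/end would select every file on the level
  v->GetOverlappingInputs(/*level=*/1, &smallest_ikey, &largest_ikey, &overlapping);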
|
||||
|
||||
void GetOverlappingInputsBinarySearch(
|
||||
int level,
|
||||
const Slice& begin, // nullptr means before all keys
|
||||
const Slice& end, // nullptr means after all keys
|
||||
std::vector<FileMetaData*>* inputs,
|
||||
int hint_index, // index of overlap file
|
||||
int* file_index); // return index of overlap file
|
||||
|
||||
void ExtendOverlappingInputs(
|
||||
int level,
|
||||
const Slice& begin, // nullptr means before all keys
|
||||
const Slice& end, // nullptr means after all keys
|
||||
std::vector<FileMetaData*>* inputs,
|
||||
unsigned int index); // start extending from this index
|
||||
|
||||
// Returns true iff some file in the specified level overlaps
|
||||
// some part of [*smallest_user_key,*largest_user_key].
|
||||
// smallest_user_key==NULL represents a key smaller than all keys in the DB.
|
||||
// largest_user_key==NULL represents a key larger than all keys in the DB.
|
||||
bool OverlapInLevel(int level,
|
||||
const Slice* smallest_user_key,
|
||||
const Slice* largest_user_key);
|
||||
|
||||
// Returns true iff the first or last file in inputs contains
|
||||
// an overlapping user key to the file "just outside" of it (i.e.
|
||||
// just after the last file, or just before the first file)
|
||||
// REQUIRES: "*inputs" is a sorted list of non-overlapping files
|
||||
bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
|
||||
int level);
|
||||
|
||||
|
||||
// Return the level at which we should place a new memtable compaction
|
||||
// result that covers the range [smallest_user_key,largest_user_key].
|
||||
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
|
||||
const Slice& largest_user_key);
|
||||
|
||||
int NumFiles(int level) const { return files_[level].size(); }
|
||||
|
||||
// Return a human readable string that describes this version's contents.
|
||||
std::string DebugString(bool hex = false) const;
|
||||
|
||||
// Returns the version number of this version
|
||||
uint64_t GetVersionNumber() {
|
||||
return version_number_;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class Compaction;
|
||||
friend class VersionSet;
|
||||
friend class DBImpl;
|
||||
|
||||
class LevelFileNumIterator;
|
||||
Iterator* NewConcatenatingIterator(const ReadOptions&,
|
||||
const EnvOptions& soptions,
|
||||
int level) const;
|
||||
bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
|
||||
const Slice& internal_prefix, Iterator* level_iter) const;
|
||||
|
||||
VersionSet* vset_; // VersionSet to which this Version belongs
|
||||
Version* next_; // Next version in linked list
|
||||
Version* prev_; // Previous version in linked list
|
||||
int refs_; // Number of live refs to this version
|
||||
|
||||
// List of files per level, files in each level are arranged
|
||||
// in increasing order of keys
|
||||
std::vector<FileMetaData*>* files_;
|
||||
|
||||
// A list for the same set of files that are stored in files_,
|
||||
// but files in each level are now sorted based on file
|
||||
// size. The file with the largest size is at the front.
|
||||
// This vector stores the index of the file from files_.
|
||||
std::vector< std::vector<int> > files_by_size_;
|
||||
|
||||
// An index into files_by_size_ that specifies the first
|
||||
// file that is not yet compacted
|
||||
std::vector<int> next_file_to_compact_by_size_;
|
||||
|
||||
// Only the first few entries of files_by_size_ are sorted.
|
||||
// There is no need to sort all the files because it is likely
|
||||
// that on a running system, we need to look at only the first
|
||||
// few largest files because a new version is created every few
|
||||
// seconds/minutes (because of concurrent compactions).
|
||||
static const int number_of_files_to_sort_ = 50;
|
||||
|
||||
// Next file to compact based on seek stats.
|
||||
FileMetaData* file_to_compact_;
|
||||
int file_to_compact_level_;
|
||||
|
||||
// Level that should be compacted next and its compaction score.
|
||||
// Score < 1 means compaction is not strictly needed. These fields
|
||||
// are initialized by Finalize().
|
||||
// The most critical level to be compacted is listed first
|
||||
// These are used to pick the best compaction level
|
||||
std::vector<double> compaction_score_;
|
||||
std::vector<int> compaction_level_;
|
||||
double max_compaction_score_; // max score in l1 to ln-1
|
||||
int max_compaction_score_level_; // level on which max score occurs
|
||||
|
||||
// The offset in the manifest file where this version is stored.
|
||||
uint64_t offset_manifest_file_;
|
||||
|
||||
// A version number that uniquely represents this version. This is
|
||||
// used for debugging and logging purposes only.
|
||||
uint64_t version_number_;
|
||||
|
||||
explicit Version(VersionSet* vset, uint64_t version_number = 0);
|
||||
|
||||
~Version();
|
||||
|
||||
// re-initializes the index that is used to offset into files_by_size_
|
||||
// to find the next compaction candidate file.
|
||||
void ResetNextCompactionIndex(int level) {
|
||||
next_file_to_compact_by_size_[level] = 0;
|
||||
}
|
||||
|
||||
// No copying allowed
|
||||
Version(const Version&);
|
||||
void operator=(const Version&);
|
||||
};
|
||||
|
||||
class VersionSet {
|
||||
public:
|
||||
VersionSet(const std::string& dbname,
|
||||
const Options* options,
|
||||
const EnvOptions& storage_options,
|
||||
TableCache* table_cache,
|
||||
const InternalKeyComparator*);
|
||||
~VersionSet();
|
||||
|
||||
// Apply *edit to the current version to form a new descriptor that
|
||||
// is both saved to persistent state and installed as the new
|
||||
// current version. Will release *mu while actually writing to the file.
|
||||
// REQUIRES: *mu is held on entry.
|
||||
// REQUIRES: no other thread concurrently calls LogAndApply()
|
||||
Status LogAndApply(VersionEdit* edit, port::Mutex* mu,
|
||||
bool new_descriptor_log = false);
|
||||
|
||||
// Recover the last saved descriptor from persistent storage.
|
||||
Status Recover();
|
||||
|
||||
// Try to reduce the number of levels. This call is valid when
|
||||
// only one level from the new max level to the old
|
||||
// max level contains files.
|
||||
// For example, a db currently has 7 levels [0-6], and a call to
|
||||
// reduce to 5 [0-4] can only be executed when only one level
|
||||
// among [4-6] contains files.
|
||||
Status ReduceNumberOfLevels(int new_levels, port::Mutex* mu);
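A possible call site (editorial sketch); `versions` and `mu` are hypothetical, and the mutex is held because the underlying LogAndApply() requires it.
  mu->Lock();
  Status s = versions->ReduceNumberOfLevels(/*new_levels=*/5, mu);
  mu->Unlock();
  if (!s.ok()) {
    // e.g. more than one of the trailing levels still contains files
  }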
|
||||
|
||||
// Return the current version.
|
||||
Version* current() const { return current_; }
|
||||
|
||||
// Return the current manifest file number
|
||||
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
|
||||
|
||||
// Allocate and return a new file number
|
||||
uint64_t NewFileNumber() { return next_file_number_++; }
|
||||
|
||||
// Arrange to reuse "file_number" unless a newer file number has
|
||||
// already been allocated.
|
||||
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
|
||||
void ReuseFileNumber(uint64_t file_number) {
|
||||
if (next_file_number_ == file_number + 1) {
|
||||
next_file_number_ = file_number;
|
||||
}
|
||||
}
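A short sketch (editorial) of the allocate/return-on-failure pattern; `versions` and `created` are hypothetical.
  uint64_t number = versions->NewFileNumber();
  // ... attempt to create a file named after `number` ...
  if (!created) {
    versions->ReuseFileNumber(number);  // only takes effect if no newer number was handed out
  }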
|
||||
|
||||
// Return the number of Table files at the specified level.
|
||||
int NumLevelFiles(int level) const;
|
||||
|
||||
// Return the combined file size of all files at the specified level.
|
||||
int64_t NumLevelBytes(int level) const;
|
||||
|
||||
// Return the last sequence number.
|
||||
uint64_t LastSequence() const { return last_sequence_; }
|
||||
|
||||
// Set the last sequence number to s.
|
||||
void SetLastSequence(uint64_t s) {
|
||||
assert(s >= last_sequence_);
|
||||
last_sequence_ = s;
|
||||
}
|
||||
|
||||
// Mark the specified file number as used.
|
||||
void MarkFileNumberUsed(uint64_t number);
|
||||
|
||||
// Return the current log file number.
|
||||
uint64_t LogNumber() const { return log_number_; }
|
||||
|
||||
// Return the log file number for the log file that is currently
|
||||
// being compacted, or zero if there is no such log file.
|
||||
uint64_t PrevLogNumber() const { return prev_log_number_; }
|
||||
|
||||
int NumberLevels() const { return num_levels_; }
|
||||
|
||||
// Pick level and inputs for a new compaction.
|
||||
// Returns nullptr if there is no compaction to be done.
|
||||
// Otherwise returns a pointer to a heap-allocated object that
|
||||
// describes the compaction. Caller should delete the result.
|
||||
Compaction* PickCompaction();
|
||||
|
||||
// Return a compaction object for compacting the range [begin,end] in
|
||||
// the specified level. Returns nullptr if there is nothing in that
|
||||
// level that overlaps the specified range. Caller should delete
|
||||
// the result.
|
||||
Compaction* CompactRange(
|
||||
int level,
|
||||
const InternalKey* begin,
|
||||
const InternalKey* end);
|
||||
|
||||
// Return the maximum overlapping data (in bytes) at next level for any
|
||||
// file at a level >= 1.
|
||||
int64_t MaxNextLevelOverlappingBytes();
|
||||
|
||||
// Create an iterator that reads over the compaction inputs for "*c".
|
||||
// The caller should delete the iterator when no longer needed.
|
||||
Iterator* MakeInputIterator(Compaction* c);
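A rough sketch (editorial; locking and error handling elided) of how a caller might drive a non-trivial compaction with the pieces declared here; the exact sequence in DBImpl may differ.
  Compaction* c = versions->PickCompaction();
  if (c != nullptr && !c->IsTrivialMove()) {
    Iterator* input = versions->MakeInputIterator(c);
    // ... merge `input` and write new table files, recording them in c->edit() ...
    delete input;
    c->AddInputDeletions(c->edit());
    Status s = versions->LogAndApply(c->edit(), &mu);  // REQUIRES: mu held
    versions->ReleaseCompactionFiles(c, s);
    c->ReleaseInputs();
    delete c;  // PickCompaction() documents that the caller deletes the result
  }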
|
||||
|
||||
// Returns true iff some level needs a compaction because it has
|
||||
// exceeded its target size.
|
||||
bool NeedsSizeCompaction() const {
|
||||
// In universal compaction case, this check doesn't really
|
||||
// check the compaction condition, but checks num of files threshold
|
||||
// only. We are not going to miss any compaction opportunity
|
||||
// but it's likely that more compactions are scheduled but
|
||||
// ending up with nothing to do. We can improve it later.
|
||||
// TODO: improve this function to be accurate for universal
|
||||
// compactions.
|
||||
int num_levels_to_check =
|
||||
(options_->compaction_style != kCompactionStyleUniversal) ?
|
||||
NumberLevels() - 1 : 1;
|
||||
for (int i = 0; i < num_levels_to_check; i++) {
|
||||
if (current_->compaction_score_[i] >= 1) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// Returns true iff some level needs a compaction.
|
||||
bool NeedsCompaction() const {
|
||||
return ((current_->file_to_compact_ != nullptr) ||
|
||||
NeedsSizeCompaction());
|
||||
}
|
||||
|
||||
// Returns the maximum compaction score for levels 1 to max
|
||||
double MaxCompactionScore() const {
|
||||
return current_->max_compaction_score_;
|
||||
}
|
||||
|
||||
// See field declaration
|
||||
int MaxCompactionScoreLevel() const {
|
||||
return current_->max_compaction_score_level_;
|
||||
}
|
||||
|
||||
// Add all files listed in any live version to *live.
|
||||
void AddLiveFiles(std::vector<uint64_t>* live_list);
|
||||
|
||||
// Add all files listed in the current version to *live.
|
||||
void AddLiveFilesCurrentVersion(std::set<uint64_t>* live);
|
||||
|
||||
// Return the approximate offset in the database of the data for
|
||||
// "key" as of version "v".
|
||||
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
|
||||
|
||||
// Return a human-readable short (single-line) summary of the number
|
||||
// of files per level. Uses *scratch as backing store.
|
||||
struct LevelSummaryStorage {
|
||||
char buffer[100];
|
||||
};
|
||||
struct FileSummaryStorage {
|
||||
char buffer[1000];
|
||||
};
|
||||
const char* LevelSummary(LevelSummaryStorage* scratch) const;
|
||||
|
||||
// printf contents (for debugging)
|
||||
Status DumpManifest(Options& options, std::string& manifestFileName,
|
||||
bool verbose, bool hex = false);
|
||||
|
||||
// Return a human-readable short (single-line) summary of the data size
|
||||
// of files per level. Uses *scratch as backing store.
|
||||
const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const;
|
||||
|
||||
// Return a human-readable short (single-line) summary of files
|
||||
// in a specified level. Uses *scratch as backing store.
|
||||
const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
|
||||
|
||||
// Return the size of the current manifest file
|
||||
const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; }
|
||||
|
||||
// For the specified level, pick a compaction.
|
||||
// Returns nullptr if there is no compaction to be done.
|
||||
// If level is 0 and there is already a compaction on that level, this
|
||||
// function will return nullptr.
|
||||
Compaction* PickCompactionBySize(int level, double score);
|
||||
|
||||
// Pick files to compact in Universal mode
|
||||
Compaction* PickCompactionUniversal(int level, double score);
|
||||
|
||||
// Pick Universal compaction to limit read amplification
|
||||
Compaction* PickCompactionUniversalReadAmp(int level, double score,
|
||||
unsigned int ratio, unsigned int num_files);
|
||||
|
||||
// Pick Universal compaction to limit space amplification.
|
||||
Compaction* PickCompactionUniversalSizeAmp(int level, double score);
|
||||
|
||||
// Free up the files that participated in a compaction
|
||||
void ReleaseCompactionFiles(Compaction* c, Status status);
|
||||
|
||||
// verify that the files that we started with for a compaction
|
||||
// still exist in the current version and in the same original level.
|
||||
// This ensures that a concurrent compaction did not erroneously
|
||||
// pick the same files to compact.
|
||||
bool VerifyCompactionFileConsistency(Compaction* c);
|
||||
|
||||
// used to sort files by size
|
||||
typedef struct fsize {
|
||||
int index;
|
||||
FileMetaData* file;
|
||||
} Fsize;
|
||||
|
||||
// Sort all files for this version based on their file size and
|
||||
// record results in files_by_size_. The largest files are listed first.
|
||||
void UpdateFilesBySize(Version *v);
|
||||
|
||||
// Get the max file size in a given level.
|
||||
uint64_t MaxFileSizeForLevel(int level);
|
||||
|
||||
double MaxBytesForLevel(int level);
|
||||
|
||||
Status GetMetadataForFile(
|
||||
uint64_t number, int *filelevel, FileMetaData *metadata);
|
||||
|
||||
void GetLiveFilesMetaData(
|
||||
std::vector<LiveFileMetaData> *metadata);
|
||||
|
||||
void GetObsoleteFiles(std::vector<FileMetaData*>* files);
|
||||
|
||||
private:
|
||||
class Builder;
|
||||
struct ManifestWriter;
|
||||
|
||||
friend class Compaction;
|
||||
friend class Version;
|
||||
|
||||
void Init(int num_levels);
|
||||
|
||||
void Finalize(Version* v, std::vector<uint64_t>&);
|
||||
|
||||
void GetRange(const std::vector<FileMetaData*>& inputs,
|
||||
InternalKey* smallest,
|
||||
InternalKey* largest);
|
||||
|
||||
void GetRange2(const std::vector<FileMetaData*>& inputs1,
|
||||
const std::vector<FileMetaData*>& inputs2,
|
||||
InternalKey* smallest,
|
||||
InternalKey* largest);
|
||||
|
||||
void ExpandWhileOverlapping(Compaction* c);
|
||||
|
||||
void SetupOtherInputs(Compaction* c);
|
||||
|
||||
// Save current contents to *log
|
||||
Status WriteSnapshot(log::Writer* log);
|
||||
|
||||
void AppendVersion(Version* v);
|
||||
|
||||
bool ManifestContains(const std::string& record) const;
|
||||
|
||||
uint64_t ExpandedCompactionByteSizeLimit(int level);
|
||||
|
||||
uint64_t MaxGrandParentOverlapBytes(int level);
|
||||
|
||||
Env* const env_;
|
||||
const std::string dbname_;
|
||||
const Options* const options_;
|
||||
TableCache* const table_cache_;
|
||||
const InternalKeyComparator icmp_;
|
||||
uint64_t next_file_number_;
|
||||
uint64_t manifest_file_number_;
|
||||
uint64_t last_sequence_;
|
||||
uint64_t log_number_;
|
||||
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
|
||||
|
||||
int num_levels_;
|
||||
|
||||
// Opened lazily
|
||||
unique_ptr<log::Writer> descriptor_log_;
|
||||
Version dummy_versions_; // Head of circular doubly-linked list of versions.
|
||||
Version* current_; // == dummy_versions_.prev_
|
||||
|
||||
// Per-level key at which the next compaction at that level should start.
|
||||
// Either an empty string, or a valid InternalKey.
|
||||
std::string* compact_pointer_;
|
||||
|
||||
// Per-level target file size.
|
||||
uint64_t* max_file_size_;
|
||||
|
||||
// Per-level max bytes
|
||||
uint64_t* level_max_bytes_;
|
||||
|
||||
// record all the ongoing compactions for all levels
|
||||
std::vector<std::set<Compaction*> > compactions_in_progress_;
|
||||
|
||||
// generates an increasing version number for every new version
|
||||
uint64_t current_version_number_;
|
||||
|
||||
// Queue of writers to the manifest file
|
||||
std::deque<ManifestWriter*> manifest_writers_;
|
||||
|
||||
// Store the manifest file size when it is checked.
|
||||
// Save us the cost of checking file size twice in LogAndApply
|
||||
uint64_t last_observed_manifest_size_;
|
||||
|
||||
std::vector<FileMetaData*> obsolete_files_;
|
||||
|
||||
// storage options for all reads and writes except compactions
|
||||
const EnvOptions& storage_options_;
|
||||
|
||||
// storage options used for compactions. This is a copy of
|
||||
// storage_options_ but with readaheads set to readahead_compactions_.
|
||||
const EnvOptions storage_options_compactions_;
|
||||
|
||||
// No copying allowed
|
||||
VersionSet(const VersionSet&);
|
||||
void operator=(const VersionSet&);
|
||||
|
||||
// Return the total amount of data that is undergoing
|
||||
// compactions per level
|
||||
void SizeBeingCompacted(std::vector<uint64_t>&);
|
||||
|
||||
// Returns true if any one of the parent files are being compacted
|
||||
bool ParentRangeInCompaction(const InternalKey* smallest,
|
||||
const InternalKey* largest, int level, int* index);
|
||||
|
||||
// Returns true if any one of the specified files are being compacted
|
||||
bool FilesInCompaction(std::vector<FileMetaData*>& files);
|
||||
|
||||
void LogAndApplyHelper(Builder* b, Version* v,
|
||||
VersionEdit* edit, port::Mutex* mu);
|
||||
};
|
||||
|
||||
// A Compaction encapsulates information about a compaction.
|
||||
class Compaction {
|
||||
public:
|
||||
~Compaction();
|
||||
|
||||
// Return the level that is being compacted. Inputs from "level"
|
||||
// will be merged.
|
||||
int level() const { return level_; }
|
||||
|
||||
// Outputs will go to this level
|
||||
int output_level() const { return out_level_; }
|
||||
|
||||
// Return the object that holds the edits to the descriptor done
|
||||
// by this compaction.
|
||||
VersionEdit* edit() { return edit_; }
|
||||
|
||||
// "which" must be either 0 or 1
|
||||
int num_input_files(int which) const { return inputs_[which].size(); }
|
||||
|
||||
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
|
||||
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
|
||||
|
||||
// Maximum size of files to build during this compaction.
|
||||
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
|
||||
|
||||
// Whether compression will be enabled for compaction outputs
|
||||
bool enable_compression() const { return enable_compression_; }
|
||||
|
||||
// Is this a trivial compaction that can be implemented by just
|
||||
// moving a single input file to the next level (no merging or splitting)
|
||||
bool IsTrivialMove() const;
|
||||
|
||||
// Add all inputs to this compaction as delete operations to *edit.
|
||||
void AddInputDeletions(VersionEdit* edit);
|
||||
|
||||
// Returns true if the information we have available guarantees that
|
||||
// the compaction is producing data in "level+1" for which no data exists
|
||||
// in levels greater than "level+1".
|
||||
bool IsBaseLevelForKey(const Slice& user_key);
|
||||
|
||||
// Returns true iff we should stop building the current output
|
||||
// before processing "internal_key".
|
||||
bool ShouldStopBefore(const Slice& internal_key);
|
||||
|
||||
// Release the input version for the compaction, once the compaction
|
||||
// is successful.
|
||||
void ReleaseInputs();
|
||||
|
||||
void Summary(char* output, int len);
|
||||
|
||||
// Return the score that was used to pick this compaction run.
|
||||
double score() const { return score_; }
|
||||
|
||||
// Is this compaction creating a file in the bottom most level?
|
||||
bool BottomMostLevel() { return bottommost_level_; }
|
||||
|
||||
// Does this compaction include all sst files?
|
||||
bool IsFullCompaction() { return is_full_compaction_; }
|
||||
|
||||
private:
|
||||
friend class Version;
|
||||
friend class VersionSet;
|
||||
|
||||
explicit Compaction(int level, int out_level, uint64_t target_file_size,
|
||||
uint64_t max_grandparent_overlap_bytes, int number_levels,
|
||||
bool seek_compaction = false, bool enable_compression = true);
|
||||
|
||||
int level_;
|
||||
int out_level_; // level to which output files are stored
|
||||
uint64_t max_output_file_size_;
|
||||
uint64_t maxGrandParentOverlapBytes_;
|
||||
Version* input_version_;
|
||||
VersionEdit* edit_;
|
||||
int number_levels_;
|
||||
|
||||
bool seek_compaction_;
|
||||
bool enable_compression_;
|
||||
|
||||
// Each compaction reads inputs from "level_" and "level_+1"
|
||||
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
|
||||
|
||||
// State used to check for the number of overlapping grandparent files
|
||||
// (parent == level_ + 1, grandparent == level_ + 2)
|
||||
std::vector<FileMetaData*> grandparents_;
|
||||
size_t grandparent_index_; // Index in grandparent_starts_
|
||||
bool seen_key_; // Some output key has been seen
|
||||
uint64_t overlapped_bytes_; // Bytes of overlap between current output
|
||||
// and grandparent files
|
||||
int base_index_; // index of the file in files_[level_]
|
||||
int parent_index_; // index of some file with same range in files_[level_+1]
|
||||
double score_; // score that was used to pick this compaction.
|
||||
|
||||
// Is this compaction creating a file in the bottom most level?
|
||||
bool bottommost_level_;
|
||||
// Does this compaction include all sst files?
|
||||
bool is_full_compaction_;
|
||||
|
||||
// level_ptrs_ holds indices into input_version_->levels_: our state
|
||||
// is that we are positioned at one of the file ranges for each
|
||||
// higher level than the ones involved in this compaction (i.e. for
|
||||
// all L >= level_ + 2).
|
||||
std::vector<size_t> level_ptrs_;
|
||||
|
||||
// mark (or clear) all files that are being compacted
|
||||
void MarkFilesBeingCompacted(bool);
|
||||
|
||||
// Initialize whether compaction producing files at the bottommost level
|
||||
void SetupBottomMostLevel(bool isManual);
|
||||
|
||||
// In case of compaction error, reset the nextIndex that is used
|
||||
// to pick up the next file to be compacted from files_by_size_
|
||||
void ResetNextCompactionIndex();
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
80
db/version_set_reduce_num_levels.cc
Normal file
80
db/version_set_reduce_num_levels.cc
Normal file
@ -0,0 +1,80 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2012 Facebook. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "db/version_set.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
#include "db/log_reader.h"
|
||||
#include "db/log_writer.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) {
|
||||
|
||||
if (new_levels <= 1) {
|
||||
return Status::InvalidArgument(
|
||||
"Number of levels needs to be bigger than 1");
|
||||
}
|
||||
|
||||
Version* current_version = current_;
|
||||
int current_levels = NumberLevels();
|
||||
|
||||
if (current_levels <= new_levels) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Make sure there are files on only one level from
|
||||
// (new_levels-1) to (current_levels-1)
|
||||
int first_nonempty_level = -1;
|
||||
int first_nonempty_level_filenum = 0;
|
||||
for (int i = new_levels - 1; i < current_levels; i++) {
|
||||
int file_num = NumLevelFiles(i);
|
||||
if (file_num != 0) {
|
||||
if (first_nonempty_level < 0) {
|
||||
first_nonempty_level = i;
|
||||
first_nonempty_level_filenum = file_num;
|
||||
} else {
|
||||
char msg[255];
|
||||
sprintf(msg, "Found at least two levels containing files: "
|
||||
"[%d:%d],[%d:%d].\n",
|
||||
first_nonempty_level, first_nonempty_level_filenum, i, file_num);
|
||||
return Status::InvalidArgument(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status st;
|
||||
std::vector<FileMetaData*>* old_files_list = current_version->files_;
|
||||
std::vector<FileMetaData*>* new_files_list =
|
||||
new std::vector<FileMetaData*>[new_levels];
|
||||
for (int i = 0; i < new_levels - 1; i++) {
|
||||
new_files_list[i] = old_files_list[i];
|
||||
}
|
||||
|
||||
if (first_nonempty_level > 0) {
|
||||
new_files_list[new_levels - 1] = old_files_list[first_nonempty_level];
|
||||
}
|
||||
|
||||
delete[] current_version->files_;
|
||||
current_version->files_ = new_files_list;
|
||||
|
||||
delete[] compact_pointer_;
|
||||
delete[] max_file_size_;
|
||||
delete[] level_max_bytes_;
|
||||
num_levels_ = new_levels;
|
||||
compact_pointer_ = new std::string[new_levels];
|
||||
Init(new_levels);
|
||||
VersionEdit ve(new_levels);
|
||||
st = LogAndApply(&ve , mu, true);
|
||||
return st;
|
||||
}
|
||||
|
||||
}  // namespace rocksdb
|
184
db/version_set_test.cc
Normal file
184
db/version_set_test.cc
Normal file
@ -0,0 +1,184 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/version_set.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
#include "util/testutil.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class FindFileTest {
|
||||
public:
|
||||
std::vector<FileMetaData*> files_;
|
||||
bool disjoint_sorted_files_;
|
||||
|
||||
FindFileTest() : disjoint_sorted_files_(true) { }
|
||||
|
||||
~FindFileTest() {
|
||||
for (unsigned int i = 0; i < files_.size(); i++) {
|
||||
delete files_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void Add(const char* smallest, const char* largest,
|
||||
SequenceNumber smallest_seq = 100,
|
||||
SequenceNumber largest_seq = 100) {
|
||||
FileMetaData* f = new FileMetaData;
|
||||
f->number = files_.size() + 1;
|
||||
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
|
||||
f->largest = InternalKey(largest, largest_seq, kTypeValue);
|
||||
files_.push_back(f);
|
||||
}
|
||||
|
||||
int Find(const char* key) {
|
||||
InternalKey target(key, 100, kTypeValue);
|
||||
InternalKeyComparator cmp(BytewiseComparator());
|
||||
return FindFile(cmp, files_, target.Encode());
|
||||
}
|
||||
|
||||
bool Overlaps(const char* smallest, const char* largest) {
|
||||
InternalKeyComparator cmp(BytewiseComparator());
|
||||
Slice s(smallest != nullptr ? smallest : "");
|
||||
Slice l(largest != nullptr ? largest : "");
|
||||
return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
|
||||
(smallest != nullptr ? &s : nullptr),
|
||||
(largest != nullptr ? &l : nullptr));
|
||||
}
|
||||
};
|
||||
|
||||
TEST(FindFileTest, Empty) {
|
||||
ASSERT_EQ(0, Find("foo"));
|
||||
ASSERT_TRUE(! Overlaps("a", "z"));
|
||||
ASSERT_TRUE(! Overlaps(nullptr, "z"));
|
||||
ASSERT_TRUE(! Overlaps("a", nullptr));
|
||||
ASSERT_TRUE(! Overlaps(nullptr, nullptr));
|
||||
}
|
||||
|
||||
TEST(FindFileTest, Single) {
|
||||
Add("p", "q");
|
||||
ASSERT_EQ(0, Find("a"));
|
||||
ASSERT_EQ(0, Find("p"));
|
||||
ASSERT_EQ(0, Find("p1"));
|
||||
ASSERT_EQ(0, Find("q"));
|
||||
ASSERT_EQ(1, Find("q1"));
|
||||
ASSERT_EQ(1, Find("z"));
|
||||
|
||||
ASSERT_TRUE(! Overlaps("a", "b"));
|
||||
ASSERT_TRUE(! Overlaps("z1", "z2"));
|
||||
ASSERT_TRUE(Overlaps("a", "p"));
|
||||
ASSERT_TRUE(Overlaps("a", "q"));
|
||||
ASSERT_TRUE(Overlaps("a", "z"));
|
||||
ASSERT_TRUE(Overlaps("p", "p1"));
|
||||
ASSERT_TRUE(Overlaps("p", "q"));
|
||||
ASSERT_TRUE(Overlaps("p", "z"));
|
||||
ASSERT_TRUE(Overlaps("p1", "p2"));
|
||||
ASSERT_TRUE(Overlaps("p1", "z"));
|
||||
ASSERT_TRUE(Overlaps("q", "q"));
|
||||
ASSERT_TRUE(Overlaps("q", "q1"));
|
||||
|
||||
ASSERT_TRUE(! Overlaps(nullptr, "j"));
|
||||
ASSERT_TRUE(! Overlaps("r", nullptr));
|
||||
ASSERT_TRUE(Overlaps(nullptr, "p"));
|
||||
ASSERT_TRUE(Overlaps(nullptr, "p1"));
|
||||
ASSERT_TRUE(Overlaps("q", nullptr));
|
||||
ASSERT_TRUE(Overlaps(nullptr, nullptr));
|
||||
}
|
||||
|
||||
|
||||
TEST(FindFileTest, Multiple) {
|
||||
Add("150", "200");
|
||||
Add("200", "250");
|
||||
Add("300", "350");
|
||||
Add("400", "450");
|
||||
ASSERT_EQ(0, Find("100"));
|
||||
ASSERT_EQ(0, Find("150"));
|
||||
ASSERT_EQ(0, Find("151"));
|
||||
ASSERT_EQ(0, Find("199"));
|
||||
ASSERT_EQ(0, Find("200"));
|
||||
ASSERT_EQ(1, Find("201"));
|
||||
ASSERT_EQ(1, Find("249"));
|
||||
ASSERT_EQ(1, Find("250"));
|
||||
ASSERT_EQ(2, Find("251"));
|
||||
ASSERT_EQ(2, Find("299"));
|
||||
ASSERT_EQ(2, Find("300"));
|
||||
ASSERT_EQ(2, Find("349"));
|
||||
ASSERT_EQ(2, Find("350"));
|
||||
ASSERT_EQ(3, Find("351"));
|
||||
ASSERT_EQ(3, Find("400"));
|
||||
ASSERT_EQ(3, Find("450"));
|
||||
ASSERT_EQ(4, Find("451"));
|
||||
|
||||
ASSERT_TRUE(! Overlaps("100", "149"));
|
||||
ASSERT_TRUE(! Overlaps("251", "299"));
|
||||
ASSERT_TRUE(! Overlaps("451", "500"));
|
||||
ASSERT_TRUE(! Overlaps("351", "399"));
|
||||
|
||||
ASSERT_TRUE(Overlaps("100", "150"));
|
||||
ASSERT_TRUE(Overlaps("100", "200"));
|
||||
ASSERT_TRUE(Overlaps("100", "300"));
|
||||
ASSERT_TRUE(Overlaps("100", "400"));
|
||||
ASSERT_TRUE(Overlaps("100", "500"));
|
||||
ASSERT_TRUE(Overlaps("375", "400"));
|
||||
ASSERT_TRUE(Overlaps("450", "450"));
|
||||
ASSERT_TRUE(Overlaps("450", "500"));
|
||||
}
|
||||
|
||||
TEST(FindFileTest, MultipleNullBoundaries) {
|
||||
Add("150", "200");
|
||||
Add("200", "250");
|
||||
Add("300", "350");
|
||||
Add("400", "450");
|
||||
ASSERT_TRUE(! Overlaps(nullptr, "149"));
|
||||
ASSERT_TRUE(! Overlaps("451", nullptr));
|
||||
ASSERT_TRUE(Overlaps(nullptr, nullptr));
|
||||
ASSERT_TRUE(Overlaps(nullptr, "150"));
|
||||
ASSERT_TRUE(Overlaps(nullptr, "199"));
|
||||
ASSERT_TRUE(Overlaps(nullptr, "200"));
|
||||
ASSERT_TRUE(Overlaps(nullptr, "201"));
|
||||
ASSERT_TRUE(Overlaps(nullptr, "400"));
|
||||
ASSERT_TRUE(Overlaps(nullptr, "800"));
|
||||
ASSERT_TRUE(Overlaps("100", nullptr));
|
||||
ASSERT_TRUE(Overlaps("200", nullptr));
|
||||
ASSERT_TRUE(Overlaps("449", nullptr));
|
||||
ASSERT_TRUE(Overlaps("450", nullptr));
|
||||
}
|
||||
|
||||
TEST(FindFileTest, OverlapSequenceChecks) {
|
||||
Add("200", "200", 5000, 3000);
|
||||
ASSERT_TRUE(! Overlaps("199", "199"));
|
||||
ASSERT_TRUE(! Overlaps("201", "300"));
|
||||
ASSERT_TRUE(Overlaps("200", "200"));
|
||||
ASSERT_TRUE(Overlaps("190", "200"));
|
||||
ASSERT_TRUE(Overlaps("200", "210"));
|
||||
}
|
||||
|
||||
TEST(FindFileTest, OverlappingFiles) {
|
||||
Add("150", "600");
|
||||
Add("400", "500");
|
||||
disjoint_sorted_files_ = false;
|
||||
ASSERT_TRUE(! Overlaps("100", "149"));
|
||||
ASSERT_TRUE(! Overlaps("601", "700"));
|
||||
ASSERT_TRUE(Overlaps("100", "150"));
|
||||
ASSERT_TRUE(Overlaps("100", "200"));
|
||||
ASSERT_TRUE(Overlaps("100", "300"));
|
||||
ASSERT_TRUE(Overlaps("100", "400"));
|
||||
ASSERT_TRUE(Overlaps("100", "500"));
|
||||
ASSERT_TRUE(Overlaps("375", "400"));
|
||||
ASSERT_TRUE(Overlaps("450", "450"));
|
||||
ASSERT_TRUE(Overlaps("450", "500"));
|
||||
ASSERT_TRUE(Overlaps("450", "700"));
|
||||
ASSERT_TRUE(Overlaps("600", "700"));
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
247
db/write_batch.cc
Normal file
247
db/write_batch.cc
Normal file
@ -0,0 +1,247 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// WriteBatch::rep_ :=
|
||||
// sequence: fixed64
|
||||
// count: fixed32
|
||||
// data: record[count]
|
||||
// record :=
|
||||
// kTypeValue varstring varstring
|
||||
// kTypeMerge varstring varstring
|
||||
// kTypeDeletion varstring
|
||||
// varstring :=
|
||||
// len: varint32
|
||||
// data: uint8[len]
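A minimal decoding sketch (editorial, not part of this file) for the header layout above; `ReadWriteBatchHeader` is a hypothetical helper, and the fixed-width decoders come from util/coding.h.
#include "util/coding.h"   // DecodeFixed64 / DecodeFixed32

// Reads the 12-byte header laid out above.
static void ReadWriteBatchHeader(const std::string& rep,
                                 uint64_t* sequence, uint32_t* count) {
  *sequence = DecodeFixed64(rep.data());   // bytes [0, 8): sequence
  *count = DecodeFixed32(rep.data() + 8);  // bytes [8, 12): count
}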
|
||||
|
||||
#include "rocksdb/write_batch.h"
|
||||
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/memtable.h"
|
||||
#include "db/snapshot.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "util/coding.h"
|
||||
#include <stdexcept>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
|
||||
static const size_t kHeader = 12;
|
||||
|
||||
WriteBatch::WriteBatch() {
|
||||
Clear();
|
||||
}
|
||||
|
||||
WriteBatch::~WriteBatch() { }
|
||||
|
||||
WriteBatch::Handler::~Handler() { }
|
||||
|
||||
void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
|
||||
throw std::runtime_error("Handler::Merge not implemented!");
|
||||
}
|
||||
|
||||
void WriteBatch::Handler::LogData(const Slice& blob) {
|
||||
// If the user has not specified something to do with blobs, then we ignore
|
||||
// them.
|
||||
}
|
||||
|
||||
bool WriteBatch::Handler::Continue() {
|
||||
return true;
|
||||
}
|
||||
|
||||
void WriteBatch::Clear() {
|
||||
rep_.clear();
|
||||
rep_.resize(kHeader);
|
||||
}
|
||||
|
||||
int WriteBatch::Count() const {
|
||||
return WriteBatchInternal::Count(this);
|
||||
}
|
||||
|
||||
Status WriteBatch::Iterate(Handler* handler) const {
|
||||
Slice input(rep_);
|
||||
if (input.size() < kHeader) {
|
||||
return Status::Corruption("malformed WriteBatch (too small)");
|
||||
}
|
||||
|
||||
input.remove_prefix(kHeader);
|
||||
Slice key, value, blob;
|
||||
int found = 0;
|
||||
while (!input.empty() && handler->Continue()) {
|
||||
char tag = input[0];
|
||||
input.remove_prefix(1);
|
||||
switch (tag) {
|
||||
case kTypeValue:
|
||||
if (GetLengthPrefixedSlice(&input, &key) &&
|
||||
GetLengthPrefixedSlice(&input, &value)) {
|
||||
handler->Put(key, value);
|
||||
found++;
|
||||
} else {
|
||||
return Status::Corruption("bad WriteBatch Put");
|
||||
}
|
||||
break;
|
||||
case kTypeDeletion:
|
||||
if (GetLengthPrefixedSlice(&input, &key)) {
|
||||
handler->Delete(key);
|
||||
found++;
|
||||
} else {
|
||||
return Status::Corruption("bad WriteBatch Delete");
|
||||
}
|
||||
break;
|
||||
case kTypeMerge:
|
||||
if (GetLengthPrefixedSlice(&input, &key) &&
|
||||
GetLengthPrefixedSlice(&input, &value)) {
|
||||
handler->Merge(key, value);
|
||||
found++;
|
||||
} else {
|
||||
return Status::Corruption("bad WriteBatch Merge");
|
||||
}
|
||||
break;
|
||||
case kTypeLogData:
|
||||
if (GetLengthPrefixedSlice(&input, &blob)) {
|
||||
handler->LogData(blob);
|
||||
} else {
|
||||
return Status::Corruption("bad WriteBatch Blob");
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return Status::Corruption("unknown WriteBatch tag");
|
||||
}
|
||||
}
|
||||
if (found != WriteBatchInternal::Count(this)) {
|
||||
return Status::Corruption("WriteBatch has wrong count");
|
||||
} else {
|
||||
return Status::OK();
|
||||
}
|
||||
}
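A minimal handler sketch (editorial): Iterate() decodes rep_ as above and invokes one callback per record, so a small subclass is enough to observe a batch. Only Put and Delete are overridden here; Merge and LogData keep their defaults.
class CountingHandler : public WriteBatch::Handler {
 public:
  int puts = 0;
  int deletes = 0;
  virtual void Put(const Slice& key, const Slice& value) { puts++; }
  virtual void Delete(const Slice& key) { deletes++; }
};

// Usage (assuming `batch` is a populated WriteBatch with no Merge records):
//   CountingHandler handler;
//   Status s = batch.Iterate(&handler);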
|
||||
|
||||
int WriteBatchInternal::Count(const WriteBatch* b) {
|
||||
return DecodeFixed32(b->rep_.data() + 8);
|
||||
}
|
||||
|
||||
void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
|
||||
EncodeFixed32(&b->rep_[8], n);
|
||||
}
|
||||
|
||||
SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
|
||||
return SequenceNumber(DecodeFixed64(b->rep_.data()));
|
||||
}
|
||||
|
||||
void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
|
||||
EncodeFixed64(&b->rep_[0], seq);
|
||||
}
|
||||
|
||||
void WriteBatch::Put(const Slice& key, const Slice& value) {
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeValue));
|
||||
PutLengthPrefixedSlice(&rep_, key);
|
||||
PutLengthPrefixedSlice(&rep_, value);
|
||||
}
|
||||
|
||||
void WriteBatch::Put(const SliceParts& key, const SliceParts& value) {
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeValue));
|
||||
PutLengthPrefixedSliceParts(&rep_, key);
|
||||
PutLengthPrefixedSliceParts(&rep_, value);
|
||||
}
|
||||
|
||||
void WriteBatch::Delete(const Slice& key) {
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeDeletion));
|
||||
PutLengthPrefixedSlice(&rep_, key);
|
||||
}
|
||||
|
||||
void WriteBatch::Merge(const Slice& key, const Slice& value) {
|
||||
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
|
||||
rep_.push_back(static_cast<char>(kTypeMerge));
|
||||
PutLengthPrefixedSlice(&rep_, key);
|
||||
PutLengthPrefixedSlice(&rep_, value);
|
||||
}
|
||||
|
||||
void WriteBatch::PutLogData(const Slice& blob) {
|
||||
rep_.push_back(static_cast<char>(kTypeLogData));
|
||||
PutLengthPrefixedSlice(&rep_, blob);
|
||||
}
|
||||
|
||||
namespace {
|
||||
class MemTableInserter : public WriteBatch::Handler {
|
||||
public:
|
||||
SequenceNumber sequence_;
|
||||
MemTable* mem_;
|
||||
const Options* options_;
|
||||
DBImpl* db_;
|
||||
const bool filter_deletes_;
|
||||
|
||||
MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts,
|
||||
DB* db, const bool filter_deletes)
|
||||
: sequence_(sequence),
|
||||
mem_(mem),
|
||||
options_(opts),
|
||||
db_(reinterpret_cast<DBImpl*>(db)),
|
||||
filter_deletes_(filter_deletes) {
|
||||
assert(mem_);
|
||||
if (filter_deletes_) {
|
||||
assert(options_);
|
||||
assert(db_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Put(const Slice& key, const Slice& value) {
|
||||
if (options_->inplace_update_support
|
||||
&& mem_->Update(sequence_, kTypeValue, key, value)) {
|
||||
RecordTick(options_->statistics, NUMBER_KEYS_UPDATED);
|
||||
} else {
|
||||
mem_->Add(sequence_, kTypeValue, key, value);
|
||||
}
|
||||
sequence_++;
|
||||
}
|
||||
virtual void Merge(const Slice& key, const Slice& value) {
|
||||
mem_->Add(sequence_, kTypeMerge, key, value);
|
||||
sequence_++;
|
||||
}
|
||||
virtual void Delete(const Slice& key) {
|
||||
if (filter_deletes_) {
|
||||
SnapshotImpl read_from_snapshot;
|
||||
read_from_snapshot.number_ = sequence_;
|
||||
ReadOptions ropts;
|
||||
ropts.snapshot = &read_from_snapshot;
|
||||
std::string value;
|
||||
if (!db_->KeyMayExist(ropts, key, &value)) {
|
||||
RecordTick(options_->statistics, NUMBER_FILTERED_DELETES);
|
||||
return;
|
||||
}
|
||||
}
|
||||
mem_->Add(sequence_, kTypeDeletion, key, Slice());
|
||||
sequence_++;
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem,
|
||||
const Options* opts, DB* db,
|
||||
const bool filter_deletes) {
|
||||
MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db,
|
||||
filter_deletes);
|
||||
return b->Iterate(&inserter);
|
||||
}
|
||||
|
||||
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
|
||||
assert(contents.size() >= kHeader);
|
||||
b->rep_.assign(contents.data(), contents.size());
|
||||
}
|
||||
|
||||
void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
|
||||
SetCount(dst, Count(dst) + Count(src));
|
||||
assert(src->rep_.size() >= kHeader);
|
||||
dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
57
db/write_batch_internal.h
Normal file
57
db/write_batch_internal.h
Normal file
@ -0,0 +1,57 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
#include "rocksdb/types.h"
|
||||
#include "rocksdb/write_batch.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class MemTable;
|
||||
|
||||
// WriteBatchInternal provides static methods for manipulating a
|
||||
// WriteBatch that we don't want in the public WriteBatch interface.
|
||||
class WriteBatchInternal {
|
||||
public:
|
||||
// Return the number of entries in the batch.
|
||||
static int Count(const WriteBatch* batch);
|
||||
|
||||
// Set the count for the number of entries in the batch.
|
||||
static void SetCount(WriteBatch* batch, int n);
|
||||
|
||||
// Return the sequence number for the start of this batch.
|
||||
static SequenceNumber Sequence(const WriteBatch* batch);
|
||||
|
||||
// Store the specified number as the sequence number for the start of
|
||||
// this batch.
|
||||
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
|
||||
|
||||
static Slice Contents(const WriteBatch* batch) {
|
||||
return Slice(batch->rep_);
|
||||
}
|
||||
|
||||
static size_t ByteSize(const WriteBatch* batch) {
|
||||
return batch->rep_.size();
|
||||
}
|
||||
|
||||
static void SetContents(WriteBatch* batch, const Slice& contents);
|
||||
|
||||
// Inserts batch entries into memtable
|
||||
// Drops deletes in batch if filter_del is set to true and
|
||||
// db->KeyMayExist returns false
|
||||
static Status InsertInto(const WriteBatch* batch, MemTable* memtable,
|
||||
const Options* opts, DB* db = nullptr,
|
||||
const bool filter_del = false);
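A small usage sketch (editorial); `mem` (a MemTable*) and `options` are assumed to already exist, mirroring how the tests drive this entry point.
  WriteBatch batch;
  batch.Put("key", "value");
  WriteBatchInternal::SetSequence(&batch, /*seq=*/100);
  Status s = WriteBatchInternal::InsertInto(&batch, mem, &options);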
|
||||
|
||||
static void Append(WriteBatch* dst, const WriteBatch* src);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
262
db/write_batch_test.cc
Normal file
262
db/write_batch_test.cc
Normal file
@ -0,0 +1,262 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
#include <memory>
|
||||
#include "db/memtable.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static std::string PrintContents(WriteBatch* b) {
|
||||
InternalKeyComparator cmp(BytewiseComparator());
|
||||
auto factory = std::make_shared<SkipListFactory>();
|
||||
MemTable* mem = new MemTable(cmp, factory);
|
||||
mem->Ref();
|
||||
std::string state;
|
||||
Options options;
|
||||
Status s = WriteBatchInternal::InsertInto(b, mem, &options);
|
||||
int count = 0;
|
||||
Iterator* iter = mem->NewIterator();
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
ParsedInternalKey ikey;
|
||||
memset((void *)&ikey, 0, sizeof(ikey));
|
||||
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
|
||||
switch (ikey.type) {
|
||||
case kTypeValue:
|
||||
state.append("Put(");
|
||||
state.append(ikey.user_key.ToString());
|
||||
state.append(", ");
|
||||
state.append(iter->value().ToString());
|
||||
state.append(")");
|
||||
count++;
|
||||
break;
|
||||
case kTypeMerge:
|
||||
state.append("Merge(");
|
||||
state.append(ikey.user_key.ToString());
|
||||
state.append(", ");
|
||||
state.append(iter->value().ToString());
|
||||
state.append(")");
|
||||
count++;
|
||||
break;
|
||||
case kTypeDeletion:
|
||||
state.append("Delete(");
|
||||
state.append(ikey.user_key.ToString());
|
||||
state.append(")");
|
||||
count++;
|
||||
break;
|
||||
case kTypeLogData:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
state.append("@");
|
||||
state.append(NumberToString(ikey.sequence));
|
||||
}
|
||||
delete iter;
|
||||
if (!s.ok()) {
|
||||
state.append(s.ToString());
|
||||
} else if (count != WriteBatchInternal::Count(b)) {
|
||||
state.append("CountMismatch()");
|
||||
}
|
||||
mem->Unref();
|
||||
return state;
|
||||
}
|
||||
|
||||
class WriteBatchTest { };
|
||||
|
||||
TEST(WriteBatchTest, Empty) {
|
||||
WriteBatch batch;
|
||||
ASSERT_EQ("", PrintContents(&batch));
|
||||
ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
|
||||
ASSERT_EQ(0, batch.Count());
|
||||
}
|
||||
|
||||
TEST(WriteBatchTest, Multiple) {
|
||||
WriteBatch batch;
|
||||
batch.Put(Slice("foo"), Slice("bar"));
|
||||
batch.Delete(Slice("box"));
|
||||
batch.Put(Slice("baz"), Slice("boo"));
|
||||
WriteBatchInternal::SetSequence(&batch, 100);
|
||||
ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
|
||||
ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
|
||||
ASSERT_EQ("Put(baz, boo)@102"
|
||||
"Delete(box)@101"
|
||||
"Put(foo, bar)@100",
|
||||
PrintContents(&batch));
|
||||
ASSERT_EQ(3, batch.Count());
|
||||
}
|
||||
|
||||
TEST(WriteBatchTest, Corruption) {
|
||||
WriteBatch batch;
|
||||
batch.Put(Slice("foo"), Slice("bar"));
|
||||
batch.Delete(Slice("box"));
|
||||
WriteBatchInternal::SetSequence(&batch, 200);
|
||||
Slice contents = WriteBatchInternal::Contents(&batch);
|
||||
WriteBatchInternal::SetContents(&batch,
|
||||
Slice(contents.data(),contents.size()-1));
|
||||
ASSERT_EQ("Put(foo, bar)@200"
|
||||
"Corruption: bad WriteBatch Delete",
|
||||
PrintContents(&batch));
|
||||
}
|
||||
|
||||
TEST(WriteBatchTest, Append) {
|
||||
WriteBatch b1, b2;
|
||||
WriteBatchInternal::SetSequence(&b1, 200);
|
||||
WriteBatchInternal::SetSequence(&b2, 300);
|
||||
WriteBatchInternal::Append(&b1, &b2);
|
||||
ASSERT_EQ("",
|
||||
PrintContents(&b1));
|
||||
ASSERT_EQ(0, b1.Count());
|
||||
b2.Put("a", "va");
|
||||
WriteBatchInternal::Append(&b1, &b2);
|
||||
ASSERT_EQ("Put(a, va)@200",
|
||||
PrintContents(&b1));
|
||||
ASSERT_EQ(1, b1.Count());
|
||||
b2.Clear();
|
||||
b2.Put("b", "vb");
|
||||
WriteBatchInternal::Append(&b1, &b2);
|
||||
ASSERT_EQ("Put(a, va)@200"
|
||||
"Put(b, vb)@201",
|
||||
PrintContents(&b1));
|
||||
ASSERT_EQ(2, b1.Count());
|
||||
b2.Delete("foo");
|
||||
WriteBatchInternal::Append(&b1, &b2);
|
||||
ASSERT_EQ("Put(a, va)@200"
|
||||
"Put(b, vb)@202"
|
||||
"Put(b, vb)@201"
|
||||
"Delete(foo)@203",
|
||||
PrintContents(&b1));
|
||||
ASSERT_EQ(4, b1.Count());
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct TestHandler : public WriteBatch::Handler {
|
||||
std::string seen;
|
||||
virtual void Put(const Slice& key, const Slice& value) {
|
||||
seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
|
||||
}
|
||||
virtual void Merge(const Slice& key, const Slice& value) {
|
||||
seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
|
||||
}
|
||||
virtual void LogData(const Slice& blob) {
|
||||
seen += "LogData(" + blob.ToString() + ")";
|
||||
}
|
||||
virtual void Delete(const Slice& key) {
|
||||
seen += "Delete(" + key.ToString() + ")";
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
TEST(WriteBatchTest, Blob) {
|
||||
WriteBatch batch;
|
||||
batch.Put(Slice("k1"), Slice("v1"));
|
||||
batch.Put(Slice("k2"), Slice("v2"));
|
||||
batch.Put(Slice("k3"), Slice("v3"));
|
||||
batch.PutLogData(Slice("blob1"));
|
||||
batch.Delete(Slice("k2"));
|
||||
batch.PutLogData(Slice("blob2"));
|
||||
batch.Merge(Slice("foo"), Slice("bar"));
|
||||
ASSERT_EQ(5, batch.Count());
|
||||
ASSERT_EQ("Merge(foo, bar)@4"
|
||||
"Put(k1, v1)@0"
|
||||
"Delete(k2)@3"
|
||||
"Put(k2, v2)@1"
|
||||
"Put(k3, v3)@2",
|
||||
PrintContents(&batch));
|
||||
|
||||
TestHandler handler;
|
||||
batch.Iterate(&handler);
|
||||
ASSERT_EQ(
|
||||
"Put(k1, v1)"
|
||||
"Put(k2, v2)"
|
||||
"Put(k3, v3)"
|
||||
"LogData(blob1)"
|
||||
"Delete(k2)"
|
||||
"LogData(blob2)"
|
||||
"Merge(foo, bar)",
|
||||
handler.seen);
|
||||
}
|
||||
|
||||
TEST(WriteBatchTest, Continue) {
|
||||
WriteBatch batch;
|
||||
|
||||
struct Handler : public TestHandler {
|
||||
int num_seen = 0;
|
||||
virtual void Put(const Slice& key, const Slice& value) {
|
||||
++num_seen;
|
||||
TestHandler::Put(key, value);
|
||||
}
|
||||
virtual void Merge(const Slice& key, const Slice& value) {
|
||||
++num_seen;
|
||||
TestHandler::Merge(key, value);
|
||||
}
|
||||
virtual void LogData(const Slice& blob) {
|
||||
++num_seen;
|
||||
TestHandler::LogData(blob);
|
||||
}
|
||||
virtual void Delete(const Slice& key) {
|
||||
++num_seen;
|
||||
TestHandler::Delete(key);
|
||||
}
|
||||
virtual bool Continue() override {
|
||||
return num_seen < 3;
|
||||
}
|
||||
} handler;
|
||||
|
||||
batch.Put(Slice("k1"), Slice("v1"));
|
||||
batch.PutLogData(Slice("blob1"));
|
||||
batch.Delete(Slice("k1"));
|
||||
batch.PutLogData(Slice("blob2"));
|
||||
batch.Merge(Slice("foo"), Slice("bar"));
|
||||
batch.Iterate(&handler);
|
||||
ASSERT_EQ(
|
||||
"Put(k1, v1)"
|
||||
"LogData(blob1)"
|
||||
"Delete(k1)",
|
||||
handler.seen);
|
||||
}
|
||||
|
||||
TEST(WriteBatchTest, PutGatherSlices) {
|
||||
WriteBatch batch;
|
||||
batch.Put(Slice("foo"), Slice("bar"));
|
||||
|
||||
{
|
||||
// Try a write where the key is one slice but the value is two
|
||||
Slice key_slice("baz");
|
||||
Slice value_slices[2] = { Slice("header"), Slice("payload") };
|
||||
batch.Put(SliceParts(&key_slice, 1),
|
||||
SliceParts(value_slices, 2));
|
||||
}
|
||||
|
||||
{
|
||||
// One where the key is composite but the value is a single slice
|
||||
Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") };
|
||||
Slice value_slice("value");
|
||||
batch.Put(SliceParts(key_slices, 3),
|
||||
SliceParts(&value_slice, 1));
|
||||
}
|
||||
|
||||
WriteBatchInternal::SetSequence(&batch, 100);
|
||||
ASSERT_EQ("Put(baz, headerpayload)@101"
|
||||
"Put(foo, bar)@100"
|
||||
"Put(keypart2part3, value)@102",
|
||||
PrintContents(&batch));
|
||||
ASSERT_EQ(3, batch.Count());
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
89
doc/doc.css
Normal file
89
doc/doc.css
Normal file
@ -0,0 +1,89 @@
|
||||
body {
|
||||
margin-left: 0.5in;
|
||||
margin-right: 0.5in;
|
||||
background: white;
|
||||
color: black;
|
||||
}
|
||||
|
||||
h1 {
|
||||
margin-left: -0.2in;
|
||||
font-size: 14pt;
|
||||
}
|
||||
h2 {
|
||||
margin-left: -0in;
|
||||
font-size: 12pt;
|
||||
}
|
||||
h3 {
|
||||
margin-left: -0in;
|
||||
}
|
||||
h4 {
|
||||
margin-left: -0in;
|
||||
}
|
||||
hr {
|
||||
margin-left: -0in;
|
||||
}
|
||||
|
||||
/* Definition lists: definition term bold */
|
||||
dt {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
address {
|
||||
text-align: center;
|
||||
}
|
||||
code,samp,var {
|
||||
color: blue;
|
||||
}
|
||||
kbd {
|
||||
color: #600000;
|
||||
}
|
||||
div.note p {
|
||||
float: right;
|
||||
width: 3in;
|
||||
margin-right: 0%;
|
||||
padding: 1px;
|
||||
border: 2px solid #6060a0;
|
||||
background-color: #fffff0;
|
||||
}
|
||||
|
||||
ul {
|
||||
margin-top: -0em;
|
||||
margin-bottom: -0em;
|
||||
}
|
||||
|
||||
ol {
|
||||
margin-top: -0em;
|
||||
margin-bottom: -0em;
|
||||
}
|
||||
|
||||
UL.nobullets {
|
||||
list-style-type: none;
|
||||
list-style-image: none;
|
||||
margin-left: -1em;
|
||||
}
|
||||
|
||||
p {
|
||||
margin: 1em 0 1em 0;
|
||||
padding: 0 0 0 0;
|
||||
}
|
||||
|
||||
pre {
|
||||
line-height: 1.3em;
|
||||
padding: 0.4em 0 0.8em 0;
|
||||
margin: 0 0 0 0;
|
||||
border: 0 0 0 0;
|
||||
color: blue;
|
||||
}
|
||||
|
||||
.datatable {
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
margin-top: 2em;
|
||||
margin-bottom: 2em;
|
||||
border: 1px solid;
|
||||
}
|
||||
|
||||
.datatable td,th {
|
||||
padding: 0 0.5em 0 0.5em;
|
||||
text-align: right;
|
||||
}
|
831
doc/index.html
Normal file
831
doc/index.html
Normal file
@ -0,0 +1,831 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<link rel="stylesheet" type="text/css" href="doc.css" />
|
||||
<title>RocksDB</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h1>RocksDB</h1>
|
||||
<address>The Facebook Database Engineering Team</address>
|
||||
<address>Built on earlier work on leveldb by Sanjay Ghemawat
|
||||
(sanjay@google.com) and Jeff Dean (jeff@google.com)</address>
|
||||
<p>
|
||||
The <code>rocksdb</code> library provides a persistent key value store. Keys and
|
||||
values are arbitrary byte arrays. The keys are ordered within the key
|
||||
value store according to a user-specified comparator function.
|
||||
|
||||
<p>
|
||||
<h1>Opening A Database</h1>
|
||||
<p>
|
||||
A <code>rocksdb</code> database has a name which corresponds to a file system
|
||||
directory. All of the contents of the database are stored in this
|
||||
directory. The following example shows how to open a database,
|
||||
creating it if necessary:
|
||||
<p>
|
||||
<pre>
|
||||
#include <cassert>
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
rocksdb::DB* db;
|
||||
rocksdb::Options options;
|
||||
options.create_if_missing = true;
|
||||
rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
|
||||
assert(status.ok());
|
||||
...
|
||||
</pre>
|
||||
If you want to raise an error if the database already exists, add
|
||||
the following line before the <code>rocksdb::DB::Open</code> call:
|
||||
<pre>
|
||||
options.error_if_exists = true;
|
||||
</pre>
|
||||
<h1>Status</h1>
|
||||
<p>
|
||||
You may have noticed the <code>rocksdb::Status</code> type above. Values of this
|
||||
type are returned by most functions in <code>rocksdb</code> that may encounter an
|
||||
error. You can check if such a result is ok, and also print an
|
||||
associated error message:
|
||||
<p>
|
||||
<pre>
|
||||
rocksdb::Status s = ...;
|
||||
if (!s.ok()) cerr << s.ToString() << endl;
|
||||
</pre>
|
||||
<h1>Closing A Database</h1>
|
||||
<p>
|
||||
When you are done with a database, just delete the database object.
|
||||
Example:
|
||||
<p>
|
||||
<pre>
|
||||
... open the db as described above ...
|
||||
... do something with db ...
|
||||
delete db;
|
||||
</pre>
|
||||
<h1>Reads And Writes</h1>
|
||||
<p>
|
||||
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
|
||||
modify/query the database. For example, the following code
|
||||
moves the value stored under key1 to key2.
|
||||
<pre>
|
||||
std::string value;
|
||||
rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
|
||||
if (s.ok()) s = db->Put(rocksdb::WriteOptions(), key2, value);
|
||||
if (s.ok()) s = db->Delete(rocksdb::WriteOptions(), key1);
|
||||
</pre>
|
||||
|
||||
<h1>Atomic Updates</h1>
|
||||
<p>
|
||||
Note that if the process dies after the Put of key2 but before the
|
||||
delete of key1, the same value may be left stored under multiple keys.
|
||||
Such problems can be avoided by using the <code>WriteBatch</code> class to
|
||||
atomically apply a set of updates:
|
||||
<p>
|
||||
<pre>
|
||||
#include "leveldb/write_batch.h"
|
||||
...
|
||||
std::string value;
|
||||
rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
|
||||
if (s.ok()) {
|
||||
rocksdb::WriteBatch batch;
|
||||
batch.Delete(key1);
|
||||
batch.Put(key2, value);
|
||||
s = db->Write(rocksdb::WriteOptions(), &batch);
|
||||
}
|
||||
</pre>
|
||||
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
|
||||
and these edits within the batch are applied in order. Note that we
|
||||
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
|
||||
we do not end up erroneously dropping the value entirely.
|
||||
<p>
|
||||
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
|
||||
speed up bulk updates by placing lots of individual mutations into the
|
||||
same batch.
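<p>
For example, a bulk update might be applied in fewer, larger batches like
this (a minimal sketch; the <code>updates</code> container is illustrative,
not part of the API):
<p>
<pre>
  rocksdb::WriteBatch batch;
  for (const auto& kv : updates) {   // e.g. a std::vector of key/value string pairs
    batch.Put(kv.first, kv.second);
  }
  rocksdb::Status s = db->Write(rocksdb::WriteOptions(), &batch);
</pre>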
|
||||
|
||||
<h1>Synchronous Writes</h1>
|
||||
By default, each write to <code>rocksdb</code> is asynchronous: it
|
||||
returns after pushing the write from the process into the operating
|
||||
system. The transfer from operating system memory to the underlying
|
||||
persistent storage happens asynchronously. The <code>sync</code> flag
|
||||
can be turned on for a particular write to make the write operation
|
||||
not return until the data being written has been pushed all the way to
|
||||
persistent storage. (On Posix systems, this is implemented by calling
|
||||
either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
|
||||
<code>msync(..., MS_SYNC)</code> before the write operation returns.)
|
||||
<pre>
|
||||
rocksdb::WriteOptions write_options;
|
||||
write_options.sync = true;
|
||||
db->Put(write_options, ...);
|
||||
</pre>
|
||||
Asynchronous writes are often more than a thousand times as fast as
|
||||
synchronous writes. The downside of asynchronous writes is that a
|
||||
crash of the machine may cause the last few updates to be lost. Note
|
||||
that a crash of just the writing process (i.e., not a reboot) will not
|
||||
cause any loss since even when <code>sync</code> is false, an update
|
||||
is pushed from the process memory into the operating system before it
|
||||
is considered done.
|
||||
|
||||
<p>
|
||||
Asynchronous writes can often be used safely. For example, when
|
||||
loading a large amount of data into the database you can handle lost
|
||||
updates by restarting the bulk load after a crash. A hybrid scheme is
|
||||
also possible where every Nth write is synchronous, and in the event
|
||||
of a crash, the bulk load is restarted just after the last synchronous
|
||||
write finished by the previous run. (The synchronous write can update
|
||||
a marker that describes where to restart on a crash.)
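<p>
A minimal sketch of that hybrid scheme follows; the marker key name, the batch
size, and the <code>records</code> container are made up for illustration:
<p>
<pre>
  rocksdb::WriteOptions async_options;         // sync defaults to false
  rocksdb::WriteOptions sync_options;
  sync_options.sync = true;
  for (size_t i = 0; i < records.size(); i++) {
    db->Put(async_options, records[i].key, records[i].value);
    if (i % 1000 == 999) {
      // Synchronously persist a marker describing where to restart after a crash.
      db->Put(sync_options, "bulk_load_progress", std::to_string(i));
    }
  }
</pre>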
|
||||
|
||||
<p>
|
||||
<code>WriteBatch</code> provides an alternative to asynchronous writes.
|
||||
Multiple updates may be placed in the same <code>WriteBatch</code> and
|
||||
applied together using a synchronous write (i.e.,
|
||||
<code>write_options.sync</code> is set to true). The extra cost of
|
||||
the synchronous write will be amortized across all of the writes in
|
||||
the batch.
|
||||
|
||||
<p>
|
||||
We also provide a way to completely disable Write Ahead Log for a
|
||||
particular write. If you set write_options.disableWAL to true, the
write will not go to the log at all and may be lost in the event of
a process crash.
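<p>
For example (using the <code>disableWAL</code> flag described above):
<p>
<pre>
  rocksdb::WriteOptions write_options;
  write_options.disableWAL = true;       // skip the write-ahead log
  db->Put(write_options, key, value);    // may be lost if the process crashes
</pre>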
|
||||
|
||||
<p>
|
||||
When opening a DB, you can disable syncing of data files by setting
|
||||
Options::disableDataSync to true. This can be useful when doing
|
||||
bulk-loading or big idempotent operations. Once the operation is
|
||||
finished, you can manually call sync() to flush all dirty buffers
|
||||
to stable storage.
|
||||
|
||||
<p>
|
||||
RocksDB by default uses faster fdatasync() to sync files. If you want
|
||||
to use fsync(), you can set Options::use_fsync to true. You should set
|
||||
this to true on filesystems like ext3 that can lose files after a
|
||||
reboot.
|
||||
|
||||
<p>
|
||||
<h1>Concurrency</h1>
|
||||
<p>
|
||||
A database may only be opened by one process at a time.
|
||||
The <code>rocksdb</code> implementation acquires a lock from the
|
||||
operating system to prevent misuse. Within a single process, the
|
||||
same <code>rocksdb::DB</code> object may be safely shared by multiple
|
||||
concurrent threads. I.e., different threads may write into or fetch
|
||||
iterators or call <code>Get</code> on the same database without any
|
||||
external synchronization (the <code>rocksdb</code> implementation will
|
||||
automatically do the required synchronization). However other objects
|
||||
(like Iterator and WriteBatch) may require external synchronization.
|
||||
If two threads share such an object, they must protect access to it
|
||||
using their own locking protocol. More details are available in
|
||||
the public header files.
|
||||
|
||||
<p>
|
||||
<h1>Merge operators</h1>
|
||||
<p>
|
||||
Merge operators provide efficient support for read-modify-write operations.
|
||||
More on the interface and implementation can be found on:
|
||||
<p>
|
||||
<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator">
|
||||
Merge Operator</a>
|
||||
<p>
|
||||
<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation">
|
||||
Merge Operator Implementation</a>
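<p>
As a flavor of the interface, here is a minimal sketch of an associative merge
operator that keeps a counter encoded as a decimal string. It assumes the
<code>AssociativeMergeOperator</code> base class declared in
<code>rocksdb/merge_operator.h</code>; check the wiki pages above for the
authoritative interface, since it may differ between versions.
<p>
<pre>
  #include "rocksdb/merge_operator.h"

  class CounterMergeOperator : public rocksdb::AssociativeMergeOperator {
   public:
    virtual bool Merge(const rocksdb::Slice& key,
                       const rocksdb::Slice* existing_value,
                       const rocksdb::Slice& value,
                       std::string* new_value,
                       rocksdb::Logger* logger) const {
      uint64_t current = existing_value
          ? strtoull(existing_value->ToString().c_str(), NULL, 10) : 0;
      uint64_t delta = strtoull(value.ToString().c_str(), NULL, 10);
      *new_value = std::to_string(current + delta);
      return true;   // this merge always succeeds
    }
    virtual const char* Name() const { return "CounterMergeOperator"; }
  };

  rocksdb::Options options;
  options.merge_operator.reset(new CounterMergeOperator);
  ...
  db->Merge(rocksdb::WriteOptions(), "hits", "1");   // read-modify-write in one call
</pre>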
|
||||
|
||||
<p>
|
||||
<h1>Iteration</h1>
|
||||
<p>
|
||||
The following example demonstrates how to print all key,value pairs
|
||||
in a database.
|
||||
<p>
|
||||
<pre>
|
||||
rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
|
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||
std::cout << it->key().ToString() << ": " << it->value().ToString() << std::endl;
|
||||
}
|
||||
assert(it->status().ok()); // Check for any errors found during the scan
|
||||
delete it;
|
||||
</pre>
|
||||
The following variation shows how to process just the keys in the
|
||||
range <code>[start,limit)</code>:
|
||||
<p>
|
||||
<pre>
|
||||
for (it->Seek(start);
|
||||
it->Valid() && it->key().ToString() < limit;
|
||||
it->Next()) {
|
||||
...
|
||||
}
|
||||
</pre>
|
||||
You can also process entries in reverse order. (Caveat: reverse
|
||||
iteration may be somewhat slower than forward iteration.)
|
||||
<p>
|
||||
<pre>
|
||||
for (it->SeekToLast(); it->Valid(); it->Prev()) {
|
||||
...
|
||||
}
|
||||
</pre>
|
||||
<h1>Snapshots</h1>
|
||||
<p>
|
||||
Snapshots provide consistent read-only views over the entire state of
|
||||
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
|
||||
that a read should operate on a particular version of the DB state.
|
||||
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
|
||||
implicit snapshot of the current state.
|
||||
<p>
|
||||
Snapshots are created by the DB::GetSnapshot() method:
|
||||
<p>
|
||||
<pre>
|
||||
rocksdb::ReadOptions options;
|
||||
options.snapshot = db->GetSnapshot();
|
||||
... apply some updates to db ...
|
||||
rocksdb::Iterator* iter = db->NewIterator(options);
|
||||
... read using iter to view the state when the snapshot was created ...
|
||||
delete iter;
|
||||
db->ReleaseSnapshot(options.snapshot);
|
||||
</pre>
|
||||
Note that when a snapshot is no longer needed, it should be released
|
||||
using the DB::ReleaseSnapshot interface. This allows the
|
||||
implementation to get rid of state that was being maintained just to
|
||||
support reading as of that snapshot.
|
||||
<h1>Slice</h1>
|
||||
<p>
|
||||
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
|
||||
are instances of the <code>rocksdb::Slice</code> type. <code>Slice</code> is a simple
|
||||
structure that contains a length and a pointer to an external byte
|
||||
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
|
||||
<code>std::string</code> since we do not need to copy potentially large keys and
|
||||
values. In addition, <code>rocksdb</code> methods do not return null-terminated
|
||||
C-style strings since <code>rocksdb</code> keys and values are allowed to
|
||||
contain '\0' bytes.
|
||||
<p>
|
||||
C++ strings and null-terminated C-style strings can be easily converted
|
||||
to a Slice:
|
||||
<p>
|
||||
<pre>
|
||||
rocksdb::Slice s1 = "hello";
|
||||
|
||||
std::string str("world");
|
||||
rocksdb::Slice s2 = str;
|
||||
</pre>
|
||||
A Slice can be easily converted back to a C++ string:
|
||||
<pre>
|
||||
std::string str = s1.ToString();
|
||||
assert(str == std::string("hello"));
|
||||
</pre>
|
||||
Be careful when using Slices since it is up to the caller to ensure that
|
||||
the external byte array into which the Slice points remains live while
|
||||
the Slice is in use. For example, the following is buggy:
|
||||
<p>
|
||||
<pre>
|
||||
rocksdb::Slice slice;
|
||||
if (...) {
|
||||
std::string str = ...;
|
||||
slice = str;
|
||||
}
|
||||
Use(slice);
|
||||
</pre>
|
||||
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
|
||||
backing storage for <code>slice</code> will disappear.
|
||||
<p>
|
||||
<h1>Comparators</h1>
|
||||
<p>
|
||||
The preceding examples used the default ordering function for keys,
|
||||
which orders bytes lexicographically. You can however supply a custom
|
||||
comparator when opening a database. For example, suppose each
|
||||
database key consists of two numbers and we should sort by the first
|
||||
number, breaking ties by the second number. First, define a proper
|
||||
subclass of <code>rocksdb::Comparator</code> that expresses these rules:
|
||||
<p>
|
||||
<pre>
|
||||
class TwoPartComparator : public rocksdb::Comparator {
|
||||
public:
|
||||
// Three-way comparison function:
|
||||
// if a < b: negative result
|
||||
// if a > b: positive result
|
||||
// else: zero result
|
||||
int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const {
|
||||
int a1, a2, b1, b2;
|
||||
ParseKey(a, &a1, &a2);
|
||||
ParseKey(b, &b1, &b2);
|
||||
if (a1 < b1) return -1;
|
||||
if (a1 > b1) return +1;
|
||||
if (a2 < b2) return -1;
|
||||
if (a2 > b2) return +1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Ignore the following methods for now:
|
||||
const char* Name() const { return "TwoPartComparator"; }
|
||||
void FindShortestSeparator(std::string*, const rocksdb::Slice&) const { }
|
||||
void FindShortSuccessor(std::string*) const { }
|
||||
};
|
||||
</pre>
|
||||
Now create a database using this custom comparator:
|
||||
<p>
|
||||
<pre>
|
||||
TwoPartComparator cmp;
|
||||
rocksdb::DB* db;
|
||||
rocksdb::Options options;
|
||||
options.create_if_missing = true;
|
||||
options.comparator = &cmp;
|
||||
rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
|
||||
...
|
||||
</pre>
|
||||
<h2>Backwards compatibility</h2>
|
||||
<p>
|
||||
The result of the comparator's <code>Name</code> method is attached to the
|
||||
database when it is created, and is checked on every subsequent
|
||||
database open. If the name changes, the <code>rocksdb::DB::Open</code> call will
|
||||
fail. Therefore, change the name if and only if the new key format
|
||||
and comparison function are incompatible with existing databases, and
|
||||
it is ok to discard the contents of all existing databases.
|
||||
<p>
|
||||
You can however still gradually evolve your key format over time with
|
||||
a little bit of pre-planning. For example, you could store a version
|
||||
number at the end of each key (one byte should suffice for most uses).
|
||||
When you wish to switch to a new key format (e.g., adding an optional
|
||||
third part to the keys processed by <code>TwoPartComparator</code>),
|
||||
(a) keep the same comparator name (b) increment the version number
|
||||
for new keys (c) change the comparator function so it uses the
|
||||
version numbers found in the keys to decide how to interpret them.
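<p>
A minimal sketch of that approach is shown below; the trailing version byte
and the two helper functions are hypothetical, not part of the
<code>rocksdb</code> API:
<p>
<pre>
  class VersionedTwoPartComparator : public rocksdb::Comparator {
   public:
    int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const {
      // Hypothetical helpers: read the version byte stored at the end of each
      // key, then decode and compare the parts according to that version.
      int version_a = KeyFormatVersion(a);
      int version_b = KeyFormatVersion(b);
      return ComparePartsByVersion(a, version_a, b, version_b);
    }

    // Keep the original name so existing databases still open.
    const char* Name() const { return "TwoPartComparator"; }
    void FindShortestSeparator(std::string*, const rocksdb::Slice&) const { }
    void FindShortSuccessor(std::string*) const { }
  };
</pre>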
|
||||
|
||||
|
||||
<p>
|
||||
<h1>MemTable and Table factories</h1>
|
||||
<p>
|
||||
By default, we keep the data in memory in a skiplist memtable and the data
|
||||
on disk in a table format described here:
|
||||
<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
|
||||
RocksDB Table Format</a>.
|
||||
<p>
|
||||
Since one of the goals of RocksDB is to have
|
||||
different parts of the system easily pluggable, we support different
|
||||
implementations of both memtable and table format. You can supply
|
||||
your own memtable factory by setting <code>Options::memtable_factory</code>
|
||||
and your own table factory by setting <code>Options::table_factory</code>.
|
||||
For available memtable factories, please refer to
|
||||
<code>rocksdb/memtablerep.h</code>, and for table factories to
<code>rocksdb/table.h</code>. Both of these features are under active
development, so please be wary of any API changes that might break your
application going forward.
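<p>
For example, the default skiplist memtable could be requested explicitly as
follows (a sketch, assuming the <code>SkipListFactory</code> declared in
<code>rocksdb/memtablerep.h</code>):
<p>
<pre>
  #include "rocksdb/memtablerep.h"

  rocksdb::Options options;
  // memtable_factory is a shared_ptr; SkipListFactory is the default implementation.
  options.memtable_factory.reset(new rocksdb::SkipListFactory);
</pre>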
|
||||
<p>
|
||||
You can also read more about memtables here:
|
||||
<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#memtables">
|
||||
Memtables wiki
|
||||
</a>
|
||||
|
||||
<p>
|
||||
<h1>Performance</h1>
|
||||
<p>
|
||||
Performance can be tuned by changing the default values of the
|
||||
types defined in <code>include/rocksdb/options.h</code>.
|
||||
|
||||
<p>
|
||||
<h2>Block size</h2>
|
||||
<p>
|
||||
<code>rocksdb</code> groups adjacent keys together into the same block and such a
|
||||
block is the unit of transfer to and from persistent storage. The
|
||||
default block size is approximately 4096 uncompressed bytes.
|
||||
Applications that mostly do bulk scans over the contents of the
|
||||
database may wish to increase this size. Applications that do a lot
|
||||
of point reads of small values may wish to switch to a smaller block
|
||||
size if performance measurements indicate an improvement. There isn't
|
||||
much benefit in using blocks smaller than one kilobyte, or larger than
|
||||
a few megabytes. Also note that compression will be more effective
|
||||
with larger block sizes. To change block size parameter, use
|
||||
<code>Options::block_size</code>.
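<p>
For example, a scan-heavy application might use larger blocks:
<p>
<pre>
  rocksdb::Options options;
  options.block_size = 64 * 1024;   // 64KB blocks instead of the ~4KB default
</pre>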
|
||||
<p>
|
||||
<h2>Write buffer</h2>
|
||||
<p>
|
||||
<code>Options::write_buffer_size</code> specifies the amount of data
|
||||
to build up in memory before converting to a sorted on-disk file.
|
||||
Larger values increase performance, especially during bulk loads.
|
||||
Up to max_write_buffer_number write buffers may be held in memory
|
||||
at the same time,
|
||||
so you may wish to adjust this parameter to control memory usage.
|
||||
Also, a larger write buffer will result in a longer recovery time
|
||||
the next time the database is opened.
|
||||
Related option is
|
||||
<code>Options::max_write_buffer_number</code>, which is maximum number
|
||||
of write buffers that are built up in memory. The default is 2, so that
|
||||
when 1 write buffer is being flushed to storage, new writes can continue
|
||||
to the other write buffer.
|
||||
<code>Options::min_write_buffer_number_to_merge</code> is the minimum number
|
||||
of write buffers that will be merged together before writing to storage.
|
||||
If set to 1, then all write buffers are flushed to L0 as individual files and
|
||||
this increases read amplification because a get request has to check in all
|
||||
of these files. Also, an in-memory merge may result in writing less
|
||||
data to storage if there are duplicate records in each of these
|
||||
individual write buffers. Default: 1
|
||||
<p>
|
||||
<h2>Compression</h2>
|
||||
<p>
|
||||
Each block is individually compressed before being written to
|
||||
persistent storage. Compression is on by default since the default
|
||||
compression method is very fast, and is automatically disabled for
|
||||
uncompressible data. In rare cases, applications may want to disable
|
||||
compression entirely, but should only do so if benchmarks show a
|
||||
performance improvement:
|
||||
<p>
|
||||
<pre>
|
||||
rocksdb::Options options;
|
||||
options.compression = rocksdb::kNoCompression;
|
||||
... rocksdb::DB::Open(options, name, ...) ....
|
||||
</pre>
|
||||
<h2>Cache</h2>
|
||||
<p>
|
||||
The contents of the database are stored in a set of files in the
|
||||
filesystem and each file stores a sequence of compressed blocks. If
|
||||
<code>options.block_cache</code> is non-NULL, it is used to cache frequently
|
||||
used uncompressed block contents. If <code>options.block_cache_compressed</code>
|
||||
is non-NULL, it is used to cache frequently used compressed blocks. Compressed
|
||||
cache is an alternative to OS cache, which also caches compressed blocks. If
|
||||
compressed cache is used, the OS cache will be disabled automatically by setting
|
||||
<code>options.allow_os_buffer</code> to false.
|
||||
<p>
|
||||
<pre>
|
||||
#include "rocksdb/cache.h"
|
||||
|
||||
rocksdb::Options options;
|
||||
options.block_cache = rocksdb::NewLRUCache(100 * 1048576); // 100MB uncompressed cache
|
||||
options.block_cache_compressed = rocksdb::NewLRUCache(100 * 1048576); // 100MB compressed cache
|
||||
rocksdb::DB* db;
|
||||
rocksdb::DB::Open(options, name, &db);
|
||||
... use the db ...
|
||||
delete db;
|
||||
delete options.block_cache;
|
||||
delete options.block_cache_compressed;
|
||||
</pre>
|
||||
<p>
|
||||
When performing a bulk read, the application may wish to disable
|
||||
caching so that the data processed by the bulk read does not end up
|
||||
displacing most of the cached contents. A per-iterator option can be
|
||||
used to achieve this:
|
||||
<p>
|
||||
<pre>
|
||||
rocksdb::ReadOptions options;
|
||||
options.fill_cache = false;
|
||||
rocksdb::Iterator* it = db->NewIterator(options);
|
||||
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
||||
...
|
||||
}
|
||||
</pre>
|
||||
<p>
|
||||
You can also disable block cache by setting <code>options.no_block_cache</code>
|
||||
to true.
|
||||
<h2>Key Layout</h2>
|
||||
<p>
|
||||
Note that the unit of disk transfer and caching is a block. Adjacent
|
||||
keys (according to the database sort order) will usually be placed in
|
||||
the same block. Therefore the application can improve its performance
|
||||
by placing keys that are accessed together near each other and placing
|
||||
infrequently used keys in a separate region of the key space.
|
||||
<p>
|
||||
For example, suppose we are implementing a simple file system on top
|
||||
of <code>rocksdb</code>. The types of entries we might wish to store are:
|
||||
<p>
|
||||
<pre>
|
||||
filename -> permission-bits, length, list of file_block_ids
|
||||
file_block_id -> data
|
||||
</pre>
|
||||
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
|
||||
<code>file_block_id</code> keys with a different letter (say '0') so that scans
|
||||
over just the metadata do not force us to fetch and cache bulky file
|
||||
contents.
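<p>
A minimal sketch of such a key scheme (the helper functions are hypothetical):
<p>
<pre>
  // Prefix metadata and data keys differently so they sort into separate
  // regions of the key space.
  std::string MetadataKey(const std::string& filename) {
    return "/" + filename;
  }
  std::string DataKey(const std::string& file_block_id) {
    return "0" + file_block_id;
  }
</pre>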
|
||||
<p>
|
||||
<h2>Filters</h2>
|
||||
<p>
|
||||
Because of the way <code>rocksdb</code> data is organized on disk,
|
||||
a single <code>Get()</code> call may involve multiple reads from disk.
|
||||
The optional <code>FilterPolicy</code> mechanism can be used to reduce
|
||||
the number of disk reads substantially.
|
||||
<pre>
|
||||
rocksdb::Options options;
|
||||
options.filter_policy = NewBloomFilter(10);
|
||||
rocksdb::DB* db;
|
||||
rocksdb::DB::Open(options, "/tmp/testdb", &db);
|
||||
... use the database ...
|
||||
delete db;
|
||||
delete options.filter_policy;
|
||||
</pre>
|
||||
The preceding code associates a
|
||||
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
|
||||
based filtering policy with the database. Bloom filter based
|
||||
filtering relies on keeping some number of bits of data in memory per
|
||||
key (in this case 10 bits per key since that is the argument we passed
|
||||
to NewBloomFilter). This filter will reduce the number of unnecessary
|
||||
disk reads needed for <code>Get()</code> calls by a factor of
|
||||
approximately 100. Increasing the bits per key will lead to a
|
||||
larger reduction at the cost of more memory usage. We recommend that
|
||||
applications whose working set does not fit in memory and that do a
|
||||
lot of random reads set a filter policy.
|
||||
<p>
|
||||
If you are using a custom comparator, you should ensure that the filter
|
||||
policy you are using is compatible with your comparator. For example,
|
||||
consider a comparator that ignores trailing spaces when comparing keys.
|
||||
<code>NewBloomFilter</code> must not be used with such a comparator.
|
||||
Instead, the application should provide a custom filter policy that
|
||||
also ignores trailing spaces. For example:
|
||||
<pre>
|
||||
class CustomFilterPolicy : public rocksdb::FilterPolicy {
|
||||
private:
|
||||
FilterPolicy* builtin_policy_;
|
||||
public:
|
||||
CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
|
||||
~CustomFilterPolicy() { delete builtin_policy_; }
|
||||
|
||||
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
|
||||
|
||||
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
|
||||
// Use builtin bloom filter code after removing trailing spaces
|
||||
std::vector<Slice> trimmed(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
trimmed[i] = RemoveTrailingSpaces(keys[i]);
|
||||
}
|
||||
builtin_policy_->CreateFilter(&trimmed[0], n, dst);
|
||||
}
|
||||
|
||||
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
|
||||
// Use builtin bloom filter code after removing trailing spaces
|
||||
return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
|
||||
}
|
||||
};
|
||||
</pre>
|
||||
<p>
|
||||
Advanced applications may provide a filter policy that does not use
|
||||
a bloom filter but uses some other mechanism for summarizing a set
|
||||
of keys. See <code>rocksdb/filter_policy.h</code> for detail.
|
||||
<p>
|
||||
<h1>Checksums</h1>
|
||||
<p>
|
||||
<code>rocksdb</code> associates checksums with all data it stores in the file system.
|
||||
There are two separate controls provided over how aggressively these
|
||||
checksums are verified (a short example follows the list):
|
||||
<p>
|
||||
<ul>
|
||||
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
|
||||
checksum verification of all data that is read from the file system on
|
||||
behalf of a particular read. By default, no such verification is
|
||||
done.
|
||||
<p>
|
||||
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
|
||||
database to make the database implementation raise an error as soon as
|
||||
it detects an internal corruption. Depending on which portion of the
|
||||
database has been corrupted, the error may be raised when the database
|
||||
is opened, or later by another database operation. By default,
|
||||
paranoid checking is off so that the database can be used even if
|
||||
parts of its persistent storage have been corrupted.
|
||||
<p>
|
||||
If a database is corrupted (perhaps it cannot be opened when
|
||||
paranoid checking is turned on), the <code>rocksdb::RepairDB</code> function
|
||||
may be used to recover as much of the data as possible.
|
||||
<p>
|
||||
</ul>
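<p>
A short example combining the two controls described above:
<p>
<pre>
  rocksdb::Options options;
  options.paranoid_checks = true;          // surface internal corruption eagerly

  rocksdb::ReadOptions read_options;
  read_options.verify_checksums = true;    // verify checksums for this read
  std::string value;
  rocksdb::Status s = db->Get(read_options, key, &value);
</pre>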
|
||||
|
||||
<p>
|
||||
<h1>Compaction</h1>
|
||||
<p>
|
||||
You can read more on Compactions here:
|
||||
<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#multi-threaded-compactions">
|
||||
Multi-threaded compactions
|
||||
</a>
|
||||
<p>
|
||||
Here we give an overview of the options that impact the behavior of compactions; a short example follows the list:
|
||||
<ul>
|
||||
<p>
|
||||
<li><code>Options::compaction_style</code> - RocksDB currently supports two
|
||||
compaction algorithms - Universal style and Level style. This option switches
|
||||
between the two. Can be kCompactionStyleUniversal or kCompactionStyleLevel.
|
||||
If this is kCompactionStyleUniversal, then you can configure universal style
|
||||
parameters with <code>Options::compaction_options_universal</code>.
|
||||
<p>
|
||||
<li><code>Options::disable_auto_compactions</code> - Disable automatic compactions.
|
||||
Manual compactions can still be issued on this database.
|
||||
<p>
|
||||
<li><code>Options::compaction_filter</code> - Allows an application to modify/delete
|
||||
a key-value during background compaction. The client must provide
|
||||
compaction_filter_factory if it requires a new compaction filter to be used
|
||||
for different compaction processes. Client should specify only one of filter
|
||||
or factory.
|
||||
<p>
|
||||
<li><code>Options::compaction_filter_factory</code> - a factory that provides
|
||||
compaction filter objects which allow an application to modify/delete a
|
||||
key-value during background compaction.
|
||||
</ul>
|
||||
<p>
|
||||
Other options impacting performance of compactions and when they get triggered
|
||||
are:
|
||||
<ul>
|
||||
<p>
|
||||
<li> <code>Options::access_hint_on_compaction_start</code> - Specify the file access
|
||||
pattern once a compaction is started. It will be applied to all input files of a compaction. Default: NORMAL
|
||||
<p>
|
||||
<li> <code>Options::level0_file_num_compaction_trigger</code> - Number of files to trigger level-0 compaction.
|
||||
A negative value means that level-0 compaction will not be triggered by number of files at all.
|
||||
<p>
|
||||
<li> <code>Options::max_mem_compaction_level</code> - Maximum level to which a new compacted memtable is pushed if it
|
||||
does not create overlap. We try to push to level 2 to avoid the relatively expensive level 0=>1 compactions and to avoid some
|
||||
expensive manifest file operations. We do not push all the way to the largest level since that can generate a lot of wasted disk
|
||||
space if the same key space is being repeatedly overwritten.
|
||||
<p>
|
||||
<li> <code>Options::target_file_size_base</code> and <code>Options::target_file_size_multiplier</code> -
|
||||
Target file size for compaction. target_file_size_base is per-file size for level-1.
|
||||
Target file size for level L can be calculated by target_file_size_base * (target_file_size_multiplier ^ (L-1))
|
||||
For example, if target_file_size_base is 2MB and target_file_size_multiplier is 10, then each file on level-1 will
|
||||
be 2MB, and each file on level 2 will be 20MB, and each file on level-3 will be 200MB. Default target_file_size_base is 2MB
|
||||
and default target_file_size_multiplier is 1.
|
||||
<p>
|
||||
<li> <code>Options::expanded_compaction_factor</code> - Maximum number of bytes in all compacted files. We avoid expanding
|
||||
the lower level file set of a compaction if it would make the total compaction cover more than
|
||||
(expanded_compaction_factor * targetFileSizeLevel()) many bytes.
|
||||
<p>
|
||||
<li> <code>Options::source_compaction_factor</code> - Maximum number of bytes in all source files to be compacted in a
|
||||
single compaction run. We avoid picking too many files in the source level so that the total source bytes
of the compaction do not exceed (source_compaction_factor * targetFileSizeLevel()) many bytes.
|
||||
Default: 1, i.e., pick one target-file-size worth of data as the source of a compaction.
|
||||
<p>
|
||||
<li> <code>Options::max_grandparent_overlap_factor</code> - Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
|
||||
stop building a single file in a level->level+1 compaction.
|
||||
<p>
|
||||
<li> <code>Options::disable_seek_compaction</code> - Disable compaction triggered by seek.
|
||||
With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache
|
||||
(which is true if max_open_files is large).
|
||||
<p>
|
||||
<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs, submitted to
|
||||
the default LOW priority thread pool
|
||||
</ul>
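<p>
A sketch pulling together a few of the options listed above (names as given in
<code>rocksdb/options.h</code>):
<p>
<pre>
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleLevel;
  options.level0_file_num_compaction_trigger = 4;
  options.target_file_size_base = 2 * 1048576;   // 2MB files at level-1
  options.target_file_size_multiplier = 10;      // 20MB at level-2, 200MB at level-3
  options.max_background_compactions = 2;
</pre>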
|
||||
|
||||
<p>
|
||||
You can learn more about all of those options in <code>rocksdb/options.h</code>
|
||||
|
||||
<h2> Universal style compaction specific settings</h2>
|
||||
<p>
|
||||
If you're using Universal style compaction, there is an object <code>CompactionOptionsUniversal</code>
|
||||
that hold all the different options for that compaction. The exact definition is in
|
||||
<code>rocksdb/universal_compaction.h</code> and you can set it in <code>Options::compaction_options_universal</code>.
|
||||
Here we give a short overview of the options in <code>CompactionOptionsUniversal</code>; a short example follows the list:
|
||||
<ul>
|
||||
<p>
|
||||
<li> <code>CompactionOptionsUniversal::size_ratio</code> - Percentage flexibility when comparing file sizes. If the candidate file(s)
|
||||
size is 1% smaller than the next file's size, then include next file into
|
||||
this candidate set. Default: 1
|
||||
<p>
|
||||
<li> <code>CompactionOptionsUniversal::min_merge_width</code> - The minimum number of files in a single compaction run. Default: 2
|
||||
<p>
|
||||
<li> <code>CompactionOptionsUniversal::max_merge_width</code> - The maximum number of files in a single compaction run. Default: UINT_MAX
|
||||
<p>
|
||||
<li> <code>CompactionOptionsUniversal::max_size_amplification_percent</code> - The size amplification is defined as the amount (in percentage) of
|
||||
additional storage needed to store a single byte of data in the database. For example, a size amplification of 2% means that a database that
|
||||
contains 100 bytes of user-data may occupy up to 102 bytes of physical storage. By this definition, a fully compacted database has
|
||||
a size amplification of 0%. Rocksdb uses the following heuristic to calculate size amplification: it assumes that all files excluding
|
||||
the earliest file contribute to the size amplification. Default: 200, which means that a 100 byte database could require up to
|
||||
300 bytes of storage.
|
||||
<p>
|
||||
<li> <code>CompactionOptionsUniversal::compression_size_percent</code> - If this option is set to be -1 (the default value), all the output files
|
||||
will follow the compression type specified. If this option is not negative, we will try to make sure the compressed
|
||||
size is just above this value. In normal cases, at least this percentage
|
||||
of data will be compressed.
|
||||
When we are compacting to a new file, here is the criteria whether
|
||||
it needs to be compressed: assuming here are the list of files sorted
|
||||
by generation time: [ A1...An B1...Bm C1...Ct ],
|
||||
where A1 is the newest and Ct is the oldest, and we are going to compact
|
||||
B1...Bm, we calculate the total size of all the files as total_size, as
|
||||
well as the total size of C1...Ct as total_C, the compaction output file
|
||||
will be compressed iff total_C / total_size < this percentage.
|
||||
<p>
|
||||
<li> <code>CompactionOptionsUniversal::stop_style</code> - The algorithm used to stop picking files into a single compaction run.
|
||||
Can be kCompactionStopStyleSimilarSize (pick files of similar size) or kCompactionStopStyleTotalSize (total size of picked files > next file).
|
||||
Default: kCompactionStopStyleTotalSize
|
||||
</ul>
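<p>
A sketch of configuring universal style compaction with the options above:
<p>
<pre>
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  options.compaction_options_universal.size_ratio = 1;
  options.compaction_options_universal.min_merge_width = 2;
  options.compaction_options_universal.max_size_amplification_percent = 200;
  options.compaction_options_universal.stop_style = rocksdb::kCompactionStopStyleTotalSize;
</pre>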
|
||||
|
||||
<h1>Thread pools</h1>
|
||||
<p>
|
||||
A thread pool is associated with an Env environment object. The client has to create a thread pool by setting the number of background
|
||||
threads using method <code>Env::SetBackgroundThreads()</code> defined in <code>rocksdb/env.h</code>.
|
||||
We use the thread pool for compactions and memtable flushes.
|
||||
Since memtable flushes are on the critical code path (a stalled memtable flush can stall writes, increasing p99 latency), we suggest
|
||||
having two thread pools - with priorities HIGH and LOW. Memtable flushes can be set up to be scheduled on HIGH thread pool.
|
||||
There are two options available for configuration of background compactions and flushes:
|
||||
<ul>
|
||||
<p>
|
||||
<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs,
|
||||
submitted to the default LOW priority thread pool
|
||||
<p>
|
||||
<li> <code>Options::max_background_flushes</code> - Maximum number of concurrent background memtable flush jobs, submitted to
|
||||
the HIGH priority thread pool. By default, all background jobs (major compaction and memtable flush) go
|
||||
to the LOW priority pool. If this option is set to a positive number, memtable flush jobs will be submitted to the HIGH priority pool.
|
||||
It is important when the same Env is shared by multiple db instances. Without a separate pool, long running major compaction jobs could
|
||||
potentially block memtable flush jobs of other db instances, leading to unnecessary Put stalls.
|
||||
</ul>
|
||||
<p>
|
||||
<pre>
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/db.h"
|
||||
|
||||
auto env = rocksdb::Env::Default();
|
||||
env->SetBackgroundThreads(2, rocksdb::Env::LOW);
|
||||
env->SetBackgroundThreads(1, rocksdb::Env::HIGH);
|
||||
rocksdb::DB* db;
|
||||
rocksdb::Options options;
|
||||
options.env = env;
|
||||
options.max_background_compactions = 2;
|
||||
options.max_background_flushes = 1;
|
||||
rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);
|
||||
assert(status.ok());
|
||||
...
|
||||
</pre>
|
||||
<h1>Approximate Sizes</h1>
|
||||
<p>
|
||||
The <code>GetApproximateSizes</code> method can be used to get the approximate
|
||||
number of bytes of file system space used by one or more key ranges.
|
||||
<p>
|
||||
<pre>
|
||||
rocksdb::Range ranges[2];
|
||||
ranges[0] = rocksdb::Range("a", "c");
|
||||
ranges[1] = rocksdb::Range("x", "z");
|
||||
uint64_t sizes[2];
|
||||
rocksdb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
|
||||
</pre>
|
||||
The preceding call will set <code>sizes[0]</code> to the approximate number of
|
||||
bytes of file system space used by the key range <code>[a..c)</code> and
|
||||
<code>sizes[1]</code> to the approximate number of bytes used by the key range
|
||||
<code>[x..z)</code>.
|
||||
<p>
|
||||
<h1>Environment</h1>
|
||||
<p>
|
||||
All file operations (and other operating system calls) issued by the
|
||||
<code>rocksdb</code> implementation are routed through a <code>rocksdb::Env</code> object.
|
||||
Sophisticated clients may wish to provide their own <code>Env</code>
|
||||
implementation to get better control. For example, an application may
|
||||
introduce artificial delays in the file IO paths to limit the impact
|
||||
of <code>rocksdb</code> on other activities in the system.
|
||||
<p>
|
||||
<pre>
|
||||
class SlowEnv : public rocksdb::Env {
|
||||
.. implementation of the Env interface ...
|
||||
};
|
||||
|
||||
SlowEnv env;
|
||||
rocksdb::Options options;
|
||||
options.env = &env;
|
||||
Status s = rocksdb::DB::Open(options, ...);
|
||||
</pre>
|
||||
<h1>Porting</h1>
|
||||
<p>
|
||||
<code>rocksdb</code> may be ported to a new platform by providing platform
|
||||
specific implementations of the types/methods/functions exported by
|
||||
<code>rocksdb/port/port.h</code>. See <code>rocksdb/port/port_example.h</code> for more
|
||||
details.
|
||||
<p>
|
||||
In addition, the new platform may need a new default <code>rocksdb::Env</code>
|
||||
implementation. See <code>rocksdb/util/env_posix.h</code> for an example.
|
||||
|
||||
<h1>Statistics</h1>
|
||||
<p>
|
||||
To be able to efficiently tune your application, it is always helpful if you
|
||||
have access to usage statistics. You can collect those statistics by setting
|
||||
<code>Options::table_stats_collectors</code> or
|
||||
<code>Options::statistics</code>. For more information, refer to
|
||||
<code>rocksdb/table_stats.h</code> and <code>rocksdb/statistics.h</code>.
|
||||
These should not add significant overhead to your application and we
|
||||
recommend exporting them to other monitoring tools.
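<p>
A minimal sketch, assuming <code>CreateDBStatistics()</code> from
<code>rocksdb/statistics.h</code>:
<p>
<pre>
  #include "rocksdb/statistics.h"

  rocksdb::Options options;
  options.statistics = rocksdb::CreateDBStatistics();
  ...
  // Periodically dump the accumulated counters into your monitoring system.
  // (ToString() is assumed here; the exact reporting API may differ by version.)
  std::string counters = options.statistics->ToString();
</pre>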
|
||||
|
||||
<h1>Purging WAL files</h1>
|
||||
<p>
|
||||
By default, old write-ahead logs are deleted automatically when they fall out
|
||||
of scope and the application doesn't need them anymore. There are options that
|
||||
enable the user to archive the logs and then delete them lazily, either in
|
||||
TTL fashion or based on size limit.
|
||||
|
||||
The options are <code>Options::WAL_ttl_seconds</code> and
|
||||
<code>Options::WAL_size_limit_MB</code>. Here is how they can be used (see the example after the list):
|
||||
<ul>
|
||||
<li>
|
||||
<p>
|
||||
If both are set to 0, logs will be deleted as soon as possible and will never get into the archive.
|
||||
<li>
|
||||
<p>
|
||||
If <code>WAL_ttl_seconds</code> is 0 and WAL_size_limit_MB is not 0, WAL
|
||||
files will be checked every 10 minutes, and if the total size is greater than
|
||||
<code>WAL_size_limit_MB</code>, they will be deleted starting with the
|
||||
earliest until size_limit is met. All empty files will be deleted.
|
||||
<li>
|
||||
<p>
|
||||
If <code>WAL_ttl_seconds</code> is not 0 and WAL_size_limit_MB is 0, then
|
||||
WAL files will be checked every <code>WAL_ttl_seconds / 2</code> and those
|
||||
that are older than WAL_ttl_seconds will be deleted.
|
||||
<li>
|
||||
<p>
|
||||
If both are not 0, WAL files will be checked every 10 min and both
|
||||
checks will be performed, with the TTL check applied first.
|
||||
</ul>
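<p>
For example, to keep archived logs for an hour but cap the archive size:
<p>
<pre>
  rocksdb::Options options;
  options.WAL_ttl_seconds = 3600;     // keep archived logs for up to an hour
  options.WAL_size_limit_MB = 100;    // or until the archive exceeds 100MB
</pre>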
|
||||
|
||||
<h1>Other Information</h1>
|
||||
<p>
|
||||
Details about the <code>rocksdb</code> implementation may be found in
|
||||
the following documents:
|
||||
<ul>
|
||||
<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide">
|
||||
RocksDB Architecture Guide</a>
|
||||
<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
|
||||
Format of an immutable Table file</a>
|
||||
<li> <a href="log_format.txt">Format of a log file</a>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
</html>
|
75
doc/log_format.txt
Normal file
75
doc/log_format.txt
Normal file
@ -0,0 +1,75 @@
|
||||
The log file contents are a sequence of 32KB blocks. The only
|
||||
exception is that the tail of the file may contain a partial block.
|
||||
|
||||
Each block consists of a sequence of records:
|
||||
block := record* trailer?
|
||||
record :=
|
||||
checksum: uint32 // crc32c of type and data[]
|
||||
length: uint16
|
||||
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
|
||||
data: uint8[length]
|
||||
|
||||
A record never starts within the last six bytes of a block (since it
|
||||
won't fit). Any leftover bytes here form the trailer, which must
|
||||
consist entirely of zero bytes and must be skipped by readers.
|
||||
|
||||
Aside: if exactly seven bytes are left in the current block, and a new
|
||||
non-zero length record is added, the writer must emit a FIRST record
|
||||
(which contains zero bytes of user data) to fill up the trailing seven
|
||||
bytes of the block and then emit all of the user data in subsequent
|
||||
blocks.
|
||||
|
||||
More types may be added in the future. Some readers may skip record
|
||||
types they do not understand, others may report that some data was
|
||||
skipped.
|
||||
|
||||
FULL == 1
|
||||
FIRST == 2
|
||||
MIDDLE == 3
|
||||
LAST == 4
|
||||
|
||||
The FULL record contains the contents of an entire user record.
|
||||
|
||||
FIRST, MIDDLE, LAST are types used for user records that have been
|
||||
split into multiple fragments (typically because of block boundaries).
|
||||
FIRST is the type of the first fragment of a user record, LAST is the
|
||||
type of the last fragment of a user record, and MIDDLE is the type of all
|
||||
interior fragments of a user record.
|
||||
|
||||
Example: consider a sequence of user records:
|
||||
A: length 1000
|
||||
B: length 97270
|
||||
C: length 8000
|
||||
A will be stored as a FULL record in the first block.
|
||||
|
||||
B will be split into three fragments: first fragment occupies the rest
|
||||
of the first block, second fragment occupies the entirety of the
|
||||
second block, and the third fragment occupies a prefix of the third
|
||||
block. This will leave six bytes free in the third block, which will
|
||||
be left empty as the trailer.
|
||||
|
||||
C will be stored as a FULL record in the fourth block.
|
||||
|
||||
===================
|
||||
|
||||
Some benefits over the recordio format:
|
||||
|
||||
(1) We do not need any heuristics for resyncing - just go to next
|
||||
block boundary and scan. If there is a corruption, skip to the next
|
||||
block. As a side-benefit, we do not get confused when part of the
|
||||
contents of one log file are embedded as a record inside another log
|
||||
file.
|
||||
|
||||
(2) Splitting at approximate boundaries (e.g., for mapreduce) is
|
||||
simple: find the next block boundary and skip records until we
|
||||
hit a FULL or FIRST record.
|
||||
|
||||
(3) We do not need extra buffering for large records.
|
||||
|
||||
Some downsides compared to recordio format:
|
||||
|
||||
(1) No packing of tiny records. This could be fixed by adding a new
|
||||
record type, so it is a shortcoming of the current implementation,
|
||||
not necessarily the format.
|
||||
|
||||
(2) No compression. Again, this could be fixed by adding new record types.
|
BIN
doc/rockslogo.jpg
Normal file
BIN
doc/rockslogo.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 134 KiB |
BIN
doc/rockslogo.png
Normal file
BIN
doc/rockslogo.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 60 KiB |
26
hdfs/README
Normal file
26
hdfs/README
Normal file
@ -0,0 +1,26 @@
|
||||
This directory contains the hdfs extensions needed to make rocksdb store
|
||||
files in HDFS.
|
||||
|
||||
The hdfs.h file is copied from the Apache Hadoop 1.0 source code.
|
||||
It defines the libhdfs library
|
||||
(http://hadoop.apache.org/common/docs/r0.20.2/libhdfs.html) to access
|
||||
data in HDFS. The libhdfs.a is copied from the Apache Hadoop 1.0 build.
|
||||
It implements the API defined in hdfs.h. If your hadoop cluster is running
|
||||
a different hadoop release, then install these two files manually from your
|
||||
hadoop distribution and then recompile rocksdb.
|
||||
|
||||
The env_hdfs.h file defines the rocksdb objects that are needed to talk to an
|
||||
underlying filesystem.
|
||||
|
||||
If you want to compile rocksdb with hdfs support, please set the following
|
||||
environment variables appropriately:
|
||||
USE_HDFS=1
|
||||
JAVA_HOME=/usr/local/jdk-6u22-64
|
||||
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs
|
||||
make clean all db_bench
|
||||
|
||||
To run db_bench,
|
||||
set CLASSPATH to include your hadoop distribution
|
||||
db_bench --hdfs="hdfs://hbaseudbperf001.snc1.facebook.com:9000"
|
||||
|
||||
|
302
hdfs/env_hdfs.h
Normal file
302
hdfs/env_hdfs.h
Normal file
@ -0,0 +1,302 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
#include <iostream>
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/status.h"
|
||||
|
||||
#ifdef USE_HDFS
|
||||
#include "hdfs/hdfs.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static const std::string kProto = "hdfs://";
|
||||
static const std::string pathsep = "/";
|
||||
|
||||
// Thrown during execution when there is an issue with the supplied
|
||||
// arguments.
|
||||
class HdfsUsageException : public std::exception { };
|
||||
|
||||
// A simple exception that indicates something went wrong that is not
|
||||
// recoverable. The intention is for the message to be printed (with
|
||||
// nothing else) and the process terminate.
|
||||
class HdfsFatalException : public std::exception {
|
||||
public:
|
||||
explicit HdfsFatalException(const std::string& s) : what_(s) { }
|
||||
virtual ~HdfsFatalException() throw() { }
|
||||
virtual const char* what() const throw() {
|
||||
return what_.c_str();
|
||||
}
|
||||
private:
|
||||
const std::string what_;
|
||||
};
|
||||
|
||||
//
|
||||
// The HDFS environment for rocksdb. This class overrides all the
|
||||
// file/dir access methods and delegates the thread-mgmt methods to the
|
||||
// default posix environment.
|
||||
//
|
||||
class HdfsEnv : public Env {
|
||||
|
||||
public:
|
||||
HdfsEnv(const std::string& fsname) : fsname_(fsname) {
|
||||
posixEnv = Env::Default();
|
||||
fileSys_ = connectToPath(fsname_);
|
||||
}
|
||||
|
||||
virtual ~HdfsEnv() {
|
||||
fprintf(stderr, "Destroying HdfsEnv::Default()\n");
|
||||
hdfsDisconnect(fileSys_);
|
||||
}
|
||||
|
||||
virtual Status NewSequentialFile(const std::string& fname,
|
||||
SequentialFile** result);
|
||||
|
||||
virtual Status NewRandomAccessFile(const std::string& fname,
|
||||
RandomAccessFile** result);
|
||||
|
||||
virtual Status NewWritableFile(const std::string& fname,
|
||||
WritableFile** result);
|
||||
|
||||
virtual Status NewRandomRWFile(const std::string& fname,
|
||||
unique_ptr<RandomRWFile>* result,
|
||||
const EnvOptions& options);
|
||||
|
||||
virtual bool FileExists(const std::string& fname);
|
||||
|
||||
virtual Status GetChildren(const std::string& path,
|
||||
std::vector<std::string>* result);
|
||||
|
||||
virtual Status DeleteFile(const std::string& fname);
|
||||
|
||||
virtual Status CreateDir(const std::string& name);
|
||||
|
||||
virtual Status CreateDirIfMissing(const std::string& name);
|
||||
|
||||
virtual Status DeleteDir(const std::string& name);
|
||||
|
||||
virtual Status GetFileSize(const std::string& fname, uint64_t* size);
|
||||
|
||||
virtual Status GetFileModificationTime(const std::string& fname,
|
||||
uint64_t* file_mtime);
|
||||
|
||||
virtual Status RenameFile(const std::string& src, const std::string& target);
|
||||
|
||||
virtual Status LockFile(const std::string& fname, FileLock** lock);
|
||||
|
||||
virtual Status UnlockFile(FileLock* lock);
|
||||
|
||||
virtual Status NewLogger(const std::string& fname, Logger** result);
|
||||
|
||||
virtual void Schedule(void (*function)(void* arg), void* arg,
|
||||
Priority pri = LOW) {
|
||||
posixEnv->Schedule(function, arg, pri);
|
||||
}
|
||||
|
||||
virtual void StartThread(void (*function)(void* arg), void* arg) {
|
||||
posixEnv->StartThread(function, arg);
|
||||
}
|
||||
|
||||
virtual Status GetTestDirectory(std::string* path) {
|
||||
return posixEnv->GetTestDirectory(path);
|
||||
}
|
||||
|
||||
virtual uint64_t NowMicros() {
|
||||
return posixEnv->NowMicros();
|
||||
}
|
||||
|
||||
virtual void SleepForMicroseconds(int micros) {
|
||||
posixEnv->SleepForMicroseconds(micros);
|
||||
}
|
||||
|
||||
virtual Status GetHostName(char* name, uint64_t len) {
|
||||
return posixEnv->GetHostName(name, len);
|
||||
}
|
||||
|
||||
virtual Status GetCurrentTime(int64_t* unix_time) {
|
||||
return posixEnv->GetCurrentTime(unix_time);
|
||||
}
|
||||
|
||||
virtual Status GetAbsolutePath(const std::string& db_path,
|
||||
std::string* output_path) {
|
||||
return posixEnv->GetAbsolutePath(db_path, output_path);
|
||||
}
|
||||
|
||||
virtual void SetBackgroundThreads(int number, Priority pri = LOW) {
|
||||
posixEnv->SetBackgroundThreads(number, pri);
|
||||
}
|
||||
|
||||
virtual std::string TimeToString(uint64_t number) {
|
||||
return posixEnv->TimeToString(number);
|
||||
}
|
||||
|
||||
static uint64_t gettid() {
|
||||
assert(sizeof(pthread_t) <= sizeof(uint64_t));
|
||||
return (uint64_t)pthread_self();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string fsname_; // string of the form "hdfs://hostname:port/"
|
||||
hdfsFS fileSys_; // a single FileSystem object for all files
|
||||
Env* posixEnv; // This object is derived from Env, but not from
|
||||
// posixEnv. We have posixEnv as an encapsulated
|
||||
// object here so that we can use posix timers,
|
||||
// posix threads, etc.
|
||||
|
||||
/**
|
||||
* If the URI is specified of the form hdfs://server:port/path,
|
||||
* then connect to the specified cluster
|
||||
* else connect to default.
|
||||
*/
|
||||
hdfsFS connectToPath(const std::string& uri) {
|
||||
if (uri.empty()) {
|
||||
return NULL;
|
||||
}
|
||||
if (uri.find(kProto) != 0) {
|
||||
// uri doesn't start with hdfs:// -> use default:0, which is special
|
||||
// to libhdfs.
|
||||
return hdfsConnectNewInstance("default", 0);
|
||||
}
|
||||
const std::string hostport = uri.substr(kProto.length());
|
||||
|
||||
std::vector <std::string> parts;
|
||||
split(hostport, ':', parts);
|
||||
if (parts.size() != 2) {
|
||||
throw HdfsFatalException("Bad uri for hdfs " + uri);
|
||||
}
|
||||
// parts[0] = hosts, parts[1] = port/xxx/yyy
|
||||
std::string host(parts[0]);
|
||||
std::string remaining(parts[1]);
|
||||
|
||||
int rem = remaining.find(pathsep);
|
||||
std::string portStr = (rem == 0 ? remaining :
|
||||
remaining.substr(0, rem));
|
||||
|
||||
tPort port;
|
||||
port = atoi(portStr.c_str());
|
||||
if (port == 0) {
|
||||
throw HdfsFatalException("Bad host-port for hdfs " + uri);
|
||||
}
|
||||
hdfsFS fs = hdfsConnectNewInstance(host.c_str(), port);
|
||||
return fs;
|
||||
}
|
||||
|
||||
void split(const std::string &s, char delim,
|
||||
std::vector<std::string> &elems) {
|
||||
elems.clear();
|
||||
size_t prev = 0;
|
||||
size_t pos = s.find(delim);
|
||||
while (pos != std::string::npos) {
|
||||
elems.push_back(s.substr(prev, pos - prev));
|
||||
prev = pos + 1;
|
||||
pos = s.find(delim, prev);
|
||||
}
|
||||
elems.push_back(s.substr(prev, s.size()));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#else // USE_HDFS
|
||||
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static const Status notsup;
|
||||
|
||||
class HdfsEnv : public Env {
|
||||
|
||||
public:
|
||||
HdfsEnv(const std::string& fsname) {
|
||||
fprintf(stderr, "You have not build rocksdb with HDFS support\n");
|
||||
fprintf(stderr, "Please see hdfs/README for details\n");
|
||||
throw std::exception();
|
||||
}
|
||||
|
||||
virtual ~HdfsEnv() {
|
||||
}
|
||||
|
||||
virtual Status NewSequentialFile(const std::string& fname,
|
||||
unique_ptr<SequentialFile>* result,
|
||||
const EnvOptions& options);
|
||||
|
||||
virtual Status NewRandomAccessFile(const std::string& fname,
|
||||
unique_ptr<RandomAccessFile>* result,
|
||||
const EnvOptions& options) {
|
||||
return notsup;
|
||||
}
|
||||
|
||||
virtual Status NewWritableFile(const std::string& fname,
|
||||
unique_ptr<WritableFile>* result,
|
||||
const EnvOptions& options) {
|
||||
return notsup;
|
||||
}
|
||||
|
||||
virtual Status NewRandomRWFile(const std::string& fname,
|
||||
unique_ptr<RandomRWFile>* result,
|
||||
const EnvOptions& options) {
|
||||
return notsup;
|
||||
}
|
||||
|
||||
virtual bool FileExists(const std::string& fname){return false;}
|
||||
|
||||
virtual Status GetChildren(const std::string& path,
|
||||
std::vector<std::string>* result){return notsup;}
|
||||
|
||||
virtual Status DeleteFile(const std::string& fname){return notsup;}
|
||||
|
||||
virtual Status CreateDir(const std::string& name){return notsup;}
|
||||
|
||||
virtual Status CreateDirIfMissing(const std::string& name){return notsup;}
|
||||
|
||||
virtual Status DeleteDir(const std::string& name){return notsup;}
|
||||
|
||||
virtual Status GetFileSize(const std::string& fname, uint64_t* size){return notsup;}
|
||||
|
||||
virtual Status GetFileModificationTime(const std::string& fname,
|
||||
uint64_t* time) {
|
||||
return notsup;
|
||||
}
|
||||
|
||||
virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;}
|
||||
|
||||
virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;}
|
||||
|
||||
virtual Status UnlockFile(FileLock* lock){return notsup;}
|
||||
|
||||
virtual Status NewLogger(const std::string& fname,
|
||||
shared_ptr<Logger>* result){return notsup;}
|
||||
|
||||
virtual void Schedule(void (*function)(void* arg), void* arg,
|
||||
Priority pri = LOW) {}
|
||||
|
||||
virtual void StartThread(void (*function)(void* arg), void* arg) {}
|
||||
|
||||
virtual Status GetTestDirectory(std::string* path) {return notsup;}
|
||||
|
||||
virtual uint64_t NowMicros() {return 0;}
|
||||
|
||||
virtual void SleepForMicroseconds(int micros) {}
|
||||
|
||||
virtual Status GetHostName(char* name, uint64_t len) {return notsup;}
|
||||
|
||||
virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;}
|
||||
|
||||
virtual Status GetAbsolutePath(const std::string& db_path,
|
||||
std::string* outputpath) {return notsup;}
|
||||
|
||||
virtual void SetBackgroundThreads(int number, Priority pri = LOW) {}
|
||||
|
||||
virtual std::string TimeToString(uint64_t number) { return "";}
|
||||
};
|
||||
}
|
||||
|
||||
#endif // USE_HDFS
|
477
hdfs/hdfs.h
Normal file
477
hdfs/hdfs.h
Normal file
@ -0,0 +1,477 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
#ifndef LIBHDFS_HDFS_H
|
||||
#define LIBHDFS_HDFS_H
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <jni.h>
|
||||
|
||||
#ifndef O_RDONLY
|
||||
#define O_RDONLY 1
|
||||
#endif
|
||||
|
||||
#ifndef O_WRONLY
|
||||
#define O_WRONLY 2
|
||||
#endif
|
||||
|
||||
#ifndef EINTERNAL
|
||||
#define EINTERNAL 255
|
||||
#endif
|
||||
|
||||
|
||||
/** All APIs set errno to meaningful values */
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Some utility decls used in libhdfs.
|
||||
*/
|
||||
|
||||
typedef int32_t tSize; /// size of data for read/write io ops
|
||||
typedef time_t tTime; /// time type in seconds
|
||||
typedef int64_t tOffset;/// offset within the file
|
||||
typedef uint16_t tPort; /// port
|
||||
typedef enum tObjectKind {
|
||||
kObjectKindFile = 'F',
|
||||
kObjectKindDirectory = 'D',
|
||||
} tObjectKind;
|
||||
|
||||
|
||||
/**
|
||||
* The C reflection of org.apache.org.hadoop.FileSystem .
|
||||
*/
|
||||
typedef void* hdfsFS;
|
||||
|
||||
|
||||
/**
|
||||
* The C equivalent of org.apache.org.hadoop.FSData(Input|Output)Stream .
|
||||
*/
|
||||
enum hdfsStreamType
|
||||
{
|
||||
UNINITIALIZED = 0,
|
||||
INPUT = 1,
|
||||
OUTPUT = 2,
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* The 'file-handle' to a file in hdfs.
|
||||
*/
|
||||
struct hdfsFile_internal {
|
||||
void* file;
|
||||
enum hdfsStreamType type;
|
||||
};
|
||||
typedef struct hdfsFile_internal* hdfsFile;
|
||||
|
||||
|
||||
/**
|
||||
* hdfsConnectAsUser - Connect to a hdfs file system as a specific user
|
||||
* Connect to the hdfs.
|
||||
* @param host A string containing either a host name, or an ip address
|
||||
* of the namenode of a hdfs cluster. 'host' should be passed as NULL if
|
||||
* you want to connect to local filesystem. 'host' should be passed as
|
||||
* 'default' (and port as 0) to use the 'configured' filesystem
|
||||
* (core-site/core-default.xml).
|
||||
* @param port The port on which the server is listening.
|
||||
* @param user the user name (this is hadoop domain user). Or NULL is equivalent to hdfsConnect(host, port)
|
||||
* @param groups the groups (these are hadoop domain groups)
|
||||
* @return Returns a handle to the filesystem or NULL on error.
|
||||
*/
|
||||
hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
|
||||
|
||||
|
||||
/**
|
||||
* hdfsConnect - Connect to a hdfs file system.
|
||||
* Connect to the hdfs.
|
||||
* @param host A string containing either a host name, or an ip address
|
||||
* of the namenode of a hdfs cluster. 'host' should be passed as NULL if
|
||||
* you want to connect to local filesystem. 'host' should be passed as
|
||||
* 'default' (and port as 0) to use the 'configured' filesystem
|
||||
* (core-site/core-default.xml).
|
||||
* @param port The port on which the server is listening.
|
||||
* @return Returns a handle to the filesystem or NULL on error.
|
||||
*/
|
||||
hdfsFS hdfsConnect(const char* host, tPort port);
|
||||
|
||||
|
||||
/**
|
||||
* These are the same as hdfsConnectAsUser except that every invocation returns a new FileSystem handle.
|
||||
* Applications should call a hdfsDisconnect for every call to hdfsConnectAsUserNewInstance.
|
||||
*/
|
||||
hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
|
||||
hdfsFS hdfsConnectNewInstance(const char* host, tPort port);
|
||||
hdfsFS hdfsConnectPath(const char* uri);
|
||||
|
||||
/**
|
||||
* hdfsDisconnect - Disconnect from the hdfs file system.
|
||||
* Disconnect from hdfs.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsDisconnect(hdfsFS fs);
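
A minimal usage sketch (not part of this header) of the connect/disconnect lifecycle declared above. It assumes libhdfs is linked and that the "default"/port-0 pair resolves through the configured core-site.xml.

#include <stdio.h>
#include "hdfs/hdfs.h"

int main() {
  hdfsFS fs = hdfsConnect("default", 0);  // NULL host would mean the local FS
  if (fs == NULL) {
    fprintf(stderr, "hdfsConnect failed\n");
    return 1;
  }
  // ... use the filesystem handle ...
  if (hdfsDisconnect(fs) != 0) {
    fprintf(stderr, "hdfsDisconnect failed\n");
    return 1;
  }
  return 0;
}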
|
||||
|
||||
|
||||
/**
|
||||
* hdfsOpenFile - Open a hdfs file in given mode.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The full path to the file.
|
||||
* @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite; i.e., implies O_TRUNC),
* O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)), which returns NULL and sets errno to ENOTSUP.
|
||||
* @param bufferSize Size of buffer for read/write - pass 0 if you want
|
||||
* to use the default configured values.
|
||||
* @param replication Block replication - pass 0 if you want to use
|
||||
* the default configured values.
|
||||
* @param blocksize Size of block - pass 0 if you want to use the
|
||||
* default configured values.
|
||||
* @return Returns the handle to the open file or NULL on error.
|
||||
*/
|
||||
hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
|
||||
int bufferSize, short replication, tSize blocksize);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsCloseFile - Close an open file.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsCloseFile(hdfsFS fs, hdfsFile file);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsExists - Checks if a given path exists on the filesystem
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The path to look for
|
||||
* @return Returns 0 if the path exists, 1 if it does not, -1/-2 on error.
|
||||
*/
|
||||
int hdfsExists(hdfsFS fs, const char *path);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsSeek - Seek to given offset in file.
|
||||
* This works only for files opened in read-only mode.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @param desiredPos Offset into the file to seek into.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsTell - Get the current offset in the file, in bytes.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @return Current offset, -1 on error.
|
||||
*/
|
||||
tOffset hdfsTell(hdfsFS fs, hdfsFile file);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsRead - Read data from an open file.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @param buffer The buffer to copy read bytes into.
|
||||
* @param length The length of the buffer.
|
||||
* @return Returns the number of bytes actually read, possibly less
* than length; -1 on error.
|
||||
*/
|
||||
tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length);
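
A sketch tying hdfsOpenFile, hdfsRead, and hdfsCloseFile together: read an existing file sequentially. The path "/tmp/example.txt" is an assumption for illustration only.

#include <stdio.h>
#include "hdfs/hdfs.h"

int main() {
  hdfsFS fs = hdfsConnect("default", 0);
  if (fs == NULL) return 1;
  // 0 for bufferSize/replication/blocksize picks the configured defaults.
  hdfsFile in = hdfsOpenFile(fs, "/tmp/example.txt", O_RDONLY, 0, 0, 0);
  if (in == NULL) { hdfsDisconnect(fs); return 1; }
  char buf[4096];
  tSize n;
  while ((n = hdfsRead(fs, in, buf, (tSize)sizeof(buf))) > 0) {
    fwrite(buf, 1, (size_t)n, stdout);  // forward the bytes that were read
  }
  hdfsCloseFile(fs, in);
  hdfsDisconnect(fs);
  return n < 0 ? 1 : 0;
}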
|
||||
|
||||
|
||||
/**
|
||||
* hdfsPread - Positional read of data from an open file.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @param position Position from which to read
|
||||
* @param buffer The buffer to copy read bytes into.
|
||||
* @param length The length of the buffer.
|
||||
* @return Returns the number of bytes actually read, possibly less
* than length; -1 on error.
|
||||
*/
|
||||
tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position,
|
||||
void* buffer, tSize length);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsWrite - Write data into an open file.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @param buffer The data.
|
||||
* @param length The no. of bytes to write.
|
||||
* @return Returns the number of bytes written, -1 on error.
|
||||
*/
|
||||
tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer,
|
||||
tSize length);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsFlush - Flush the data.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsFlush(hdfsFS fs, hdfsFile file);
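
A companion write-path sketch: O_WRONLY creates or overwrites the file as documented above, and hdfsFlush pushes buffered bytes out before close. The path is again illustrative.

#include <string.h>
#include "hdfs/hdfs.h"

int main() {
  hdfsFS fs = hdfsConnect("default", 0);
  if (fs == NULL) return 1;
  hdfsFile out = hdfsOpenFile(fs, "/tmp/example.txt", O_WRONLY, 0, 0, 0);
  if (out == NULL) { hdfsDisconnect(fs); return 1; }
  const char* msg = "hello hdfs\n";
  int rc = 0;
  if (hdfsWrite(fs, out, msg, (tSize)strlen(msg)) < 0) rc = 1;
  if (hdfsFlush(fs, out) != 0) rc = 1;       // flush buffered data
  if (hdfsCloseFile(fs, out) != 0) rc = 1;
  hdfsDisconnect(fs);
  return rc;
}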
|
||||
|
||||
/**
|
||||
* hdfsSync - Sync the data to persistent store.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsSync(hdfsFS fs, hdfsFile file);
|
||||
|
||||
/**
|
||||
* hdfsGetNumReplicasInPipeline - get number of remaining replicas in
|
||||
* pipeline
|
||||
* @param fs The configured filesystem handle
|
||||
* @param file the file handle
|
||||
* @return returns the # of datanodes in the write pipeline; -1 on error
|
||||
*/
|
||||
int hdfsGetNumCurrentReplicas(hdfsFS, hdfsFile file);
|
||||
|
||||
/**
|
||||
* hdfsAvailable - Number of bytes that can be read from this
|
||||
* input stream without blocking.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param file The file handle.
|
||||
* @return Returns available bytes; -1 on error.
|
||||
*/
|
||||
int hdfsAvailable(hdfsFS fs, hdfsFile file);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsCopy - Copy file from one filesystem to another.
|
||||
* @param srcFS The handle to source filesystem.
|
||||
* @param src The path of source file.
|
||||
* @param dstFS The handle to destination filesystem.
|
||||
* @param dst The path of destination file.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsMove - Move file from one filesystem to another.
|
||||
* @param srcFS The handle to source filesystem.
|
||||
* @param src The path of source file.
|
||||
* @param dstFS The handle to destination filesystem.
|
||||
* @param dst The path of destination file.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsDelete - Delete file.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The path of the file.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsDelete(hdfsFS fs, const char* path);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsRename - Rename file.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param oldPath The path of the source file.
|
||||
* @param newPath The path of the destination file.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsGetWorkingDirectory - Get the current working directory for
|
||||
* the given filesystem.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param buffer The user-buffer to copy path of cwd into.
|
||||
* @param bufferSize The length of user-buffer.
|
||||
* @return Returns buffer, NULL on error.
|
||||
*/
|
||||
char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsSetWorkingDirectory - Set the working directory. All relative
|
||||
* paths will be resolved relative to it.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The path of the new 'cwd'.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsSetWorkingDirectory(hdfsFS fs, const char* path);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsCreateDirectory - Make the given file and all non-existent
|
||||
* parents into directories.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The path of the directory.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsCreateDirectory(hdfsFS fs, const char* path);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsSetReplication - Set the replication of the specified
|
||||
* file to the supplied value
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The path of the file.
|
||||
* @return Returns 0 on success, -1 on error.
|
||||
*/
|
||||
int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsFileInfo - Information about a file/directory.
|
||||
*/
|
||||
typedef struct {
|
||||
tObjectKind mKind; /* file or directory */
|
||||
char *mName; /* the name of the file */
|
||||
tTime mLastMod; /* the last modification time for the file in seconds */
|
||||
tOffset mSize; /* the size of the file in bytes */
|
||||
short mReplication; /* the count of replicas */
|
||||
tOffset mBlockSize; /* the block size for the file */
|
||||
char *mOwner; /* the owner of the file */
|
||||
char *mGroup; /* the group associated with the file */
|
||||
short mPermissions; /* the permissions associated with the file */
|
||||
tTime mLastAccess; /* the last access time for the file in seconds */
|
||||
} hdfsFileInfo;
|
||||
|
||||
|
||||
/**
|
||||
* hdfsListDirectory - Get list of files/directories for a given
|
||||
* directory-path. hdfsFreeFileInfo should be called to deallocate memory if
|
||||
* the function returns non-NULL value.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The path of the directory.
|
||||
* @param numEntries Set to the number of files/directories in path.
|
||||
* @return Returns a dynamically-allocated array of hdfsFileInfo
|
||||
* objects; NULL if empty or on error.
|
||||
* on error, numEntries will be -1.
|
||||
*/
|
||||
hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path,
|
||||
int *numEntries);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsGetPathInfo - Get information about a path as a (dynamically
|
||||
* allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be
|
||||
* called when the pointer is no longer needed.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The path of the file.
|
||||
* @return Returns a dynamically-allocated hdfsFileInfo object;
|
||||
* NULL on error.
|
||||
*/
|
||||
hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields)
|
||||
* @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
|
||||
* objects.
|
||||
* @param numEntries The size of the array.
|
||||
*/
|
||||
void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
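
A short sketch of the list-and-free pattern required by hdfsListDirectory and hdfsFreeFileInfo; "/tmp" is a placeholder directory.

#include <stdio.h>
#include "hdfs/hdfs.h"

int main() {
  hdfsFS fs = hdfsConnect("default", 0);
  if (fs == NULL) return 1;
  int num_entries = 0;
  hdfsFileInfo* entries = hdfsListDirectory(fs, "/tmp", &num_entries);
  for (int i = 0; entries != NULL && i < num_entries; i++) {
    printf("%c %s %lld bytes\n", (char)entries[i].mKind, entries[i].mName,
           (long long)entries[i].mSize);
  }
  if (entries != NULL) {
    hdfsFreeFileInfo(entries, num_entries);  // frees the array and its fields
  }
  hdfsDisconnect(fs);
  return 0;
}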
|
||||
|
||||
|
||||
/**
|
||||
* hdfsGetHosts - Get hostnames where a particular block (determined by
|
||||
* pos & blocksize) of a file is stored. The last element in the array
|
||||
* is NULL. Due to replication, a single block could be present on
|
||||
* multiple hosts.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path The path of the file.
|
||||
* @param start The start of the block.
|
||||
* @param length The length of the block.
|
||||
* @return Returns a dynamically-allocated 2-d array of blocks-hosts;
|
||||
* NULL on error.
|
||||
*/
|
||||
char*** hdfsGetHosts(hdfsFS fs, const char* path,
|
||||
tOffset start, tOffset length);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsFreeHosts - Free up the structure returned by hdfsGetHosts
|
||||
* @param blockHosts The array of dynamically-allocated blocks-hosts
* returned by hdfsGetHosts.
|
||||
*/
|
||||
void hdfsFreeHosts(char ***blockHosts);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsGetDefaultBlockSize - Get the optimum blocksize.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @return Returns the blocksize; -1 on error.
|
||||
*/
|
||||
tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsGetCapacity - Return the raw capacity of the filesystem.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @return Returns the raw-capacity; -1 on error.
|
||||
*/
|
||||
tOffset hdfsGetCapacity(hdfsFS fs);
|
||||
|
||||
|
||||
/**
|
||||
* hdfsGetUsed - Return the total raw size of all files in the filesystem.
|
||||
* @param fs The configured filesystem handle.
|
||||
* @return Returns the total-size; -1 on error.
|
||||
*/
|
||||
tOffset hdfsGetUsed(hdfsFS fs);
|
||||
|
||||
/**
|
||||
* hdfsChown
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path the path to the file or directory
|
||||
* @param owner this is a string in Hadoop land. Set to null or "" if only setting group
|
||||
* @param group this is a string in Hadoop land. Set to null or "" if only setting user
|
||||
* @return 0 on success else -1
|
||||
*/
|
||||
int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group);
|
||||
|
||||
/**
|
||||
* hdfsChmod
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path the path to the file or directory
|
||||
* @param mode the bitmask to set it to
|
||||
* @return 0 on success else -1
|
||||
*/
|
||||
int hdfsChmod(hdfsFS fs, const char* path, short mode);
|
||||
|
||||
/**
|
||||
* hdfsUtime
|
||||
* @param fs The configured filesystem handle.
|
||||
* @param path the path to the file or directory
|
||||
* @param mtime new modification time in seconds, or 0 to set only the access time
* @param atime new access time in seconds, or 0 to set only the modification time
|
||||
* @return 0 on success else -1
|
||||
*/
|
||||
int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /*LIBHDFS_HDFS_H*/
|
||||
|
||||
/**
|
||||
* vim: ts=4: sw=4: et
|
||||
*/
|
BIN
hdfs/libhdfs.a
Normal file
Binary file not shown.
386
helpers/memenv/memenv.cc
Normal file
@ -0,0 +1,386 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "helpers/memenv/memenv.h"
|
||||
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "port/port.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include <map>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
|
||||
class FileState {
|
||||
public:
|
||||
// FileStates are reference counted. The initial reference count is zero
|
||||
// and the caller must call Ref() at least once.
|
||||
FileState() : refs_(0), size_(0) {}
|
||||
|
||||
// Increase the reference count.
|
||||
void Ref() {
|
||||
MutexLock lock(&refs_mutex_);
|
||||
++refs_;
|
||||
}
|
||||
|
||||
// Decrease the reference count. Delete if this is the last reference.
|
||||
void Unref() {
|
||||
bool do_delete = false;
|
||||
|
||||
{
|
||||
MutexLock lock(&refs_mutex_);
|
||||
--refs_;
|
||||
assert(refs_ >= 0);
|
||||
if (refs_ <= 0) {
|
||||
do_delete = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (do_delete) {
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t Size() const { return size_; }
|
||||
|
||||
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
|
||||
if (offset > size_) {
|
||||
return Status::IOError("Offset greater than file size.");
|
||||
}
|
||||
const uint64_t available = size_ - offset;
|
||||
if (n > available) {
|
||||
n = available;
|
||||
}
|
||||
if (n == 0) {
|
||||
*result = Slice();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
size_t block = offset / kBlockSize;
|
||||
size_t block_offset = offset % kBlockSize;
|
||||
|
||||
if (n <= kBlockSize - block_offset) {
|
||||
// The requested bytes are all in the first block.
|
||||
*result = Slice(blocks_[block] + block_offset, n);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
size_t bytes_to_copy = n;
|
||||
char* dst = scratch;
|
||||
|
||||
while (bytes_to_copy > 0) {
|
||||
size_t avail = kBlockSize - block_offset;
|
||||
if (avail > bytes_to_copy) {
|
||||
avail = bytes_to_copy;
|
||||
}
|
||||
memcpy(dst, blocks_[block] + block_offset, avail);
|
||||
|
||||
bytes_to_copy -= avail;
|
||||
dst += avail;
|
||||
block++;
|
||||
block_offset = 0;
|
||||
}
|
||||
|
||||
*result = Slice(scratch, n);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Append(const Slice& data) {
|
||||
const char* src = data.data();
|
||||
size_t src_len = data.size();
|
||||
|
||||
while (src_len > 0) {
|
||||
size_t avail;
|
||||
size_t offset = size_ % kBlockSize;
|
||||
|
||||
if (offset != 0) {
|
||||
// There is some room in the last block.
|
||||
avail = kBlockSize - offset;
|
||||
} else {
|
||||
// No room in the last block; push new one.
|
||||
blocks_.push_back(new char[kBlockSize]);
|
||||
avail = kBlockSize;
|
||||
}
|
||||
|
||||
if (avail > src_len) {
|
||||
avail = src_len;
|
||||
}
|
||||
memcpy(blocks_.back() + offset, src, avail);
|
||||
src_len -= avail;
|
||||
src += avail;
|
||||
size_ += avail;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
// Private since only Unref() should be used to delete it.
|
||||
~FileState() {
|
||||
for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
|
||||
++i) {
|
||||
delete [] *i;
|
||||
}
|
||||
}
|
||||
|
||||
// No copying allowed.
|
||||
FileState(const FileState&);
|
||||
void operator=(const FileState&);
|
||||
|
||||
port::Mutex refs_mutex_;
|
||||
int refs_; // Protected by refs_mutex_;
|
||||
|
||||
// The following fields are not protected by any mutex. They are only mutable
|
||||
// while the file is being written, and concurrent access is not allowed
|
||||
// to writable files.
|
||||
std::vector<char*> blocks_;
|
||||
uint64_t size_;
|
||||
|
||||
enum { kBlockSize = 8 * 1024 };
|
||||
};
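
A worked example of the block arithmetic that Read() and Append() rely on, assuming the 8 KB kBlockSize above: a read at offset 20000 lands at block 20000 / 8192 = 2 with block_offset 20000 % 8192 = 3616, so it stays within blocks_[2] only while n <= 8192 - 3616 = 4576 bytes; anything larger spills into the memcpy loop over the following blocks.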
|
||||
|
||||
class SequentialFileImpl : public SequentialFile {
|
||||
public:
|
||||
explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
|
||||
file_->Ref();
|
||||
}
|
||||
|
||||
~SequentialFileImpl() {
|
||||
file_->Unref();
|
||||
}
|
||||
|
||||
virtual Status Read(size_t n, Slice* result, char* scratch) {
|
||||
Status s = file_->Read(pos_, n, result, scratch);
|
||||
if (s.ok()) {
|
||||
pos_ += result->size();
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
virtual Status Skip(uint64_t n) {
|
||||
if (pos_ > file_->Size()) {
|
||||
return Status::IOError("pos_ > file_->Size()");
|
||||
}
|
||||
const size_t available = file_->Size() - pos_;
|
||||
if (n > available) {
|
||||
n = available;
|
||||
}
|
||||
pos_ += n;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
FileState* file_;
|
||||
size_t pos_;
|
||||
};
|
||||
|
||||
class RandomAccessFileImpl : public RandomAccessFile {
|
||||
public:
|
||||
explicit RandomAccessFileImpl(FileState* file) : file_(file) {
|
||||
file_->Ref();
|
||||
}
|
||||
|
||||
~RandomAccessFileImpl() {
|
||||
file_->Unref();
|
||||
}
|
||||
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const {
|
||||
return file_->Read(offset, n, result, scratch);
|
||||
}
|
||||
|
||||
private:
|
||||
FileState* file_;
|
||||
};
|
||||
|
||||
class WritableFileImpl : public WritableFile {
|
||||
public:
|
||||
WritableFileImpl(FileState* file) : file_(file) {
|
||||
file_->Ref();
|
||||
}
|
||||
|
||||
~WritableFileImpl() {
|
||||
file_->Unref();
|
||||
}
|
||||
|
||||
virtual Status Append(const Slice& data) {
|
||||
return file_->Append(data);
|
||||
}
|
||||
|
||||
virtual Status Close() { return Status::OK(); }
|
||||
virtual Status Flush() { return Status::OK(); }
|
||||
virtual Status Sync() { return Status::OK(); }
|
||||
|
||||
private:
|
||||
FileState* file_;
|
||||
};
|
||||
|
||||
class InMemoryEnv : public EnvWrapper {
|
||||
public:
|
||||
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
|
||||
|
||||
virtual ~InMemoryEnv() {
|
||||
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
|
||||
i->second->Unref();
|
||||
}
|
||||
}
|
||||
|
||||
// Partial implementation of the Env interface.
|
||||
virtual Status NewSequentialFile(const std::string& fname,
|
||||
unique_ptr<SequentialFile>* result,
|
||||
const EnvOptions& soptions) {
|
||||
MutexLock lock(&mutex_);
|
||||
if (file_map_.find(fname) == file_map_.end()) {
|
||||
*result = NULL;
|
||||
return Status::IOError(fname, "File not found");
|
||||
}
|
||||
|
||||
result->reset(new SequentialFileImpl(file_map_[fname]));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status NewRandomAccessFile(const std::string& fname,
|
||||
unique_ptr<RandomAccessFile>* result,
|
||||
const EnvOptions& soptions) {
|
||||
MutexLock lock(&mutex_);
|
||||
if (file_map_.find(fname) == file_map_.end()) {
|
||||
*result = NULL;
|
||||
return Status::IOError(fname, "File not found");
|
||||
}
|
||||
|
||||
result->reset(new RandomAccessFileImpl(file_map_[fname]));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status NewWritableFile(const std::string& fname,
|
||||
unique_ptr<WritableFile>* result,
|
||||
const EnvOptions& soptions) {
|
||||
MutexLock lock(&mutex_);
|
||||
if (file_map_.find(fname) != file_map_.end()) {
|
||||
DeleteFileInternal(fname);
|
||||
}
|
||||
|
||||
FileState* file = new FileState();
|
||||
file->Ref();
|
||||
file_map_[fname] = file;
|
||||
|
||||
result->reset(new WritableFileImpl(file));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual bool FileExists(const std::string& fname) {
|
||||
MutexLock lock(&mutex_);
|
||||
return file_map_.find(fname) != file_map_.end();
|
||||
}
|
||||
|
||||
virtual Status GetChildren(const std::string& dir,
|
||||
std::vector<std::string>* result) {
|
||||
MutexLock lock(&mutex_);
|
||||
result->clear();
|
||||
|
||||
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
|
||||
const std::string& filename = i->first;
|
||||
|
||||
if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
|
||||
Slice(filename).starts_with(Slice(dir))) {
|
||||
result->push_back(filename.substr(dir.size() + 1));
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void DeleteFileInternal(const std::string& fname) {
|
||||
if (file_map_.find(fname) == file_map_.end()) {
|
||||
return;
|
||||
}
|
||||
|
||||
file_map_[fname]->Unref();
|
||||
file_map_.erase(fname);
|
||||
}
|
||||
|
||||
virtual Status DeleteFile(const std::string& fname) {
|
||||
MutexLock lock(&mutex_);
|
||||
if (file_map_.find(fname) == file_map_.end()) {
|
||||
return Status::IOError(fname, "File not found");
|
||||
}
|
||||
|
||||
DeleteFileInternal(fname);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status CreateDir(const std::string& dirname) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status CreateDirIfMissing(const std::string& dirname) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status DeleteDir(const std::string& dirname) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
|
||||
MutexLock lock(&mutex_);
|
||||
if (file_map_.find(fname) == file_map_.end()) {
|
||||
return Status::IOError(fname, "File not found");
|
||||
}
|
||||
|
||||
*file_size = file_map_[fname]->Size();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status GetFileModificationTime(const std::string& fname,
|
||||
uint64_t* time) {
|
||||
return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
|
||||
}
|
||||
|
||||
virtual Status RenameFile(const std::string& src,
|
||||
const std::string& target) {
|
||||
MutexLock lock(&mutex_);
|
||||
if (file_map_.find(src) == file_map_.end()) {
|
||||
return Status::IOError(src, "File not found");
|
||||
}
|
||||
|
||||
DeleteFileInternal(target);
|
||||
file_map_[target] = file_map_[src];
|
||||
file_map_.erase(src);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status LockFile(const std::string& fname, FileLock** lock) {
|
||||
*lock = new FileLock;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status UnlockFile(FileLock* lock) {
|
||||
delete lock;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status GetTestDirectory(std::string* path) {
|
||||
*path = "/test";
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
// Map from filenames to FileState objects, representing a simple file system.
|
||||
typedef std::map<std::string, FileState*> FileSystem;
|
||||
port::Mutex mutex_;
|
||||
FileSystem file_map_; // Protected by mutex_.
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
Env* NewMemEnv(Env* base_env) {
|
||||
return new InMemoryEnv(base_env);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
19
helpers/memenv/memenv.h
Normal file
@ -0,0 +1,19 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
#define STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
namespace rocksdb {

class Env;

// Returns a new environment that stores its data in memory and delegates
// all non-file-storage tasks to base_env. The caller must delete the result
// when it is no longer needed.
// *base_env must remain live while the result is in use.
Env* NewMemEnv(Env* base_env);

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
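
A usage sketch for NewMemEnv (mirroring memenv_test.cc): plug the in-memory Env into Options so the DB never touches disk. The "/in-mem/db" path is arbitrary, since nothing is persisted.

#include <cassert>
#include "helpers/memenv/memenv.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"

int main() {
  rocksdb::Env* mem_env = rocksdb::NewMemEnv(rocksdb::Env::Default());
  rocksdb::Options options;
  options.create_if_missing = true;
  options.env = mem_env;                      // all files live in memory
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/in-mem/db", &db);
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());
  delete db;       // close the DB before tearing down its Env
  delete mem_env;  // the caller owns the Env returned by NewMemEnv
  return 0;
}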
233
helpers/memenv/memenv_test.cc
Normal file
@ -0,0 +1,233 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "helpers/memenv/memenv.h"
|
||||
|
||||
#include "db/db_impl.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/testharness.h"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class MemEnvTest {
|
||||
public:
|
||||
Env* env_;
|
||||
const EnvOptions soptions_;
|
||||
|
||||
MemEnvTest()
|
||||
: env_(NewMemEnv(Env::Default())) {
|
||||
}
|
||||
~MemEnvTest() {
|
||||
delete env_;
|
||||
}
|
||||
};
|
||||
|
||||
TEST(MemEnvTest, Basics) {
|
||||
uint64_t file_size;
|
||||
unique_ptr<WritableFile> writable_file;
|
||||
std::vector<std::string> children;
|
||||
|
||||
ASSERT_OK(env_->CreateDir("/dir"));
|
||||
|
||||
// Check that the directory is empty.
|
||||
ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
|
||||
ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
|
||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
||||
ASSERT_EQ(0U, children.size());
|
||||
|
||||
// Create a file.
|
||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
|
||||
writable_file.reset();
|
||||
|
||||
// Check that the file exists.
|
||||
ASSERT_TRUE(env_->FileExists("/dir/f"));
|
||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
||||
ASSERT_EQ(0U, file_size);
|
||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
||||
ASSERT_EQ(1U, children.size());
|
||||
ASSERT_EQ("f", children[0]);
|
||||
|
||||
// Write to the file.
|
||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
|
||||
ASSERT_OK(writable_file->Append("abc"));
|
||||
writable_file.reset();
|
||||
|
||||
// Check for expected size.
|
||||
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
|
||||
ASSERT_EQ(3U, file_size);
|
||||
|
||||
// Check that renaming works.
|
||||
ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
|
||||
ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
|
||||
ASSERT_TRUE(!env_->FileExists("/dir/f"));
|
||||
ASSERT_TRUE(env_->FileExists("/dir/g"));
|
||||
ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
|
||||
ASSERT_EQ(3U, file_size);
|
||||
|
||||
// Check that opening non-existent file fails.
|
||||
unique_ptr<SequentialFile> seq_file;
|
||||
unique_ptr<RandomAccessFile> rand_file;
|
||||
ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
|
||||
soptions_).ok());
|
||||
ASSERT_TRUE(!seq_file);
|
||||
ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
|
||||
soptions_).ok());
|
||||
ASSERT_TRUE(!rand_file);
|
||||
|
||||
// Check that deleting works.
|
||||
ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
|
||||
ASSERT_OK(env_->DeleteFile("/dir/g"));
|
||||
ASSERT_TRUE(!env_->FileExists("/dir/g"));
|
||||
ASSERT_OK(env_->GetChildren("/dir", &children));
|
||||
ASSERT_EQ(0U, children.size());
|
||||
ASSERT_OK(env_->DeleteDir("/dir"));
|
||||
}
|
||||
|
||||
TEST(MemEnvTest, ReadWrite) {
|
||||
unique_ptr<WritableFile> writable_file;
|
||||
unique_ptr<SequentialFile> seq_file;
|
||||
unique_ptr<RandomAccessFile> rand_file;
|
||||
Slice result;
|
||||
char scratch[100];
|
||||
|
||||
ASSERT_OK(env_->CreateDir("/dir"));
|
||||
|
||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
|
||||
ASSERT_OK(writable_file->Append("hello "));
|
||||
ASSERT_OK(writable_file->Append("world"));
|
||||
writable_file.reset();
|
||||
|
||||
// Read sequentially.
|
||||
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
|
||||
ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
|
||||
ASSERT_EQ(0, result.compare("hello"));
|
||||
ASSERT_OK(seq_file->Skip(1));
|
||||
ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
|
||||
ASSERT_EQ(0, result.compare("world"));
|
||||
ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
|
||||
ASSERT_EQ(0U, result.size());
|
||||
ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
|
||||
ASSERT_OK(seq_file->Read(1000, &result, scratch));
|
||||
ASSERT_EQ(0U, result.size());
|
||||
|
||||
// Random reads.
|
||||
ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
|
||||
ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
|
||||
ASSERT_EQ(0, result.compare("world"));
|
||||
ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
|
||||
ASSERT_EQ(0, result.compare("hello"));
|
||||
ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
|
||||
ASSERT_EQ(0, result.compare("d"));
|
||||
|
||||
// Too high offset.
|
||||
ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
|
||||
}
|
||||
|
||||
TEST(MemEnvTest, Locks) {
|
||||
FileLock* lock;
|
||||
|
||||
// These are no-ops, but we test they return success.
|
||||
ASSERT_OK(env_->LockFile("some file", &lock));
|
||||
ASSERT_OK(env_->UnlockFile(lock));
|
||||
}
|
||||
|
||||
TEST(MemEnvTest, Misc) {
|
||||
std::string test_dir;
|
||||
ASSERT_OK(env_->GetTestDirectory(&test_dir));
|
||||
ASSERT_TRUE(!test_dir.empty());
|
||||
|
||||
unique_ptr<WritableFile> writable_file;
|
||||
ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
|
||||
|
||||
// These are no-ops, but we test they return success.
|
||||
ASSERT_OK(writable_file->Sync());
|
||||
ASSERT_OK(writable_file->Flush());
|
||||
ASSERT_OK(writable_file->Close());
|
||||
writable_file.reset();
|
||||
}
|
||||
|
||||
TEST(MemEnvTest, LargeWrite) {
|
||||
const size_t kWriteSize = 300 * 1024;
|
||||
char* scratch = new char[kWriteSize * 2];
|
||||
|
||||
std::string write_data;
|
||||
for (size_t i = 0; i < kWriteSize; ++i) {
|
||||
write_data.append(1, static_cast<char>(i));
|
||||
}
|
||||
|
||||
unique_ptr<WritableFile> writable_file;
|
||||
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
|
||||
ASSERT_OK(writable_file->Append("foo"));
|
||||
ASSERT_OK(writable_file->Append(write_data));
|
||||
writable_file.reset();
|
||||
|
||||
unique_ptr<SequentialFile> seq_file;
|
||||
Slice result;
|
||||
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
|
||||
ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
|
||||
ASSERT_EQ(0, result.compare("foo"));
|
||||
|
||||
size_t read = 0;
|
||||
std::string read_data;
|
||||
while (read < kWriteSize) {
|
||||
ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
|
||||
read_data.append(result.data(), result.size());
|
||||
read += result.size();
|
||||
}
|
||||
ASSERT_TRUE(write_data == read_data);
|
||||
delete [] scratch;
|
||||
}
|
||||
|
||||
TEST(MemEnvTest, DBTest) {
|
||||
Options options;
|
||||
options.create_if_missing = true;
|
||||
options.env = env_;
|
||||
DB* db;
|
||||
|
||||
const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
|
||||
const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
|
||||
|
||||
ASSERT_OK(DB::Open(options, "/dir/db", &db));
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
std::string res;
|
||||
ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
|
||||
ASSERT_TRUE(res == vals[i]);
|
||||
}
|
||||
|
||||
Iterator* iterator = db->NewIterator(ReadOptions());
|
||||
iterator->SeekToFirst();
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
ASSERT_TRUE(iterator->Valid());
|
||||
ASSERT_TRUE(keys[i] == iterator->key());
|
||||
ASSERT_TRUE(vals[i] == iterator->value());
|
||||
iterator->Next();
|
||||
}
|
||||
ASSERT_TRUE(!iterator->Valid());
|
||||
delete iterator;
|
||||
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
|
||||
ASSERT_OK(dbi->TEST_FlushMemTable());
|
||||
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
std::string res;
|
||||
ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
|
||||
ASSERT_TRUE(res == vals[i]);
|
||||
}
|
||||
|
||||
delete db;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
return rocksdb::test::RunAllTests();
|
||||
}
|
41
include/rocksdb/arena.h
Normal file
@ -0,0 +1,41 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// Arena class defines memory allocation methods. It's used by memtable and
|
||||
// skiplist.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_
|
||||
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Arena {
|
||||
public:
|
||||
Arena() {};
|
||||
virtual ~Arena() {};
|
||||
|
||||
// Return a pointer to a newly allocated memory block of "bytes" bytes.
|
||||
virtual char* Allocate(size_t bytes) = 0;
|
||||
|
||||
// Allocate memory with the normal alignment guarantees provided by malloc.
|
||||
virtual char* AllocateAligned(size_t bytes) = 0;
|
||||
|
||||
// Returns an estimate of the total memory used by arena.
|
||||
virtual const size_t ApproximateMemoryUsage() = 0;
|
||||
|
||||
// Returns the total number of bytes in all blocks allocated so far.
|
||||
virtual const size_t MemoryAllocatedBytes() = 0;
|
||||
|
||||
private:
|
||||
// No copying allowed
|
||||
Arena(const Arena&);
|
||||
void operator=(const Arena&);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_ARENA_H_
|
281
include/rocksdb/c.h
Normal file
@ -0,0 +1,281 @@
|
||||
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
Use of this source code is governed by a BSD-style license that can be
|
||||
found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
C bindings for leveldb. May be useful as a stable ABI that can be
|
||||
used by programs that keep leveldb in a shared library, or for
|
||||
a JNI api.
|
||||
|
||||
Does not support:
|
||||
. getters for the option types
|
||||
. custom comparators that implement key shortening
|
||||
. capturing post-write-snapshot
|
||||
. custom iter, db, env, cache implementations using just the C bindings
|
||||
|
||||
Some conventions:
|
||||
|
||||
(1) We expose just opaque struct pointers and functions to clients.
|
||||
This allows us to change internal representations without having to
|
||||
recompile clients.
|
||||
|
||||
(2) For simplicity, there is no equivalent to the Slice type. Instead,
|
||||
the caller has to pass the pointer and length as separate
|
||||
arguments.
|
||||
|
||||
(3) Errors are represented by a null-terminated c string. NULL
|
||||
means no error. All operations that can raise an error are passed
|
||||
a "char** errptr" as the last argument. One of the following must
|
||||
be true on entry:
|
||||
*errptr == NULL
|
||||
*errptr points to a malloc()ed null-terminated error message
|
||||
On success, a leveldb routine leaves *errptr unchanged.
|
||||
On failure, leveldb frees the old value of *errptr and
|
||||
sets *errptr to a malloc()ed error message.
|
||||
|
||||
(4) Bools have the type unsigned char (0 == false; rest == true)
|
||||
|
||||
(5) All of the pointer arguments must be non-NULL.
|
||||
*/
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_C_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* Exported types */
|
||||
|
||||
typedef struct leveldb_t leveldb_t;
|
||||
typedef struct leveldb_cache_t leveldb_cache_t;
|
||||
typedef struct leveldb_comparator_t leveldb_comparator_t;
|
||||
typedef struct leveldb_env_t leveldb_env_t;
|
||||
typedef struct leveldb_filelock_t leveldb_filelock_t;
|
||||
typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t;
|
||||
typedef struct leveldb_iterator_t leveldb_iterator_t;
|
||||
typedef struct leveldb_logger_t leveldb_logger_t;
|
||||
typedef struct leveldb_options_t leveldb_options_t;
|
||||
typedef struct leveldb_randomfile_t leveldb_randomfile_t;
|
||||
typedef struct leveldb_readoptions_t leveldb_readoptions_t;
|
||||
typedef struct leveldb_seqfile_t leveldb_seqfile_t;
|
||||
typedef struct leveldb_snapshot_t leveldb_snapshot_t;
|
||||
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
|
||||
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
|
||||
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
|
||||
|
||||
/* DB operations */
|
||||
|
||||
extern leveldb_t* leveldb_open(
|
||||
const leveldb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr);
|
||||
|
||||
extern void leveldb_close(leveldb_t* db);
|
||||
|
||||
extern void leveldb_put(
|
||||
leveldb_t* db,
|
||||
const leveldb_writeoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
const char* val, size_t vallen,
|
||||
char** errptr);
|
||||
|
||||
extern void leveldb_delete(
|
||||
leveldb_t* db,
|
||||
const leveldb_writeoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
char** errptr);
|
||||
|
||||
extern void leveldb_write(
|
||||
leveldb_t* db,
|
||||
const leveldb_writeoptions_t* options,
|
||||
leveldb_writebatch_t* batch,
|
||||
char** errptr);
|
||||
|
||||
/* Returns NULL if not found. A malloc()ed array otherwise.
|
||||
Stores the length of the array in *vallen. */
|
||||
extern char* leveldb_get(
|
||||
leveldb_t* db,
|
||||
const leveldb_readoptions_t* options,
|
||||
const char* key, size_t keylen,
|
||||
size_t* vallen,
|
||||
char** errptr);
|
||||
|
||||
extern leveldb_iterator_t* leveldb_create_iterator(
|
||||
leveldb_t* db,
|
||||
const leveldb_readoptions_t* options);
|
||||
|
||||
extern const leveldb_snapshot_t* leveldb_create_snapshot(
|
||||
leveldb_t* db);
|
||||
|
||||
extern void leveldb_release_snapshot(
|
||||
leveldb_t* db,
|
||||
const leveldb_snapshot_t* snapshot);
|
||||
|
||||
/* Returns NULL if property name is unknown.
|
||||
Else returns a pointer to a malloc()-ed null-terminated value. */
|
||||
extern char* leveldb_property_value(
|
||||
leveldb_t* db,
|
||||
const char* propname);
|
||||
|
||||
extern void leveldb_approximate_sizes(
|
||||
leveldb_t* db,
|
||||
int num_ranges,
|
||||
const char* const* range_start_key, const size_t* range_start_key_len,
|
||||
const char* const* range_limit_key, const size_t* range_limit_key_len,
|
||||
uint64_t* sizes);
|
||||
|
||||
extern void leveldb_compact_range(
|
||||
leveldb_t* db,
|
||||
const char* start_key, size_t start_key_len,
|
||||
const char* limit_key, size_t limit_key_len);
|
||||
|
||||
/* Management operations */
|
||||
|
||||
extern void leveldb_destroy_db(
|
||||
const leveldb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr);
|
||||
|
||||
extern void leveldb_repair_db(
|
||||
const leveldb_options_t* options,
|
||||
const char* name,
|
||||
char** errptr);
|
||||
|
||||
/* Iterator */
|
||||
|
||||
extern void leveldb_iter_destroy(leveldb_iterator_t*);
|
||||
extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
|
||||
extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
|
||||
extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
|
||||
extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
|
||||
extern void leveldb_iter_next(leveldb_iterator_t*);
|
||||
extern void leveldb_iter_prev(leveldb_iterator_t*);
|
||||
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
|
||||
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
|
||||
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
|
||||
|
||||
/* Write batch */
|
||||
|
||||
extern leveldb_writebatch_t* leveldb_writebatch_create();
|
||||
extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
|
||||
extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
|
||||
extern void leveldb_writebatch_put(
|
||||
leveldb_writebatch_t*,
|
||||
const char* key, size_t klen,
|
||||
const char* val, size_t vlen);
|
||||
extern void leveldb_writebatch_delete(
|
||||
leveldb_writebatch_t*,
|
||||
const char* key, size_t klen);
|
||||
extern void leveldb_writebatch_iterate(
|
||||
leveldb_writebatch_t*,
|
||||
void* state,
|
||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
|
||||
void (*deleted)(void*, const char* k, size_t klen));
|
||||
|
||||
/* Options */
|
||||
|
||||
extern leveldb_options_t* leveldb_options_create();
|
||||
extern void leveldb_options_destroy(leveldb_options_t*);
|
||||
extern void leveldb_options_set_comparator(
|
||||
leveldb_options_t*,
|
||||
leveldb_comparator_t*);
|
||||
extern void leveldb_options_set_compression_per_level(
|
||||
leveldb_options_t* opt,
|
||||
int* level_values,
|
||||
size_t num_levels);
|
||||
extern void leveldb_options_set_filter_policy(
|
||||
leveldb_options_t*,
|
||||
leveldb_filterpolicy_t*);
|
||||
extern void leveldb_options_set_create_if_missing(
|
||||
leveldb_options_t*, unsigned char);
|
||||
extern void leveldb_options_set_error_if_exists(
|
||||
leveldb_options_t*, unsigned char);
|
||||
extern void leveldb_options_set_paranoid_checks(
|
||||
leveldb_options_t*, unsigned char);
|
||||
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
|
||||
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
|
||||
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
|
||||
extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
|
||||
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
|
||||
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
|
||||
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
|
||||
extern void leveldb_options_set_compression_options(
|
||||
leveldb_options_t* opt, int w_bits, int level, int strategy);
|
||||
|
||||
enum {
|
||||
leveldb_no_compression = 0,
|
||||
leveldb_snappy_compression = 1
|
||||
};
|
||||
extern void leveldb_options_set_compression(leveldb_options_t*, int);
|
||||
|
||||
/* Comparator */
|
||||
|
||||
extern leveldb_comparator_t* leveldb_comparator_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
int (*compare)(
|
||||
void*,
|
||||
const char* a, size_t alen,
|
||||
const char* b, size_t blen),
|
||||
const char* (*name)(void*));
|
||||
extern void leveldb_comparator_destroy(leveldb_comparator_t*);
|
||||
|
||||
/* Filter policy */
|
||||
|
||||
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
|
||||
void* state,
|
||||
void (*destructor)(void*),
|
||||
char* (*create_filter)(
|
||||
void*,
|
||||
const char* const* key_array, const size_t* key_length_array,
|
||||
int num_keys,
|
||||
size_t* filter_length),
|
||||
unsigned char (*key_may_match)(
|
||||
void*,
|
||||
const char* key, size_t length,
|
||||
const char* filter, size_t filter_length),
|
||||
const char* (*name)(void*));
|
||||
extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
|
||||
|
||||
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
|
||||
int bits_per_key);
|
||||
|
||||
/* Read options */
|
||||
|
||||
extern leveldb_readoptions_t* leveldb_readoptions_create();
|
||||
extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
|
||||
extern void leveldb_readoptions_set_verify_checksums(
|
||||
leveldb_readoptions_t*,
|
||||
unsigned char);
|
||||
extern void leveldb_readoptions_set_fill_cache(
|
||||
leveldb_readoptions_t*, unsigned char);
|
||||
extern void leveldb_readoptions_set_snapshot(
|
||||
leveldb_readoptions_t*,
|
||||
const leveldb_snapshot_t*);
|
||||
|
||||
/* Write options */
|
||||
|
||||
extern leveldb_writeoptions_t* leveldb_writeoptions_create();
|
||||
extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
|
||||
extern void leveldb_writeoptions_set_sync(
|
||||
leveldb_writeoptions_t*, unsigned char);
|
||||
|
||||
/* Cache */
|
||||
|
||||
extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
|
||||
extern void leveldb_cache_destroy(leveldb_cache_t* cache);
|
||||
|
||||
/* Env */
|
||||
|
||||
extern leveldb_env_t* leveldb_create_default_env();
|
||||
extern void leveldb_env_destroy(leveldb_env_t*);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* end extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
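
A sketch of the errptr convention described in the comment block at the top of this header: open a database, write a key, and read it back through the C bindings. The on-disk path is a placeholder.

#include <stdio.h>
#include <stdlib.h>
#include "rocksdb/c.h"

int main() {
  char* err = NULL;                 /* NULL on entry, per convention (3) */
  leveldb_options_t* options = leveldb_options_create();
  leveldb_options_set_create_if_missing(options, 1);
  leveldb_t* db = leveldb_open(options, "/tmp/c_api_example", &err);
  if (err != NULL) { fprintf(stderr, "open: %s\n", err); free(err); return 1; }

  leveldb_writeoptions_t* wopts = leveldb_writeoptions_create();
  leveldb_put(db, wopts, "key", 3, "value", 5, &err);
  if (err != NULL) { fprintf(stderr, "put: %s\n", err); free(err); }

  leveldb_readoptions_t* ropts = leveldb_readoptions_create();
  size_t vallen = 0;
  char* val = leveldb_get(db, ropts, "key", 3, &vallen, &err);
  if (val != NULL) { printf("%.*s\n", (int)vallen, val); free(val); }

  leveldb_readoptions_destroy(ropts);
  leveldb_writeoptions_destroy(wopts);
  leveldb_close(db);
  leveldb_options_destroy(options);
  return 0;
}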
|
118
include/rocksdb/cache.h
Normal file
@ -0,0 +1,118 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
//
|
||||
// A Cache is an interface that maps keys to values. It has internal
|
||||
// synchronization and may be safely accessed concurrently from
|
||||
// multiple threads. It may automatically evict entries to make room
|
||||
// for new entries. Values have a specified charge against the cache
|
||||
// capacity. For example, a cache whose values are variable-length
// strings may use the length of the string as the charge for the
// string.
|
||||
//
|
||||
// A builtin cache implementation with a least-recently-used eviction
|
||||
// policy is provided. Clients may use their own implementations if
|
||||
// they want something more sophisticated (like scan-resistance, a
|
||||
// custom eviction policy, variable cache sizing, etc.)
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
|
||||
|
||||
#include <memory>
|
||||
#include <stdint.h>
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
using std::shared_ptr;
|
||||
|
||||
class Cache;
|
||||
|
||||
// Create a new cache with a fixed size capacity. The cache is sharded
|
||||
// to 2^numShardBits shards, by hash of the key. The total capacity
|
||||
// is divided and evenly assigned to each shard. Inside each shard,
|
||||
// the eviction is done in two passes: first try to free space by
// evicting, in least-recently-used order, entries that are among the
// removeScanCountLimit least-used entries and are referenced only by
// the cache itself. If not enough space is freed, further evict
// entries in least-used order.
|
||||
//
|
||||
// The functions without parameter numShardBits and/or removeScanCountLimit
|
||||
// use default values. removeScanCountLimit's default value is 0, which
|
||||
// means a strict LRU order inside each shard.
|
||||
extern shared_ptr<Cache> NewLRUCache(size_t capacity);
|
||||
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
|
||||
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
|
||||
int removeScanCountLimit);
|
||||
|
||||
class Cache {
|
||||
public:
|
||||
Cache() { }
|
||||
|
||||
// Destroys all existing entries by calling the "deleter"
|
||||
// function that was passed to the constructor.
|
||||
virtual ~Cache();
|
||||
|
||||
// Opaque handle to an entry stored in the cache.
|
||||
struct Handle { };
|
||||
|
||||
// Insert a mapping from key->value into the cache and assign it
|
||||
// the specified charge against the total cache capacity.
|
||||
//
|
||||
// Returns a handle that corresponds to the mapping. The caller
|
||||
// must call this->Release(handle) when the returned mapping is no
|
||||
// longer needed.
|
||||
//
|
||||
// When the inserted entry is no longer needed, the key and
|
||||
// value will be passed to "deleter".
|
||||
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
|
||||
void (*deleter)(const Slice& key, void* value)) = 0;
|
||||
|
||||
// If the cache has no mapping for "key", returns nullptr.
|
||||
//
|
||||
// Else return a handle that corresponds to the mapping. The caller
|
||||
// must call this->Release(handle) when the returned mapping is no
|
||||
// longer needed.
|
||||
virtual Handle* Lookup(const Slice& key) = 0;
|
||||
|
||||
// Release a mapping returned by a previous Lookup().
|
||||
// REQUIRES: handle must not have been released yet.
|
||||
// REQUIRES: handle must have been returned by a method on *this.
|
||||
virtual void Release(Handle* handle) = 0;
|
||||
|
||||
// Return the value encapsulated in a handle returned by a
|
||||
// successful Lookup().
|
||||
// REQUIRES: handle must not have been released yet.
|
||||
// REQUIRES: handle must have been returned by a method on *this.
|
||||
virtual void* Value(Handle* handle) = 0;
|
||||
|
||||
// If the cache contains entry for key, erase it. Note that the
|
||||
// underlying entry will be kept around until all existing handles
|
||||
// to it have been released.
|
||||
virtual void Erase(const Slice& key) = 0;
|
||||
|
||||
// Return a new numeric id. May be used by multiple clients who are
|
||||
// sharing the same cache to partition the key space. Typically the
|
||||
// client will allocate a new id at startup and prepend the id to
|
||||
// its cache keys.
|
||||
virtual uint64_t NewId() = 0;
|
||||
|
||||
// returns the maximum configured capacity of the cache
|
||||
virtual size_t GetCapacity() = 0;
|
||||
|
||||
private:
|
||||
void LRU_Remove(Handle* e);
|
||||
void LRU_Append(Handle* e);
|
||||
void Unref(Handle* e);
|
||||
|
||||
struct Rep;
|
||||
Rep* rep_;
|
||||
|
||||
// No copying allowed
|
||||
Cache(const Cache&);
|
||||
void operator=(const Cache&);
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_UTIL_CACHE_H_
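
A sketch of the Insert/Lookup/Release contract documented above, using the built-in LRU cache. The deleter runs once the entry is evicted or erased and no handles remain.

#include <cassert>
#include <memory>
#include <string>
#include "rocksdb/cache.h"
#include "rocksdb/slice.h"

static void DeleteString(const rocksdb::Slice& /*key*/, void* value) {
  delete static_cast<std::string*>(value);
}

int main() {
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(1024 * 1024);
  std::string* value = new std::string("cached payload");
  rocksdb::Cache::Handle* h =
      cache->Insert("some_key", value, value->size(), &DeleteString);
  cache->Release(h);                      // done with the insert handle

  h = cache->Lookup("some_key");
  if (h != nullptr) {
    assert(*static_cast<std::string*>(cache->Value(h)) == "cached payload");
    cache->Release(h);                    // every Lookup needs a Release
  }
  cache->Erase("some_key");               // deleter runs once unreferenced
  return 0;
}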
|
79
include/rocksdb/compaction_filter.h
Normal file
@ -0,0 +1,79 @@
|
||||
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
|
||||
// CompactionFilter allows an application to modify/delete a key-value at
|
||||
// the time of compaction.
|
||||
|
||||
class CompactionFilter {
|
||||
public:
|
||||
|
||||
// Context information of a compaction run
|
||||
struct Context {
|
||||
// Does this compaction run include all data files
|
||||
bool is_full_compaction;
|
||||
};
|
||||
|
||||
virtual ~CompactionFilter() {}
|
||||
|
||||
// The compaction process invokes this
|
||||
// method for kv that is being compacted. A return value
|
||||
// of false indicates that the kv should be preserved in the
|
||||
// output of this compaction run and a return value of true
|
||||
// indicates that this key-value should be removed from the
|
||||
// output of the compaction. The application can inspect
|
||||
// the existing value of the key and make decision based on it.
|
||||
//
|
||||
// When the value is to be preserved, the application has the option
|
||||
// to modify the existing_value and pass it back through new_value.
|
||||
// value_changed needs to be set to true in this case.
|
||||
virtual bool Filter(int level,
|
||||
const Slice& key,
|
||||
const Slice& existing_value,
|
||||
std::string* new_value,
|
||||
bool* value_changed) const = 0;
|
||||
|
||||
// Returns a name that identifies this compaction filter.
|
||||
// The name will be printed to LOG file on start up for diagnosis.
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// Each compaction will create a new CompactionFilter allowing the
|
||||
// application to know about different compactions
|
||||
class CompactionFilterFactory {
|
||||
public:
|
||||
virtual ~CompactionFilterFactory() { };
|
||||
|
||||
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
|
||||
const CompactionFilter::Context& context) = 0;
|
||||
|
||||
// Returns a name that identifies this compaction filter factory.
|
||||
virtual const char* Name() const = 0;
|
||||
};
|
||||
|
||||
// Default implementation of CompactionFilterFactory which does not
|
||||
// return any filter
|
||||
class DefaultCompactionFilterFactory : public CompactionFilterFactory {
|
||||
public:
|
||||
virtual std::unique_ptr<CompactionFilter>
|
||||
CreateCompactionFilter(const CompactionFilter::Context& context) override {
|
||||
return std::unique_ptr<CompactionFilter>(nullptr);
|
||||
}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "DefaultCompactionFilterFactory";
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
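
A sketch of a filter that drops entries whose value is empty, plus the factory that hands one out per compaction, following the interfaces above. The class names are made up for illustration.

#include <memory>
#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

namespace {

class DropEmptyValueFilter : public rocksdb::CompactionFilter {
 public:
  virtual bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
                      const rocksdb::Slice& existing_value,
                      std::string* /*new_value*/,
                      bool* /*value_changed*/) const {
    return existing_value.empty();  // true == remove from compaction output
  }
  virtual const char* Name() const { return "DropEmptyValueFilter"; }
};

class DropEmptyValueFilterFactory : public rocksdb::CompactionFilterFactory {
 public:
  virtual std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
      const rocksdb::CompactionFilter::Context& /*context*/) {
    return std::unique_ptr<rocksdb::CompactionFilter>(
        new DropEmptyValueFilter());
  }
  virtual const char* Name() const { return "DropEmptyValueFilterFactory"; }
};

}  // namespace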
|
63
include/rocksdb/comparator.h
Normal file
@ -0,0 +1,63 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
|
||||
#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Slice;
|
||||
|
||||
// A Comparator object provides a total order across slices that are
|
||||
// used as keys in an sstable or a database. A Comparator implementation
|
||||
// must be thread-safe since rocksdb may invoke its methods concurrently
|
||||
// from multiple threads.
|
||||
class Comparator {
|
||||
public:
|
||||
virtual ~Comparator();
|
||||
|
||||
// Three-way comparison. Returns value:
|
||||
// < 0 iff "a" < "b",
|
||||
// == 0 iff "a" == "b",
|
||||
// > 0 iff "a" > "b"
|
||||
virtual int Compare(const Slice& a, const Slice& b) const = 0;
|
||||
|
||||
// The name of the comparator. Used to check for comparator
|
||||
// mismatches (i.e., a DB created with one comparator is
|
||||
// accessed using a different comparator.
|
||||
//
|
||||
// The client of this package should switch to a new name whenever
|
||||
// the comparator implementation changes in a way that will cause
|
||||
// the relative ordering of any two keys to change.
|
||||
//
|
||||
// Names starting with "rocksdb." are reserved and should not be used
|
||||
// by any clients of this package.
|
||||
virtual const char* Name() const = 0;
|
||||
|
||||
// Advanced functions: these are used to reduce the space requirements
|
||||
// for internal data structures like index blocks.
|
||||
|
||||
// If *start < limit, changes *start to a short string in [start,limit).
|
||||
// Simple comparator implementations may return with *start unchanged,
|
||||
// i.e., an implementation of this method that does nothing is correct.
|
||||
virtual void FindShortestSeparator(
|
||||
std::string* start,
|
||||
const Slice& limit) const = 0;
|
||||
|
||||
// Changes *key to a short string >= *key.
|
||||
// Simple comparator implementations may return with *key unchanged,
|
||||
// i.e., an implementation of this method that does nothing is correct.
|
||||
virtual void FindShortSuccessor(std::string* key) const = 0;
|
||||
};
|
||||
|
||||
// Return a builtin comparator that uses lexicographic byte-wise
|
||||
// ordering. The result remains the property of this module and
|
||||
// must not be deleted.
|
||||
extern const Comparator* BytewiseComparator();
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
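
Not part of the commit: a minimal Comparator sketch that reverses the built-in bytewise ordering by delegating to BytewiseComparator(); the class and registered name are invented for illustration.

#include <string>
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

class ReverseBytewiseComparatorImpl : public rocksdb::Comparator {
 public:
  virtual int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const {
    // Negating the built-in comparison yields the reverse total order.
    return -rocksdb::BytewiseComparator()->Compare(a, b);
  }
  // A distinct name is required: the ordering differs from the default,
  // so a DB created with this comparator must always be reopened with it.
  virtual const char* Name() const { return "example.ReverseBytewise"; }
  // Doing nothing in the two shortening hooks is correct, just not optimal.
  virtual void FindShortestSeparator(std::string* start,
                                     const rocksdb::Slice& limit) const {}
  virtual void FindShortSuccessor(std::string* key) const {}
};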
303
include/rocksdb/db.h
Normal file
@ -0,0 +1,303 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
#define STORAGE_ROCKSDB_INCLUDE_DB_H_

#include <stdint.h>
#include <stdio.h>
#include <memory>
#include <vector>
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"

namespace rocksdb {

using std::unique_ptr;

// Update Makefile if you change these
static const int kMajorVersion = 2;
static const int kMinorVersion = 0;

struct Options;
struct ReadOptions;
struct WriteOptions;
struct FlushOptions;
class WriteBatch;

// Metadata associated with each SST file.
struct LiveFileMetaData {
  std::string name;         // Name of the file
  int level;                // Level at which this file resides.
  size_t size;              // File size in bytes.
  std::string smallestkey;  // Smallest user defined key in the file.
  std::string largestkey;   // Largest user defined key in the file.
  SequenceNumber smallest_seqno;  // smallest seqno in file
  SequenceNumber largest_seqno;   // largest seqno in file
};

// Abstract handle to particular state of a DB.
// A Snapshot is an immutable object and can therefore be safely
// accessed from multiple threads without any external synchronization.
class Snapshot {
 protected:
  virtual ~Snapshot();
};

// A range of keys
struct Range {
  Slice start;  // Included in the range
  Slice limit;  // Not included in the range

  Range() { }
  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};

// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB {
 public:
  // Open the database with the specified "name".
  // Stores a pointer to a heap-allocated database in *dbptr and returns
  // OK on success.
  // Stores nullptr in *dbptr and returns a non-OK status on error.
  // Caller should delete *dbptr when it is no longer needed.
  static Status Open(const Options& options,
                     const std::string& name,
                     DB** dbptr);

  // Open the database for read only. All DB interfaces
  // that modify data, like put/delete, will return error.
  // If the db is opened in read only mode, then no compactions
  // will happen.
  static Status OpenForReadOnly(const Options& options,
                                const std::string& name, DB** dbptr,
                                bool error_if_log_file_exist = false);

  DB() { }
  virtual ~DB();

  // Set the database entry for "key" to "value".
  // Returns OK on success, and a non-OK status on error.
  // Note: consider setting options.sync = true.
  virtual Status Put(const WriteOptions& options,
                     const Slice& key,
                     const Slice& value) = 0;

  // Remove the database entry (if any) for "key". Returns OK on
  // success, and a non-OK status on error. It is not an error if "key"
  // did not exist in the database.
  // Note: consider setting options.sync = true.
  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;

  // Merge the database entry for "key" with "value". Returns OK on success,
  // and a non-OK status on error. The semantics of this operation is
  // determined by the user provided merge_operator when opening DB.
  // Note: consider setting options.sync = true.
  virtual Status Merge(const WriteOptions& options,
                       const Slice& key,
                       const Slice& value) = 0;

  // Apply the specified updates to the database.
  // Returns OK on success, non-OK on failure.
  // Note: consider setting options.sync = true.
  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;

  // If the database contains an entry for "key" store the
  // corresponding value in *value and return OK.
  //
  // If there is no entry for "key" leave *value unchanged and return
  // a status for which Status::IsNotFound() returns true.
  //
  // May return some other Status on an error.
  virtual Status Get(const ReadOptions& options,
                     const Slice& key,
                     std::string* value) = 0;

  // If keys[i] does not exist in the database, then the i'th returned
  // status will be one for which Status::IsNotFound() is true, and
  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
  // the i'th returned status will have Status::ok() true, and (*values)[i]
  // will store the value associated with keys[i].
  //
  // (*values) will always be resized to be the same size as (keys).
  // Similarly, the number of returned statuses will be the number of keys.
  // Note: keys will not be "de-duplicated". Duplicate keys will return
  // duplicate values in order.
  virtual std::vector<Status> MultiGet(const ReadOptions& options,
                                       const std::vector<Slice>& keys,
                                       std::vector<std::string>* values) = 0;

  // If the key definitely does not exist in the database, then this method
  // returns false, else true. If the caller wants to obtain value when the key
  // is found in memory, a bool for 'value_found' must be passed. 'value_found'
  // will be true on return if value has been set properly.
  // This check is potentially lighter-weight than invoking DB::Get(). One way
  // to make this lighter weight is to avoid doing any IOs.
  // Default implementation here returns true and sets 'value_found' to false
  virtual bool KeyMayExist(const ReadOptions& options,
                           const Slice& key,
                           std::string* value,
                           bool* value_found = nullptr) {
    if (value_found != nullptr) {
      *value_found = false;
    }
    return true;
  }

  // Return a heap-allocated iterator over the contents of the database.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  //
  // Caller should delete the iterator when it is no longer needed.
  // The returned iterator should be deleted before this db is deleted.
  virtual Iterator* NewIterator(const ReadOptions& options) = 0;

  // Return a handle to the current DB state. Iterators created with
  // this handle will all observe a stable snapshot of the current DB
  // state. The caller must call ReleaseSnapshot(result) when the
  // snapshot is no longer needed.
  virtual const Snapshot* GetSnapshot() = 0;

  // Release a previously acquired snapshot. The caller must not
  // use "snapshot" after this call.
  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;

  // DB implementations can export properties about their state
  // via this method. If "property" is a valid property understood by this
  // DB implementation, fills "*value" with its current value and returns
  // true. Otherwise returns false.
  //
  //
  // Valid property names include:
  //
  //  "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
  //     where <N> is an ASCII representation of a level number (e.g. "0").
  //  "rocksdb.stats" - returns a multi-line string that describes statistics
  //     about the internal operation of the DB.
  //  "rocksdb.sstables" - returns a multi-line string that describes all
  //     of the sstables that make up the db contents.
  virtual bool GetProperty(const Slice& property, std::string* value) = 0;

  // For each i in [0,n-1], store in "sizes[i]", the approximate
  // file system space used by keys in "[range[i].start .. range[i].limit)".
  //
  // Note that the returned sizes measure file system space usage, so
  // if the user data compresses by a factor of ten, the returned
  // sizes will be one-tenth the size of the corresponding user data size.
  //
  // The results may not include the sizes of recently written data.
  virtual void GetApproximateSizes(const Range* range, int n,
                                   uint64_t* sizes) = 0;

  // Compact the underlying storage for the key range [*begin,*end].
  // In particular, deleted and overwritten versions are discarded,
  // and the data is rearranged to reduce the cost of operations
  // needed to access the data. This operation should typically only
  // be invoked by users who understand the underlying implementation.
  //
  // begin==nullptr is treated as a key before all keys in the database.
  // end==nullptr is treated as a key after all keys in the database.
  // Therefore the following call will compact the entire database:
  //    db->CompactRange(nullptr, nullptr);
  // Note that after the entire database is compacted, all data are pushed
  // down to the last level containing any data. If the total data size
  // after compaction is reduced, that level might not be appropriate for
  // hosting all the files. In this case, client could set reduce_level
  // to true, to move the files back to the minimum level capable of holding
  // the data set or a given level (specified by non-negative target_level).
  virtual void CompactRange(const Slice* begin, const Slice* end,
                            bool reduce_level = false,
                            int target_level = -1) = 0;

  // Number of levels used for this DB.
  virtual int NumberLevels() = 0;

  // Maximum level to which a new compacted memtable is pushed if it
  // does not create overlap.
  virtual int MaxMemCompactionLevel() = 0;

  // Number of files in level-0 that would stop writes.
  virtual int Level0StopWriteTrigger() = 0;

  // Flush all mem-table data.
  virtual Status Flush(const FlushOptions& options) = 0;

  // Prevent file deletions. Compactions will continue to occur,
  // but no obsolete files will be deleted. Calling this multiple
  // times has the same effect as calling it once.
  virtual Status DisableFileDeletions() = 0;

  // Allow compactions to delete obsolete files.
  virtual Status EnableFileDeletions() = 0;

  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup

  // THIS METHOD IS DEPRECATED. Use the GetTableMetaData to get more
  // detailed information on the live files.
  // Retrieve the list of all files in the database. The files are
  // relative to the dbname and are not absolute paths. The valid size of the
  // manifest file is returned in manifest_file_size. The manifest file is an
  // ever growing file, but only the portion specified by manifest_file_size is
  // valid for this snapshot.
  // Setting flush_memtable to true does Flush before recording the live files.
  // Setting flush_memtable to false is useful when we don't want to wait for
  // flush which may have to wait for compaction to complete taking an
  // indeterminate time. But this will have to use GetSortedWalFiles after
  // GetLiveFiles to compensate for memtables missed in this snapshot due to the
  // absence of Flush, by using WAL files to recover the database consistently later.
  virtual Status GetLiveFiles(std::vector<std::string>&,
                              uint64_t* manifest_file_size,
                              bool flush_memtable = true) = 0;

  // Retrieve the sorted list of all wal files with earliest file first
  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;

  // The sequence number of the most recent transaction.
  virtual SequenceNumber GetLatestSequenceNumber() const = 0;

  // Sets iter to an iterator that is positioned at a write-batch containing
  // seq_number. If the sequence number is non-existent, it returns an iterator
  // at the first available seq_no after the requested seq_no
  // Returns Status::OK if iterator is valid
  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
  // use this api, else the WAL files will get
  // cleared aggressively and the iterator might keep getting invalid before
  // an update is read.
  virtual Status GetUpdatesSince(SequenceNumber seq_number,
                                 unique_ptr<TransactionLogIterator>* iter) = 0;

  // Delete the file name from the db directory and update the internal state to
  // reflect that. Supports deletion of sst and log files only. 'name' must be
  // path relative to the db directory. e.g. 000001.sst, /archive/000003.log
  virtual Status DeleteFile(std::string name) = 0;

  // Returns a list of all table files with their level, start key
  // and end key
  virtual void GetLiveFilesMetaData(
    std::vector<LiveFileMetaData> *metadata) {
  }

 private:
  // No copying allowed
  DB(const DB&);
  void operator=(const DB&);
};

// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options);

// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_
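
Not part of the commit: a minimal open/write/read round trip against the DB interface above, assuming an Options struct with a create_if_missing flag and an arbitrary /tmp/rocksdb_example path.

#include <cassert>
#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_example", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key1", "value1");
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key1", &value);
  assert(s.ok() && value == "value1");

  delete db;  // The caller owns the heap-allocated handle returned by Open().
  return 0;
}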
646
include/rocksdb/env.h
Normal file
@ -0,0 +1,646 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the rocksdb implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine grain control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.

#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
#define STORAGE_ROCKSDB_INCLUDE_ENV_H_

#include <cstdarg>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include "rocksdb/status.h"

namespace rocksdb {

class FileLock;
class Logger;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class RandomRWFile;
struct Options;

using std::unique_ptr;
using std::shared_ptr;


// Options while opening a file to read/write
struct EnvOptions {

  // construct with default Options
  EnvOptions();

  // construct from Options
  explicit EnvOptions(const Options& options);

  // If true, then allow caching of data in environment buffers
  bool use_os_buffer;

  // If true, then use mmap to read data
  bool use_mmap_reads;

  // If true, then use mmap to write data
  bool use_mmap_writes;

  // If true, set the FD_CLOEXEC on open fd.
  bool set_fd_cloexec;

  // Allows OS to incrementally sync files to disk while they are being
  // written, in the background. Issue one request for every bytes_per_sync
  // written. 0 turns it off.
  // Default: 0
  uint64_t bytes_per_sync;

};

class Env {
 public:
  Env() { }
  virtual ~Env();

  // Return a default environment suitable for the current operating
  // system. Sophisticated users may wish to provide their own Env
  // implementation instead of relying on this default environment.
  //
  // The result of Default() belongs to rocksdb and must never be deleted.
  static Env* Default();

  // Create a brand new sequentially-readable file with the specified name.
  // On success, stores a pointer to the new file in *result and returns OK.
  // On failure stores nullptr in *result and returns non-OK. If the file does
  // not exist, returns a non-OK status.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewSequentialFile(const std::string& fname,
                                   unique_ptr<SequentialFile>* result,
                                   const EnvOptions& options)
      = 0;

  // Create a brand new random access read-only file with the
  // specified name. On success, stores a pointer to the new file in
  // *result and returns OK. On failure stores nullptr in *result and
  // returns non-OK. If the file does not exist, returns a non-OK
  // status.
  //
  // The returned file may be concurrently accessed by multiple threads.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     unique_ptr<RandomAccessFile>* result,
                                     const EnvOptions& options)
      = 0;

  // Create an object that writes to a new file with the specified
  // name. Deletes any existing file with the same name and creates a
  // new file. On success, stores a pointer to the new file in
  // *result and returns OK. On failure stores nullptr in *result and
  // returns non-OK.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewWritableFile(const std::string& fname,
                                 unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) = 0;

  // Create an object that both reads and writes to a file on
  // specified offsets (random access). If file already exists,
  // does not overwrite it. On success, stores a pointer to the
  // new file in *result and returns OK. On failure stores nullptr
  // in *result and returns non-OK.
  virtual Status NewRandomRWFile(const std::string& fname,
                                 unique_ptr<RandomRWFile>* result,
                                 const EnvOptions& options) = 0;

  // Returns true iff the named file exists.
  virtual bool FileExists(const std::string& fname) = 0;

  // Store in *result the names of the children of the specified directory.
  // The names are relative to "dir".
  // Original contents of *results are dropped.
  virtual Status GetChildren(const std::string& dir,
                             std::vector<std::string>* result) = 0;

  // Delete the named file.
  virtual Status DeleteFile(const std::string& fname) = 0;

  // Create the specified directory. Returns error if directory exists.
  virtual Status CreateDir(const std::string& dirname) = 0;

  // Creates directory if missing. Returns OK if it exists, or if it was
  // successfully created.
  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;

  // Delete the specified directory.
  virtual Status DeleteDir(const std::string& dirname) = 0;

  // Store the size of fname in *file_size.
  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;

  // Store the last modification time of fname in *file_mtime.
  virtual Status GetFileModificationTime(const std::string& fname,
                                         uint64_t* file_mtime) = 0;
  // Rename file src to target.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) = 0;

  // Lock the specified file. Used to prevent concurrent access to
  // the same db by multiple processes. On failure, stores nullptr in
  // *lock and returns non-OK.
  //
  // On success, stores a pointer to the object that represents the
  // acquired lock in *lock and returns OK. The caller should call
  // UnlockFile(*lock) to release the lock. If the process exits,
  // the lock will be automatically released.
  //
  // If somebody else already holds the lock, finishes immediately
  // with a failure. I.e., this call does not wait for existing locks
  // to go away.
  //
  // May create the named file if it does not already exist.
  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;

  // Release the lock acquired by a previous successful call to LockFile.
  // REQUIRES: lock was returned by a successful LockFile() call
  // REQUIRES: lock has not already been unlocked.
  virtual Status UnlockFile(FileLock* lock) = 0;

  enum Priority { LOW, HIGH, TOTAL };

  // Arrange to run "(*function)(arg)" once in a background thread, in
  // the thread pool specified by pri. By default, jobs go to the 'LOW'
  // priority thread pool.

  // "function" may run in an unspecified thread. Multiple functions
  // added to the same Env may run concurrently in different threads.
  // I.e., the caller may not assume that background work items are
  // serialized.
  virtual void Schedule(
      void (*function)(void* arg),
      void* arg,
      Priority pri = LOW) = 0;

  // Start a new thread, invoking "function(arg)" within the new thread.
  // When "function(arg)" returns, the thread will be destroyed.
  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;

  // *path is set to a temporary directory that can be used for testing. It may
  // or may not have just been created. The directory may or may not differ
  // between runs of the same process, but subsequent calls will return the
  // same directory.
  virtual Status GetTestDirectory(std::string* path) = 0;

  // Create and return a log file for storing informational messages.
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) = 0;

  // Returns the number of micro-seconds since some fixed point in time. Only
  // useful for computing deltas of time.
  virtual uint64_t NowMicros() = 0;

  // Returns the number of nano-seconds since some fixed point in time. Only
  // useful for computing deltas of time in one run.
  // Default implementation simply relies on NowMicros
  virtual uint64_t NowNanos() {
    return NowMicros() * 1000;
  }

  // Sleep/delay the thread for the prescribed number of micro-seconds.
  virtual void SleepForMicroseconds(int micros) = 0;

  // Get the current host name.
  virtual Status GetHostName(char* name, uint64_t len) = 0;

  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
  virtual Status GetCurrentTime(int64_t* unix_time) = 0;

  // Get full directory name for this db.
  virtual Status GetAbsolutePath(const std::string& db_path,
                                 std::string* output_path) = 0;

  // The number of background worker threads of a specific thread pool
  // for this environment. 'LOW' is the default pool.
  // default number: 1
  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;

  // Converts seconds-since-Jan-01-1970 to a printable string
  virtual std::string TimeToString(uint64_t time) = 0;

  // Generates a unique id that can be used to identify a db
  virtual std::string GenerateUniqueId();

 private:
  // No copying allowed
  Env(const Env&);
  void operator=(const Env&);
};

// A file abstraction for reading sequentially through a file
class SequentialFile {
 public:
  SequentialFile() { }
  virtual ~SequentialFile();

  // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
  // written by this routine. Sets "*result" to the data that was
  // read (including if fewer than "n" bytes were successfully read).
  // May set "*result" to point at data in "scratch[0..n-1]", so
  // "scratch[0..n-1]" must be live when "*result" is used.
  // If an error was encountered, returns a non-OK status.
  //
  // REQUIRES: External synchronization
  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;

  // Skip "n" bytes from the file. This is guaranteed to be no
  // slower than reading the same data, but may be faster.
  //
  // If end of file is reached, skipping will stop at the end of the
  // file, and Skip will return OK.
  //
  // REQUIRES: External synchronization
  virtual Status Skip(uint64_t n) = 0;

  // Remove any kind of caching of data from the offset to offset+length
  // of this file. If the length is 0, then it refers to the end of file.
  // If the system is not caching the file contents, then this is a noop.
  virtual Status InvalidateCache(size_t offset, size_t length) {
    return Status::NotSupported("InvalidateCache not supported.");
  }
};

// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
 public:
  RandomAccessFile() { }
  virtual ~RandomAccessFile();

  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read). May set "*result" to point at data in
  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
  // "*result" is used. If an error was encountered, returns a non-OK
  // status.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const = 0;

  // Tries to get an unique ID for this file that will be the same each time
  // the file is opened (and will stay the same while the file is open).
  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
  // ID can be created this function returns the length of the ID and places it
  // in "id"; otherwise, this function returns 0, in which case "id"
  // may not have been modified.
  //
  // This function guarantees, for IDs from a given environment, two unique ids
  // cannot be made equal to each other by adding arbitrary bytes to one of
  // them. That is, no unique ID is the prefix of another.
  //
  // This function guarantees that the returned ID will not be interpretable as
  // a single varint.
  //
  // Note: these IDs are only valid for the duration of the process.
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    return 0; // Default implementation to prevent issues with backwards
              // compatibility.
  };


  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };

  virtual void Hint(AccessPattern pattern) {}

  // Remove any kind of caching of data from the offset to offset+length
  // of this file. If the length is 0, then it refers to the end of file.
  // If the system is not caching the file contents, then this is a noop.
  virtual Status InvalidateCache(size_t offset, size_t length) {
    return Status::NotSupported("InvalidateCache not supported.");
  }
};

// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
 public:
  WritableFile() : last_preallocated_block_(0), preallocation_block_size_(0) {
  }
  virtual ~WritableFile();

  virtual Status Append(const Slice& data) = 0;
  virtual Status Close() = 0;
  virtual Status Flush() = 0;
  virtual Status Sync() = 0; // sync data

  /*
   * Sync data and/or metadata as well.
   * By default, sync only data.
   * Override this method for environments where we need to sync
   * metadata as well.
   */
  virtual Status Fsync() {
    return Sync();
  }

  /*
   * Get the size of valid data in the file.
   */
  virtual uint64_t GetFileSize() {
    return 0;
  }

  /*
   * Get and set the default pre-allocation block size for writes to
   * this file. If non-zero, then Allocate will be used to extend the
   * underlying storage of a file (generally via fallocate) if the Env
   * instance supports it.
   */
  void SetPreallocationBlockSize(size_t size) {
    preallocation_block_size_ = size;
  }

  virtual void GetPreallocationStatus(size_t* block_size,
                                      size_t* last_allocated_block) {
    *last_allocated_block = last_preallocated_block_;
    *block_size = preallocation_block_size_;
  }

  // For documentation, refer to RandomAccessFile::GetUniqueId()
  virtual size_t GetUniqueId(char* id, size_t max_size) const {
    return 0; // Default implementation to prevent issues with backwards
              // compatibility.
  }

  // Remove any kind of caching of data from the offset to offset+length
  // of this file. If the length is 0, then it refers to the end of file.
  // If the system is not caching the file contents, then this is a noop.
  // This call has no effect on dirty pages in the cache.
  virtual Status InvalidateCache(size_t offset, size_t length) {
    return Status::NotSupported("InvalidateCache not supported.");
  }

 protected:
  // PrepareWrite performs any necessary preparation for a write
  // before the write actually occurs. This allows for pre-allocation
  // of space on devices where it can result in less file
  // fragmentation and/or less waste from over-zealous filesystem
  // pre-allocation.
  void PrepareWrite(size_t offset, size_t len) {
    if (preallocation_block_size_ == 0) {
      return;
    }
    // If this write would cross one or more preallocation blocks,
    // determine what the last preallocation block necessary to
    // cover this write would be and Allocate to that point.
    const auto block_size = preallocation_block_size_;
    size_t new_last_preallocated_block =
        (offset + len + block_size - 1) / block_size;
    if (new_last_preallocated_block > last_preallocated_block_) {
      size_t num_spanned_blocks =
          new_last_preallocated_block - last_preallocated_block_;
      Allocate(block_size * last_preallocated_block_,
               block_size * num_spanned_blocks);
      last_preallocated_block_ = new_last_preallocated_block;
    }
  }

  /*
   * Pre-allocate space for a file.
   */
  virtual Status Allocate(off_t offset, off_t len) {
    return Status::OK();
  }

  // Sync a file range with disk.
  // offset is the starting byte of the file range to be synchronized.
  // nbytes specifies the length of the range to be synchronized.
  // This asks the OS to initiate flushing the cached data to disk,
  // without waiting for completion.
  // Default implementation does nothing.
  virtual Status RangeSync(off_t offset, off_t nbytes) {
    return Status::OK();
  }

 private:
  size_t last_preallocated_block_;
  size_t preallocation_block_size_;
  // No copying allowed
  WritableFile(const WritableFile&);
  void operator=(const WritableFile&);
};

// A file abstraction for random reading and writing.
class RandomRWFile {
 public:
  RandomRWFile() {}
  virtual ~RandomRWFile() {}

  // Write data from Slice data to file starting from offset
  // Returns IOError on failure, but does not guarantee
  // atomicity of a write. Returns OK status on success.
  //
  // Safe for concurrent use.
  virtual Status Write(uint64_t offset, const Slice& data) = 0;
  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read). May set "*result" to point at data in
  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
  // "*result" is used. If an error was encountered, returns a non-OK
  // status.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      char* scratch) const = 0;
  virtual Status Close() = 0; // closes the file
  virtual Status Sync() = 0; // sync data

  /*
   * Sync data and/or metadata as well.
   * By default, sync only data.
   * Override this method for environments where we need to sync
   * metadata as well.
   */
  virtual Status Fsync() {
    return Sync();
  }

  /*
   * Pre-allocate space for a file.
   */
  virtual Status Allocate(off_t offset, off_t len) {
    return Status::OK();
  }

 private:
  // No copying allowed
  RandomRWFile(const RandomRWFile&);
  void operator=(const RandomRWFile&);
};

// An interface for writing log messages.
class Logger {
 public:
  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
  Logger() { }
  virtual ~Logger();

  // Write an entry to the log file with the specified format.
  virtual void Logv(const char* format, va_list ap) = 0;
  virtual size_t GetLogFileSize() const {
    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
  }
  // Flush to the OS buffers
  virtual void Flush() {}

 private:
  // No copying allowed
  Logger(const Logger&);
  void operator=(const Logger&);
};


// Identifies a locked file.
class FileLock {
 public:
  FileLock() { }
  virtual ~FileLock();
 private:
  // No copying allowed
  FileLock(const FileLock&);
  void operator=(const FileLock&);
};


extern void LogFlush(const shared_ptr<Logger>& info_log);

// Log the specified data to *info_log if info_log is non-nullptr.
extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
#   if defined(__GNUC__) || defined(__clang__)
    __attribute__((__format__ (__printf__, 2, 3)))
#   endif
    ;

extern void LogFlush(Logger *info_log);

extern void Log(Logger* info_log, const char* format, ...)
#   if defined(__GNUC__) || defined(__clang__)
    __attribute__((__format__ (__printf__, 2, 3)))
#   endif
    ;

// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
                                const std::string& fname);

// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
                               std::string* data);

// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
 public:
  // Initialize an EnvWrapper that delegates all calls to *t
  explicit EnvWrapper(Env* t) : target_(t) { }
  virtual ~EnvWrapper();

  // Return the target to which this Env forwards all calls
  Env* target() const { return target_; }

  // The following text is boilerplate that forwards all methods to target()
  Status NewSequentialFile(const std::string& f,
                           unique_ptr<SequentialFile>* r,
                           const EnvOptions& options) {
    return target_->NewSequentialFile(f, r, options);
  }
  Status NewRandomAccessFile(const std::string& f,
                             unique_ptr<RandomAccessFile>* r,
                             const EnvOptions& options) {
    return target_->NewRandomAccessFile(f, r, options);
  }
  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
                         const EnvOptions& options) {
    return target_->NewWritableFile(f, r, options);
  }
  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
                         const EnvOptions& options) {
    return target_->NewRandomRWFile(f, r, options);
  }
  bool FileExists(const std::string& f) { return target_->FileExists(f); }
  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
    return target_->GetChildren(dir, r);
  }
  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
  Status CreateDirIfMissing(const std::string& d) {
    return target_->CreateDirIfMissing(d);
  }
  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
  Status GetFileSize(const std::string& f, uint64_t* s) {
    return target_->GetFileSize(f, s);
  }

  Status GetFileModificationTime(const std::string& fname,
                                 uint64_t* file_mtime) {
    return target_->GetFileModificationTime(fname, file_mtime);
  }

  Status RenameFile(const std::string& s, const std::string& t) {
    return target_->RenameFile(s, t);
  }
  Status LockFile(const std::string& f, FileLock** l) {
    return target_->LockFile(f, l);
  }
  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
  void Schedule(void (*f)(void*), void* a, Priority pri) {
    return target_->Schedule(f, a, pri);
  }
  void StartThread(void (*f)(void*), void* a) {
    return target_->StartThread(f, a);
  }
  virtual Status GetTestDirectory(std::string* path) {
    return target_->GetTestDirectory(path);
  }
  virtual Status NewLogger(const std::string& fname,
                           shared_ptr<Logger>* result) {
    return target_->NewLogger(fname, result);
  }
  uint64_t NowMicros() {
    return target_->NowMicros();
  }
  void SleepForMicroseconds(int micros) {
    target_->SleepForMicroseconds(micros);
  }
  Status GetHostName(char* name, uint64_t len) {
    return target_->GetHostName(name, len);
  }
  Status GetCurrentTime(int64_t* unix_time) {
    return target_->GetCurrentTime(unix_time);
  }
  Status GetAbsolutePath(const std::string& db_path,
                         std::string* output_path) {
    return target_->GetAbsolutePath(db_path, output_path);
  }
  void SetBackgroundThreads(int num, Priority pri) {
    return target_->SetBackgroundThreads(num, pri);
  }
  std::string TimeToString(uint64_t time) {
    return target_->TimeToString(time);
  }

 private:
  Env* target_;
};

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_
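
Not part of the commit: a sketch of the EnvWrapper pattern described above, overriding a single method (NewWritableFile) to count file creations and forwarding everything else to the wrapped Env; the class and counter names are illustrative.

#include <atomic>
#include <memory>
#include <string>
#include "rocksdb/env.h"

class CountingEnv : public rocksdb::EnvWrapper {
 public:
  explicit CountingEnv(rocksdb::Env* base) : rocksdb::EnvWrapper(base) {}

  virtual rocksdb::Status NewWritableFile(
      const std::string& fname,
      std::unique_ptr<rocksdb::WritableFile>* result,
      const rocksdb::EnvOptions& options) {
    writable_files_created_++;  // Count the call, then forward to the wrapped Env.
    return target()->NewWritableFile(fname, result, options);
  }

  std::atomic<int> writable_files_created_{0};
};

An instance wrapping Env::Default() would typically be plugged in through the Env pointer in Options when the database is opened.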
70
include/rocksdb/filter_policy.h
Normal file
@ -0,0 +1,70 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A database can be configured with a custom FilterPolicy object.
// This object is responsible for creating a small filter from a set
// of keys. These filters are stored in rocksdb and are consulted
// automatically by rocksdb to decide whether or not to read some
// information from disk. In many cases, a filter can cut down the
// number of disk seeks from a handful to a single disk seek per
// DB::Get() call.
//
// Most people will want to use the builtin bloom filter support (see
// NewBloomFilterPolicy() below).

#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_

#include <string>

namespace rocksdb {

class Slice;

class FilterPolicy {
 public:
  virtual ~FilterPolicy();

  // Return the name of this policy. Note that if the filter encoding
  // changes in an incompatible way, the name returned by this method
  // must be changed. Otherwise, old incompatible filters may be
  // passed to methods of this type.
  virtual const char* Name() const = 0;

  // keys[0,n-1] contains a list of keys (potentially with duplicates)
  // that are ordered according to the user supplied comparator.
  // Append a filter that summarizes keys[0,n-1] to *dst.
  //
  // Warning: do not change the initial contents of *dst. Instead,
  // append the newly constructed filter to *dst.
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
      const = 0;

  // "filter" contains the data appended by a preceding call to
  // CreateFilter() on this class. This method must return true if
  // the key was in the list of keys passed to CreateFilter().
  // This method may return true or false if the key was not on the
  // list, but it should aim to return false with a high probability.
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
};

// Return a new filter policy that uses a bloom filter with approximately
// the specified number of bits per key. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate.
//
// Callers must delete the result after any database that is using the
// result has been closed.
//
// Note: if you are using a custom comparator that ignores some parts
// of the keys being compared, you must not use NewBloomFilterPolicy()
// and must provide your own FilterPolicy that also ignores the
// corresponding parts of the keys. For example, if the comparator
// ignores trailing spaces, it would be incorrect to use a
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);

}

#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
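
Not part of the commit: enabling the built-in bloom filter declared above, assuming the Options struct exposes a filter_policy pointer as in LevelDB; 10 bits per key matches the ~1% false positive figure in the comment, and the path is arbitrary.

#include <cassert>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"

rocksdb::DB* OpenWithBloomFilter(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.filter_policy = rocksdb::NewBloomFilterPolicy(10);

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
  assert(s.ok());
  // Per the header, the filter policy must outlive the DB and be deleted
  // by the caller only after the DB using it has been closed.
  return db;
}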
64
include/rocksdb/flush_block_policy.h
Normal file
@ -0,0 +1,64 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#pragma once

#include <string>

namespace rocksdb {

class Slice;
class BlockBuilder;

// FlushBlockPolicy provides a configurable way to determine when to flush a
// block in the block based tables.
class FlushBlockPolicy {
 public:
  // Keep track of the key/value sequences and return the boolean value to
  // determine if table builder should flush current data block.
  virtual bool Update(const Slice& key,
                      const Slice& value) = 0;

  virtual ~FlushBlockPolicy() { }
};

class FlushBlockPolicyFactory {
 public:
  // Return the name of the flush block policy.
  virtual const char* Name() const = 0;

  // Return a new block flush policy that flushes data blocks by data size.
  // FlushBlockPolicy may need to access the metadata of the data block
  // builder to determine when to flush the blocks.
  //
  // Callers must delete the result after any database that is using the
  // result has been closed.
  virtual FlushBlockPolicy* NewFlushBlockPolicy(
      const BlockBuilder& data_block_builder) const = 0;

  virtual ~FlushBlockPolicyFactory() { }
};

class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
 public:
  FlushBlockBySizePolicyFactory(const uint64_t block_size,
                                const uint64_t block_size_deviation) :
      block_size_(block_size),
      block_size_deviation_(block_size_deviation) {
  }

  virtual const char* Name() const override {
    return "FlushBlockBySizePolicyFactory";
  }

  virtual FlushBlockPolicy* NewFlushBlockPolicy(
      const BlockBuilder& data_block_builder) const override;

 private:
  const uint64_t block_size_;
  const uint64_t block_size_deviation_;
};

} // rocksdb
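
Not part of the commit: a sketch of a custom policy for the interface above that cuts a data block after a fixed number of entries, together with the factory that creates it. Both class names are invented, and wiring the factory into table construction is left out since that depends on options not shown in this diff.

#include <stdint.h>
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/slice.h"

class FlushEveryNEntriesPolicy : public rocksdb::FlushBlockPolicy {
 public:
  explicit FlushEveryNEntriesPolicy(uint64_t n) : n_(n), seen_(0) {}
  virtual bool Update(const rocksdb::Slice& key, const rocksdb::Slice& value) {
    // Returning true asks the table builder to flush the current data block.
    return ++seen_ % n_ == 0;
  }
 private:
  const uint64_t n_;
  uint64_t seen_;
};

class FlushEveryNEntriesPolicyFactory : public rocksdb::FlushBlockPolicyFactory {
 public:
  explicit FlushEveryNEntriesPolicyFactory(uint64_t n) : n_(n) {}
  virtual const char* Name() const { return "FlushEveryNEntriesPolicyFactory"; }
  virtual rocksdb::FlushBlockPolicy* NewFlushBlockPolicy(
      const rocksdb::BlockBuilder& data_block_builder) const {
    return new FlushEveryNEntriesPolicy(n_);
  }
 private:
  const uint64_t n_;
};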
102
include/rocksdb/iterator.h
Normal file
@ -0,0 +1,102 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An iterator yields a sequence of key/value pairs from a source.
// The following class defines the interface. Multiple implementations
// are provided by this library. In particular, iterators are provided
// to access the contents of a Table or a DB.
//
// Multiple threads can invoke const methods on an Iterator without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Iterator must use
// external synchronization.

#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_

#include "rocksdb/slice.h"
#include "rocksdb/status.h"

namespace rocksdb {

class Iterator {
 public:
  Iterator();
  virtual ~Iterator();

  // An iterator is either positioned at a key/value pair, or
  // not valid. This method returns true iff the iterator is valid.
  virtual bool Valid() const = 0;

  // Position at the first key in the source. The iterator is Valid()
  // after this call iff the source is not empty.
  virtual void SeekToFirst() = 0;

  // Position at the last key in the source. The iterator is
  // Valid() after this call iff the source is not empty.
  virtual void SeekToLast() = 0;

  // Position at the first key in the source that is at or past target.
  // The iterator is Valid() after this call iff the source contains
  // an entry that comes at or past target.
  virtual void Seek(const Slice& target) = 0;

  // Moves to the next entry in the source. After this call, Valid() is
  // true iff the iterator was not positioned at the last entry in the source.
  // REQUIRES: Valid()
  virtual void Next() = 0;

  // Moves to the previous entry in the source. After this call, Valid() is
  // true iff the iterator was not positioned at the first entry in source.
  // REQUIRES: Valid()
  virtual void Prev() = 0;

  // Return the key for the current entry. The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: Valid()
  virtual Slice key() const = 0;

  // Return the value for the current entry. The underlying storage for
  // the returned slice is valid only until the next modification of
  // the iterator.
  // REQUIRES: !AtEnd() && !AtStart()
  virtual Slice value() const = 0;

  // If an error has occurred, return it. Else return an ok status.
  // If non-blocking IO is requested and this operation cannot be
  // satisfied without doing some IO, then this returns Status::Incomplete().
  virtual Status status() const = 0;

  // Clients are allowed to register function/arg1/arg2 triples that
  // will be invoked when this iterator is destroyed.
  //
  // Note that unlike all of the preceding methods, this method is
  // not abstract and therefore clients should not override it.
  typedef void (*CleanupFunction)(void* arg1, void* arg2);
  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);

 private:
  struct Cleanup {
    CleanupFunction function;
    void* arg1;
    void* arg2;
    Cleanup* next;
  };
  Cleanup cleanup_;

  // No copying allowed
  Iterator(const Iterator&);
  void operator=(const Iterator&);
};

// Return an empty iterator (yields nothing).
extern Iterator* NewEmptyIterator();

// Return an empty iterator with the specified status.
extern Iterator* NewErrorIterator(const Status& status);

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
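
Not part of the commit: scanning a half-open key range with the Iterator interface above; db is assumed to be an already opened rocksdb::DB*.

#include <stdio.h>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"

void PrintRange(rocksdb::DB* db, const std::string& start, const std::string& end) {
  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
  for (it->Seek(start); it->Valid() && it->key().ToString() < end; it->Next()) {
    printf("%s -> %s\n",
           it->key().ToString().c_str(), it->value().ToString().c_str());
  }
  // status() distinguishes "ran off the end of the range" from a read error.
  if (!it->status().ok()) {
    printf("iteration error: %s\n", it->status().ToString().c_str());
  }
  delete it;  // Per db.h, delete the iterator before the DB itself.
}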
15
include/rocksdb/ldb_tool.h
Normal file
@ -0,0 +1,15 @@
// Copyright 2008-present Facebook. All Rights Reserved.
#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
#include "rocksdb/options.h"

namespace rocksdb {

class LDBTool {
 public:
  void Run(int argc, char** argv, Options = Options());
};

} // namespace rocksdb

#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
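
Not part of the commit: the smallest possible driver around the LDBTool entry point declared above, the kind of main() an ldb-style binary would use.

#include "rocksdb/ldb_tool.h"

int main(int argc, char** argv) {
  rocksdb::LDBTool tool;
  // The optional third argument supplies non-default Options; the
  // default-constructed Options declared in the header is used here.
  tool.Run(argc, argv);
  return 0;
}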
Some files were not shown because too many files have changed in this diff.