Merge branch 'master' of github.com:facebook/rocksdb into gh-pages

This commit is contained in:
James Pearce 2013-11-15 11:34:59 -08:00
commit 3a0a8a02be
353 changed files with 157724 additions and 0 deletions

10
.arcconfig Normal file
View File

@ -0,0 +1,10 @@
{
"project_id" : "leveldb",
"conduit_uri" : "https://reviews.facebook.net/",
"copyright_holder" : "",
"load" : [
"linters/src/"
],
"lint.engine" : "FacebookFbcodeLintEngine",
"lint.engine.single.linter" : "FbcodeCppLinter"
}

23
.gitignore vendored Normal file
View File

@ -0,0 +1,23 @@
build_config.mk
*.a
*.arc
*.d
*.dylib*
*.gcda
*.gcno
*.o
*.so
*.so.*
*_test
*_bench
*_stress
ldb
manifest_dump
sst_dump
util/build_version.cc
build_tools/VALGRIND_LOGS/
coverage/COVERAGE_REPORT
util/build_version.cc.tmp
.gdbhistory

35
LICENSE Normal file
View File

@ -0,0 +1,35 @@
BSD License
For rocksdb software
Copyright (c) 2013, Facebook, Inc.
All rights reserved.
---------------------------------------------------------------------
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

380
Makefile Normal file
View File

@ -0,0 +1,380 @@
# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. See the AUTHORS file for names of contributors.
# Inherit some settings from environment variables, if available
INSTALL_PATH ?= $(CURDIR)
#-----------------------------------------------
# Uncomment exactly one of the lines labelled (A), (B), and (C) below
# to switch between compilation modes.
# OPT ?= -DNDEBUG # (A) Production use (optimized mode)
OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
#-----------------------------------------------
# detect what platform we're building on
$(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platform $(CURDIR)/build_config.mk))
# this file is generated by the previous line to set build flags and sources
include build_config.mk
WARNING_FLAGS = -Wall -Werror
CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -std=gnu++0x -Woverloaded-virtual
LDFLAGS += $(PLATFORM_LDFLAGS)
LIBOBJECTS = $(SOURCES:.cc=.o)
LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
TESTUTIL = ./util/testutil.o
TESTHARNESS = ./util/testharness.o $(TESTUTIL)
VALGRIND_ERROR = 2
VALGRIND_DIR = build_tools/VALGRIND_LOGS
VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
TESTS = \
table_stats_collector_test \
arena_test \
auto_roll_logger_test \
block_test \
bloom_test \
c_test \
cache_test \
coding_test \
corruption_test \
crc32c_test \
db_test \
dbformat_test \
env_test \
blob_store_test \
filelock_test \
filename_test \
filter_block_test \
histogram_test \
log_test \
manual_compaction_test \
memenv_test \
merge_test \
redis_test \
reduce_levels_test \
simple_table_db_test \
skiplist_test \
stringappend_test \
table_test \
ttl_test \
version_edit_test \
version_set_test \
write_batch_test\
deletefile_test
TOOLS = \
sst_dump \
db_stress \
ldb \
db_repl_stress \
blob_store_bench
PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS)
BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench
# The library name is configurable since we are maintaining libraries of both
# debug/release mode.
LIBNAME = librocksdb
LIBRARY = ${LIBNAME}.a
MEMENVLIBRARY = libmemenv.a
default: all
#-----------------------------------------------
# Create platform independent shared libraries.
#-----------------------------------------------
ifneq ($(PLATFORM_SHARED_EXT),)
ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1)
SHARED3 = $(SHARED1)
SHARED = $(SHARED1)
else
# Update db.h if you change these.
SHARED_MAJOR = 2
SHARED_MINOR = 0
SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
SHARED2 = $(SHARED1).$(SHARED_MAJOR)
SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
$(SHARED1): $(SHARED3)
ln -fs $(SHARED3) $(SHARED1)
$(SHARED2): $(SHARED3)
ln -fs $(SHARED3) $(SHARED2)
endif
$(SHARED3):
$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3)
endif # PLATFORM_SHARED_EXT
all: $(LIBRARY) $(PROGRAMS)
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
release tags valgrind_check whitebox_crash_test
release:
$(MAKE) clean
OPT=-DNDEBUG $(MAKE) -j32
coverage:
$(MAKE) clean
COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check
(cd coverage; ./coverage_test.sh)
# Delete intermediate files
find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
check: all $(PROGRAMS) $(TESTS) $(TOOLS) ldb_tests
for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
ldb_tests: all $(PROGRAMS) $(TOOLS)
python tools/ldb_test.py
crash_test: blackbox_crash_test whitebox_crash_test
blackbox_crash_test: db_stress
python -u tools/db_crashtest.py
whitebox_crash_test: db_stress
python -u tools/db_crashtest2.py
valgrind_check: all $(PROGRAMS) $(TESTS)
mkdir -p $(VALGRIND_DIR)
echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \
echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
for t in $(filter-out skiplist_test,$(TESTS)); do \
stime=`date '+%s'`; \
$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
fi; \
etime=`date '+%s'`; \
echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
done
clean:
-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
-rm -rf ios-x86/* ios-arm/*
-find . -name "*.[od]" -exec rm {} \;
-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
tags:
ctags * -R
cscope -b `find . -name '*.cc'` `find . -name '*.h'`
# ---------------------------------------------------------------------------
# Unit tests and tools
# ---------------------------------------------------------------------------
$(LIBRARY): $(LIBOBJECTS)
rm -f $@
$(AR) -rs $@ $(LIBOBJECTS)
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lsqlite3 $(COVERAGEFLAGS)
db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lkyotocabinet $(COVERAGEFLAGS)
signal_test: util/signal_test.o $(LIBOBJECTS)
$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_stats_collector_test: db/table_stats_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/table_stats_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
rm -f $@
$(AR) -rs $@ $(MEMENVOBJECTS)
memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
$(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
$(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
sst_dump: tools/sst_dump.o $(LIBOBJECTS)
$(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
ldb: tools/ldb.o $(LIBOBJECTS)
$(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
# ---------------------------------------------------------------------------
# Platform-specific compilation
# ---------------------------------------------------------------------------
ifeq ($(PLATFORM), IOS)
# For iOS, create universal object files to be used on both the simulator and
# a device.
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString)
.cc.o:
mkdir -p ios-x86/$(dir $@)
$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS)
mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS)
lipo ios-x86/$@ ios-arm/$@ -create -output $@
.c.o:
mkdir -p ios-x86/$(dir $@)
$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@
else
.cc.o:
$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
.c.o:
$(CC) $(CFLAGS) -c $< -o $@
endif
# ---------------------------------------------------------------------------
# Source files dependencies detection
# ---------------------------------------------------------------------------
# Add proper dependency support so changing a .h file forces a .cc file to
# rebuild.
# The .d file indicates .cc file's dependencies on .h files. We generate such
# dependency by g++'s -MM option, whose output is a make dependency rule.
# The sed command makes sure the "target" file in the generated .d file has
# the correct path prefix.
%.d: %.cc
$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
@sed -i -e 's|.*:|$*.o:|' $@
DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
depend: $(DEPFILES)
ifneq ($(MAKECMDGOALS),clean)
-include $(DEPFILES)
endif

23
PATENTS Normal file
View File

@ -0,0 +1,23 @@
Additional Grant of Patent Rights
“Software” means the rocksdb software distributed by Facebook, Inc.
Facebook hereby grants you a perpetual, worldwide, royalty-free,
non-exclusive, irrevocable (subject to the termination provision below)
license under any rights in any patent claims owned by Facebook, to make,
have made, use, sell, offer to sell, import, and otherwise transfer the
Software. For avoidance of doubt, no license is granted under Facebooks
rights in any patent claims that are infringed by (i) modifications to the
Software made by you or a third party, or (ii) the Software in combination
with any software or other technology provided by you or a third party.
The license granted hereunder will terminate, automatically and without
notice, for anyone that makes any claim (including by filing any lawsuit,
assertion or other action) alleging (a) direct, indirect, or contributory
infringement or inducement to infringe any patent: (i) by Facebook or any
of its subsidiaries or affiliates, whether or not such claim is related
to the Software, (ii) by any party if such claim arises in whole or in
part from any software, product or service of Facebook or any of its
subsidiaries or affiliates, whether or not such claim is related to the
Software, or (iii) by any party relating to the Software; or (b) that
any right in any patent claim of Facebook is invalid or unenforceable.

82
README Normal file
View File

@ -0,0 +1,82 @@
rocksdb: A persistent key-value store for flash storage
Authors: * The Facebook Database Engineering Team
* Build on earlier work on leveldb by Sanjay Ghemawat
(sanjay@google.com) and Jeff Dean (jeff@google.com)
This code is a library that forms the core building block for a fast
key value server, especially suited for storing data on flash drives.
It has an Log-Stuctured-Merge-Database (LSM) design with flexible tradeoffs
between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF)
and Space-Amplification-Factor(SAF). It has multi-threaded compactions,
making it specially suitable for storing multiple terabytes of data in a
single database.
The core of this code has been derived from open-source leveldb.
The code under this directory implements a system for maintaining a
persistent key/value store.
See doc/index.html for more explanation.
See doc/impl.html for a brief overview of the implementation.
The public interface is in include/*. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.
Guide to header files:
include/rocksdb/db.h
Main interface to the DB: Start here
include/rocksdb/options.h
Control over the behavior of an entire database, and also
control over the behavior of individual reads and writes.
include/rocksdb/comparator.h
Abstraction for user-specified comparison function. If you want
just bytewise comparison of keys, you can use the default comparator,
but clients can write their own comparator implementations if they
want custom ordering (e.g. to handle different character
encodings, etc.)
include/rocksdb/iterator.h
Interface for iterating over data. You can get an iterator
from a DB object.
include/rocksdb/write_batch.h
Interface for atomically applying multiple updates to a database.
include/rocksdb/slice.h
A simple module for maintaining a pointer and a length into some
other byte array.
include/rocksdb/status.h
Status is returned from many of the public interfaces and is used
to report success and various kinds of errors.
include/rocksdb/env.h
Abstraction of the OS environment. A posix implementation of
this interface is in util/env_posix.cc
include/rocksdb/table_builder.h
Lower-level modules that most clients probably won't use directly
include/rocksdb/cache.h
An API for the block cache.
include/rocksdb/compaction_filter.h
An API for a application filter invoked on every compaction.
include/rocksdb/filter_policy.h
An API for configuring a bloom filter.
include/rocksdb/memtablerep.h
An API for implementing a memtable.
include/rocksdb/statistics.h
An API to retrieve various database statistics.
include/rocksdb/transaction_log.h
An API to retrieve transaction logs from a database.

3
README.fb Normal file
View File

@ -0,0 +1,3 @@
* Detailed instructions on how to compile using fbcode and jemalloc
* Latest release is 2.5.fb

252
build_tools/build_detect_platform Executable file
View File

@ -0,0 +1,252 @@
#!/bin/sh
#
# Detects OS we're compiling on and outputs a file specified by the first
# argument, which in turn gets read while processing Makefile.
#
# The output will set the following variables:
# CC C Compiler path
# CXX C++ Compiler path
# PLATFORM_LDFLAGS Linker flags
# PLATFORM_SHARED_EXT Extension for shared libraries
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
# PLATFORM_CCFLAGS C compiler flags
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned
# shared libraries, empty otherwise.
#
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
#
# -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
# -DLEVELDB_PLATFORM_NOATOMIC if it is not
# -DSNAPPY if the Snappy library is present
#
OUTPUT=$1
if test -z "$OUTPUT"; then
echo "usage: $0 <output-filename>" >&2
exit 1
fi
# Default to fbcode gcc on internal fb machines
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
if [ -z "$USE_CLANG" ]; then
source $PWD/build_tools/fbcode.gcc471.sh
else
source $PWD/build_tools/fbcode.clang31.sh
fi
fi
# Delete existing output, if it exists
rm -f $OUTPUT
touch $OUTPUT
if test -z "$CC"; then
CC=cc
fi
if test -z "$CXX"; then
CXX=g++
fi
# Detect OS
if test -z "$TARGET_OS"; then
TARGET_OS=`uname -s`
fi
COMMON_FLAGS="${CFLAGS}"
CROSS_COMPILE=
PLATFORM_CCFLAGS=
PLATFORM_CXXFLAGS="${CXXFLAGS}"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true
# generic port files (working on all platform by #ifdef) go directly in /port
GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "`
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
case "$TARGET_OS" in
Darwin)
PLATFORM=OS_MACOSX
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
# PORT_FILES=port/darwin/darwin_specific.cc
;;
Linux)
PLATFORM=OS_LINUX
COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
if [ -z "$USE_CLANG" ]; then
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
fi
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
# PORT_FILES=port/linux/linux_specific.cc
;;
SunOS)
PLATFORM=OS_SOLARIS
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
# PORT_FILES=port/sunos/sunos_specific.cc
;;
FreeBSD)
PLATFORM=OS_FREEBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
# PORT_FILES=port/freebsd/freebsd_specific.cc
;;
NetBSD)
PLATFORM=OS_NETBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
# PORT_FILES=port/netbsd/netbsd_specific.cc
;;
OpenBSD)
PLATFORM=OS_OPENBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
# PORT_FILES=port/openbsd/openbsd_specific.cc
;;
DragonFly)
PLATFORM=OS_DRAGONFLYBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
# PORT_FILES=port/dragonfly/dragonfly_specific.cc
;;
OS_ANDROID_CROSSCOMPILE)
PLATFORM=OS_ANDROID
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library
# PORT_FILES=port/android/android.cc
CROSS_COMPILE=true
;;
*)
echo "Unknown platform!" >&2
exit 1
esac
$PWD/build_tools/build_detect_version
# We want to make a list of all cc files within util, db, table, and helpers
# except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the
# prune take effect.
DIRS="util db table utilities"
set -f # temporarily disable globbing so that our patterns arent expanded
PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *_bench.cc -prune"
PORTABLE_FILES=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
PORTABLE_CPP=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
set +f # re-enable globbing
# The sources consist of the portable files, plus the platform-specific port
# file.
echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> $OUTPUT
echo "SOURCESCPP=$PORTABLE_CPP" >> $OUTPUT
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
if [ "$CROSS_COMPILE" = "true" ]; then
# Cross-compiling; do not try any compilation tests.
true
else
# If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
$CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <cstdatomic>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
PLATFORM_CXXFLAGS="-std=c++0x"
else
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
fi
# Test whether Snappy library is installed
# http://code.google.com/p/snappy/
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <snappy.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS ${SNAPPY_LDFLAGS:-./snappy/libs/libsnappy.a}"
fi
# Test whether gflags library is installed
# http://code.google.com/p/gflags/
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <gflags/gflags.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS"
fi
# Test whether zlib library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <zlib.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
fi
# Test whether bzip library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <bzlib.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
fi
# Test whether tcmalloc is available
$CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
fi
fi
# shall we use HDFS?
if test "$USE_HDFS"; then
if test -z "$JAVA_HOME"; then
echo "JAVA_HOME has to be set for HDFS usage."
exit 1
fi
HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64"
HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
fi
# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
VALGRIND_VER="$VALGRIND_VER"
echo "CC=$CC" >> $OUTPUT
echo "CXX=$CXX" >> $OUTPUT
echo "PLATFORM=$PLATFORM" >> $OUTPUT
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
echo "VALGRIND_VER=$VALGRIND_VER" >> $OUTPUT
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> $OUTPUT

View File

@ -0,0 +1,42 @@
#!/bin/sh
#
# Record the version of the source that we are compiling.
# We keep a record of the git revision in util/version.cc. This source file
# is then built as a regular source file as part of the compilation process.
# One can run "strings executable_filename | grep _build_" to find the version of
# the source that we used to build the executable file.
#
# create git version file
VFILE=$ROCKSDB_ROOT/util/build_version.cc.tmp
trap "rm $VFILE" EXIT
# check to see if git is in the path
which git > /dev/null
if [ "$?" = 0 ]; then
env -i git rev-parse HEAD |
awk '
BEGIN {
print "#include \"build_version.h\"\n"
}
{ print "const char* rocksdb_build_git_sha = \"rocksdb_build_git_sha:" $0"\";" }
' > ${VFILE}
else
echo "git not found" |
awk '
BEGIN {
print "#include \"build_version.h\""
}
{ print "const char* rocksdb_build_git_sha = \"rocksdb_build_git_sha:git not found\";" }
' > ${VFILE}
fi
echo "const char* rocksdb_build_git_datetime = \"rocksdb_build_git_datetime:$(date)\";" >> ${VFILE}
echo "const char* rocksdb_build_compile_date = __DATE__;" >> ${VFILE}
echo "const char* rocksdb_build_compile_time = __TIME__;" >> ${VFILE}
OUTFILE=$ROCKSDB_ROOT/util/build_version.cc
if [ ! -e $OUTFILE ] || ! cmp -s $VFILE $OUTFILE; then
cp $VFILE $OUTFILE
fi

View File

@ -0,0 +1,49 @@
#!/bin/sh
#
# Set environment variables so that we can compile leveldb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc
TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
# location of libevent
LIBEVENT_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/include"
LIBEVENT_LIBS=" -L $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/lib"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.1/6ca8a59/bin/clang $CLANG_INCLUDES"
CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.1/6ca8a59/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $LIBEVENT_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.1/c8f7279/lib/clang/3.1/include -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
CXXFLAGS="$CFLAGS -nostdinc++ -std=gnu++0x"
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $LIBEVENT_LIBS $GFLAGS_LIBS"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
PLATFORM_LDFLAGS="-L$TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/lib -L$TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
SNAPPY_LDFLAGS="$SNAPPY_LIBS"
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED SNAPPY_LDFLAGS

View File

@ -0,0 +1,60 @@
#!/bin/sh
#
# Set environment variables so that we can compile leveldb using
# fbcode settings. It uses the latest g++ compiler and also
# uses jemalloc
TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
# location of libhdfs libraries
if test "$USE_HDFS"; then
JAVA_HOME="/usr/local/jdk-6u22-64"
JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
HDFSLIB+=" -ldl -lverify -ljava -ljvm "
fi
# location of snappy headers and libraries
SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
# location of libevent
LIBEVENT_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/include"
LIBEVENT_LIBS=" -L $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/lib"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
# use Intel SSE support for checksum calculations
export USE_SSE=" -msse -msse4.2 "
CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $LIBEVENT_INCLUDE $GFLAGS_INCLUDE"
AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/bin/gold -m64 -mtune=generic -fPIC"
CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $LIBEVENT_LIBS $GFLAGS_LIBS"
PLATFORM_LDFLAGS="-L$TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/lib -L$TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
SNAPPY_LDFLAGS="$SNAPPY_LIBS $ZLIB_LIBS"
VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED SNAPPY_LDFLAGS VALGRIND_VER

View File

@ -0,0 +1,100 @@
#!/bin/bash
set -e
NUM=10000000
if [ $# -eq 1 ];then
DATA_DIR=$1
elif [ $# -eq 2 ];then
DATA_DIR=$1
STAT_FILE=$2
fi
# On the production build servers, set data and stat
# files/directories not in /tmp or else the tempdir cleaning
# scripts will make you very unhappy.
DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
function cleanup {
rm -rf $DATA_DIR
rm -f $STAT_FILE.fillseq
rm -f $STAT_FILE.readrandom
rm -f $STAT_FILE.overwrite
}
trap cleanup EXIT
function send_to_ods {
key="$1"
value="$2"
if [ -z "$value" ];then
echo >&2 "ERROR: Key $key doesn't have a value."
return
fi
curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
--connect-timeout 60
}
make clean
make db_bench -j$(nproc)
./db_bench \
--benchmarks=fillseq \
--db=$DATA_DIR \
--use_existing_db=0 \
--bloom_bits=10 \
--num=$NUM \
--writes=$NUM \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 > ${STAT_FILE}.fillseq
./db_bench \
--benchmarks=overwrite \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--writes=$((NUM / 2)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=8 > ${STAT_FILE}.overwrite
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 100)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=128 > ${STAT_FILE}.readrandom
OVERWRITE_OPS=$(awk '/overwrite/ {print $5}' $STAT_FILE.overwrite)
FILLSEQ_OPS=$(awk '/fillseq/ {print $5}' $STAT_FILE.fillseq)
READRANDOM_OPS=$(awk '/readrandom/ {print $5}' $STAT_FILE.readrandom)
send_to_ods rocksdb.build.overwrite.qps $OVERWRITE_OPS
send_to_ods rocksdb.build.fillseq.qps $FILLSEQ_OPS
send_to_ods rocksdb.build.readrandom.qps $READRANDOM_OPS

15
build_tools/valgrind_test.sh Executable file
View File

@ -0,0 +1,15 @@
#!/bin/bash
#A shell script for Jenknis to run valgrind on rocksdb tests
#Returns 0 on success when there are no failed tests
VALGRIND_DIR=build_tools/VALGRIND_LOGS
make clean
make -j$(nproc) valgrind_check
NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1))
if [ $NUM_FAILED_TESTS -lt 1 ]; then
echo No tests have valgrind errors
exit 0
else
cat $VALGRIND_DIR/valgrind_failed_tests
exit 1
fi

73
coverage/coverage_test.sh Executable file
View File

@ -0,0 +1,73 @@
#!/bin/bash
# Exit on error.
set -e
if [ -n "$USE_CLANG" ]; then
echo "Error: Coverage test is supported only for gcc."
exit 1
fi
ROOT=".."
# Fetch right version of gcov
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
source $ROOT/build_tools/fbcode.gcc471.sh
GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
else
GCOV=$(which gcov)
fi
COVERAGE_DIR="$PWD/COVERAGE_REPORT"
mkdir -p $COVERAGE_DIR
# Find all gcno files to generate the coverage report
GCNO_FILES=`find $ROOT -name "*.gcno"`
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
# Parse the raw gcov report to more human readable form.
python $ROOT/coverage/parse_gcov_output.py |
# Write the output to both stdout and report file.
tee $COVERAGE_DIR/coverage_report_all.txt &&
echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
# TODO: we also need to get the files of the latest commits.
# Get the most recently committed files.
LATEST_FILES=`
git show --pretty="format:" --name-only HEAD |
grep -v "^$" |
paste -s -d,`
RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
tee -a $RECENT_REPORT &&
echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
# Generate the html report. If we cannot find lcov in this machine, we'll simply
# skip this step.
echo "Generating the html coverage report..."
LCOV=$(which lcov || true 2>/dev/null)
if [ -z $LCOV ]
then
echo "Skip: Cannot find lcov to generate the html report."
exit 0
fi
LCOV_VERSION=$(lcov -v | grep 1.1 || true)
if [ $LCOV_VERSION ]
then
echo "Not supported lcov version. Expect lcov 1.1."
exit 0
fi
(cd $ROOT; lcov --no-external \
--capture \
--directory $PWD \
--gcov-tool $GCOV \
--output-file $COVERAGE_DIR/coverage.info)
genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
echo "HTML Coverage report is generated in $COVERAGE_DIR"

View File

@ -0,0 +1,118 @@
import optparse
import re
import sys
from optparse import OptionParser
# the gcov report follows certain pattern. Each file will have two lines
# of report, from which we can extract the file name, total lines and coverage
# percentage.
def parse_gcov_report(gcov_input):
per_file_coverage = {}
total_coverage = None
for line in sys.stdin:
line = line.strip()
# --First line of the coverage report (with file name in it)?
match_obj = re.match("^File '(.*)'$", line)
if match_obj:
# fetch the file name from the first line of the report.
current_file = match_obj.group(1)
continue
# -- Second line of the file report (with coverage percentage)
match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
if match_obj:
coverage = float(match_obj.group(1))
lines = int(match_obj.group(2))
if current_file is not None:
per_file_coverage[current_file] = (coverage, lines)
current_file = None
else:
# If current_file is not set, we reach the last line of report,
# which contains the summarized coverage percentage.
total_coverage = (coverage, lines)
continue
# If the line's pattern doesn't fall into the above categories. We
# can simply ignore them since they're either empty line or doesn't
# find executable lines of the given file.
current_file = None
return per_file_coverage, total_coverage
def get_option_parser():
usage = "Parse the gcov output and generate more human-readable code " +\
"coverage report."
parser = OptionParser(usage)
parser.add_option(
"--interested-files", "-i",
dest="filenames",
help="Comma separated files names. if specified, we will display " +
"the coverage report only for interested source files. " +
"Otherwise we will display the coverage report for all " +
"source files."
)
return parser
def display_file_coverage(per_file_coverage, total_coverage):
# To print out auto-adjustable column, we need to know the longest
# length of file names.
max_file_name_length = max(
len(fname) for fname in per_file_coverage.keys()
)
# -- Print header
# size of separator is determined by 3 column sizes:
# file name, coverage percentage and lines.
header_template = \
"%" + str(max_file_name_length) + "s\t%s\t%s"
separator = "-" * (max_file_name_length + 10 + 20)
print header_template % ("Filename", "Coverage", "Lines")
print separator
# -- Print body
# template for printing coverage report for each file.
record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
for fname, coverage_info in per_file_coverage.items():
coverage, lines = coverage_info
print record_template % (fname, coverage, lines)
# -- Print footer
if total_coverage:
print separator
print record_template % ("Total", total_coverage[0], total_coverage[1])
def report_coverage():
parser = get_option_parser()
(options, args) = parser.parse_args()
interested_files = set()
if options.filenames is not None:
interested_files = set(f.strip() for f in options.filenames.split(','))
# To make things simple, right now we only read gcov report from the input
per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
# Check if we need to display coverage info for interested files.
if len(interested_files):
per_file_coverage = dict(
(fname, per_file_coverage[fname]) for fname in interested_files
if fname in per_file_coverage
)
# If we only interested in several files, it makes no sense to report
# the total_coverage
total_coverage = None
if not len(per_file_coverage):
print >> sys.stderr, "Cannot find coverage info for the given files."
return
display_file_coverage(per_file_coverage, total_coverage)
if __name__ == "__main__":
report_coverage()

226
db/builder.cc Normal file
View File

@ -0,0 +1,226 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/builder.h"
#include "db/filename.h"
#include "db/dbformat.h"
#include "db/merge_helper.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "rocksdb/db.h"
#include "rocksdb/table.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "table/block_based_table_builder.h"
#include "util/stop_watch.h"
namespace rocksdb {
class TableFactory;
TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
CompressionType compression_type) {
return options.table_factory->GetTableBuilder(options, file,
compression_type);
}
Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
const EnvOptions& soptions,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta,
const Comparator* user_comparator,
const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable,
const bool enable_compression) {
Status s;
meta->file_size = 0;
meta->smallest_seqno = meta->largest_seqno = 0;
iter->SeekToFirst();
// If the sequence number of the smallest entry in the memtable is
// smaller than the most recent snapshot, then we do not trigger
// removal of duplicate/deleted keys as part of this builder.
bool purge = options.purge_redundant_kvs_while_flush;
if (earliest_seqno_in_memtable <= newest_snapshot) {
purge = false;
}
std::string fname = TableFileName(dbname, meta->number);
if (iter->Valid()) {
unique_ptr<WritableFile> file;
s = env->NewWritableFile(fname, &file, soptions);
if (!s.ok()) {
return s;
}
TableBuilder* builder = GetTableBuilder(options, file.get(),
options.compression);
// the first key is the smallest key
Slice key = iter->key();
meta->smallest.DecodeFrom(key);
meta->smallest_seqno = GetInternalKeySeqno(key);
meta->largest_seqno = meta->smallest_seqno;
MergeHelper merge(user_comparator, options.merge_operator.get(),
options.info_log.get(),
true /* internal key corruption is not ok */);
if (purge) {
// Ugly walkaround to avoid compiler error for release build
bool ok __attribute__((unused)) = true;
// Will write to builder if current key != prev key
ParsedInternalKey prev_ikey;
std::string prev_key;
bool is_first_key = true; // Also write if this is the very first key
while (iter->Valid()) {
bool iterator_at_next = false;
// Get current key
ParsedInternalKey this_ikey;
Slice key = iter->key();
Slice value = iter->value();
// In-memory key corruption is not ok;
// TODO: find a clean way to treat in memory key corruption
ok = ParseInternalKey(key, &this_ikey);
assert(ok);
assert(this_ikey.sequence >= earliest_seqno_in_memtable);
// If the key is the same as the previous key (and it is not the
// first key), then we skip it, since it is an older version.
// Otherwise we output the key and mark it as the "new" previous key.
if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key,
this_ikey.user_key)) {
// seqno within the same key are in decreasing order
assert(this_ikey.sequence < prev_ikey.sequence);
} else {
is_first_key = false;
if (this_ikey.type == kTypeMerge) {
// Handle merge-type keys using the MergeHelper
merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
iterator_at_next = true;
if (merge.IsSuccess()) {
// Merge completed correctly.
// Add the resulting merge key/value and continue to next
builder->Add(merge.key(), merge.value());
prev_key.assign(merge.key().data(), merge.key().size());
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
assert(ok);
} else {
// Merge did not find a Put/Delete.
// Can not compact these merges into a kValueType.
// Write them out one-by-one. (Proceed back() to front())
const std::deque<std::string>& keys = merge.keys();
const std::deque<std::string>& values = merge.values();
assert(keys.size() == values.size() && keys.size() >= 1);
std::deque<std::string>::const_reverse_iterator key_iter;
std::deque<std::string>::const_reverse_iterator value_iter;
for (key_iter=keys.rbegin(), value_iter = values.rbegin();
key_iter != keys.rend() && value_iter != values.rend();
++key_iter, ++value_iter) {
builder->Add(Slice(*key_iter), Slice(*value_iter));
}
// Sanity check. Both iterators should end at the same time
assert(key_iter == keys.rend() && value_iter == values.rend());
prev_key.assign(keys.front());
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
assert(ok);
}
} else {
// Handle Put/Delete-type keys by simply writing them
builder->Add(key, value);
prev_key.assign(key.data(), key.size());
ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
assert(ok);
}
}
if (!iterator_at_next) iter->Next();
}
// The last key is the largest key
meta->largest.DecodeFrom(Slice(prev_key));
SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
meta->largest_seqno = std::max(meta->largest_seqno, seqno);
} else {
for (; iter->Valid(); iter->Next()) {
Slice key = iter->key();
meta->largest.DecodeFrom(key);
builder->Add(key, iter->value());
SequenceNumber seqno = GetInternalKeySeqno(key);
meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
meta->largest_seqno = std::max(meta->largest_seqno, seqno);
}
}
// Finish and check for builder errors
if (s.ok()) {
s = builder->Finish();
if (s.ok()) {
meta->file_size = builder->FileSize();
assert(meta->file_size > 0);
}
} else {
builder->Abandon();
}
delete builder;
// Finish and check for file errors
if (s.ok() && !options.disableDataSync) {
if (options.use_fsync) {
StopWatch sw(env, options.statistics, TABLE_SYNC_MICROS);
s = file->Fsync();
} else {
StopWatch sw(env, options.statistics, TABLE_SYNC_MICROS);
s = file->Sync();
}
}
if (s.ok()) {
s = file->Close();
}
if (s.ok()) {
// Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(),
soptions,
meta->number,
meta->file_size);
s = it->status();
delete it;
}
}
// Check for input iterator errors
if (!iter->status().ok()) {
s = iter->status();
}
if (s.ok() && meta->file_size > 0) {
// Keep it
} else {
env->DeleteFile(fname);
}
return s;
}
} // namespace rocksdb

48
db/builder.h Normal file
View File

@ -0,0 +1,48 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/comparator.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/options.h"
namespace rocksdb {
struct Options;
struct FileMetaData;
class Env;
struct EnvOptions;
class Iterator;
class TableCache;
class VersionEdit;
class TableBuilder;
class WritableFile;
extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
CompressionType compression_type);
// Build a Table file from the contents of *iter. The generated file
// will be named according to meta->number. On success, the rest of
// *meta will be filled with metadata about the generated table.
// If no data is present in *iter, meta->file_size will be set to
// zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
const EnvOptions& soptions,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta,
const Comparator* user_comparator,
const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable,
const bool enable_compression);
} // namespace rocksdb

691
db/c.cc Normal file
View File

@ -0,0 +1,691 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/c.h"
#include <stdlib.h>
#include <unistd.h>
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "rocksdb/write_batch.h"
using rocksdb::Cache;
using rocksdb::Comparator;
using rocksdb::CompressionType;
using rocksdb::DB;
using rocksdb::Env;
using rocksdb::FileLock;
using rocksdb::FilterPolicy;
using rocksdb::Iterator;
using rocksdb::Logger;
using rocksdb::NewBloomFilterPolicy;
using rocksdb::NewLRUCache;
using rocksdb::Options;
using rocksdb::RandomAccessFile;
using rocksdb::Range;
using rocksdb::ReadOptions;
using rocksdb::SequentialFile;
using rocksdb::Slice;
using rocksdb::Snapshot;
using rocksdb::Status;
using rocksdb::WritableFile;
using rocksdb::WriteBatch;
using rocksdb::WriteOptions;
using std::shared_ptr;
extern "C" {
struct leveldb_t { DB* rep; };
struct leveldb_iterator_t { Iterator* rep; };
struct leveldb_writebatch_t { WriteBatch rep; };
struct leveldb_snapshot_t { const Snapshot* rep; };
struct leveldb_readoptions_t { ReadOptions rep; };
struct leveldb_writeoptions_t { WriteOptions rep; };
struct leveldb_options_t { Options rep; };
struct leveldb_seqfile_t { SequentialFile* rep; };
struct leveldb_randomfile_t { RandomAccessFile* rep; };
struct leveldb_writablefile_t { WritableFile* rep; };
struct leveldb_filelock_t { FileLock* rep; };
struct leveldb_logger_t { shared_ptr<Logger> rep; };
struct leveldb_cache_t { shared_ptr<Cache> rep; };
struct leveldb_comparator_t : public Comparator {
void* state_;
void (*destructor_)(void*);
int (*compare_)(
void*,
const char* a, size_t alen,
const char* b, size_t blen);
const char* (*name_)(void*);
virtual ~leveldb_comparator_t() {
(*destructor_)(state_);
}
virtual int Compare(const Slice& a, const Slice& b) const {
return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
}
virtual const char* Name() const {
return (*name_)(state_);
}
// No-ops since the C binding does not support key shortening methods.
virtual void FindShortestSeparator(std::string*, const Slice&) const { }
virtual void FindShortSuccessor(std::string* key) const { }
};
struct leveldb_filterpolicy_t : public FilterPolicy {
void* state_;
void (*destructor_)(void*);
const char* (*name_)(void*);
char* (*create_)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length);
unsigned char (*key_match_)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length);
virtual ~leveldb_filterpolicy_t() {
(*destructor_)(state_);
}
virtual const char* Name() const {
return (*name_)(state_);
}
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
std::vector<const char*> key_pointers(n);
std::vector<size_t> key_sizes(n);
for (int i = 0; i < n; i++) {
key_pointers[i] = keys[i].data();
key_sizes[i] = keys[i].size();
}
size_t len;
char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
dst->append(filter, len);
free(filter);
}
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
return (*key_match_)(state_, key.data(), key.size(),
filter.data(), filter.size());
}
};
struct leveldb_env_t {
Env* rep;
bool is_default;
};
static bool SaveError(char** errptr, const Status& s) {
assert(errptr != NULL);
if (s.ok()) {
return false;
} else if (*errptr == NULL) {
*errptr = strdup(s.ToString().c_str());
} else {
// TODO(sanjay): Merge with existing error?
free(*errptr);
*errptr = strdup(s.ToString().c_str());
}
return true;
}
static char* CopyString(const std::string& str) {
char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
memcpy(result, str.data(), sizeof(char) * str.size());
return result;
}
leveldb_t* leveldb_open(
const leveldb_options_t* options,
const char* name,
char** errptr) {
DB* db;
if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
return NULL;
}
leveldb_t* result = new leveldb_t;
result->rep = db;
return result;
}
void leveldb_close(leveldb_t* db) {
delete db->rep;
delete db;
}
void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr) {
SaveError(errptr,
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
}
void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr) {
SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
}
void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
char** errptr) {
SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
}
char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = NULL;
std::string tmp;
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options) {
leveldb_iterator_t* result = new leveldb_iterator_t;
result->rep = db->rep->NewIterator(options->rep);
return result;
}
const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db) {
leveldb_snapshot_t* result = new leveldb_snapshot_t;
result->rep = db->rep->GetSnapshot();
return result;
}
void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot) {
db->rep->ReleaseSnapshot(snapshot->rep);
delete snapshot;
}
char* leveldb_property_value(
leveldb_t* db,
const char* propname) {
std::string tmp;
if (db->rep->GetProperty(Slice(propname), &tmp)) {
// We use strdup() since we expect human readable output.
return strdup(tmp.c_str());
} else {
return NULL;
}
}
void leveldb_approximate_sizes(
leveldb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes) {
Range* ranges = new Range[num_ranges];
for (int i = 0; i < num_ranges; i++) {
ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
}
db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
delete[] ranges;
}
void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
db->rep->CompactRange(
// Pass NULL Slice if corresponding "const char*" is NULL
(start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
}
void leveldb_destroy_db(
const leveldb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, DestroyDB(name, options->rep));
}
void leveldb_repair_db(
const leveldb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, RepairDB(name, options->rep));
}
void leveldb_iter_destroy(leveldb_iterator_t* iter) {
delete iter->rep;
delete iter;
}
unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) {
return iter->rep->Valid();
}
void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
iter->rep->SeekToFirst();
}
void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
iter->rep->SeekToLast();
}
void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
iter->rep->Seek(Slice(k, klen));
}
void leveldb_iter_next(leveldb_iterator_t* iter) {
iter->rep->Next();
}
void leveldb_iter_prev(leveldb_iterator_t* iter) {
iter->rep->Prev();
}
const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
Slice s = iter->rep->key();
*klen = s.size();
return s.data();
}
const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
Slice s = iter->rep->value();
*vlen = s.size();
return s.data();
}
void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
SaveError(errptr, iter->rep->status());
}
leveldb_writebatch_t* leveldb_writebatch_create() {
return new leveldb_writebatch_t;
}
void leveldb_writebatch_destroy(leveldb_writebatch_t* b) {
delete b;
}
void leveldb_writebatch_clear(leveldb_writebatch_t* b) {
b->rep.Clear();
}
void leveldb_writebatch_put(
leveldb_writebatch_t* b,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep.Put(Slice(key, klen), Slice(val, vlen));
}
void leveldb_writebatch_delete(
leveldb_writebatch_t* b,
const char* key, size_t klen) {
b->rep.Delete(Slice(key, klen));
}
void leveldb_writebatch_iterate(
leveldb_writebatch_t* b,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen)) {
class H : public WriteBatch::Handler {
public:
void* state_;
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
void (*deleted_)(void*, const char* k, size_t klen);
virtual void Put(const Slice& key, const Slice& value) {
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
}
virtual void Delete(const Slice& key) {
(*deleted_)(state_, key.data(), key.size());
}
};
H handler;
handler.state_ = state;
handler.put_ = put;
handler.deleted_ = deleted;
b->rep.Iterate(&handler);
}
leveldb_options_t* leveldb_options_create() {
return new leveldb_options_t;
}
void leveldb_options_destroy(leveldb_options_t* options) {
delete options;
}
void leveldb_options_set_comparator(
leveldb_options_t* opt,
leveldb_comparator_t* cmp) {
opt->rep.comparator = cmp;
}
void leveldb_options_set_filter_policy(
leveldb_options_t* opt,
leveldb_filterpolicy_t* policy) {
opt->rep.filter_policy = policy;
}
void leveldb_options_set_create_if_missing(
leveldb_options_t* opt, unsigned char v) {
opt->rep.create_if_missing = v;
}
void leveldb_options_set_error_if_exists(
leveldb_options_t* opt, unsigned char v) {
opt->rep.error_if_exists = v;
}
void leveldb_options_set_paranoid_checks(
leveldb_options_t* opt, unsigned char v) {
opt->rep.paranoid_checks = v;
}
void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
opt->rep.env = (env ? env->rep : NULL);
}
void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
if (l) {
opt->rep.info_log = l->rep;
}
}
void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
opt->rep.write_buffer_size = s;
}
void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
opt->rep.max_open_files = n;
}
void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
if (c) {
opt->rep.block_cache = c->rep;
}
}
void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
opt->rep.block_size = s;
}
void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
opt->rep.block_restart_interval = n;
}
void leveldb_options_set_target_file_size_base(
leveldb_options_t* opt, uint64_t n) {
opt->rep.target_file_size_base = n;
}
void leveldb_options_set_target_file_size_multiplier(
leveldb_options_t* opt, int n) {
opt->rep.target_file_size_multiplier = n;
}
void leveldb_options_set_max_bytes_for_level_base(
leveldb_options_t* opt, uint64_t n) {
opt->rep.max_bytes_for_level_base = n;
}
void leveldb_options_set_max_bytes_for_level_multiplier(
leveldb_options_t* opt, int n) {
opt->rep.max_bytes_for_level_multiplier = n;
}
void leveldb_options_set_expanded_compaction_factor(
leveldb_options_t* opt, int n) {
opt->rep.expanded_compaction_factor = n;
}
void leveldb_options_set_max_grandparent_overlap_factor(
leveldb_options_t* opt, int n) {
opt->rep.max_grandparent_overlap_factor = n;
}
void leveldb_options_set_num_levels(leveldb_options_t* opt, int n) {
opt->rep.num_levels = n;
}
void leveldb_options_set_level0_file_num_compaction_trigger(
leveldb_options_t* opt, int n) {
opt->rep.level0_file_num_compaction_trigger = n;
}
void leveldb_options_set_level0_slowdown_writes_trigger(
leveldb_options_t* opt, int n) {
opt->rep.level0_slowdown_writes_trigger = n;
}
void leveldb_options_set_level0_stop_writes_trigger(
leveldb_options_t* opt, int n) {
opt->rep.level0_stop_writes_trigger = n;
}
void leveldb_options_set_max_mem_compaction_level(
leveldb_options_t* opt, int n) {
opt->rep.max_mem_compaction_level = n;
}
void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
opt->rep.compression = static_cast<CompressionType>(t);
}
void leveldb_options_set_compression_per_level(leveldb_options_t* opt,
int* level_values,
size_t num_levels) {
opt->rep.compression_per_level.resize(num_levels);
for (size_t i = 0; i < num_levels; ++i) {
opt->rep.compression_per_level[i] =
static_cast<CompressionType>(level_values[i]);
}
}
void leveldb_options_set_compression_options(
leveldb_options_t* opt, int w_bits, int level, int strategy) {
opt->rep.compression_opts.window_bits = w_bits;
opt->rep.compression_opts.level = level;
opt->rep.compression_opts.strategy = strategy;
}
void leveldb_options_set_disable_data_sync(
leveldb_options_t* opt, bool disable_data_sync) {
opt->rep.disableDataSync = disable_data_sync;
}
void leveldb_options_set_use_fsync(
leveldb_options_t* opt, bool use_fsync) {
opt->rep.use_fsync = use_fsync;
}
void leveldb_options_set_db_stats_log_interval(
leveldb_options_t* opt, int db_stats_log_interval) {
opt->rep.db_stats_log_interval = db_stats_log_interval;
}
void leveldb_options_set_db_log_dir(
leveldb_options_t* opt, const char* db_log_dir) {
opt->rep.db_log_dir = db_log_dir;
}
void leveldb_options_set_WAL_ttl_seconds(leveldb_options_t* opt, uint64_t ttl) {
opt->rep.WAL_ttl_seconds = ttl;
}
void leveldb_options_set_WAL_size_limit_MB(
leveldb_options_t* opt, uint64_t limit) {
opt->rep.WAL_size_limit_MB = limit;
}
leveldb_comparator_t* leveldb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*)) {
leveldb_comparator_t* result = new leveldb_comparator_t;
result->state_ = state;
result->destructor_ = destructor;
result->compare_ = compare;
result->name_ = name;
return result;
}
void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
delete cmp;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*)) {
leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
result->state_ = state;
result->destructor_ = destructor;
result->create_ = create_filter;
result->key_match_ = key_may_match;
result->name_ = name;
return result;
}
void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
delete filter;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
// Make a leveldb_filterpolicy_t, but override all of its methods so
// they delegate to a NewBloomFilterPolicy() instead of user
// supplied C functions.
struct Wrapper : public leveldb_filterpolicy_t {
const FilterPolicy* rep_;
~Wrapper() { delete rep_; }
const char* Name() const { return rep_->Name(); }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
return rep_->CreateFilter(keys, n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
return rep_->KeyMayMatch(key, filter);
}
static void DoNothing(void*) { }
};
Wrapper* wrapper = new Wrapper;
wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
wrapper->state_ = NULL;
wrapper->destructor_ = &Wrapper::DoNothing;
return wrapper;
}
leveldb_readoptions_t* leveldb_readoptions_create() {
return new leveldb_readoptions_t;
}
void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) {
delete opt;
}
void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t* opt,
unsigned char v) {
opt->rep.verify_checksums = v;
}
void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t* opt, unsigned char v) {
opt->rep.fill_cache = v;
}
void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t* opt,
const leveldb_snapshot_t* snap) {
opt->rep.snapshot = (snap ? snap->rep : NULL);
}
leveldb_writeoptions_t* leveldb_writeoptions_create() {
return new leveldb_writeoptions_t;
}
void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) {
delete opt;
}
void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t* opt, unsigned char v) {
opt->rep.sync = v;
}
leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
leveldb_cache_t* c = new leveldb_cache_t;
c->rep = NewLRUCache(capacity);
return c;
}
void leveldb_cache_destroy(leveldb_cache_t* cache) {
delete cache;
}
leveldb_env_t* leveldb_create_default_env() {
leveldb_env_t* result = new leveldb_env_t;
result->rep = Env::Default();
result->is_default = true;
return result;
}
void leveldb_env_destroy(leveldb_env_t* env) {
if (!env->is_default) delete env->rep;
delete env;
}
} // end extern "C"

390
db/c_test.c Normal file
View File

@ -0,0 +1,390 @@
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors. */
#include "rocksdb/c.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
const char* phase = "";
static char dbname[200];
static void StartPhase(const char* name) {
fprintf(stderr, "=== Test %s\n", name);
phase = name;
}
static const char* GetTempDir(void) {
const char* ret = getenv("TEST_TMPDIR");
if (ret == NULL || ret[0] == '\0')
ret = "/tmp";
return ret;
}
#define CheckNoError(err) \
if ((err) != NULL) { \
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
abort(); \
}
#define CheckCondition(cond) \
if (!(cond)) { \
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
abort(); \
}
static void CheckEqual(const char* expected, const char* v, size_t n) {
if (expected == NULL && v == NULL) {
// ok
} else if (expected != NULL && v != NULL && n == strlen(expected) &&
memcmp(expected, v, n) == 0) {
// ok
return;
} else {
fprintf(stderr, "%s: expected '%s', got '%s'\n",
phase,
(expected ? expected : "(null)"),
(v ? v : "(null"));
abort();
}
}
static void Free(char** ptr) {
if (*ptr) {
free(*ptr);
*ptr = NULL;
}
}
static void CheckGet(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key,
const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckIter(leveldb_iterator_t* iter,
const char* key, const char* val) {
size_t len;
const char* str;
str = leveldb_iter_key(iter, &len);
CheckEqual(key, str, len);
str = leveldb_iter_value(iter, &len);
CheckEqual(val, str, len);
}
// Callback from leveldb_writebatch_iterate()
static void CheckPut(void* ptr,
const char* k, size_t klen,
const char* v, size_t vlen) {
int* state = (int*) ptr;
CheckCondition(*state < 2);
switch (*state) {
case 0:
CheckEqual("bar", k, klen);
CheckEqual("b", v, vlen);
break;
case 1:
CheckEqual("box", k, klen);
CheckEqual("c", v, vlen);
break;
}
(*state)++;
}
// Callback from leveldb_writebatch_iterate()
static void CheckDel(void* ptr, const char* k, size_t klen) {
int* state = (int*) ptr;
CheckCondition(*state == 2);
CheckEqual("bar", k, klen);
(*state)++;
}
static void CmpDestroy(void* arg) { }
static int CmpCompare(void* arg, const char* a, size_t alen,
const char* b, size_t blen) {
int n = (alen < blen) ? alen : blen;
int r = memcmp(a, b, n);
if (r == 0) {
if (alen < blen) r = -1;
else if (alen > blen) r = +1;
}
return r;
}
static const char* CmpName(void* arg) {
return "foo";
}
// Custom filter policy
static unsigned char fake_filter_result = 1;
static void FilterDestroy(void* arg) { }
static const char* FilterName(void* arg) {
return "TestFilter";
}
static char* FilterCreate(
void* arg,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length) {
*filter_length = 4;
char* result = malloc(4);
memcpy(result, "fake", 4);
return result;
}
unsigned char FilterKeyMatch(
void* arg,
const char* key, size_t length,
const char* filter, size_t filter_length) {
CheckCondition(filter_length == 4);
CheckCondition(memcmp(filter, "fake", 4) == 0);
return fake_filter_result;
}
int main(int argc, char** argv) {
leveldb_t* db;
leveldb_comparator_t* cmp;
leveldb_cache_t* cache;
leveldb_env_t* env;
leveldb_options_t* options;
leveldb_readoptions_t* roptions;
leveldb_writeoptions_t* woptions;
char* err = NULL;
int run = -1;
snprintf(dbname, sizeof(dbname),
"%s/leveldb_c_test-%d",
GetTempDir(),
((int) geteuid()));
StartPhase("create_objects");
cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
env = leveldb_create_default_env();
cache = leveldb_cache_create_lru(100000);
options = leveldb_options_create();
leveldb_options_set_comparator(options, cmp);
leveldb_options_set_error_if_exists(options, 1);
leveldb_options_set_cache(options, cache);
leveldb_options_set_env(options, env);
leveldb_options_set_info_log(options, NULL);
leveldb_options_set_write_buffer_size(options, 100000);
leveldb_options_set_paranoid_checks(options, 1);
leveldb_options_set_max_open_files(options, 10);
leveldb_options_set_block_size(options, 1024);
leveldb_options_set_block_restart_interval(options, 8);
leveldb_options_set_compression(options, leveldb_no_compression);
leveldb_options_set_compression_options(options, -14, -1, 0);
int compression_levels[] = {leveldb_no_compression, leveldb_no_compression,
leveldb_no_compression, leveldb_no_compression};
leveldb_options_set_compression_per_level(options, compression_levels, 4);
roptions = leveldb_readoptions_create();
leveldb_readoptions_set_verify_checksums(roptions, 1);
leveldb_readoptions_set_fill_cache(roptions, 0);
woptions = leveldb_writeoptions_create();
leveldb_writeoptions_set_sync(woptions, 1);
StartPhase("destroy");
leveldb_destroy_db(options, dbname, &err);
Free(&err);
StartPhase("open_error");
db = leveldb_open(options, dbname, &err);
CheckCondition(err != NULL);
Free(&err);
StartPhase("open");
leveldb_options_set_create_if_missing(options, 1);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
StartPhase("put");
leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactall");
leveldb_compact_range(db, NULL, 0, NULL, 0);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactrange");
leveldb_compact_range(db, "a", 1, "z", 1);
CheckGet(db, roptions, "foo", "hello");
StartPhase("writebatch");
{
leveldb_writebatch_t* wb = leveldb_writebatch_create();
leveldb_writebatch_put(wb, "foo", 3, "a", 1);
leveldb_writebatch_clear(wb);
leveldb_writebatch_put(wb, "bar", 3, "b", 1);
leveldb_writebatch_put(wb, "box", 3, "c", 1);
leveldb_writebatch_delete(wb, "bar", 3);
leveldb_write(db, woptions, wb, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
int pos = 0;
leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
CheckCondition(pos == 3);
leveldb_writebatch_destroy(wb);
}
StartPhase("iter");
{
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_first(iter);
CheckCondition(leveldb_iter_valid(iter));
CheckIter(iter, "box", "c");
leveldb_iter_next(iter);
CheckIter(iter, "foo", "hello");
leveldb_iter_prev(iter);
CheckIter(iter, "box", "c");
leveldb_iter_prev(iter);
CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_last(iter);
CheckIter(iter, "foo", "hello");
leveldb_iter_seek(iter, "b", 1);
CheckIter(iter, "box", "c");
leveldb_iter_get_error(iter, &err);
CheckNoError(err);
leveldb_iter_destroy(iter);
}
StartPhase("approximate_sizes");
{
int i;
int n = 20000;
char keybuf[100];
char valbuf[100];
uint64_t sizes[2];
const char* start[2] = { "a", "k00000000000000010000" };
size_t start_len[2] = { 1, 21 };
const char* limit[2] = { "k00000000000000010000", "z" };
size_t limit_len[2] = { 21, 1 };
leveldb_writeoptions_set_sync(woptions, 0);
for (i = 0; i < n; i++) {
snprintf(keybuf, sizeof(keybuf), "k%020d", i);
snprintf(valbuf, sizeof(valbuf), "v%020d", i);
leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
&err);
CheckNoError(err);
}
leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
CheckCondition(sizes[0] > 0);
CheckCondition(sizes[1] > 0);
}
StartPhase("property");
{
char* prop = leveldb_property_value(db, "nosuchprop");
CheckCondition(prop == NULL);
prop = leveldb_property_value(db, "rocksdb.stats");
CheckCondition(prop != NULL);
Free(&prop);
}
StartPhase("snapshot");
{
const leveldb_snapshot_t* snap;
snap = leveldb_create_snapshot(db);
leveldb_delete(db, woptions, "foo", 3, &err);
CheckNoError(err);
leveldb_readoptions_set_snapshot(roptions, snap);
CheckGet(db, roptions, "foo", "hello");
leveldb_readoptions_set_snapshot(roptions, NULL);
CheckGet(db, roptions, "foo", NULL);
leveldb_release_snapshot(db, snap);
}
StartPhase("repair");
{
// If we do not compact here, then the lazy deletion of
// files (https://reviews.facebook.net/D6123) would leave
// around deleted files and the repair process will find
// those files and put them back into the database.
leveldb_compact_range(db, NULL, 0, NULL, 0);
leveldb_close(db);
leveldb_options_set_create_if_missing(options, 0);
leveldb_options_set_error_if_exists(options, 0);
leveldb_repair_db(options, dbname, &err);
CheckNoError(err);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
leveldb_options_set_create_if_missing(options, 1);
leveldb_options_set_error_if_exists(options, 1);
}
StartPhase("filter");
for (run = 0; run < 2; run++) {
// First run uses custom filter, second run uses bloom filter
CheckNoError(err);
leveldb_filterpolicy_t* policy;
if (run == 0) {
policy = leveldb_filterpolicy_create(
NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
} else {
policy = leveldb_filterpolicy_create_bloom(10);
}
// Create new database
leveldb_close(db);
leveldb_destroy_db(options, dbname, &err);
leveldb_options_set_filter_policy(options, policy);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
CheckNoError(err);
leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
CheckNoError(err);
leveldb_compact_range(db, NULL, 0, NULL, 0);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
if (phase == 0) {
// Must not find value when custom filter returns false
fake_filter_result = 0;
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
}
leveldb_options_set_filter_policy(options, NULL);
leveldb_filterpolicy_destroy(policy);
}
StartPhase("cleanup");
leveldb_close(db);
leveldb_options_destroy(options);
leveldb_readoptions_destroy(roptions);
leveldb_writeoptions_destroy(woptions);
leveldb_cache_destroy(cache);
leveldb_comparator_destroy(cmp);
leveldb_env_destroy(env);
fprintf(stderr, "PASS\n");
return 0;
}

378
db/corruption_test.cc Normal file
View File

@ -0,0 +1,378 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/db.h"
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "rocksdb/write_batch.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/log_format.h"
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
static const int kValueSize = 1000;
class CorruptionTest {
public:
test::ErrorEnv env_;
std::string dbname_;
shared_ptr<Cache> tiny_cache_;
Options options_;
DB* db_;
CorruptionTest() {
tiny_cache_ = NewLRUCache(100);
options_.env = &env_;
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, options_);
db_ = nullptr;
options_.create_if_missing = true;
options_.block_size_deviation = 0; // make unit test pass for now
Reopen();
options_.create_if_missing = false;
}
~CorruptionTest() {
delete db_;
DestroyDB(dbname_, Options());
}
Status TryReopen(Options* options = nullptr) {
delete db_;
db_ = nullptr;
Options opt = (options ? *options : options_);
opt.env = &env_;
opt.block_cache = tiny_cache_;
opt.block_size_deviation = 0;
opt.arena_block_size = 4096;
return DB::Open(opt, dbname_, &db_);
}
void Reopen(Options* options = nullptr) {
ASSERT_OK(TryReopen(options));
}
void RepairDB() {
delete db_;
db_ = nullptr;
ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
}
void Build(int n) {
std::string key_space, value_space;
WriteBatch batch;
for (int i = 0; i < n; i++) {
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
ASSERT_OK(db_->Write(WriteOptions(), &batch));
}
}
void Check(int min_expected, int max_expected) {
unsigned int next_expected = 0;
int missed = 0;
int bad_keys = 0;
int bad_values = 0;
int correct = 0;
std::string value_space;
Iterator* iter = db_->NewIterator(ReadOptions());
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key;
Slice in(iter->key());
if (!ConsumeDecimalNumber(&in, &key) ||
!in.empty() ||
key < next_expected) {
bad_keys++;
continue;
}
missed += (key - next_expected);
next_expected = key + 1;
if (iter->value() != Value(key, &value_space)) {
bad_values++;
} else {
correct++;
}
}
delete iter;
fprintf(stderr,
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
min_expected, max_expected, correct, bad_keys, bad_values, missed);
ASSERT_LE(min_expected, correct);
ASSERT_GE(max_expected, correct);
}
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
// Pick file to corrupt
std::vector<std::string> filenames;
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
std::string fname;
int picked_number = -1;
for (unsigned int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) &&
type == filetype &&
int(number) > picked_number) { // Pick latest file
fname = dbname_ + "/" + filenames[i];
picked_number = number;
}
}
ASSERT_TRUE(!fname.empty()) << filetype;
struct stat sbuf;
if (stat(fname.c_str(), &sbuf) != 0) {
const char* msg = strerror(errno);
ASSERT_TRUE(false) << fname << ": " << msg;
}
if (offset < 0) {
// Relative to end of file; make it absolute
if (-offset > sbuf.st_size) {
offset = 0;
} else {
offset = sbuf.st_size + offset;
}
}
if (offset > sbuf.st_size) {
offset = sbuf.st_size;
}
if (offset + bytes_to_corrupt > sbuf.st_size) {
bytes_to_corrupt = sbuf.st_size - offset;
}
// Do it
std::string contents;
Status s = ReadFileToString(Env::Default(), fname, &contents);
ASSERT_TRUE(s.ok()) << s.ToString();
for (int i = 0; i < bytes_to_corrupt; i++) {
contents[i + offset] ^= 0x80;
}
s = WriteStringToFile(Env::Default(), contents, fname);
ASSERT_TRUE(s.ok()) << s.ToString();
}
int Property(const std::string& name) {
std::string property;
int result;
if (db_->GetProperty(name, &property) &&
sscanf(property.c_str(), "%d", &result) == 1) {
return result;
} else {
return -1;
}
}
// Return the ith key
Slice Key(int i, std::string* storage) {
char buf[100];
snprintf(buf, sizeof(buf), "%016d", i);
storage->assign(buf, strlen(buf));
return Slice(*storage);
}
// Return the value to associate with the specified key
Slice Value(int k, std::string* storage) {
Random r(k);
return test::RandomString(&r, kValueSize, storage);
}
};
TEST(CorruptionTest, Recovery) {
Build(100);
Check(100, 100);
Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
Reopen();
// The 64 records in the first two log blocks are completely lost.
Check(36, 36);
}
TEST(CorruptionTest, RecoverWriteError) {
env_.writable_file_error_ = true;
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
}
TEST(CorruptionTest, NewFileErrorDuringWrite) {
// Do enough writing to force minor compaction
env_.writable_file_error_ = true;
const int num = 3 + (Options().write_buffer_size / kValueSize);
std::string value_storage;
Status s;
bool failed = false;
for (int i = 0; i < num; i++) {
WriteBatch batch;
batch.Put("a", Value(100, &value_storage));
s = db_->Write(WriteOptions(), &batch);
if (!s.ok()) {
failed = true;
}
ASSERT_TRUE(!failed || !s.ok());
}
ASSERT_TRUE(!s.ok());
ASSERT_GE(env_.num_writable_file_errors_, 1);
env_.writable_file_error_ = false;
Reopen();
}
TEST(CorruptionTest, TableFile) {
Build(100);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
dbi->TEST_CompactRange(0, nullptr, nullptr);
dbi->TEST_CompactRange(1, nullptr, nullptr);
Corrupt(kTableFile, 100, 1);
Check(99, 99);
}
TEST(CorruptionTest, TableFileIndexData) {
Build(10000); // Enough to build multiple Tables
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
Corrupt(kTableFile, -2000, 500);
Reopen();
Check(5000, 9999);
}
TEST(CorruptionTest, MissingDescriptor) {
Build(1000);
RepairDB();
Reopen();
Check(1000, 1000);
}
TEST(CorruptionTest, SequenceNumberRecovery) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v5", v);
// Write something. If sequence number was not recovered properly,
// it will be hidden by an earlier write.
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
}
TEST(CorruptionTest, CorruptedDescriptor) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
dbi->TEST_CompactRange(0, nullptr, nullptr);
Corrupt(kDescriptorFile, 0, 1000);
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("hello", v);
}
TEST(CorruptionTest, CompactionInputError) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
const int last = dbi->MaxMemCompactionLevel();
ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
// Force compactions by writing lots of values
Build(10000);
Check(10000, 10000);
}
TEST(CorruptionTest, CompactionInputErrorParanoid) {
Options options;
options.paranoid_checks = true;
options.write_buffer_size = 1048576;
Reopen(&options);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
// Fill levels >= 1 so memtable compaction outputs to level 1
for (int level = 1; level < dbi->NumberLevels(); level++) {
dbi->Put(WriteOptions(), "", "begin");
dbi->Put(WriteOptions(), "~", "end");
dbi->TEST_FlushMemTable();
}
Build(10);
dbi->TEST_FlushMemTable();
dbi->TEST_WaitForCompact();
ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
Corrupt(kTableFile, 100, 1);
Check(9, 9);
// Write must eventually fail because of corrupted table
Status s;
std::string tmp1, tmp2;
bool failed = false;
for (int i = 0; i < 10000 && s.ok(); i++) {
s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
if (!s.ok()) {
failed = true;
}
// if one write failed, every subsequent write must fail, too
ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
}
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}
TEST(CorruptionTest, UnrelatedKeys) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
Corrupt(kTableFile, 100, 1);
std::string tmp1, tmp2;
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
dbi->TEST_FlushMemTable();
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

2461
db/db_bench.cc Normal file

File diff suppressed because it is too large Load Diff

100
db/db_filesnapshot.cc Normal file
View File

@ -0,0 +1,100 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <algorithm>
#include <string>
#include <stdint.h>
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/mutexlock.h"
namespace rocksdb {
Status DBImpl::DisableFileDeletions() {
MutexLock l(&mutex_);
disable_delete_obsolete_files_ = true;
Log(options_.info_log, "File Deletions Disabled");
return Status::OK();
}
Status DBImpl::EnableFileDeletions() {
DeletionState deletion_state;
{
MutexLock l(&mutex_);
disable_delete_obsolete_files_ = false;
Log(options_.info_log, "File Deletions Enabled");
FindObsoleteFiles(deletion_state, true);
}
PurgeObsoleteFiles(deletion_state);
LogFlush(options_.info_log);
return Status::OK();
}
Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
uint64_t* manifest_file_size,
bool flush_memtable) {
*manifest_file_size = 0;
if (flush_memtable) {
// flush all dirty data to disk.
Status status = Flush(FlushOptions());
if (!status.ok()) {
Log(options_.info_log, "Cannot Flush data %s\n",
status.ToString().c_str());
return status;
}
}
MutexLock l(&mutex_);
// Make a set of all of the live *.sst files
std::set<uint64_t> live;
versions_->AddLiveFilesCurrentVersion(&live);
ret.clear();
ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
// create names of the live files. The names are not absolute
// paths, instead they are relative to dbname_;
for (auto live_file : live) {
ret.push_back(TableFileName("", live_file));
}
ret.push_back(CurrentFileName(""));
ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
// find length of manifest file while holding the mutex lock
*manifest_file_size = versions_->ManifestFileSize();
return Status::OK();
}
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
// First get sorted files in archive dir, then append sorted files from main
// dir to maintain sorted order
// list wal files in archive dir.
Status s;
std::string archivedir = ArchivalDirectory(options_.wal_dir);
if (env_->FileExists(archivedir)) {
s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile);
if (!s.ok()) {
return s;
}
}
// list wal files in main db dir.
return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile);
}
}

3595
db/db_impl.cc Normal file

File diff suppressed because it is too large Load Diff

497
db/db_impl.h Normal file
View File

@ -0,0 +1,497 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <atomic>
#include <deque>
#include <set>
#include <vector>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "db/version_edit.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/transaction_log.h"
#include "port/port.h"
#include "util/stats_logger.h"
#include "memtablelist.h"
namespace rocksdb {
class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class DBImpl : public DB {
public:
DBImpl(const Options& options, const std::string& dbname);
virtual ~DBImpl();
// Implementations of the DB interface
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
virtual Status Merge(const WriteOptions&, const Slice& key,
const Slice& value);
virtual Status Delete(const WriteOptions&, const Slice& key);
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value);
virtual std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values);
// Returns false if key doesn't exist in the database and true if it may.
// If value_found is not passed in as null, then return the value if found in
// memory. On return, if value was found, then value_found will be set to true
// , otherwise false.
virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key,
std::string* value,
bool* value_found = nullptr);
virtual Iterator* NewIterator(const ReadOptions&);
virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, std::string* value);
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
virtual void CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1);
virtual int NumberLevels();
virtual int MaxMemCompactionLevel();
virtual int Level0StopWriteTrigger();
virtual Status Flush(const FlushOptions& options);
virtual Status DisableFileDeletions();
virtual Status EnableFileDeletions();
// All the returned filenames start with "/"
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true);
virtual Status GetSortedWalFiles(VectorLogPtr& files);
virtual SequenceNumber GetLatestSequenceNumber() const;
virtual Status GetUpdatesSince(SequenceNumber seq_number,
unique_ptr<TransactionLogIterator>* iter);
virtual Status DeleteFile(std::string name);
virtual void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata);
// Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [*begin, *end]
void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
// Force current memtable contents to be flushed.
Status TEST_FlushMemTable();
// Wait for memtable compaction
Status TEST_WaitForFlushMemTable();
// Wait for any compaction
Status TEST_WaitForCompact();
// Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator();
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes();
// Simulate a db crash, no elegant closing of database.
void TEST_Destroy_DBImpl();
// Return the current manifest file no.
uint64_t TEST_Current_Manifest_FileNo();
// Trigger's a background call for testing.
void TEST_PurgeObsoleteteWAL();
// get total level0 file size. Only for testing.
uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);}
void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
{
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
}
// needed for CleanupIteratorState
struct DeletionState {
inline bool HaveSomethingToDelete() const {
return all_files.size() ||
sst_delete_files.size() ||
log_delete_files.size();
}
// a list of all files that we'll consider deleting
// (every once in a while this is filled up with all files
// in the DB directory)
std::vector<std::string> all_files;
// the list of all live sst files that cannot be deleted
std::vector<uint64_t> sst_live;
// a list of sst files that we need to delete
std::vector<FileMetaData*> sst_delete_files;
// a list of log files that we need to delete
std::vector<uint64_t> log_delete_files;
// the current manifest_file_number, log_number and prev_log_number
// that corresponds to the set of files in 'live'.
uint64_t manifest_file_number, log_number, prev_log_number;
DeletionState() {
manifest_file_number = 0;
log_number = 0;
prev_log_number = 0;
}
};
// Returns the list of live files in 'live' and the list
// of all files in the filesystem in 'all_files'.
// If force == false and the last call was less than
// options_.delete_obsolete_files_period_micros microseconds ago,
// it will not fill up the deletion_state
void FindObsoleteFiles(DeletionState& deletion_state,
bool force,
bool no_full_scan = false);
// Diffs the files listed in filenames and those that do not
// belong to live files are posibly removed. Also, removes all the
// files in sst_delete_files and log_delete_files.
// It is not necessary to hold the mutex when invoking this method.
void PurgeObsoleteFiles(DeletionState& deletion_state);
protected:
Env* const env_;
const std::string dbname_;
unique_ptr<VersionSet> versions_;
const InternalKeyComparator internal_comparator_;
const Options options_; // options_.comparator == &internal_comparator_
const Comparator* user_comparator() const {
return internal_comparator_.user_comparator();
}
MemTable* GetMemTable() {
return mem_;
}
Iterator* NewInternalIterator(const ReadOptions&,
SequenceNumber* latest_snapshot);
private:
friend class DB;
struct CompactionState;
struct Writer;
Status NewDB();
// Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. Any changes to
// be made to the descriptor are added to *edit.
Status Recover(VersionEdit* edit, MemTable* external_table = nullptr,
bool error_if_log_file_exist = false);
void MaybeIgnoreError(Status* s) const;
const Status CreateArchivalDirectory();
// Delete any unneeded files and stale in-memory entries.
void DeleteObsoleteFiles();
// Flush the in-memory write buffer to storage. Switches to a new
// log-file/memtable and writes a new descriptor iff successful.
Status FlushMemTableToOutputFile(bool* madeProgress,
DeletionState& deletion_state);
Status RecoverLogFile(uint64_t log_number,
VersionEdit* edit,
SequenceNumber* max_sequence,
MemTable* external_table);
// The following two methods are used to flush a memtable to
// storage. The first one is used atdatabase RecoveryTime (when the
// database is opened) and is heavyweight because it holds the mutex
// for the entire period. The second method WriteLevel0Table supports
// concurrent flush memtables to storage.
Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
uint64_t* filenumber);
uint64_t SlowdownAmount(int n, int top, int bottom);
Status MakeRoomForWrite(bool force /* compact even if there is room? */);
WriteBatch* BuildBatchGroup(Writer** last_writer);
// Force current memtable contents to be flushed.
Status FlushMemTable(const FlushOptions& options);
// Wait for memtable flushed
Status WaitForFlushMemTable();
void MaybeScheduleLogDBDeployStats();
static void BGLogDBDeployStats(void* db);
void LogDBDeployStats();
void MaybeScheduleFlushOrCompaction();
static void BGWorkCompaction(void* db);
static void BGWorkFlush(void* db);
void BackgroundCallCompaction();
void BackgroundCallFlush();
Status BackgroundCompaction(bool* madeProgress,DeletionState& deletion_state);
Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state);
void CleanupCompaction(CompactionState* compact, Status status);
Status DoCompactionWork(CompactionState* compact,
DeletionState& deletion_state);
Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact);
void AllocateCompactionOutputFileNumbers(CompactionState* compact);
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
void PurgeObsoleteWALFiles();
Status AppendSortedWalsOfType(const std::string& path,
VectorLogPtr& log_files,
WalFileType type);
// Requires: all_logs should be sorted with earliest log file first
// Retains all log files in all_logs which contain updates with seq no.
// Greater Than or Equal to the requested SequenceNumber.
Status RetainProbableWalFiles(VectorLogPtr& all_logs,
const SequenceNumber target);
// return true if
bool CheckWalFileExistsAndEmpty(const WalFileType type,
const uint64_t number);
Status ReadFirstRecord(const WalFileType type, const uint64_t number,
WriteBatch* const result);
Status ReadFirstLine(const std::string& fname, WriteBatch* const batch);
void PrintStatistics();
// dump rocksdb.stats to LOG
void MaybeDumpStats();
// Return the minimum empty level that could hold the total data in the
// input level. Return the input level, if such level could not be found.
int FindMinimumEmptyLevelFitting(int level);
// Move the files in the input level to the target level.
// If target_level < 0, automatically calculate the minimum level that could
// hold the data set.
void ReFitLevel(int level, int target_level = -1);
// Constant after construction
const InternalFilterPolicy internal_filter_policy_;
bool owns_info_log_;
// table_cache_ provides its own synchronization
unique_ptr<TableCache> table_cache_;
// Lock over the persistent DB state. Non-nullptr iff successfully acquired.
FileLock* db_lock_;
// State below is protected by mutex_
port::Mutex mutex_;
port::AtomicPointer shutting_down_;
port::CondVar bg_cv_; // Signalled when background work finishes
std::shared_ptr<MemTableRepFactory> mem_rep_factory_;
MemTable* mem_;
MemTableList imm_; // Memtable that are not changing
uint64_t logfile_number_;
unique_ptr<log::Writer> log_;
std::string host_name_;
// Queue of writers.
std::deque<Writer*> writers_;
WriteBatch tmp_batch_;
SnapshotList snapshots_;
// Set of table files to protect from deletion because they are
// part of ongoing compactions.
std::set<uint64_t> pending_outputs_;
// count how many background compaction been scheduled or is running?
int bg_compaction_scheduled_;
// number of background memtable flush jobs, submitted to the HIGH pool
int bg_flush_scheduled_;
// Has a background stats log thread scheduled?
bool bg_logstats_scheduled_;
// Information for a manual compaction
struct ManualCompaction {
int level;
bool done;
bool in_progress; // compaction request being processed?
const InternalKey* begin; // nullptr means beginning of key range
const InternalKey* end; // nullptr means end of key range
InternalKey tmp_storage; // Used to keep track of compaction progress
};
ManualCompaction* manual_compaction_;
// Have we encountered a background error in paranoid mode?
Status bg_error_;
std::unique_ptr<StatsLogger> logger_;
int64_t volatile last_log_ts;
// shall we disable deletion of obsolete files
bool disable_delete_obsolete_files_;
// last time when DeleteObsoleteFiles was invoked
uint64_t delete_obsolete_files_last_run_;
// last time when PurgeObsoleteWALFiles ran.
uint64_t purge_wal_files_last_run_;
// last time stats were dumped to LOG
std::atomic<uint64_t> last_stats_dump_time_microsec_;
// obsolete files will be deleted every this seconds if ttl deletion is
// enabled and archive size_limit is disabled.
uint64_t default_interval_to_delete_obsolete_WAL_;
// These count the number of microseconds for which MakeRoomForWrite stalls.
uint64_t stall_level0_slowdown_;
uint64_t stall_memtable_compaction_;
uint64_t stall_level0_num_files_;
std::vector<uint64_t> stall_leveln_slowdown_;
uint64_t stall_level0_slowdown_count_;
uint64_t stall_memtable_compaction_count_;
uint64_t stall_level0_num_files_count_;
std::vector<uint64_t> stall_leveln_slowdown_count_;
// Time at which this instance was started.
const uint64_t started_at_;
bool flush_on_destroy_; // Used when disableWAL is true.
// Per level compaction stats. stats_[level] stores the stats for
// compactions that produced data for the specified "level".
struct CompactionStats {
uint64_t micros;
// Bytes read from level N during compaction between levels N and N+1
int64_t bytes_readn;
// Bytes read from level N+1 during compaction between levels N and N+1
int64_t bytes_readnp1;
// Total bytes written during compaction between levels N and N+1
int64_t bytes_written;
// Files read from level N during compaction between levels N and N+1
int files_in_leveln;
// Files read from level N+1 during compaction between levels N and N+1
int files_in_levelnp1;
// Files written during compaction between levels N and N+1
int files_out_levelnp1;
// Number of compactions done
int count;
CompactionStats() : micros(0), bytes_readn(0), bytes_readnp1(0),
bytes_written(0), files_in_leveln(0),
files_in_levelnp1(0), files_out_levelnp1(0),
count(0) { }
void Add(const CompactionStats& c) {
this->micros += c.micros;
this->bytes_readn += c.bytes_readn;
this->bytes_readnp1 += c.bytes_readnp1;
this->bytes_written += c.bytes_written;
this->files_in_leveln += c.files_in_leveln;
this->files_in_levelnp1 += c.files_in_levelnp1;
this->files_out_levelnp1 += c.files_out_levelnp1;
this->count += 1;
}
};
std::vector<CompactionStats> stats_;
// Used to compute per-interval statistics
struct StatsSnapshot {
uint64_t bytes_read_;
uint64_t bytes_written_;
uint64_t bytes_new_;
double seconds_up_;
StatsSnapshot() : bytes_read_(0), bytes_written_(0),
bytes_new_(0), seconds_up_(0) {}
};
StatsSnapshot last_stats_;
static const int KEEP_LOG_FILE_NUM = 1000;
std::string db_absolute_path_;
// count of the number of contiguous delaying writes
int delayed_writes_;
// The options to access storage files
const EnvOptions storage_options_;
// A value of true temporarily disables scheduling of background work
bool bg_work_gate_closed_;
// Guard against multiple concurrent refitting
bool refitting_level_;
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);
// dump the delayed_writes_ to the log file and reset counter.
void DelayLoggingAndReset();
// Return the earliest snapshot where seqno is visible.
// Store the snapshot right before that, if any, in prev_snapshot
inline SequenceNumber findEarliestVisibleSnapshot(
SequenceNumber in,
std::vector<SequenceNumber>& snapshots,
SequenceNumber* prev_snapshot);
// Function that Get and KeyMayExist call with no_io true or false
// Note: 'value_found' from KeyMayExist propagates here
Status GetImpl(const ReadOptions& options,
const Slice& key,
std::string* value,
bool* value_found = nullptr);
};
// Sanitize db options. The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src);
// Determine compression type, based on user options, level of the output
// file and whether compression is disabled.
// If enable_compression is false, then compression is always disabled no
// matter what the values of the other two parameters are.
// Otherwise, the compression type is determined based on options and level.
CompressionType GetCompressionType(const Options& options, int level,
const bool enable_compression);
} // namespace rocksdb

99
db/db_impl_readonly.cc Normal file
View File

@ -0,0 +1,99 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "db/db_impl_readonly.h"
#include "db/db_impl.h"
#include <algorithm>
#include <set>
#include <string>
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include <algorithm>
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "port/port.h"
#include "table/block.h"
#include "table/merger.h"
#include "table/two_level_iterator.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/build_version.h"
namespace rocksdb {
DBImplReadOnly::DBImplReadOnly(const Options& options,
const std::string& dbname)
: DBImpl(options, dbname) {
Log(options_.info_log, "Opening the db in read only mode");
}
DBImplReadOnly::~DBImplReadOnly() {
}
// Implementations of the DB interface
Status DBImplReadOnly::Get(const ReadOptions& options,
const Slice& key,
std::string* value) {
Status s;
MemTable* mem = GetMemTable();
Version* current = versions_->current();
SequenceNumber snapshot = versions_->LastSequence();
std::deque<std::string> merge_operands;
LookupKey lkey(key, snapshot);
if (mem->Get(lkey, value, &s, &merge_operands, options_)) {
} else {
Version::GetStats stats;
current->Get(options, lkey, value, &s, &merge_operands, &stats, options_);
}
return s;
}
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) {
SequenceNumber latest_snapshot;
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
return NewDBIterator(
&dbname_, env_, options_, user_comparator(),internal_iter,
(options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot));
}
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
DB** dbptr, bool error_if_log_file_exist) {
*dbptr = nullptr;
DBImplReadOnly* impl = new DBImplReadOnly(options, dbname);
impl->mutex_.Lock();
VersionEdit edit(impl->NumberLevels());
Status s = impl->Recover(&edit, impl->GetMemTable(),
error_if_log_file_exist);
impl->mutex_.Unlock();
if (s.ok()) {
*dbptr = impl;
} else {
delete impl;
}
return s;
}
}

78
db/db_impl_readonly.h Normal file
View File

@ -0,0 +1,78 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#pragma once
#include "db/db_impl.h"
#include <deque>
#include <set>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/stats_logger.h"
namespace rocksdb {
class DBImplReadOnly : public DBImpl {
public:
DBImplReadOnly(const Options& options, const std::string& dbname);
virtual ~DBImplReadOnly();
// Implementations of the DB interface
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value);
// TODO: Implement ReadOnly MultiGet?
virtual Iterator* NewIterator(const ReadOptions&);
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Merge(const WriteOptions&, const Slice& key,
const Slice& value) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Delete(const WriteOptions&, const Slice& key) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual void CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1) {
}
virtual Status DisableFileDeletions() {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status EnableFileDeletions() {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Flush(const FlushOptions& options) {
return Status::NotSupported("Not supported operation in read only mode.");
}
private:
friend class DB;
// No copying allowed
DBImplReadOnly(const DBImplReadOnly&);
void operator=(const DBImplReadOnly&);
};
}

481
db/db_iter.cc Normal file
View File

@ -0,0 +1,481 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_iter.h"
#include <stdexcept>
#include <deque>
#include "db/filename.h"
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/perf_context_imp.h"
namespace rocksdb {
#if 0
static void DumpInternalIter(Iterator* iter) {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey k;
if (!ParseInternalKey(iter->key(), &k)) {
fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
} else {
fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
}
}
}
#endif
namespace {
// Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries. DBIter
// combines multiple entries for the same userkey found in the DB
// representation into a single entry while accounting for sequence
// numbers, deletion markers, overwrites, etc.
class DBIter: public Iterator {
public:
// The following is grossly complicated. TODO: clean it up
// Which direction is the iterator currently moving?
// (1) When moving forward, the internal iterator is positioned at
// the exact entry that yields this->key(), this->value()
// (2) When moving backwards, the internal iterator is positioned
// just before all entries whose user key == this->key().
enum Direction {
kForward,
kReverse
};
DBIter(const std::string* dbname, Env* env, const Options& options,
const Comparator* cmp, Iterator* iter, SequenceNumber s)
: dbname_(dbname),
env_(env),
logger_(options.info_log),
user_comparator_(cmp),
user_merge_operator_(options.merge_operator.get()),
iter_(iter),
sequence_(s),
direction_(kForward),
valid_(false),
current_entry_is_merged_(false),
statistics_(options.statistics) {
RecordTick(statistics_, NO_ITERATORS, 1);
max_skip_ = options.max_sequential_skip_in_iterations;
}
virtual ~DBIter() {
RecordTick(statistics_, NO_ITERATORS, -1);
delete iter_;
}
virtual bool Valid() const { return valid_; }
virtual Slice key() const {
assert(valid_);
return saved_key_;
}
virtual Slice value() const {
assert(valid_);
return (direction_ == kForward && !current_entry_is_merged_) ?
iter_->value() : saved_value_;
}
virtual Status status() const {
if (status_.ok()) {
return iter_->status();
} else {
return status_;
}
}
virtual void Next();
virtual void Prev();
virtual void Seek(const Slice& target);
virtual void SeekToFirst();
virtual void SeekToLast();
private:
void FindNextUserEntry(bool skipping);
void FindPrevUserEntry();
bool ParseKey(ParsedInternalKey* key);
void MergeValuesNewToOld();
inline void SaveKey(const Slice& k, std::string* dst) {
dst->assign(k.data(), k.size());
}
inline void ClearSavedValue() {
if (saved_value_.capacity() > 1048576) {
std::string empty;
swap(empty, saved_value_);
} else {
saved_value_.clear();
}
}
const std::string* const dbname_;
Env* const env_;
shared_ptr<Logger> logger_;
const Comparator* const user_comparator_;
const MergeOperator* const user_merge_operator_;
Iterator* const iter_;
SequenceNumber const sequence_;
Status status_;
std::string saved_key_; // == current key when direction_==kReverse
std::string saved_value_; // == current raw value when direction_==kReverse
std::string skip_key_;
Direction direction_;
bool valid_;
bool current_entry_is_merged_;
std::shared_ptr<Statistics> statistics_;
uint64_t max_skip_;
// No copying allowed
DBIter(const DBIter&);
void operator=(const DBIter&);
};
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
if (!ParseInternalKey(iter_->key(), ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
Log(logger_, "corrupted internal key in DBIter: %s",
iter_->key().ToString(true).c_str());
return false;
} else {
return true;
}
}
void DBIter::Next() {
assert(valid_);
if (direction_ == kReverse) { // Switch directions?
direction_ = kForward;
// iter_ is pointing just before the entries for this->key(),
// so advance into the range of entries for this->key() and then
// use the normal skipping code below.
if (!iter_->Valid()) {
iter_->SeekToFirst();
} else {
iter_->Next();
}
if (!iter_->Valid()) {
valid_ = false;
saved_key_.clear();
return;
}
}
// If the current value is merged, we might already hit end of iter_
if (!iter_->Valid()) {
valid_ = false;
return;
}
FindNextUserEntry(true /* skipping the current user key */);
}
// PRE: saved_key_ has the current user key if skipping
// POST: saved_key_ should have the next user key if valid_,
// if the current entry is a result of merge
// current_entry_is_merged_ => true
// saved_value_ => the merged value
//
// NOTE: In between, saved_key_ can point to a user key that has
// a delete marker
void DBIter::FindNextUserEntry(bool skipping) {
// Loop until we hit an acceptable entry to yield
assert(iter_->Valid());
assert(direction_ == kForward);
current_entry_is_merged_ = false;
uint64_t num_skipped = 0;
do {
ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
num_skipped++; // skip this entry
BumpPerfCount(&perf_context.internal_key_skipped_count);
} else {
skipping = false;
switch (ikey.type) {
case kTypeDeletion:
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
SaveKey(ikey.user_key, &saved_key_);
skipping = true;
num_skipped = 0;
BumpPerfCount(&perf_context.internal_delete_skipped_count);
break;
case kTypeValue:
valid_ = true;
SaveKey(ikey.user_key, &saved_key_);
return;
case kTypeMerge:
// By now, we are sure the current ikey is going to yield a value
SaveKey(ikey.user_key, &saved_key_);
current_entry_is_merged_ = true;
valid_ = true;
MergeValuesNewToOld(); // Go to a different state machine
return;
case kTypeLogData:
assert(false);
break;
}
}
}
// If we have sequentially iterated via numerous keys and still not
// found the next user-key, then it is better to seek so that we can
// avoid too many key comparisons. We seek to the last occurence of
// our current key by looking for sequence number 0.
if (skipping && num_skipped > max_skip_) {
num_skipped = 0;
std::string last_key;
AppendInternalKey(&last_key,
ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
iter_->Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
} else {
iter_->Next();
}
} while (iter_->Valid());
valid_ = false;
}
// Merge values of the same user key starting from the current iter_ position
// Scan from the newer entries to older entries.
// PRE: iter_->key() points to the first merge type entry
// saved_key_ stores the user key
// POST: saved_value_ has the merged value for the user key
// iter_ points to the next entry (or invalid)
void DBIter::MergeValuesNewToOld() {
if (!user_merge_operator_) {
Log(logger_, "Options::merge_operator is null.");
throw std::logic_error("DBIter::MergeValuesNewToOld() with"
" Options::merge_operator null");
}
// Start the merge process by pushing the first operand
std::deque<std::string> operands;
operands.push_front(iter_->value().ToString());
std::string merge_result; // Temporary string to hold merge result later
ParsedInternalKey ikey;
for (iter_->Next(); iter_->Valid(); iter_->Next()) {
if (!ParseKey(&ikey)) {
// skip corrupted key
continue;
}
if (user_comparator_->Compare(ikey.user_key, saved_key_) != 0) {
// hit the next user key, stop right here
break;
}
if (kTypeDeletion == ikey.type) {
// hit a delete with the same user key, stop right here
// iter_ is positioned after delete
iter_->Next();
break;
}
if (kTypeValue == ikey.type) {
// hit a put, merge the put value with operands and store the
// final result in saved_value_. We are done!
// ignore corruption if there is any.
const Slice value = iter_->value();
user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
&saved_value_, logger_.get());
// iter_ is positioned after put
iter_->Next();
return;
}
if (kTypeMerge == ikey.type) {
// hit a merge, add the value as an operand and run associative merge.
// when complete, add result to operands and continue.
const Slice& value = iter_->value();
operands.push_front(value.ToString());
while(operands.size() >= 2) {
// Call user associative-merge until it returns false
if (user_merge_operator_->PartialMerge(ikey.user_key,
Slice(operands[0]),
Slice(operands[1]),
&merge_result,
logger_.get())) {
operands.pop_front();
swap(operands.front(), merge_result);
} else {
// Associative merge returns false ==> stack the operands
break;
}
}
}
}
// we either exhausted all internal keys under this user key, or hit
// a deletion marker.
// feed null as the existing value to the merge operator, such that
// client can differentiate this scenario and do things accordingly.
user_merge_operator_->FullMerge(saved_key_, nullptr, operands,
&saved_value_, logger_.get());
}
void DBIter::Prev() {
assert(valid_);
// Throw an exception now if merge_operator is provided
// TODO: support backward iteration
if (user_merge_operator_) {
Log(logger_, "Prev not supported yet if merge_operator is provided");
throw std::logic_error("DBIter::Prev backward iteration not supported"
" if merge_operator is provided");
}
if (direction_ == kForward) { // Switch directions?
// iter_ is pointing at the current entry. Scan backwards until
// the key changes so we can use the normal reverse scanning code.
assert(iter_->Valid()); // Otherwise valid_ would have been false
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
while (true) {
iter_->Prev();
if (!iter_->Valid()) {
valid_ = false;
saved_key_.clear();
ClearSavedValue();
return;
}
if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
saved_key_) < 0) {
break;
}
}
direction_ = kReverse;
}
FindPrevUserEntry();
}
void DBIter::FindPrevUserEntry() {
assert(direction_ == kReverse);
uint64_t num_skipped = 0;
ValueType value_type = kTypeDeletion;
if (iter_->Valid()) {
do {
ParsedInternalKey ikey;
bool saved_key_cleared = false;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if ((value_type != kTypeDeletion) &&
user_comparator_->Compare(ikey.user_key, saved_key_) < 0) {
// We encountered a non-deleted value in entries for previous keys,
break;
}
value_type = ikey.type;
if (value_type == kTypeDeletion) {
saved_key_.clear();
ClearSavedValue();
saved_key_cleared = true;
} else {
Slice raw_value = iter_->value();
if (saved_value_.capacity() > raw_value.size() + 1048576) {
std::string empty;
swap(empty, saved_value_);
}
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
saved_value_.assign(raw_value.data(), raw_value.size());
}
}
num_skipped++;
// If we have sequentially iterated via numerous keys and still not
// found the prev user-key, then it is better to seek so that we can
// avoid too many key comparisons. We seek to the first occurence of
// our current key by looking for max sequence number.
if (!saved_key_cleared && num_skipped > max_skip_) {
num_skipped = 0;
std::string last_key;
AppendInternalKey(&last_key,
ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
kValueTypeForSeek));
iter_->Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
} else {
iter_->Prev();
}
} while (iter_->Valid());
}
if (value_type == kTypeDeletion) {
// End
valid_ = false;
saved_key_.clear();
ClearSavedValue();
direction_ = kForward;
} else {
valid_ = true;
}
}
void DBIter::Seek(const Slice& target) {
direction_ = kForward;
ClearSavedValue();
saved_key_.clear();
AppendInternalKey(
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
iter_->Seek(saved_key_);
if (iter_->Valid()) {
FindNextUserEntry(false /*not skipping */);
} else {
valid_ = false;
}
}
void DBIter::SeekToFirst() {
direction_ = kForward;
ClearSavedValue();
iter_->SeekToFirst();
if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */);
} else {
valid_ = false;
}
}
void DBIter::SeekToLast() {
// Throw an exception for now if merge_operator is provided
// TODO: support backward iteration
if (user_merge_operator_) {
Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
throw std::logic_error("DBIter::SeekToLast: backward iteration not"
" supported if merge_operator is provided");
}
direction_ = kReverse;
ClearSavedValue();
iter_->SeekToLast();
FindPrevUserEntry();
}
} // anonymous namespace
Iterator* NewDBIterator(
const std::string* dbname,
Env* env,
const Options& options,
const Comparator *user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence) {
return new DBIter(dbname, env, options, user_key_comparator,
internal_iter, sequence);
}
} // namespace rocksdb

28
db/db_iter.h Normal file
View File

@ -0,0 +1,28 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <stdint.h>
#include "rocksdb/db.h"
#include "db/dbformat.h"
namespace rocksdb {
// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
const std::string* dbname,
Env* env,
const Options& options,
const Comparator *user_key_comparator,
Iterator* internal_iter,
const SequenceNumber& sequence);
} // namespace rocksdb

65
db/db_statistics.h Normal file
View File

@ -0,0 +1,65 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <cassert>
#include <stdlib.h>
#include <vector>
#include <memory>
#include "rocksdb/statistics.h"
#include "util/histogram.h"
#include "port/port.h"
#include "util/mutexlock.h"
namespace rocksdb {
class DBStatistics: public Statistics {
public:
DBStatistics() : allTickers_(TICKER_ENUM_MAX),
allHistograms_(HISTOGRAM_ENUM_MAX) { }
virtual ~DBStatistics() {}
virtual long getTickerCount(Tickers tickerType) {
assert(tickerType < TICKER_ENUM_MAX);
return allTickers_[tickerType].getCount();
}
virtual void setTickerCount(Tickers tickerType, uint64_t count) {
assert(tickerType < TICKER_ENUM_MAX);
allTickers_[tickerType].setTickerCount(count);
}
virtual void recordTick(Tickers tickerType, uint64_t count) {
assert(tickerType < TICKER_ENUM_MAX);
allTickers_[tickerType].recordTick(count);
}
virtual void measureTime(Histograms histogramType, uint64_t value) {
assert(histogramType < HISTOGRAM_ENUM_MAX);
allHistograms_[histogramType].Add(value);
}
virtual void histogramData(Histograms histogramType,
HistogramData * const data) {
assert(histogramType < HISTOGRAM_ENUM_MAX);
allHistograms_[histogramType].Data(data);
}
std::vector<Ticker> allTickers_;
std::vector<HistogramImpl> allHistograms_;
};
std::shared_ptr<Statistics> CreateDBStatistics() {
return std::make_shared<DBStatistics>();
}
} // namespace rocksdb

93
db/db_stats_logger.cc Normal file
View File

@ -0,0 +1,93 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_impl.h"
#include <string>
#include <stdint.h>
#include <stdio.h>
#include "db/version_set.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "port/port.h"
#include "util/mutexlock.h"
namespace rocksdb {
void DBImpl::MaybeScheduleLogDBDeployStats() {
// There is a lock in the actual logger.
if (!logger_ || options_.db_stats_log_interval < 0
|| host_name_.empty()) {
return;
}
if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
// Already scheduled
} else {
int64_t current_ts = 0;
Status st = env_->GetCurrentTime(&current_ts);
if (!st.ok()) {
return;
}
if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
return;
}
last_log_ts = current_ts;
bg_logstats_scheduled_ = true;
env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
}
}
void DBImpl::BGLogDBDeployStats(void* db) {
DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
db_inst->LogDBDeployStats();
}
void DBImpl::LogDBDeployStats() {
mutex_.Lock();
if (shutting_down_.Acquire_Load()) {
bg_logstats_scheduled_ = false;
bg_cv_.SignalAll();
mutex_.Unlock();
return;
}
char tmp_ver[100];
sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
std::string version_info(tmp_ver);
uint64_t file_total_size = 0;
uint32_t file_total_num = 0;
for (int i = 0; i < versions_->NumberLevels(); i++) {
file_total_num += versions_->NumLevelFiles(i);
file_total_size += versions_->NumLevelBytes(i);
}
VersionSet::LevelSummaryStorage scratch;
const char* file_num_summary = versions_->LevelSummary(&scratch);
std::string file_num_per_level(file_num_summary);
std::string data_size_per_level(file_num_summary);
mutex_.Unlock();
int64_t unix_ts;
env_->GetCurrentTime(&unix_ts);
logger_->Log_Deploy_Stats(version_info, host_name_,
db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
data_size_per_level, unix_ts);
mutex_.Lock();
bg_logstats_scheduled_ = false;
bg_cv_.SignalAll();
mutex_.Unlock();
}
}

4914
db/db_test.cc Normal file

File diff suppressed because it is too large Load Diff

147
db/dbformat.cc Normal file
View File

@ -0,0 +1,147 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include "db/dbformat.h"
#include "port/port.h"
#include "util/coding.h"
#include "util/perf_context_imp.h"
namespace rocksdb {
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
assert(seq <= kMaxSequenceNumber);
assert(t <= kValueTypeForSeek);
return (seq << 8) | t;
}
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
std::string ParsedInternalKey::DebugString(bool hex) const {
char buf[50];
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += user_key.ToString(hex);
result += buf;
return result;
}
std::string InternalKey::DebugString(bool hex) const {
std::string result;
ParsedInternalKey parsed;
if (ParseInternalKey(rep_, &parsed)) {
result = parsed.DebugString(hex);
} else {
result = "(bad)";
result.append(EscapeString(rep_));
}
return result;
}
const char* InternalKeyComparator::Name() const {
return name_.c_str();
}
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
BumpPerfCount(&perf_context.user_key_comparison_count);
if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {
r = +1;
}
}
return r;
}
void InternalKeyComparator::FindShortestSeparator(
std::string* start,
const Slice& limit) const {
// Attempt to shorten the user portion of the key
Slice user_start = ExtractUserKey(*start);
Slice user_limit = ExtractUserKey(limit);
std::string tmp(user_start.data(), user_start.size());
user_comparator_->FindShortestSeparator(&tmp, user_limit);
if (tmp.size() < user_start.size() &&
user_comparator_->Compare(user_start, tmp) < 0) {
// User key has become shorter physically, but larger logically.
// Tack on the earliest possible number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*start, tmp) < 0);
assert(this->Compare(tmp, limit) < 0);
start->swap(tmp);
}
}
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
Slice user_key = ExtractUserKey(*key);
std::string tmp(user_key.data(), user_key.size());
user_comparator_->FindShortSuccessor(&tmp);
if (tmp.size() < user_key.size() &&
user_comparator_->Compare(user_key, tmp) < 0) {
// User key has become shorter physically, but larger logically.
// Tack on the earliest possible number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*key, tmp) < 0);
key->swap(tmp);
}
}
const char* InternalFilterPolicy::Name() const {
return user_policy_->Name();
}
void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
std::string* dst) const {
// We rely on the fact that the code in table.cc does not mind us
// adjusting keys[].
Slice* mkey = const_cast<Slice*>(keys);
for (int i = 0; i < n; i++) {
mkey[i] = ExtractUserKey(keys[i]);
// TODO(sanjay): Suppress dups?
}
user_policy_->CreateFilter(keys, n, dst);
}
bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
}
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
size_t usize = user_key.size();
size_t needed = usize + 13; // A conservative estimate
char* dst;
if (needed <= sizeof(space_)) {
dst = space_;
} else {
dst = new char[needed];
}
start_ = dst;
dst = EncodeVarint32(dst, usize + 8);
kstart_ = dst;
memcpy(dst, user_key.data(), usize);
dst += usize;
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
dst += 8;
end_ = dst;
}
} // namespace rocksdb

229
db/dbformat.h Normal file
View File

@ -0,0 +1,229 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <stdio.h>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice.h"
#include "rocksdb/table.h"
#include "rocksdb/types.h"
#include "util/coding.h"
#include "util/logging.h"
namespace rocksdb {
class InternalKey;
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
kTypeDeletion = 0x0,
kTypeValue = 0x1,
kTypeMerge = 0x2,
kTypeLogData = 0x3
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
static const ValueType kValueTypeForSeek = kTypeMerge;
// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.
static const SequenceNumber kMaxSequenceNumber =
((0x1ull << 56) - 1);
struct ParsedInternalKey {
Slice user_key;
SequenceNumber sequence;
ValueType type;
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
: user_key(u), sequence(seq), type(t) { }
std::string DebugString(bool hex = false) const;
};
// Return the length of the encoding of "key".
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
return key.user_key.size() + 8;
}
// Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result,
const ParsedInternalKey& key);
// Attempt to parse an internal key from "internal_key". On success,
// stores the parsed data in "*result", and returns true.
//
// On error, returns false, leaves "*result" in an undefined state.
extern bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result);
// Returns the user key portion of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return Slice(internal_key.data(), internal_key.size() - 8);
}
inline ValueType ExtractValueType(const Slice& internal_key) {
assert(internal_key.size() >= 8);
const size_t n = internal_key.size();
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
return static_cast<ValueType>(c);
}
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {
private:
const Comparator* user_comparator_;
std::string name_;
public:
explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c),
name_("rocksdb.InternalKeyComparator:" +
std::string(user_comparator_->Name())) {
}
virtual const char* Name() const;
virtual int Compare(const Slice& a, const Slice& b) const;
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const;
virtual void FindShortSuccessor(std::string* key) const;
const Comparator* user_comparator() const { return user_comparator_; }
int Compare(const InternalKey& a, const InternalKey& b) const;
};
// Filter policy wrapper that converts from internal keys to user keys
class InternalFilterPolicy : public FilterPolicy {
private:
const FilterPolicy* const user_policy_;
public:
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
virtual const char* Name() const;
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};
// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.
class InternalKey {
private:
std::string rep_;
public:
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
}
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
Slice Encode() const {
assert(!rep_.empty());
return rep_;
}
Slice user_key() const { return ExtractUserKey(rep_); }
void SetFrom(const ParsedInternalKey& p) {
rep_.clear();
AppendInternalKey(&rep_, p);
}
void Clear() { rep_.clear(); }
std::string DebugString(bool hex = false) const;
};
inline int InternalKeyComparator::Compare(
const InternalKey& a, const InternalKey& b) const {
return Compare(a.Encode(), b.Encode());
}
inline bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result) {
const size_t n = internal_key.size();
if (n < 8) return false;
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
result->sequence = num >> 8;
result->type = static_cast<ValueType>(c);
result->user_key = Slice(internal_key.data(), n - 8);
return (c <= static_cast<unsigned char>(kValueTypeForSeek));
}
// Update the sequence number in the internal key
inline void UpdateInternalKey(char* internal_key,
const size_t internal_key_size,
uint64_t seq, ValueType t) {
assert(internal_key_size >= 8);
char* seqtype = internal_key + internal_key_size - 8;
uint64_t newval = (seq << 8) | t;
EncodeFixed64(seqtype, newval);
}
// Get the sequence number from the internal key
inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
const size_t n = internal_key.size();
assert(n >= 8);
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
return num >> 8;
}
// A helper class useful for DBImpl::Get()
class LookupKey {
public:
// Initialize *this for looking up user_key at a snapshot with
// the specified sequence number.
LookupKey(const Slice& user_key, SequenceNumber sequence);
~LookupKey();
// Return a key suitable for lookup in a MemTable.
Slice memtable_key() const { return Slice(start_, end_ - start_); }
// Return an internal key (suitable for passing to an internal iterator)
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
// Return the user key
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
private:
// We construct a char array of the form:
// klength varint32 <-- start_
// userkey char[klength] <-- kstart_
// tag uint64
// <-- end_
// The array is a suitable MemTable key.
// The suffix starting with "userkey" can be used as an InternalKey.
const char* start_;
const char* kstart_;
const char* end_;
char space_[200]; // Avoid allocation for short keys
// No copying allowed
LookupKey(const LookupKey&);
void operator=(const LookupKey&);
};
inline LookupKey::~LookupKey() {
if (start_ != space_) delete[] start_;
}
} // namespace rocksdb

117
db/dbformat_test.cc Normal file
View File

@ -0,0 +1,117 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace rocksdb {
static std::string IKey(const std::string& user_key,
uint64_t seq,
ValueType vt) {
std::string encoded;
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
return encoded;
}
static std::string Shorten(const std::string& s, const std::string& l) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
return result;
}
static std::string ShortSuccessor(const std::string& s) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
return result;
}
static void TestKey(const std::string& key,
uint64_t seq,
ValueType vt) {
std::string encoded = IKey(key, seq, vt);
Slice in(encoded);
ParsedInternalKey decoded("", 0, kTypeValue);
ASSERT_TRUE(ParseInternalKey(in, &decoded));
ASSERT_EQ(key, decoded.user_key.ToString());
ASSERT_EQ(seq, decoded.sequence);
ASSERT_EQ(vt, decoded.type);
ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
}
class FormatTest { };
TEST(FormatTest, InternalKey_EncodeDecode) {
const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
const uint64_t seq[] = {
1, 2, 3,
(1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
(1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
(1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
};
for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
TestKey(keys[k], seq[s], kTypeValue);
TestKey("hello", 1, kTypeDeletion);
}
}
}
TEST(FormatTest, InternalKeyShortSeparator) {
// When user keys are same
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 99, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 101, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeDeletion)));
// When user keys are misordered
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("bar", 99, kTypeValue)));
// When user keys are different, but correctly ordered
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
Shorten(IKey("foo", 100, kTypeValue),
IKey("hello", 200, kTypeValue)));
// When start user key is prefix of limit user key
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foobar", 200, kTypeValue)));
// When limit user key is prefix of start user key
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
Shorten(IKey("foobar", 100, kTypeValue),
IKey("foo", 200, kTypeValue)));
}
TEST(FormatTest, InternalKeyShortestSuccessor) {
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
ShortSuccessor(IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

283
db/deletefile_test.cc Normal file
View File

@ -0,0 +1,283 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/db.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "rocksdb/env.h"
#include "rocksdb/transaction_log.h"
#include <vector>
#include <stdlib.h>
#include <map>
#include <string>
namespace rocksdb {
class DeleteFileTest {
public:
std::string dbname_;
Options options_;
DB* db_;
Env* env_;
int numlevels_;
DeleteFileTest() {
db_ = nullptr;
env_ = Env::Default();
options_.write_buffer_size = 1024*1024*1000;
options_.target_file_size_base = 1024*1024*1000;
options_.max_bytes_for_level_base = 1024*1024*1000;
options_.WAL_ttl_seconds = 300; // Used to test log files
options_.WAL_size_limit_MB = 1024; // Used to test log files
dbname_ = test::TmpDir() + "/deletefile_test";
options_.wal_dir = dbname_ + "/wal_files";
DestroyDB(dbname_, options_);
numlevels_ = 7;
ASSERT_OK(ReopenDB(true));
}
Status ReopenDB(bool create) {
delete db_;
if (create) {
DestroyDB(dbname_, options_);
}
db_ = nullptr;
options_.create_if_missing = create;
return DB::Open(options_, dbname_, &db_);
}
void CloseDB() {
delete db_;
}
void AddKeys(int numkeys, int startkey = 0) {
WriteOptions options;
options.sync = false;
ReadOptions roptions;
for (int i = startkey; i < (numkeys + startkey) ; i++) {
std::string temp = std::to_string(i);
Slice key(temp);
Slice value(temp);
ASSERT_OK(db_->Put(options, key, value));
}
}
int numKeysInLevels(
std::vector<LiveFileMetaData> &metadata,
std::vector<int> *keysperlevel = nullptr) {
if (keysperlevel != nullptr) {
keysperlevel->resize(numlevels_);
}
int numKeys = 0;
for (size_t i = 0; i < metadata.size(); i++) {
int startkey = atoi(metadata[i].smallestkey.c_str());
int endkey = atoi(metadata[i].largestkey.c_str());
int numkeysinfile = (endkey - startkey + 1);
numKeys += numkeysinfile;
if (keysperlevel != nullptr) {
(*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
}
fprintf(stderr, "level %d name %s smallest %s largest %s\n",
metadata[i].level, metadata[i].name.c_str(),
metadata[i].smallestkey.c_str(),
metadata[i].largestkey.c_str());
}
return numKeys;
}
void CreateTwoLevels() {
AddKeys(50000, 10000);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
ASSERT_OK(dbi->TEST_FlushMemTable());
ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
AddKeys(50000, 10000);
ASSERT_OK(dbi->TEST_FlushMemTable());
ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
}
void CheckFileTypeCounts(std::string& dir,
int required_log,
int required_sst,
int required_manifest) {
std::vector<std::string> filenames;
env_->GetChildren(dir, &filenames);
int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
for (auto file : filenames) {
uint64_t number;
FileType type;
if (ParseFileName(file, &number, &type)) {
log_cnt += (type == kLogFile);
sst_cnt += (type == kTableFile);
manifest_cnt += (type == kDescriptorFile);
}
}
ASSERT_EQ(required_log, log_cnt);
ASSERT_EQ(required_sst, sst_cnt);
ASSERT_EQ(required_manifest, manifest_cnt);
}
};
TEST(DeleteFileTest, AddKeysAndQueryLevels) {
CreateTwoLevels();
std::vector<LiveFileMetaData> metadata;
std::vector<int> keysinlevel;
db_->GetLiveFilesMetaData(&metadata);
std::string level1file = "";
int level1keycount = 0;
std::string level2file = "";
int level2keycount = 0;
int level1index = 0;
int level2index = 1;
ASSERT_EQ((int)metadata.size(), 2);
if (metadata[0].level == 2) {
level1index = 1;
level2index = 0;
}
level1file = metadata[level1index].name;
int startkey = atoi(metadata[level1index].smallestkey.c_str());
int endkey = atoi(metadata[level1index].largestkey.c_str());
level1keycount = (endkey - startkey + 1);
level2file = metadata[level2index].name;
startkey = atoi(metadata[level2index].smallestkey.c_str());
endkey = atoi(metadata[level2index].largestkey.c_str());
level2keycount = (endkey - startkey + 1);
// COntrolled setup. Levels 1 and 2 should both have 50K files.
// This is a little fragile as it depends on the current
// compaction heuristics.
ASSERT_EQ(level1keycount, 50000);
ASSERT_EQ(level2keycount, 50000);
Status status = db_->DeleteFile("0.sst");
ASSERT_TRUE(status.IsInvalidArgument());
// intermediate level files cannot be deleted.
status = db_->DeleteFile(level1file);
ASSERT_TRUE(status.IsInvalidArgument());
// Lowest level file deletion should succeed.
ASSERT_OK(db_->DeleteFile(level2file));
CloseDB();
}
TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
CreateTwoLevels();
// there should be only one (empty) log file because CreateTwoLevels()
// flushes the memtables to disk
CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
// 2 ssts, 1 manifest
CheckFileTypeCounts(dbname_, 0, 2, 1);
std::string first("0"), last("999999");
Slice first_slice(first), last_slice(last);
db_->CompactRange(&first_slice, &last_slice, true, 2);
// 1 sst after compaction
CheckFileTypeCounts(dbname_, 0, 1, 1);
// this time, we keep an iterator alive
ReopenDB(true);
Iterator *itr = 0;
CreateTwoLevels();
itr = db_->NewIterator(ReadOptions());
db_->CompactRange(&first_slice, &last_slice, true, 2);
// 3 sst after compaction with live iterator
CheckFileTypeCounts(dbname_, 0, 3, 1);
delete itr;
// 1 sst after iterator deletion
CheckFileTypeCounts(dbname_, 0, 1, 1);
CloseDB();
}
TEST(DeleteFileTest, DeleteFileWithIterator) {
CreateTwoLevels();
ReadOptions options;
Iterator* it = db_->NewIterator(options);
std::vector<LiveFileMetaData> metadata;
db_->GetLiveFilesMetaData(&metadata);
std::string level2file = "";
ASSERT_EQ((int)metadata.size(), 2);
if (metadata[0].level == 1) {
level2file = metadata[1].name;
} else {
level2file = metadata[0].name;
}
Status status = db_->DeleteFile(level2file);
fprintf(stdout, "Deletion status %s: %s\n",
level2file.c_str(), status.ToString().c_str());
ASSERT_TRUE(status.ok());
it->SeekToFirst();
int numKeysIterated = 0;
while(it->Valid()) {
numKeysIterated++;
it->Next();
}
ASSERT_EQ(numKeysIterated, 50000);
delete it;
CloseDB();
}
TEST(DeleteFileTest, DeleteLogFiles) {
AddKeys(10, 0);
VectorLogPtr logfiles;
db_->GetSortedWalFiles(logfiles);
ASSERT_GT(logfiles.size(), 0UL);
// Take the last log file which is expected to be alive and try to delete it
// Should not succeed because live logs are not allowed to be deleted
std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
ASSERT_EQ(alive_log->Type(), kAliveLogFile);
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
fprintf(stdout, "Deleting alive log file %s\n",
alive_log->PathName().c_str());
ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
logfiles.clear();
// Call Flush to bring about a new working log file and add more keys
// Call Flush again to flush out memtable and move alive log to archived log
// and try to delete the archived log file
FlushOptions fopts;
db_->Flush(fopts);
AddKeys(10, 0);
db_->Flush(fopts);
db_->GetSortedWalFiles(logfiles);
ASSERT_GT(logfiles.size(), 0UL);
std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
archived_log->PathName()));
fprintf(stdout, "Deleting archived log file %s\n",
archived_log->PathName().c_str());
ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
archived_log->PathName()));
CloseDB();
}
} //namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

266
db/filename.cc Normal file
View File

@ -0,0 +1,266 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include <ctype.h>
#include <stdio.h>
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "util/logging.h"
namespace rocksdb {
// Given a path, flatten the path name by replacing all chars not in
// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
// Return the number of chars stored in dest not including the trailing '\0'.
static int FlattenPath(const std::string& path, char* dest, int len) {
int write_idx = 0;
int i = 0;
int src_len = path.size();
while (i < src_len && write_idx < len - 1) {
if ((path[i] >= 'a' && path[i] <= 'z') ||
(path[i] >= '0' && path[i] <= '9') ||
(path[i] >= 'A' && path[i] <= 'Z') ||
path[i] == '-' ||
path[i] == '.' ||
path[i] == '_'){
dest[write_idx++] = path[i];
} else {
if (i > 0)
dest[write_idx++] = '_';
}
i++;
}
dest[write_idx] = '\0';
return write_idx;
}
// A utility routine: write "data" to the named file and Sync() it.
extern Status WriteStringToFileSync(Env* env, const Slice& data,
const std::string& fname);
static std::string MakeFileName(const std::string& name, uint64_t number,
const char* suffix) {
char buf[100];
snprintf(buf, sizeof(buf), "/%06llu.%s",
static_cast<unsigned long long>(number),
suffix);
return name + buf;
}
std::string LogFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "log");
}
std::string ArchivalDirectory(const std::string& dir) {
return dir + "/" + ARCHIVAL_DIR;
}
std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
}
std::string TableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
}
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
char buf[100];
snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
static_cast<unsigned long long>(number));
return dbname + buf;
}
std::string CurrentFileName(const std::string& dbname) {
return dbname + "/CURRENT";
}
std::string LockFileName(const std::string& dbname) {
return dbname + "/LOCK";
}
std::string TempFileName(const std::string& dbname, uint64_t number) {
assert(number >= 0);
return MakeFileName(dbname, number, "dbtmp");
}
std::string InfoLogFileName(const std::string& dbname,
const std::string& db_path, const std::string& log_dir) {
if (log_dir.empty())
return dbname + "/LOG";
char flatten_db_path[256];
FlattenPath(db_path, flatten_db_path, 256);
return log_dir + "/" + flatten_db_path + "_LOG";
}
// Return the name of the old info log file for "dbname".
std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
const std::string& db_path, const std::string& log_dir) {
char buf[50];
snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
if (log_dir.empty())
return dbname + "/LOG.old." + buf;
char flatten_db_path[256];
FlattenPath(db_path, flatten_db_path, 256);
return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
}
std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
char buf[100];
snprintf(buf, sizeof(buf), "/METADB-%llu",
static_cast<unsigned long long>(number));
return dbname + buf;
}
std::string IdentityFileName(const std::string& dbname) {
return dbname + "/IDENTITY";
}
// Owned filenames have the form:
// dbname/IDENTITY
// dbname/CURRENT
// dbname/LOCK
// dbname/LOG
// dbname/LOG.old.[0-9]+
// dbname/MANIFEST-[0-9]+
// dbname/[0-9]+.(log|sst)
// dbname/METADB-[0-9]+
// Disregards / at the beginning
bool ParseFileName(const std::string& fname,
uint64_t* number,
FileType* type,
WalFileType* log_type) {
Slice rest(fname);
if (fname.length() > 1 && fname[0] == '/') {
rest.remove_prefix(1);
}
if (rest == "IDENTITY") {
*number = 0;
*type = kIdentityFile;
} else if (rest == "CURRENT") {
*number = 0;
*type = kCurrentFile;
} else if (rest == "LOCK") {
*number = 0;
*type = kDBLockFile;
} else if (rest == "LOG" || rest == "LOG.old") {
*number = 0;
*type = kInfoLogFile;
} else if (rest.starts_with("LOG.old.")) {
uint64_t ts_suffix;
// sizeof also counts the trailing '\0'.
rest.remove_prefix(sizeof("LOG.old.") - 1);
if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
return false;
}
*number = ts_suffix;
*type = kInfoLogFile;
} else if (rest.starts_with("MANIFEST-")) {
rest.remove_prefix(strlen("MANIFEST-"));
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
if (!rest.empty()) {
return false;
}
*type = kDescriptorFile;
*number = num;
} else if (rest.starts_with("METADB-")) {
rest.remove_prefix(strlen("METADB-"));
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
if (!rest.empty()) {
return false;
}
*type = kMetaDatabase;
*number = num;
} else {
// Avoid strtoull() to keep filename format independent of the
// current locale
bool archive_dir_found = false;
if (rest.starts_with(ARCHIVAL_DIR)) {
if (rest.size() <= ARCHIVAL_DIR.size()) {
return false;
}
rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
if (log_type) {
*log_type = kArchivedLogFile;
}
archive_dir_found = true;
}
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
Slice suffix = rest;
if (suffix == Slice(".log")) {
*type = kLogFile;
if (log_type && !archive_dir_found) {
*log_type = kAliveLogFile;
}
} else if (archive_dir_found) {
return false; // Archive dir can contain only log files
} else if (suffix == Slice(".sst")) {
*type = kTableFile;
} else if (suffix == Slice(".dbtmp")) {
*type = kTempFile;
} else {
return false;
}
*number = num;
}
return true;
}
Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number) {
// Remove leading "dbname/" and add newline to manifest file name
std::string manifest = DescriptorFileName(dbname, descriptor_number);
Slice contents = manifest;
assert(contents.starts_with(dbname + "/"));
contents.remove_prefix(dbname.size() + 1);
std::string tmp = TempFileName(dbname, descriptor_number);
Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp);
if (s.ok()) {
s = env->RenameFile(tmp, CurrentFileName(dbname));
}
if (!s.ok()) {
env->DeleteFile(tmp);
}
return s;
}
Status SetIdentityFile(Env* env, const std::string& dbname) {
std::string id = env->GenerateUniqueId();
assert(!id.empty());
// Reserve the filename dbname/000000.dbtmp for the temporary identity file
std::string tmp = TempFileName(dbname, 0);
Status s = WriteStringToFileSync(env, id, tmp);
if (s.ok()) {
s = env->RenameFile(tmp, IdentityFileName(dbname));
}
if (!s.ok()) {
env->DeleteFile(tmp);
}
return s;
}
} // namespace rocksdb

108
db/filename.h Normal file
View File

@ -0,0 +1,108 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// File names used by DB code
#pragma once
#include <stdint.h>
#include <string>
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/transaction_log.h"
#include "port/port.h"
namespace rocksdb {
class Env;
enum FileType {
kLogFile,
kDBLockFile,
kTableFile,
kDescriptorFile,
kCurrentFile,
kTempFile,
kInfoLogFile, // Either the current one, or an old one
kMetaDatabase,
kIdentityFile
};
// Return the name of the log file with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string LogFileName(const std::string& dbname, uint64_t number);
static const std::string ARCHIVAL_DIR = "archive";
extern std::string ArchivalDirectory(const std::string& dbname);
// Return the name of the archived log file with the specified number
// in the db named by "dbname". The result will be prefixed with "dbname".
extern std::string ArchivedLogFileName(const std::string& dbname,
uint64_t num);
// Return the name of the sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);
// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be
// prefixed with "dbname".
extern std::string DescriptorFileName(const std::string& dbname,
uint64_t number);
// Return the name of the current file. This file contains the name
// of the current manifest file. The result will be prefixed with
// "dbname".
extern std::string CurrentFileName(const std::string& dbname);
// Return the name of the lock file for the db named by
// "dbname". The result will be prefixed with "dbname".
extern std::string LockFileName(const std::string& dbname);
// Return the name of a temporary file owned by the db named "dbname".
// The result will be prefixed with "dbname".
extern std::string TempFileName(const std::string& dbname, uint64_t number);
// Return the name of the info log file for "dbname".
extern std::string InfoLogFileName(const std::string& dbname,
const std::string& db_path="", const std::string& log_dir="");
// Return the name of the old info log file for "dbname".
extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
const std::string& db_path="", const std::string& log_dir="");
// Return the name to use for a metadatabase. The result will be prefixed with
// "dbname".
extern std::string MetaDatabaseName(const std::string& dbname,
uint64_t number);
// Return the name of the Identity file which stores a unique number for the db
// that will get regenerated if the db loses all its data and is recreated fresh
// either from a backup-image or empty
extern std::string IdentityFileName(const std::string& dbname);
// If filename is a rocksdb file, store the type of the file in *type.
// The number encoded in the filename is stored in *number. If the
// filename was successfully parsed, returns true. Else return false.
extern bool ParseFileName(const std::string& filename,
uint64_t* number,
FileType* type,
WalFileType* log_type = nullptr);
// Make the CURRENT file point to the descriptor file with the
// specified number.
extern Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number);
// Make the IDENTITY file for the db
extern Status SetIdentityFile(Env* env, const std::string& dbname);
} // namespace rocksdb

140
db/filename_test.cc Normal file
View File

@ -0,0 +1,140 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include "db/dbformat.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace rocksdb {
class FileNameTest { };
TEST(FileNameTest, Parse) {
Slice db;
FileType type;
uint64_t number;
// Successful parses
static struct {
const char* fname;
uint64_t number;
FileType type;
} cases[] = {
{ "100.log", 100, kLogFile },
{ "0.log", 0, kLogFile },
{ "0.sst", 0, kTableFile },
{ "CURRENT", 0, kCurrentFile },
{ "LOCK", 0, kDBLockFile },
{ "MANIFEST-2", 2, kDescriptorFile },
{ "MANIFEST-7", 7, kDescriptorFile },
{ "METADB-2", 2, kMetaDatabase },
{ "METADB-7", 7, kMetaDatabase },
{ "LOG", 0, kInfoLogFile },
{ "LOG.old", 0, kInfoLogFile },
{ "18446744073709551615.log", 18446744073709551615ull, kLogFile },
};
for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
std::string f = cases[i].fname;
ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
ASSERT_EQ(cases[i].type, type) << f;
ASSERT_EQ(cases[i].number, number) << f;
}
// Errors
static const char* errors[] = {
"",
"foo",
"foo-dx-100.log",
".log",
"",
"manifest",
"CURREN",
"CURRENTX",
"MANIFES",
"MANIFEST",
"MANIFEST-",
"XMANIFEST-3",
"MANIFEST-3x",
"META",
"METADB",
"METADB-",
"XMETADB-3",
"METADB-3x",
"LOC",
"LOCKx",
"LO",
"LOGx",
"18446744073709551616.log",
"184467440737095516150.log",
"100",
"100.",
"100.lop"
};
for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
std::string f = errors[i];
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
};
}
TEST(FileNameTest, Construction) {
uint64_t number;
FileType type;
std::string fname;
fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(0U, number);
ASSERT_EQ(kCurrentFile, type);
fname = LockFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(0U, number);
ASSERT_EQ(kDBLockFile, type);
fname = LogFileName("foo", 192);
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(192U, number);
ASSERT_EQ(kLogFile, type);
fname = TableFileName("bar", 200);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(200U, number);
ASSERT_EQ(kTableFile, type);
fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(100U, number);
ASSERT_EQ(kDescriptorFile, type);
fname = TempFileName("tmp", 999);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(999U, number);
ASSERT_EQ(kTempFile, type);
fname = MetaDatabaseName("met", 100);
ASSERT_EQ("met/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(100U, number);
ASSERT_EQ(kMetaDatabase, type);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

36
db/log_format.h Normal file
View File

@ -0,0 +1,36 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Log format information shared by reader and writer.
// See ../doc/log_format.txt for more detail.
#pragma once
namespace rocksdb {
namespace log {
enum RecordType {
// Zero is reserved for preallocated files
kZeroType = 0,
kFullType = 1,
// For fragments
kFirstType = 2,
kMiddleType = 3,
kLastType = 4
};
static const int kMaxRecordType = kLastType;
static const unsigned int kBlockSize = 32768;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;
} // namespace log
} // namespace rocksdb

264
db/log_reader.cc Normal file
View File

@ -0,0 +1,264 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include <stdio.h>
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace rocksdb {
namespace log {
Reader::Reporter::~Reporter() {
}
Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
bool checksum, uint64_t initial_offset)
: file_(std::move(file)),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
buffer_(),
eof_(false),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset) {
}
Reader::~Reader() {
delete[] backing_store_;
}
bool Reader::SkipToInitialBlock() {
size_t offset_in_block = initial_offset_ % kBlockSize;
uint64_t block_start_location = initial_offset_ - offset_in_block;
// Don't search a block if we'd be in the trailer
if (offset_in_block > kBlockSize - 6) {
offset_in_block = 0;
block_start_location += kBlockSize;
}
end_of_buffer_offset_ = block_start_location;
// Skip to start of first block that can contain the initial record
if (block_start_location > 0) {
Status skip_status = file_->Skip(block_start_location);
if (!skip_status.ok()) {
ReportDrop(block_start_location, skip_status);
return false;
}
}
return true;
}
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
if (last_record_offset_ < initial_offset_) {
if (!SkipToInitialBlock()) {
return false;
}
}
scratch->clear();
record->clear();
bool in_fragmented_record = false;
// Record offset of the logical record that we're reading
// 0 is a dummy value to make compilers happy
uint64_t prospective_record_offset = 0;
Slice fragment;
while (true) {
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
const unsigned int record_type = ReadPhysicalRecord(&fragment);
switch (record_type) {
case kFullType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(1)");
}
}
prospective_record_offset = physical_record_offset;
scratch->clear();
*record = fragment;
last_record_offset_ = prospective_record_offset;
return true;
case kFirstType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(2)");
}
}
prospective_record_offset = physical_record_offset;
scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true;
break;
case kMiddleType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(1)");
} else {
scratch->append(fragment.data(), fragment.size());
}
break;
case kLastType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(2)");
} else {
scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch);
last_record_offset_ = prospective_record_offset;
return true;
}
break;
case kEof:
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "partial record without end(3)");
scratch->clear();
}
return false;
case kBadRecord:
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "error in middle of record");
in_fragmented_record = false;
scratch->clear();
}
break;
default: {
char buf[40];
snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
ReportCorruption(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
buf);
in_fragmented_record = false;
scratch->clear();
break;
}
}
}
return false;
}
uint64_t Reader::LastRecordOffset() {
return last_record_offset_;
}
void Reader::ReportCorruption(size_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason));
}
void Reader::ReportDrop(size_t bytes, const Status& reason) {
if (reporter_ != nullptr &&
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
reporter_->Corruption(bytes, reason);
}
}
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
while (true) {
if (buffer_.size() < (size_t)kHeaderSize) {
if (!eof_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) {
buffer_.clear();
ReportDrop(kBlockSize, status);
eof_ = true;
return kEof;
} else if (buffer_.size() < (size_t)kBlockSize) {
eof_ = true;
}
continue;
} else if (buffer_.size() == 0) {
// End of file
return kEof;
} else {
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "truncated record at end of file");
return kEof;
}
}
// Parse the header
const char* header = buffer_.data();
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
const unsigned int type = header[6];
const uint32_t length = a | (b << 8);
if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "bad record length");
return kBadRecord;
}
if (type == kZeroType && length == 0) {
// Skip zero length record without reporting any drops since
// such records are produced by the mmap based writing code in
// env_posix.cc that preallocates file regions.
buffer_.clear();
return kBadRecord;
}
// Check crc
if (checksum_) {
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
if (actual_crc != expected_crc) {
// Drop the rest of the buffer since "length" itself may have
// been corrupted and if we trust it, we could find some
// fragment of a real log record that just happens to look
// like a valid log record.
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "checksum mismatch");
return kBadRecord;
}
}
buffer_.remove_prefix(kHeaderSize + length);
// Skip physical record that started before initial_offset_
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
initial_offset_) {
result->clear();
return kBadRecord;
}
*result = Slice(header + kHeaderSize, length);
return type;
}
}
} // namespace log
} // namespace rocksdb

124
db/log_reader.h Normal file
View File

@ -0,0 +1,124 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <stdint.h>
#include "db/log_format.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
namespace rocksdb {
class SequentialFile;
using std::unique_ptr;
namespace log {
class Reader {
public:
// Interface for reporting errors.
class Reporter {
public:
virtual ~Reporter();
// Some corruption was detected. "size" is the approximate number
// of bytes dropped due to the corruption.
virtual void Corruption(size_t bytes, const Status& status) = 0;
};
// Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use.
//
// If "reporter" is non-nullptr, it is notified whenever some data is
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
//
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
bool checksum, uint64_t initial_offset);
~Reader();
// Read the next record into *record. Returns true if read
// successfully, false if we hit end of the input. May use
// "*scratch" as temporary storage. The contents filled in *record
// will only be valid until the next mutating operation on this
// reader or the next mutation to *scratch.
bool ReadRecord(Slice* record, std::string* scratch);
// Returns the physical offset of the last record returned by ReadRecord.
//
// Undefined before the first call to ReadRecord.
uint64_t LastRecordOffset();
// returns true if the reader has encountered an eof condition.
bool IsEOF() {
return eof_;
}
// when we know more data has been written to the file. we can use this
// function to force the reader to look again in the file.
void UnmarkEOF() {
eof_ = false;
}
SequentialFile* file() { return file_.get(); }
private:
const unique_ptr<SequentialFile> file_;
Reporter* const reporter_;
bool const checksum_;
char* const backing_store_;
Slice buffer_;
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
// Offset of the last record returned by ReadRecord.
uint64_t last_record_offset_;
// Offset of the first location past the end of buffer_.
uint64_t end_of_buffer_offset_;
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;
// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
// Returned whenever we find an invalid physical record.
// Currently there are three situations in which this happens:
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
// * The record is a 0-length record (No drop is reported)
// * The record is below constructor's initial_offset (No drop is reported)
kBadRecord = kMaxRecordType + 2
};
// Skips all blocks that are completely before "initial_offset_".
//
// Returns true on success. Handles reporting.
bool SkipToInitialBlock();
// Return type, or one of the preceding special values
unsigned int ReadPhysicalRecord(Slice* result);
// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.
void ReportCorruption(size_t bytes, const char* reason);
void ReportDrop(size_t bytes, const Status& reason);
// No copying allowed
Reader(const Reader&);
void operator=(const Reader&);
};
} // namespace log
} // namespace rocksdb

528
db/log_test.cc Normal file
View File

@ -0,0 +1,528 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/random.h"
#include "util/testharness.h"
namespace rocksdb {
namespace log {
// Construct a string of the specified length made out of the supplied
// partial string.
static std::string BigString(const std::string& partial_string, size_t n) {
std::string result;
while (result.size() < n) {
result.append(partial_string);
}
result.resize(n);
return result;
}
// Construct a string from a number
static std::string NumberString(int n) {
char buf[50];
snprintf(buf, sizeof(buf), "%d.", n);
return std::string(buf);
}
// Return a skewed potentially long string
static std::string RandomSkewedString(int i, Random* rnd) {
return BigString(NumberString(i), rnd->Skewed(17));
}
class LogTest {
private:
class StringDest : public WritableFile {
public:
std::string contents_;
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
virtual Status Append(const Slice& slice) {
contents_.append(slice.data(), slice.size());
return Status::OK();
}
};
class StringSource : public SequentialFile {
public:
Slice contents_;
bool force_error_;
bool returned_partial_;
StringSource() : force_error_(false), returned_partial_(false) { }
virtual Status Read(size_t n, Slice* result, char* scratch) {
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
if (force_error_) {
force_error_ = false;
returned_partial_ = true;
return Status::Corruption("read error");
}
if (contents_.size() < n) {
n = contents_.size();
returned_partial_ = true;
}
*result = Slice(contents_.data(), n);
contents_.remove_prefix(n);
return Status::OK();
}
virtual Status Skip(uint64_t n) {
if (n > contents_.size()) {
contents_.clear();
return Status::NotFound("in-memory file skipepd past end");
}
contents_.remove_prefix(n);
return Status::OK();
}
};
class ReportCollector : public Reader::Reporter {
public:
size_t dropped_bytes_;
std::string message_;
ReportCollector() : dropped_bytes_(0) { }
virtual void Corruption(size_t bytes, const Status& status) {
dropped_bytes_ += bytes;
message_.append(status.ToString());
}
};
std::string& dest_contents() {
auto dest = dynamic_cast<StringDest*>(writer_.file());
assert(dest);
return dest->contents_;
}
const std::string& dest_contents() const {
auto dest = dynamic_cast<const StringDest*>(writer_.file());
assert(dest);
return dest->contents_;
}
void reset_source_contents() {
auto src = dynamic_cast<StringSource*>(reader_.file());
assert(src);
src->contents_ = dest_contents();
}
unique_ptr<StringDest> dest_holder_;
unique_ptr<StringSource> source_holder_;
ReportCollector report_;
bool reading_;
Writer writer_;
Reader reader_;
// Record metadata for testing initial offset functionality
static size_t initial_offset_record_sizes_[];
static uint64_t initial_offset_last_record_offsets_[];
public:
LogTest() : dest_holder_(new StringDest),
source_holder_(new StringSource),
reading_(false),
writer_(std::move(dest_holder_)),
reader_(std::move(source_holder_), &report_, true/*checksum*/,
0/*initial_offset*/) {
}
void Write(const std::string& msg) {
ASSERT_TRUE(!reading_) << "Write() after starting to read";
writer_.AddRecord(Slice(msg));
}
size_t WrittenBytes() const {
return dest_contents().size();
}
std::string Read() {
if (!reading_) {
reading_ = true;
reset_source_contents();
}
std::string scratch;
Slice record;
if (reader_.ReadRecord(&record, &scratch)) {
return record.ToString();
} else {
return "EOF";
}
}
void IncrementByte(int offset, int delta) {
dest_contents()[offset] += delta;
}
void SetByte(int offset, char new_byte) {
dest_contents()[offset] = new_byte;
}
void ShrinkSize(int bytes) {
dest_contents().resize(dest_contents().size() - bytes);
}
void FixChecksum(int header_offset, int len) {
// Compute crc of type/len/data
uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
crc = crc32c::Mask(crc);
EncodeFixed32(&dest_contents()[header_offset], crc);
}
void ForceError() {
auto src = dynamic_cast<StringSource*>(reader_.file());
src->force_error_ = true;
}
size_t DroppedBytes() const {
return report_.dropped_bytes_;
}
std::string ReportMessage() const {
return report_.message_;
}
// Returns OK iff recorded error message contains "msg"
std::string MatchError(const std::string& msg) const {
if (report_.message_.find(msg) == std::string::npos) {
return report_.message_;
} else {
return "OK";
}
}
void WriteInitialOffsetLog() {
for (int i = 0; i < 4; i++) {
std::string record(initial_offset_record_sizes_[i],
static_cast<char>('a' + i));
Write(record);
}
}
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
WriteInitialOffsetLog();
reading_ = true;
unique_ptr<StringSource> source(new StringSource);
source->contents_ = dest_contents();
unique_ptr<Reader> offset_reader(
new Reader(std::move(source), &report_, true/*checksum*/,
WrittenBytes() + offset_past_end));
Slice record;
std::string scratch;
ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
}
void CheckInitialOffsetRecord(uint64_t initial_offset,
int expected_record_offset) {
WriteInitialOffsetLog();
reading_ = true;
unique_ptr<StringSource> source(new StringSource);
source->contents_ = dest_contents();
unique_ptr<Reader> offset_reader(
new Reader(std::move(source), &report_, true/*checksum*/,
initial_offset));
Slice record;
std::string scratch;
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
record.size());
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
offset_reader->LastRecordOffset());
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
}
};
size_t LogTest::initial_offset_record_sizes_[] =
{10000, // Two sizable records in first block
10000,
2 * log::kBlockSize - 1000, // Span three blocks
1};
uint64_t LogTest::initial_offset_last_record_offsets_[] =
{0,
kHeaderSize + 10000,
2 * (kHeaderSize + 10000),
2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ReadWrite) {
Write("foo");
Write("bar");
Write("");
Write("xxxx");
ASSERT_EQ("foo", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("", Read());
ASSERT_EQ("xxxx", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
}
TEST(LogTest, ManyBlocks) {
for (int i = 0; i < 100000; i++) {
Write(NumberString(i));
}
for (int i = 0; i < 100000; i++) {
ASSERT_EQ(NumberString(i), Read());
}
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, Fragmentation) {
Write("small");
Write(BigString("medium", 50000));
Write(BigString("large", 100000));
ASSERT_EQ("small", Read());
ASSERT_EQ(BigString("medium", 50000), Read());
ASSERT_EQ(BigString("large", 100000), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, MarginalTrailer) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, MarginalTrailer2) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(0U, DroppedBytes());
ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, ShortTrailer) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, AlignedEof) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, RandomRead) {
const int N = 500;
Random write_rnd(301);
for (int i = 0; i < N; i++) {
Write(RandomSkewedString(i, &write_rnd));
}
Random read_rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
}
ASSERT_EQ("EOF", Read());
}
// Tests of all the error paths in log_reader.cc follow:
TEST(LogTest, ReadError) {
Write("foo");
ForceError();
ASSERT_EQ("EOF", Read());
ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("read error"));
}
TEST(LogTest, BadRecordType) {
Write("foo");
// Type is stored in header[6]
IncrementByte(6, 100);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("unknown record type"));
}
TEST(LogTest, TruncatedTrailingRecord) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
ASSERT_EQ((unsigned int)(kHeaderSize - 1), DroppedBytes());
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
}
TEST(LogTest, BadLength) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ((unsigned int)(kHeaderSize + 2), DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}
TEST(LogTest, ChecksumMismatch) {
Write("foo");
IncrementByte(0, 10);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(10U, DroppedBytes());
ASSERT_EQ("OK", MatchError("checksum mismatch"));
}
TEST(LogTest, UnexpectedMiddleType) {
Write("foo");
SetByte(6, kMiddleType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedLastType) {
Write("foo");
SetByte(6, kLastType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedFullType) {
Write("foo");
Write("bar");
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, UnexpectedFirstType) {
Write("foo");
Write(BigString("bar", 100000));
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ(BigString("bar", 100000), Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)
// where the middle two fragments disappear. We do not want
// first(R1),last(R2) to get joined and returned as a valid record.
// Write records that span two blocks
Write(BigString("foo", kBlockSize));
Write(BigString("bar", kBlockSize));
Write("correct");
// Wipe the middle block
for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
SetByte(offset, 'x');
}
ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read());
const unsigned int dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize);
}
TEST(LogTest, ReadStart) {
CheckInitialOffsetRecord(0, 0);
}
TEST(LogTest, ReadSecondOneOff) {
CheckInitialOffsetRecord(1, 1);
}
TEST(LogTest, ReadSecondTenThousand) {
CheckInitialOffsetRecord(10000, 1);
}
TEST(LogTest, ReadSecondStart) {
CheckInitialOffsetRecord(10007, 1);
}
TEST(LogTest, ReadThirdOneOff) {
CheckInitialOffsetRecord(10008, 2);
}
TEST(LogTest, ReadThirdStart) {
CheckInitialOffsetRecord(20014, 2);
}
TEST(LogTest, ReadFourthOneOff) {
CheckInitialOffsetRecord(20015, 3);
}
TEST(LogTest, ReadFourthFirstBlockTrailer) {
CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
}
TEST(LogTest, ReadFourthMiddleBlock) {
CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
}
TEST(LogTest, ReadFourthLastBlock) {
CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
}
TEST(LogTest, ReadFourthStart) {
CheckInitialOffsetRecord(
2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
3);
}
TEST(LogTest, ReadEnd) {
CheckOffsetPastEndReturnsNoRecords(0);
}
TEST(LogTest, ReadPastEnd) {
CheckOffsetPastEndReturnsNoRecords(5);
}
} // namespace log
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

108
db/log_writer.cc Normal file
View File

@ -0,0 +1,108 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_writer.h"
#include <stdint.h>
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace rocksdb {
namespace log {
Writer::Writer(unique_ptr<WritableFile>&& dest)
: dest_(std::move(dest)),
block_offset_(0) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc_[i] = crc32c::Value(&t, 1);
}
}
Writer::~Writer() {
}
Status Writer::AddRecord(const Slice& slice) {
const char* ptr = slice.data();
size_t left = slice.size();
// Fragment the record if necessary and emit it. Note that if slice
// is empty, we still want to iterate once to emit a single
// zero-length record
Status s;
bool begin = true;
do {
const int leftover = kBlockSize - block_offset_;
assert(leftover >= 0);
if (leftover < kHeaderSize) {
// Switch to a new block
if (leftover > 0) {
// Fill the trailer (literal below relies on kHeaderSize being 7)
assert(kHeaderSize == 7);
dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
}
block_offset_ = 0;
}
// Invariant: we never leave < kHeaderSize bytes in a block.
assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
const size_t fragment_length = (left < avail) ? left : avail;
RecordType type;
const bool end = (left == fragment_length);
if (begin && end) {
type = kFullType;
} else if (begin) {
type = kFirstType;
} else if (end) {
type = kLastType;
} else {
type = kMiddleType;
}
s = EmitPhysicalRecord(type, ptr, fragment_length);
ptr += fragment_length;
left -= fragment_length;
begin = false;
} while (s.ok() && left > 0);
return s;
}
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
assert(n <= 0xffff); // Must fit in two bytes
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
// Format the header
char buf[kHeaderSize];
buf[4] = static_cast<char>(n & 0xff);
buf[5] = static_cast<char>(n >> 8);
buf[6] = static_cast<char>(t);
// Compute the crc of the record type and the payload.
uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
crc = crc32c::Mask(crc); // Adjust for storage
EncodeFixed32(buf, crc);
// Write the header and the payload
Status s = dest_->Append(Slice(buf, kHeaderSize));
if (s.ok()) {
s = dest_->Append(Slice(ptr, n));
if (s.ok()) {
s = dest_->Flush();
}
}
block_offset_ += kHeaderSize + n;
return s;
}
} // namespace log
} // namespace rocksdb

55
db/log_writer.h Normal file
View File

@ -0,0 +1,55 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <stdint.h>
#include "db/log_format.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
namespace rocksdb {
class WritableFile;
using std::unique_ptr;
namespace log {
class Writer {
public:
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(unique_ptr<WritableFile>&& dest);
~Writer();
Status AddRecord(const Slice& slice);
WritableFile* file() { return dest_.get(); }
const WritableFile* file() const { return dest_.get(); }
private:
unique_ptr<WritableFile> dest_;
int block_offset_; // Current offset in block
// crc32c values for all supported record types. These are
// pre-computed to reduce the overhead of computing the crc of the
// record type stored in the header.
uint32_t type_crc_[kMaxRecordType + 1];
Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
// No copying allowed
Writer(const Writer&);
void operator=(const Writer&);
};
} // namespace log
} // namespace rocksdb

330
db/memtable.cc Normal file
View File

@ -0,0 +1,330 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/memtable.h"
#include <memory>
#include "db/dbformat.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "util/coding.h"
#include "util/mutexlock.h"
#include "util/murmurhash.h"
namespace std {
template <>
struct hash<rocksdb::Slice> {
size_t operator()(const rocksdb::Slice& slice) const {
return MurmurHash(slice.data(), slice.size(), 0);
}
};
}
namespace rocksdb {
MemTable::MemTable(const InternalKeyComparator& cmp,
std::shared_ptr<MemTableRepFactory> table_factory,
int numlevel,
const Options& options)
: comparator_(cmp),
refs_(0),
arena_impl_(options.arena_block_size),
table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)),
flush_in_progress_(false),
flush_completed_(false),
file_number_(0),
edit_(numlevel),
first_seqno_(0),
mem_next_logfile_number_(0),
mem_logfile_number_(0),
locks_(options.inplace_update_support
? options.inplace_update_num_locks
: 0) { }
MemTable::~MemTable() {
assert(refs_ == 0);
}
size_t MemTable::ApproximateMemoryUsage() {
return arena_impl_.ApproximateMemoryUsage() +
table_->ApproximateMemoryUsage();
}
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
const {
// Internal keys are encoded as length-prefixed strings.
Slice a = GetLengthPrefixedSlice(aptr);
Slice b = GetLengthPrefixedSlice(bptr);
return comparator.Compare(a, b);
}
Slice MemTableRep::UserKey(const char* key) const {
Slice slice = GetLengthPrefixedSlice(key);
return Slice(slice.data(), slice.size() - 8);
}
// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.
static const char* EncodeKey(std::string* scratch, const Slice& target) {
scratch->clear();
PutVarint32(scratch, target.size());
scratch->append(target.data(), target.size());
return scratch->data();
}
class MemTableIterator: public Iterator {
public:
MemTableIterator(MemTableRep* table, const ReadOptions& options)
: iter_() {
if (options.prefix) {
iter_ = table->GetPrefixIterator(*options.prefix);
} else if (options.prefix_seek) {
iter_ = table->GetDynamicPrefixIterator();
} else {
iter_ = table->GetIterator();
}
}
virtual bool Valid() const { return iter_->Valid(); }
virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
virtual void SeekToFirst() { iter_->SeekToFirst(); }
virtual void SeekToLast() { iter_->SeekToLast(); }
virtual void Next() { iter_->Next(); }
virtual void Prev() { iter_->Prev(); }
virtual Slice key() const {
return GetLengthPrefixedSlice(iter_->key());
}
virtual Slice value() const {
Slice key_slice = GetLengthPrefixedSlice(iter_->key());
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
}
virtual Status status() const { return Status::OK(); }
private:
std::shared_ptr<MemTableRep::Iterator> iter_;
std::string tmp_; // For passing to EncodeKey
// No copying allowed
MemTableIterator(const MemTableIterator&);
void operator=(const MemTableIterator&);
};
Iterator* MemTable::NewIterator(const ReadOptions& options) {
return new MemTableIterator(table_.get(), options);
}
port::RWMutex* MemTable::GetLock(const Slice& key) {
return &locks_[std::hash<Slice>()(key) % locks_.size()];
}
void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key,
const Slice& value) {
// Format of an entry is concatenation of:
// key_size : varint32 of internal_key.size()
// key bytes : char[internal_key.size()]
// value_size : varint32 of value.size()
// value bytes : char[value.size()]
size_t key_size = key.size();
size_t val_size = value.size();
size_t internal_key_size = key_size + 8;
const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size;
char* buf = arena_impl_.Allocate(encoded_len);
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
p += key_size;
EncodeFixed64(p, (s << 8) | type);
p += 8;
p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size);
assert((p + val_size) - buf == (unsigned)encoded_len);
table_->Insert(buf);
// The first sequence number inserted into the memtable
assert(first_seqno_ == 0 || s > first_seqno_);
if (first_seqno_ == 0) {
first_seqno_ = s;
}
}
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
std::deque<std::string>* operands, const Options& options) {
Slice memkey = key.memtable_key();
std::shared_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(key.user_key()));
iter->Seek(memkey.data());
// It is the caller's responsibility to allocate/delete operands list
assert(operands != nullptr);
bool merge_in_progress = s->IsMergeInProgress();
auto merge_operator = options.merge_operator.get();
auto logger = options.info_log;
std::string merge_result;
for (; iter->Valid(); iter->Next()) {
// entry format is:
// klength varint32
// userkey char[klength-8]
// tag uint64
// vlength varint32
// value char[vlength]
// Check that it belongs to same user key. We do not check the
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
const char* entry = iter->key();
uint32_t key_length;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
if (options.inplace_update_support) {
GetLock(key.user_key())->ReadLock();
}
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
*s = Status::OK();
if (merge_in_progress) {
assert(merge_operator);
if (!merge_operator->FullMerge(key.user_key(), &v, *operands,
value, logger.get())) {
RecordTick(options.statistics, NUMBER_MERGE_FAILURES);
*s = Status::Corruption("Error: Could not perform merge.");
}
} else {
value->assign(v.data(), v.size());
}
if (options.inplace_update_support) {
GetLock(key.user_key())->Unlock();
}
return true;
}
case kTypeDeletion: {
if (merge_in_progress) {
assert(merge_operator);
*s = Status::OK();
if (!merge_operator->FullMerge(key.user_key(), nullptr, *operands,
value, logger.get())) {
RecordTick(options.statistics, NUMBER_MERGE_FAILURES);
*s = Status::Corruption("Error: Could not perform merge.");
}
} else {
*s = Status::NotFound(Slice());
}
return true;
}
case kTypeMerge: {
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
merge_in_progress = true;
operands->push_front(v.ToString());
while(operands->size() >= 2) {
// Attempt to associative merge. (Returns true if successful)
if (merge_operator->PartialMerge(key.user_key(),
Slice((*operands)[0]),
Slice((*operands)[1]),
&merge_result,
logger.get())) {
operands->pop_front();
swap(operands->front(), merge_result);
} else {
// Stack them because user can't associative merge
break;
}
}
break;
}
case kTypeLogData:
assert(false);
break;
}
} else {
// exit loop if user key does not match
break;
}
}
// No change to value, since we have not yet found a Put/Delete
if (merge_in_progress) {
*s = Status::MergeInProgress("");
}
return false;
}
bool MemTable::Update(SequenceNumber seq, ValueType type,
const Slice& key,
const Slice& value) {
LookupKey lkey(key, seq);
Slice memkey = lkey.memtable_key();
std::shared_ptr<MemTableRep::Iterator> iter(
table_.get()->GetIterator(lkey.user_key()));
iter->Seek(memkey.data());
if (iter->Valid()) {
// entry format is:
// klength varint32
// userkey char[klength-8]
// tag uint64
// vlength varint32
// value char[vlength]
// Check that it belongs to same user key. We do not check the
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
const char* entry = iter->key();
uint32_t key_length;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
uint32_t vlength;
GetVarint32Ptr(key_ptr + key_length,
key_ptr + key_length+5, &vlength);
// Update value, if newValue size <= curValue size
if (value.size() <= vlength) {
char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
value.size());
WriteLock wl(GetLock(lkey.user_key()));
memcpy(p, value.data(), value.size());
assert(
(p + value.size()) - entry ==
(unsigned) (VarintLength(key_length) +
key_length +
VarintLength(value.size()) +
value.size())
);
return true;
}
}
default:
// If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
// then we probably don't have enough space to update in-place
// Maybe do something later
// Return false, and do normal Add()
return false;
}
}
}
// Key doesn't exist
return false;
}
} // namespace rocksdb

172
db/memtable.h Normal file
View File

@ -0,0 +1,172 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <string>
#include <memory>
#include <deque>
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "db/version_set.h"
#include "rocksdb/db.h"
#include "rocksdb/memtablerep.h"
#include "util/arena_impl.h"
namespace rocksdb {
class Mutex;
class MemTableIterator;
class MemTable {
public:
struct KeyComparator : public MemTableRep::KeyComparator {
const InternalKeyComparator comparator;
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
virtual int operator()(const char* a, const char* b) const;
};
// MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once.
explicit MemTable(
const InternalKeyComparator& comparator,
std::shared_ptr<MemTableRepFactory> table_factory,
int numlevel = 7,
const Options& options = Options());
// Increase reference count.
void Ref() { ++refs_; }
// Drop reference count. Delete if no more references exist.
void Unref() {
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {
delete this;
}
}
// Returns an estimate of the number of bytes of data in use by this
// data structure.
//
// REQUIRES: external synchronization to prevent simultaneous
// operations on the same MemTable.
size_t ApproximateMemoryUsage();
// Return an iterator that yields the contents of the memtable.
//
// The caller must ensure that the underlying MemTable remains live
// while the returned iterator is live. The keys returned by this
// iterator are internal keys encoded by AppendInternalKey in the
// db/dbformat.{h,cc} module.
//
// If options.prefix is supplied, it is passed to the underlying MemTableRep
// as a hint that the iterator only need to support access to keys with that
// specific prefix.
// If options.prefix is not supplied and options.prefix_seek is set, the
// iterator is not bound to a specific prefix. However, the semantics of
// Seek is changed - the result might only include keys with the same prefix
// as the seek-key.
Iterator* NewIterator(const ReadOptions& options = ReadOptions());
// Add an entry into memtable that maps key to value at the
// specified sequence number and with the specified type.
// Typically value will be empty if type==kTypeDeletion.
void Add(SequenceNumber seq, ValueType type,
const Slice& key,
const Slice& value);
// If memtable contains a value for key, store it in *value and return true.
// If memtable contains a deletion for key, store a NotFound() error
// in *status and return true.
// If memtable contains Merge operation as the most recent entry for a key,
// and the merge process does not stop (not reaching a value or delete),
// prepend the current merge operand to *operands.
// store MergeInProgress in s, and return false.
// Else, return false.
bool Get(const LookupKey& key, std::string* value, Status* s,
std::deque<std::string>* operands, const Options& options);
// Update the value and return status ok,
// if key exists in current memtable
// if new sizeof(new_value) <= sizeof(old_value) &&
// old_value for that key is a put i.e. kTypeValue
// else return false, and status - NotUpdatable()
// else return false, and status - NotFound()
bool Update(SequenceNumber seq, ValueType type,
const Slice& key,
const Slice& value);
// Returns the edits area that is needed for flushing the memtable
VersionEdit* GetEdits() { return &edit_; }
// Returns the sequence number of the first element that was inserted
// into the memtable
SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
// Returns the next active logfile number when this memtable is about to
// be flushed to storage
uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
// Sets the next active logfile number when this memtable is about to
// be flushed to storage
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
// Returns the logfile number that can be safely deleted when this
// memstore is flushed to storage
uint64_t GetLogNumber() { return mem_logfile_number_; }
// Sets the logfile number that can be safely deleted when this
// memstore is flushed to storage
void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; }
// Notify the underlying storage that no more items will be added
void MarkImmutable() { table_->MarkReadOnly(); }
private:
~MemTable(); // Private since only Unref() should be used to delete it
friend class MemTableIterator;
friend class MemTableBackwardIterator;
friend class MemTableList;
KeyComparator comparator_;
int refs_;
ArenaImpl arena_impl_;
shared_ptr<MemTableRep> table_;
// These are used to manage memtable flushes to storage
bool flush_in_progress_; // started the flush
bool flush_completed_; // finished the flush
uint64_t file_number_; // filled up after flush is complete
// The udpates to be applied to the transaction log when this
// memtable is flushed to storage.
VersionEdit edit_;
// The sequence number of the kv that was inserted first
SequenceNumber first_seqno_;
// The log files earlier than this number can be deleted.
uint64_t mem_next_logfile_number_;
// The log file that backs this memtable (to be deleted when
// memtable flush is done)
uint64_t mem_logfile_number_;
// rw locks for inplace updates
std::vector<port::RWMutex> locks_;
// No copying allowed
MemTable(const MemTable&);
void operator=(const MemTable&);
// Get the lock associated for the key
port::RWMutex* GetLock(const Slice& key);
};
} // namespace rocksdb

215
db/memtablelist.cc Normal file
View File

@ -0,0 +1,215 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "db/memtablelist.h"
#include <string>
#include "rocksdb/db.h"
#include "db/memtable.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "util/coding.h"
namespace rocksdb {
class InternalKeyComparator;
class Mutex;
class MemTableListIterator;
class VersionSet;
using std::list;
// Increase reference count on all underling memtables
void MemTableList::RefAll() {
for (auto &memtable : memlist_) {
memtable->Ref();
}
}
// Drop reference count on all underling memtables
void MemTableList::UnrefAll() {
for (auto &memtable : memlist_) {
memtable->Unref();
}
}
// Returns the total number of memtables in the list
int MemTableList::size() {
assert(num_flush_not_started_ <= size_);
return size_;
}
// Returns true if there is at least one memtable on which flush has
// not yet started.
bool MemTableList::IsFlushPending(int min_write_buffer_number_to_merge) {
if ((flush_requested_ && num_flush_not_started_ >= 1) ||
(num_flush_not_started_ >= min_write_buffer_number_to_merge)) {
assert(imm_flush_needed.NoBarrier_Load() != nullptr);
return true;
}
return false;
}
// Returns the memtables that need to be flushed.
void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
for (auto it = memlist_.rbegin(); it != memlist_.rend(); it++) {
MemTable* m = *it;
if (!m->flush_in_progress_) {
assert(!m->flush_completed_);
num_flush_not_started_--;
if (num_flush_not_started_ == 0) {
imm_flush_needed.Release_Store(nullptr);
}
m->flush_in_progress_ = true; // flushing will start very soon
ret->push_back(m);
}
}
flush_requested_ = false; // start-flush request is complete
}
// Record a successful flush in the manifest file
Status MemTableList::InstallMemtableFlushResults(
const std::vector<MemTable*> &mems,
VersionSet* vset, Status flushStatus,
port::Mutex* mu, Logger* info_log,
uint64_t file_number,
std::set<uint64_t>& pending_outputs) {
mu->AssertHeld();
// If the flush was not successful, then just reset state.
// Maybe a suceeding attempt to flush will be successful.
if (!flushStatus.ok()) {
for (MemTable* m : mems) {
assert(m->flush_in_progress_);
assert(m->file_number_ == 0);
m->flush_in_progress_ = false;
m->flush_completed_ = false;
m->edit_.Clear();
num_flush_not_started_++;
imm_flush_needed.Release_Store((void *)1);
pending_outputs.erase(file_number);
}
return flushStatus;
}
// flush was sucessful
for (size_t i = 0; i < mems.size(); ++i) {
// All the edits are associated with the first memtable of this batch.
assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
mems[i]->flush_completed_ = true;
mems[i]->file_number_ = file_number;
}
// if some other thread is already commiting, then return
Status s;
if (commit_in_progress_) {
return s;
}
// Only a single thread can be executing this piece of code
commit_in_progress_ = true;
// scan all memtables from the earliest, and commit those
// (in that order) that have finished flushing. Memetables
// are always committed in the order that they were created.
while (!memlist_.empty() && s.ok()) {
MemTable* m = memlist_.back(); // get the last element
if (!m->flush_completed_) {
break;
}
Log(info_log,
"Level-0 commit table #%lu started",
(unsigned long)m->file_number_);
// this can release and reacquire the mutex.
s = vset->LogAndApply(&m->edit_, mu);
// All the later memtables that have the same filenum
// are part of the same batch. They can be committed now.
uint64_t mem_id = 1; // how many memtables has been flushed.
do {
if (s.ok()) { // commit new state
Log(info_log,
"Level-0 commit table #%lu: memtable #%lu done",
(unsigned long)m->file_number_,
(unsigned long)mem_id);
memlist_.remove(m);
assert(m->file_number_ > 0);
// pending_outputs can be cleared only after the newly created file
// has been written to a committed version so that other concurrently
// executing compaction threads do not mistakenly assume that this
// file is not live.
pending_outputs.erase(m->file_number_);
m->Unref();
size_--;
} else {
//commit failed. setup state so that we can flush again.
Log(info_log,
"Level-0 commit table #%lu: memtable #%lu failed",
(unsigned long)m->file_number_,
(unsigned long)mem_id);
m->flush_completed_ = false;
m->flush_in_progress_ = false;
m->edit_.Clear();
num_flush_not_started_++;
pending_outputs.erase(m->file_number_);
m->file_number_ = 0;
imm_flush_needed.Release_Store((void *)1);
s = Status::IOError("Unable to commit flushed memtable");
}
++mem_id;
} while (!memlist_.empty() && (m = memlist_.back()) &&
m->file_number_ == file_number);
}
commit_in_progress_ = false;
return s;
}
// New memtables are inserted at the front of the list.
void MemTableList::Add(MemTable* m) {
assert(size_ >= num_flush_not_started_);
size_++;
memlist_.push_front(m);
m->MarkImmutable();
num_flush_not_started_++;
if (num_flush_not_started_ == 1) {
imm_flush_needed.Release_Store((void *)1);
}
}
// Returns an estimate of the number of bytes of data in use.
size_t MemTableList::ApproximateMemoryUsage() {
size_t size = 0;
for (auto &memtable : memlist_) {
size += memtable->ApproximateMemoryUsage();
}
return size;
}
// Search all the memtables starting from the most recent one.
// Return the most recent value found, if any.
// Operands stores the list of merge operations to apply, so far.
bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s,
std::deque<std::string>* operands,
const Options& options) {
for (auto &memtable : memlist_) {
if (memtable->Get(key, value, s, operands, options)) {
return true;
}
}
return false;
}
void MemTableList::GetMemTables(std::vector<MemTable*>* output) {
for (auto &memtable : memlist_) {
output->push_back(memtable);
}
}
} // namespace rocksdb

105
db/memtablelist.h Normal file
View File

@ -0,0 +1,105 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include <string>
#include <list>
#include <deque>
#include "rocksdb/db.h"
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "memtable.h"
namespace rocksdb {
class InternalKeyComparator;
class Mutex;
class MemTableListIterator;
//
// This class stores references to all the immutable memtables.
// The memtables are flushed to L0 as soon as possible and in
// any order. If there are more than one immutable memtable, their
// flushes can occur concurrently. However, they are 'committed'
// to the manifest in FIFO order to maintain correctness and
// recoverability from a crash.
//
class MemTableList {
public:
// A list of memtables.
MemTableList() : size_(0), num_flush_not_started_(0),
commit_in_progress_(false),
flush_requested_(false) {
imm_flush_needed.Release_Store(nullptr);
}
~MemTableList() {};
// so that backgrund threads can detect non-nullptr pointer to
// determine whether this is anything more to start flushing.
port::AtomicPointer imm_flush_needed;
// Increase reference count on all underling memtables
void RefAll();
// Drop reference count on all underling memtables
void UnrefAll();
// Returns the total number of memtables in the list
int size();
// Returns true if there is at least one memtable on which flush has
// not yet started.
bool IsFlushPending(int min_write_buffer_number_to_merge);
// Returns the earliest memtables that needs to be flushed. The returned
// memtables are guaranteed to be in the ascending order of created time.
void PickMemtablesToFlush(std::vector<MemTable*>* mems);
// Commit a successful flush in the manifest file
Status InstallMemtableFlushResults(const std::vector<MemTable*> &m,
VersionSet* vset, Status flushStatus,
port::Mutex* mu, Logger* info_log,
uint64_t file_number,
std::set<uint64_t>& pending_outputs);
// New memtables are inserted at the front of the list.
// Takes ownership of the referenced held on *m by the caller of Add().
void Add(MemTable* m);
// Returns an estimate of the number of bytes of data in use.
size_t ApproximateMemoryUsage();
// Search all the memtables starting from the most recent one.
// Return the most recent value found, if any.
bool Get(const LookupKey& key, std::string* value, Status* s,
std::deque<std::string>* operands, const Options& options);
// Returns the list of underlying memtables.
void GetMemTables(std::vector<MemTable*>* list);
// Request a flush of all existing memtables to storage
void FlushRequested() { flush_requested_ = true; }
// Copying allowed
// MemTableList(const MemTableList&);
// void operator=(const MemTableList&);
private:
std::list<MemTable*> memlist_;
int size_;
// the number of elements that still need flushing
int num_flush_not_started_;
// committing in progress
bool commit_in_progress_;
// Requested a flush of all memtables to storage
bool flush_requested_;
};
} // namespace rocksdb

197
db/merge_helper.cc Normal file
View File

@ -0,0 +1,197 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "merge_helper.h"
#include "db/dbformat.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/merge_operator.h"
#include <string>
#include <stdio.h>
namespace rocksdb {
// PRE: iter points to the first merge type entry
// POST: iter points to the first entry beyond the merge process (or the end)
// keys_, operands_ are updated to reflect the merge result.
// keys_ stores the list of keys encountered while merging.
// operands_ stores the list of merge operands encountered while merging.
// keys_[i] corresponds to operands_[i] for each i.
void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
bool at_bottom, shared_ptr<Statistics> stats) {
// Get a copy of the internal key, before it's invalidated by iter->Next()
// Also maintain the list of merge operands seen.
keys_.clear();
operands_.clear();
keys_.push_front(iter->key().ToString());
operands_.push_front(iter->value().ToString());
success_ = false; // Will become true if we hit Put/Delete or bottom
// We need to parse the internal key again as the parsed key is
// backed by the internal key!
// Assume no internal key corruption as it has been successfully parsed
// by the caller.
// Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
ParsedInternalKey orig_ikey;
ParseInternalKey(keys_.back(), &orig_ikey);
bool hit_the_next_user_key = false;
ParsedInternalKey ikey;
std::string merge_result; // Temporary value for merge results
for (iter->Next(); iter->Valid(); iter->Next()) {
assert(operands_.size() >= 1); // Should be invariants!
assert(keys_.size() == operands_.size());
if (!ParseInternalKey(iter->key(), &ikey)) {
// stop at corrupted key
if (assert_valid_internal_key_) {
assert(!"corrupted internal key is not expected");
}
break;
}
if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
// hit a different user key, stop right here
hit_the_next_user_key = true;
break;
}
if (stop_before && ikey.sequence <= stop_before) {
// hit an entry that's visible by the previous snapshot, can't touch that
break;
}
// At this point we are guaranteed that we need to process this key.
if (kTypeDeletion == ikey.type) {
// hit a delete
// => merge nullptr with operands_
// => store result in operands_.back() (and update keys_.back())
// => change the entry type to kTypeValue for keys_.back()
// We are done! Return a success if the merge passes.
success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
operands_, &merge_result,
logger_);
// We store the result in keys_.back() and operands_.back()
// if nothing went wrong (i.e.: no operand corruption on disk)
if (success_) {
std::string& key = keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
orig_ikey.sequence, orig_ikey.type);
swap(operands_.back(), merge_result);
} else {
RecordTick(stats, NUMBER_MERGE_FAILURES);
}
// move iter to the next entry (before doing anything else)
iter->Next();
return;
}
if (kTypeValue == ikey.type) {
// hit a put
// => merge the put value with operands_
// => store result in operands_.back() (and update keys_.back())
// => change the entry type to kTypeValue for keys_.back()
// We are done! Success!
const Slice value = iter->value();
success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
operands_, &merge_result,
logger_);
// We store the result in keys_.back() and operands_.back()
// if nothing went wrong (i.e.: no operand corruption on disk)
if (success_) {
std::string& key = keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
orig_ikey.sequence, orig_ikey.type);
swap(operands_.back(), merge_result);
} else {
RecordTick(stats, NUMBER_MERGE_FAILURES);
}
// move iter to the next entry
iter->Next();
return;
}
if (kTypeMerge == ikey.type) {
// hit a merge
// => merge the operand into the front of the operands_ list
// => use the user's associative merge function to determine how.
// => then continue because we haven't yet seen a Put/Delete.
assert(!operands_.empty()); // Should have at least one element in it
keys_.push_front(iter->key().ToString());
operands_.push_front(iter->value().ToString());
while (operands_.size() >= 2) {
// Returns false when the merge_operator can no longer process it
if (user_merge_operator_->PartialMerge(ikey.user_key,
Slice(operands_[0]),
Slice(operands_[1]),
&merge_result,
logger_)) {
// Merging of operands (associative merge) was successful.
// Replace these frontmost two operands with the merge result
keys_.pop_front();
operands_.pop_front();
swap(operands_.front(), merge_result);
} else {
// Merging of operands (associative merge) returned false.
// The user merge_operator does not know how to merge these operands.
// So we just stack them up until we find a Put/Delete or end of key.
break;
}
}
continue;
}
}
// We are sure we have seen this key's entire history if we are at the
// last level and exhausted all internal keys of this user key.
// NOTE: !iter->Valid() does not necessarily mean we hit the
// beginning of a user key, as versions of a user key might be
// split into multiple files (even files on the same level)
// and some files might not be included in the compaction/merge.
//
// There are also cases where we have seen the root of history of this
// key without being sure of it. Then, we simply miss the opportunity
// to combine the keys. Since VersionSet::SetupOtherInputs() always makes
// sure that all merge-operands on the same level get compacted together,
// this will simply lead to these merge operands moving to the next level.
//
// So, we only perform the following logic (to merge all operands together
// without a Put/Delete) if we are certain that we have seen the end of key.
bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
if (surely_seen_the_beginning) {
// do a final merge with nullptr as the existing value and say
// bye to the merge type (it's now converted to a Put)
assert(kTypeMerge == orig_ikey.type);
assert(operands_.size() >= 1);
assert(operands_.size() == keys_.size());
success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
operands_, &merge_result,
logger_);
if (success_) {
std::string& key = keys_.back(); // The original key encountered
orig_ikey.type = kTypeValue;
UpdateInternalKey(&key[0], key.size(),
orig_ikey.sequence, orig_ikey.type);
// The final value() is always stored in operands_.back()
swap(operands_.back(),merge_result);
} else {
RecordTick(stats, NUMBER_MERGE_FAILURES);
// Do nothing if not success_. Leave keys() and operands() as they are.
}
}
}
} // namespace rocksdb

102
db/merge_helper.h Normal file
View File

@ -0,0 +1,102 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#ifndef MERGE_HELPER_H
#define MERGE_HELPER_H
#include "db/dbformat.h"
#include "rocksdb/slice.h"
#include "rocksdb/statistics.h"
#include <string>
#include <deque>
namespace rocksdb {
class Comparator;
class Iterator;
class Logger;
class MergeOperator;
class MergeHelper {
public:
MergeHelper(const Comparator* user_comparator,
const MergeOperator* user_merge_operator,
Logger* logger,
bool assert_valid_internal_key)
: user_comparator_(user_comparator),
user_merge_operator_(user_merge_operator),
logger_(logger),
assert_valid_internal_key_(assert_valid_internal_key),
keys_(),
operands_(),
success_(false) {}
// Merge entries until we hit
// - a corrupted key
// - a Put/Delete,
// - a different user key,
// - a specific sequence number (snapshot boundary),
// or - the end of iteration
// iter: (IN) points to the first merge type entry
// (OUT) points to the first entry not included in the merge process
// stop_before: (IN) a sequence number that merge should not cross.
// 0 means no restriction
// at_bottom: (IN) true if the iterator covers the bottem level, which means
// we could reach the start of the history of this user key.
void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
bool at_bottom = false, shared_ptr<Statistics> stats=nullptr);
// Query the merge result
// These are valid until the next MergeUntil call
// If the merging was successful:
// - IsSuccess() will be true
// - key() will have the latest sequence number of the merges.
// The type will be Put or Merge. See IMPORTANT 1 note, below.
// - value() will be the result of merging all the operands together
// - The user should ignore keys() and values().
//
// IMPORTANT 1: the key type could change after the MergeUntil call.
// Put/Delete + Merge + ... + Merge => Put
// Merge + ... + Merge => Merge
//
// If the merge operator is not associative, and if a Put/Delete is not found
// then the merging will be unsuccessful. In this case:
// - IsSuccess() will be false
// - keys() contains the list of internal keys seen in order of iteration.
// - values() contains the list of values (merges) seen in the same order.
// values() is parallel to keys() so that the first entry in
// keys() is the key associated with the first entry in values()
// and so on. These lists will be the same length.
// All of these pairs will be merges over the same user key.
// See IMPORTANT 2 note below.
// - The user should ignore key() and value().
//
// IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
// So keys().back() was the first key seen by iterator.
// TODO: Re-style this comment to be like the first one
bool IsSuccess() { return success_; }
Slice key() { assert(success_); return Slice(keys_.back()); }
Slice value() { assert(success_); return Slice(operands_.back()); }
const std::deque<std::string>& keys() { assert(!success_); return keys_; }
const std::deque<std::string>& values() {
assert(!success_); return operands_;
}
private:
const Comparator* user_comparator_;
const MergeOperator* user_merge_operator_;
Logger* logger_;
bool assert_valid_internal_key_; // enforce no internal key corruption?
// the scratch area that holds the result of MergeUntil
// valid up to the next MergeUntil call
std::deque<std::string> keys_; // Keeps track of the sequence of keys seen
std::deque<std::string> operands_; // Parallel with keys_; stores the values
bool success_;
};
} // namespace rocksdb
#endif

53
db/merge_operator.cc Normal file
View File

@ -0,0 +1,53 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
/**
* Back-end implementation details specific to the Merge Operator.
*/
#include "rocksdb/merge_operator.h"
namespace rocksdb {
// Given a "real" merge from the library, call the user's
// associative merge function one-by-one on each of the operands.
// NOTE: It is assumed that the client's merge-operator will handle any errors.
bool AssociativeMergeOperator::FullMerge(
const Slice& key,
const Slice* existing_value,
const std::deque<std::string>& operand_list,
std::string* new_value,
Logger* logger) const {
// Simply loop through the operands
Slice temp_existing;
std::string temp_value;
for (const auto& operand : operand_list) {
Slice value(operand);
if (!Merge(key, existing_value, value, &temp_value, logger)) {
return false;
}
swap(temp_value, *new_value);
temp_existing = Slice(*new_value);
existing_value = &temp_existing;
}
// The result will be in *new_value. All merges succeeded.
return true;
}
// Call the user defined simple merge on the operands;
// NOTE: It is assumed that the client's merge-operator will handle any errors.
bool AssociativeMergeOperator::PartialMerge(
const Slice& key,
const Slice& left_operand,
const Slice& right_operand,
std::string* new_value,
Logger* logger) const {
return Merge(key, &left_operand, right_operand, new_value, logger);
}
} // namespace rocksdb

275
db/merge_test.cc Normal file
View File

@ -0,0 +1,275 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include <assert.h>
#include <memory>
#include <iostream>
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/merge_operator.h"
#include "db/dbformat.h"
#include "db/db_impl.h"
#include "utilities/merge_operators.h"
#include "util/testharness.h"
#include "utilities/utility_db.h"
using namespace std;
using namespace rocksdb;
std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false) {
DB* db;
StackableDB* sdb;
Options options;
options.create_if_missing = true;
options.merge_operator = MergeOperators::CreateUInt64AddOperator();
Status s;
DestroyDB(dbname, Options());
if (ttl) {
cout << "Opening database with TTL\n";
s = UtilityDB::OpenTtlDB(options, dbname, &sdb);
db = sdb;
} else {
s = DB::Open(options, dbname, &db);
}
if (!s.ok()) {
cerr << s.ToString() << endl;
assert(false);
}
return std::shared_ptr<DB>(db);
}
// Imagine we are maintaining a set of uint64 counters.
// Each counter has a distinct name. And we would like
// to support four high level operations:
// set, add, get and remove
// This is a quick implementation without a Merge operation.
class Counters {
protected:
std::shared_ptr<DB> db_;
WriteOptions put_option_;
ReadOptions get_option_;
WriteOptions delete_option_;
uint64_t default_;
public:
explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
: db_(db),
put_option_(),
get_option_(),
delete_option_(),
default_(defaultCount) {
assert(db_);
}
virtual ~Counters() {}
// public interface of Counters.
// All four functions return false
// if the underlying level db operation failed.
// mapped to a levedb Put
bool set(const string& key, uint64_t value) {
// just treat the internal rep of int64 as the string
Slice slice((char *)&value, sizeof(value));
auto s = db_->Put(put_option_, key, slice);
if (s.ok()) {
return true;
} else {
cerr << s.ToString() << endl;
return false;
}
}
// mapped to a rocksdb Delete
bool remove(const string& key) {
auto s = db_->Delete(delete_option_, key);
if (s.ok()) {
return true;
} else {
cerr << s.ToString() << std::endl;
return false;
}
}
// mapped to a rocksdb Get
bool get(const string& key, uint64_t *value) {
string str;
auto s = db_->Get(get_option_, key, &str);
if (s.IsNotFound()) {
// return default value if not found;
*value = default_;
return true;
} else if (s.ok()) {
// deserialization
if (str.size() != sizeof(uint64_t)) {
cerr << "value corruption\n";
return false;
}
*value = DecodeFixed64(&str[0]);
return true;
} else {
cerr << s.ToString() << std::endl;
return false;
}
}
// 'add' is implemented as get -> modify -> set
// An alternative is a single merge operation, see MergeBasedCounters
virtual bool add(const string& key, uint64_t value) {
uint64_t base = default_;
return get(key, &base) && set(key, base + value);
}
// convenience functions for testing
void assert_set(const string& key, uint64_t value) {
assert(set(key, value));
}
void assert_remove(const string& key) {
assert(remove(key));
}
uint64_t assert_get(const string& key) {
uint64_t value = default_;
assert(get(key, &value));
return value;
}
void assert_add(const string& key, uint64_t value) {
assert(add(key, value));
}
};
// Implement 'add' directly with the new Merge operation
class MergeBasedCounters : public Counters {
private:
WriteOptions merge_option_; // for merge
public:
explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
: Counters(db, defaultCount),
merge_option_() {
}
// mapped to a rocksdb Merge operation
virtual bool add(const string& key, uint64_t value) override {
char encoded[sizeof(uint64_t)];
EncodeFixed64(encoded, value);
Slice slice(encoded, sizeof(uint64_t));
auto s = db_->Merge(merge_option_, key, slice);
if (s.ok()) {
return true;
} else {
cerr << s.ToString() << endl;
return false;
}
}
};
void dumpDb(DB* db) {
auto it = unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
for (it->SeekToFirst(); it->Valid(); it->Next()) {
uint64_t value = DecodeFixed64(it->value().data());
cout << it->key().ToString() << ": " << value << endl;
}
assert(it->status().ok()); // Check for any errors found during the scan
}
void testCounters(Counters& counters, DB* db, bool test_compaction) {
FlushOptions o;
o.wait = true;
counters.assert_set("a", 1);
if (test_compaction) db->Flush(o);
assert(counters.assert_get("a") == 1);
counters.assert_remove("b");
// defaut value is 0 if non-existent
assert(counters.assert_get("b") == 0);
counters.assert_add("a", 2);
if (test_compaction) db->Flush(o);
// 1+2 = 3
assert(counters.assert_get("a")== 3);
dumpDb(db);
std::cout << "1\n";
// 1+...+49 = ?
uint64_t sum = 0;
for (int i = 1; i < 50; i++) {
counters.assert_add("b", i);
sum += i;
}
assert(counters.assert_get("b") == sum);
std::cout << "2\n";
dumpDb(db);
std::cout << "3\n";
if (test_compaction) {
db->Flush(o);
cout << "Compaction started ...\n";
db->CompactRange(nullptr, nullptr);
cout << "Compaction ended\n";
dumpDb(db);
assert(counters.assert_get("a")== 3);
assert(counters.assert_get("b") == sum);
}
}
void runTest(int argc, const string& dbname, const bool use_ttl = false) {
auto db = OpenDb(dbname, use_ttl);
{
cout << "Test read-modify-write counters... \n";
Counters counters(db, 0);
testCounters(counters, db.get(), true);
}
bool compact = false;
if (argc > 1) {
compact = true;
cout << "Turn on Compaction\n";
}
{
cout << "Test merge-based counters... \n";
MergeBasedCounters counters(db, 0);
testCounters(counters, db.get(), compact);
}
DestroyDB(dbname, Options());
}
int main(int argc, char *argv[]) {
//TODO: Make this test like a general rocksdb unit-test
runTest(argc, test::TmpDir() + "/merge_testdb");
runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
return 0;
}

328
db/perf_context_test.cc Normal file
View File

@ -0,0 +1,328 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include <algorithm>
#include <iostream>
#include <vector>
#include "/usr/include/valgrind/callgrind.h"
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "util/histogram.h"
#include "util/stop_watch.h"
#include "util/testharness.h"
bool FLAGS_random_key = false;
bool FLAGS_use_set_based_memetable = false;
int FLAGS_total_keys = 100;
int FLAGS_write_buffer_size = 1000000000;
int FLAGS_max_write_buffer_number = 8;
int FLAGS_min_write_buffer_number_to_merge = 7;
// Path to the database on file system
const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
namespace rocksdb {
std::shared_ptr<DB> OpenDb() {
DB* db;
Options options;
options.create_if_missing = true;
options.write_buffer_size = FLAGS_write_buffer_size;
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
options.min_write_buffer_number_to_merge =
FLAGS_min_write_buffer_number_to_merge;
if (FLAGS_use_set_based_memetable) {
auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
options.memtable_factory =
std::make_shared<rocksdb::PrefixHashRepFactory>(prefix_extractor);
}
Status s = DB::Open(options, kDbName, &db);
ASSERT_OK(s);
return std::shared_ptr<DB>(db);
}
class PerfContextTest { };
TEST(PerfContextTest, SeekIntoDeletion) {
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
db->Put(write_options, key, value);
}
for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
std::string key = "k" + std::to_string(i);
db->Delete(write_options, key);
}
HistogramImpl hist_get;
HistogramImpl hist_get_time;
for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
std::string key = "k" + std::to_string(i);
std::string value;
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
auto status = db->Get(read_options, key, &value);
auto elapsed_nanos = timer.ElapsedNanos();
ASSERT_TRUE(status.IsNotFound());
hist_get.Add(perf_context.user_key_comparison_count);
hist_get_time.Add(elapsed_nanos);
}
std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
<< "Get time: \n" << hist_get_time.ToString();
HistogramImpl hist_seek_to_first;
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->SeekToFirst();
hist_seek_to_first.Add(perf_context.user_key_comparison_count);
auto elapsed_nanos = timer.ElapsedNanos();
std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
<< "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
<< "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
<< "elapsed: " << elapsed_nanos << "\n";
HistogramImpl hist_seek;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
std::string key = "k" + std::to_string(i);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
auto elapsed_nanos = timer.ElapsedNanos();
hist_seek.Add(perf_context.user_key_comparison_count);
std::cout << "seek cmp: " << perf_context.user_key_comparison_count
<< " ikey skipped " << perf_context.internal_key_skipped_count
<< " idelete skipped " << perf_context.internal_delete_skipped_count
<< " elapsed: " << elapsed_nanos << "ns\n";
perf_context.Reset();
ASSERT_TRUE(iter->Valid());
StopWatchNano timer2(Env::Default(), true);
iter->Next();
auto elapsed_nanos2 = timer2.ElapsedNanos();
std::cout << "next cmp: " << perf_context.user_key_comparison_count
<< "elapsed: " << elapsed_nanos2 << "ns\n";
}
std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
}
TEST(PerfContextTest, StopWatchNanoOverhead) {
// profile the timer cost by itself!
const int kTotalIterations = 1000000;
std::vector<uint64_t> timings(kTotalIterations);
StopWatchNano timer(Env::Default(), true);
for (auto& timing : timings) {
timing = timer.ElapsedNanos(true /* reset */);
}
HistogramImpl histogram;
for (const auto timing : timings) {
histogram.Add(timing);
}
std::cout << histogram.ToString();
}
TEST(PerfContextTest, StopWatchOverhead) {
// profile the timer cost by itself!
const int kTotalIterations = 1000000;
std::vector<uint64_t> timings(kTotalIterations);
StopWatch timer(Env::Default());
for (auto& timing : timings) {
timing = timer.ElapsedMicros();
}
HistogramImpl histogram;
uint64_t prev_timing = 0;
for (const auto timing : timings) {
histogram.Add(timing - prev_timing);
prev_timing = timing;
}
std::cout << histogram.ToString();
}
void ProfileKeyComparison() {
DestroyDB(kDbName, Options()); // Start this test with a fresh DB
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
HistogramImpl hist_put;
HistogramImpl hist_get;
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
std::vector<int> keys;
for (int i = 0; i < FLAGS_total_keys; ++i) {
keys.push_back(i);
}
if (FLAGS_random_key) {
std::random_shuffle(keys.begin(), keys.end());
}
for (const int i : keys) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
perf_context.Reset();
db->Put(write_options, key, value);
hist_put.Add(perf_context.user_key_comparison_count);
perf_context.Reset();
db->Get(read_options, key, &value);
hist_get.Add(perf_context.user_key_comparison_count);
}
std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
<< "Get uesr key comparison: \n" << hist_get.ToString();
}
TEST(PerfContextTest, KeyComparisonCount) {
SetPerfLevel(kEnableCount);
ProfileKeyComparison();
SetPerfLevel(kDisable);
ProfileKeyComparison();
SetPerfLevel(kEnableTime);
ProfileKeyComparison();
}
// make perf_context_test
// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
// For one memtable:
// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
// For two memtables:
// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
// Specify --random_key=1 to shuffle the key before insertion
// Results show that, for sequential insertion, worst-case Seek Key comparison
// is close to the total number of keys (linear), when there is only one
// memtable. When there are two memtables, even the avg Seek Key comparison
// starts to become linear to the input size.
TEST(PerfContextTest, SeekKeyComparison) {
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
std::vector<int> keys;
for (int i = 0; i < FLAGS_total_keys; ++i) {
keys.push_back(i);
}
if (FLAGS_random_key) {
std::random_shuffle(keys.begin(), keys.end());
}
HistogramImpl hist_put_time;
HistogramImpl hist_wal_time;
HistogramImpl hist_time_diff;
SetPerfLevel(kEnableTime);
StopWatchNano timer(Env::Default());
for (const int i : keys) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
perf_context.Reset();
timer.Start();
db->Put(write_options, key, value);
auto put_time = timer.ElapsedNanos();
hist_put_time.Add(put_time);
hist_wal_time.Add(perf_context.wal_write_time);
hist_time_diff.Add(put_time - perf_context.wal_write_time);
}
std::cout << "Put time:\n" << hist_put_time.ToString()
<< "WAL time:\n" << hist_wal_time.ToString()
<< "time diff:\n" << hist_time_diff.ToString();
HistogramImpl hist_seek;
HistogramImpl hist_next;
for (int i = 0; i < FLAGS_total_keys; ++i) {
std::string key = "k" + std::to_string(i);
std::string value = "v" + std::to_string(i);
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
iter->Seek(key);
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(iter->value().ToString(), value);
hist_seek.Add(perf_context.user_key_comparison_count);
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
for (iter->SeekToFirst(); iter->Valid();) {
perf_context.Reset();
iter->Next();
hist_next.Add(perf_context.user_key_comparison_count);
}
std::cout << "Seek:\n" << hist_seek.ToString()
<< "Next:\n" << hist_next.ToString();
}
}
int main(int argc, char** argv) {
for (int i = 1; i < argc; i++) {
int n;
char junk;
if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n;
}
if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
FLAGS_total_keys = n;
}
if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_random_key = n;
}
if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_set_based_memetable = n;
}
}
std::cout << kDbName << "\n";
rocksdb::test::RunAllTests();
return 0;
}

View File

@ -0,0 +1,73 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Wrap an underlying iterator, but exclude any results not starting
// with a given prefix. Seeking to keys not beginning with the prefix
// is invalid, and SeekToLast is not implemented (that would be
// non-trivial), but otherwise this iterator will behave just like the
// underlying iterator would if there happened to be no non-matching
// keys in the dataset.
#pragma once
#include "rocksdb/iterator.h"
namespace rocksdb {
class PrefixFilterIterator : public Iterator {
private:
Iterator* iter_;
const Slice &prefix_;
const SliceTransform *prefix_extractor_;
Status status_;
public:
PrefixFilterIterator(Iterator* iter,
const Slice &prefix,
const SliceTransform* prefix_extractor)
: iter_(iter), prefix_(prefix),
prefix_extractor_(prefix_extractor),
status_(Status::OK()) {
if (prefix_extractor == nullptr) {
status_ = Status::InvalidArgument("A prefix filter may not be used "
"unless a function is also defined "
"for extracting prefixes");
} else if (!prefix_extractor_->InRange(prefix)) {
status_ = Status::InvalidArgument("Must provide a slice for prefix which"
"is a prefix for some key");
}
}
~PrefixFilterIterator() {
delete iter_;
}
Slice key() const { return iter_->key(); }
Slice value() const { return iter_->value(); }
Status status() const {
if (!status_.ok()) {
return status_;
}
return iter_->status();
}
void Next() { iter_->Next(); }
void Prev() { iter_->Prev(); }
void Seek(const Slice& k) {
if (prefix_extractor_->Transform(k) == prefix_) {
iter_->Seek(k);
} else {
status_ = Status::InvalidArgument("Seek must begin with target prefix");
}
}
void SeekToFirst() {
Seek(prefix_);
}
void SeekToLast() {
status_ = Status::NotSupported("SeekToLast is incompatible with prefixes");
}
bool Valid() const {
return (status_.ok() && iter_->Valid() &&
prefix_extractor_->Transform(iter_->key()) == prefix_);
}
};
} // namespace rocksdb

319
db/prefix_test.cc Normal file
View File

@ -0,0 +1,319 @@
#include <algorithm>
#include <iostream>
#include <vector>
#include <gflags/gflags.h>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "util/histogram.h"
#include "util/stop_watch.h"
#include "util/testharness.h"
DEFINE_bool(use_prefix_hash_memtable, true, "");
DEFINE_bool(use_nolock_version, true, "");
DEFINE_bool(trigger_deadlock, false,
"issue delete in range scan to trigger PrefixHashMap deadlock");
DEFINE_uint64(bucket_count, 100000, "number of buckets");
DEFINE_uint64(num_locks, 10001, "number of locks");
DEFINE_bool(random_prefix, false, "randomize prefix");
DEFINE_uint64(total_prefixes, 1000, "total number of prefixes");
DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix");
DEFINE_int64(write_buffer_size, 1000000000, "");
DEFINE_int64(max_write_buffer_number, 8, "");
DEFINE_int64(min_write_buffer_number_to_merge, 7, "");
// Path to the database on file system
const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
namespace rocksdb {
struct TestKey {
uint64_t prefix;
uint64_t sorted;
TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
};
// return a slice backed by test_key
inline Slice TestKeyToSlice(const TestKey& test_key) {
return Slice((const char*)&test_key, sizeof(test_key));
}
inline const TestKey* SliceToTestKey(const Slice& slice) {
assert(slice.size() == sizeof(TestKey));
return (const TestKey*)slice.data();
}
class TestKeyComparator : public Comparator {
public:
virtual int Compare(const Slice& a, const Slice& b) const {
const TestKey* key_a = SliceToTestKey(a);
const TestKey* key_b = SliceToTestKey(b);
if (key_a->prefix != key_b->prefix) {
if (key_a->prefix < key_b->prefix) return -1;
if (key_a->prefix > key_b->prefix) return 1;
assert(false);
} else {
if (key_a->sorted < key_b->sorted) return -1;
if (key_a->sorted > key_b->sorted) return 1;
if (key_a->sorted == key_b->sorted) return 0;
assert(false);
}
assert(false);
return 0;
}
virtual const char* Name() const override {
return "TestKeyComparator";
}
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const {
}
virtual void FindShortSuccessor(std::string* key) const {}
};
class PrefixTest {
public:
std::shared_ptr<DB> OpenDb() {
DB* db;
options.create_if_missing = true;
options.write_buffer_size = FLAGS_write_buffer_size;
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
options.min_write_buffer_number_to_merge =
FLAGS_min_write_buffer_number_to_merge;
options.comparator = new TestKeyComparator();
if (FLAGS_use_prefix_hash_memtable) {
auto prefix_extractor = NewFixedPrefixTransform(8);
options.prefix_extractor = prefix_extractor;
if (FLAGS_use_nolock_version) {
options.memtable_factory.reset(NewHashSkipListRepFactory(
prefix_extractor, FLAGS_bucket_count));
} else {
options.memtable_factory =
std::make_shared<rocksdb::PrefixHashRepFactory>(
prefix_extractor, FLAGS_bucket_count, FLAGS_num_locks);
}
}
Status s = DB::Open(options, kDbName, &db);
ASSERT_OK(s);
return std::shared_ptr<DB>(db);
}
~PrefixTest() {
delete options.comparator;
}
protected:
Options options;
};
TEST(PrefixTest, DynamicPrefixIterator) {
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
std::vector<uint64_t> prefixes;
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
prefixes.push_back(i);
}
if (FLAGS_random_prefix) {
std::random_shuffle(prefixes.begin(), prefixes.end());
}
// insert x random prefix, each with y continuous element.
for (auto prefix : prefixes) {
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
TestKey test_key(prefix, sorted);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(sorted);
ASSERT_OK(db->Put(write_options, key, value));
}
}
// test seek existing keys
HistogramImpl hist_seek_time;
HistogramImpl hist_seek_comparison;
if (FLAGS_use_prefix_hash_memtable) {
read_options.prefix_seek = true;
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
for (auto prefix : prefixes) {
TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
uint64_t total_keys = 0;
for (iter->Seek(key); iter->Valid(); iter->Next()) {
if (FLAGS_trigger_deadlock) {
std::cout << "Behold the deadlock!\n";
db->Delete(write_options, iter->key());
}
auto test_key = SliceToTestKey(iter->key());
if (test_key->prefix != prefix) break;
total_keys++;
}
hist_seek_time.Add(timer.ElapsedNanos());
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
}
std::cout << "Seek key comparison: \n"
<< hist_seek_comparison.ToString()
<< "Seek time: \n"
<< hist_seek_time.ToString();
// test non-existing keys
HistogramImpl hist_no_seek_time;
HistogramImpl hist_no_seek_comparison;
for (auto prefix = FLAGS_total_prefixes;
prefix < FLAGS_total_prefixes + 100;
prefix++) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
hist_no_seek_time.Add(timer.ElapsedNanos());
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_TRUE(!iter->Valid());
}
std::cout << "non-existing Seek key comparison: \n"
<< hist_no_seek_comparison.ToString()
<< "non-existing Seek time: \n"
<< hist_no_seek_time.ToString();
}
TEST(PrefixTest, PrefixHash) {
DestroyDB(kDbName, Options());
auto db = OpenDb();
WriteOptions write_options;
ReadOptions read_options;
std::vector<uint64_t> prefixes;
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
prefixes.push_back(i);
}
if (FLAGS_random_prefix) {
std::random_shuffle(prefixes.begin(), prefixes.end());
}
// insert x random prefix, each with y continuous element.
HistogramImpl hist_put_time;
HistogramImpl hist_put_comparison;
for (auto prefix : prefixes) {
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
TestKey test_key(prefix, sorted);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(sorted);
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
ASSERT_OK(db->Put(write_options, key, value));
hist_put_time.Add(timer.ElapsedNanos());
hist_put_comparison.Add(perf_context.user_key_comparison_count);
}
}
std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
<< "Put time: \n" << hist_put_time.ToString();
// test seek existing keys
HistogramImpl hist_seek_time;
HistogramImpl hist_seek_comparison;
for (auto prefix : prefixes) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
std::string value = "v" + std::to_string(0);
Slice key_prefix;
if (FLAGS_use_prefix_hash_memtable) {
key_prefix = options.prefix_extractor->Transform(key);
read_options.prefix = &key_prefix;
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
uint64_t total_keys = 0;
for (iter->Seek(key); iter->Valid(); iter->Next()) {
if (FLAGS_trigger_deadlock) {
std::cout << "Behold the deadlock!\n";
db->Delete(write_options, iter->key());
}
auto test_key = SliceToTestKey(iter->key());
if (test_key->prefix != prefix) break;
total_keys++;
}
hist_seek_time.Add(timer.ElapsedNanos());
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_EQ(total_keys, FLAGS_items_per_prefix);
}
std::cout << "Seek key comparison: \n"
<< hist_seek_comparison.ToString()
<< "Seek time: \n"
<< hist_seek_time.ToString();
// test non-existing keys
HistogramImpl hist_no_seek_time;
HistogramImpl hist_no_seek_comparison;
for (auto prefix = FLAGS_total_prefixes;
prefix < FLAGS_total_prefixes + 100;
prefix++) {
TestKey test_key(prefix, 0);
Slice key = TestKeyToSlice(test_key);
if (FLAGS_use_prefix_hash_memtable) {
Slice key_prefix = options.prefix_extractor->Transform(key);
read_options.prefix = &key_prefix;
}
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
perf_context.Reset();
StopWatchNano timer(Env::Default(), true);
iter->Seek(key);
hist_no_seek_time.Add(timer.ElapsedNanos());
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
ASSERT_TRUE(!iter->Valid());
}
std::cout << "non-existing Seek key comparison: \n"
<< hist_no_seek_comparison.ToString()
<< "non-existing Seek time: \n"
<< hist_no_seek_time.ToString();
}
}
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
std::cout << kDbName << "\n";
rocksdb::test::RunAllTests();
return 0;
}

390
db/repair.cc Normal file
View File

@ -0,0 +1,390 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// We recover the contents of the descriptor from the other files we find.
// (1) Any log files are first converted to tables
// (2) We scan every table to compute
// (a) smallest/largest for the table
// (b) largest sequence number in the table
// (3) We generate descriptor contents:
// - log number is set to zero
// - next-file-number is set to 1 + largest file number we found
// - last-sequence-number is set to largest sequence# found across
// all tables (see 2c)
// - compaction pointers are cleared
// - every table file is added at level 0
//
// Possible optimization 1:
// (a) Compute total size and use to pick appropriate max-level M
// (b) Sort tables by largest sequence# in the table
// (c) For each table: if it overlaps earlier table, place in level-0,
// else place in level-M.
// Possible optimization 2:
// Store per-table metadata (smallest, largest, largest-seq#, ...)
// in the table's meta section to speed up ScanTable.
#include "db/builder.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/write_batch_internal.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
namespace rocksdb {
namespace {
class Repairer {
public:
Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname),
env_(options.env),
icmp_(options.comparator),
ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
next_file_number_(1) {
// TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10);
edit_ = new VersionEdit(options.num_levels);
}
~Repairer() {
delete table_cache_;
delete edit_;
}
Status Run() {
Status status = FindFiles();
if (status.ok()) {
ConvertLogFilesToTables();
ExtractMetaData();
status = WriteDescriptor();
}
if (status.ok()) {
unsigned long long bytes = 0;
for (size_t i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.file_size;
}
Log(options_.info_log,
"**** Repaired rocksdb %s; "
"recovered %d files; %llu bytes. "
"Some data may have been lost. "
"****",
dbname_.c_str(),
static_cast<int>(tables_.size()),
bytes);
}
return status;
}
private:
struct TableInfo {
FileMetaData meta;
SequenceNumber min_sequence;
SequenceNumber max_sequence;
};
std::string const dbname_;
Env* const env_;
InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_;
Options const options_;
TableCache* table_cache_;
VersionEdit* edit_;
std::vector<std::string> manifests_;
std::vector<uint64_t> table_numbers_;
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
uint64_t next_file_number_;
const EnvOptions storage_options_;
Status FindFiles() {
std::vector<std::string> filenames;
Status status = env_->GetChildren(dbname_, &filenames);
if (!status.ok()) {
return status;
}
if (filenames.empty()) {
return Status::IOError(dbname_, "repair found no files");
}
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
if (type == kDescriptorFile) {
manifests_.push_back(filenames[i]);
} else {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
}
if (type == kLogFile) {
logs_.push_back(number);
} else if (type == kTableFile) {
table_numbers_.push_back(number);
} else {
// Ignore other files
}
}
}
}
return status;
}
void ConvertLogFilesToTables() {
for (size_t i = 0; i < logs_.size(); i++) {
std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) {
Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
(unsigned long long) logs_[i],
status.ToString().c_str());
}
ArchiveFile(logname);
}
}
Status ConvertLogToTable(uint64_t log) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
std::shared_ptr<Logger> info_log;
uint64_t lognum;
virtual void Corruption(size_t bytes, const Status& s) {
// We print error messages for corruption, but continue repairing.
Log(info_log, "Log #%llu: dropping %d bytes; %s",
(unsigned long long) lognum,
static_cast<int>(bytes),
s.ToString().c_str());
}
};
// Open the log file
std::string logname = LogFileName(dbname_, log);
unique_ptr<SequentialFile> lfile;
Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
if (!status.ok()) {
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
// We intentially make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
0/*initial_offset*/);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
MemTable* mem = new MemTable(icmp_, options_.memtable_factory,
options_.num_levels);
mem->Ref();
int counter = 0;
while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) {
reporter.Corruption(
record.size(), Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {
Log(options_.info_log, "Log #%llu: ignoring %s",
(unsigned long long) log,
status.ToString().c_str());
status = Status::OK(); // Keep going with rest of file
}
}
// Do not record a version edit for this conversion to a Table
// since ExtractMetaData() will also generate edits.
FileMetaData meta;
meta.number = next_file_number_++;
Iterator* iter = mem->NewIterator();
status = BuildTable(dbname_, env_, options_, storage_options_,
table_cache_, iter, &meta,
icmp_.user_comparator(), 0, 0, true);
delete iter;
mem->Unref();
mem = nullptr;
if (status.ok()) {
if (meta.file_size > 0) {
table_numbers_.push_back(meta.number);
}
}
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
(unsigned long long) log,
counter,
(unsigned long long) meta.number,
status.ToString().c_str());
return status;
}
void ExtractMetaData() {
std::vector<TableInfo> kept;
for (size_t i = 0; i < table_numbers_.size(); i++) {
TableInfo t;
t.meta.number = table_numbers_[i];
Status status = ScanTable(&t);
if (!status.ok()) {
std::string fname = TableFileName(dbname_, table_numbers_[i]);
Log(options_.info_log, "Table #%llu: ignoring %s",
(unsigned long long) table_numbers_[i],
status.ToString().c_str());
ArchiveFile(fname);
} else {
tables_.push_back(t);
}
}
}
Status ScanTable(TableInfo* t) {
std::string fname = TableFileName(dbname_, t->meta.number);
int counter = 0;
Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) {
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), storage_options_, t->meta.number, t->meta.file_size);
bool empty = true;
ParsedInternalKey parsed;
t->min_sequence = 0;
t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(options_.info_log, "Table #%llu: unparsable key %s",
(unsigned long long) t->meta.number,
EscapeString(key).c_str());
continue;
}
counter++;
if (empty) {
empty = false;
t->meta.smallest.DecodeFrom(key);
}
t->meta.largest.DecodeFrom(key);
if (parsed.sequence < t->min_sequence) {
t->min_sequence = parsed.sequence;
}
if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence;
}
}
if (!iter->status().ok()) {
status = iter->status();
}
delete iter;
}
Log(options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t->meta.number,
counter,
status.ToString().c_str());
return status;
}
Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1);
unique_ptr<WritableFile> file;
Status status = env_->NewWritableFile(tmp, &file, storage_options_);
if (!status.ok()) {
return status;
}
SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
}
}
edit_->SetComparatorName(icmp_.user_comparator()->Name());
edit_->SetLogNumber(0);
edit_->SetNextFile(next_file_number_);
edit_->SetLastSequence(max_sequence);
for (size_t i = 0; i < tables_.size(); i++) {
// TODO(opt): separate out into multiple levels
const TableInfo& t = tables_[i];
edit_->AddFile(0, t.meta.number, t.meta.file_size,
t.meta.smallest, t.meta.largest,
t.min_sequence, t.max_sequence);
}
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
{
log::Writer log(std::move(file));
std::string record;
edit_->EncodeTo(&record);
status = log.AddRecord(record);
}
if (!status.ok()) {
env_->DeleteFile(tmp);
} else {
// Discard older manifests
for (size_t i = 0; i < manifests_.size(); i++) {
ArchiveFile(dbname_ + "/" + manifests_[i]);
}
// Install new manifest
status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
if (status.ok()) {
status = SetCurrentFile(env_, dbname_, 1);
} else {
env_->DeleteFile(tmp);
}
}
return status;
}
void ArchiveFile(const std::string& fname) {
// Move into another directory. E.g., for
// dir/foo
// rename to
// dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/');
std::string new_dir;
if (slash != nullptr) {
new_dir.assign(fname.data(), slash - fname.data());
}
new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir;
new_file.append("/");
new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
Status s = env_->RenameFile(fname, new_file);
Log(options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
}
};
} // namespace
Status RepairDB(const std::string& dbname, const Options& options) {
Repairer repairer(dbname, options);
return repairer.Run();
}
} // namespace rocksdb

793
db/simple_table_db_test.cc Normal file
View File

@ -0,0 +1,793 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <set>
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "db/db_statistics.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"
using std::unique_ptr;
namespace rocksdb {
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
// as production quality.
// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
// be longer than 150000 bytes and stored data on disk in this format:
// +--------------------------------------------+ <= key1 offset
// | key1 | value_size (4 bytes) | |
// +----------------------------------------+ |
// | value1 |
// | |
// +----------------------------------------+---+ <= key2 offset
// | key2 | value_size (4 bytes) | |
// +----------------------------------------+ |
// | value2 |
// | |
// | ...... |
// +-----------------+--------------------------+ <= index_block_offset
// | key1 | key1 offset (8 bytes) |
// +-----------------+--------------------------+
// | key2 | key2 offset (8 bytes) |
// +-----------------+--------------------------+
// | key3 | key3 offset (8 bytes) |
// +-----------------+--------------------------+
// | ...... |
// +-----------------+------------+-------------+
// | index_block_offset (8 bytes) |
// +------------------------------+
// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
// as production quality.
class SimpleTableReader: public TableReader {
public:
// Attempt to open the table that is stored in bytes [0..file_size)
// of "file", and read the metadata entries necessary to allow
// retrieving data from the table.
//
// If successful, returns ok and sets "*table" to the newly opened
// table. The client should delete "*table" when no longer needed.
// If there was an error while initializing the table, sets "*table"
// to nullptr and returns a non-ok status. Does not take ownership of
// "*source", but the client must ensure that "source" remains live
// for the duration of the returned table's lifetime.
//
// *file must remain live while this Table is in use.
static Status Open(const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
unique_ptr<TableReader>* table_reader);
bool PrefixMayMatch(const Slice& internal_prefix) override;
Iterator* NewIterator(const ReadOptions&) override;
Status Get(
const ReadOptions&, const Slice& key, void* arg,
bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
void (*mark_key_may_exist)(void*) = nullptr) override;
uint64_t ApproximateOffsetOf(const Slice& key) override;
bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
void SetupForCompaction() override;
TableStats& GetTableStats() override;
~SimpleTableReader();
private:
struct Rep;
Rep* rep_;
explicit SimpleTableReader(Rep* rep) {
rep_ = rep;
}
friend class TableCache;
friend class SimpleTableIterator;
Status GetOffset(const Slice& target, uint64_t* offset);
// No copying allowed
explicit SimpleTableReader(const TableReader&) = delete;
void operator=(const TableReader&) = delete;
};
// Iterator to iterate SimpleTable
class SimpleTableIterator: public Iterator {
public:
explicit SimpleTableIterator(SimpleTableReader* table);
~SimpleTableIterator();
bool Valid() const;
void SeekToFirst();
void SeekToLast();
void Seek(const Slice& target);
void Next();
void Prev();
Slice key() const;
Slice value() const;
Status status() const;
private:
SimpleTableReader* table_;
uint64_t offset_;
uint64_t next_offset_;
Slice key_;
Slice value_;
char tmp_str_[4];
char* key_str_;
char* value_str_;
int value_str_len_;
Status status_;
// No copying allowed
SimpleTableIterator(const SimpleTableIterator&) = delete;
void operator=(const Iterator&) = delete;
};
struct SimpleTableReader::Rep {
~Rep() {
}
Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
int num_entries) :
soptions(storage_options), index_start_offset(index_start_offset),
num_entries(num_entries) {
}
Options options;
const EnvOptions& soptions;
Status status;
unique_ptr<RandomAccessFile> file;
uint64_t index_start_offset;
int num_entries;
TableStats table_stats;
const static int user_key_size = 16;
const static int offset_length = 8;
const static int key_footer_len = 8;
static int GetInternalKeyLength() {
return user_key_size + key_footer_len;
}
};
SimpleTableReader::~SimpleTableReader() {
delete rep_;
}
Status SimpleTableReader::Open(const Options& options,
const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file,
uint64_t size,
unique_ptr<TableReader>* table_reader) {
char footer_space[Rep::offset_length];
Slice footer_input;
Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
&footer_input, footer_space);
if (s.ok()) {
uint64_t index_start_offset = DecodeFixed64(footer_space);
int num_entries = (size - Rep::offset_length - index_start_offset)
/ (Rep::GetInternalKeyLength() + Rep::offset_length);
SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
index_start_offset,
num_entries);
rep->file = std::move(file);
rep->options = options;
table_reader->reset(new SimpleTableReader(rep));
}
return s;
}
void SimpleTableReader::SetupForCompaction() {
}
TableStats& SimpleTableReader::GetTableStats() {
return rep_->table_stats;
}
bool SimpleTableReader::PrefixMayMatch(const Slice& internal_prefix) {
return true;
}
Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) {
return new SimpleTableIterator(this);
}
Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
uint32_t left = 0;
uint32_t right = rep_->num_entries - 1;
char key_chars[Rep::GetInternalKeyLength()];
Slice tmp_slice;
uint32_t target_offset = 0;
while (left <= right) {
uint32_t mid = (left + right + 1) / 2;
uint64_t offset_to_read = rep_->index_start_offset
+ (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
&tmp_slice, key_chars);
if (!s.ok()) {
return s;
}
int compare_result = rep_->options.comparator->Compare(tmp_slice, target);
if (compare_result < 0) {
if (left == right) {
target_offset = right + 1;
break;
}
left = mid;
} else {
if (left == right) {
target_offset = left;
break;
}
right = mid - 1;
}
}
if (target_offset >= (uint32_t) rep_->num_entries) {
*offset = rep_->index_start_offset;
return Status::OK();
}
char value_offset_chars[Rep::offset_length];
int64_t offset_for_value_offset = rep_->index_start_offset
+ (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
+ Rep::GetInternalKeyLength();
Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
&tmp_slice, value_offset_chars);
if (s.ok()) {
*offset = DecodeFixed64(value_offset_chars);
}
return s;
}
Status SimpleTableReader::Get(
const ReadOptions& options, const Slice& k, void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool),
void (*mark_key_may_exist)(void*)) {
Status s;
SimpleTableIterator* iter = new SimpleTableIterator(this);
for (iter->Seek(k); iter->Valid(); iter->Next()) {
if (!(*saver)(arg, iter->key(), iter->value(), true)) {
break;
}
}
s = iter->status();
delete iter;
return s;
}
bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options,
const Slice& key) {
return false;
}
uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
return 0;
}
SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
table_(table) {
key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
value_str_len_ = -1;
SeekToFirst();
}
SimpleTableIterator::~SimpleTableIterator() {
delete[] key_str_;
if (value_str_len_ >= 0) {
delete[] value_str_;
}
}
bool SimpleTableIterator::Valid() const {
return offset_ < table_->rep_->index_start_offset;
}
void SimpleTableIterator::SeekToFirst() {
next_offset_ = 0;
Next();
}
void SimpleTableIterator::SeekToLast() {
assert(false);
}
void SimpleTableIterator::Seek(const Slice& target) {
Status s = table_->GetOffset(target, &next_offset_);
if (!s.ok()) {
status_ = s;
}
Next();
}
void SimpleTableIterator::Next() {
offset_ = next_offset_;
if (offset_ >= table_->rep_->index_start_offset) {
return;
}
Slice result;
int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
key_str_);
next_offset_ += internal_key_size;
key_ = result;
Slice value_size_slice;
s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
next_offset_ += 4;
uint32_t value_size = DecodeFixed32(tmp_str_);
Slice value_slice;
if ((int) value_size > value_str_len_) {
if (value_str_len_ >= 0) {
delete[] value_str_;
}
value_str_ = new char[value_size];
value_str_len_ = value_size;
}
s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
value_str_);
next_offset_ += value_size;
value_ = value_slice;
}
void SimpleTableIterator::Prev() {
assert(false);
}
Slice SimpleTableIterator::key() const {
Log(table_->rep_->options.info_log, "key!!!!");
return key_;
}
Slice SimpleTableIterator::value() const {
return value_;
}
Status SimpleTableIterator::status() const {
return status_;
}
class SimpleTableBuilder: public TableBuilder {
public:
// Create a builder that will store the contents of the table it is
// building in *file. Does not close the file. It is up to the
// caller to close the file after calling Finish(). The output file
// will be part of level specified by 'level'. A value of -1 means
// that the caller does not know which level the output file will reside.
SimpleTableBuilder(const Options& options, WritableFile* file,
CompressionType compression_type);
// REQUIRES: Either Finish() or Abandon() has been called.
~SimpleTableBuilder();
// Add key,value to the table being constructed.
// REQUIRES: key is after any previously added key according to comparator.
// REQUIRES: Finish(), Abandon() have not been called
void Add(const Slice& key, const Slice& value) override;
// Return non-ok iff some error has been detected.
Status status() const override;
// Finish building the table. Stops using the file passed to the
// constructor after this function returns.
// REQUIRES: Finish(), Abandon() have not been called
Status Finish() override;
// Indicate that the contents of this builder should be abandoned. Stops
// using the file passed to the constructor after this function returns.
// If the caller is not going to call Finish(), it must call Abandon()
// before destroying this builder.
// REQUIRES: Finish(), Abandon() have not been called
void Abandon() override;
// Number of calls to Add() so far.
uint64_t NumEntries() const override;
// Size of the file generated so far. If invoked after a successful
// Finish() call, returns the size of the final generated file.
uint64_t FileSize() const override;
private:
struct Rep;
Rep* rep_;
// No copying allowed
SimpleTableBuilder(const SimpleTableBuilder&) = delete;
void operator=(const SimpleTableBuilder&) = delete;
};
struct SimpleTableBuilder::Rep {
Options options;
WritableFile* file;
uint64_t offset = 0;
Status status;
uint64_t num_entries = 0;
bool closed = false; // Either Finish() or Abandon() has been called.
const static int user_key_size = 16;
const static int offset_length = 8;
const static int key_footer_len = 8;
static int GetInternalKeyLength() {
return user_key_size + key_footer_len;
}
std::string index;
Rep(const Options& opt, WritableFile* f) :
options(opt), file(f) {
}
~Rep() {
}
};
SimpleTableBuilder::SimpleTableBuilder(const Options& options,
WritableFile* file,
CompressionType compression_type) :
rep_(new SimpleTableBuilder::Rep(options, file)) {
}
SimpleTableBuilder::~SimpleTableBuilder() {
delete (rep_);
}
void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
assert((int ) key.size() == Rep::GetInternalKeyLength());
// Update index
rep_->index.append(key.data(), key.size());
PutFixed64(&(rep_->index), rep_->offset);
// Write key-value pair
rep_->file->Append(key);
rep_->offset += Rep::GetInternalKeyLength();
std::string size;
int value_size = value.size();
PutFixed32(&size, value_size);
Slice sizeSlice(size);
rep_->file->Append(sizeSlice);
rep_->file->Append(value);
rep_->offset += value_size + 4;
rep_->num_entries++;
}
Status SimpleTableBuilder::status() const {
return Status::OK();
}
Status SimpleTableBuilder::Finish() {
Rep* r = rep_;
assert(!r->closed);
r->closed = true;
uint64_t index_offset = rep_->offset;
Slice index_slice(rep_->index);
rep_->file->Append(index_slice);
rep_->offset += index_slice.size();
std::string index_offset_str;
PutFixed64(&index_offset_str, index_offset);
Slice foot_slice(index_offset_str);
rep_->file->Append(foot_slice);
rep_->offset += foot_slice.size();
return Status::OK();
}
void SimpleTableBuilder::Abandon() {
rep_->closed = true;
}
uint64_t SimpleTableBuilder::NumEntries() const {
return rep_->num_entries;
}
uint64_t SimpleTableBuilder::FileSize() const {
return rep_->offset;
}
class SimpleTableFactory: public TableFactory {
public:
~SimpleTableFactory() {
}
SimpleTableFactory() {
}
const char* Name() const override {
return "SimpleTable";
}
Status GetTableReader(const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file,
uint64_t file_size,
unique_ptr<TableReader>* table_reader) const;
TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
CompressionType compression_type) const;
};
Status SimpleTableFactory::GetTableReader(
const Options& options, const EnvOptions& soptions,
unique_ptr<RandomAccessFile> && file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const {
return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
table_reader);
}
TableBuilder* SimpleTableFactory::GetTableBuilder(
const Options& options, WritableFile* file,
CompressionType compression_type) const {
return new SimpleTableBuilder(options, file, compression_type);
}
class SimpleTableDBTest {
protected:
public:
std::string dbname_;
Env* env_;
DB* db_;
Options last_options_;
SimpleTableDBTest() :
env_(Env::Default()) {
dbname_ = test::TmpDir() + "/simple_table_db_test";
ASSERT_OK(DestroyDB(dbname_, Options()));
db_ = nullptr;
Reopen();
}
~SimpleTableDBTest() {
delete db_;
ASSERT_OK(DestroyDB(dbname_, Options()));
}
// Return the current option configuration.
Options CurrentOptions() {
Options options;
options.table_factory.reset(new SimpleTableFactory());
return options;
}
DBImpl* dbfull() {
return reinterpret_cast<DBImpl*>(db_);
}
void Reopen(Options* options = nullptr) {
ASSERT_OK(TryReopen(options));
}
void Close() {
delete db_;
db_ = nullptr;
}
void DestroyAndReopen(Options* options = nullptr) {
//Destroy using last options
Destroy(&last_options_);
ASSERT_OK(TryReopen(options));
}
void Destroy(Options* options) {
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, *options));
}
Status PureReopen(Options* options, DB** db) {
return DB::Open(*options, dbname_, db);
}
Status TryReopen(Options* options = nullptr) {
delete db_;
db_ = nullptr;
Options opts;
if (options != nullptr) {
opts = *options;
} else {
opts = CurrentOptions();
opts.create_if_missing = true;
}
last_options_ = opts;
return DB::Open(opts, dbname_, &db_);
}
Status Put(const Slice& k, const Slice& v) {
return db_->Put(WriteOptions(), k, v);
}
Status Delete(const std::string& k) {
return db_->Delete(WriteOptions(), k);
}
std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
ReadOptions options;
options.snapshot = snapshot;
std::string result;
Status s = db_->Get(options, k, &result);
if (s.IsNotFound()) {
result = "NOT_FOUND";
} else if (!s.ok()) {
result = s.ToString();
}
return result;
}
int NumTableFilesAtLevel(int level) {
std::string property;
ASSERT_TRUE(
db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
&property));
return atoi(property.c_str());
}
// Return spread of files per level
std::string FilesPerLevel() {
std::string result;
int last_non_zero_offset = 0;
for (int level = 0; level < db_->NumberLevels(); level++) {
int f = NumTableFilesAtLevel(level);
char buf[100];
snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
result += buf;
if (f > 0) {
last_non_zero_offset = result.size();
}
}
result.resize(last_non_zero_offset);
return result;
}
std::string IterStatus(Iterator* iter) {
std::string result;
if (iter->Valid()) {
result = iter->key().ToString() + "->" + iter->value().ToString();
} else {
result = "(invalid)";
}
return result;
}
};
TEST(SimpleTableDBTest, Empty) {
ASSERT_TRUE(db_ != nullptr);
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}
TEST(SimpleTableDBTest, ReadWrite) {
ASSERT_OK(Put("0000000000000foo", "v1"));
ASSERT_EQ("v1", Get("0000000000000foo"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("0000000000000foo", "v3"));
ASSERT_EQ("v3", Get("0000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
TEST(SimpleTableDBTest, Flush) {
ASSERT_OK(Put("0000000000000foo", "v1"));
ASSERT_OK(Put("0000000000000bar", "v2"));
ASSERT_OK(Put("0000000000000foo", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000foo"));
ASSERT_EQ("v2", Get("0000000000000bar"));
}
TEST(SimpleTableDBTest, Flush2) {
ASSERT_OK(Put("0000000000000bar", "b"));
ASSERT_OK(Put("0000000000000foo", "v1"));
dbfull()->TEST_FlushMemTable();
ASSERT_OK(Put("0000000000000foo", "v2"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v2", Get("0000000000000foo"));
ASSERT_OK(Put("0000000000000eee", "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v3", Get("0000000000000eee"));
ASSERT_OK(Delete("0000000000000bar"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
ASSERT_OK(Put("0000000000000eee", "v5"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v5", Get("0000000000000eee"));
}
static std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key_______%06d", i);
return std::string(buf);
}
static std::string RandomString(Random* rnd, int len) {
std::string r;
test::RandomString(rnd, len, &r);
return r;
}
TEST(SimpleTableDBTest, CompactionTrigger) {
Options options = CurrentOptions();
options.write_buffer_size = 100 << 10; //100KB
options.num_levels = 3;
options.max_mem_compaction_level = 0;
options.level0_file_num_compaction_trigger = 3;
Reopen(&options);
Random rnd(301);
for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
num++) {
std::vector<std::string> values;
// Write 120KB (12 values, each 10K)
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForFlushMemTable();
ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
}
//generate one more file in level-0, and should trigger level-0 compaction
std::vector<std::string> values;
for (int i = 0; i < 12; i++) {
values.push_back(RandomString(&rnd, 10000));
ASSERT_OK(Put(Key(i), values[i]));
}
dbfull()->TEST_WaitForCompact();
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

416
db/skiplist.h Normal file
View File

@ -0,0 +1,416 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread safety
// -------------
//
// Writes require external synchronization, most likely a mutex.
// Reads require a guarantee that the SkipList will not be destroyed
// while the read is in progress. Apart from that, reads progress
// without any internal locking or synchronization.
//
// Invariants:
//
// (1) Allocated nodes are never deleted until the SkipList is
// destroyed. This is trivially guaranteed by the code since we
// never delete any skip list nodes.
//
// (2) The contents of a Node except for the next/prev pointers are
// immutable after the Node has been linked into the SkipList.
// Only Insert() modifies the list, and it is careful to initialize
// a node and use release-stores to publish the nodes in one or
// more lists.
//
// ... prev vs. next pointer ordering ...
//
#pragma once
#include <assert.h>
#include <stdlib.h>
#include "port/port.h"
#include "util/random.h"
namespace rocksdb {
template<typename Key, class Comparator>
class SkipList {
private:
struct Node;
public:
// Create a new SkipList object that will use "cmp" for comparing keys,
// and will allocate memory using "*arena". Objects allocated in the arena
// must remain allocated for the lifetime of the skiplist object.
explicit SkipList(Comparator cmp, Arena* arena);
// Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list.
void Insert(const Key& key);
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const Key& key) const;
// Iteration over the contents of a skip list
class Iterator {
public:
// Initialize an iterator over the specified list.
// The returned iterator is not valid.
explicit Iterator(const SkipList* list);
// Change the underlying skiplist used for this iterator
// This enables us not changing the iterator without deallocating
// an old one and then allocating a new one
void SetList(const SkipList* list);
// Returns true iff the iterator is positioned at a valid node.
bool Valid() const;
// Returns the key at the current position.
// REQUIRES: Valid()
const Key& key() const;
// Advances to the next position.
// REQUIRES: Valid()
void Next();
// Advances to the previous position.
// REQUIRES: Valid()
void Prev();
// Advance to the first entry with a key >= target
void Seek(const Key& target);
// Position at the first entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToFirst();
// Position at the last entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast();
private:
const SkipList* list_;
Node* node_;
// Intentionally copyable
};
private:
enum { kMaxHeight = 12 };
// Immutable after construction
Comparator const compare_;
Arena* const arena_; // Arena used for allocations of nodes
Node* const head_;
// Modified only by Insert(). Read racily by readers, but stale
// values are ok.
port::AtomicPointer max_height_; // Height of the entire list
// Used for optimizing sequential insert patterns
Node* prev_[kMaxHeight];
int prev_height_;
inline int GetMaxHeight() const {
return static_cast<int>(
reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
}
// Read/written only by Insert().
Random rnd_;
Node* NewNode(const Key& key, int height);
int RandomHeight();
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
// Return true if key is greater than the data stored in "n"
bool KeyIsAfterNode(const Key& key, Node* n) const;
// Return the earliest node that comes at or after key.
// Return nullptr if there is no such node.
//
// If prev is non-nullptr, fills prev[level] with pointer to previous
// node at "level" for every level in [0..max_height_-1].
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
// Return the latest node with a key < key.
// Return head_ if there is no such node.
Node* FindLessThan(const Key& key) const;
// Return the last node in the list.
// Return head_ if list is empty.
Node* FindLast() const;
// No copying allowed
SkipList(const SkipList&);
void operator=(const SkipList&);
};
// Implementation details follow
template<typename Key, class Comparator>
struct SkipList<Key,Comparator>::Node {
explicit Node(const Key& k) : key(k) { }
Key const key;
// Accessors/mutators for links. Wrapped in methods so we can
// add the appropriate barriers as necessary.
Node* Next(int n) {
assert(n >= 0);
// Use an 'acquire load' so that we observe a fully initialized
// version of the returned Node.
return reinterpret_cast<Node*>(next_[n].Acquire_Load());
}
void SetNext(int n, Node* x) {
assert(n >= 0);
// Use a 'release store' so that anybody who reads through this
// pointer observes a fully initialized version of the inserted node.
next_[n].Release_Store(x);
}
// No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next(int n) {
assert(n >= 0);
return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
}
void NoBarrier_SetNext(int n, Node* x) {
assert(n >= 0);
next_[n].NoBarrier_Store(x);
}
private:
// Array of length equal to the node height. next_[0] is lowest level link.
port::AtomicPointer next_[1];
};
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
char* mem = arena_->AllocateAligned(
sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
return new (mem) Node(key);
}
template<typename Key, class Comparator>
inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
SetList(list);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SetList(const SkipList* list) {
list_ = list;
node_ = nullptr;
}
template<typename Key, class Comparator>
inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
return node_ != nullptr;
}
template<typename Key, class Comparator>
inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
assert(Valid());
return node_->key;
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Next() {
assert(Valid());
node_ = node_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Prev() {
// Instead of using explicit "prev" links, we just search for the
// last node that falls before key.
assert(Valid());
node_ = list_->FindLessThan(node_->key);
if (node_ == list_->head_) {
node_ = nullptr;
}
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
node_ = list_->FindGreaterOrEqual(target, nullptr);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
node_ = list_->head_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
node_ = list_->FindLast();
if (node_ == list_->head_) {
node_ = nullptr;
}
}
template<typename Key, class Comparator>
int SkipList<Key,Comparator>::RandomHeight() {
// Increase height with probability 1 in kBranching
static const unsigned int kBranching = 4;
int height = 1;
while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
height++;
}
assert(height > 0);
assert(height <= kMaxHeight);
return height;
}
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
// nullptr n is considered infinite
return (n != nullptr) && (compare_(n->key, key) < 0);
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
const {
// Use prev as an optimization hint and fallback to slow path
if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
Node* x = prev[0];
Node* next = x->Next(0);
if ((x == head_) || KeyIsAfterNode(key, x)) {
// Adjust all relevant insertion points to the previous entry
for (int i = 1; i < prev_height_; i++) {
prev[i] = x;
}
return next;
}
}
// Normal lookup
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
// Make sure the lists are sorted.
// If x points to head_ or next points nullptr, it is trivially satisfied.
assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
if (KeyIsAfterNode(key, next)) {
// Keep searching in this list
x = next;
} else {
if (prev != nullptr) prev[level] = x;
if (level == 0) {
return next;
} else {
// Switch to next list
level--;
}
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
assert(x == head_ || compare_(x->key, key) < 0);
Node* next = x->Next(level);
if (next == nullptr || compare_(next->key, key) >= 0) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
if (next == nullptr) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
: compare_(cmp),
arena_(arena),
head_(NewNode(0 /* any key will do */, kMaxHeight)),
max_height_(reinterpret_cast<void*>(1)),
prev_height_(1),
rnd_(0xdeadbeef) {
for (int i = 0; i < kMaxHeight; i++) {
head_->SetNext(i, nullptr);
prev_[i] = head_;
}
}
template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
// here since Insert() is externally synchronized.
Node* x = FindGreaterOrEqual(key, prev_);
// Our data structure does not allow duplicate insertion
assert(x == nullptr || !Equal(key, x->key));
int height = RandomHeight();
if (height > GetMaxHeight()) {
for (int i = GetMaxHeight(); i < height; i++) {
prev_[i] = head_;
}
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
// It is ok to mutate max_height_ without any synchronization
// with concurrent readers. A concurrent reader that observes
// the new value of max_height_ will see either the old value of
// new level pointers from head_ (nullptr), or a new value set in
// the loop below. In the former case the reader will
// immediately drop to the next level since nullptr sorts after all
// keys. In the latter case the reader will use the new node.
max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
}
x = NewNode(key, height);
for (int i = 0; i < height; i++) {
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
prev_[i]->SetNext(i, x);
}
prev_[0] = x;
prev_height_ = height;
}
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::Contains(const Key& key) const {
Node* x = FindGreaterOrEqual(key, nullptr);
if (x != nullptr && Equal(key, x->key)) {
return true;
} else {
return false;
}
}
} // namespace rocksdb

383
db/skiplist_test.cc Normal file
View File

@ -0,0 +1,383 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/skiplist.h"
#include <set>
#include "rocksdb/env.h"
#include "util/arena_impl.h"
#include "util/hash.h"
#include "util/random.h"
#include "util/testharness.h"
namespace rocksdb {
typedef uint64_t Key;
struct TestComparator {
int operator()(const Key& a, const Key& b) const {
if (a < b) {
return -1;
} else if (a > b) {
return +1;
} else {
return 0;
}
}
};
class SkipTest { };
TEST(SkipTest, Empty) {
ArenaImpl arena_impl;
TestComparator cmp;
SkipList<Key, TestComparator> list(cmp, &arena_impl);
ASSERT_TRUE(!list.Contains(10));
SkipList<Key, TestComparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.SeekToFirst();
ASSERT_TRUE(!iter.Valid());
iter.Seek(100);
ASSERT_TRUE(!iter.Valid());
iter.SeekToLast();
ASSERT_TRUE(!iter.Valid());
}
TEST(SkipTest, InsertAndLookup) {
const int N = 2000;
const int R = 5000;
Random rnd(1000);
std::set<Key> keys;
ArenaImpl arena_impl;
TestComparator cmp;
SkipList<Key, TestComparator> list(cmp, &arena_impl);
for (int i = 0; i < N; i++) {
Key key = rnd.Next() % R;
if (keys.insert(key).second) {
list.Insert(key);
}
}
for (int i = 0; i < R; i++) {
if (list.Contains(i)) {
ASSERT_EQ(keys.count(i), 1U);
} else {
ASSERT_EQ(keys.count(i), 0U);
}
}
// Simple iterator tests
{
SkipList<Key, TestComparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.Seek(0);
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToFirst();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToLast();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.rbegin()), iter.key());
}
// Forward iteration test
for (int i = 0; i < R; i++) {
SkipList<Key, TestComparator>::Iterator iter(&list);
iter.Seek(i);
// Compare against model iterator
std::set<Key>::iterator model_iter = keys.lower_bound(i);
for (int j = 0; j < 3; j++) {
if (model_iter == keys.end()) {
ASSERT_TRUE(!iter.Valid());
break;
} else {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
++model_iter;
iter.Next();
}
}
}
// Backward iteration test
{
SkipList<Key, TestComparator>::Iterator iter(&list);
iter.SeekToLast();
// Compare against model iterator
for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
model_iter != keys.rend();
++model_iter) {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
iter.Prev();
}
ASSERT_TRUE(!iter.Valid());
}
}
// We want to make sure that with a single writer and multiple
// concurrent readers (with no synchronization other than when a
// reader's iterator is created), the reader always observes all the
// data that was present in the skip list when the iterator was
// constructor. Because insertions are happening concurrently, we may
// also observe new values that were inserted since the iterator was
// constructed, but we should never miss any values that were present
// at iterator construction time.
//
// We generate multi-part keys:
// <key,gen,hash>
// where:
// key is in range [0..K-1]
// gen is a generation number for key
// hash is hash(key,gen)
//
// The insertion code picks a random key, sets gen to be 1 + the last
// generation number inserted for that key, and sets hash to Hash(key,gen).
//
// At the beginning of a read, we snapshot the last inserted
// generation number for each key. We then iterate, including random
// calls to Next() and Seek(). For every key we encounter, we
// check that it is either expected given the initial snapshot or has
// been concurrently added since the iterator started.
class ConcurrentTest {
private:
static const uint32_t K = 4;
static uint64_t key(Key key) { return (key >> 40); }
static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
static uint64_t hash(Key key) { return key & 0xff; }
static uint64_t HashNumbers(uint64_t k, uint64_t g) {
uint64_t data[2] = { k, g };
return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
}
static Key MakeKey(uint64_t k, uint64_t g) {
assert(sizeof(Key) == sizeof(uint64_t));
assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
assert(g <= 0xffffffffu);
return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
}
static bool IsValidKey(Key k) {
return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
}
static Key RandomTarget(Random* rnd) {
switch (rnd->Next() % 10) {
case 0:
// Seek to beginning
return MakeKey(0, 0);
case 1:
// Seek to end
return MakeKey(K, 0);
default:
// Seek to middle
return MakeKey(rnd->Next() % K, 0);
}
}
// Per-key generation
struct State {
port::AtomicPointer generation[K];
void Set(int k, intptr_t v) {
generation[k].Release_Store(reinterpret_cast<void*>(v));
}
intptr_t Get(int k) {
return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
}
State() {
for (unsigned int k = 0; k < K; k++) {
Set(k, 0);
}
}
};
// Current state of the test
State current_;
ArenaImpl arena_impl_;
// SkipList is not protected by mu_. We just use a single writer
// thread to modify it.
SkipList<Key, TestComparator> list_;
public:
ConcurrentTest() : list_(TestComparator(), &arena_impl_) { }
// REQUIRES: External synchronization
void WriteStep(Random* rnd) {
const uint32_t k = rnd->Next() % K;
const intptr_t g = current_.Get(k) + 1;
const Key key = MakeKey(k, g);
list_.Insert(key);
current_.Set(k, g);
}
void ReadStep(Random* rnd) {
// Remember the initial committed state of the skiplist.
State initial_state;
for (unsigned int k = 0; k < K; k++) {
initial_state.Set(k, current_.Get(k));
}
Key pos = RandomTarget(rnd);
SkipList<Key, TestComparator>::Iterator iter(&list_);
iter.Seek(pos);
while (true) {
Key current;
if (!iter.Valid()) {
current = MakeKey(K, 0);
} else {
current = iter.key();
ASSERT_TRUE(IsValidKey(current)) << current;
}
ASSERT_LE(pos, current) << "should not go backwards";
// Verify that everything in [pos,current) was not present in
// initial_state.
while (pos < current) {
ASSERT_LT(key(pos), K) << pos;
// Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0U) ||
(gen(pos) > (uint64_t)initial_state.Get(key(pos)))
) << "key: " << key(pos)
<< "; gen: " << gen(pos)
<< "; initgen: "
<< initial_state.Get(key(pos));
// Advance to next key in the valid key space
if (key(pos) < key(current)) {
pos = MakeKey(key(pos) + 1, 0);
} else {
pos = MakeKey(key(pos), gen(pos) + 1);
}
}
if (!iter.Valid()) {
break;
}
if (rnd->Next() % 2) {
iter.Next();
pos = MakeKey(key(pos), gen(pos) + 1);
} else {
Key new_target = RandomTarget(rnd);
if (new_target > pos) {
pos = new_target;
iter.Seek(new_target);
}
}
}
}
};
const uint32_t ConcurrentTest::K;
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding.
TEST(SkipTest, ConcurrentWithoutThreads) {
ConcurrentTest test;
Random rnd(test::RandomSeed());
for (int i = 0; i < 10000; i++) {
test.ReadStep(&rnd);
test.WriteStep(&rnd);
}
}
class TestState {
public:
ConcurrentTest t_;
int seed_;
port::AtomicPointer quit_flag_;
enum ReaderState {
STARTING,
RUNNING,
DONE
};
explicit TestState(int s)
: seed_(s),
quit_flag_(nullptr),
state_(STARTING),
state_cv_(&mu_) {}
void Wait(ReaderState s) {
mu_.Lock();
while (state_ != s) {
state_cv_.Wait();
}
mu_.Unlock();
}
void Change(ReaderState s) {
mu_.Lock();
state_ = s;
state_cv_.Signal();
mu_.Unlock();
}
private:
port::Mutex mu_;
ReaderState state_;
port::CondVar state_cv_;
};
static void ConcurrentReader(void* arg) {
TestState* state = reinterpret_cast<TestState*>(arg);
Random rnd(state->seed_);
int64_t reads = 0;
state->Change(TestState::RUNNING);
while (!state->quit_flag_.Acquire_Load()) {
state->t_.ReadStep(&rnd);
++reads;
}
state->Change(TestState::DONE);
}
static void RunConcurrent(int run) {
const int seed = test::RandomSeed() + (run * 100);
Random rnd(seed);
const int N = 1000;
const int kSize = 1000;
for (int i = 0; i < N; i++) {
if ((i % 100) == 0) {
fprintf(stderr, "Run %d of %d\n", i, N);
}
TestState state(seed + 1);
Env::Default()->Schedule(ConcurrentReader, &state);
state.Wait(TestState::RUNNING);
for (int i = 0; i < kSize; i++) {
state.t_.WriteStep(&rnd);
}
state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do
state.Wait(TestState::DONE);
}
}
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

86
db/snapshot.h Normal file
View File

@ -0,0 +1,86 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/db.h"
namespace rocksdb {
class SnapshotList;
// Snapshots are kept in a doubly-linked list in the DB.
// Each SnapshotImpl corresponds to a particular sequence number.
class SnapshotImpl : public Snapshot {
public:
SequenceNumber number_; // const after creation
private:
friend class SnapshotList;
// SnapshotImpl is kept in a doubly-linked circular list
SnapshotImpl* prev_;
SnapshotImpl* next_;
SnapshotList* list_; // just for sanity checks
};
class SnapshotList {
public:
SnapshotList() {
list_.prev_ = &list_;
list_.next_ = &list_;
list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
}
bool empty() const { return list_.next_ == &list_; }
SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
const SnapshotImpl* New(SequenceNumber seq) {
SnapshotImpl* s = new SnapshotImpl;
s->number_ = seq;
s->list_ = this;
s->next_ = &list_;
s->prev_ = list_.prev_;
s->prev_->next_ = s;
s->next_->prev_ = s;
return s;
}
void Delete(const SnapshotImpl* s) {
assert(s->list_ == this);
s->prev_->next_ = s->next_;
s->next_->prev_ = s->prev_;
delete s;
}
// retrieve all snapshot numbers. They are sorted in ascending order.
void getAll(std::vector<SequenceNumber>& ret) {
if (empty()) return;
SnapshotImpl* s = &list_;
while (s->next_ != &list_) {
ret.push_back(s->next_->number_);
s = s ->next_;
}
}
// get the sequence number of the most recent snapshot
const SequenceNumber GetNewest() {
if (empty()) {
return 0;
}
return newest()->number_;
}
private:
// Dummy head of doubly-linked list of snapshots
SnapshotImpl list_;
};
} // namespace rocksdb

173
db/table_cache.cc Normal file
View File

@ -0,0 +1,173 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/table_cache.h"
#include "db/filename.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "util/coding.h"
#include "util/stop_watch.h"
namespace rocksdb {
static void DeleteEntry(const Slice& key, void* value) {
TableReader* table_reader = reinterpret_cast<TableReader*>(value);
delete table_reader;
}
static void UnrefEntry(void* arg1, void* arg2) {
Cache* cache = reinterpret_cast<Cache*>(arg1);
Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
cache->Release(h);
}
TableCache::TableCache(const std::string& dbname,
const Options* options,
const EnvOptions& storage_options,
int entries)
: env_(options->env),
dbname_(dbname),
options_(options),
storage_options_(storage_options),
cache_(
NewLRUCache(entries, options->table_cache_numshardbits,
options->table_cache_remove_scan_count_limit)) {
}
TableCache::~TableCache() {
}
Status TableCache::FindTable(const EnvOptions& toptions,
uint64_t file_number, uint64_t file_size,
Cache::Handle** handle, bool* table_io,
const bool no_io) {
Status s;
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf));
*handle = cache_->Lookup(key);
if (*handle == nullptr) {
if (no_io) { // Dont do IO and return a not-found status
return Status::Incomplete("Table not found in table_cache, no_io is set");
}
if (table_io != nullptr) {
*table_io = true; // we had to do IO from storage
}
std::string fname = TableFileName(dbname_, file_number);
unique_ptr<RandomAccessFile> file;
unique_ptr<TableReader> table_reader;
s = env_->NewRandomAccessFile(fname, &file, toptions);
RecordTick(options_->statistics, NO_FILE_OPENS);
if (s.ok()) {
if (options_->advise_random_on_open) {
file->Hint(RandomAccessFile::RANDOM);
}
StopWatch sw(env_, options_->statistics, TABLE_OPEN_IO_MICROS);
s = options_->table_factory->GetTableReader(*options_, toptions,
std::move(file), file_size,
&table_reader);
}
if (!s.ok()) {
assert(table_reader == nullptr);
RecordTick(options_->statistics, NO_FILE_ERRORS);
// We do not cache error results so that if the error is transient,
// or somebody repairs the file, we recover automatically.
} else {
assert(file.get() == nullptr);
*handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
}
}
return s;
}
Iterator* TableCache::NewIterator(const ReadOptions& options,
const EnvOptions& toptions,
uint64_t file_number,
uint64_t file_size,
TableReader** table_reader_ptr,
bool for_compaction) {
if (table_reader_ptr != nullptr) {
*table_reader_ptr = nullptr;
}
Cache::Handle* handle = nullptr;
Status s = FindTable(toptions, file_number, file_size, &handle,
nullptr, options.read_tier == kBlockCacheTier);
if (!s.ok()) {
return NewErrorIterator(s);
}
TableReader* table_reader =
reinterpret_cast<TableReader*>(cache_->Value(handle));
Iterator* result = table_reader->NewIterator(options);
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
if (table_reader_ptr != nullptr) {
*table_reader_ptr = table_reader;
}
if (for_compaction) {
table_reader->SetupForCompaction();
}
return result;
}
Status TableCache::Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& k,
void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool),
bool* table_io,
void (*mark_key_may_exist)(void*)) {
Cache::Handle* handle = nullptr;
Status s = FindTable(storage_options_, file_number, file_size,
&handle, table_io,
options.read_tier == kBlockCacheTier);
if (s.ok()) {
TableReader* t =
reinterpret_cast<TableReader*>(cache_->Value(handle));
s = t->Get(options, k, arg, saver, mark_key_may_exist);
cache_->Release(handle);
} else if (options.read_tier && s.IsIncomplete()) {
// Couldnt find Table in cache but treat as kFound if no_io set
(*mark_key_may_exist)(arg);
return Status::OK();
}
return s;
}
bool TableCache::PrefixMayMatch(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& internal_prefix,
bool* table_io) {
Cache::Handle* handle = nullptr;
Status s = FindTable(storage_options_, file_number,
file_size, &handle, table_io);
bool may_match = true;
if (s.ok()) {
TableReader* t =
reinterpret_cast<TableReader*>(cache_->Value(handle));
may_match = t->PrefixMayMatch(internal_prefix);
cache_->Release(handle);
}
return may_match;
}
void TableCache::Evict(uint64_t file_number) {
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
cache_->Erase(Slice(buf, sizeof(buf)));
}
} // namespace rocksdb

78
db/table_cache.h Normal file
View File

@ -0,0 +1,78 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread-safe (provides internal synchronization)
#pragma once
#include <string>
#include <stdint.h>
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/cache.h"
#include "port/port.h"
#include "rocksdb/table.h"
namespace rocksdb {
class Env;
class TableCache {
public:
TableCache(const std::string& dbname, const Options* options,
const EnvOptions& storage_options, int entries);
~TableCache();
// Return an iterator for the specified file number (the corresponding
// file length must be exactly "file_size" bytes). If "tableptr" is
// non-nullptr, also sets "*tableptr" to point to the Table object
// underlying the returned iterator, or nullptr if no Table object underlies
// the returned iterator. The returned "*tableptr" object is owned by
// the cache and should not be deleted, and is valid for as long as the
// returned iterator is live.
Iterator* NewIterator(const ReadOptions& options,
const EnvOptions& toptions,
uint64_t file_number,
uint64_t file_size,
TableReader** table_reader_ptr = nullptr,
bool for_compaction = false);
// If a seek to internal key "k" in specified file finds an entry,
// call (*handle_result)(arg, found_key, found_value) repeatedly until
// it returns false.
Status Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& k,
void* arg,
bool (*handle_result)(void*, const Slice&, const Slice&, bool),
bool* table_io,
void (*mark_key_may_exist)(void*) = nullptr);
// Determine whether the table may contain the specified prefix. If
// the table index of blooms are not in memory, this may cause an I/O
bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
uint64_t file_size, const Slice& internal_prefix,
bool* table_io);
// Evict any entry for the specified file number
void Evict(uint64_t file_number);
private:
Env* const env_;
const std::string dbname_;
const Options* options_;
const EnvOptions& storage_options_;
std::shared_ptr<Cache> cache_;
Status FindTable(const EnvOptions& toptions, uint64_t file_number,
uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
const bool no_io = false);
};
} // namespace rocksdb

View File

@ -0,0 +1,55 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/table_stats_collector.h"
#include "db/dbformat.h"
#include "util/coding.h"
namespace rocksdb {
Status InternalKeyStatsCollector::Add(const Slice& key, const Slice& value) {
ParsedInternalKey ikey;
if (!ParseInternalKey(key, &ikey)) {
return Status::InvalidArgument("Invalid internal key");
}
if (ikey.type == ValueType::kTypeDeletion) {
++deleted_keys_;
}
return Status::OK();
}
Status InternalKeyStatsCollector::Finish(
TableStats::UserCollectedStats* stats) {
assert(stats);
assert(stats->find(InternalKeyTableStatsNames::kDeletedKeys) == stats->end());
std::string val;
PutVarint64(&val, deleted_keys_);
stats->insert(std::make_pair(InternalKeyTableStatsNames::kDeletedKeys, val));
return Status::OK();
}
Status UserKeyTableStatsCollector::Add(const Slice& key, const Slice& value) {
ParsedInternalKey ikey;
if (!ParseInternalKey(key, &ikey)) {
return Status::InvalidArgument("Invalid internal key");
}
return collector_->Add(ikey.user_key, value);
}
Status UserKeyTableStatsCollector::Finish(
TableStats::UserCollectedStats* stats) {
return collector_->Finish(stats);
}
const std::string InternalKeyTableStatsNames::kDeletedKeys
= "rocksdb.deleted.keys";
} // namespace rocksdb

View File

@ -0,0 +1,58 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// This file defines a collection of statistics collectors.
#pragma once
#include "rocksdb/table_stats.h"
#include <memory>
#include <string>
#include <vector>
namespace rocksdb {
struct InternalKeyTableStatsNames {
static const std::string kDeletedKeys;
};
// Collecting the statistics for internal keys. Visible only by internal
// rocksdb modules.
class InternalKeyStatsCollector : public TableStatsCollector {
public:
virtual Status Add(const Slice& key, const Slice& value);
virtual Status Finish(TableStats::UserCollectedStats* stats);
virtual const char* Name() const { return "InternalKeyStatsCollector"; }
private:
uint64_t deleted_keys_ = 0;
};
// When rocksdb creates a new table, it will encode all "user keys" into
// "internal keys", which contains meta information of a given entry.
//
// This class extracts user key from the encoded internal key when Add() is
// invoked.
class UserKeyTableStatsCollector : public TableStatsCollector {
public:
explicit UserKeyTableStatsCollector(TableStatsCollector* collector):
UserKeyTableStatsCollector(
std::shared_ptr<TableStatsCollector>(collector)
) {
}
explicit UserKeyTableStatsCollector(
std::shared_ptr<TableStatsCollector> collector) : collector_(collector) {
}
virtual ~UserKeyTableStatsCollector() { }
virtual Status Add(const Slice& key, const Slice& value);
virtual Status Finish(TableStats::UserCollectedStats* stats);
virtual const char* Name() const { return collector_->Name(); }
protected:
std::shared_ptr<TableStatsCollector> collector_;
};
} // namespace rocksdb

View File

@ -0,0 +1,261 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <map>
#include <memory>
#include <string>
#include "db/dbformat.h"
#include "db/db_impl.h"
#include "db/table_stats_collector.h"
#include "rocksdb/table_stats.h"
#include "rocksdb/table.h"
#include "util/coding.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
class TableStatsTest {
private:
unique_ptr<TableReader> table_reader_;
};
// TODO(kailiu) the following classes should be moved to some more general
// places, so that other tests can also make use of them.
// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
// and therefore enable us to quickly setup the tests.
class FakeWritableFile : public WritableFile {
public:
~FakeWritableFile() { }
const std::string& contents() const { return contents_; }
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
virtual Status Append(const Slice& data) {
contents_.append(data.data(), data.size());
return Status::OK();
}
private:
std::string contents_;
};
class FakeRandomeAccessFile : public RandomAccessFile {
public:
explicit FakeRandomeAccessFile(const Slice& contents)
: contents_(contents.data(), contents.size()) {
}
virtual ~FakeRandomeAccessFile() { }
uint64_t Size() const { return contents_.size(); }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
if (offset > contents_.size()) {
return Status::InvalidArgument("invalid Read offset");
}
if (offset + n > contents_.size()) {
n = contents_.size() - offset;
}
memcpy(scratch, &contents_[offset], n);
*result = Slice(scratch, n);
return Status::OK();
}
private:
std::string contents_;
};
class DumbLogger : public Logger {
public:
virtual void Logv(const char* format, va_list ap) { }
virtual size_t GetLogFileSize() const { return 0; }
};
// Utilities test functions
void MakeBuilder(
Options options,
std::unique_ptr<FakeWritableFile>* writable,
std::unique_ptr<TableBuilder>* builder) {
writable->reset(new FakeWritableFile);
if (!options.flush_block_policy_factory) {
options.SetUpDefaultFlushBlockPolicyFactory();
}
builder->reset(
options.table_factory->GetTableBuilder(options, writable->get(),
options.compression));
}
void OpenTable(
const Options& options,
const std::string& contents,
std::unique_ptr<TableReader>* table_reader) {
std::unique_ptr<RandomAccessFile> file(new FakeRandomeAccessFile(contents));
auto s = options.table_factory->GetTableReader(
options,
EnvOptions(),
std::move(file),
contents.size(),
table_reader
);
ASSERT_OK(s);
}
// Collects keys that starts with "A" in a table.
class RegularKeysStartWithA: public TableStatsCollector {
public:
const char* Name() const { return "RegularKeysStartWithA"; }
Status Finish(TableStats::UserCollectedStats* stats) {
std::string encoded;
PutVarint32(&encoded, count_);
*stats = TableStats::UserCollectedStats {
{ "TableStatsTest", "Rocksdb" },
{ "Count", encoded }
};
return Status::OK();
}
Status Add(const Slice& user_key, const Slice& value) {
// simply asssume all user keys are not empty.
if (user_key.data()[0] == 'A') {
++count_;
}
return Status::OK();
}
private:
uint32_t count_ = 0;
};
TEST(TableStatsTest, CustomizedTableStatsCollector) {
Options options;
// make sure the entries will be inserted with order.
std::map<std::string, std::string> kvs = {
{"About", "val5"}, // starts with 'A'
{"Abstract", "val2"}, // starts with 'A'
{"Around", "val7"}, // starts with 'A'
{"Beyond", "val3"},
{"Builder", "val1"},
{"Cancel", "val4"},
{"Find", "val6"},
};
// Test stats collectors with internal keys or regular keys
for (bool encode_as_internal : { true, false }) {
// -- Step 1: build table
auto collector = new RegularKeysStartWithA();
if (encode_as_internal) {
options.table_stats_collectors = {
std::make_shared<UserKeyTableStatsCollector>(collector)
};
} else {
options.table_stats_collectors.resize(1);
options.table_stats_collectors[0].reset(collector);
}
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
MakeBuilder(options, &writable, &builder);
for (const auto& kv : kvs) {
if (encode_as_internal) {
InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
builder->Add(ikey.Encode(), kv.second);
} else {
builder->Add(kv.first, kv.second);
}
}
ASSERT_OK(builder->Finish());
// -- Step 2: Open table
std::unique_ptr<TableReader> table_reader;
OpenTable(options, writable->contents(), &table_reader);
const auto& stats = table_reader->GetTableStats().user_collected_stats;
ASSERT_EQ("Rocksdb", stats.at("TableStatsTest"));
uint32_t starts_with_A = 0;
Slice key(stats.at("Count"));
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
ASSERT_EQ(3u, starts_with_A);
}
}
TEST(TableStatsTest, InternalKeyStatsCollector) {
InternalKey keys[] = {
InternalKey("A", 0, ValueType::kTypeValue),
InternalKey("B", 0, ValueType::kTypeValue),
InternalKey("C", 0, ValueType::kTypeValue),
InternalKey("W", 0, ValueType::kTypeDeletion),
InternalKey("X", 0, ValueType::kTypeDeletion),
InternalKey("Y", 0, ValueType::kTypeDeletion),
InternalKey("Z", 0, ValueType::kTypeDeletion),
};
for (bool sanitized : { false, true }) {
std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable;
Options options;
if (sanitized) {
options.table_stats_collectors = {
std::make_shared<RegularKeysStartWithA>()
};
// with sanitization, even regular stats collector will be able to
// handle internal keys.
auto comparator = options.comparator;
// HACK: Set options.info_log to avoid writing log in
// SanitizeOptions().
options.info_log = std::make_shared<DumbLogger>();
options = SanitizeOptions(
"db", // just a place holder
nullptr, // with skip internal key comparator
nullptr, // don't care filter policy
options
);
options.comparator = comparator;
} else {
options.table_stats_collectors = {
std::make_shared<InternalKeyStatsCollector>()
};
}
MakeBuilder(options, &writable, &builder);
for (const auto& k : keys) {
builder->Add(k.Encode(), "val");
}
ASSERT_OK(builder->Finish());
std::unique_ptr<TableReader> table_reader;
OpenTable(options, writable->contents(), &table_reader);
const auto& stats = table_reader->GetTableStats().user_collected_stats;
uint64_t deleted = 0;
Slice key(stats.at(InternalKeyTableStatsNames::kDeletedKeys));
ASSERT_TRUE(GetVarint64(&key, &deleted));
ASSERT_EQ(4u, deleted);
if (sanitized) {
uint32_t starts_with_A = 0;
Slice key(stats.at("Count"));
ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
ASSERT_EQ(1u, starts_with_A);
}
}
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

264
db/transaction_log_impl.cc Normal file
View File

@ -0,0 +1,264 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "db/transaction_log_impl.h"
#include "db/write_batch_internal.h"
namespace rocksdb {
TransactionLogIteratorImpl::TransactionLogIteratorImpl(
const std::string& dir,
const Options* options,
const EnvOptions& soptions,
const SequenceNumber seq,
std::unique_ptr<VectorLogPtr> files,
DBImpl const * const dbimpl) :
dir_(dir),
options_(options),
soptions_(soptions),
startingSequenceNumber_(seq),
files_(std::move(files)),
started_(false),
isValid_(false),
currentFileIndex_(0),
currentBatchSeq_(0),
currentLastSeq_(0),
dbimpl_(dbimpl) {
assert(files_ != nullptr);
assert(dbimpl_ != nullptr);
reporter_.env = options_->env;
reporter_.info_log = options_->info_log.get();
SeekToStartSequence(); // Seek till starting sequence
}
Status TransactionLogIteratorImpl::OpenLogFile(
const LogFile* logFile,
unique_ptr<SequentialFile>* file) {
Env* env = options_->env;
if (logFile->Type() == kArchivedLogFile) {
std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
return env->NewSequentialFile(fname, file, soptions_);
} else {
std::string fname = LogFileName(dir_, logFile->LogNumber());
Status status = env->NewSequentialFile(fname, file, soptions_);
if (!status.ok()) {
// If cannot open file in DB directory.
// Try the archive dir, as it could have moved in the meanwhile.
fname = ArchivedLogFileName(dir_, logFile->LogNumber());
status = env->NewSequentialFile(fname, file, soptions_);
if (!status.ok()) {
return Status::IOError("Requested file not present in the dir");
}
}
return status;
}
}
BatchResult TransactionLogIteratorImpl::GetBatch() {
assert(isValid_); // cannot call in a non valid state.
BatchResult result;
result.sequence = currentBatchSeq_;
result.writeBatchPtr = std::move(currentBatch_);
return result;
}
Status TransactionLogIteratorImpl::status() {
return currentStatus_;
}
bool TransactionLogIteratorImpl::Valid() {
return started_ && isValid_;
}
bool TransactionLogIteratorImpl::RestrictedRead(
Slice* record,
std::string* scratch) {
// Don't read if no more complete entries to read from logs
if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
return false;
}
return currentLogReader_->ReadRecord(record, scratch);
}
void TransactionLogIteratorImpl::SeekToStartSequence(
uint64_t startFileIndex,
bool strict) {
std::string scratch;
Slice record;
started_ = false;
isValid_ = false;
if (files_->size() <= startFileIndex) {
return;
}
Status s = OpenLogReader(files_->at(startFileIndex).get());
if (!s.ok()) {
currentStatus_ = s;
return;
}
while (RestrictedRead(&record, &scratch)) {
if (record.size() < 12) {
reporter_.Corruption(
record.size(), Status::Corruption("very small log record"));
continue;
}
UpdateCurrentWriteBatch(record);
if (currentLastSeq_ >= startingSequenceNumber_) {
if (strict && currentBatchSeq_ != startingSequenceNumber_) {
currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
"seek to required sequence number");
reporter_.Info(currentStatus_.ToString().c_str());
return;
} else if (strict) {
reporter_.Info("Could seek required sequence number. Iterator will "
"continue.");
}
isValid_ = true;
started_ = true; // set started_ as we could seek till starting sequence
return;
} else {
isValid_ = false;
}
}
// Could not find start sequence in first file. Normally this must be the
// only file. Otherwise log the error and let the iterator return next entry
// If strict is set, we want to seek exactly till the start sequence and it
// should have been present in the file we scanned above
if (strict) {
currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
"seek to required sequence number");
reporter_.Info(currentStatus_.ToString().c_str());
} else if (files_->size() != 1) {
currentStatus_ = Status::Corruption("Start sequence was not found, "
"skipping to the next available");
reporter_.Info(currentStatus_.ToString().c_str());
// Let NextImpl find the next available entry. started_ remains false
// because we don't want to check for gaps while moving to start sequence
NextImpl(true);
}
}
void TransactionLogIteratorImpl::Next() {
return NextImpl(false);
}
void TransactionLogIteratorImpl::NextImpl(bool internal) {
std::string scratch;
Slice record;
isValid_ = false;
if (!internal && !started_) {
// Runs every time until we can seek to the start sequence
return SeekToStartSequence();
}
while(true) {
assert(currentLogReader_);
if (currentLogReader_->IsEOF()) {
currentLogReader_->UnmarkEOF();
}
while (RestrictedRead(&record, &scratch)) {
if (record.size() < 12) {
reporter_.Corruption(
record.size(), Status::Corruption("very small log record"));
continue;
} else {
// started_ should be true if called by application
assert(internal || started_);
// started_ should be false if called internally
assert(!internal || !started_);
UpdateCurrentWriteBatch(record);
if (internal && !started_) {
started_ = true;
}
return;
}
}
// Open the next file
if (currentFileIndex_ < files_->size() - 1) {
++currentFileIndex_;
Status status =OpenLogReader(files_->at(currentFileIndex_).get());
if (!status.ok()) {
isValid_ = false;
currentStatus_ = status;
return;
}
} else {
isValid_ = false;
if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
currentStatus_ = Status::OK();
} else {
currentStatus_ = Status::IOError("NO MORE DATA LEFT");
}
return;
}
}
}
bool TransactionLogIteratorImpl::IsBatchExpected(
const WriteBatch* batch,
const SequenceNumber expectedSeq) {
assert(batch);
SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
if (batchSeq != expectedSeq) {
char buf[200];
snprintf(buf, sizeof(buf),
"Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
"Last flushed seq=%lu.Log iterator will reseek the correct "
"batch.",
(unsigned long)batchSeq,
(unsigned long)expectedSeq,
(unsigned long)dbimpl_->GetLatestSequenceNumber());
reporter_.Info(buf);
return false;
}
return true;
}
void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
std::unique_ptr<WriteBatch> batch(new WriteBatch());
WriteBatchInternal::SetContents(batch.get(), record);
SequenceNumber expectedSeq = currentLastSeq_ + 1;
// If the iterator has started, then confirm that we get continuous batches
if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) {
// Seek to the batch having expected sequence number
if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) {
// Expected batch must lie in the previous log file
// Avoid underflow.
if (currentFileIndex_ != 0) {
currentFileIndex_--;
}
}
startingSequenceNumber_ = expectedSeq;
// currentStatus_ will be set to Ok if reseek succeeds
currentStatus_ = Status::NotFound("Gap in sequence numbers");
return SeekToStartSequence(currentFileIndex_, true);
}
currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
currentLastSeq_ = currentBatchSeq_ +
WriteBatchInternal::Count(batch.get()) - 1;
// currentBatchSeq_ can only change here
assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
currentBatch_ = move(batch);
isValid_ = true;
currentStatus_ = Status::OK();
}
Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
unique_ptr<SequentialFile> file;
Status status = OpenLogFile(logFile, &file);
if (!status.ok()) {
return status;
}
assert(file);
currentLogReader_.reset(
new log::Reader(std::move(file), &reporter_, true, 0)
);
return Status::OK();
}
} // namespace rocksdb

118
db/transaction_log_impl.h Normal file
View File

@ -0,0 +1,118 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include <vector>
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"
#include "db/db_impl.h"
#include "db/log_reader.h"
#include "db/filename.h"
namespace rocksdb {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
virtual void Corruption(size_t bytes, const Status& s) {
Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
}
virtual void Info(const char* s) {
Log(info_log, "%s", s);
}
};
class LogFileImpl : public LogFile {
public:
LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
uint64_t sizeBytes) :
logNumber_(logNum),
type_(logType),
startSequence_(startSeq),
sizeFileBytes_(sizeBytes) {
}
std::string PathName() const {
if (type_ == kArchivedLogFile) {
return ArchivedLogFileName("", logNumber_);
}
return LogFileName("", logNumber_);
}
uint64_t LogNumber() const { return logNumber_; }
WalFileType Type() const { return type_; }
SequenceNumber StartSequence() const { return startSequence_; }
uint64_t SizeFileBytes() const { return sizeFileBytes_; }
bool operator < (const LogFile& that) const {
return LogNumber() < that.LogNumber();
}
private:
uint64_t logNumber_;
WalFileType type_;
SequenceNumber startSequence_;
uint64_t sizeFileBytes_;
};
class TransactionLogIteratorImpl : public TransactionLogIterator {
public:
TransactionLogIteratorImpl(const std::string& dir,
const Options* options,
const EnvOptions& soptions,
const SequenceNumber seqNum,
std::unique_ptr<VectorLogPtr> files,
DBImpl const * const dbimpl);
virtual bool Valid();
virtual void Next();
virtual Status status();
virtual BatchResult GetBatch();
private:
const std::string& dir_;
const Options* options_;
const EnvOptions& soptions_;
SequenceNumber startingSequenceNumber_;
std::unique_ptr<VectorLogPtr> files_;
bool started_;
bool isValid_; // not valid when it starts of.
Status currentStatus_;
size_t currentFileIndex_;
std::unique_ptr<WriteBatch> currentBatch_;
unique_ptr<log::Reader> currentLogReader_;
Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
LogReporter reporter_;
SequenceNumber currentBatchSeq_; // sequence number at start of current batch
SequenceNumber currentLastSeq_; // last sequence in the current batch
DBImpl const * const dbimpl_; // The db on whose log files this iterates
// Reads from transaction log only if the writebatch record has been written
bool RestrictedRead(Slice* record, std::string* scratch);
// Seeks to startingSequenceNumber reading from startFileIndex in files_.
// If strict is set,then must get a batch starting with startingSequenceNumber
void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
// Implementation of Next. SeekToStartSequence calls it internally with
// internal=true to let it find next entry even if it has to jump gaps because
// the iterator may start off from the first available entry but promises to
// be continuous after that
void NextImpl(bool internal = false);
// Check if batch is expected, else return false
bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq);
// Update current batch if a continuous batch is found, else return false
void UpdateCurrentWriteBatch(const Slice& record);
Status OpenLogReader(const LogFile* file);
};
} // namespace rocksdb

301
db/version_edit.cc Normal file
View File

@ -0,0 +1,301 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "db/version_set.h"
#include "util/coding.h"
namespace rocksdb {
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
enum Tag {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
kCompactPointer = 5,
kDeletedFile = 6,
kNewFile = 7,
// 8 was used for large value refs
kPrevLogNumber = 9,
// these are new formats divergent from open source leveldb
kNewFile2 = 100 // store smallest & largest seqno
};
void VersionEdit::Clear() {
comparator_.clear();
log_number_ = 0;
prev_log_number_ = 0;
last_sequence_ = 0;
next_file_number_ = 0;
has_comparator_ = false;
has_log_number_ = false;
has_prev_log_number_ = false;
has_next_file_number_ = false;
has_last_sequence_ = false;
deleted_files_.clear();
new_files_.clear();
}
void VersionEdit::EncodeTo(std::string* dst) const {
if (has_comparator_) {
PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_);
}
if (has_log_number_) {
PutVarint32(dst, kLogNumber);
PutVarint64(dst, log_number_);
}
if (has_prev_log_number_) {
PutVarint32(dst, kPrevLogNumber);
PutVarint64(dst, prev_log_number_);
}
if (has_next_file_number_) {
PutVarint32(dst, kNextFileNumber);
PutVarint64(dst, next_file_number_);
}
if (has_last_sequence_) {
PutVarint32(dst, kLastSequence);
PutVarint64(dst, last_sequence_);
}
for (size_t i = 0; i < compact_pointers_.size(); i++) {
PutVarint32(dst, kCompactPointer);
PutVarint32(dst, compact_pointers_[i].first); // level
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
PutVarint32(dst, kDeletedFile);
PutVarint32(dst, iter->first); // level
PutVarint64(dst, iter->second); // file number
}
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
PutVarint32(dst, kNewFile2);
PutVarint32(dst, new_files_[i].first); // level
PutVarint64(dst, f.number);
PutVarint64(dst, f.file_size);
PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode());
PutVarint64(dst, f.smallest_seqno);
PutVarint64(dst, f.largest_seqno);
}
}
static bool GetInternalKey(Slice* input, InternalKey* dst) {
Slice str;
if (GetLengthPrefixedSlice(input, &str)) {
dst->DecodeFrom(str);
return true;
} else {
return false;
}
}
bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
uint32_t v;
if (GetVarint32(input, &v) &&
(int)v < number_levels_) {
*level = v;
return true;
} else {
if ((int)v >= number_levels_) {
*msg = "db already has more levels than options.num_levels";
}
return false;
}
}
Status VersionEdit::DecodeFrom(const Slice& src) {
Clear();
Slice input = src;
const char* msg = nullptr;
uint32_t tag;
// Temporary storage for parsing
int level;
uint64_t number;
FileMetaData f;
Slice str;
InternalKey key;
while (msg == nullptr && GetVarint32(&input, &tag)) {
switch (tag) {
case kComparator:
if (GetLengthPrefixedSlice(&input, &str)) {
comparator_ = str.ToString();
has_comparator_ = true;
} else {
msg = "comparator name";
}
break;
case kLogNumber:
if (GetVarint64(&input, &log_number_)) {
has_log_number_ = true;
} else {
msg = "log number";
}
break;
case kPrevLogNumber:
if (GetVarint64(&input, &prev_log_number_)) {
has_prev_log_number_ = true;
} else {
msg = "previous log number";
}
break;
case kNextFileNumber:
if (GetVarint64(&input, &next_file_number_)) {
has_next_file_number_ = true;
} else {
msg = "next file number";
}
break;
case kLastSequence:
if (GetVarint64(&input, &last_sequence_)) {
has_last_sequence_ = true;
} else {
msg = "last sequence number";
}
break;
case kCompactPointer:
if (GetLevel(&input, &level, &msg) &&
GetInternalKey(&input, &key)) {
compact_pointers_.push_back(std::make_pair(level, key));
} else {
if (!msg) {
msg = "compaction pointer";
}
}
break;
case kDeletedFile:
if (GetLevel(&input, &level, &msg) &&
GetVarint64(&input, &number)) {
deleted_files_.insert(std::make_pair(level, number));
} else {
if (!msg) {
msg = "deleted file";
}
}
break;
case kNewFile:
if (GetLevel(&input, &level, &msg) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest)) {
new_files_.push_back(std::make_pair(level, f));
} else {
if (!msg) {
msg = "new-file entry";
}
}
break;
case kNewFile2:
if (GetLevel(&input, &level, &msg) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest) &&
GetVarint64(&input, &f.smallest_seqno) &&
GetVarint64(&input, &f.largest_seqno) ) {
new_files_.push_back(std::make_pair(level, f));
} else {
if (!msg) {
msg = "new-file2 entry";
}
}
break;
default:
msg = "unknown tag";
break;
}
}
if (msg == nullptr && !input.empty()) {
msg = "invalid tag";
}
Status result;
if (msg != nullptr) {
result = Status::Corruption("VersionEdit", msg);
}
return result;
}
std::string VersionEdit::DebugString(bool hex_key) const {
std::string r;
r.append("VersionEdit {");
if (has_comparator_) {
r.append("\n Comparator: ");
r.append(comparator_);
}
if (has_log_number_) {
r.append("\n LogNumber: ");
AppendNumberTo(&r, log_number_);
}
if (has_prev_log_number_) {
r.append("\n PrevLogNumber: ");
AppendNumberTo(&r, prev_log_number_);
}
if (has_next_file_number_) {
r.append("\n NextFile: ");
AppendNumberTo(&r, next_file_number_);
}
if (has_last_sequence_) {
r.append("\n LastSeq: ");
AppendNumberTo(&r, last_sequence_);
}
for (size_t i = 0; i < compact_pointers_.size(); i++) {
r.append("\n CompactPointer: ");
AppendNumberTo(&r, compact_pointers_[i].first);
r.append(" ");
r.append(compact_pointers_[i].second.DebugString(hex_key));
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
r.append("\n DeleteFile: ");
AppendNumberTo(&r, iter->first);
r.append(" ");
AppendNumberTo(&r, iter->second);
}
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
r.append("\n AddFile: ");
AppendNumberTo(&r, new_files_[i].first);
r.append(" ");
AppendNumberTo(&r, f.number);
r.append(" ");
AppendNumberTo(&r, f.file_size);
r.append(" ");
r.append(f.smallest.DebugString(hex_key));
r.append(" .. ");
r.append(f.largest.DebugString(hex_key));
}
r.append("\n}\n");
return r;
}
} // namespace rocksdb

128
db/version_edit.h Normal file
View File

@ -0,0 +1,128 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <set>
#include <utility>
#include <vector>
#include "db/dbformat.h"
namespace rocksdb {
class VersionSet;
struct FileMetaData {
int refs;
int allowed_seeks; // Seeks allowed until compaction
uint64_t number;
uint64_t file_size; // File size in bytes
InternalKey smallest; // Smallest internal key served by table
InternalKey largest; // Largest internal key served by table
bool being_compacted; // Is this file undergoing compaction?
SequenceNumber smallest_seqno;// The smallest seqno in this file
SequenceNumber largest_seqno; // The largest seqno in this file
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0),
being_compacted(false) { }
};
class VersionEdit {
public:
explicit VersionEdit(int number_levels) :
number_levels_(number_levels) {
Clear();
}
~VersionEdit() { }
void Clear();
void SetComparatorName(const Slice& name) {
has_comparator_ = true;
comparator_ = name.ToString();
}
void SetLogNumber(uint64_t num) {
has_log_number_ = true;
log_number_ = num;
}
void SetPrevLogNumber(uint64_t num) {
has_prev_log_number_ = true;
prev_log_number_ = num;
}
void SetNextFile(uint64_t num) {
has_next_file_number_ = true;
next_file_number_ = num;
}
void SetLastSequence(SequenceNumber seq) {
has_last_sequence_ = true;
last_sequence_ = seq;
}
void SetCompactPointer(int level, const InternalKey& key) {
compact_pointers_.push_back(std::make_pair(level, key));
}
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
void AddFile(int level, uint64_t file,
uint64_t file_size,
const InternalKey& smallest,
const InternalKey& largest,
const SequenceNumber& smallest_seqno,
const SequenceNumber& largest_seqno) {
FileMetaData f;
f.number = file;
f.file_size = file_size;
f.smallest = smallest;
f.largest = largest;
f.smallest_seqno = smallest_seqno;
f.largest_seqno = largest_seqno;
assert(smallest_seqno <= largest_seqno);
new_files_.push_back(std::make_pair(level, f));
}
// Delete the specified "file" from the specified "level".
void DeleteFile(int level, uint64_t file) {
deleted_files_.insert(std::make_pair(level, file));
}
// Number of edits
int NumEntries() {
return new_files_.size() + deleted_files_.size();
}
void EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src);
std::string DebugString(bool hex_key = false) const;
private:
friend class VersionSet;
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
bool GetLevel(Slice* input, int* level, const char** msg);
int number_levels_;
std::string comparator_;
uint64_t log_number_;
uint64_t prev_log_number_;
uint64_t next_file_number_;
SequenceNumber last_sequence_;
bool has_comparator_;
bool has_log_number_;
bool has_prev_log_number_;
bool has_next_file_number_;
bool has_last_sequence_;
std::vector< std::pair<int, InternalKey> > compact_pointers_;
DeletedFileSet deleted_files_;
std::vector< std::pair<int, FileMetaData> > new_files_;
};
} // namespace rocksdb

53
db/version_edit_test.cc Normal file
View File

@ -0,0 +1,53 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "util/testharness.h"
namespace rocksdb {
static void TestEncodeDecode(const VersionEdit& edit) {
std::string encoded, encoded2;
edit.EncodeTo(&encoded);
VersionEdit parsed(7);
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
parsed.EncodeTo(&encoded2);
ASSERT_EQ(encoded, encoded2);
}
class VersionEditTest { };
TEST(VersionEditTest, EncodeDecode) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit(7);
for (int i = 0; i < 4; i++) {
TestEncodeDecode(edit);
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
InternalKey("foo", kBig + 500 + i, kTypeValue),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
kBig + 500 + i,
kBig + 600 + i);
edit.DeleteFile(4, kBig + 700 + i);
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
}
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
TestEncodeDecode(edit);
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

3152
db/version_set.cc Normal file

File diff suppressed because it is too large Load Diff

654
db/version_set.h Normal file
View File

@ -0,0 +1,654 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions. The
// newest version is called "current". Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level. The
// entire set of versions is maintained in a VersionSet.
//
// Version,VersionSet are thread-compatible, but require external
// synchronization on all accesses.
#pragma once
#include <map>
#include <memory>
#include <set>
#include <vector>
#include <deque>
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "port/port.h"
#include "db/table_cache.h"
namespace rocksdb {
namespace log { class Writer; }
class Compaction;
class Iterator;
class MemTable;
class TableCache;
class Version;
class VersionSet;
// Return the smallest index i such that files[i]->largest >= key.
// Return files.size() if there is no such file.
// REQUIRES: "files" contains a sorted list of non-overlapping files.
extern int FindFile(const InternalKeyComparator& icmp,
const std::vector<FileMetaData*>& files,
const Slice& key);
// Returns true iff some file in "files" overlaps the user key range
// [*smallest,*largest].
// smallest==nullptr represents a key smaller than all keys in the DB.
// largest==nullptr represents a key largest than all keys in the DB.
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
// in sorted order.
extern bool SomeFileOverlapsRange(
const InternalKeyComparator& icmp,
bool disjoint_sorted_files,
const std::vector<FileMetaData*>& files,
const Slice* smallest_user_key,
const Slice* largest_user_key);
class Version {
public:
// Append to *iters a sequence of iterators that will
// yield the contents of this Version when merged together.
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, const EnvOptions& soptions,
std::vector<Iterator*>* iters);
// Lookup the value for key. If found, store it in *val and
// return OK. Else return a non-OK status. Fills *stats.
// Uses *operands to store merge_operator operations to apply later
// REQUIRES: lock is not held
struct GetStats {
FileMetaData* seek_file;
int seek_file_level;
};
void Get(const ReadOptions&, const LookupKey& key, std::string* val,
Status* status, std::deque<std::string>* operands, GetStats* stats,
const Options& db_option,
bool* value_found = nullptr);
// Adds "stats" into the current state. Returns true if a new
// compaction may need to be triggered, false otherwise.
// REQUIRES: lock is held
bool UpdateStats(const GetStats& stats);
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
void Unref();
void GetOverlappingInputs(
int level,
const InternalKey* begin, // nullptr means before all keys
const InternalKey* end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
int hint_index = -1, // index of overlap file
int* file_index = nullptr); // return index of overlap file
void GetOverlappingInputsBinarySearch(
int level,
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
int hint_index, // index of overlap file
int* file_index); // return index of overlap file
void ExtendOverlappingInputs(
int level,
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
unsigned int index); // start extending from this index
// Returns true iff some file in the specified level overlaps
// some part of [*smallest_user_key,*largest_user_key].
// smallest_user_key==NULL represents a key smaller than all keys in the DB.
// largest_user_key==NULL represents a key largest than all keys in the DB.
bool OverlapInLevel(int level,
const Slice* smallest_user_key,
const Slice* largest_user_key);
// Returns true iff the first or last file in inputs contains
// an overlapping user key to the file "just outside" of it (i.e.
// just after the last file, or just before the first file)
// REQUIRES: "*inputs" is a sorted list of non-overlapping files
bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
int level);
// Return the level at which we should place a new memtable compaction
// result that covers the range [smallest_user_key,largest_user_key].
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key);
int NumFiles(int level) const { return files_[level].size(); }
// Return a human readable string that describes this version's contents.
std::string DebugString(bool hex = false) const;
// Returns the version nuber of this version
uint64_t GetVersionNumber() {
return version_number_;
}
private:
friend class Compaction;
friend class VersionSet;
friend class DBImpl;
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&,
const EnvOptions& soptions,
int level) const;
bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
const Slice& internal_prefix, Iterator* level_iter) const;
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version
// List of files per level, files in each level are arranged
// in increasing order of keys
std::vector<FileMetaData*>* files_;
// A list for the same set of files that are stored in files_,
// but files in each level are now sorted based on file
// size. The file with the largest size is at the front.
// This vector stores the index of the file from files_.
std::vector< std::vector<int> > files_by_size_;
// An index into files_by_size_ that specifies the first
// file that is not yet compacted
std::vector<int> next_file_to_compact_by_size_;
// Only the first few entries of files_by_size_ are sorted.
// There is no need to sort all the files because it is likely
// that on a running system, we need to look at only the first
// few largest files because a new version is created every few
// seconds/minutes (because of concurrent compactions).
static const int number_of_files_to_sort_ = 50;
// Next file to compact based on seek stats.
FileMetaData* file_to_compact_;
int file_to_compact_level_;
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
// The most critical level to be compacted is listed first
// These are used to pick the best compaction level
std::vector<double> compaction_score_;
std::vector<int> compaction_level_;
double max_compaction_score_; // max score in l1 to ln-1
int max_compaction_score_level_; // level on which max score occurs
// The offset in the manifest file where this version is stored.
uint64_t offset_manifest_file_;
// A version number that uniquely represents this version. This is
// used for debugging and logging purposes only.
uint64_t version_number_;
explicit Version(VersionSet* vset, uint64_t version_number = 0);
~Version();
// re-initializes the index that is used to offset into files_by_size_
// to find the next compaction candidate file.
void ResetNextCompactionIndex(int level) {
next_file_to_compact_by_size_[level] = 0;
}
// No copying allowed
Version(const Version&);
void operator=(const Version&);
};
class VersionSet {
public:
VersionSet(const std::string& dbname,
const Options* options,
const EnvOptions& storage_options,
TableCache* table_cache,
const InternalKeyComparator*);
~VersionSet();
// Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new
// current version. Will release *mu while actually writing to the file.
// REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply()
Status LogAndApply(VersionEdit* edit, port::Mutex* mu,
bool new_descriptor_log = false);
// Recover the last saved descriptor from persistent storage.
Status Recover();
// Try to reduce the number of levels. This call is valid when
// only one level from the new max level to the old
// max level containing files.
// For example, a db currently has 7 levels [0-6], and a call to
// to reduce to 5 [0-4] can only be executed when only one level
// among [4-6] contains files.
Status ReduceNumberOfLevels(int new_levels, port::Mutex* mu);
// Return the current version.
Version* current() const { return current_; }
// Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
// Allocate and return a new file number
uint64_t NewFileNumber() { return next_file_number_++; }
// Arrange to reuse "file_number" unless a newer file number has
// already been allocated.
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
void ReuseFileNumber(uint64_t file_number) {
if (next_file_number_ == file_number + 1) {
next_file_number_ = file_number;
}
}
// Return the number of Table files at the specified level.
int NumLevelFiles(int level) const;
// Return the combined file size of all files at the specified level.
int64_t NumLevelBytes(int level) const;
// Return the last sequence number.
uint64_t LastSequence() const { return last_sequence_; }
// Set the last sequence number to s.
void SetLastSequence(uint64_t s) {
assert(s >= last_sequence_);
last_sequence_ = s;
}
// Mark the specified file number as used.
void MarkFileNumberUsed(uint64_t number);
// Return the current log file number.
uint64_t LogNumber() const { return log_number_; }
// Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; }
int NumberLevels() const { return num_levels_; }
// Pick level and inputs for a new compaction.
// Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
Compaction* PickCompaction();
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns nullptr if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
Compaction* CompactRange(
int level,
const InternalKey* begin,
const InternalKey* end);
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t MaxNextLevelOverlappingBytes();
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
Iterator* MakeInputIterator(Compaction* c);
// Returns true iff some level needs a compaction because it has
// exceeded its target size.
bool NeedsSizeCompaction() const {
// In universal compaction case, this check doesn't really
// check the compaction condition, but checks num of files threshold
// only. We are not going to miss any compaction opportunity
// but it's likely that more compactions are scheduled but
// ending up with nothing to do. We can improve it later.
// TODO: improve this function to be accurate for universal
// compactions.
int num_levels_to_check =
(options_->compaction_style != kCompactionStyleUniversal) ?
NumberLevels() - 1 : 1;
for (int i = 0; i < num_levels_to_check; i++) {
if (current_->compaction_score_[i] >= 1) {
return true;
}
}
return false;
}
// Returns true iff some level needs a compaction.
bool NeedsCompaction() const {
return ((current_->file_to_compact_ != nullptr) ||
NeedsSizeCompaction());
}
// Returns the maxmimum compaction score for levels 1 to max
double MaxCompactionScore() const {
return current_->max_compaction_score_;
}
// See field declaration
int MaxCompactionScoreLevel() const {
return current_->max_compaction_score_level_;
}
// Add all files listed in any live version to *live.
void AddLiveFiles(std::vector<uint64_t>* live_list);
// Add all files listed in the current version to *live.
void AddLiveFilesCurrentVersion(std::set<uint64_t>* live);
// Return the approximate offset in the database of the data for
// "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return a human-readable short (single-line) summary of the number
// of files per level. Uses *scratch as backing store.
struct LevelSummaryStorage {
char buffer[100];
};
struct FileSummaryStorage {
char buffer[1000];
};
const char* LevelSummary(LevelSummaryStorage* scratch) const;
// printf contents (for debugging)
Status DumpManifest(Options& options, std::string& manifestFileName,
bool verbose, bool hex = false);
// Return a human-readable short (single-line) summary of the data size
// of files per level. Uses *scratch as backing store.
const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const;
// Return a human-readable short (single-line) summary of files
// in a specified level. Uses *scratch as backing store.
const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
// Return the size of the current manifest file
const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; }
// For the specfied level, pick a compaction.
// Returns nullptr if there is no compaction to be done.
// If level is 0 and there is already a compaction on that level, this
// function will return nullptr.
Compaction* PickCompactionBySize(int level, double score);
// Pick files to compact in Universal mode
Compaction* PickCompactionUniversal(int level, double score);
// Pick Universal compaction to limit read amplification
Compaction* PickCompactionUniversalReadAmp(int level, double score,
unsigned int ratio, unsigned int num_files);
// Pick Universal compaction to limit space amplification.
Compaction* PickCompactionUniversalSizeAmp(int level, double score);
// Free up the files that were participated in a compaction
void ReleaseCompactionFiles(Compaction* c, Status status);
// verify that the files that we started with for a compaction
// still exist in the current version and in the same original level.
// This ensures that a concurrent compaction did not erroneously
// pick the same files to compact.
bool VerifyCompactionFileConsistency(Compaction* c);
// used to sort files by size
typedef struct fsize {
int index;
FileMetaData* file;
} Fsize;
// Sort all files for this version based on their file size and
// record results in files_by_size_. The largest files are listed first.
void UpdateFilesBySize(Version *v);
// Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level);
double MaxBytesForLevel(int level);
Status GetMetadataForFile(
uint64_t number, int *filelevel, FileMetaData *metadata);
void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata);
void GetObsoleteFiles(std::vector<FileMetaData*>* files);
private:
class Builder;
struct ManifestWriter;
friend class Compaction;
friend class Version;
void Init(int num_levels);
void Finalize(Version* v, std::vector<uint64_t>&);
void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest);
void GetRange2(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest,
InternalKey* largest);
void ExpandWhileOverlapping(Compaction* c);
void SetupOtherInputs(Compaction* c);
// Save current contents to *log
Status WriteSnapshot(log::Writer* log);
void AppendVersion(Version* v);
bool ManifestContains(const std::string& record) const;
uint64_t ExpandedCompactionByteSizeLimit(int level);
uint64_t MaxGrandParentOverlapBytes(int level);
Env* const env_;
const std::string dbname_;
const Options* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
uint64_t last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
int num_levels_;
// Opened lazily
unique_ptr<log::Writer> descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions_.prev_
// Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey.
std::string* compact_pointer_;
// Per-level target file size.
uint64_t* max_file_size_;
// Per-level max bytes
uint64_t* level_max_bytes_;
// record all the ongoing compactions for all levels
std::vector<std::set<Compaction*> > compactions_in_progress_;
// generates a increasing version number for every new version
uint64_t current_version_number_;
// Queue of writers to the manifest file
std::deque<ManifestWriter*> manifest_writers_;
// Store the manifest file size when it is checked.
// Save us the cost of checking file size twice in LogAndApply
uint64_t last_observed_manifest_size_;
std::vector<FileMetaData*> obsolete_files_;
// storage options for all reads and writes except compactions
const EnvOptions& storage_options_;
// storage options used for compactions. This is a copy of
// storage_options_ but with readaheads set to readahead_compactions_.
const EnvOptions storage_options_compactions_;
// No copying allowed
VersionSet(const VersionSet&);
void operator=(const VersionSet&);
// Return the total amount of data that is undergoing
// compactions per level
void SizeBeingCompacted(std::vector<uint64_t>&);
// Returns true if any one of the parent files are being compacted
bool ParentRangeInCompaction(const InternalKey* smallest,
const InternalKey* largest, int level, int* index);
// Returns true if any one of the specified files are being compacted
bool FilesInCompaction(std::vector<FileMetaData*>& files);
void LogAndApplyHelper(Builder*b, Version* v,
VersionEdit* edit, port::Mutex* mu);
};
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
~Compaction();
// Return the level that is being compacted. Inputs from "level"
// will be merged.
int level() const { return level_; }
// Outputs will go to this level
int output_level() const { return out_level_; }
// Return the object that holds the edits to the descriptor done
// by this compaction.
VersionEdit* edit() { return edit_; }
// "which" must be either 0 or 1
int num_input_files(int which) const { return inputs_[which].size(); }
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
// Whether compression will be enabled for compaction outputs
bool enable_compression() const { return enable_compression_; }
// Is this a trivial compaction that can be implemented by just
// moving a single input file to the next level (no merging or splitting)
bool IsTrivialMove() const;
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
// Returns true if the information we have available guarantees that
// the compaction is producing data in "level+1" for which no data exists
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
// Returns true iff we should stop building the current output
// before processing "internal_key".
bool ShouldStopBefore(const Slice& internal_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
void Summary(char* output, int len);
// Return the score that was used to pick this compaction run.
double score() const { return score_; }
// Is this compaction creating a file in the bottom most level?
bool BottomMostLevel() { return bottommost_level_; }
// Does this compaction include all sst files?
bool IsFullCompaction() { return is_full_compaction_; }
private:
friend class Version;
friend class VersionSet;
explicit Compaction(int level, int out_level, uint64_t target_file_size,
uint64_t max_grandparent_overlap_bytes, int number_levels,
bool seek_compaction = false, bool enable_compression = true);
int level_;
int out_level_; // levels to which output files are stored
uint64_t max_output_file_size_;
uint64_t maxGrandParentOverlapBytes_;
Version* input_version_;
VersionEdit* edit_;
int number_levels_;
bool seek_compaction_;
bool enable_compression_;
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State used to check for number of of overlapping grandparent files
// (parent == level_ + 1, grandparent == level_ + 2)
std::vector<FileMetaData*> grandparents_;
size_t grandparent_index_; // Index in grandparent_starts_
bool seen_key_; // Some output key has been seen
uint64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files
int base_index_; // index of the file in files_[level_]
int parent_index_; // index of some file with same range in files_[level_+1]
double score_; // score that was used to pick this compaction.
// Is this compaction creating a file in the bottom most level?
bool bottommost_level_;
// Does this compaction include all sst files?
bool is_full_compaction_;
// level_ptrs_ holds indices into input_version_->levels_: our state
// is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2).
std::vector<size_t> level_ptrs_;
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool);
// Initialize whether compaction producing files at the bottommost level
void SetupBottomMostLevel(bool isManual);
// In case of compaction error, reset the nextIndex that is used
// to pick up the next file to be compacted from files_by_size_
void ResetNextCompactionIndex();
};
} // namespace rocksdb

View File

@ -0,0 +1,80 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 Facebook. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "db/version_set.h"
#include <algorithm>
#include <stdio.h>
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "util/logging.h"
namespace rocksdb {
Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) {
if(new_levels <= 1) {
return Status::InvalidArgument(
"Number of levels needs to be bigger than 1");
}
Version* current_version = current_;
int current_levels = NumberLevels();
if (current_levels <= new_levels) {
return Status::OK();
}
// Make sure there are file only on one level from
// (new_levels-1) to (current_levels-1)
int first_nonempty_level = -1;
int first_nonempty_level_filenum = 0;
for (int i = new_levels - 1; i < current_levels; i++) {
int file_num = NumLevelFiles(i);
if (file_num != 0) {
if (first_nonempty_level < 0) {
first_nonempty_level = i;
first_nonempty_level_filenum = file_num;
} else {
char msg[255];
sprintf(msg, "Found at least two levels containing files: "
"[%d:%d],[%d:%d].\n",
first_nonempty_level, first_nonempty_level_filenum, i, file_num);
return Status::InvalidArgument(msg);
}
}
}
Status st;
std::vector<FileMetaData*>* old_files_list = current_version->files_;
std::vector<FileMetaData*>* new_files_list =
new std::vector<FileMetaData*>[new_levels];
for (int i = 0; i < new_levels - 1; i++) {
new_files_list[i] = old_files_list[i];
}
if (first_nonempty_level > 0) {
new_files_list[new_levels - 1] = old_files_list[first_nonempty_level];
}
delete[] current_version->files_;
current_version->files_ = new_files_list;
delete[] compact_pointer_;
delete[] max_file_size_;
delete[] level_max_bytes_;
num_levels_ = new_levels;
compact_pointer_ = new std::string[new_levels];
Init(new_levels);
VersionEdit ve(new_levels);
st = LogAndApply(&ve , mu, true);
return st;
}
}

184
db/version_set_test.cc Normal file
View File

@ -0,0 +1,184 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
class FindFileTest {
public:
std::vector<FileMetaData*> files_;
bool disjoint_sorted_files_;
FindFileTest() : disjoint_sorted_files_(true) { }
~FindFileTest() {
for (unsigned int i = 0; i < files_.size(); i++) {
delete files_[i];
}
}
void Add(const char* smallest, const char* largest,
SequenceNumber smallest_seq = 100,
SequenceNumber largest_seq = 100) {
FileMetaData* f = new FileMetaData;
f->number = files_.size() + 1;
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
f->largest = InternalKey(largest, largest_seq, kTypeValue);
files_.push_back(f);
}
int Find(const char* key) {
InternalKey target(key, 100, kTypeValue);
InternalKeyComparator cmp(BytewiseComparator());
return FindFile(cmp, files_, target.Encode());
}
bool Overlaps(const char* smallest, const char* largest) {
InternalKeyComparator cmp(BytewiseComparator());
Slice s(smallest != nullptr ? smallest : "");
Slice l(largest != nullptr ? largest : "");
return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
(smallest != nullptr ? &s : nullptr),
(largest != nullptr ? &l : nullptr));
}
};
TEST(FindFileTest, Empty) {
ASSERT_EQ(0, Find("foo"));
ASSERT_TRUE(! Overlaps("a", "z"));
ASSERT_TRUE(! Overlaps(nullptr, "z"));
ASSERT_TRUE(! Overlaps("a", nullptr));
ASSERT_TRUE(! Overlaps(nullptr, nullptr));
}
TEST(FindFileTest, Single) {
Add("p", "q");
ASSERT_EQ(0, Find("a"));
ASSERT_EQ(0, Find("p"));
ASSERT_EQ(0, Find("p1"));
ASSERT_EQ(0, Find("q"));
ASSERT_EQ(1, Find("q1"));
ASSERT_EQ(1, Find("z"));
ASSERT_TRUE(! Overlaps("a", "b"));
ASSERT_TRUE(! Overlaps("z1", "z2"));
ASSERT_TRUE(Overlaps("a", "p"));
ASSERT_TRUE(Overlaps("a", "q"));
ASSERT_TRUE(Overlaps("a", "z"));
ASSERT_TRUE(Overlaps("p", "p1"));
ASSERT_TRUE(Overlaps("p", "q"));
ASSERT_TRUE(Overlaps("p", "z"));
ASSERT_TRUE(Overlaps("p1", "p2"));
ASSERT_TRUE(Overlaps("p1", "z"));
ASSERT_TRUE(Overlaps("q", "q"));
ASSERT_TRUE(Overlaps("q", "q1"));
ASSERT_TRUE(! Overlaps(nullptr, "j"));
ASSERT_TRUE(! Overlaps("r", nullptr));
ASSERT_TRUE(Overlaps(nullptr, "p"));
ASSERT_TRUE(Overlaps(nullptr, "p1"));
ASSERT_TRUE(Overlaps("q", nullptr));
ASSERT_TRUE(Overlaps(nullptr, nullptr));
}
TEST(FindFileTest, Multiple) {
Add("150", "200");
Add("200", "250");
Add("300", "350");
Add("400", "450");
ASSERT_EQ(0, Find("100"));
ASSERT_EQ(0, Find("150"));
ASSERT_EQ(0, Find("151"));
ASSERT_EQ(0, Find("199"));
ASSERT_EQ(0, Find("200"));
ASSERT_EQ(1, Find("201"));
ASSERT_EQ(1, Find("249"));
ASSERT_EQ(1, Find("250"));
ASSERT_EQ(2, Find("251"));
ASSERT_EQ(2, Find("299"));
ASSERT_EQ(2, Find("300"));
ASSERT_EQ(2, Find("349"));
ASSERT_EQ(2, Find("350"));
ASSERT_EQ(3, Find("351"));
ASSERT_EQ(3, Find("400"));
ASSERT_EQ(3, Find("450"));
ASSERT_EQ(4, Find("451"));
ASSERT_TRUE(! Overlaps("100", "149"));
ASSERT_TRUE(! Overlaps("251", "299"));
ASSERT_TRUE(! Overlaps("451", "500"));
ASSERT_TRUE(! Overlaps("351", "399"));
ASSERT_TRUE(Overlaps("100", "150"));
ASSERT_TRUE(Overlaps("100", "200"));
ASSERT_TRUE(Overlaps("100", "300"));
ASSERT_TRUE(Overlaps("100", "400"));
ASSERT_TRUE(Overlaps("100", "500"));
ASSERT_TRUE(Overlaps("375", "400"));
ASSERT_TRUE(Overlaps("450", "450"));
ASSERT_TRUE(Overlaps("450", "500"));
}
TEST(FindFileTest, MultipleNullBoundaries) {
Add("150", "200");
Add("200", "250");
Add("300", "350");
Add("400", "450");
ASSERT_TRUE(! Overlaps(nullptr, "149"));
ASSERT_TRUE(! Overlaps("451", nullptr));
ASSERT_TRUE(Overlaps(nullptr, nullptr));
ASSERT_TRUE(Overlaps(nullptr, "150"));
ASSERT_TRUE(Overlaps(nullptr, "199"));
ASSERT_TRUE(Overlaps(nullptr, "200"));
ASSERT_TRUE(Overlaps(nullptr, "201"));
ASSERT_TRUE(Overlaps(nullptr, "400"));
ASSERT_TRUE(Overlaps(nullptr, "800"));
ASSERT_TRUE(Overlaps("100", nullptr));
ASSERT_TRUE(Overlaps("200", nullptr));
ASSERT_TRUE(Overlaps("449", nullptr));
ASSERT_TRUE(Overlaps("450", nullptr));
}
TEST(FindFileTest, OverlapSequenceChecks) {
Add("200", "200", 5000, 3000);
ASSERT_TRUE(! Overlaps("199", "199"));
ASSERT_TRUE(! Overlaps("201", "300"));
ASSERT_TRUE(Overlaps("200", "200"));
ASSERT_TRUE(Overlaps("190", "200"));
ASSERT_TRUE(Overlaps("200", "210"));
}
TEST(FindFileTest, OverlappingFiles) {
Add("150", "600");
Add("400", "500");
disjoint_sorted_files_ = false;
ASSERT_TRUE(! Overlaps("100", "149"));
ASSERT_TRUE(! Overlaps("601", "700"));
ASSERT_TRUE(Overlaps("100", "150"));
ASSERT_TRUE(Overlaps("100", "200"));
ASSERT_TRUE(Overlaps("100", "300"));
ASSERT_TRUE(Overlaps("100", "400"));
ASSERT_TRUE(Overlaps("100", "500"));
ASSERT_TRUE(Overlaps("375", "400"));
ASSERT_TRUE(Overlaps("450", "450"));
ASSERT_TRUE(Overlaps("450", "500"));
ASSERT_TRUE(Overlaps("450", "700"));
ASSERT_TRUE(Overlaps("600", "700"));
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

247
db/write_batch.cc Normal file
View File

@ -0,0 +1,247 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// WriteBatch::rep_ :=
// sequence: fixed64
// count: fixed32
// data: record[count]
// record :=
// kTypeValue varstring varstring
// kTypeMerge varstring varstring
// kTypeDeletion varstring
// varstring :=
// len: varint32
// data: uint8[len]
#include "rocksdb/write_batch.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "db/dbformat.h"
#include "db/db_impl.h"
#include "db/memtable.h"
#include "db/snapshot.h"
#include "db/write_batch_internal.h"
#include "util/coding.h"
#include <stdexcept>
namespace rocksdb {
// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
static const size_t kHeader = 12;
WriteBatch::WriteBatch() {
Clear();
}
WriteBatch::~WriteBatch() { }
WriteBatch::Handler::~Handler() { }
void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
throw std::runtime_error("Handler::Merge not implemented!");
}
void WriteBatch::Handler::LogData(const Slice& blob) {
// If the user has not specified something to do with blobs, then we ignore
// them.
}
bool WriteBatch::Handler::Continue() {
return true;
}
void WriteBatch::Clear() {
rep_.clear();
rep_.resize(kHeader);
}
int WriteBatch::Count() const {
return WriteBatchInternal::Count(this);
}
Status WriteBatch::Iterate(Handler* handler) const {
Slice input(rep_);
if (input.size() < kHeader) {
return Status::Corruption("malformed WriteBatch (too small)");
}
input.remove_prefix(kHeader);
Slice key, value, blob;
int found = 0;
while (!input.empty() && handler->Continue()) {
char tag = input[0];
input.remove_prefix(1);
switch (tag) {
case kTypeValue:
if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) {
handler->Put(key, value);
found++;
} else {
return Status::Corruption("bad WriteBatch Put");
}
break;
case kTypeDeletion:
if (GetLengthPrefixedSlice(&input, &key)) {
handler->Delete(key);
found++;
} else {
return Status::Corruption("bad WriteBatch Delete");
}
break;
case kTypeMerge:
if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) {
handler->Merge(key, value);
found++;
} else {
return Status::Corruption("bad WriteBatch Merge");
}
break;
case kTypeLogData:
if (GetLengthPrefixedSlice(&input, &blob)) {
handler->LogData(blob);
} else {
return Status::Corruption("bad WriteBatch Blob");
}
break;
default:
return Status::Corruption("unknown WriteBatch tag");
}
}
if (found != WriteBatchInternal::Count(this)) {
return Status::Corruption("WriteBatch has wrong count");
} else {
return Status::OK();
}
}
int WriteBatchInternal::Count(const WriteBatch* b) {
return DecodeFixed32(b->rep_.data() + 8);
}
void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
EncodeFixed32(&b->rep_[8], n);
}
SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
return SequenceNumber(DecodeFixed64(b->rep_.data()));
}
void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
EncodeFixed64(&b->rep_[0], seq);
}
void WriteBatch::Put(const Slice& key, const Slice& value) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue));
PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value);
}
void WriteBatch::Put(const SliceParts& key, const SliceParts& value) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue));
PutLengthPrefixedSliceParts(&rep_, key);
PutLengthPrefixedSliceParts(&rep_, value);
}
void WriteBatch::Delete(const Slice& key) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeDeletion));
PutLengthPrefixedSlice(&rep_, key);
}
void WriteBatch::Merge(const Slice& key, const Slice& value) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeMerge));
PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value);
}
void WriteBatch::PutLogData(const Slice& blob) {
rep_.push_back(static_cast<char>(kTypeLogData));
PutLengthPrefixedSlice(&rep_, blob);
}
namespace {
class MemTableInserter : public WriteBatch::Handler {
public:
SequenceNumber sequence_;
MemTable* mem_;
const Options* options_;
DBImpl* db_;
const bool filter_deletes_;
MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts,
DB* db, const bool filter_deletes)
: sequence_(sequence),
mem_(mem),
options_(opts),
db_(reinterpret_cast<DBImpl*>(db)),
filter_deletes_(filter_deletes) {
assert(mem_);
if (filter_deletes_) {
assert(options_);
assert(db_);
}
}
virtual void Put(const Slice& key, const Slice& value) {
if (options_->inplace_update_support
&& mem_->Update(sequence_, kTypeValue, key, value)) {
RecordTick(options_->statistics, NUMBER_KEYS_UPDATED);
} else {
mem_->Add(sequence_, kTypeValue, key, value);
}
sequence_++;
}
virtual void Merge(const Slice& key, const Slice& value) {
mem_->Add(sequence_, kTypeMerge, key, value);
sequence_++;
}
virtual void Delete(const Slice& key) {
if (filter_deletes_) {
SnapshotImpl read_from_snapshot;
read_from_snapshot.number_ = sequence_;
ReadOptions ropts;
ropts.snapshot = &read_from_snapshot;
std::string value;
if (!db_->KeyMayExist(ropts, key, &value)) {
RecordTick(options_->statistics, NUMBER_FILTERED_DELETES);
return;
}
}
mem_->Add(sequence_, kTypeDeletion, key, Slice());
sequence_++;
}
};
} // namespace
Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem,
const Options* opts, DB* db,
const bool filter_deletes) {
MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db,
filter_deletes);
return b->Iterate(&inserter);
}
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
assert(contents.size() >= kHeader);
b->rep_.assign(contents.data(), contents.size());
}
void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
SetCount(dst, Count(dst) + Count(src));
assert(src->rep_.size() >= kHeader);
dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
}
} // namespace rocksdb

57
db/write_batch_internal.h Normal file
View File

@ -0,0 +1,57 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "rocksdb/types.h"
#include "rocksdb/write_batch.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"
namespace rocksdb {
class MemTable;
// WriteBatchInternal provides static methods for manipulating a
// WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal {
public:
// Return the number of entries in the batch.
static int Count(const WriteBatch* batch);
// Set the count for the number of entries in the batch.
static void SetCount(WriteBatch* batch, int n);
// Return the seqeunce number for the start of this batch.
static SequenceNumber Sequence(const WriteBatch* batch);
// Store the specified number as the seqeunce number for the start of
// this batch.
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
static Slice Contents(const WriteBatch* batch) {
return Slice(batch->rep_);
}
static size_t ByteSize(const WriteBatch* batch) {
return batch->rep_.size();
}
static void SetContents(WriteBatch* batch, const Slice& contents);
// Inserts batch entries into memtable
// Drops deletes in batch if filter_del is set to true and
// db->KeyMayExist returns false
static Status InsertInto(const WriteBatch* batch, MemTable* memtable,
const Options* opts, DB* db = nullptr,
const bool filter_del = false);
static void Append(WriteBatch* dst, const WriteBatch* src);
};
} // namespace rocksdb

262
db/write_batch_test.cc Normal file
View File

@ -0,0 +1,262 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/db.h"
#include <memory>
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace rocksdb {
static std::string PrintContents(WriteBatch* b) {
InternalKeyComparator cmp(BytewiseComparator());
auto factory = std::make_shared<SkipListFactory>();
MemTable* mem = new MemTable(cmp, factory);
mem->Ref();
std::string state;
Options options;
Status s = WriteBatchInternal::InsertInto(b, mem, &options);
int count = 0;
Iterator* iter = mem->NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey ikey;
memset((void *)&ikey, 0, sizeof(ikey));
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
switch (ikey.type) {
case kTypeValue:
state.append("Put(");
state.append(ikey.user_key.ToString());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
count++;
break;
case kTypeMerge:
state.append("Merge(");
state.append(ikey.user_key.ToString());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
count++;
break;
case kTypeDeletion:
state.append("Delete(");
state.append(ikey.user_key.ToString());
state.append(")");
count++;
break;
case kTypeLogData:
assert(false);
break;
}
state.append("@");
state.append(NumberToString(ikey.sequence));
}
delete iter;
if (!s.ok()) {
state.append(s.ToString());
} else if (count != WriteBatchInternal::Count(b)) {
state.append("CountMismatch()");
}
mem->Unref();
return state;
}
class WriteBatchTest { };
TEST(WriteBatchTest, Empty) {
WriteBatch batch;
ASSERT_EQ("", PrintContents(&batch));
ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
ASSERT_EQ(0, batch.Count());
}
TEST(WriteBatchTest, Multiple) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
batch.Delete(Slice("box"));
batch.Put(Slice("baz"), Slice("boo"));
WriteBatchInternal::SetSequence(&batch, 100);
ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
ASSERT_EQ("Put(baz, boo)@102"
"Delete(box)@101"
"Put(foo, bar)@100",
PrintContents(&batch));
ASSERT_EQ(3, batch.Count());
}
TEST(WriteBatchTest, Corruption) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
batch.Delete(Slice("box"));
WriteBatchInternal::SetSequence(&batch, 200);
Slice contents = WriteBatchInternal::Contents(&batch);
WriteBatchInternal::SetContents(&batch,
Slice(contents.data(),contents.size()-1));
ASSERT_EQ("Put(foo, bar)@200"
"Corruption: bad WriteBatch Delete",
PrintContents(&batch));
}
TEST(WriteBatchTest, Append) {
WriteBatch b1, b2;
WriteBatchInternal::SetSequence(&b1, 200);
WriteBatchInternal::SetSequence(&b2, 300);
WriteBatchInternal::Append(&b1, &b2);
ASSERT_EQ("",
PrintContents(&b1));
ASSERT_EQ(0, b1.Count());
b2.Put("a", "va");
WriteBatchInternal::Append(&b1, &b2);
ASSERT_EQ("Put(a, va)@200",
PrintContents(&b1));
ASSERT_EQ(1, b1.Count());
b2.Clear();
b2.Put("b", "vb");
WriteBatchInternal::Append(&b1, &b2);
ASSERT_EQ("Put(a, va)@200"
"Put(b, vb)@201",
PrintContents(&b1));
ASSERT_EQ(2, b1.Count());
b2.Delete("foo");
WriteBatchInternal::Append(&b1, &b2);
ASSERT_EQ("Put(a, va)@200"
"Put(b, vb)@202"
"Put(b, vb)@201"
"Delete(foo)@203",
PrintContents(&b1));
ASSERT_EQ(4, b1.Count());
}
namespace {
struct TestHandler : public WriteBatch::Handler {
std::string seen;
virtual void Put(const Slice& key, const Slice& value) {
seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
}
virtual void Merge(const Slice& key, const Slice& value) {
seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
}
virtual void LogData(const Slice& blob) {
seen += "LogData(" + blob.ToString() + ")";
}
virtual void Delete(const Slice& key) {
seen += "Delete(" + key.ToString() + ")";
}
};
}
TEST(WriteBatchTest, Blob) {
WriteBatch batch;
batch.Put(Slice("k1"), Slice("v1"));
batch.Put(Slice("k2"), Slice("v2"));
batch.Put(Slice("k3"), Slice("v3"));
batch.PutLogData(Slice("blob1"));
batch.Delete(Slice("k2"));
batch.PutLogData(Slice("blob2"));
batch.Merge(Slice("foo"), Slice("bar"));
ASSERT_EQ(5, batch.Count());
ASSERT_EQ("Merge(foo, bar)@4"
"Put(k1, v1)@0"
"Delete(k2)@3"
"Put(k2, v2)@1"
"Put(k3, v3)@2",
PrintContents(&batch));
TestHandler handler;
batch.Iterate(&handler);
ASSERT_EQ(
"Put(k1, v1)"
"Put(k2, v2)"
"Put(k3, v3)"
"LogData(blob1)"
"Delete(k2)"
"LogData(blob2)"
"Merge(foo, bar)",
handler.seen);
}
TEST(WriteBatchTest, Continue) {
WriteBatch batch;
struct Handler : public TestHandler {
int num_seen = 0;
virtual void Put(const Slice& key, const Slice& value) {
++num_seen;
TestHandler::Put(key, value);
}
virtual void Merge(const Slice& key, const Slice& value) {
++num_seen;
TestHandler::Merge(key, value);
}
virtual void LogData(const Slice& blob) {
++num_seen;
TestHandler::LogData(blob);
}
virtual void Delete(const Slice& key) {
++num_seen;
TestHandler::Delete(key);
}
virtual bool Continue() override {
return num_seen < 3;
}
} handler;
batch.Put(Slice("k1"), Slice("v1"));
batch.PutLogData(Slice("blob1"));
batch.Delete(Slice("k1"));
batch.PutLogData(Slice("blob2"));
batch.Merge(Slice("foo"), Slice("bar"));
batch.Iterate(&handler);
ASSERT_EQ(
"Put(k1, v1)"
"LogData(blob1)"
"Delete(k1)",
handler.seen);
}
TEST(WriteBatchTest, PutGatherSlices) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
{
// Try a write where the key is one slice but the value is two
Slice key_slice("baz");
Slice value_slices[2] = { Slice("header"), Slice("payload") };
batch.Put(SliceParts(&key_slice, 1),
SliceParts(value_slices, 2));
}
{
// One where the key is composite but the value is a single slice
Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") };
Slice value_slice("value");
batch.Put(SliceParts(key_slices, 3),
SliceParts(&value_slice, 1));
}
WriteBatchInternal::SetSequence(&batch, 100);
ASSERT_EQ("Put(baz, headerpayload)@101"
"Put(foo, bar)@100"
"Put(keypart2part3, value)@102",
PrintContents(&batch));
ASSERT_EQ(3, batch.Count());
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

89
doc/doc.css Normal file
View File

@ -0,0 +1,89 @@
body {
margin-left: 0.5in;
margin-right: 0.5in;
background: white;
color: black;
}
h1 {
margin-left: -0.2in;
font-size: 14pt;
}
h2 {
margin-left: -0in;
font-size: 12pt;
}
h3 {
margin-left: -0in;
}
h4 {
margin-left: -0in;
}
hr {
margin-left: -0in;
}
/* Definition lists: definition term bold */
dt {
font-weight: bold;
}
address {
text-align: center;
}
code,samp,var {
color: blue;
}
kbd {
color: #600000;
}
div.note p {
float: right;
width: 3in;
margin-right: 0%;
padding: 1px;
border: 2px solid #6060a0;
background-color: #fffff0;
}
ul {
margin-top: -0em;
margin-bottom: -0em;
}
ol {
margin-top: -0em;
margin-bottom: -0em;
}
UL.nobullets {
list-style-type: none;
list-style-image: none;
margin-left: -1em;
}
p {
margin: 1em 0 1em 0;
padding: 0 0 0 0;
}
pre {
line-height: 1.3em;
padding: 0.4em 0 0.8em 0;
margin: 0 0 0 0;
border: 0 0 0 0;
color: blue;
}
.datatable {
margin-left: auto;
margin-right: auto;
margin-top: 2em;
margin-bottom: 2em;
border: 1px solid;
}
.datatable td,th {
padding: 0 0.5em 0 0.5em;
text-align: right;
}

831
doc/index.html Normal file
View File

@ -0,0 +1,831 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>RocksDB</title>
</head>
<body>
<h1>RocksDB</h1>
<address>The Facebook Database Engineering Team</address>
<address>Build on earlier work on leveldb by Sanjay Ghemawat
(sanjay@google.com) and Jeff Dean (jeff@google.com)</address>
<p>
The <code>rocksdb</code> library provides a persistent key value store. Keys and
values are arbitrary byte arrays. The keys are ordered within the key
value store according to a user-specified comparator function.
<p>
<h1>Opening A Database</h1>
<p>
A <code>rocksdb</code> database has a name which corresponds to a file system
directory. All of the contents of database are stored in this
directory. The following example shows how to open a database,
creating it if necessary:
<p>
<pre>
#include &lt;assert&gt;
#include "rocksdb/db.h"
rocksdb::DB* db;
rocksdb::Options options;
options.create_if_missing = true;
rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
assert(status.ok());
...
</pre>
If you want to raise an error if the database already exists, add
the following line before the <code>rocksdb::DB::Open</code> call:
<pre>
options.error_if_exists = true;
</pre>
<h1>Status</h1>
<p>
You may have noticed the <code>rocksdb::Status</code> type above. Values of this
type are returned by most functions in <code>rocksdb</code> that may encounter an
error. You can check if such a result is ok, and also print an
associated error message:
<p>
<pre>
rocksdb::Status s = ...;
if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
</pre>
<h1>Closing A Database</h1>
<p>
When you are done with a database, just delete the database object.
Example:
<p>
<pre>
... open the db as described above ...
... do something with db ...
delete db;
</pre>
<h1>Reads And Writes</h1>
<p>
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
modify/query the database. For example, the following code
moves the value stored under key1 to key2.
<pre>
std::string value;
rocksdb::Status s = db-&gt;Get(rocksdb::ReadOptions(), key1, &amp;value);
if (s.ok()) s = db-&gt;Put(rocksdb::WriteOptions(), key2, value);
if (s.ok()) s = db-&gt;Delete(rocksdb::WriteOptions(), key1);
</pre>
<h1>Atomic Updates</h1>
<p>
Note that if the process dies after the Put of key2 but before the
delete of key1, the same value may be left stored under multiple keys.
Such problems can be avoided by using the <code>WriteBatch</code> class to
atomically apply a set of updates:
<p>
<pre>
#include "leveldb/write_batch.h"
...
std::string value;
rocksdb::Status s = db-&gt;Get(rocksdb::ReadOptions(), key1, &amp;value);
if (s.ok()) {
rocksdb::WriteBatch batch;
batch.Delete(key1);
batch.Put(key2, value);
s = db-&gt;Write(rocksdb::WriteOptions(), &amp;batch);
}
</pre>
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
and these edits within the batch are applied in order. Note that we
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
we do not end up erroneously dropping the value entirely.
<p>
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
speed up bulk updates by placing lots of individual mutations into the
same batch.
<h1>Synchronous Writes</h1>
By default, each write to <code>leveldb</code> is asynchronous: it
returns after pushing the write from the process into the operating
system. The transfer from operating system memory to the underlying
persistent storage happens asynchronously. The <code>sync</code> flag
can be turned on for a particular write to make the write operation
not return until the data being written has been pushed all the way to
persistent storage. (On Posix systems, this is implemented by calling
either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
<code>msync(..., MS_SYNC)</code> before the write operation returns.)
<pre>
rocksdb::WriteOptions write_options;
write_options.sync = true;
db-&gt;Put(write_options, ...);
</pre>
Asynchronous writes are often more than a thousand times as fast as
synchronous writes. The downside of asynchronous writes is that a
crash of the machine may cause the last few updates to be lost. Note
that a crash of just the writing process (i.e., not a reboot) will not
cause any loss since even when <code>sync</code> is false, an update
is pushed from the process memory into the operating system before it
is considered done.
<p>
Asynchronous writes can often be used safely. For example, when
loading a large amount of data into the database you can handle lost
updates by restarting the bulk load after a crash. A hybrid scheme is
also possible where every Nth write is synchronous, and in the event
of a crash, the bulk load is restarted just after the last synchronous
write finished by the previous run. (The synchronous write can update
a marker that describes where to restart on a crash.)
<p>
<code>WriteBatch</code> provides an alternative to asynchronous writes.
Multiple updates may be placed in the same <code>WriteBatch</code> and
applied together using a synchronous write (i.e.,
<code>write_options.sync</code> is set to true). The extra cost of
the synchronous write will be amortized across all of the writes in
the batch.
<p>
We also provide a way to completely disable Write Ahead Log for a
particular write. If you set write_option.disableWAL to true, the
write will not go to the log at all and may be lost in an event of
process crash.
<p>
When opening a DB, you can disable syncing of data files by setting
Options::disableDataSync to true. This can be useful when doing
bulk-loading or big idempotent operations. Once the operation is
finished, you can manually call sync() to flush all dirty buffers
to stable storage.
<p>
RocksDB by default uses faster fdatasync() to sync files. If you want
to use fsync(), you can set Options::use_fsync to true. You should set
this to true on filesystems like ext3 that can lose files after a
reboot.
<p>
<h1>Concurrency</h1>
<p>
A database may only be opened by one process at a time.
The <code>rocksdb</code> implementation acquires a lock from the
operating system to prevent misuse. Within a single process, the
same <code>rocksdb::DB</code> object may be safely shared by multiple
concurrent threads. I.e., different threads may write into or fetch
iterators or call <code>Get</code> on the same database without any
external synchronization (the leveldb implementation will
automatically do the required synchronization). However other objects
(like Iterator and WriteBatch) may require external synchronization.
If two threads share such an object, they must protect access to it
using their own locking protocol. More details are available in
the public header files.
<p>
<h1>Merge operators</h1>
<p>
Merge operators provide efficient support for read-modify-write operation.
More on the interface and implementation can be found on:
<p>
<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator">
Merge Operator</a>
<p>
<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation">
Merge Operator Implementation</a>
<p>
<h1>Iteration</h1>
<p>
The following example demonstrates how to print all key,value pairs
in a database.
<p>
<pre>
rocksdb::Iterator* it = db-&gt;NewIterator(rocksdb::ReadOptions());
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": " &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
}
assert(it-&gt;status().ok()); // Check for any errors found during the scan
delete it;
</pre>
The following variation shows how to process just the keys in the
range <code>[start,limit)</code>:
<p>
<pre>
for (it-&gt;Seek(start);
it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
it-&gt;Next()) {
...
}
</pre>
You can also process entries in reverse order. (Caveat: reverse
iteration may be somewhat slower than forward iteration.)
<p>
<pre>
for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
...
}
</pre>
<h1>Snapshots</h1>
<p>
Snapshots provide consistent read-only views over the entire state of
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
that a read should operate on a particular version of the DB state.
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
implicit snapshot of the current state.
<p>
Snapshots are created by the DB::GetSnapshot() method:
<p>
<pre>
rocksdb::ReadOptions options;
options.snapshot = db-&gt;GetSnapshot();
... apply some updates to db ...
rocksdb::Iterator* iter = db-&gt;NewIterator(options);
... read using iter to view the state when the snapshot was created ...
delete iter;
db-&gt;ReleaseSnapshot(options.snapshot);
</pre>
Note that when a snapshot is no longer needed, it should be released
using the DB::ReleaseSnapshot interface. This allows the
implementation to get rid of state that was being maintained just to
support reading as of that snapshot.
<h1>Slice</h1>
<p>
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
are instances of the <code>rocksdb::Slice</code> type. <code>Slice</code> is a simple
structure that contains a length and a pointer to an external byte
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
<code>std::string</code> since we do not need to copy potentially large keys and
values. In addition, <code>rocksdb</code> methods do not return null-terminated
C-style strings since <code>rocksdb</code> keys and values are allowed to
contain '\0' bytes.
<p>
C++ strings and null-terminated C-style strings can be easily converted
to a Slice:
<p>
<pre>
rocksdb::Slice s1 = "hello";
std::string str("world");
rocksdb::Slice s2 = str;
</pre>
A Slice can be easily converted back to a C++ string:
<pre>
std::string str = s1.ToString();
assert(str == std::string("hello"));
</pre>
Be careful when using Slices since it is up to the caller to ensure that
the external byte array into which the Slice points remains live while
the Slice is in use. For example, the following is buggy:
<p>
<pre>
rocksdb::Slice slice;
if (...) {
std::string str = ...;
slice = str;
}
Use(slice);
</pre>
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
backing storage for <code>slice</code> will disappear.
<p>
<h1>Comparators</h1>
<p>
The preceding examples used the default ordering function for key,
which orders bytes lexicographically. You can however supply a custom
comparator when opening a database. For example, suppose each
database key consists of two numbers and we should sort by the first
number, breaking ties by the second number. First, define a proper
subclass of <code>rocksdb::Comparator</code> that expresses these rules:
<p>
<pre>
class TwoPartComparator : public rocksdb::Comparator {
public:
// Three-way comparison function:
// if a &lt; b: negative result
// if a &gt; b: positive result
// else: zero result
int Compare(const rocksdb::Slice&amp; a, const rocksdb::Slice&amp; b) const {
int a1, a2, b1, b2;
ParseKey(a, &amp;a1, &amp;a2);
ParseKey(b, &amp;b1, &amp;b2);
if (a1 &lt; b1) return -1;
if (a1 &gt; b1) return +1;
if (a2 &lt; b2) return -1;
if (a2 &gt; b2) return +1;
return 0;
}
// Ignore the following methods for now:
const char* Name() const { return "TwoPartComparator"; }
void FindShortestSeparator(std::string*, const rocksdb::Slice&amp;) const { }
void FindShortSuccessor(std::string*) const { }
};
</pre>
Now create a database using this custom comparator:
<p>
<pre>
TwoPartComparator cmp;
rocksdb::DB* db;
rocksdb::Options options;
options.create_if_missing = true;
options.comparator = &amp;cmp;
rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
...
</pre>
<h2>Backwards compatibility</h2>
<p>
The result of the comparator's <code>Name</code> method is attached to the
database when it is created, and is checked on every subsequent
database open. If the name changes, the <code>rocksdb::DB::Open</code> call will
fail. Therefore, change the name if and only if the new key format
and comparison function are incompatible with existing databases, and
it is ok to discard the contents of all existing databases.
<p>
You can however still gradually evolve your key format over time with
a little bit of pre-planning. For example, you could store a version
number at the end of each key (one byte should suffice for most uses).
When you wish to switch to a new key format (e.g., adding an optional
third part to the keys processed by <code>TwoPartComparator</code>),
(a) keep the same comparator name (b) increment the version number
for new keys (c) change the comparator function so it uses the
version numbers found in the keys to decide how to interpret them.
<p>
<h1>MemTable and Table factories</h1>
<p>
By default, we keep the data in memory in skiplist memtable and the data
on disk in a table format described here:
<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
RocksDB Table Format</a>.
<p>
Since one of the goals of RocksDB is to have
different parts of the system easily pluggable, we support different
implementations of both memtable and table format. You can supply
your own memtable factory by setting <code>Options::memtable_factory</code>
and your own table factory by setting <code>Options::table_factory</code>.
For available memtable factories, please refer to
<code>rocksdb/memtablerep.h</code> and for table factores to
<code>rocksdb/table.h</code>. These features are both in active development
and please be wary of any API changes that might break your application
going forward.
<p>
You can also read more about memtables here:
<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#memtables">
Memtables wiki
</a>
<p>
<h1>Performance</h1>
<p>
Performance can be tuned by changing the default values of the
types defined in <code>include/rocksdb/options.h</code>.
<p>
<h2>Block size</h2>
<p>
<code>rocksdb</code> groups adjacent keys together into the same block and such a
block is the unit of transfer to and from persistent storage. The
default block size is approximately 4096 uncompressed bytes.
Applications that mostly do bulk scans over the contents of the
database may wish to increase this size. Applications that do a lot
of point reads of small values may wish to switch to a smaller block
size if performance measurements indicate an improvement. There isn't
much benefit in using blocks smaller than one kilobyte, or larger than
a few megabytes. Also note that compression will be more effective
with larger block sizes. To change block size parameter, use
<code>Options::block_size</code>.
<p>
<h2>Write buffer</h2>
<p>
<code>Options::write_buffer_size</code> specifies the amount of data
to build up in memory before converting to a sorted on-disk file.
Larger values increase performance, especially during bulk loads.
Up to max_write_buffer_number write buffers may be held in memory
at the same time,
so you may wish to adjust this parameter to control memory usage.
Also, a larger write buffer will result in a longer recovery time
the next time the database is opened.
Related option is
<code>Options::max_write_buffer_number</code>, which is maximum number
of write buffers that are built up in memory. The default is 2, so that
when 1 write buffer is being flushed to storage, new writes can continue
to the other write buffer.
<code>Options::min_write_buffer_number_to_merge</code> is the minimum number
of write buffers that will be merged together before writing to storage.
If set to 1, then all write buffers are fushed to L0 as individual files and
this increases read amplification because a get request has to check in all
of these files. Also, an in-memory merge may result in writing lesser
data to storage if there are duplicate records in each of these
individual write buffers. Default: 1
<p>
<h2>Compression</h2>
<p>
Each block is individually compressed before being written to
persistent storage. Compression is on by default since the default
compression method is very fast, and is automatically disabled for
uncompressible data. In rare cases, applications may want to disable
compression entirely, but should only do so if benchmarks show a
performance improvement:
<p>
<pre>
rocksdb::Options options;
options.compression = rocksdb::kNoCompression;
... rocksdb::DB::Open(options, name, ...) ....
</pre>
<h2>Cache</h2>
<p>
The contents of the database are stored in a set of files in the
filesystem and each file stores a sequence of compressed blocks. If
<code>options.block_cache</code> is non-NULL, it is used to cache frequently
used uncompressed block contents. If <code>options.block_cache_compressed</code>
is non-NULL, it is used to cache frequently used compressed blocks. Compressed
cache is an alternative to OS cache, which also caches compressed blocks. If
compressed cache is used, the OS cache will be disabled automatically by setting
<code>options.allow_os_buffer</code> to false.
<p>
<pre>
#include "rocksdb/cache.h"
rocksdb::Options options;
options.block_cache = rocksdb::NewLRUCache(100 * 1048576); // 100MB uncompressed cache
options.block_cache_compressed = rocksdb::NewLRUCache(100 * 1048576); // 100MB compressed cache
rocksdb::DB* db;
rocksdb::DB::Open(options, name, &db);
... use the db ...
delete db
delete options.block_cache;
delete options.block_cache_compressed;
</pre>
<p>
When performing a bulk read, the application may wish to disable
caching so that the data processed by the bulk read does not end up
displacing most of the cached contents. A per-iterator option can be
used to achieve this:
<p>
<pre>
rocksdb::ReadOptions options;
options.fill_cache = false;
rocksdb::Iterator* it = db-&gt;NewIterator(options);
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
...
}
</pre>
<p>
You can also disable block cache by setting <code>options.no_block_cache</code>
to true.
<h2>Key Layout</h2>
<p>
Note that the unit of disk transfer and caching is a block. Adjacent
keys (according to the database sort order) will usually be placed in
the same block. Therefore the application can improve its performance
by placing keys that are accessed together near each other and placing
infrequently used keys in a separate region of the key space.
<p>
For example, suppose we are implementing a simple file system on top
of <code>rocksdb</code>. The types of entries we might wish to store are:
<p>
<pre>
filename -&gt; permission-bits, length, list of file_block_ids
file_block_id -&gt; data
</pre>
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
<code>file_block_id</code> keys with a different letter (say '0') so that scans
over just the metadata do not force us to fetch and cache bulky file
contents.
<p>
<h2>Filters</h2>
<p>
Because of the way <code>rocksdb</code> data is organized on disk,
a single <code>Get()</code> call may involve multiple reads from disk.
The optional <code>FilterPolicy</code> mechanism can be used to reduce
the number of disk reads substantially.
<pre>
rocksdb::Options options;
options.filter_policy = NewBloomFilter(10);
rocksdb::DB* db;
rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
... use the database ...
delete db;
delete options.filter_policy;
</pre>
The preceding code associates a
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
based filtering policy with the database. Bloom filter based
filtering relies on keeping some number of bits of data in memory per
key (in this case 10 bits per key since that is the argument we passed
to NewBloomFilter). This filter will reduce the number of unnecessary
disk reads needed for <code>Get()</code> calls by a factor of
approximately a 100. Increasing the bits per key will lead to a
larger reduction at the cost of more memory usage. We recommend that
applications whose working set does not fit in memory and that do a
lot of random reads set a filter policy.
<p>
If you are using a custom comparator, you should ensure that the filter
policy you are using is compatible with your comparator. For example,
consider a comparator that ignores trailing spaces when comparing keys.
<code>NewBloomFilter</code> must not be used with such a comparator.
Instead, the application should provide a custom filter policy that
also ignores trailing spaces. For example:
<pre>
class CustomFilterPolicy : public rocksdb::FilterPolicy {
private:
FilterPolicy* builtin_policy_;
public:
CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
~CustomFilterPolicy() { delete builtin_policy_; }
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
// Use builtin bloom filter code after removing trailing spaces
std::vector&lt;Slice&gt; trimmed(n);
for (int i = 0; i &lt; n; i++) {
trimmed[i] = RemoveTrailingSpaces(keys[i]);
}
return builtin_policy_-&gt;CreateFilter(&amp;trimmed[i], n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
// Use builtin bloom filter code after removing trailing spaces
return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
}
};
</pre>
<p>
Advanced applications may provide a filter policy that does not use
a bloom filter but uses some other mechanism for summarizing a set
of keys. See <code>rocksdb/filter_policy.h</code> for detail.
<p>
<h1>Checksums</h1>
<p>
<code>rocksdb</code> associates checksums with all data it stores in the file system.
There are two separate controls provided over how aggressively these
checksums are verified:
<p>
<ul>
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
checksum verification of all data that is read from the file system on
behalf of a particular read. By default, no such verification is
done.
<p>
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
database to make the database implementation raise an error as soon as
it detects an internal corruption. Depending on which portion of the
database has been corrupted, the error may be raised when the database
is opened, or later by another database operation. By default,
paranoid checking is off so that the database can be used even if
parts of its persistent storage have been corrupted.
<p>
If a database is corrupted (perhaps it cannot be opened when
paranoid checking is turned on), the <code>rocksdb::RepairDB</code> function
may be used to recover as much of the data as possible.
<p>
</ul>
<p>
<h1>Compaction</h1>
<p>
You can read more on Compactions here:
<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#multi-threaded-compactions">
Multi-threaded compactions
</a>
<p>
Here we give overview of the options that impact behavior of Compactions:
<ul>
<p>
<li><code>Options::compaction_style</code> - RocksDB currently supports two
compaction algorithms - Universal style and Level style. This option switches
between the two. Can be kCompactionStyleUniversal or kCompactionStyleLevel.
If this is kCompactionStyleUniversal, then you can configure universal style
parameters with <code>Options::compaction_options_universal</code>.
<p>
<li><code>Options::disable_auto_compactions</code> - Disable automatic compactions.
Manual compactions can still be issued on this database.
<p>
<li><code>Options::compaction_filter</code> - Allows an application to modify/delete
a key-value during background compaction. The client must provide
compaction_filter_factory if it requires a new compaction filter to be used
for different compaction processes. Client should specify only one of filter
or factory.
<p>
<li><code>Options::compaction_filter_factory</code> - a factory that provides
compaction filter objects which allow an application to modify/delete a
key-value during background compaction.
</ul>
<p>
Other options impacting performance of compactions and when they get triggered
are:
<ul>
<p>
<li> <code>Options::access_hint_on_compaction_start</code> - Specify the file access
pattern once a compaction is started. It will be applied to all input files of a compaction. Default: NORMAL
<p>
<li> <code>Options::level0_file_num_compaction_trigger</code> - Number of files to trigger level-0 compaction.
A negative value means that level-0 compaction will not be triggered by number of files at all.
<p>
<li> <code>Options::max_mem_compaction_level</code> - Maximum level to which a new compacted memtable is pushed if it
does not create overlap. We try to push to level 2 to avoid the relatively expensive level 0=>1 compactions and to avoid some
expensive manifest file operations. We do not push all the way to the largest level since that can generate a lot of wasted disk
space if the same key space is being repeatedly overwritten.
<p>
<li> <code>Options::target_file_size_base</code> and <code>Options::target_file_size_multiplier</code> -
Target file size for compaction. target_file_size_base is per-file size for level-1.
Target file size for level L can be calculated by target_file_size_base * (target_file_size_multiplier ^ (L-1))
For example, if target_file_size_base is 2MB and target_file_size_multiplier is 10, then each file on level-1 will
be 2MB, and each file on level 2 will be 20MB, and each file on level-3 will be 200MB. Default target_file_size_base is 2MB
and default target_file_size_multiplier is 1.
<p>
<li> <code>Options::expanded_compaction_factor</code> - Maximum number of bytes in all compacted files. We avoid expanding
the lower level file set of a compaction if it would make the total compaction cover more than
(expanded_compaction_factor * targetFileSizeLevel()) many bytes.
<p>
<li> <code>Options::source_compaction_factor</code> - Maximum number of bytes in all source files to be compacted in a
single compaction run. We avoid picking too many files in the source level so that we do not exceed the total source bytes
for compaction to exceed (source_compaction_factor * targetFileSizeLevel()) many bytes.
Default:1, i.e. pick maxfilesize amount of data as the source of a compaction.
<p>
<li> <code>Options::max_grandparent_overlap_factor</code> - Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
stop building a single file in a level->level+1 compaction.
<p>
<li> <code>Options::disable_seek_compaction</code> - Disable compaction triggered by seek.
With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache
(which is true if max_open_files is large).
<p>
<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs, submitted to
the default LOW priority thread pool
</ul>
<p>
You can learn more about all of those options in <code>rocksdb/options.h</code>
<h2> Universal style compaction specific settings</h2>
<p>
If you're using Universal style compaction, there is an object <code>CompactionOptionsUniversal</code>
that hold all the different options for that compaction. The exact definition is in
<code>rocksdb/universal_compaction.h</code> and you can set it in <code>Options::compaction_options_universal</code>.
Here we give short overview of options in <code>CompactionOptionsUniversal</code>:
<ul>
<p>
<li> <code>CompactionOptionsUniversal::size_ratio</code> - Percentage flexibilty while comparing file size. If the candidate file(s)
size is 1% smaller than the next file's size, then include next file into
this candidate set. Default: 1
<p>
<li> <code>CompactionOptionsUniversal::min_merge_width</code> - The minimum number of files in a single compaction run. Default: 2
<p>
<li> <code>CompactionOptionsUniversal::max_merge_width</code> - The maximum number of files in a single compaction run. Default: UINT_MAX
<p>
<li> <code>CompactionOptionsUniversal::max_size_amplification_percent</code> - The size amplification is defined as the amount (in percentage) of
additional storage needed to store a single byte of data in the database. For example, a size amplification of 2% means that a database that
contains 100 bytes of user-data may occupy upto 102 bytes of physical storage. By this definition, a fully compacted database has
a size amplification of 0%. Rocksdb uses the following heuristic to calculate size amplification: it assumes that all files excluding
the earliest file contribute to the size amplification. Default: 200, which means that a 100 byte database could require upto
300 bytes of storage.
<p>
<li> <code>CompactionOptionsUniversal::compression_size_percent</code> - If this option is set to be -1 (the default value), all the output files
will follow compression type specified. If this option is not negative, we will try to make sure compressed
size is just above this value. In normal cases, at least this percentage
of data will be compressed.
When we are compacting to a new file, here is the criteria whether
it needs to be compressed: assuming here are the list of files sorted
by generation time: [ A1...An B1...Bm C1...Ct ],
where A1 is the newest and Ct is the oldest, and we are going to compact
B1...Bm, we calculate the total size of all the files as total_size, as
well as the total size of C1...Ct as total_C, the compaction output file
will be compressed iff total_C / total_size < this percentage
<p>
<li> <code>CompactionOptionsUniversal::stop_style</code> - The algorithm used to stop picking files into a single compaction run.
Can be kCompactionStopStyleSimilarSize (pick files of similar size) or kCompactionStopStyleTotalSize (total size of picked files > next file).
Default: kCompactionStopStyleTotalSize
</ul>
<h1>Thread pools</h1>
<p>
A thread pool is associated with Env environment object. The client has to create a thread pool by setting the number of background
threads using method <code>Env::SetBackgroundThreads()</code> defined in <code>rocksdb/env.h</code>.
We use the thread pool for compactions and memtable flushes.
Since memtable flushes are in critical code path (stalling memtable flush can stall writes, increasing p99), we suggest
having two thread pools - with priorities HIGH and LOW. Memtable flushes can be set up to be scheduled on HIGH thread pool.
There are two options available for configuration of background compactions and flushes:
<ul>
<p>
<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs,
submitted to the default LOW priority thread pool
<p>
<li> <code>Options::max_background_flushes</code> - Maximum number of concurrent background memtable flush jobs, submitted to
the HIGH priority thread pool. By default, all background jobs (major compaction and memtable flush) go
to the LOW priority pool. If this option is set to a positive number, memtable flush jobs will be submitted to the HIGH priority pool.
It is important when the same Env is shared by multiple db instances. Without a separate pool, long running major compaction jobs could
potentially block memtable flush jobs of other db instances, leading to unnecessary Put stalls.
</ul>
<p>
<pre>
#include "rocksdb/env.h"
#include "rocksdb/db.h"
auto env = rocksdb::Env::Default();
env->SetBackgroundThreads(2, rocksdb::Env::LOW);
env->SetBackgroundThreads(1, rocksdb::Env::HIGH);
rocksdb::DB* db;
rocksdb::Options options;
options.env = env;
options.max_background_compactions = 2;
options.max_background_flushes = 1;
rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
assert(status.ok());
...
</pre>
<h1>Approximate Sizes</h1>
<p>
The <code>GetApproximateSizes</code> method can used to get the approximate
number of bytes of file system space used by one or more key ranges.
<p>
<pre>
rocksdb::Range ranges[2];
ranges[0] = rocksdb::Range("a", "c");
ranges[1] = rocksdb::Range("x", "z");
uint64_t sizes[2];
rocksdb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
</pre>
The preceding call will set <code>sizes[0]</code> to the approximate number of
bytes of file system space used by the key range <code>[a..c)</code> and
<code>sizes[1]</code> to the approximate number of bytes used by the key range
<code>[x..z)</code>.
<p>
<h1>Environment</h1>
<p>
All file operations (and other operating system calls) issued by the
<code>rocksdb</code> implementation are routed through a <code>rocksdb::Env</code> object.
Sophisticated clients may wish to provide their own <code>Env</code>
implementation to get better control. For example, an application may
introduce artificial delays in the file IO paths to limit the impact
of <code>rocksdb</code> on other activities in the system.
<p>
<pre>
class SlowEnv : public rocksdb::Env {
.. implementation of the Env interface ...
};
SlowEnv env;
rocksdb::Options options;
options.env = &amp;env;
Status s = rocksdb::DB::Open(options, ...);
</pre>
<h1>Porting</h1>
<p>
<code>rocksdb</code> may be ported to a new platform by providing platform
specific implementations of the types/methods/functions exported by
<code>rocksdb/port/port.h</code>. See <code>rocksdb/port/port_example.h</code> for more
details.
<p>
In addition, the new platform may need a new default <code>rocksdb::Env</code>
implementation. See <code>rocksdb/util/env_posix.h</code> for an example.
<h1>Statistics</h1>
<p>
To be able to efficiently tune your application, it is always helpful if you
have access to usage statistics. You can collect those statistics by setting
<code>Options::table_stats_collectors</code> or
<code>Options::statistics</code>. For more information, refer to
<code>rocksdb/table_stats.h</code> and <code>rocksdb/statistics.h</code>.
These should not add significant overhead to your application and we
recommend exporting them to other monitoring tools.
<h1>Purging WAL files</h1>
<p>
By default, old write-ahead logs are deleted automatically when they fall out
of scope and application doesn't need them anymore. There are options that
enable the user to archive the logs and then delete them lazily, either in
TTL fashion or based on size limit.
The options are <code>Options::WAL_ttl_seconds</code> and
<code>Options::WAL_size_limit_MB</code>. Here is how they can be used:
<ul>
<li>
<p>
If both set to 0, logs will be deleted asap and will never get into the archive.
<li>
<p>
If <code>WAL_ttl_seconds</code> is 0 and WAL_size_limit_MB is not 0, WAL
files will be checked every 10 min and if total size is greater then
<code>WAL_size_limit_MB</code>, they will be deleted starting with the
earliest until size_limit is met. All empty files will be deleted.
<li>
<p>
If <code>WAL_ttl_seconds</code> is not 0 and WAL_size_limit_MB is 0, then
WAL files will be checked every <code>WAL_ttl_seconds / 2</code> and those
that are older than WAL_ttl_seconds will be deleted.
<li>
<p>
If both are not 0, WAL files will be checked every 10 min and both
checks will be performed with ttl being first.
</ul>
<h1>Other Information</h1>
<p>
Details about the <code>rocksdb</code> implementation may be found in
the following documents:
<ul>
<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide">
RocksDB Architecture Guide</a>
<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
Format of an immutable Table file</a>
<li> <a href="log_format.txt">Format of a log file</a>
</ul>
</body>
</html>

75
doc/log_format.txt Normal file
View File

@ -0,0 +1,75 @@
The log file contents are a sequence of 32KB blocks. The only
exception is that the tail of the file may contain a partial block.
Each block consists of a sequence of records:
block := record* trailer?
record :=
checksum: uint32 // crc32c of type and data[]
length: uint16
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
data: uint8[length]
A record never starts within the last six bytes of a block (since it
won't fit). Any leftover bytes here form the trailer, which must
consist entirely of zero bytes and must be skipped by readers.
Aside: if exactly seven bytes are left in the current block, and a new
non-zero length record is added, the writer must emit a FIRST record
(which contains zero bytes of user data) to fill up the trailing seven
bytes of the block and then emit all of the user data in subsequent
blocks.
More types may be added in the future. Some Readers may skip record
types they do not understand, others may report that some data was
skipped.
FULL == 1
FIRST == 2
MIDDLE == 3
LAST == 4
The FULL record contains the contents of an entire user record.
FIRST, MIDDLE, LAST are types used for user records that have been
split into multiple fragments (typically because of block boundaries).
FIRST is the type of the first fragment of a user record, LAST is the
type of the last fragment of a user record, and MID is the type of all
interior fragments of a user record.
Example: consider a sequence of user records:
A: length 1000
B: length 97270
C: length 8000
A will be stored as a FULL record in the first block.
B will be split into three fragments: first fragment occupies the rest
of the first block, second fragment occupies the entirety of the
second block, and the third fragment occupies a prefix of the third
block. This will leave six bytes free in the third block, which will
be left empty as the trailer.
C will be stored as a FULL record in the fourth block.
===================
Some benefits over the recordio format:
(1) We do not need any heuristics for resyncing - just go to next
block boundary and scan. If there is a corruption, skip to the next
block. As a side-benefit, we do not get confused when part of the
contents of one log file are embedded as a record inside another log
file.
(2) Splitting at approximate boundaries (e.g., for mapreduce) is
simple: find the next block boundary and skip records until we
hit a FULL or FIRST record.
(3) We do not need extra buffering for large records.
Some downsides compared to recordio format:
(1) No packing of tiny records. This could be fixed by adding a new
record type, so it is a shortcoming of the current implementation,
not necessarily the format.
(2) No compression. Again, this could be fixed by adding new record types.

BIN
doc/rockslogo.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 134 KiB

BIN
doc/rockslogo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

26
hdfs/README Normal file
View File

@ -0,0 +1,26 @@
This directory contains the hdfs extensions needed to make rocksdb store
files in HDFS.
The hdfs.h file is copied from the Apache Hadoop 1.0 source code.
It defines the libhdfs library
(http://hadoop.apache.org/common/docs/r0.20.2/libhdfs.html) to access
data in HDFS. The libhdfs.a is copied from the Apache Hadoop 1.0 build.
It implements the API defined in hdfs.h. If your hadoop cluster is running
a different hadoop release, then install these two files manually from your
hadoop distribution and then recompile rocksdb.
The env_hdfs.h file defines the rocksdb objects that are needed to talk to an
underlying filesystem.
If you want to compile rocksdb with hdfs support, please set the following
enviroment variables appropriately:
USE_HDFS=1
JAVA_HOME=/usr/local/jdk-6u22-64
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs
make clean all db_bench
To run dbbench,
set CLASSPATH to include your hadoop distribution
db_bench --hdfs="hdfs://hbaseudbperf001.snc1.facebook.com:9000"

302
hdfs/env_hdfs.h Normal file
View File

@ -0,0 +1,302 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include <algorithm>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <iostream>
#include "rocksdb/env.h"
#include "rocksdb/status.h"
#ifdef USE_HDFS
#include "hdfs/hdfs.h"
namespace rocksdb {
static const std::string kProto = "hdfs://";
static const std::string pathsep = "/";
// Thrown during execution when there is an issue with the supplied
// arguments.
class HdfsUsageException : public std::exception { };
// A simple exception that indicates something went wrong that is not
// recoverable. The intention is for the message to be printed (with
// nothing else) and the process terminate.
class HdfsFatalException : public std::exception {
public:
explicit HdfsFatalException(const std::string& s) : what_(s) { }
virtual ~HdfsFatalException() throw() { }
virtual const char* what() const throw() {
return what_.c_str();
}
private:
const std::string what_;
};
//
// The HDFS environment for rocksdb. This class overrides all the
// file/dir access methods and delegates the thread-mgmt methods to the
// default posix environment.
//
class HdfsEnv : public Env {
public:
HdfsEnv(const std::string& fsname) : fsname_(fsname) {
posixEnv = Env::Default();
fileSys_ = connectToPath(fsname_);
}
virtual ~HdfsEnv() {
fprintf(stderr, "Destroying HdfsEnv::Default()\n");
hdfsDisconnect(fileSys_);
}
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result);
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result);
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result);
virtual Status NewRandomRWFile(const std::string& fname,
unique_ptr<RandomRWFile>* result,
const EnvOptions& options);
virtual bool FileExists(const std::string& fname);
virtual Status GetChildren(const std::string& path,
std::vector<std::string>* result);
virtual Status DeleteFile(const std::string& fname);
virtual Status CreateDir(const std::string& name);
virtual Status CreateDirIfMissing(const std::string& name);
virtual Status DeleteDir(const std::string& name);
virtual Status GetFileSize(const std::string& fname, uint64_t* size);
virtual Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime);
virtual Status RenameFile(const std::string& src, const std::string& target);
virtual Status LockFile(const std::string& fname, FileLock** lock);
virtual Status UnlockFile(FileLock* lock);
virtual Status NewLogger(const std::string& fname, Logger** result);
virtual void Schedule(void (*function)(void* arg), void* arg,
Priority pri = LOW) {
posixEnv->Schedule(function, arg, pri);
}
virtual void StartThread(void (*function)(void* arg), void* arg) {
posixEnv->StartThread(function, arg);
}
virtual Status GetTestDirectory(std::string* path) {
return posixEnv->GetTestDirectory(path);
}
virtual uint64_t NowMicros() {
return posixEnv->NowMicros();
}
virtual void SleepForMicroseconds(int micros) {
posixEnv->SleepForMicroseconds(micros);
}
virtual Status GetHostName(char* name, uint64_t len) {
return posixEnv->GetHostName(name, len);
}
virtual Status GetCurrentTime(int64_t* unix_time) {
return posixEnv->GetCurrentTime(unix_time);
}
virtual Status GetAbsolutePath(const std::string& db_path,
std::string* output_path) {
return posixEnv->GetAbsolutePath(db_path, output_path);
}
virtual void SetBackgroundThreads(int number, Priority pri = LOW) {
posixEnv->SetBackgroundThreads(number, pri);
}
virtual std::string TimeToString(uint64_t number) {
return posixEnv->TimeToString(number);
}
static uint64_t gettid() {
assert(sizeof(pthread_t) <= sizeof(uint64_t));
return (uint64_t)pthread_self();
}
private:
std::string fsname_; // string of the form "hdfs://hostname:port/"
hdfsFS fileSys_; // a single FileSystem object for all files
Env* posixEnv; // This object is derived from Env, but not from
// posixEnv. We have posixnv as an encapsulated
// object here so that we can use posix timers,
// posix threads, etc.
/**
* If the URI is specified of the form hdfs://server:port/path,
* then connect to the specified cluster
* else connect to default.
*/
hdfsFS connectToPath(const std::string& uri) {
if (uri.empty()) {
return NULL;
}
if (uri.find(kProto) != 0) {
// uri doesn't start with hdfs:// -> use default:0, which is special
// to libhdfs.
return hdfsConnectNewInstance("default", 0);
}
const std::string hostport = uri.substr(kProto.length());
std::vector <std::string> parts;
split(hostport, ':', parts);
if (parts.size() != 2) {
throw HdfsFatalException("Bad uri for hdfs " + uri);
}
// parts[0] = hosts, parts[1] = port/xxx/yyy
std::string host(parts[0]);
std::string remaining(parts[1]);
int rem = remaining.find(pathsep);
std::string portStr = (rem == 0 ? remaining :
remaining.substr(0, rem));
tPort port;
port = atoi(portStr.c_str());
if (port == 0) {
throw HdfsFatalException("Bad host-port for hdfs " + uri);
}
hdfsFS fs = hdfsConnectNewInstance(host.c_str(), port);
return fs;
}
void split(const std::string &s, char delim,
std::vector<std::string> &elems) {
elems.clear();
size_t prev = 0;
size_t pos = s.find(delim);
while (pos != std::string::npos) {
elems.push_back(s.substr(prev, pos));
prev = pos + 1;
pos = s.find(delim, prev);
}
elems.push_back(s.substr(prev, s.size()));
}
};
} // namespace rocksdb
#else // USE_HDFS
namespace rocksdb {
static const Status notsup;
class HdfsEnv : public Env {
public:
HdfsEnv(const std::string& fsname) {
fprintf(stderr, "You have not build rocksdb with HDFS support\n");
fprintf(stderr, "Please see hdfs/README for details\n");
throw new std::exception();
}
virtual ~HdfsEnv() {
}
virtual Status NewSequentialFile(const std::string& fname,
unique_ptr<SequentialFile>* result,
const EnvOptions& options);
virtual Status NewRandomAccessFile(const std::string& fname,
unique_ptr<RandomAccessFile>* result,
const EnvOptions& options) {
return notsup;
}
virtual Status NewWritableFile(const std::string& fname,
unique_ptr<WritableFile>* result,
const EnvOptions& options) {
return notsup;
}
virtual Status NewRandomRWFile(const std::string& fname,
unique_ptr<RandomRWFile>* result,
const EnvOptions& options) {
return notsup;
}
virtual bool FileExists(const std::string& fname){return false;}
virtual Status GetChildren(const std::string& path,
std::vector<std::string>* result){return notsup;}
virtual Status DeleteFile(const std::string& fname){return notsup;}
virtual Status CreateDir(const std::string& name){return notsup;}
virtual Status CreateDirIfMissing(const std::string& name){return notsup;}
virtual Status DeleteDir(const std::string& name){return notsup;}
virtual Status GetFileSize(const std::string& fname, uint64_t* size){return notsup;}
virtual Status GetFileModificationTime(const std::string& fname,
uint64_t* time) {
return notsup;
}
virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;}
virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;}
virtual Status UnlockFile(FileLock* lock){return notsup;}
virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result){return notsup;}
virtual void Schedule(void (*function)(void* arg), void* arg,
Priority pri = LOW) {}
virtual void StartThread(void (*function)(void* arg), void* arg) {}
virtual Status GetTestDirectory(std::string* path) {return notsup;}
virtual uint64_t NowMicros() {return 0;}
virtual void SleepForMicroseconds(int micros) {}
virtual Status GetHostName(char* name, uint64_t len) {return notsup;}
virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;}
virtual Status GetAbsolutePath(const std::string& db_path,
std::string* outputpath) {return notsup;}
virtual void SetBackgroundThreads(int number, Priority pri = LOW) {}
virtual std::string TimeToString(uint64_t number) { return "";}
};
}
#endif // USE_HDFS

477
hdfs/hdfs.h Normal file
View File

@ -0,0 +1,477 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#ifndef LIBHDFS_HDFS_H
#define LIBHDFS_HDFS_H
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <errno.h>
#include <jni.h>
#ifndef O_RDONLY
#define O_RDONLY 1
#endif
#ifndef O_WRONLY
#define O_WRONLY 2
#endif
#ifndef EINTERNAL
#define EINTERNAL 255
#endif
/** All APIs set errno to meaningful values */
#ifdef __cplusplus
extern "C" {
#endif
/**
* Some utility decls used in libhdfs.
*/
typedef int32_t tSize; /// size of data for read/write io ops
typedef time_t tTime; /// time type in seconds
typedef int64_t tOffset;/// offset within the file
typedef uint16_t tPort; /// port
typedef enum tObjectKind {
kObjectKindFile = 'F',
kObjectKindDirectory = 'D',
} tObjectKind;
/**
* The C reflection of org.apache.org.hadoop.FileSystem .
*/
typedef void* hdfsFS;
/**
* The C equivalent of org.apache.org.hadoop.FSData(Input|Output)Stream .
*/
enum hdfsStreamType
{
UNINITIALIZED = 0,
INPUT = 1,
OUTPUT = 2,
};
/**
* The 'file-handle' to a file in hdfs.
*/
struct hdfsFile_internal {
void* file;
enum hdfsStreamType type;
};
typedef struct hdfsFile_internal* hdfsFile;
/**
* hdfsConnectAsUser - Connect to a hdfs file system as a specific user
* Connect to the hdfs.
* @param host A string containing either a host name, or an ip address
* of the namenode of a hdfs cluster. 'host' should be passed as NULL if
* you want to connect to local filesystem. 'host' should be passed as
* 'default' (and port as 0) to used the 'configured' filesystem
* (core-site/core-default.xml).
* @param port The port on which the server is listening.
* @param user the user name (this is hadoop domain user). Or NULL is equivelant to hhdfsConnect(host, port)
* @param groups the groups (these are hadoop domain groups)
* @return Returns a handle to the filesystem or NULL on error.
*/
hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
/**
* hdfsConnect - Connect to a hdfs file system.
* Connect to the hdfs.
* @param host A string containing either a host name, or an ip address
* of the namenode of a hdfs cluster. 'host' should be passed as NULL if
* you want to connect to local filesystem. 'host' should be passed as
* 'default' (and port as 0) to used the 'configured' filesystem
* (core-site/core-default.xml).
* @param port The port on which the server is listening.
* @return Returns a handle to the filesystem or NULL on error.
*/
hdfsFS hdfsConnect(const char* host, tPort port);
/**
* This are the same as hdfsConnectAsUser except that every invocation returns a new FileSystem handle.
* Applications should call a hdfsDisconnect for every call to hdfsConnectAsUserNewInstance.
*/
hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
hdfsFS hdfsConnectNewInstance(const char* host, tPort port);
hdfsFS hdfsConnectPath(const char* uri);
/**
* hdfsDisconnect - Disconnect from the hdfs file system.
* Disconnect from hdfs.
* @param fs The configured filesystem handle.
* @return Returns 0 on success, -1 on error.
*/
int hdfsDisconnect(hdfsFS fs);
/**
* hdfsOpenFile - Open a hdfs file in given mode.
* @param fs The configured filesystem handle.
* @param path The full path to the file.
* @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNCAT),
* O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno equal ENOTSUP.
* @param bufferSize Size of buffer for read/write - pass 0 if you want
* to use the default configured values.
* @param replication Block replication - pass 0 if you want to use
* the default configured values.
* @param blocksize Size of block - pass 0 if you want to use the
* default configured values.
* @return Returns the handle to the open file or NULL on error.
*/
hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
int bufferSize, short replication, tSize blocksize);
/**
* hdfsCloseFile - Close an open file.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @return Returns 0 on success, -1 on error.
*/
int hdfsCloseFile(hdfsFS fs, hdfsFile file);
/**
* hdfsExists - Checks if a given path exsits on the filesystem
* @param fs The configured filesystem handle.
* @param path The path to look for
* @return Returns 0 on exists, 1 on non-exists, -1/-2 on error.
*/
int hdfsExists(hdfsFS fs, const char *path);
/**
* hdfsSeek - Seek to given offset in file.
* This works only for files opened in read-only mode.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @param desiredPos Offset into the file to seek into.
* @return Returns 0 on success, -1 on error.
*/
int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos);
/**
* hdfsTell - Get the current offset in the file, in bytes.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @return Current offset, -1 on error.
*/
tOffset hdfsTell(hdfsFS fs, hdfsFile file);
/**
* hdfsRead - Read data from an open file.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @param buffer The buffer to copy read bytes into.
* @param length The length of the buffer.
* @return Returns the number of bytes actually read, possibly less
* than than length;-1 on error.
*/
tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length);
/**
* hdfsPread - Positional read of data from an open file.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @param position Position from which to read
* @param buffer The buffer to copy read bytes into.
* @param length The length of the buffer.
* @return Returns the number of bytes actually read, possibly less than
* than length;-1 on error.
*/
tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position,
void* buffer, tSize length);
/**
* hdfsWrite - Write data into an open file.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @param buffer The data.
* @param length The no. of bytes to write.
* @return Returns the number of bytes written, -1 on error.
*/
tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer,
tSize length);
/**
* hdfsWrite - Flush the data.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @return Returns 0 on success, -1 on error.
*/
int hdfsFlush(hdfsFS fs, hdfsFile file);
/**
* hdfsSync - Sync the data to persistent store.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @return Returns 0 on success, -1 on error.
*/
int hdfsSync(hdfsFS fs, hdfsFile file);
/**
* hdfsGetNumReplicasInPipeline - get number of remaining replicas in
* pipeline
* @param fs The configured filesystem handle
* @param file the file handle
* @return returns the # of datanodes in the write pipeline; -1 on error
*/
int hdfsGetNumCurrentReplicas(hdfsFS, hdfsFile file);
/**
* hdfsAvailable - Number of bytes that can be read from this
* input stream without blocking.
* @param fs The configured filesystem handle.
* @param file The file handle.
* @return Returns available bytes; -1 on error.
*/
int hdfsAvailable(hdfsFS fs, hdfsFile file);
/**
* hdfsCopy - Copy file from one filesystem to another.
* @param srcFS The handle to source filesystem.
* @param src The path of source file.
* @param dstFS The handle to destination filesystem.
* @param dst The path of destination file.
* @return Returns 0 on success, -1 on error.
*/
int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
/**
* hdfsMove - Move file from one filesystem to another.
* @param srcFS The handle to source filesystem.
* @param src The path of source file.
* @param dstFS The handle to destination filesystem.
* @param dst The path of destination file.
* @return Returns 0 on success, -1 on error.
*/
int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
/**
* hdfsDelete - Delete file.
* @param fs The configured filesystem handle.
* @param path The path of the file.
* @return Returns 0 on success, -1 on error.
*/
int hdfsDelete(hdfsFS fs, const char* path);
/**
* hdfsRename - Rename file.
* @param fs The configured filesystem handle.
* @param oldPath The path of the source file.
* @param newPath The path of the destination file.
* @return Returns 0 on success, -1 on error.
*/
int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath);
/**
* hdfsGetWorkingDirectory - Get the current working directory for
* the given filesystem.
* @param fs The configured filesystem handle.
* @param buffer The user-buffer to copy path of cwd into.
* @param bufferSize The length of user-buffer.
* @return Returns buffer, NULL on error.
*/
char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize);
/**
* hdfsSetWorkingDirectory - Set the working directory. All relative
* paths will be resolved relative to it.
* @param fs The configured filesystem handle.
* @param path The path of the new 'cwd'.
* @return Returns 0 on success, -1 on error.
*/
int hdfsSetWorkingDirectory(hdfsFS fs, const char* path);
/**
* hdfsCreateDirectory - Make the given file and all non-existent
* parents into directories.
* @param fs The configured filesystem handle.
* @param path The path of the directory.
* @return Returns 0 on success, -1 on error.
*/
int hdfsCreateDirectory(hdfsFS fs, const char* path);
/**
* hdfsSetReplication - Set the replication of the specified
* file to the supplied value
* @param fs The configured filesystem handle.
* @param path The path of the file.
* @return Returns 0 on success, -1 on error.
*/
int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication);
/**
* hdfsFileInfo - Information about a file/directory.
*/
typedef struct {
tObjectKind mKind; /* file or directory */
char *mName; /* the name of the file */
tTime mLastMod; /* the last modification time for the file in seconds */
tOffset mSize; /* the size of the file in bytes */
short mReplication; /* the count of replicas */
tOffset mBlockSize; /* the block size for the file */
char *mOwner; /* the owner of the file */
char *mGroup; /* the group associated with the file */
short mPermissions; /* the permissions associated with the file */
tTime mLastAccess; /* the last access time for the file in seconds */
} hdfsFileInfo;
/**
* hdfsListDirectory - Get list of files/directories for a given
* directory-path. hdfsFreeFileInfo should be called to deallocate memory if
* the function returns non-NULL value.
* @param fs The configured filesystem handle.
* @param path The path of the directory.
* @param numEntries Set to the number of files/directories in path.
* @return Returns a dynamically-allocated array of hdfsFileInfo
* objects; NULL if empty or on error.
* on error, numEntries will be -1.
*/
hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path,
int *numEntries);
/**
* hdfsGetPathInfo - Get information about a path as a (dynamically
* allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be
* called when the pointer is no longer needed.
* @param fs The configured filesystem handle.
* @param path The path of the file.
* @return Returns a dynamically-allocated hdfsFileInfo object;
* NULL on error.
*/
hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path);
/**
* hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields)
* @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
* objects.
* @param numEntries The size of the array.
*/
void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
/**
* hdfsGetHosts - Get hostnames where a particular block (determined by
* pos & blocksize) of a file is stored. The last element in the array
* is NULL. Due to replication, a single block could be present on
* multiple hosts.
* @param fs The configured filesystem handle.
* @param path The path of the file.
* @param start The start of the block.
* @param length The length of the block.
* @return Returns a dynamically-allocated 2-d array of blocks-hosts;
* NULL on error.
*/
char*** hdfsGetHosts(hdfsFS fs, const char* path,
tOffset start, tOffset length);
/**
* hdfsFreeHosts - Free up the structure returned by hdfsGetHosts
* @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
* objects.
* @param numEntries The size of the array.
*/
void hdfsFreeHosts(char ***blockHosts);
/**
* hdfsGetDefaultBlockSize - Get the optimum blocksize.
* @param fs The configured filesystem handle.
* @return Returns the blocksize; -1 on error.
*/
tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
/**
* hdfsGetCapacity - Return the raw capacity of the filesystem.
* @param fs The configured filesystem handle.
* @return Returns the raw-capacity; -1 on error.
*/
tOffset hdfsGetCapacity(hdfsFS fs);
/**
* hdfsGetUsed - Return the total raw size of all files in the filesystem.
* @param fs The configured filesystem handle.
* @return Returns the total-size; -1 on error.
*/
tOffset hdfsGetUsed(hdfsFS fs);
/**
* hdfsChown
* @param fs The configured filesystem handle.
* @param path the path to the file or directory
* @param owner this is a string in Hadoop land. Set to null or "" if only setting group
* @param group this is a string in Hadoop land. Set to null or "" if only setting user
* @return 0 on success else -1
*/
int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group);
/**
* hdfsChmod
* @param fs The configured filesystem handle.
* @param path the path to the file or directory
* @param mode the bitmask to set it to
* @return 0 on success else -1
*/
int hdfsChmod(hdfsFS fs, const char* path, short mode);
/**
* hdfsUtime
* @param fs The configured filesystem handle.
* @param path the path to the file or directory
* @param mtime new modification time or 0 for only set access time in seconds
* @param atime new access time or 0 for only set modification time in seconds
* @return 0 on success else -1
*/
int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime);
#ifdef __cplusplus
}
#endif
#endif /*LIBHDFS_HDFS_H*/
/**
* vim: ts=4: sw=4: et
*/

BIN
hdfs/libhdfs.a Normal file

Binary file not shown.

386
helpers/memenv/memenv.cc Normal file
View File

@ -0,0 +1,386 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "helpers/memenv/memenv.h"
#include "rocksdb/env.h"
#include "rocksdb/status.h"
#include "port/port.h"
#include "util/mutexlock.h"
#include <map>
#include <string.h>
#include <string>
#include <vector>
namespace rocksdb {
namespace {
class FileState {
public:
// FileStates are reference counted. The initial reference count is zero
// and the caller must call Ref() at least once.
FileState() : refs_(0), size_(0) {}
// Increase the reference count.
void Ref() {
MutexLock lock(&refs_mutex_);
++refs_;
}
// Decrease the reference count. Delete if this is the last reference.
void Unref() {
bool do_delete = false;
{
MutexLock lock(&refs_mutex_);
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {
do_delete = true;
}
}
if (do_delete) {
delete this;
}
}
uint64_t Size() const { return size_; }
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
if (offset > size_) {
return Status::IOError("Offset greater than file size.");
}
const uint64_t available = size_ - offset;
if (n > available) {
n = available;
}
if (n == 0) {
*result = Slice();
return Status::OK();
}
size_t block = offset / kBlockSize;
size_t block_offset = offset % kBlockSize;
if (n <= kBlockSize - block_offset) {
// The requested bytes are all in the first block.
*result = Slice(blocks_[block] + block_offset, n);
return Status::OK();
}
size_t bytes_to_copy = n;
char* dst = scratch;
while (bytes_to_copy > 0) {
size_t avail = kBlockSize - block_offset;
if (avail > bytes_to_copy) {
avail = bytes_to_copy;
}
memcpy(dst, blocks_[block] + block_offset, avail);
bytes_to_copy -= avail;
dst += avail;
block++;
block_offset = 0;
}
*result = Slice(scratch, n);
return Status::OK();
}
Status Append(const Slice& data) {
const char* src = data.data();
size_t src_len = data.size();
while (src_len > 0) {
size_t avail;
size_t offset = size_ % kBlockSize;
if (offset != 0) {
// There is some room in the last block.
avail = kBlockSize - offset;
} else {
// No room in the last block; push new one.
blocks_.push_back(new char[kBlockSize]);
avail = kBlockSize;
}
if (avail > src_len) {
avail = src_len;
}
memcpy(blocks_.back() + offset, src, avail);
src_len -= avail;
src += avail;
size_ += avail;
}
return Status::OK();
}
private:
// Private since only Unref() should be used to delete it.
~FileState() {
for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
++i) {
delete [] *i;
}
}
// No copying allowed.
FileState(const FileState&);
void operator=(const FileState&);
port::Mutex refs_mutex_;
int refs_; // Protected by refs_mutex_;
// The following fields are not protected by any mutex. They are only mutable
// while the file is being written, and concurrent access is not allowed
// to writable files.
std::vector<char*> blocks_;
uint64_t size_;
enum { kBlockSize = 8 * 1024 };
};
class SequentialFileImpl : public SequentialFile {
public:
explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
file_->Ref();
}
~SequentialFileImpl() {
file_->Unref();
}
virtual Status Read(size_t n, Slice* result, char* scratch) {
Status s = file_->Read(pos_, n, result, scratch);
if (s.ok()) {
pos_ += result->size();
}
return s;
}
virtual Status Skip(uint64_t n) {
if (pos_ > file_->Size()) {
return Status::IOError("pos_ > file_->Size()");
}
const size_t available = file_->Size() - pos_;
if (n > available) {
n = available;
}
pos_ += n;
return Status::OK();
}
private:
FileState* file_;
size_t pos_;
};
class RandomAccessFileImpl : public RandomAccessFile {
public:
explicit RandomAccessFileImpl(FileState* file) : file_(file) {
file_->Ref();
}
~RandomAccessFileImpl() {
file_->Unref();
}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
return file_->Read(offset, n, result, scratch);
}
private:
FileState* file_;
};
class WritableFileImpl : public WritableFile {
public:
WritableFileImpl(FileState* file) : file_(file) {
file_->Ref();
}
~WritableFileImpl() {
file_->Unref();
}
virtual Status Append(const Slice& data) {
return file_->Append(data);
}
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
private:
FileState* file_;
};
class InMemoryEnv : public EnvWrapper {
public:
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
virtual ~InMemoryEnv() {
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
i->second->Unref();
}
}
// Partial implementation of the Env interface.
virtual Status NewSequentialFile(const std::string& fname,
unique_ptr<SequentialFile>* result,
const EnvOptions& soptions) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
*result = NULL;
return Status::IOError(fname, "File not found");
}
result->reset(new SequentialFileImpl(file_map_[fname]));
return Status::OK();
}
virtual Status NewRandomAccessFile(const std::string& fname,
unique_ptr<RandomAccessFile>* result,
const EnvOptions& soptions) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
*result = NULL;
return Status::IOError(fname, "File not found");
}
result->reset(new RandomAccessFileImpl(file_map_[fname]));
return Status::OK();
}
virtual Status NewWritableFile(const std::string& fname,
unique_ptr<WritableFile>* result,
const EnvOptions& soptions) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) != file_map_.end()) {
DeleteFileInternal(fname);
}
FileState* file = new FileState();
file->Ref();
file_map_[fname] = file;
result->reset(new WritableFileImpl(file));
return Status::OK();
}
virtual bool FileExists(const std::string& fname) {
MutexLock lock(&mutex_);
return file_map_.find(fname) != file_map_.end();
}
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) {
MutexLock lock(&mutex_);
result->clear();
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
const std::string& filename = i->first;
if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
Slice(filename).starts_with(Slice(dir))) {
result->push_back(filename.substr(dir.size() + 1));
}
}
return Status::OK();
}
void DeleteFileInternal(const std::string& fname) {
if (file_map_.find(fname) == file_map_.end()) {
return;
}
file_map_[fname]->Unref();
file_map_.erase(fname);
}
virtual Status DeleteFile(const std::string& fname) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
return Status::IOError(fname, "File not found");
}
DeleteFileInternal(fname);
return Status::OK();
}
virtual Status CreateDir(const std::string& dirname) {
return Status::OK();
}
virtual Status CreateDirIfMissing(const std::string& dirname) {
return Status::OK();
}
virtual Status DeleteDir(const std::string& dirname) {
return Status::OK();
}
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
return Status::IOError(fname, "File not found");
}
*file_size = file_map_[fname]->Size();
return Status::OK();
}
virtual Status GetFileModificationTime(const std::string& fname,
uint64_t* time) {
return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
}
virtual Status RenameFile(const std::string& src,
const std::string& target) {
MutexLock lock(&mutex_);
if (file_map_.find(src) == file_map_.end()) {
return Status::IOError(src, "File not found");
}
DeleteFileInternal(target);
file_map_[target] = file_map_[src];
file_map_.erase(src);
return Status::OK();
}
virtual Status LockFile(const std::string& fname, FileLock** lock) {
*lock = new FileLock;
return Status::OK();
}
virtual Status UnlockFile(FileLock* lock) {
delete lock;
return Status::OK();
}
virtual Status GetTestDirectory(std::string* path) {
*path = "/test";
return Status::OK();
}
private:
// Map from filenames to FileState objects, representing a simple file system.
typedef std::map<std::string, FileState*> FileSystem;
port::Mutex mutex_;
FileSystem file_map_; // Protected by mutex_.
};
} // namespace
Env* NewMemEnv(Env* base_env) {
return new InMemoryEnv(base_env);
}
} // namespace rocksdb

19
helpers/memenv/memenv.h Normal file
View File

@ -0,0 +1,19 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
#define STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
namespace rocksdb {
class Env;
// Returns a new environment that stores its data in memory and delegates
// all non-file-storage tasks to base_env. The caller must delete the result
// when it is no longer needed.
// *base_env must remain live while the result is in use.
Env* NewMemEnv(Env* base_env);
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_

View File

@ -0,0 +1,233 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "helpers/memenv/memenv.h"
#include "db/db_impl.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "util/testharness.h"
#include <memory>
#include <string>
#include <vector>
namespace rocksdb {
class MemEnvTest {
public:
Env* env_;
const EnvOptions soptions_;
MemEnvTest()
: env_(NewMemEnv(Env::Default())) {
}
~MemEnvTest() {
delete env_;
}
};
TEST(MemEnvTest, Basics) {
uint64_t file_size;
unique_ptr<WritableFile> writable_file;
std::vector<std::string> children;
ASSERT_OK(env_->CreateDir("/dir"));
// Check that the directory is empty.
ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
ASSERT_OK(env_->GetChildren("/dir", &children));
ASSERT_EQ(0U, children.size());
// Create a file.
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
writable_file.reset();
// Check that the file exists.
ASSERT_TRUE(env_->FileExists("/dir/f"));
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
ASSERT_EQ(0U, file_size);
ASSERT_OK(env_->GetChildren("/dir", &children));
ASSERT_EQ(1U, children.size());
ASSERT_EQ("f", children[0]);
// Write to the file.
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
ASSERT_OK(writable_file->Append("abc"));
writable_file.reset();
// Check for expected size.
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
ASSERT_EQ(3U, file_size);
// Check that renaming works.
ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
ASSERT_TRUE(!env_->FileExists("/dir/f"));
ASSERT_TRUE(env_->FileExists("/dir/g"));
ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
ASSERT_EQ(3U, file_size);
// Check that opening non-existent file fails.
unique_ptr<SequentialFile> seq_file;
unique_ptr<RandomAccessFile> rand_file;
ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
soptions_).ok());
ASSERT_TRUE(!seq_file);
ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
soptions_).ok());
ASSERT_TRUE(!rand_file);
// Check that deleting works.
ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
ASSERT_OK(env_->DeleteFile("/dir/g"));
ASSERT_TRUE(!env_->FileExists("/dir/g"));
ASSERT_OK(env_->GetChildren("/dir", &children));
ASSERT_EQ(0U, children.size());
ASSERT_OK(env_->DeleteDir("/dir"));
}
TEST(MemEnvTest, ReadWrite) {
unique_ptr<WritableFile> writable_file;
unique_ptr<SequentialFile> seq_file;
unique_ptr<RandomAccessFile> rand_file;
Slice result;
char scratch[100];
ASSERT_OK(env_->CreateDir("/dir"));
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
ASSERT_OK(writable_file->Append("hello "));
ASSERT_OK(writable_file->Append("world"));
writable_file.reset();
// Read sequentially.
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
ASSERT_EQ(0, result.compare("hello"));
ASSERT_OK(seq_file->Skip(1));
ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
ASSERT_EQ(0, result.compare("world"));
ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
ASSERT_EQ(0U, result.size());
ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
ASSERT_OK(seq_file->Read(1000, &result, scratch));
ASSERT_EQ(0U, result.size());
// Random reads.
ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
ASSERT_EQ(0, result.compare("world"));
ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
ASSERT_EQ(0, result.compare("hello"));
ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
ASSERT_EQ(0, result.compare("d"));
// Too high offset.
ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
}
TEST(MemEnvTest, Locks) {
FileLock* lock;
// These are no-ops, but we test they return success.
ASSERT_OK(env_->LockFile("some file", &lock));
ASSERT_OK(env_->UnlockFile(lock));
}
TEST(MemEnvTest, Misc) {
std::string test_dir;
ASSERT_OK(env_->GetTestDirectory(&test_dir));
ASSERT_TRUE(!test_dir.empty());
unique_ptr<WritableFile> writable_file;
ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
// These are no-ops, but we test they return success.
ASSERT_OK(writable_file->Sync());
ASSERT_OK(writable_file->Flush());
ASSERT_OK(writable_file->Close());
writable_file.reset();
}
TEST(MemEnvTest, LargeWrite) {
const size_t kWriteSize = 300 * 1024;
char* scratch = new char[kWriteSize * 2];
std::string write_data;
for (size_t i = 0; i < kWriteSize; ++i) {
write_data.append(1, static_cast<char>(i));
}
unique_ptr<WritableFile> writable_file;
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
ASSERT_OK(writable_file->Append("foo"));
ASSERT_OK(writable_file->Append(write_data));
writable_file.reset();
unique_ptr<SequentialFile> seq_file;
Slice result;
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
ASSERT_EQ(0, result.compare("foo"));
size_t read = 0;
std::string read_data;
while (read < kWriteSize) {
ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
read_data.append(result.data(), result.size());
read += result.size();
}
ASSERT_TRUE(write_data == read_data);
delete [] scratch;
}
TEST(MemEnvTest, DBTest) {
Options options;
options.create_if_missing = true;
options.env = env_;
DB* db;
const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
ASSERT_OK(DB::Open(options, "/dir/db", &db));
for (size_t i = 0; i < 3; ++i) {
ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
}
for (size_t i = 0; i < 3; ++i) {
std::string res;
ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
ASSERT_TRUE(res == vals[i]);
}
Iterator* iterator = db->NewIterator(ReadOptions());
iterator->SeekToFirst();
for (size_t i = 0; i < 3; ++i) {
ASSERT_TRUE(iterator->Valid());
ASSERT_TRUE(keys[i] == iterator->key());
ASSERT_TRUE(vals[i] == iterator->value());
iterator->Next();
}
ASSERT_TRUE(!iterator->Valid());
delete iterator;
DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
ASSERT_OK(dbi->TEST_FlushMemTable());
for (size_t i = 0; i < 3; ++i) {
std::string res;
ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
ASSERT_TRUE(res == vals[i]);
}
delete db;
}
} // namespace rocksdb
int main(int argc, char** argv) {
return rocksdb::test::RunAllTests();
}

41
include/rocksdb/arena.h Normal file
View File

@ -0,0 +1,41 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Arena class defines memory allocation methods. It's used by memtable and
// skiplist.
#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_
#include <limits>
#include <memory>
namespace rocksdb {
class Arena {
public:
Arena() {};
virtual ~Arena() {};
// Return a pointer to a newly allocated memory block of "bytes" bytes.
virtual char* Allocate(size_t bytes) = 0;
// Allocate memory with the normal alignment guarantees provided by malloc.
virtual char* AllocateAligned(size_t bytes) = 0;
// Returns an estimate of the total memory used by arena.
virtual const size_t ApproximateMemoryUsage() = 0;
// Returns the total number of bytes in all blocks allocated so far.
virtual const size_t MemoryAllocatedBytes() = 0;
private:
// No copying allowed
Arena(const Arena&);
void operator=(const Arena&);
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ARENA_H_

281
include/rocksdb/c.h Normal file
View File

@ -0,0 +1,281 @@
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors.
C bindings for leveldb. May be useful as a stable ABI that can be
used by programs that keep leveldb in a shared library, or for
a JNI api.
Does not support:
. getters for the option types
. custom comparators that implement key shortening
. capturing post-write-snapshot
. custom iter, db, env, cache implementations using just the C bindings
Some conventions:
(1) We expose just opaque struct pointers and functions to clients.
This allows us to change internal representations without having to
recompile clients.
(2) For simplicity, there is no equivalent to the Slice type. Instead,
the caller has to pass the pointer and length as separate
arguments.
(3) Errors are represented by a null-terminated c string. NULL
means no error. All operations that can raise an error are passed
a "char** errptr" as the last argument. One of the following must
be true on entry:
*errptr == NULL
*errptr points to a malloc()ed null-terminated error message
On success, a leveldb routine leaves *errptr unchanged.
On failure, leveldb frees the old value of *errptr and
set *errptr to a malloc()ed error message.
(4) Bools have the type unsigned char (0 == false; rest == true)
(5) All of the pointer arguments must be non-NULL.
*/
#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
#define STORAGE_ROCKSDB_INCLUDE_C_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
/* Exported types */
typedef struct leveldb_t leveldb_t;
typedef struct leveldb_cache_t leveldb_cache_t;
typedef struct leveldb_comparator_t leveldb_comparator_t;
typedef struct leveldb_env_t leveldb_env_t;
typedef struct leveldb_filelock_t leveldb_filelock_t;
typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t;
typedef struct leveldb_iterator_t leveldb_iterator_t;
typedef struct leveldb_logger_t leveldb_logger_t;
typedef struct leveldb_options_t leveldb_options_t;
typedef struct leveldb_randomfile_t leveldb_randomfile_t;
typedef struct leveldb_readoptions_t leveldb_readoptions_t;
typedef struct leveldb_seqfile_t leveldb_seqfile_t;
typedef struct leveldb_snapshot_t leveldb_snapshot_t;
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
/* DB operations */
extern leveldb_t* leveldb_open(
const leveldb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_close(leveldb_t* db);
extern void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr);
extern void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
char** errptr);
/* Returns NULL if not found. A malloc()ed array otherwise.
Stores the length of the array in *vallen. */
extern char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr);
extern leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options);
extern const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db);
extern void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot);
/* Returns NULL if property name is unknown.
Else returns a pointer to a malloc()-ed null-terminated value. */
extern char* leveldb_property_value(
leveldb_t* db,
const char* propname);
extern void leveldb_approximate_sizes(
leveldb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes);
extern void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len);
/* Management operations */
extern void leveldb_destroy_db(
const leveldb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_repair_db(
const leveldb_options_t* options,
const char* name,
char** errptr);
/* Iterator */
extern void leveldb_iter_destroy(leveldb_iterator_t*);
extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
extern void leveldb_iter_next(leveldb_iterator_t*);
extern void leveldb_iter_prev(leveldb_iterator_t*);
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
/* Write batch */
extern leveldb_writebatch_t* leveldb_writebatch_create();
extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
extern void leveldb_writebatch_put(
leveldb_writebatch_t*,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void leveldb_writebatch_delete(
leveldb_writebatch_t*,
const char* key, size_t klen);
extern void leveldb_writebatch_iterate(
leveldb_writebatch_t*,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen));
/* Options */
extern leveldb_options_t* leveldb_options_create();
extern void leveldb_options_destroy(leveldb_options_t*);
extern void leveldb_options_set_comparator(
leveldb_options_t*,
leveldb_comparator_t*);
extern void leveldb_options_set_compression_per_level(
leveldb_options_t* opt,
int* level_values,
size_t num_levels);
extern void leveldb_options_set_filter_policy(
leveldb_options_t*,
leveldb_filterpolicy_t*);
extern void leveldb_options_set_create_if_missing(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_error_if_exists(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_paranoid_checks(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
extern void leveldb_options_set_compression_options(
leveldb_options_t* opt, int w_bits, int level, int strategy);
enum {
leveldb_no_compression = 0,
leveldb_snappy_compression = 1
};
extern void leveldb_options_set_compression(leveldb_options_t*, int);
/* Comparator */
extern leveldb_comparator_t* leveldb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*));
extern void leveldb_comparator_destroy(leveldb_comparator_t*);
/* Filter policy */
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*));
extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
int bits_per_key);
/* Read options */
extern leveldb_readoptions_t* leveldb_readoptions_create();
extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
extern void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t*,
unsigned char);
extern void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t*, unsigned char);
extern void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t*,
const leveldb_snapshot_t*);
/* Write options */
extern leveldb_writeoptions_t* leveldb_writeoptions_create();
extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
extern void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t*, unsigned char);
/* Cache */
extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
extern void leveldb_cache_destroy(leveldb_cache_t* cache);
/* Env */
extern leveldb_env_t* leveldb_create_default_env();
extern void leveldb_env_destroy(leveldb_env_t*);
#ifdef __cplusplus
} /* end extern "C" */
#endif
#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */

118
include/rocksdb/cache.h Normal file
View File

@ -0,0 +1,118 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Cache is an interface that maps keys to values. It has internal
// synchronization and may be safely accessed concurrently from
// multiple threads. It may automatically evict entries to make room
// for new entries. Values have a specified charge against the cache
// capacity. For example, a cache where the values are variable
// length strings, may use the length of the string as the charge for
// the string.
//
// A builtin cache implementation with a least-recently-used eviction
// policy is provided. Clients may use their own implementations if
// they want something more sophisticated (like scan-resistance, a
// custom eviction policy, variable cache sizing, etc.)
#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
#include <memory>
#include <stdint.h>
#include "rocksdb/slice.h"
namespace rocksdb {
using std::shared_ptr;
class Cache;
// Create a new cache with a fixed size capacity. The cache is sharded
// to 2^numShardBits shards, by hash of the key. The total capacity
// is divided and evenly assigned to each shard. Inside each shard,
// the eviction is done in two passes: first try to free spaces by
// evicting entries that are among the most least used removeScanCountLimit
// entries and do not have reference other than by the cache itself, in
// the least-used order. If not enough space is freed, further free the
// entries in least used order.
//
// The functions without parameter numShardBits and/or removeScanCountLimit
// use default values. removeScanCountLimit's default value is 0, which
// means a strict LRU order inside each shard.
extern shared_ptr<Cache> NewLRUCache(size_t capacity);
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
int removeScanCountLimit);
class Cache {
public:
Cache() { }
// Destroys all existing entries by calling the "deleter"
// function that was passed to the constructor.
virtual ~Cache();
// Opaque handle to an entry stored in the cache.
struct Handle { };
// Insert a mapping from key->value into the cache and assign it
// the specified charge against the total cache capacity.
//
// Returns a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
//
// When the inserted entry is no longer needed, the key and
// value will be passed to "deleter".
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) = 0;
// If the cache has no mapping for "key", returns nullptr.
//
// Else return a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
virtual Handle* Lookup(const Slice& key) = 0;
// Release a mapping returned by a previous Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void Release(Handle* handle) = 0;
// Return the value encapsulated in a handle returned by a
// successful Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void* Value(Handle* handle) = 0;
// If the cache contains entry for key, erase it. Note that the
// underlying entry will be kept around until all existing handles
// to it have been released.
virtual void Erase(const Slice& key) = 0;
// Return a new numeric id. May be used by multiple clients who are
// sharing the same cache to partition the key space. Typically the
// client will allocate a new id at startup and prepend the id to
// its cache keys.
virtual uint64_t NewId() = 0;
// returns the maximum configured capacity of the cache
virtual size_t GetCapacity() = 0;
private:
void LRU_Remove(Handle* e);
void LRU_Append(Handle* e);
void Unref(Handle* e);
struct Rep;
Rep* rep_;
// No copying allowed
Cache(const Cache&);
void operator=(const Cache&);
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_UTIL_CACHE_H_

View File

@ -0,0 +1,79 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#include <string>
namespace rocksdb {
class Slice;
// CompactionFilter allows an application to modify/delete a key-value at
// the time of compaction.
class CompactionFilter {
public:
// Context information of a compaction run
struct Context {
// Does this compaction run include all data files
bool is_full_compaction;
};
virtual ~CompactionFilter() {}
// The compaction process invokes this
// method for kv that is being compacted. A return value
// of false indicates that the kv should be preserved in the
// output of this compaction run and a return value of true
// indicates that this key-value should be removed from the
// output of the compaction. The application can inspect
// the existing value of the key and make decision based on it.
//
// When the value is to be preserved, the application has the option
// to modify the existing_value and pass it back through new_value.
// value_changed needs to be set to true in this case.
virtual bool Filter(int level,
const Slice& key,
const Slice& existing_value,
std::string* new_value,
bool* value_changed) const = 0;
// Returns a name that identifies this compaction filter.
// The name will be printed to LOG file on start up for diagnosis.
virtual const char* Name() const = 0;
};
// Each compaction will create a new CompactionFilter allowing the
// application to know about different campactions
class CompactionFilterFactory {
public:
virtual ~CompactionFilterFactory() { };
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& context) = 0;
// Returns a name that identifies this compaction filter factory.
virtual const char* Name() const = 0;
};
// Default implementaion of CompactionFilterFactory which does not
// return any filter
class DefaultCompactionFilterFactory : public CompactionFilterFactory {
public:
virtual std::unique_ptr<CompactionFilter>
CreateCompactionFilter(const CompactionFilter::Context& context) override {
return std::unique_ptr<CompactionFilter>(nullptr);
}
virtual const char* Name() const override {
return "DefaultCompactionFilterFactory";
}
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_

View File

@ -0,0 +1,63 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
#include <string>
namespace rocksdb {
class Slice;
// A Comparator object provides a total order across slices that are
// used as keys in an sstable or a database. A Comparator implementation
// must be thread-safe since rocksdb may invoke its methods concurrently
// from multiple threads.
class Comparator {
public:
virtual ~Comparator();
// Three-way comparison. Returns value:
// < 0 iff "a" < "b",
// == 0 iff "a" == "b",
// > 0 iff "a" > "b"
virtual int Compare(const Slice& a, const Slice& b) const = 0;
// The name of the comparator. Used to check for comparator
// mismatches (i.e., a DB created with one comparator is
// accessed using a different comparator.
//
// The client of this package should switch to a new name whenever
// the comparator implementation changes in a way that will cause
// the relative ordering of any two keys to change.
//
// Names starting with "rocksdb." are reserved and should not be used
// by any clients of this package.
virtual const char* Name() const = 0;
// Advanced functions: these are used to reduce the space requirements
// for internal data structures like index blocks.
// If *start < limit, changes *start to a short string in [start,limit).
// Simple comparator implementations may return with *start unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const = 0;
// Changes *key to a short string >= *key.
// Simple comparator implementations may return with *key unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortSuccessor(std::string* key) const = 0;
};
// Return a builtin comparator that uses lexicographic byte-wise
// ordering. The result remains the property of this module and
// must not be deleted.
extern const Comparator* BytewiseComparator();
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_

303
include/rocksdb/db.h Normal file
View File

@ -0,0 +1,303 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
#define STORAGE_ROCKSDB_INCLUDE_DB_H_
#include <stdint.h>
#include <stdio.h>
#include <memory>
#include <vector>
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "rocksdb/transaction_log.h"
namespace rocksdb {
using std::unique_ptr;
// Update Makefile if you change these
static const int kMajorVersion = 2;
static const int kMinorVersion = 0;
struct Options;
struct ReadOptions;
struct WriteOptions;
struct FlushOptions;
class WriteBatch;
// Metadata associated with each SST file.
struct LiveFileMetaData {
std::string name; // Name of the file
int level; // Level at which this file resides.
size_t size; // File size in bytes.
std::string smallestkey; // Smallest user defined key in the file.
std::string largestkey; // Largest user defined key in the file.
SequenceNumber smallest_seqno; // smallest seqno in file
SequenceNumber largest_seqno; // largest seqno in file
};
// Abstract handle to particular state of a DB.
// A Snapshot is an immutable object and can therefore be safely
// accessed from multiple threads without any external synchronization.
class Snapshot {
protected:
virtual ~Snapshot();
};
// A range of keys
struct Range {
Slice start; // Included in the range
Slice limit; // Not included in the range
Range() { }
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};
// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB {
public:
// Open the database with the specified "name".
// Stores a pointer to a heap-allocated database in *dbptr and returns
// OK on success.
// Stores nullptr in *dbptr and returns a non-OK status on error.
// Caller should delete *dbptr when it is no longer needed.
static Status Open(const Options& options,
const std::string& name,
DB** dbptr);
// Open the database for read only. All DB interfaces
// that modify data, like put/delete, will return error.
// If the db is opened in read only mode, then no compactions
// will happen.
static Status OpenForReadOnly(const Options& options,
const std::string& name, DB** dbptr,
bool error_if_log_file_exist = false);
DB() { }
virtual ~DB();
// Set the database entry for "key" to "value".
// Returns OK on success, and a non-OK status on error.
// Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options,
const Slice& key,
const Slice& value) = 0;
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
// Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
// Merge the database entry for "key" with "value". Returns OK on success,
// and a non-OK status on error. The semantics of this operation is
// determined by the user provided merge_operator when opening DB.
// Note: consider setting options.sync = true.
virtual Status Merge(const WriteOptions& options,
const Slice& key,
const Slice& value) = 0;
// Apply the specified updates to the database.
// Returns OK on success, non-OK on failure.
// Note: consider setting options.sync = true.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
// corresponding value in *value and return OK.
//
// If there is no entry for "key" leave *value unchanged and return
// a status for which Status::IsNotFound() returns true.
//
// May return some other Status on an error.
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value) = 0;
// If keys[i] does not exist in the database, then the i'th returned
// status will be one for which Status::IsNotFound() is true, and
// (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
// the i'th returned status will have Status::ok() true, and (*values)[i]
// will store the value associated with keys[i].
//
// (*values) will always be resized to be the same size as (keys).
// Similarly, the number of returned statuses will be the number of keys.
// Note: keys will not be "de-duplicated". Duplicate keys will return
// duplicate values in order.
virtual std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values) = 0;
// If the key definitely does not exist in the database, then this method
// returns false, else true. If the caller wants to obtain value when the key
// is found in memory, a bool for 'value_found' must be passed. 'value_found'
// will be true on return if value has been set properly.
// This check is potentially lighter-weight than invoking DB::Get(). One way
// to make this lighter weight is to avoid doing any IOs.
// Default implementation here returns true and sets 'value_found' to false
virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key,
std::string* value,
bool* value_found = nullptr) {
if (value_found != nullptr) {
*value_found = false;
}
return true;
}
// Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options) = 0;
// Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB
// state. The caller must call ReleaseSnapshot(result) when the
// snapshot is no longer needed.
virtual const Snapshot* GetSnapshot() = 0;
// Release a previously acquired snapshot. The caller must not
// use "snapshot" after this call.
virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
// DB implementations can export properties about their state
// via this method. If "property" is a valid property understood by this
// DB implementation, fills "*value" with its current value and returns
// true. Otherwise returns false.
//
//
// Valid property names include:
//
// "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
// where <N> is an ASCII representation of a level number (e.g. "0").
// "rocksdb.stats" - returns a multi-line string that describes statistics
// about the internal operation of the DB.
// "rocksdb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents.
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
// For each i in [0,n-1], store in "sizes[i]", the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)".
//
// Note that the returned sizes measure file system space usage, so
// if the user data compresses by a factor of ten, the returned
// sizes will be one-tenth the size of the corresponding user data size.
//
// The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(const Range* range, int n,
uint64_t* sizes) = 0;
// Compact the underlying storage for the key range [*begin,*end].
// In particular, deleted and overwritten versions are discarded,
// and the data is rearranged to reduce the cost of operations
// needed to access the data. This operation should typically only
// be invoked by users who understand the underlying implementation.
//
// begin==nullptr is treated as a key before all keys in the database.
// end==nullptr is treated as a key after all keys in the database.
// Therefore the following call will compact the entire database:
// db->CompactRange(nullptr, nullptr);
// Note that after the entire database is compacted, all data are pushed
// down to the last level containing any data. If the total data size
// after compaction is reduced, that level might not be appropriate for
// hosting all the files. In this case, client could set reduce_level
// to true, to move the files back to the minimum level capable of holding
// the data set or a given level (specified by non-negative target_level).
virtual void CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false,
int target_level = -1) = 0;
// Number of levels used for this DB.
virtual int NumberLevels() = 0;
// Maximum level to which a new compacted memtable is pushed if it
// does not create overlap.
virtual int MaxMemCompactionLevel() = 0;
// Number of files in level-0 that would stop writes.
virtual int Level0StopWriteTrigger() = 0;
// Flush all mem-table data.
virtual Status Flush(const FlushOptions& options) = 0;
// Prevent file deletions. Compactions will continue to occur,
// but no obsolete files will be deleted. Calling this multiple
// times have the same effect as calling it once.
virtual Status DisableFileDeletions() = 0;
// Allow compactions to delete obselete files.
virtual Status EnableFileDeletions() = 0;
// GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
// THIS METHOD IS DEPRECATED. Use the GetTableMetaData to get more
// detailed information on the live files.
// Retrieve the list of all files in the database. The files are
// relative to the dbname and are not absolute paths. The valid size of the
// manifest file is returned in manifest_file_size. The manifest file is an
// ever growing file, but only the portion specified by manifest_file_size is
// valid for this snapshot.
// Setting flush_memtable to true does Flush before recording the live files.
// Setting flush_memtable to false is useful when we don't want to wait for
// flush which may have to wait for compaction to complete taking an
// indeterminate time. But this will have to use GetSortedWalFiles after
// GetLiveFiles to compensate for memtables missed in this snapshot due to the
// absence of Flush, by WAL files to recover the database consistently later
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) = 0;
// Retrieve the sorted list of all wal files with earliest file first
virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
// The sequence number of the most recent transaction.
virtual SequenceNumber GetLatestSequenceNumber() const = 0;
// Sets iter to an iterator that is positioned at a write-batch containing
// seq_number. If the sequence number is non existent, it returns an iterator
// at the first available seq_no after the requested seq_no
// Returns Status::Ok if iterator is valid
// Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
// use this api, else the WAL files will get
// cleared aggressively and the iterator might keep getting invalid before
// an update is read.
virtual Status GetUpdatesSince(SequenceNumber seq_number,
unique_ptr<TransactionLogIterator>* iter) = 0;
// Delete the file name from the db directory and update the internal state to
// reflect that. Supports deletion of sst and log files only. 'name' must be
// path relative to the db directory. eg. 000001.sst, /archive/000003.log
virtual Status DeleteFile(std::string name) = 0;
// Returns a list of all table files with their level, start key
// and end key
virtual void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata) {
}
private:
// No copying allowed
DB(const DB&);
void operator=(const DB&);
};
// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options);
// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_

646
include/rocksdb/env.h Normal file
View File

@ -0,0 +1,646 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the rocksdb implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine gain control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
#include <cstdarg>
#include <string>
#include <memory>
#include <vector>
#include <stdint.h>
#include "rocksdb/status.h"
namespace rocksdb {
class FileLock;
class Logger;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class RandomRWFile;
struct Options;
using std::unique_ptr;
using std::shared_ptr;
// Options while opening a file to read/write
struct EnvOptions {
// construct with default Options
EnvOptions();
// construct from Options
explicit EnvOptions(const Options& options);
// If true, then allow caching of data in environment buffers
bool use_os_buffer;
// If true, then use mmap to read data
bool use_mmap_reads;
// If true, then use mmap to write data
bool use_mmap_writes;
// If true, set the FD_CLOEXEC on open fd.
bool set_fd_cloexec;
// Allows OS to incrementally sync files to disk while they are being
// written, in the background. Issue one request for every bytes_per_sync
// written. 0 turns it off.
// Default: 0
uint64_t bytes_per_sync;
};
class Env {
public:
Env() { }
virtual ~Env();
// Return a default environment suitable for the current operating
// system. Sophisticated users may wish to provide their own Env
// implementation instead of relying on this default environment.
//
// The result of Default() belongs to rocksdb and must never be deleted.
static Env* Default();
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores nullptr in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewSequentialFile(const std::string& fname,
unique_ptr<SequentialFile>* result,
const EnvOptions& options)
= 0;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
virtual Status NewRandomAccessFile(const std::string& fname,
unique_ptr<RandomAccessFile>* result,
const EnvOptions& options)
= 0;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewWritableFile(const std::string& fname,
unique_ptr<WritableFile>* result,
const EnvOptions& options) = 0;
// Create an object that both reads and writes to a file on
// specified offsets (random access). If file already exists,
// does not overwrite it. On success, stores a pointer to the
// new file in *result and returns OK. On failure stores nullptr
// in *result and returns non-OK.
virtual Status NewRandomRWFile(const std::string& fname,
unique_ptr<RandomRWFile>* result,
const EnvOptions& options) = 0;
// Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *results are dropped.
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) = 0;
// Delete the named file.
virtual Status DeleteFile(const std::string& fname) = 0;
// Create the specified directory. Returns error if directory exists.
virtual Status CreateDir(const std::string& dirname) = 0;
// Creates directory if missing. Return Ok if it exists, or successful in
// Creating.
virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
// Delete the specified directory.
virtual Status DeleteDir(const std::string& dirname) = 0;
// Store the size of fname in *file_size.
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
// Store the last modification time of fname in *file_mtime.
virtual Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime) = 0;
// Rename file src to target.
virtual Status RenameFile(const std::string& src,
const std::string& target) = 0;
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores nullptr in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
virtual Status UnlockFile(FileLock* lock) = 0;
enum Priority { LOW, HIGH, TOTAL };
// Arrange to run "(*function)(arg)" once in a background thread, in
// the thread pool specified by pri. By default, jobs go to the 'LOW'
// priority thread pool.
// "function" may run in an unspecified thread. Multiple functions
// added to the same Env may run concurrently in different threads.
// I.e., the caller may not assume that background work items are
// serialized.
virtual void Schedule(
void (*function)(void* arg),
void* arg,
Priority pri = LOW) = 0;
// Start a new thread, invoking "function(arg)" within the new thread.
// When "function(arg)" returns, the thread will be destroyed.
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
// *path is set to a temporary directory that can be used for testing. It may
// or many not have just been created. The directory may or may not differ
// between runs of the same process, but subsequent calls will return the
// same directory.
virtual Status GetTestDirectory(std::string* path) = 0;
// Create and return a log file for storing informational messages.
virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result) = 0;
// Returns the number of micro-seconds since some fixed point in time. Only
// useful for computing deltas of time.
virtual uint64_t NowMicros() = 0;
// Returns the number of nano-seconds since some fixed point in time. Only
// useful for computing deltas of time in one run.
// Default implementation simply relies on NowMicros
virtual uint64_t NowNanos() {
return NowMicros() * 1000;
}
// Sleep/delay the thread for the perscribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0;
// Get the current host name.
virtual Status GetHostName(char* name, uint64_t len) = 0;
// Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
virtual Status GetCurrentTime(int64_t* unix_time) = 0;
// Get full directory name for this db.
virtual Status GetAbsolutePath(const std::string& db_path,
std::string* output_path) = 0;
// The number of background worker threads of a specific thread pool
// for this environment. 'LOW' is the default pool.
// default number: 1
virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
// Converts seconds-since-Jan-01-1970 to a printable string
virtual std::string TimeToString(uint64_t time) = 0;
// Generates a unique id that can be used to identify a db
virtual std::string GenerateUniqueId();
private:
// No copying allowed
Env(const Env&);
void operator=(const Env&);
};
// A file abstraction for reading sequentially through a file
class SequentialFile {
public:
SequentialFile() { }
virtual ~SequentialFile();
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// May set "*result" to point at data in "scratch[0..n-1]", so
// "scratch[0..n-1]" must be live when "*result" is used.
// If an error was encountered, returns a non-OK status.
//
// REQUIRES: External synchronization
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
// Skip "n" bytes from the file. This is guaranteed to be no
// slower that reading the same data, but may be faster.
//
// If end of file is reached, skipping will stop at the end of the
// file, and Skip will return OK.
//
// REQUIRES: External synchronization
virtual Status Skip(uint64_t n) = 0;
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
};
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
public:
RandomAccessFile() { }
virtual ~RandomAccessFile();
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
// Tries to get an unique ID for this file that will be the same each time
// the file is opened (and will stay the same while the file is open).
// Furthermore, it tries to make this ID at most "max_size" bytes. If such an
// ID can be created this function returns the length of the ID and places it
// in "id"; otherwise, this function returns 0, in which case "id"
// may not have been modified.
//
// This function guarantees, for IDs from a given environment, two unique ids
// cannot be made equal to eachother by adding arbitrary bytes to one of
// them. That is, no unique ID is the prefix of another.
//
// This function guarantees that the returned ID will not be interpretable as
// a single varint.
//
// Note: these IDs are only valid for the duration of the process.
virtual size_t GetUniqueId(char* id, size_t max_size) const {
return 0; // Default implementation to prevent issues with backwards
// compatibility.
};
enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
virtual void Hint(AccessPattern pattern) {}
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
};
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
public:
WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
}
virtual ~WritableFile();
virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0;
virtual Status Flush() = 0;
virtual Status Sync() = 0; // sync data
/*
* Sync data and/or metadata as well.
* By default, sync only data.
* Override this method for environments where we need to sync
* metadata as well.
*/
virtual Status Fsync() {
return Sync();
}
/*
* Get the size of valid data in the file.
*/
virtual uint64_t GetFileSize() {
return 0;
}
/*
* Get and set the default pre-allocation block size for writes to
* this file. If non-zero, then Allocate will be used to extend the
* underlying storage of a file (generally via fallocate) if the Env
* instance supports it.
*/
void SetPreallocationBlockSize(size_t size) {
preallocation_block_size_ = size;
}
virtual void GetPreallocationStatus(size_t* block_size,
size_t* last_allocated_block) {
*last_allocated_block = last_preallocated_block_;
*block_size = preallocation_block_size_;
}
// For documentation, refer to RandomAccessFile::GetUniqueId()
virtual size_t GetUniqueId(char* id, size_t max_size) const {
return 0; // Default implementation to prevent issues with backwards
}
// Remove any kind of caching of data from the offset to offset+length
// of this file. If the length is 0, then it refers to the end of file.
// If the system is not caching the file contents, then this is a noop.
// This call has no effect on dirty pages in the cache.
virtual Status InvalidateCache(size_t offset, size_t length) {
return Status::NotSupported("InvalidateCache not supported.");
}
protected:
// PrepareWrite performs any necessary preparation for a write
// before the write actually occurs. This allows for pre-allocation
// of space on devices where it can result in less file
// fragmentation and/or less waste from over-zealous filesystem
// pre-allocation.
void PrepareWrite(size_t offset, size_t len) {
if (preallocation_block_size_ == 0) {
return;
}
// If this write would cross one or more preallocation blocks,
// determine what the last preallocation block necesessary to
// cover this write would be and Allocate to that point.
const auto block_size = preallocation_block_size_;
size_t new_last_preallocated_block =
(offset + len + block_size - 1) / block_size;
if (new_last_preallocated_block > last_preallocated_block_) {
size_t num_spanned_blocks =
new_last_preallocated_block - last_preallocated_block_;
Allocate(block_size * last_preallocated_block_,
block_size * num_spanned_blocks);
last_preallocated_block_ = new_last_preallocated_block;
}
}
/*
* Pre-allocate space for a file.
*/
virtual Status Allocate(off_t offset, off_t len) {
return Status::OK();
}
// Sync a file range with disk.
// offset is the starting byte of the file range to be synchronized.
// nbytes specifies the length of the range to be synchronized.
// This asks the OS to initiate flushing the cached data to disk,
// without waiting for completion.
// Default implementation does nothing.
virtual Status RangeSync(off_t offset, off_t nbytes) {
return Status::OK();
}
private:
size_t last_preallocated_block_;
size_t preallocation_block_size_;
// No copying allowed
WritableFile(const WritableFile&);
void operator=(const WritableFile&);
};
// A file abstraction for random reading and writing.
class RandomRWFile {
public:
RandomRWFile() {}
virtual ~RandomRWFile() {}
// Write data from Slice data to file starting from offset
// Returns IOError on failure, but does not guarantee
// atomicity of a write. Returns OK status on success.
//
// Safe for concurrent use.
virtual Status Write(uint64_t offset, const Slice& data) = 0;
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
virtual Status Close() = 0; // closes the file
virtual Status Sync() = 0; // sync data
/*
* Sync data and/or metadata as well.
* By default, sync only data.
* Override this method for environments where we need to sync
* metadata as well.
*/
virtual Status Fsync() {
return Sync();
}
/*
* Pre-allocate space for a file.
*/
virtual Status Allocate(off_t offset, off_t len) {
return Status::OK();
}
private:
// No copying allowed
RandomRWFile(const RandomRWFile&);
void operator=(const RandomRWFile&);
};
// An interface for writing log messages.
class Logger {
public:
enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
Logger() { }
virtual ~Logger();
// Write an entry to the log file with the specified format.
virtual void Logv(const char* format, va_list ap) = 0;
virtual size_t GetLogFileSize() const {
return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
}
// Flush to the OS buffers
virtual void Flush() {}
private:
// No copying allowed
Logger(const Logger&);
void operator=(const Logger&);
};
// Identifies a locked file.
class FileLock {
public:
FileLock() { }
virtual ~FileLock();
private:
// No copying allowed
FileLock(const FileLock&);
void operator=(const FileLock&);
};
extern void LogFlush(const shared_ptr<Logger>& info_log);
// Log the specified data to *info_log if info_log is non-nullptr.
extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
extern void LogFlush(Logger *info_log);
extern void Log(Logger* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
const std::string& fname);
// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
std::string* data);
// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
public:
// Initialize an EnvWrapper that delegates all calls to *t
explicit EnvWrapper(Env* t) : target_(t) { }
virtual ~EnvWrapper();
// Return the target to which this Env forwards all calls
Env* target() const { return target_; }
// The following text is boilerplate that forwards all methods to target()
Status NewSequentialFile(const std::string& f,
unique_ptr<SequentialFile>* r,
const EnvOptions& options) {
return target_->NewSequentialFile(f, r, options);
}
Status NewRandomAccessFile(const std::string& f,
unique_ptr<RandomAccessFile>* r,
const EnvOptions& options) {
return target_->NewRandomAccessFile(f, r, options);
}
Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
const EnvOptions& options) {
return target_->NewWritableFile(f, r, options);
}
Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
const EnvOptions& options) {
return target_->NewRandomRWFile(f, r, options);
}
bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
return target_->GetChildren(dir, r);
}
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
Status CreateDirIfMissing(const std::string& d) {
return target_->CreateDirIfMissing(d);
}
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
Status GetFileSize(const std::string& f, uint64_t* s) {
return target_->GetFileSize(f, s);
}
Status GetFileModificationTime(const std::string& fname,
uint64_t* file_mtime) {
return target_->GetFileModificationTime(fname, file_mtime);
}
Status RenameFile(const std::string& s, const std::string& t) {
return target_->RenameFile(s, t);
}
Status LockFile(const std::string& f, FileLock** l) {
return target_->LockFile(f, l);
}
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
void Schedule(void (*f)(void*), void* a, Priority pri) {
return target_->Schedule(f, a, pri);
}
void StartThread(void (*f)(void*), void* a) {
return target_->StartThread(f, a);
}
virtual Status GetTestDirectory(std::string* path) {
return target_->GetTestDirectory(path);
}
virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result) {
return target_->NewLogger(fname, result);
}
uint64_t NowMicros() {
return target_->NowMicros();
}
void SleepForMicroseconds(int micros) {
target_->SleepForMicroseconds(micros);
}
Status GetHostName(char* name, uint64_t len) {
return target_->GetHostName(name, len);
}
Status GetCurrentTime(int64_t* unix_time) {
return target_->GetCurrentTime(unix_time);
}
Status GetAbsolutePath(const std::string& db_path,
std::string* output_path) {
return target_->GetAbsolutePath(db_path, output_path);
}
void SetBackgroundThreads(int num, Priority pri) {
return target_->SetBackgroundThreads(num, pri);
}
std::string TimeToString(uint64_t time) {
return target_->TimeToString(time);
}
private:
Env* target_;
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_

View File

@ -0,0 +1,70 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A database can be configured with a custom FilterPolicy object.
// This object is responsible for creating a small filter from a set
// of keys. These filters are stored in rocksdb and are consulted
// automatically by rocksdb to decide whether or not to read some
// information from disk. In many cases, a filter can cut down the
// number of disk seeks form a handful to a single disk seek per
// DB::Get() call.
//
// Most people will want to use the builtin bloom filter support (see
// NewBloomFilterPolicy() below).
#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#include <string>
namespace rocksdb {
class Slice;
class FilterPolicy {
public:
virtual ~FilterPolicy();
// Return the name of this policy. Note that if the filter encoding
// changes in an incompatible way, the name returned by this method
// must be changed. Otherwise, old incompatible filters may be
// passed to methods of this type.
virtual const char* Name() const = 0;
// keys[0,n-1] contains a list of keys (potentially with duplicates)
// that are ordered according to the user supplied comparator.
// Append a filter that summarizes keys[0,n-1] to *dst.
//
// Warning: do not change the initial contents of *dst. Instead,
// append the newly constructed filter to *dst.
virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
const = 0;
// "filter" contains the data appended by a preceding call to
// CreateFilter() on this class. This method must return true if
// the key was in the list of keys passed to CreateFilter().
// This method may return true or false if the key was not on the
// list, but it should aim to return false with a high probability.
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
};
// Return a new filter policy that uses a bloom filter with approximately
// the specified number of bits per key. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate.
//
// Callers must delete the result after any database that is using the
// result has been closed.
//
// Note: if you are using a custom comparator that ignores some parts
// of the keys being compared, you must not use NewBloomFilterPolicy()
// and must provide your own FilterPolicy that also ignores the
// corresponding parts of the keys. For example, if the comparator
// ignores trailing spaces, it would be incorrect to use a
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
}
#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_

View File

@ -0,0 +1,64 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <string>
namespace rocksdb {
class Slice;
class BlockBuilder;
// FlushBlockPolicy provides a configurable way to determine when to flush a
// block in the block based tables,
class FlushBlockPolicy {
public:
// Keep track of the key/value sequences and return the boolean value to
// determine if table builder should flush current data block.
virtual bool Update(const Slice& key,
const Slice& value) = 0;
virtual ~FlushBlockPolicy() { }
};
class FlushBlockPolicyFactory {
public:
// Return the name of the flush block policy.
virtual const char* Name() const = 0;
// Return a new block flush policy that flushes data blocks by data size.
// FlushBlockPolicy may need to access the metadata of the data block
// builder to determine when to flush the blocks.
//
// Callers must delete the result after any database that is using the
// result has been closed.
virtual FlushBlockPolicy* NewFlushBlockPolicy(
const BlockBuilder& data_block_builder) const = 0;
virtual ~FlushBlockPolicyFactory() { }
};
class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
public:
FlushBlockBySizePolicyFactory(const uint64_t block_size,
const uint64_t block_size_deviation) :
block_size_(block_size),
block_size_deviation_(block_size_deviation) {
}
virtual const char* Name() const override {
return "FlushBlockBySizePolicyFactory";
}
virtual FlushBlockPolicy* NewFlushBlockPolicy(
const BlockBuilder& data_block_builder) const override;
private:
const uint64_t block_size_;
const uint64_t block_size_deviation_;
};
} // rocksdb

102
include/rocksdb/iterator.h Normal file
View File

@ -0,0 +1,102 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An iterator yields a sequence of key/value pairs from a source.
// The following class defines the interface. Multiple implementations
// are provided by this library. In particular, iterators are provided
// to access the contents of a Table or a DB.
//
// Multiple threads can invoke const methods on an Iterator without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Iterator must use
// external synchronization.
#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
namespace rocksdb {
class Iterator {
public:
Iterator();
virtual ~Iterator();
// An iterator is either positioned at a key/value pair, or
// not valid. This method returns true iff the iterator is valid.
virtual bool Valid() const = 0;
// Position at the first key in the source. The iterator is Valid()
// after this call iff the source is not empty.
virtual void SeekToFirst() = 0;
// Position at the last key in the source. The iterator is
// Valid() after this call iff the source is not empty.
virtual void SeekToLast() = 0;
// Position at the first key in the source that at or past target
// The iterator is Valid() after this call iff the source contains
// an entry that comes at or past target.
virtual void Seek(const Slice& target) = 0;
// Moves to the next entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the last entry in the source.
// REQUIRES: Valid()
virtual void Next() = 0;
// Moves to the previous entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the first entry in source.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Return the key for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: Valid()
virtual Slice key() const = 0;
// Return the value for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: !AtEnd() && !AtStart()
virtual Slice value() const = 0;
// If an error has occurred, return it. Else return an ok status.
// If non-blocking IO is requested and this operation cannot be
// satisfied without doing some IO, then this returns Status::Incomplete().
virtual Status status() const = 0;
// Clients are allowed to register function/arg1/arg2 triples that
// will be invoked when this iterator is destroyed.
//
// Note that unlike all of the preceding methods, this method is
// not abstract and therefore clients should not override it.
typedef void (*CleanupFunction)(void* arg1, void* arg2);
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
private:
struct Cleanup {
CleanupFunction function;
void* arg1;
void* arg2;
Cleanup* next;
};
Cleanup cleanup_;
// No copying allowed
Iterator(const Iterator&);
void operator=(const Iterator&);
};
// Return an empty iterator (yields nothing).
extern Iterator* NewEmptyIterator();
// Return an empty iterator with the specified status.
extern Iterator* NewErrorIterator(const Status& status);
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_

View File

@ -0,0 +1,15 @@
// Copyright 2008-present Facebook. All Rights Reserved.
#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
#include "rocksdb/options.h"
namespace rocksdb {
class LDBTool {
public:
void Run(int argc, char** argv, Options = Options());
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H

Some files were not shown because too many files have changed in this diff Show More