Compare commits
46 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
6ea4477cf2 | ||
|
c1f8ed8a60 | ||
|
a5dd5b6ac0 | ||
|
6f35cb93c4 | ||
|
c15650710a | ||
|
dd0efa4ee9 | ||
|
aead404172 | ||
|
a0cdc3cecc | ||
|
7513f63505 | ||
|
9e47084ce2 | ||
|
36074ba5de | ||
|
aa00523e0e | ||
|
cf2b982375 | ||
|
e8c9350f26 | ||
|
4907d2463b | ||
|
5d928c795a | ||
|
725bb9d665 | ||
|
b7367fe844 | ||
|
13b2a9b6ff | ||
|
5dc70a15ca | ||
|
9019e91254 | ||
|
7f1815c379 | ||
|
2584a18efb | ||
|
17f67b5462 | ||
|
6fb56c582c | ||
|
f90ced92f5 | ||
|
632f36dcd3 | ||
|
11bacd5787 | ||
|
f98efcb1e3 | ||
|
c1e99eddc8 | ||
|
ffc3c62ca2 | ||
|
9e82540901 | ||
|
d66bb21e18 | ||
|
05d5c575ac | ||
|
2b8893b9e4 | ||
|
419b93c56f | ||
|
8afb0036ca | ||
|
dded348dda | ||
|
3747361235 | ||
|
8cff6e9456 | ||
|
c293472908 | ||
|
eae53de3b5 | ||
|
65aec19df1 | ||
|
30b38c98cf | ||
|
2879f4bebd | ||
|
88595c882a |
@ -61,7 +61,7 @@ script:
|
||||
- if [ "${TEST_GROUP}" == '1' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=comparator_db_test make -j4 check_some; fi
|
||||
- if [ "${TEST_GROUP}" == '2' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=comparator_db_test make -j4 check_some; fi
|
||||
- if [ "${JOB_NAME}" == 'java_test' ]; then OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest; fi
|
||||
- if [ "${JOB_NAME}" == 'lite_build' ]; then OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 static_lib; fi
|
||||
- if [ "${JOB_NAME}" == 'lite_build' ]; then OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 static_lib tools; fi
|
||||
- if [ "${JOB_NAME}" == 'examples' ]; then OPT=-DTRAVIS V=1 make -j4 static_lib; cd examples; make -j4; fi
|
||||
- if [ "${JOB_NAME}" == 'cmake' ]; then mkdir build && cd build && cmake .. && make -j4 rocksdb; fi
|
||||
- if [ "${JOB_NAME}" == 'cmake-mingw' ]; then mkdir build && cd build && cmake .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb; fi
|
||||
|
@ -41,10 +41,10 @@ endif()
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules/")
|
||||
|
||||
option(WITH_JEMALLOC "build with JeMalloc" OFF)
|
||||
if(MSVC)
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc)
|
||||
else()
|
||||
option(WITH_JEMALLOC "build with JeMalloc" OFF)
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
||||
# FreeBSD has jemalloc as default malloc
|
||||
# but it does not have all the jemalloc files in include/...
|
||||
@ -574,6 +574,12 @@ if(WIN32)
|
||||
port/win/win_logger.cc
|
||||
port/win/win_thread.cc
|
||||
port/win/xpress_win.cc)
|
||||
|
||||
if(WITH_JEMALLOC)
|
||||
list(APPEND SOURCES
|
||||
port/win/win_jemalloc.cc)
|
||||
endif()
|
||||
|
||||
else()
|
||||
list(APPEND SOURCES
|
||||
port/port_posix.cc
|
||||
@ -713,6 +719,7 @@ if(WITH_TESTS)
|
||||
db/corruption_test.cc
|
||||
db/cuckoo_table_db_test.cc
|
||||
db/db_basic_test.cc
|
||||
db/db_blob_index_test.cc
|
||||
db/db_block_cache_test.cc
|
||||
db/db_bloom_filter_test.cc
|
||||
db/db_compaction_filter_test.cc
|
||||
@ -778,6 +785,7 @@ if(WITH_TESTS)
|
||||
options/options_test.cc
|
||||
table/block_based_filter_block_test.cc
|
||||
table/block_test.cc
|
||||
table/cleanable_test.cc
|
||||
table/cuckoo_table_builder_test.cc
|
||||
table/cuckoo_table_reader_test.cc
|
||||
table/full_filter_block_test.cc
|
||||
|
17
HISTORY.md
17
HISTORY.md
@ -1,8 +1,19 @@
|
||||
# Rocksdb Change Log
|
||||
## Unreleased
|
||||
### Public API Change
|
||||
### New Features
|
||||
## 5.8.8 (12/6/2017)
|
||||
### Bug Fixes
|
||||
* Fix possible corruption to LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker.
|
||||
|
||||
## 5.8.7 (11/28/2017)
|
||||
### Bug Fixes
|
||||
* Fix a bug where an IOError on WAL write does not propagate to the write group follower.
|
||||
|
||||
## 5.8.6 (11/20/2017)
|
||||
### Bug Fixes
|
||||
* Fixed aligned_alloc issues with Windows.
|
||||
|
||||
## 5.8.1 (10/23/2017)
|
||||
### New Features
|
||||
* Add a new db property "rocksdb.estimate-oldest-key-time" to return oldest data timestamp. The property is available only for FIFO compaction with compaction_options_fifo.allow_compaction = false.
|
||||
|
||||
## 5.8.0 (08/30/2017)
|
||||
### Public API Change
|
||||
|
4
Makefile
4
Makefile
@ -360,6 +360,7 @@ TESTS = \
|
||||
db_wal_test \
|
||||
db_block_cache_test \
|
||||
db_test \
|
||||
db_blob_index_test \
|
||||
db_bloom_filter_test \
|
||||
db_iter_test \
|
||||
db_log_iter_test \
|
||||
@ -1063,6 +1064,9 @@ db_test: db/db_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
db_test2: db/db_test2.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(AM_LINK)
|
||||
|
||||
db_blob_index_test: db/db_blob_index_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(AM_LINK)
|
||||
|
||||
db_block_cache_test: db/db_block_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(AM_LINK)
|
||||
|
||||
|
1
TARGETS
1
TARGETS
@ -367,6 +367,7 @@ ROCKS_TESTS = [['arena_test', 'util/arena_test.cc', 'serial'],
|
||||
['cuckoo_table_reader_test', 'table/cuckoo_table_reader_test.cc', 'serial'],
|
||||
['date_tiered_test', 'utilities/date_tiered/date_tiered_test.cc', 'serial'],
|
||||
['db_basic_test', 'db/db_basic_test.cc', 'serial'],
|
||||
['db_blob_index_test', 'db/db_blob_index_test.cc', 'serial'],
|
||||
['db_block_cache_test', 'db/db_block_cache_test.cc', 'serial'],
|
||||
['db_bloom_filter_test', 'db/db_bloom_filter_test.cc', 'serial'],
|
||||
['db_compaction_filter_test', 'db/db_compaction_filter_test.cc', 'parallel'],
|
||||
|
@ -3,7 +3,7 @@ image: Visual Studio 2015
|
||||
before_build:
|
||||
- md %APPVEYOR_BUILD_FOLDER%\build
|
||||
- cd %APPVEYOR_BUILD_FOLDER%\build
|
||||
- cmake -G "Visual Studio 14 2015 Win64" -DOPTDBG=1 -DXPRESS=1 ..
|
||||
- cmake -G "Visual Studio 14 2015 Win64" -DOPTDBG=1 -DXPRESS=1 -DPORTABLE=1 ..
|
||||
- cd ..
|
||||
build:
|
||||
project: build\rocksdb.sln
|
||||
|
@ -51,11 +51,13 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
|
||||
FBCODE_BUILD="true"
|
||||
# If we're compiling with TSAN we need pic build
|
||||
PIC_BUILD=$COMPILE_WITH_TSAN
|
||||
if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then
|
||||
source "$PWD/build_tools/fbcode_config.sh"
|
||||
else
|
||||
if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then
|
||||
# we need this to build with MySQL. Don't use for other purposes.
|
||||
source "$PWD/build_tools/fbcode_config4.8.1.sh"
|
||||
elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_5xx" ]; then
|
||||
source "$PWD/build_tools/fbcode_config.sh"
|
||||
else
|
||||
source "$PWD/build_tools/fbcode_config_platform007.sh"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
18
build_tools/dependencies_platform007.sh
Normal file
18
build_tools/dependencies_platform007.sh
Normal file
@ -0,0 +1,18 @@
|
||||
GCC_BASE=/mnt/gvfs/third-party2/gcc/6e8e715624fd15256a7970073387793dfcf79b46/7.x/centos7-native/b2ef2b6
|
||||
CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/ef37e1faa1c29782abfac1ae65a291b9b7966f6d/stable/centos7-native/c9f9104
|
||||
LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/c67031f0f739ac61575a061518d6ef5038f99f90/7.x/platform007/5620abc
|
||||
GLIBC_BASE=/mnt/gvfs/third-party2/glibc/60d6f124a78798b73944f5ba87c2306ae3460153/2.26/platform007/f259413
|
||||
SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/platform007/ca4da3d
|
||||
ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/platform007/ca4da3d
|
||||
BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/platform007/ca4da3d
|
||||
LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/platform007/ca4da3d
|
||||
ZSTD_BASE=/mnt/gvfs/third-party2/zstd/3ee276cbacfad3074e3f07bf826ac47f06970f4e/1.3.5/platform007/15a3614
|
||||
GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/platform007/ca4da3d
|
||||
JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/9c910d36d6235cc40e8ff559358f1833452300ca/master/platform007/5b0f53e
|
||||
NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/platform007/ca4da3d
|
||||
LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/platform007/6f3e0a9
|
||||
TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/platform007/ca4da3d
|
||||
KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/fb/platform007/da39a3e
|
||||
BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/92ff90349e2f43ea0a8246d8b1cf17b6869013e3/2.29.1/centos7-native/da39a3e
|
||||
VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/platform007/ca4da3d
|
||||
LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.3.4/platform007/5007832
|
157
build_tools/fbcode_config_platform007.sh
Normal file
157
build_tools/fbcode_config_platform007.sh
Normal file
@ -0,0 +1,157 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Set environment variables so that we can compile rocksdb using
|
||||
# fbcode settings. It uses the latest g++ and clang compilers and also
|
||||
# uses jemalloc
|
||||
# Environment variables that change the behavior of this script:
|
||||
# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. Libraries that don't have a pic variant will not be included
|
||||
|
||||
|
||||
BASEDIR=`dirname $BASH_SOURCE`
|
||||
source "$BASEDIR/dependencies_platform007.sh"
|
||||
|
||||
CFLAGS=""
|
||||
|
||||
# libgcc
|
||||
LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/7.3.0"
|
||||
LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
|
||||
|
||||
# glibc
|
||||
GLIBC_INCLUDE="$GLIBC_BASE/include"
|
||||
GLIBC_LIBS=" -L $GLIBC_BASE/lib"
|
||||
|
||||
# snappy
|
||||
SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
|
||||
if test -z $PIC_BUILD; then
|
||||
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a"
|
||||
else
|
||||
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy_pic.a"
|
||||
fi
|
||||
CFLAGS+=" -DSNAPPY"
|
||||
|
||||
if test -z $PIC_BUILD; then
|
||||
# location of zlib headers and libraries
|
||||
ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
|
||||
ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a"
|
||||
CFLAGS+=" -DZLIB"
|
||||
|
||||
# location of bzip headers and libraries
|
||||
BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
|
||||
BZIP_LIBS=" $BZIP2_BASE/lib/libbz2.a"
|
||||
CFLAGS+=" -DBZIP2"
|
||||
|
||||
LZ4_INCLUDE=" -I $LZ4_BASE/include/"
|
||||
LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a"
|
||||
CFLAGS+=" -DLZ4"
|
||||
fi
|
||||
|
||||
ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
|
||||
if test -z $PIC_BUILD; then
|
||||
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a"
|
||||
else
|
||||
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a"
|
||||
fi
|
||||
CFLAGS+=" -DZSTD"
|
||||
|
||||
# location of gflags headers and libraries
|
||||
GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
|
||||
if test -z $PIC_BUILD; then
|
||||
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a"
|
||||
else
|
||||
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a"
|
||||
fi
|
||||
CFLAGS+=" -DGFLAGS=gflags"
|
||||
|
||||
# location of jemalloc
|
||||
JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
|
||||
JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc.a"
|
||||
|
||||
if test -z $PIC_BUILD; then
|
||||
# location of numa
|
||||
NUMA_INCLUDE=" -I $NUMA_BASE/include/"
|
||||
NUMA_LIB=" $NUMA_BASE/lib/libnuma.a"
|
||||
CFLAGS+=" -DNUMA"
|
||||
|
||||
# location of libunwind
|
||||
LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a"
|
||||
fi
|
||||
|
||||
# location of TBB
|
||||
TBB_INCLUDE=" -isystem $TBB_BASE/include/"
|
||||
if test -z $PIC_BUILD; then
|
||||
TBB_LIBS="$TBB_BASE/lib/libtbb.a"
|
||||
else
|
||||
TBB_LIBS="$TBB_BASE/lib/libtbb_pic.a"
|
||||
fi
|
||||
CFLAGS+=" -DTBB"
|
||||
|
||||
# use Intel SSE support for checksum calculations
|
||||
export USE_SSE=1
|
||||
export PORTABLE=1
|
||||
|
||||
BINUTILS="$BINUTILS_BASE/bin"
|
||||
AR="$BINUTILS/ar"
|
||||
|
||||
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE"
|
||||
|
||||
STDLIBS="-L $GCC_BASE/lib64"
|
||||
|
||||
CLANG_BIN="$CLANG_BASE/bin"
|
||||
CLANG_LIB="$CLANG_BASE/lib"
|
||||
CLANG_SRC="$CLANG_BASE/../../src"
|
||||
|
||||
CLANG_ANALYZER="$CLANG_BIN/clang++"
|
||||
CLANG_SCAN_BUILD="$CLANG_SRC/llvm/tools/clang/tools/scan-build/bin/scan-build"
|
||||
|
||||
if [ -z "$USE_CLANG" ]; then
|
||||
# gcc
|
||||
CC="$GCC_BASE/bin/gcc"
|
||||
CXX="$GCC_BASE/bin/g++"
|
||||
|
||||
CFLAGS+=" -B$BINUTILS/gold"
|
||||
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
|
||||
CFLAGS+=" -isystem $GLIBC_INCLUDE"
|
||||
JEMALLOC=1
|
||||
else
|
||||
# clang
|
||||
CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
|
||||
CC="$CLANG_BIN/clang"
|
||||
CXX="$CLANG_BIN/clang++"
|
||||
|
||||
KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
|
||||
|
||||
CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
|
||||
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x "
|
||||
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x/x86_64-facebook-linux "
|
||||
CFLAGS+=" -isystem $GLIBC_INCLUDE"
|
||||
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
|
||||
CFLAGS+=" -isystem $CLANG_INCLUDE"
|
||||
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
|
||||
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
|
||||
CFLAGS+=" -Wno-expansion-to-defined "
|
||||
CXXFLAGS="-nostdinc++"
|
||||
fi
|
||||
|
||||
CFLAGS+=" $DEPS_INCLUDE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_SUPPORT_THREAD_LOCAL -DHAVE_SSE42"
|
||||
CXXFLAGS+=" $CFLAGS"
|
||||
|
||||
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"
|
||||
EXEC_LDFLAGS+=" -B$BINUTILS/gold"
|
||||
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform007/lib/ld.so"
|
||||
EXEC_LDFLAGS+=" $LIBUNWIND"
|
||||
EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform007/lib"
|
||||
# required by libtbb
|
||||
EXEC_LDFLAGS+=" -ldl"
|
||||
|
||||
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
|
||||
|
||||
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS"
|
||||
|
||||
VALGRIND_VER="$VALGRIND_BASE/bin/"
|
||||
|
||||
# lua not supported because it's on track for deprecation, I think
|
||||
LUA_PATH=
|
||||
LUA_LIB=
|
||||
|
||||
export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
|
@ -85,8 +85,9 @@ NON_SHM="TMPD=/tmp/rocksdb_test_tmp"
|
||||
GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1"
|
||||
ASAN="COMPILE_WITH_ASAN=1"
|
||||
CLANG="USE_CLANG=1"
|
||||
LITE="OPT=\"-DROCKSDB_LITE -g\""
|
||||
TSAN="COMPILE_WITH_TSAN=1"
|
||||
# in gcc-5 there are known problems with TSAN like https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71090.
|
||||
# using platform007 gives us gcc-8 or higher which has that bug fixed.
|
||||
TSAN="ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1"
|
||||
UBSAN="COMPILE_WITH_UBSAN=1"
|
||||
DISABLE_JEMALLOC="DISABLE_JEMALLOC=1"
|
||||
HTTP_PROXY="https_proxy=http://fwdproxy.29.prn1:8080 http_proxy=http://fwdproxy.29.prn1:8080 ftp_proxy=http://fwdproxy.29.prn1:8080"
|
||||
@ -343,27 +344,7 @@ LITE_BUILD_COMMANDS="[
|
||||
$CLEANUP_ENV,
|
||||
{
|
||||
'name':'Build RocksDB debug version',
|
||||
'shell':'$LITE make J=1 static_lib || $CONTRUN_NAME=lite_static_lib $TASK_CREATION_TOOL',
|
||||
'user':'root',
|
||||
$PARSER
|
||||
},
|
||||
],
|
||||
$REPORT
|
||||
}
|
||||
]"
|
||||
|
||||
#
|
||||
# RocksDB lite tests
|
||||
#
|
||||
LITE_UNIT_TEST_COMMANDS="[
|
||||
{
|
||||
'name':'Rocksdb Lite Unit Test',
|
||||
'oncall':'$ONCALL',
|
||||
'steps': [
|
||||
$CLEANUP_ENV,
|
||||
{
|
||||
'name':'Build RocksDB debug version',
|
||||
'shell':'$SHM $LITE make J=1 check || $CONTRUN_NAME=lite_check $TASK_CREATION_TOOL',
|
||||
'shell':'$LITE make J=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL',
|
||||
'user':'root',
|
||||
$PARSER
|
||||
},
|
||||
@ -748,9 +729,6 @@ case $1 in
|
||||
lite)
|
||||
echo $LITE_BUILD_COMMANDS
|
||||
;;
|
||||
lite_test)
|
||||
echo $LITE_UNIT_TEST_COMMANDS
|
||||
;;
|
||||
stress_crash)
|
||||
echo $STRESS_CRASH_TEST_COMMANDS
|
||||
;;
|
||||
|
@ -52,6 +52,45 @@ function get_lib_base()
|
||||
log_variable $__res_var
|
||||
}
|
||||
|
||||
###########################################################
|
||||
# platform007 dependencies #
|
||||
###########################################################
|
||||
|
||||
OUTPUT="$BASEDIR/dependencies_platform007.sh"
|
||||
|
||||
rm -f "$OUTPUT"
|
||||
touch "$OUTPUT"
|
||||
|
||||
echo "Writing dependencies to $OUTPUT"
|
||||
|
||||
# Compilers locations
|
||||
GCC_BASE=`readlink -f $TP2_LATEST/gcc/7.x/centos7-native/*/`
|
||||
CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/`
|
||||
|
||||
log_variable GCC_BASE
|
||||
log_variable CLANG_BASE
|
||||
|
||||
# Libraries locations
|
||||
get_lib_base libgcc 7.x platform007
|
||||
get_lib_base glibc 2.26 platform007
|
||||
get_lib_base snappy LATEST platform007
|
||||
get_lib_base zlib LATEST platform007
|
||||
get_lib_base bzip2 LATEST platform007
|
||||
get_lib_base lz4 LATEST platform007
|
||||
get_lib_base zstd LATEST platform007
|
||||
get_lib_base gflags LATEST platform007
|
||||
get_lib_base jemalloc LATEST platform007
|
||||
get_lib_base numa LATEST platform007
|
||||
get_lib_base libunwind LATEST platform007
|
||||
get_lib_base tbb LATEST platform007
|
||||
|
||||
get_lib_base kernel-headers fb platform007
|
||||
get_lib_base binutils LATEST centos7-native
|
||||
get_lib_base valgrind LATEST platform007
|
||||
get_lib_base lua 5.3.4 platform007
|
||||
|
||||
git diff $OUTPUT
|
||||
|
||||
###########################################################
|
||||
# 5.x dependencies #
|
||||
###########################################################
|
||||
|
7
cache/lru_cache.cc
vendored
7
cache/lru_cache.cc
vendored
@ -465,14 +465,7 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits,
|
||||
bool strict_capacity_limit, double high_pri_pool_ratio)
|
||||
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
|
||||
num_shards_ = 1 << num_shard_bits;
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable: 4316) // We've validated the alignment with the new operators
|
||||
#endif
|
||||
shards_ = new LRUCacheShard[num_shards_];
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
SetCapacity(capacity);
|
||||
SetStrictCapacityLimit(strict_capacity_limit);
|
||||
for (int i = 0; i < num_shards_; i++) {
|
||||
|
@ -47,15 +47,15 @@ TableBuilder* NewTableBuilder(
|
||||
WritableFileWriter* file, const CompressionType compression_type,
|
||||
const CompressionOptions& compression_opts, int level,
|
||||
const std::string* compression_dict, const bool skip_filters,
|
||||
const uint64_t creation_time) {
|
||||
const uint64_t creation_time, const uint64_t oldest_key_time) {
|
||||
assert((column_family_id ==
|
||||
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
|
||||
column_family_name.empty());
|
||||
return ioptions.table_factory->NewTableBuilder(
|
||||
TableBuilderOptions(ioptions, internal_comparator,
|
||||
int_tbl_prop_collector_factories, compression_type,
|
||||
compression_opts, compression_dict, skip_filters,
|
||||
column_family_name, level, creation_time),
|
||||
TableBuilderOptions(
|
||||
ioptions, internal_comparator, int_tbl_prop_collector_factories,
|
||||
compression_type, compression_opts, compression_dict, skip_filters,
|
||||
column_family_name, level, creation_time, oldest_key_time),
|
||||
column_family_id, file);
|
||||
}
|
||||
|
||||
@ -74,8 +74,8 @@ Status BuildTable(
|
||||
const CompressionOptions& compression_opts, bool paranoid_file_checks,
|
||||
InternalStats* internal_stats, TableFileCreationReason reason,
|
||||
EventLogger* event_logger, int job_id, const Env::IOPriority io_priority,
|
||||
TableProperties* table_properties, int level,
|
||||
const uint64_t creation_time) {
|
||||
TableProperties* table_properties, int level, const uint64_t creation_time,
|
||||
const uint64_t oldest_key_time) {
|
||||
assert((column_family_id ==
|
||||
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
|
||||
column_family_name.empty());
|
||||
@ -120,12 +120,11 @@ Status BuildTable(
|
||||
|
||||
file_writer.reset(new WritableFileWriter(std::move(file), env_options,
|
||||
ioptions.statistics));
|
||||
|
||||
builder = NewTableBuilder(
|
||||
ioptions, internal_comparator, int_tbl_prop_collector_factories,
|
||||
column_family_id, column_family_name, file_writer.get(), compression,
|
||||
compression_opts, level, nullptr /* compression_dict */,
|
||||
false /* skip_filters */, creation_time);
|
||||
false /* skip_filters */, creation_time, oldest_key_time);
|
||||
}
|
||||
|
||||
MergeHelper merge(env, internal_comparator.user_comparator(),
|
||||
|
@ -50,7 +50,8 @@ TableBuilder* NewTableBuilder(
|
||||
WritableFileWriter* file, const CompressionType compression_type,
|
||||
const CompressionOptions& compression_opts, int level,
|
||||
const std::string* compression_dict = nullptr,
|
||||
const bool skip_filters = false, const uint64_t creation_time = 0);
|
||||
const bool skip_filters = false, const uint64_t creation_time = 0,
|
||||
const uint64_t oldest_key_time = 0);
|
||||
|
||||
// Build a Table file from the contents of *iter. The generated file
|
||||
// will be named according to number specified in meta. On success, the rest of
|
||||
@ -77,6 +78,6 @@ extern Status BuildTable(
|
||||
EventLogger* event_logger = nullptr, int job_id = 0,
|
||||
const Env::IOPriority io_priority = Env::IO_HIGH,
|
||||
TableProperties* table_properties = nullptr, int level = -1,
|
||||
const uint64_t creation_time = 0);
|
||||
const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
77
db/c.cc
77
db/c.cc
@ -1386,23 +1386,24 @@ void rocksdb_writebatch_put_log_data(
|
||||
b->rep.PutLogData(Slice(blob, len));
|
||||
}
|
||||
|
||||
class H : public WriteBatch::Handler {
|
||||
public:
|
||||
void* state_;
|
||||
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
|
||||
void (*deleted_)(void*, const char* k, size_t klen);
|
||||
virtual void Put(const Slice& key, const Slice& value) override {
|
||||
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
|
||||
}
|
||||
virtual void Delete(const Slice& key) override {
|
||||
(*deleted_)(state_, key.data(), key.size());
|
||||
}
|
||||
};
|
||||
|
||||
void rocksdb_writebatch_iterate(
|
||||
rocksdb_writebatch_t* b,
|
||||
void* state,
|
||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
|
||||
void (*deleted)(void*, const char* k, size_t klen)) {
|
||||
class H : public WriteBatch::Handler {
|
||||
public:
|
||||
void* state_;
|
||||
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
|
||||
void (*deleted_)(void*, const char* k, size_t klen);
|
||||
virtual void Put(const Slice& key, const Slice& value) override {
|
||||
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
|
||||
}
|
||||
virtual void Delete(const Slice& key) override {
|
||||
(*deleted_)(state_, key.data(), key.size());
|
||||
}
|
||||
};
|
||||
H handler;
|
||||
handler.state_ = state;
|
||||
handler.put_ = put;
|
||||
@ -1647,18 +1648,6 @@ void rocksdb_writebatch_wi_iterate(
|
||||
void* state,
|
||||
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
|
||||
void (*deleted)(void*, const char* k, size_t klen)) {
|
||||
class H : public WriteBatch::Handler {
|
||||
public:
|
||||
void* state_;
|
||||
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
|
||||
void (*deleted_)(void*, const char* k, size_t klen);
|
||||
virtual void Put(const Slice& key, const Slice& value) override {
|
||||
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
|
||||
}
|
||||
virtual void Delete(const Slice& key) override {
|
||||
(*deleted_)(state_, key.data(), key.size());
|
||||
}
|
||||
};
|
||||
H handler;
|
||||
handler.state_ = state;
|
||||
handler.put_ = put;
|
||||
@ -3005,20 +2994,21 @@ void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) {
|
||||
delete st;
|
||||
}
|
||||
|
||||
struct Wrapper : public rocksdb_slicetransform_t {
|
||||
const SliceTransform* rep_;
|
||||
~Wrapper() { delete rep_; }
|
||||
const char* Name() const override { return rep_->Name(); }
|
||||
Slice Transform(const Slice& src) const override {
|
||||
return rep_->Transform(src);
|
||||
}
|
||||
bool InDomain(const Slice& src) const override {
|
||||
return rep_->InDomain(src);
|
||||
}
|
||||
bool InRange(const Slice& src) const override { return rep_->InRange(src); }
|
||||
static void DoNothing(void*) { }
|
||||
};
|
||||
|
||||
rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) {
|
||||
struct Wrapper : public rocksdb_slicetransform_t {
|
||||
const SliceTransform* rep_;
|
||||
~Wrapper() { delete rep_; }
|
||||
const char* Name() const override { return rep_->Name(); }
|
||||
Slice Transform(const Slice& src) const override {
|
||||
return rep_->Transform(src);
|
||||
}
|
||||
bool InDomain(const Slice& src) const override {
|
||||
return rep_->InDomain(src);
|
||||
}
|
||||
bool InRange(const Slice& src) const override { return rep_->InRange(src); }
|
||||
static void DoNothing(void*) { }
|
||||
};
|
||||
Wrapper* wrapper = new Wrapper;
|
||||
wrapper->rep_ = rocksdb::NewFixedPrefixTransform(prefixLen);
|
||||
wrapper->state_ = nullptr;
|
||||
@ -3027,19 +3017,6 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t pref
|
||||
}
|
||||
|
||||
rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() {
|
||||
struct Wrapper : public rocksdb_slicetransform_t {
|
||||
const SliceTransform* rep_;
|
||||
~Wrapper() { delete rep_; }
|
||||
const char* Name() const override { return rep_->Name(); }
|
||||
Slice Transform(const Slice& src) const override {
|
||||
return rep_->Transform(src);
|
||||
}
|
||||
bool InDomain(const Slice& src) const override {
|
||||
return rep_->InDomain(src);
|
||||
}
|
||||
bool InRange(const Slice& src) const override { return rep_->InRange(src); }
|
||||
static void DoNothing(void*) { }
|
||||
};
|
||||
Wrapper* wrapper = new Wrapper;
|
||||
wrapper->rep_ = rocksdb::NewNoopTransform();
|
||||
wrapper->state_ = nullptr;
|
||||
|
@ -930,6 +930,13 @@ SuperVersion* ColumnFamilyData::InstallSuperVersion(
|
||||
super_version_ = new_superversion;
|
||||
++super_version_number_;
|
||||
super_version_->version_number = super_version_number_;
|
||||
if (old_superversion != nullptr) {
|
||||
if (old_superversion->mutable_cf_options.write_buffer_size !=
|
||||
mutable_cf_options.write_buffer_size) {
|
||||
mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
|
||||
}
|
||||
}
|
||||
|
||||
// Reset SuperVersions cached in thread local storage
|
||||
ResetThreadLocalSuperVersions();
|
||||
|
||||
|
@ -25,6 +25,8 @@ CompactionEventListener::CompactionListenerValueType fromInternalValueType(
|
||||
kSingleDelete;
|
||||
case kTypeRangeDeletion:
|
||||
return CompactionEventListener::CompactionListenerValueType::kRangeDelete;
|
||||
case kTypeBlobIndex:
|
||||
return CompactionEventListener::CompactionListenerValueType::kBlobIndex;
|
||||
default:
|
||||
assert(false);
|
||||
return CompactionEventListener::CompactionListenerValueType::kInvalid;
|
||||
@ -228,7 +230,8 @@ void CompactionIterator::NextFromInput() {
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
// apply the compaction filter to the first occurrence of the user key
|
||||
if (compaction_filter_ != nullptr && ikey_.type == kTypeValue &&
|
||||
if (compaction_filter_ != nullptr &&
|
||||
(ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex) &&
|
||||
(visible_at_tip_ || ikey_.sequence > latest_snapshot_ ||
|
||||
ignore_snapshots_)) {
|
||||
// If the user has specified a compaction filter and the sequence
|
||||
@ -238,11 +241,13 @@ void CompactionIterator::NextFromInput() {
|
||||
CompactionFilter::Decision filter;
|
||||
compaction_filter_value_.clear();
|
||||
compaction_filter_skip_until_.Clear();
|
||||
CompactionFilter::ValueType value_type =
|
||||
ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
|
||||
: CompactionFilter::ValueType::kBlobIndex;
|
||||
{
|
||||
StopWatchNano timer(env_, true);
|
||||
filter = compaction_filter_->FilterV2(
|
||||
compaction_->level(), ikey_.user_key,
|
||||
CompactionFilter::ValueType::kValue, value_,
|
||||
compaction_->level(), ikey_.user_key, value_type, value_,
|
||||
&compaction_filter_value_, compaction_filter_skip_until_.rep());
|
||||
iter_stats_.total_filter_time +=
|
||||
env_ != nullptr ? timer.ElapsedNanos() : 0;
|
||||
|
@ -1442,19 +1442,22 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
|
||||
inputs.emplace_back();
|
||||
inputs[0].level = 0;
|
||||
|
||||
for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
|
||||
auto f = *ritr;
|
||||
if (f->fd.table_reader != nullptr &&
|
||||
f->fd.table_reader->GetTableProperties() != nullptr) {
|
||||
auto creation_time =
|
||||
f->fd.table_reader->GetTableProperties()->creation_time;
|
||||
if (creation_time == 0 ||
|
||||
creation_time >=
|
||||
(current_time - ioptions_.compaction_options_fifo.ttl)) {
|
||||
break;
|
||||
// avoid underflow
|
||||
if (current_time > ioptions_.compaction_options_fifo.ttl) {
|
||||
for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
|
||||
auto f = *ritr;
|
||||
if (f->fd.table_reader != nullptr &&
|
||||
f->fd.table_reader->GetTableProperties() != nullptr) {
|
||||
auto creation_time =
|
||||
f->fd.table_reader->GetTableProperties()->creation_time;
|
||||
if (creation_time == 0 ||
|
||||
creation_time >=
|
||||
(current_time - ioptions_.compaction_options_fifo.ttl)) {
|
||||
break;
|
||||
}
|
||||
total_size -= f->compensated_file_size;
|
||||
inputs[0].files.push_back(f);
|
||||
}
|
||||
total_size -= f->compensated_file_size;
|
||||
inputs[0].files.push_back(f);
|
||||
}
|
||||
}
|
||||
|
||||
|
409
db/db_blob_index_test.cc
Normal file
409
db/db_blob_index_test.cc
Normal file
@ -0,0 +1,409 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "db/column_family.h"
|
||||
#include "db/db_iter.h"
|
||||
#include "db/db_test_util.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "port/port.h"
|
||||
#include "port/stack_trace.h"
|
||||
#include "util/string_util.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb
|
||||
// should accept the value type on write, and report not supported value
|
||||
// for reads, unless the caller requests it explicitly. The base rocksdb
|
||||
// doesn't understand the format of the actual blob index (the value).
|
||||
class DBBlobIndexTest : public DBTestBase {
|
||||
public:
|
||||
enum Tier {
|
||||
kMemtable = 0,
|
||||
kImmutableMemtables = 1,
|
||||
kL0SstFile = 2,
|
||||
kLnSstFile = 3,
|
||||
};
|
||||
const std::vector<Tier> kAllTiers = {Tier::kMemtable,
|
||||
Tier::kImmutableMemtables,
|
||||
Tier::kL0SstFile, Tier::kLnSstFile};
|
||||
|
||||
DBBlobIndexTest() : DBTestBase("/db_blob_index_test") {}
|
||||
|
||||
ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
|
||||
|
||||
ColumnFamilyData* cfd() {
|
||||
return reinterpret_cast<ColumnFamilyHandleImpl*>(cfh())->cfd();
|
||||
}
|
||||
|
||||
Status PutBlobIndex(WriteBatch* batch, const Slice& key,
|
||||
const Slice& blob_index) {
|
||||
return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
|
||||
blob_index);
|
||||
}
|
||||
|
||||
Status Write(WriteBatch* batch) {
|
||||
return dbfull()->Write(WriteOptions(), batch);
|
||||
}
|
||||
|
||||
std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
|
||||
const Snapshot* snapshot = nullptr) {
|
||||
ReadOptions read_options;
|
||||
read_options.snapshot = snapshot;
|
||||
PinnableSlice value;
|
||||
auto s = dbfull()->GetImpl(read_options, cfh(), key, &value,
|
||||
nullptr /*value_found*/, is_blob_index);
|
||||
if (s.IsNotFound()) {
|
||||
return "NOT_FOUND";
|
||||
}
|
||||
if (s.IsNotSupported()) {
|
||||
return "NOT_SUPPORTED";
|
||||
}
|
||||
if (!s.ok()) {
|
||||
return s.ToString();
|
||||
}
|
||||
return value.ToString();
|
||||
}
|
||||
|
||||
std::string GetBlobIndex(const Slice& key,
|
||||
const Snapshot* snapshot = nullptr) {
|
||||
bool is_blob_index = false;
|
||||
std::string value = GetImpl(key, &is_blob_index, snapshot);
|
||||
if (!is_blob_index) {
|
||||
return "NOT_BLOB";
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
ArenaWrappedDBIter* GetBlobIterator() {
|
||||
return dbfull()->NewIteratorImpl(ReadOptions(), cfd(),
|
||||
dbfull()->GetLatestSequenceNumber(),
|
||||
true /*allow_blob*/);
|
||||
}
|
||||
|
||||
Options GetTestOptions() {
|
||||
Options options;
|
||||
options.create_if_missing = true;
|
||||
options.num_levels = 2;
|
||||
options.disable_auto_compactions = true;
|
||||
// Disable auto flushes.
|
||||
options.max_write_buffer_number = 10;
|
||||
options.min_write_buffer_number_to_merge = 10;
|
||||
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
||||
return options;
|
||||
}
|
||||
|
||||
void MoveDataTo(Tier tier) {
|
||||
switch (tier) {
|
||||
case Tier::kMemtable:
|
||||
break;
|
||||
case Tier::kImmutableMemtables:
|
||||
ASSERT_OK(dbfull()->TEST_SwitchMemtable());
|
||||
break;
|
||||
case Tier::kL0SstFile:
|
||||
ASSERT_OK(Flush());
|
||||
break;
|
||||
case Tier::kLnSstFile:
|
||||
ASSERT_OK(Flush());
|
||||
ASSERT_OK(Put("a", "dummy"));
|
||||
ASSERT_OK(Put("z", "dummy"));
|
||||
ASSERT_OK(Flush());
|
||||
ASSERT_OK(
|
||||
dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
||||
#ifndef ROCKSDB_LITE
|
||||
ASSERT_EQ("0,1", FilesPerLevel());
|
||||
#endif // !ROCKSDB_LITE
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Should be able to write kTypeBlobIndex to memtables and SST files.
|
||||
TEST_F(DBBlobIndexTest, Write) {
|
||||
for (auto tier : kAllTiers) {
|
||||
DestroyAndReopen(GetTestOptions());
|
||||
for (int i = 1; i <= 5; i++) {
|
||||
std::string index = ToString(i);
|
||||
WriteBatch batch;
|
||||
ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index));
|
||||
ASSERT_OK(Write(&batch));
|
||||
}
|
||||
MoveDataTo(tier);
|
||||
for (int i = 1; i <= 5; i++) {
|
||||
std::string index = ToString(i);
|
||||
ASSERT_EQ("blob" + index, GetBlobIndex("key" + index));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get should be able to return blob index if is_blob_index is provided,
|
||||
// otherwise return Status::NotSupported status.
|
||||
TEST_F(DBBlobIndexTest, Get) {
|
||||
for (auto tier : kAllTiers) {
|
||||
DestroyAndReopen(GetTestOptions());
|
||||
WriteBatch batch;
|
||||
ASSERT_OK(batch.Put("key", "value"));
|
||||
ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index"));
|
||||
ASSERT_OK(Write(&batch));
|
||||
MoveDataTo(tier);
|
||||
// Verify normal value
|
||||
bool is_blob_index = false;
|
||||
PinnableSlice value;
|
||||
ASSERT_EQ("value", Get("key"));
|
||||
ASSERT_EQ("value", GetImpl("key"));
|
||||
ASSERT_EQ("value", GetImpl("key", &is_blob_index));
|
||||
ASSERT_FALSE(is_blob_index);
|
||||
// Verify blob index
|
||||
ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
|
||||
ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
|
||||
ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index));
|
||||
ASSERT_TRUE(is_blob_index);
|
||||
}
|
||||
}
|
||||
|
||||
// Get should NOT return Status::NotSupported if blob index is updated with
|
||||
// a normal value.
|
||||
TEST_F(DBBlobIndexTest, Updated) {
|
||||
for (auto tier : kAllTiers) {
|
||||
DestroyAndReopen(GetTestOptions());
|
||||
WriteBatch batch;
|
||||
for (int i = 0; i < 10; i++) {
|
||||
ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index"));
|
||||
}
|
||||
ASSERT_OK(Write(&batch));
|
||||
// Avoid blob values from being purged.
|
||||
const Snapshot* snapshot = dbfull()->GetSnapshot();
|
||||
ASSERT_OK(Put("key1", "new_value"));
|
||||
ASSERT_OK(Merge("key2", "a"));
|
||||
ASSERT_OK(Merge("key2", "b"));
|
||||
ASSERT_OK(Merge("key2", "c"));
|
||||
ASSERT_OK(Delete("key3"));
|
||||
ASSERT_OK(SingleDelete("key4"));
|
||||
ASSERT_OK(Delete("key5"));
|
||||
ASSERT_OK(Merge("key5", "a"));
|
||||
ASSERT_OK(Merge("key5", "b"));
|
||||
ASSERT_OK(Merge("key5", "c"));
|
||||
ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
|
||||
MoveDataTo(tier);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot));
|
||||
}
|
||||
ASSERT_EQ("new_value", Get("key1"));
|
||||
ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
|
||||
ASSERT_EQ("NOT_FOUND", Get("key3"));
|
||||
ASSERT_EQ("NOT_FOUND", Get("key4"));
|
||||
ASSERT_EQ("a,b,c", GetImpl("key5"));
|
||||
for (int i = 6; i < 9; i++) {
|
||||
ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i)));
|
||||
}
|
||||
ASSERT_EQ("blob_index", GetBlobIndex("key9"));
|
||||
dbfull()->ReleaseSnapshot(snapshot);
|
||||
}
|
||||
}
|
||||
|
||||
// Iterator should get blob value if allow_blob flag is set,
|
||||
// otherwise return Status::NotSupported status.
|
||||
TEST_F(DBBlobIndexTest, Iterate) {
|
||||
const std::vector<std::vector<ValueType>> data = {
|
||||
/*00*/ {kTypeValue},
|
||||
/*01*/ {kTypeBlobIndex},
|
||||
/*02*/ {kTypeValue},
|
||||
/*03*/ {kTypeBlobIndex, kTypeValue},
|
||||
/*04*/ {kTypeValue},
|
||||
/*05*/ {kTypeValue, kTypeBlobIndex},
|
||||
/*06*/ {kTypeValue},
|
||||
/*07*/ {kTypeDeletion, kTypeBlobIndex},
|
||||
/*08*/ {kTypeValue},
|
||||
/*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
|
||||
/*10*/ {kTypeValue},
|
||||
/*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
|
||||
/*12*/ {kTypeValue},
|
||||
/*13*/
|
||||
{kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
|
||||
/*14*/ {kTypeValue},
|
||||
/*15*/ {kTypeBlobIndex},
|
||||
/*16*/ {kTypeValue},
|
||||
};
|
||||
|
||||
auto get_key = [](int index) {
|
||||
char buf[20];
|
||||
snprintf(buf, sizeof(buf), "%02d", index);
|
||||
return "key" + std::string(buf);
|
||||
};
|
||||
|
||||
auto get_value = [&](int index, int version) {
|
||||
return get_key(index) + "_value" + ToString(version);
|
||||
};
|
||||
|
||||
auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
|
||||
const Slice& expected_value) {
|
||||
ASSERT_EQ(expected_status, iterator->status().code());
|
||||
if (expected_status == Status::kOk) {
|
||||
ASSERT_TRUE(iterator->Valid());
|
||||
ASSERT_EQ(expected_value, iterator->value());
|
||||
} else {
|
||||
ASSERT_FALSE(iterator->Valid());
|
||||
}
|
||||
};
|
||||
|
||||
auto create_normal_iterator = [&]() -> Iterator* {
|
||||
return dbfull()->NewIterator(ReadOptions());
|
||||
};
|
||||
|
||||
auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
|
||||
|
||||
auto check_is_blob = [&](bool is_blob) {
|
||||
return [is_blob](Iterator* iterator) {
|
||||
ASSERT_EQ(is_blob,
|
||||
reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
|
||||
};
|
||||
};
|
||||
|
||||
auto verify = [&](int index, Status::Code expected_status,
|
||||
const Slice& forward_value, const Slice& backward_value,
|
||||
std::function<Iterator*()> create_iterator,
|
||||
std::function<void(Iterator*)> extra_check = nullptr) {
|
||||
// Seek
|
||||
auto* iterator = create_iterator();
|
||||
ASSERT_OK(iterator->Refresh());
|
||||
iterator->Seek(get_key(index));
|
||||
check_iterator(iterator, expected_status, forward_value);
|
||||
if (extra_check) {
|
||||
extra_check(iterator);
|
||||
}
|
||||
delete iterator;
|
||||
|
||||
// Next
|
||||
iterator = create_iterator();
|
||||
ASSERT_OK(iterator->Refresh());
|
||||
iterator->Seek(get_key(index - 1));
|
||||
ASSERT_TRUE(iterator->Valid());
|
||||
iterator->Next();
|
||||
check_iterator(iterator, expected_status, forward_value);
|
||||
if (extra_check) {
|
||||
extra_check(iterator);
|
||||
}
|
||||
delete iterator;
|
||||
|
||||
// SeekForPrev
|
||||
iterator = create_iterator();
|
||||
ASSERT_OK(iterator->Refresh());
|
||||
iterator->SeekForPrev(get_key(index));
|
||||
check_iterator(iterator, expected_status, backward_value);
|
||||
if (extra_check) {
|
||||
extra_check(iterator);
|
||||
}
|
||||
delete iterator;
|
||||
|
||||
// Prev
|
||||
iterator = create_iterator();
|
||||
iterator->Seek(get_key(index + 1));
|
||||
ASSERT_TRUE(iterator->Valid());
|
||||
iterator->Prev();
|
||||
check_iterator(iterator, expected_status, backward_value);
|
||||
if (extra_check) {
|
||||
extra_check(iterator);
|
||||
}
|
||||
delete iterator;
|
||||
};
|
||||
|
||||
for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
|
||||
// Avoid values from being purged.
|
||||
std::vector<const Snapshot*> snapshots;
|
||||
DestroyAndReopen(GetTestOptions());
|
||||
|
||||
// fill data
|
||||
for (int i = 0; i < static_cast<int>(data.size()); i++) {
|
||||
for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
|
||||
std::string key = get_key(i);
|
||||
std::string value = get_value(i, j);
|
||||
WriteBatch batch;
|
||||
switch (data[i][j]) {
|
||||
case kTypeValue:
|
||||
ASSERT_OK(Put(key, value));
|
||||
break;
|
||||
case kTypeDeletion:
|
||||
ASSERT_OK(Delete(key));
|
||||
break;
|
||||
case kTypeSingleDeletion:
|
||||
ASSERT_OK(SingleDelete(key));
|
||||
break;
|
||||
case kTypeMerge:
|
||||
ASSERT_OK(Merge(key, value));
|
||||
break;
|
||||
case kTypeBlobIndex:
|
||||
ASSERT_OK(PutBlobIndex(&batch, key, value));
|
||||
ASSERT_OK(Write(&batch));
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
};
|
||||
}
|
||||
snapshots.push_back(dbfull()->GetSnapshot());
|
||||
}
|
||||
ASSERT_OK(
|
||||
dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
|
||||
snapshots.push_back(dbfull()->GetSnapshot());
|
||||
MoveDataTo(tier);
|
||||
|
||||
// Normal iterator
|
||||
verify(1, Status::kNotSupported, "", "", create_normal_iterator);
|
||||
verify(3, Status::kNotSupported, "", "", create_normal_iterator);
|
||||
verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
|
||||
create_normal_iterator);
|
||||
verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
|
||||
create_normal_iterator);
|
||||
verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
|
||||
create_normal_iterator);
|
||||
verify(11, Status::kNotSupported, "", "", create_normal_iterator);
|
||||
verify(13, Status::kOk,
|
||||
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
|
||||
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
|
||||
create_normal_iterator);
|
||||
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
|
||||
create_normal_iterator);
|
||||
|
||||
// Iterator with blob support
|
||||
verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
|
||||
create_blob_iterator, check_is_blob(true));
|
||||
verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
|
||||
create_blob_iterator, check_is_blob(true));
|
||||
verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
|
||||
create_blob_iterator, check_is_blob(false));
|
||||
verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
|
||||
create_blob_iterator, check_is_blob(false));
|
||||
verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
|
||||
create_blob_iterator, check_is_blob(false));
|
||||
verify(11, Status::kNotSupported, "", "", create_blob_iterator);
|
||||
verify(13, Status::kOk,
|
||||
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
|
||||
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
|
||||
create_blob_iterator, check_is_blob(false));
|
||||
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
|
||||
create_blob_iterator, check_is_blob(false));
|
||||
|
||||
for (auto* snapshot : snapshots) {
|
||||
dbfull()->ReleaseSnapshot(snapshot);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
@ -1439,6 +1439,60 @@ TEST_F(DBCompactionTest, DeleteFileRange) {
|
||||
ASSERT_GT(old_num_files, new_num_files);
|
||||
}
|
||||
|
||||
TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
  // regression test for #2833: groups of files whose user-keys overlap at the
  // endpoints could be split by `DeleteFilesInRange`. This caused old data to
  // reappear, either because a new version of the key was removed, or a range
  // deletion was partially dropped. It could also cause non-overlapping
  // invariant to be violated if the files dropped by DeleteFilesInRange were
  // a subset of files that a range deletion spans.
  const int kNumL0Files = 2;
  const int kValSize = 8 << 10;  // 8KB
  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = kNumL0Files;
  options.target_file_size_base = 1 << 10;  // 1KB
  DestroyAndReopen(options);

  // The snapshot prevents key 1 from having its old version dropped. The low
  // `target_file_size_base` ensures two keys will be in each output file.
  const Snapshot* snapshot = nullptr;
  Random rnd(301);
  // The value indicates which flush the key belonged to, which is enough
  // for us to determine the keys' relative ages. After L0 flushes finish,
  // files look like:
  //
  // File 0: 0 -> vals[0], 1 -> vals[0]
  // File 1:               1 -> vals[1], 2 -> vals[1]
  //
  // Then L0->L1 compaction happens, which outputs keys as follows:
  //
  // File 0: 0 -> vals[0], 1 -> vals[1]
  // File 1:               1 -> vals[0], 2 -> vals[1]
  //
  // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
  // would cause `1 -> vals[0]` (an older key) to reappear.
  std::string vals[kNumL0Files];
  for (int i = 0; i < kNumL0Files; ++i) {
    vals[i] = RandomString(&rnd, kValSize);
    // Fail fast if the setup itself goes wrong instead of silently dropping
    // the returned Status.
    ASSERT_OK(Put(Key(i), vals[i]));
    ASSERT_OK(Put(Key(i + 1), vals[i]));
    ASSERT_OK(Flush());
    if (i == 0) {
      snapshot = db_->GetSnapshot();
    }
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // Verify `DeleteFilesInRange` can't drop only file 0 which would cause
  // "1 -> vals[0]" to reappear.
  //
  // A Slice only points at the bytes it was built from, so the key strings
  // are kept in named variables; binding a Slice directly to the temporary
  // returned by Key() would leave it dangling once that statement ends.
  std::string begin_str = Key(0);
  std::string end_str = Key(1);
  Slice begin = begin_str;
  Slice end = end_str;
  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
  ASSERT_EQ(vals[1], Get(Key(1)));

  db_->ReleaseSnapshot(snapshot);
}
|
||||
|
||||
TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
|
||||
int32_t trivial_move = 0;
|
||||
int32_t non_trivial_move = 0;
|
||||
|
201
db/db_impl.cc
201
db/db_impl.cc
@ -909,7 +909,8 @@ Status DBImpl::Get(const ReadOptions& read_options,
|
||||
|
||||
Status DBImpl::GetImpl(const ReadOptions& read_options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
PinnableSlice* pinnable_val, bool* value_found) {
|
||||
PinnableSlice* pinnable_val, bool* value_found,
|
||||
bool* is_blob_index) {
|
||||
assert(pinnable_val != nullptr);
|
||||
StopWatch sw(env_, stats_, DB_GET);
|
||||
PERF_TIMER_GUARD(get_snapshot_time);
|
||||
@ -959,13 +960,13 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
|
||||
bool done = false;
|
||||
if (!skip_memtable) {
|
||||
if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
|
||||
&range_del_agg, read_options)) {
|
||||
&range_del_agg, read_options, is_blob_index)) {
|
||||
done = true;
|
||||
pinnable_val->PinSelf();
|
||||
RecordTick(stats_, MEMTABLE_HIT);
|
||||
} else if ((s.ok() || s.IsMergeInProgress()) &&
|
||||
sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
|
||||
&range_del_agg, read_options)) {
|
||||
&range_del_agg, read_options, is_blob_index)) {
|
||||
done = true;
|
||||
pinnable_val->PinSelf();
|
||||
RecordTick(stats_, MEMTABLE_HIT);
|
||||
@ -977,7 +978,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
|
||||
if (!done) {
|
||||
PERF_TIMER_GUARD(get_from_output_files_time);
|
||||
sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context,
|
||||
&range_del_agg, value_found);
|
||||
&range_del_agg, value_found, nullptr, nullptr,
|
||||
is_blob_index);
|
||||
RecordTick(stats_, MEMTABLE_MISS);
|
||||
}
|
||||
|
||||
@ -1417,73 +1419,79 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
|
||||
#endif
|
||||
} else {
|
||||
SequenceNumber latest_snapshot = versions_->LastSequence();
|
||||
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
||||
|
||||
auto snapshot =
|
||||
read_options.snapshot != nullptr
|
||||
? reinterpret_cast<const SnapshotImpl*>(
|
||||
read_options.snapshot)->number_
|
||||
? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
|
||||
->number_
|
||||
: latest_snapshot;
|
||||
|
||||
// Try to generate a DB iterator tree in continuous memory area to be
|
||||
// cache friendly. Here is an example of result:
|
||||
// +-------------------------------+
|
||||
// | |
|
||||
// | ArenaWrappedDBIter |
|
||||
// | + |
|
||||
// | +---> Inner Iterator ------------+
|
||||
// | | | |
|
||||
// | | +-- -- -- -- -- -- -- --+ |
|
||||
// | +--- | Arena | |
|
||||
// | | | |
|
||||
// | Allocated Memory: | |
|
||||
// | | +-------------------+ |
|
||||
// | | | DBIter | <---+
|
||||
// | | + |
|
||||
// | | | +-> iter_ ------------+
|
||||
// | | | | |
|
||||
// | | +-------------------+ |
|
||||
// | | | MergingIterator | <---+
|
||||
// | | + |
|
||||
// | | | +->child iter1 ------------+
|
||||
// | | | | | |
|
||||
// | | +->child iter2 ----------+ |
|
||||
// | | | | | | |
|
||||
// | | | +->child iter3 --------+ | |
|
||||
// | | | | | |
|
||||
// | | +-------------------+ | | |
|
||||
// | | | Iterator1 | <--------+
|
||||
// | | +-------------------+ | |
|
||||
// | | | Iterator2 | <------+
|
||||
// | | +-------------------+ |
|
||||
// | | | Iterator3 | <----+
|
||||
// | | +-------------------+
|
||||
// | | |
|
||||
// +-------+-----------------------+
|
||||
//
|
||||
// ArenaWrappedDBIter inlines an arena area where all the iterators in
|
||||
// the iterator tree are allocated in the order of being accessed when
|
||||
// querying.
|
||||
// Laying out the iterators in the order of being accessed makes it more
|
||||
// likely that any iterator pointer is close to the iterator it points to so
|
||||
// that they are likely to be in the same cache line and/or page.
|
||||
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
|
||||
env_, read_options, *cfd->ioptions(), snapshot,
|
||||
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
||||
sv->version_number,
|
||||
((read_options.snapshot != nullptr) ? nullptr : this), cfd);
|
||||
|
||||
InternalIterator* internal_iter =
|
||||
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
|
||||
db_iter->GetRangeDelAggregator());
|
||||
db_iter->SetIterUnderDBIter(internal_iter);
|
||||
|
||||
return db_iter;
|
||||
return NewIteratorImpl(read_options, cfd, snapshot);
|
||||
}
|
||||
// To stop compiler from complaining
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Builds an ArenaWrappedDBIter over `cfd`: takes a referenced SuperVersion,
// creates the arena-backed wrapper (forwarding `snapshot` and `allow_blob`),
// creates the internal iterator inside the wrapper's arena and attaches it.
// Returns the wrapper.
ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
                                            ColumnFamilyData* cfd,
                                            SequenceNumber snapshot,
                                            bool allow_blob) {
  SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_);

  // Try to generate a DB iterator tree in continuous memory area to be
  // cache friendly. Here is an example of result:
  // +-------------------------------+
  // |                               |
  // | ArenaWrappedDBIter            |
  // |  +                            |
  // |  +---> Inner Iterator   ------------+
  // |  |                            |     |
  // |  |    +-- -- -- -- -- -- -- --+     |
  // |  +--- | Arena                 |     |
  // |       |                       |     |
  // |          Allocated Memory:    |     |
  // |       |   +-------------------+     |
  // |       |   | DBIter            | <---+
  // |           |  +                |
  // |       |   |  +-> iter_  ------------+
  // |       |   |                   |     |
  // |       |   +-------------------+     |
  // |       |   | MergingIterator   | <---+
  // |           |  +                |
  // |       |   |  +->child iter1  ------------+
  // |       |   |  |                |          |
  // |           |  +->child iter2  ----------+ |
  // |       |   |  |                |        | |
  // |       |   |  +->child iter3  --------+ | |
  // |           |                   |      | | |
  // |       |   +-------------------+      | | |
  // |       |   | Iterator1         | <--------+
  // |       |   +-------------------+      | |
  // |       |   | Iterator2         | <------+
  // |       |   +-------------------+      |
  // |       |   | Iterator3         | <----+
  // |       |   +-------------------+
  // |       |                       |
  // +-------+-----------------------+
  //
  // ArenaWrappedDBIter inlines an arena area where all the iterators in
  // the iterator tree are allocated in the order of being accessed when
  // querying.
  // Laying out the iterators in the order of being accessed makes it more
  // likely that any iterator pointer is close to the iterator it points to so
  // that they are likely to be in the same cache line and/or page.

  // The wrapper is handed this DBImpl only when the caller did not pin an
  // explicit snapshot in `read_options`; otherwise it receives nullptr.
  DBImpl* const db_for_iter =
      (read_options.snapshot == nullptr) ? this : nullptr;
  const auto max_skip =
      super_version->mutable_cf_options.max_sequential_skip_in_iterations;

  ArenaWrappedDBIter* arena_iter = NewArenaWrappedDbIterator(
      env_, read_options, *cfd->ioptions(), snapshot, max_skip,
      super_version->version_number, db_for_iter, cfd, allow_blob);

  // The internal iterator is created with the wrapper's own arena and range
  // deletion aggregator, then installed underneath it.
  InternalIterator* inner_iter = NewInternalIterator(
      read_options, cfd, super_version, arena_iter->GetArena(),
      arena_iter->GetRangeDelAggregator());
  arena_iter->SetIterUnderDBIter(inner_iter);

  return arena_iter;
}
|
||||
|
||||
Status DBImpl::NewIterators(
|
||||
const ReadOptions& read_options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
@ -1527,28 +1535,16 @@ Status DBImpl::NewIterators(
|
||||
#endif
|
||||
} else {
|
||||
SequenceNumber latest_snapshot = versions_->LastSequence();
|
||||
auto snapshot =
|
||||
read_options.snapshot != nullptr
|
||||
? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
|
||||
->number_
|
||||
: latest_snapshot;
|
||||
|
||||
for (size_t i = 0; i < column_families.size(); ++i) {
|
||||
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
|
||||
column_families[i])->cfd();
|
||||
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
||||
|
||||
auto snapshot =
|
||||
read_options.snapshot != nullptr
|
||||
? reinterpret_cast<const SnapshotImpl*>(
|
||||
read_options.snapshot)->number_
|
||||
: latest_snapshot;
|
||||
|
||||
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
|
||||
env_, read_options, *cfd->ioptions(), snapshot,
|
||||
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
||||
sv->version_number,
|
||||
((read_options.snapshot != nullptr) ? nullptr : this), cfd);
|
||||
InternalIterator* internal_iter =
|
||||
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
|
||||
db_iter->GetRangeDelAggregator());
|
||||
db_iter->SetIterUnderDBIter(internal_iter);
|
||||
iterators->push_back(db_iter);
|
||||
iterators->push_back(NewIteratorImpl(read_options, cfd, snapshot));
|
||||
}
|
||||
}
|
||||
|
||||
@ -1587,12 +1583,10 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
|
||||
delete casted_s;
|
||||
}
|
||||
|
||||
bool DBImpl::HasActiveSnapshotLaterThanSN(SequenceNumber sn) {
|
||||
bool DBImpl::HasActiveSnapshotInRange(SequenceNumber lower_bound,
|
||||
SequenceNumber upper_bound) {
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
if (snapshots_.empty()) {
|
||||
return false;
|
||||
}
|
||||
return (snapshots_.newest()->GetSequenceNumber() > sn);
|
||||
return snapshots_.HasSnapshotInRange(lower_bound, upper_bound);
|
||||
}
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
@ -2068,25 +2062,19 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family,
|
||||
end_key = &end_storage;
|
||||
}
|
||||
|
||||
vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1,
|
||||
nullptr, false);
|
||||
vstorage->GetCleanInputsWithinInterval(i, begin_key, end_key,
|
||||
&level_files, -1 /* hint_index */,
|
||||
nullptr /* file_index */);
|
||||
FileMetaData* level_file;
|
||||
for (uint32_t j = 0; j < level_files.size(); j++) {
|
||||
level_file = level_files[j];
|
||||
if (((begin == nullptr) ||
|
||||
(cfd->internal_comparator().user_comparator()->Compare(
|
||||
level_file->smallest.user_key(), *begin) >= 0)) &&
|
||||
((end == nullptr) ||
|
||||
(cfd->internal_comparator().user_comparator()->Compare(
|
||||
level_file->largest.user_key(), *end) <= 0))) {
|
||||
if (level_file->being_compacted) {
|
||||
continue;
|
||||
}
|
||||
edit.SetColumnFamily(cfd->GetID());
|
||||
edit.DeleteFile(i, level_file->fd.GetNumber());
|
||||
deleted_files.push_back(level_file);
|
||||
level_file->being_compacted = true;
|
||||
if (level_file->being_compacted) {
|
||||
continue;
|
||||
}
|
||||
edit.SetColumnFamily(cfd->GetID());
|
||||
edit.DeleteFile(i, level_file->fd.GetNumber());
|
||||
deleted_files.push_back(level_file);
|
||||
level_file->being_compacted = true;
|
||||
}
|
||||
}
|
||||
if (edit.GetDeletedFiles().empty()) {
|
||||
@ -2532,7 +2520,8 @@ SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
|
||||
#ifndef ROCKSDB_LITE
|
||||
Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||
bool cache_only, SequenceNumber* seq,
|
||||
bool* found_record_for_key) {
|
||||
bool* found_record_for_key,
|
||||
bool* is_blob_index) {
|
||||
Status s;
|
||||
MergeContext merge_context;
|
||||
RangeDelAggregator range_del_agg(sv->mem->GetInternalKeyComparator(),
|
||||
@ -2547,7 +2536,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||
|
||||
// Check if there is a record for this key in the latest memtable
|
||||
sv->mem->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq,
|
||||
read_options);
|
||||
read_options, is_blob_index);
|
||||
|
||||
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
||||
// unexpected error reading memtable.
|
||||
@ -2566,7 +2555,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||
|
||||
// Check if there is a record for this key in the immutable memtables
|
||||
sv->imm->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq,
|
||||
read_options);
|
||||
read_options, is_blob_index);
|
||||
|
||||
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
||||
// unexpected error reading memtable.
|
||||
@ -2585,7 +2574,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||
|
||||
// Check if there is a record for this key in the immutable memtables
|
||||
sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, &range_del_agg,
|
||||
seq, read_options);
|
||||
seq, read_options, is_blob_index);
|
||||
|
||||
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
||||
// unexpected error reading memtable.
|
||||
@ -2609,7 +2598,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||
// Check tables
|
||||
sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
|
||||
&range_del_agg, nullptr /* value_found */,
|
||||
found_record_for_key, seq);
|
||||
found_record_for_key, seq, is_blob_index);
|
||||
|
||||
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
||||
// unexpected error reading SST files
|
||||
|
31
db/db_impl.h
31
db/db_impl.h
@ -52,6 +52,7 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class ArenaWrappedDBIter;
|
||||
class MemTable;
|
||||
class TableCache;
|
||||
class Version;
|
||||
@ -93,6 +94,13 @@ class DBImpl : public DB {
|
||||
virtual Status Get(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
PinnableSlice* value) override;
|
||||
|
||||
// Function that Get and KeyMayExist call with no_io true or false
|
||||
// Note: 'value_found' from KeyMayExist propagates here
|
||||
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
||||
const Slice& key, PinnableSlice* value,
|
||||
bool* value_found = nullptr, bool* is_blob_index = nullptr);
|
||||
|
||||
using DB::MultiGet;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
@ -123,6 +131,7 @@ class DBImpl : public DB {
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
std::string* value,
|
||||
bool* value_found = nullptr) override;
|
||||
|
||||
using DB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family) override;
|
||||
@ -130,6 +139,11 @@ class DBImpl : public DB {
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
std::vector<Iterator*>* iterators) override;
|
||||
ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
|
||||
ColumnFamilyData* cfd,
|
||||
SequenceNumber snapshot,
|
||||
bool allow_blob = false);
|
||||
|
||||
virtual const Snapshot* GetSnapshot() override;
|
||||
virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
|
||||
using DB::GetProperty;
|
||||
@ -202,7 +216,9 @@ class DBImpl : public DB {
|
||||
|
||||
virtual SequenceNumber GetLatestSequenceNumber() const override;
|
||||
|
||||
bool HasActiveSnapshotLaterThanSN(SequenceNumber sn);
|
||||
// Whether there is an active snapshot in range [lower_bound, upper_bound).
|
||||
bool HasActiveSnapshotInRange(SequenceNumber lower_bound,
|
||||
SequenceNumber upper_bound);
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
using DB::ResetStats;
|
||||
@ -285,7 +301,8 @@ class DBImpl : public DB {
|
||||
// TODO(andrewkr): this API need to be aware of range deletion operations
|
||||
Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||
bool cache_only, SequenceNumber* seq,
|
||||
bool* found_record_for_key);
|
||||
bool* found_record_for_key,
|
||||
bool* is_blob_index = nullptr);
|
||||
|
||||
using DB::IngestExternalFile;
|
||||
virtual Status IngestExternalFile(
|
||||
@ -341,6 +358,8 @@ class DBImpl : public DB {
|
||||
return alive_log_files_.begin()->getting_flushed;
|
||||
}
|
||||
|
||||
Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status TEST_FlushMemTable(bool wait = true,
|
||||
ColumnFamilyHandle* cfh = nullptr);
|
||||
@ -644,7 +663,9 @@ class DBImpl : public DB {
|
||||
friend struct SuperVersion;
|
||||
friend class CompactedDBImpl;
|
||||
#ifndef NDEBUG
|
||||
friend class DBTest2_ReadCallbackTest_Test;
|
||||
friend class XFTransactionWriteHandler;
|
||||
friend class DBBlobIndexTest;
|
||||
#endif
|
||||
struct CompactionState;
|
||||
|
||||
@ -1241,12 +1262,6 @@ class DBImpl : public DB {
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
// Function that Get and KeyMayExist call with no_io true or false
|
||||
// Note: 'value_found' from KeyMayExist propagates here
|
||||
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
||||
const Slice& key, PinnableSlice* value,
|
||||
bool* value_found = nullptr);
|
||||
|
||||
bool GetIntPropertyInternal(ColumnFamilyData* cfd,
|
||||
const DBPropertyInfo& property_info,
|
||||
bool is_locked, uint64_t* value);
|
||||
|
@ -80,6 +80,15 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
|
||||
disallow_trivial_move);
|
||||
}
|
||||
|
||||
// Test-only hook that calls SwitchMemtable() for the given column family.
//
// cfd: column family to operate on; when null, the column family behind
//      default_cf_handle_ is used instead.
// Returns whatever SwitchMemtable() returns.
Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
  // Declared before the lock object so that it is destroyed after the lock
  // object goes out of scope (locals are destroyed in reverse order).
  WriteContext write_context;
  InstrumentedMutexLock l(&mutex_);
  ColumnFamilyData* const target_cfd =
      (cfd != nullptr) ? cfd : default_cf_handle_->cfd();
  return SwitchMemtable(target_cfd, &write_context);
}
|
||||
|
||||
Status DBImpl::TEST_FlushMemTable(bool wait, ColumnFamilyHandle* cfh) {
|
||||
FlushOptions fo;
|
||||
fo.wait = wait;
|
||||
|
@ -319,7 +319,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
||||
versions_->SetLastSequence(last_sequence);
|
||||
}
|
||||
MemTableInsertStatusCheck(w.status);
|
||||
write_thread_.ExitAsBatchGroupLeader(write_group, w.status);
|
||||
write_thread_.ExitAsBatchGroupLeader(write_group, status);
|
||||
}
|
||||
|
||||
if (status.ok()) {
|
||||
@ -543,7 +543,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
|
||||
if (!w.CallbackFailed()) {
|
||||
WriteCallbackStatusCheck(status);
|
||||
}
|
||||
nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, w.status);
|
||||
nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status);
|
||||
if (status.ok()) {
|
||||
status = w.FinalStatus();
|
||||
}
|
||||
|
@ -8,8 +8,6 @@
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "db/db_iter.h"
|
||||
#include <stdexcept>
|
||||
#include <deque>
|
||||
#include <string>
|
||||
#include <limits>
|
||||
|
||||
@ -18,7 +16,6 @@
|
||||
#include "db/merge_helper.h"
|
||||
#include "db/pinned_iterators_manager.h"
|
||||
#include "monitoring/perf_context_imp.h"
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
@ -105,7 +102,7 @@ class DBIter: public Iterator {
|
||||
DBIter(Env* _env, const ReadOptions& read_options,
|
||||
const ImmutableCFOptions& cf_options, const Comparator* cmp,
|
||||
InternalIterator* iter, SequenceNumber s, bool arena_mode,
|
||||
uint64_t max_sequential_skip_in_iterations)
|
||||
uint64_t max_sequential_skip_in_iterations, bool allow_blob)
|
||||
: arena_mode_(arena_mode),
|
||||
env_(_env),
|
||||
logger_(cf_options.info_log),
|
||||
@ -122,7 +119,8 @@ class DBIter: public Iterator {
|
||||
pin_thru_lifetime_(read_options.pin_data),
|
||||
total_order_seek_(read_options.total_order_seek),
|
||||
range_del_agg_(cf_options.internal_comparator, s,
|
||||
true /* collapse_deletions */) {
|
||||
true /* collapse_deletions */),
|
||||
allow_blob_(allow_blob) {
|
||||
RecordTick(statistics_, NO_ITERATORS);
|
||||
prefix_extractor_ = cf_options.prefix_extractor;
|
||||
max_skip_ = max_sequential_skip_in_iterations;
|
||||
@ -180,6 +178,10 @@ class DBIter: public Iterator {
|
||||
return status_;
|
||||
}
|
||||
}
|
||||
// Returns the is_blob_ flag for the entry the iterator currently points at.
// Must only be called while the iterator is valid; when blobs are not
// allowed for this iterator the flag is expected to be false.
bool IsBlob() const {
  assert(valid_);
  assert(allow_blob_ || !is_blob_);
  return is_blob_;
}
|
||||
|
||||
virtual Status GetProperty(std::string prop_name,
|
||||
std::string* prop) override {
|
||||
@ -287,6 +289,8 @@ class DBIter: public Iterator {
|
||||
RangeDelAggregator range_del_agg_;
|
||||
LocalStatistics local_stats_;
|
||||
PinnedIteratorsManager pinned_iters_mgr_;
|
||||
bool allow_blob_;
|
||||
bool is_blob_;
|
||||
|
||||
// No copying allowed
|
||||
DBIter(const DBIter&);
|
||||
@ -376,6 +380,8 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
|
||||
// - none of the above : saved_key_ can contain anything, it doesn't matter.
|
||||
uint64_t num_skipped = 0;
|
||||
|
||||
is_blob_ = false;
|
||||
|
||||
do {
|
||||
ParsedInternalKey ikey;
|
||||
|
||||
@ -420,6 +426,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
|
||||
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
|
||||
break;
|
||||
case kTypeValue:
|
||||
case kTypeBlobIndex:
|
||||
saved_key_.SetUserKey(
|
||||
ikey.user_key,
|
||||
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
|
||||
@ -431,6 +438,18 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
|
||||
skipping = true;
|
||||
num_skipped = 0;
|
||||
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
|
||||
} else if (ikey.type == kTypeBlobIndex) {
|
||||
if (!allow_blob_) {
|
||||
ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
|
||||
status_ = Status::NotSupported(
|
||||
"Encounter unexpected blob index. Please open DB with "
|
||||
"rocksdb::blob_db::BlobDB instead.");
|
||||
valid_ = false;
|
||||
} else {
|
||||
is_blob_ = true;
|
||||
valid_ = true;
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
valid_ = true;
|
||||
return;
|
||||
@ -572,6 +591,18 @@ void DBIter::MergeValuesNewToOld() {
|
||||
merge_context_.PushOperand(iter_->value(),
|
||||
iter_->IsValuePinned() /* operand_pinned */);
|
||||
PERF_COUNTER_ADD(internal_merge_count, 1);
|
||||
} else if (kTypeBlobIndex == ikey.type) {
|
||||
if (!allow_blob_) {
|
||||
ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
|
||||
status_ = Status::NotSupported(
|
||||
"Encounter unexpected blob index. Please open DB with "
|
||||
"rocksdb::blob_db::BlobDB instead.");
|
||||
} else {
|
||||
status_ =
|
||||
Status::NotSupported("Blob DB does not support merge operator.");
|
||||
}
|
||||
valid_ = false;
|
||||
return;
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
@ -678,7 +709,6 @@ void DBIter::PrevInternal() {
|
||||
!iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
|
||||
|
||||
if (FindValueForCurrentKey()) {
|
||||
valid_ = true;
|
||||
if (!iter_->Valid()) {
|
||||
return;
|
||||
}
|
||||
@ -745,6 +775,7 @@ bool DBIter::FindValueForCurrentKey() {
|
||||
last_key_entry_type = ikey.type;
|
||||
switch (last_key_entry_type) {
|
||||
case kTypeValue:
|
||||
case kTypeBlobIndex:
|
||||
if (range_del_agg_.ShouldDelete(
|
||||
ikey,
|
||||
RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
|
||||
@ -790,6 +821,7 @@ bool DBIter::FindValueForCurrentKey() {
|
||||
}
|
||||
|
||||
Status s;
|
||||
is_blob_ = false;
|
||||
switch (last_key_entry_type) {
|
||||
case kTypeDeletion:
|
||||
case kTypeSingleDeletion:
|
||||
@ -805,6 +837,18 @@ bool DBIter::FindValueForCurrentKey() {
|
||||
merge_operator_, saved_key_.GetUserKey(), nullptr,
|
||||
merge_context_.GetOperands(), &saved_value_, logger_, statistics_,
|
||||
env_, &pinned_value_, true);
|
||||
} else if (last_not_merge_type == kTypeBlobIndex) {
|
||||
if (!allow_blob_) {
|
||||
ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
|
||||
status_ = Status::NotSupported(
|
||||
"Encounter unexpected blob index. Please open DB with "
|
||||
"rocksdb::blob_db::BlobDB instead.");
|
||||
} else {
|
||||
status_ =
|
||||
Status::NotSupported("Blob DB does not support merge operator.");
|
||||
}
|
||||
valid_ = false;
|
||||
return true;
|
||||
} else {
|
||||
assert(last_not_merge_type == kTypeValue);
|
||||
s = MergeHelper::TimedFullMerge(
|
||||
@ -816,6 +860,17 @@ bool DBIter::FindValueForCurrentKey() {
|
||||
case kTypeValue:
|
||||
// do nothing - we've already has value in saved_value_
|
||||
break;
|
||||
case kTypeBlobIndex:
|
||||
if (!allow_blob_) {
|
||||
ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
|
||||
status_ = Status::NotSupported(
|
||||
"Encounter unexpected blob index. Please open DB with "
|
||||
"rocksdb::blob_db::BlobDB instead.");
|
||||
valid_ = false;
|
||||
return true;
|
||||
}
|
||||
is_blob_ = true;
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
break;
|
||||
@ -849,7 +904,15 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
|
||||
valid_ = false;
|
||||
return false;
|
||||
}
|
||||
if (ikey.type == kTypeValue) {
|
||||
if (ikey.type == kTypeBlobIndex && !allow_blob_) {
|
||||
ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index.");
|
||||
status_ = Status::NotSupported(
|
||||
"Encounter unexpected blob index. Please open DB with "
|
||||
"rocksdb::blob_db::BlobDB instead.");
|
||||
valid_ = false;
|
||||
return true;
|
||||
}
|
||||
if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) {
|
||||
assert(iter_->IsValuePinned());
|
||||
pinned_value_ = iter_->value();
|
||||
valid_ = true;
|
||||
@ -1160,10 +1223,11 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
|
||||
const Comparator* user_key_comparator,
|
||||
InternalIterator* internal_iter,
|
||||
const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations) {
|
||||
DBIter* db_iter = new DBIter(env, read_options, cf_options,
|
||||
user_key_comparator, internal_iter, sequence,
|
||||
false, max_sequential_skip_in_iterations);
|
||||
uint64_t max_sequential_skip_in_iterations,
|
||||
bool allow_blob) {
|
||||
DBIter* db_iter = new DBIter(
|
||||
env, read_options, cf_options, user_key_comparator, internal_iter,
|
||||
sequence, false, max_sequential_skip_in_iterations, allow_blob);
|
||||
return db_iter;
|
||||
}
|
||||
|
||||
@ -1191,6 +1255,7 @@ inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); }
|
||||
inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); }
|
||||
inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); }
|
||||
inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); }
|
||||
// Forwards to the wrapped DBIter's IsBlob().
bool ArenaWrappedDBIter::IsBlob() const { return db_iter_->IsBlob(); }
|
||||
inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
|
||||
std::string* prop) {
|
||||
if (prop_name == "rocksdb.iterator.super-version-number") {
|
||||
@ -1207,11 +1272,11 @@ void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options,
|
||||
const ImmutableCFOptions& cf_options,
|
||||
const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iteration,
|
||||
uint64_t version_number) {
|
||||
uint64_t version_number, bool allow_blob) {
|
||||
auto mem = arena_.AllocateAligned(sizeof(DBIter));
|
||||
db_iter_ = new (mem)
|
||||
DBIter(env, read_options, cf_options, cf_options.user_comparator, nullptr,
|
||||
sequence, true, max_sequential_skip_in_iteration);
|
||||
sequence, true, max_sequential_skip_in_iteration, allow_blob);
|
||||
sv_number_ = version_number;
|
||||
}
|
||||
|
||||
@ -1231,7 +1296,7 @@ Status ArenaWrappedDBIter::Refresh() {
|
||||
SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex());
|
||||
Init(env, read_options_, *(cfd_->ioptions()), latest_seq,
|
||||
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
||||
cur_sv_number);
|
||||
cur_sv_number, allow_blob_);
|
||||
|
||||
InternalIterator* internal_iter = db_impl_->NewInternalIterator(
|
||||
read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator());
|
||||
@ -1247,12 +1312,12 @@ ArenaWrappedDBIter* NewArenaWrappedDbIterator(
|
||||
Env* env, const ReadOptions& read_options,
|
||||
const ImmutableCFOptions& cf_options, const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
|
||||
DBImpl* db_impl, ColumnFamilyData* cfd) {
|
||||
DBImpl* db_impl, ColumnFamilyData* cfd, bool allow_blob) {
|
||||
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
|
||||
iter->Init(env, read_options, cf_options, sequence,
|
||||
max_sequential_skip_in_iterations, version_number);
|
||||
max_sequential_skip_in_iterations, version_number, allow_blob);
|
||||
if (db_impl != nullptr && cfd != nullptr) {
|
||||
iter->StoreRefreshInfo(read_options, db_impl, cfd);
|
||||
iter->StoreRefreshInfo(read_options, db_impl, cfd, allow_blob);
|
||||
}
|
||||
|
||||
return iter;
|
||||
|
15
db/db_iter.h
15
db/db_iter.h
@ -33,7 +33,8 @@ extern Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
|
||||
const Comparator* user_key_comparator,
|
||||
InternalIterator* internal_iter,
|
||||
const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations);
|
||||
uint64_t max_sequential_skip_in_iterations,
|
||||
bool allow_blob = false);
|
||||
|
||||
// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
|
||||
// iterator is supposed be allocated. This class is used as an entry point of
|
||||
@ -63,20 +64,22 @@ class ArenaWrappedDBIter : public Iterator {
|
||||
virtual Slice value() const override;
|
||||
virtual Status status() const override;
|
||||
virtual Status Refresh() override;
|
||||
bool IsBlob() const;
|
||||
|
||||
virtual Status GetProperty(std::string prop_name, std::string* prop) override;
|
||||
|
||||
void Init(Env* env, const ReadOptions& read_options,
|
||||
const ImmutableCFOptions& cf_options,
|
||||
const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations,
|
||||
uint64_t version_number);
|
||||
uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
|
||||
bool allow_blob);
|
||||
|
||||
void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl,
|
||||
ColumnFamilyData* cfd) {
|
||||
ColumnFamilyData* cfd, bool allow_blob) {
|
||||
read_options_ = read_options;
|
||||
db_impl_ = db_impl;
|
||||
cfd_ = cfd;
|
||||
allow_blob_ = allow_blob;
|
||||
}
|
||||
|
||||
private:
|
||||
@ -86,6 +89,7 @@ class ArenaWrappedDBIter : public Iterator {
|
||||
ColumnFamilyData* cfd_ = nullptr;
|
||||
DBImpl* db_impl_ = nullptr;
|
||||
ReadOptions read_options_;
|
||||
bool allow_blob_ = false;
|
||||
};
|
||||
|
||||
// Generate the arena wrapped iterator class.
|
||||
@ -95,6 +99,7 @@ extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
|
||||
Env* env, const ReadOptions& read_options,
|
||||
const ImmutableCFOptions& cf_options, const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
|
||||
DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr);
|
||||
DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
|
||||
bool allow_blob = false);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -1309,6 +1309,80 @@ TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
|
||||
ASSERT_EQ(0, num_keys);
|
||||
}
|
||||
|
||||
// Exercises the "rocksdb.estimate-oldest-key-time" integer property using a
// manually driven clock (MockTimeEnv): first checks that the property is not
// reported for non-FIFO compaction styles, then follows its value under FIFO
// compaction as keys are written, flushed and compacted.
TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {
  std::unique_ptr<MockTimeEnv> mock_env(new MockTimeEnv(Env::Default()));
  const std::string& property_name = DB::Properties::kEstimateOldestKeyTime;
  uint64_t oldest_key_time = 0;

  Options options;
  options.env = mock_env.get();
  options.create_if_missing = true;

  // "rocksdb.estimate-oldest-key-time" only available to fifo compaction.
  mock_env->set_current_time(100);
  for (auto non_fifo_style : {kCompactionStyleLevel, kCompactionStyleUniversal,
                              kCompactionStyleNone}) {
    options.compaction_style = non_fifo_style;
    DestroyAndReopen(options);
    ASSERT_OK(Put("foo", "bar"));
    ASSERT_FALSE(dbfull()->GetIntProperty(property_name, &oldest_key_time));
  }

  // Switch to FIFO compaction with compaction disallowed.
  options.compaction_style = kCompactionStyleFIFO;
  options.compaction_options_fifo.ttl = 300;
  options.compaction_options_fifo.allow_compaction = false;
  DestroyAndReopen(options);

  // t = 100: first key; the estimate is checked both before and after flush.
  mock_env->set_current_time(100);
  ASSERT_OK(Put("k1", "v1"));
  ASSERT_TRUE(dbfull()->GetIntProperty(property_name, &oldest_key_time));
  ASSERT_EQ(100, oldest_key_time);
  ASSERT_OK(Flush());
  ASSERT_EQ("1", FilesPerLevel());
  ASSERT_TRUE(dbfull()->GetIntProperty(property_name, &oldest_key_time));
  ASSERT_EQ(100, oldest_key_time);

  // t = 200: a second file; the oldest key time is still 100.
  mock_env->set_current_time(200);
  ASSERT_OK(Put("k2", "v2"));
  ASSERT_OK(Flush());
  ASSERT_EQ("2", FilesPerLevel());
  ASSERT_TRUE(dbfull()->GetIntProperty(property_name, &oldest_key_time));
  ASSERT_EQ(100, oldest_key_time);

  // t = 300: a third file; the oldest key time is still 100.
  mock_env->set_current_time(300);
  ASSERT_OK(Put("k3", "v3"));
  ASSERT_OK(Flush());
  ASSERT_EQ("3", FilesPerLevel());
  ASSERT_TRUE(dbfull()->GetIntProperty(property_name, &oldest_key_time));
  ASSERT_EQ(100, oldest_key_time);

  // t = 450: after compaction two files remain and the estimate is 200.
  mock_env->set_current_time(450);
  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  ASSERT_EQ("2", FilesPerLevel());
  ASSERT_TRUE(dbfull()->GetIntProperty(property_name, &oldest_key_time));
  ASSERT_EQ(200, oldest_key_time);

  // t = 550: after compaction one file remains and the estimate is 300.
  mock_env->set_current_time(550);
  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  ASSERT_EQ("1", FilesPerLevel());
  ASSERT_TRUE(dbfull()->GetIntProperty(property_name, &oldest_key_time));
  ASSERT_EQ(300, oldest_key_time);

  // t = 650: no files remain, so the property is no longer reported.
  mock_env->set_current_time(650);
  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  ASSERT_EQ("", FilesPerLevel());
  ASSERT_FALSE(dbfull()->GetIntProperty(property_name, &oldest_key_time));

  // Close before mock_env destructs.
  Close();
}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
} // namespace rocksdb
|
||||
|
||||
|
@ -650,9 +650,18 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
|
||||
}
|
||||
|
||||
TEST_F(DBSSTTest, GetTotalSstFilesSize) {
|
||||
// We don't propagate oldest-key-time table property on compaction and
|
||||
// just write 0 as default value. This affect the exact table size, since
|
||||
// we encode table properties as varint64. Force time to be 0 to work around
|
||||
// it. Should remove the workaround after we propagate the property on
|
||||
// compaction.
|
||||
std::unique_ptr<MockTimeEnv> mock_env(new MockTimeEnv(Env::Default()));
|
||||
mock_env->set_current_time(0);
|
||||
|
||||
Options options = CurrentOptions();
|
||||
options.disable_auto_compactions = true;
|
||||
options.compression = kNoCompression;
|
||||
options.env = mock_env.get();
|
||||
DestroyAndReopen(options);
|
||||
// Generate 5 files in L0
|
||||
for (int i = 0; i < 5; i++) {
|
||||
@ -739,6 +748,9 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) {
|
||||
// Live SST files = 0
|
||||
// Total SST files = 0
|
||||
ASSERT_EQ(total_sst_files_size, 0);
|
||||
|
||||
// Close db before mock_env destruct.
|
||||
Close();
|
||||
}
|
||||
|
||||
TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
|
||||
|
@ -3347,11 +3347,23 @@ TEST_F(DBTest, DynamicMemtableOptions) {
|
||||
{"write_buffer_size", "131072"},
|
||||
}));
|
||||
|
||||
// The existing memtable is still 64KB in size, after it becomes immutable,
|
||||
// the next memtable will be 128KB in size. Write 256KB total, we should
|
||||
// have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data
|
||||
gen_l0_kb(256);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 2); // (A)
|
||||
// The existing memtable inflated 64KB->128KB when we invoked SetOptions().
|
||||
// Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
|
||||
gen_l0_kb(192);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A)
|
||||
ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
|
||||
ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
|
||||
|
||||
// Decrease buffer size below current usage
|
||||
ASSERT_OK(dbfull()->SetOptions({
|
||||
{"write_buffer_size", "65536"},
|
||||
}));
|
||||
// The existing memtable became eligible for flush when we reduced its
|
||||
// capacity to 64KB. Two keys need to be added to trigger flush: first causes
|
||||
// memtable to be marked full, second schedules the flush. Then we should have
|
||||
// a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
|
||||
gen_l0_kb(2);
|
||||
ASSERT_EQ(NumTableFilesAtLevel(0), 2);
|
||||
ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
|
||||
ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
|
||||
|
||||
|
@ -2315,15 +2315,21 @@ TEST_F(DBTest2, ReduceLevel) {
|
||||
Put("foo", "bar");
|
||||
Flush();
|
||||
MoveFilesToLevel(6);
|
||||
#ifndef ROCKSDB_LITE
|
||||
ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
|
||||
#endif // !ROCKSDB_LITE
|
||||
CompactRangeOptions compact_options;
|
||||
compact_options.change_level = true;
|
||||
compact_options.target_level = 1;
|
||||
dbfull()->CompactRange(compact_options, nullptr, nullptr);
|
||||
#ifndef ROCKSDB_LITE
|
||||
ASSERT_EQ("0,1", FilesPerLevel());
|
||||
#endif // !ROCKSDB_LITE
|
||||
options.num_levels = 3;
|
||||
Reopen(options);
|
||||
#ifndef ROCKSDB_LITE
|
||||
ASSERT_EQ("0,1", FilesPerLevel());
|
||||
#endif // !ROCKSDB_LITE
|
||||
}
|
||||
} // namespace rocksdb
|
||||
|
||||
|
@ -572,6 +572,37 @@ class SpecialEnv : public EnvWrapper {
|
||||
std::atomic<bool> is_wal_sync_thread_safe_{true};
|
||||
};
|
||||
|
||||
class MockTimeEnv : public EnvWrapper {
|
||||
public:
|
||||
explicit MockTimeEnv(Env* base) : EnvWrapper(base) {}
|
||||
|
||||
virtual Status GetCurrentTime(int64_t* time) override {
|
||||
assert(time != nullptr);
|
||||
assert(current_time_ <=
|
||||
static_cast<uint64_t>(std::numeric_limits<int64_t>::max()));
|
||||
*time = static_cast<int64_t>(current_time_);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual uint64_t NowMicros() override {
|
||||
assert(current_time_ <= std::numeric_limits<uint64_t>::max() / 1000000);
|
||||
return current_time_ * 1000000;
|
||||
}
|
||||
|
||||
virtual uint64_t NowNanos() override {
|
||||
assert(current_time_ <= std::numeric_limits<uint64_t>::max() / 1000000000);
|
||||
return current_time_ * 1000000000;
|
||||
}
|
||||
|
||||
void set_current_time(uint64_t time) {
|
||||
assert(time >= current_time_);
|
||||
current_time_ = time;
|
||||
}
|
||||
|
||||
private:
|
||||
uint64_t current_time_ = 0;
|
||||
};
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
class OnFileDeletionListener : public EventListener {
|
||||
public:
|
||||
|
@ -3,12 +3,17 @@
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#include <atomic>
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include "db/db_test_util.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "db/write_thread.h"
|
||||
#include "port/port.h"
|
||||
#include "port/stack_trace.h"
|
||||
#include "util/fault_injection_test_env.h"
|
||||
#include "util/string_util.h"
|
||||
#include "util/sync_point.h"
|
||||
|
||||
namespace rocksdb {
|
||||
@ -18,7 +23,9 @@ class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
|
||||
public:
|
||||
DBWriteTest() : DBTestBase("/db_write_test") {}
|
||||
|
||||
void Open() { DBTestBase::Reopen(GetOptions(GetParam())); }
|
||||
Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
|
||||
|
||||
void Open() { DBTestBase::Reopen(GetOptions()); }
|
||||
};
|
||||
|
||||
// Sequence number should be returned through the input write batch.
|
||||
@ -67,6 +74,47 @@ TEST_P(DBWriteTest, ReturnSeuqneceNumberMultiThreaded) {
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that a failed WAL write is reported to every writer of a batch
// group, not only to the group leader: kNumThreads concurrent Put() calls are
// forced into a single batch group and each of them must return a non-OK
// status.
TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
  constexpr int kNumThreads = 5;
  std::unique_ptr<FaultInjectionTestEnv> mock_env(
      new FaultInjectionTestEnv(Env::Default()));
  Options options = GetOptions();
  options.env = mock_env.get();
  Reopen(options);
  std::atomic<int> ready_count{0};
  std::atomic<int> leader_count{0};
  std::vector<port::Thread> threads;
  threads.reserve(kNumThreads);
  // Presumably makes subsequent file writes fail, which is what provokes the
  // WAL error this test is about.
  mock_env->SetFilesystemActive(false);
  // Wait until all threads linked to write threads, to make sure
  // all threads join the same batch group.
  SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
        ready_count++;
        auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
        if (w->state == WriteThread::STATE_GROUP_LEADER) {
          leader_count++;
          while (ready_count < kNumThreads) {
            // busy waiting
          }
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();
  for (int i = 0; i < kNumThreads; i++) {
    threads.push_back(port::Thread(
        [&](int index) {
          // All threads should fail.
          ASSERT_FALSE(Put("key" + ToString(index), "value").ok());
        },
        i));
  }
  for (auto& writer_thread : threads) {
    writer_thread.join();
  }
  // The callback above captures ready_count / leader_count by reference.
  // Unregister it now that all writers are done, so it can never run against
  // those locals after this function returns (including on an early return
  // from a failed assertion below).
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  ASSERT_EQ(1, leader_count);
  // Close before mock_env destruct.
  Close();
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
|
||||
testing::Values(DBTestBase::kDefault,
|
||||
DBTestBase::kConcurrentWALWrites,
|
||||
|
@ -27,7 +27,7 @@ namespace rocksdb {
|
||||
// and the value type is embedded as the low 8 bits in the sequence
|
||||
// number in internal keys, we need to use the highest-numbered
|
||||
// ValueType, not the lowest).
|
||||
const ValueType kValueTypeForSeek = kTypeSingleDeletion;
|
||||
const ValueType kValueTypeForSeek = kTypeBlobIndex;
|
||||
const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
|
||||
|
||||
uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
|
||||
|
@ -47,6 +47,8 @@ enum ValueType : unsigned char {
|
||||
kTypeNoop = 0xD, // WAL only.
|
||||
kTypeColumnFamilyRangeDeletion = 0xE, // WAL only.
|
||||
kTypeRangeDeletion = 0xF, // meta block
|
||||
kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only
|
||||
kTypeBlobIndex = 0x11, // Blob DB only
|
||||
kMaxValue = 0x7F // Not used for storing records.
|
||||
};
|
||||
|
||||
@ -57,7 +59,7 @@ extern const ValueType kValueTypeForSeekForPrev;
|
||||
// Checks whether a type is an inline value type
|
||||
// (i.e. a type used in memtable skiplist and sst file datablock).
|
||||
inline bool IsValueType(ValueType t) {
|
||||
return t <= kTypeMerge || t == kTypeSingleDeletion;
|
||||
return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex;
|
||||
}
|
||||
|
||||
// Checks whether a type is from user operation
|
||||
|
@ -301,6 +301,8 @@ Status FlushJob::WriteLevel0Table() {
|
||||
db_options_.env->GetCurrentTime(&_current_time); // ignore error
|
||||
const uint64_t current_time = static_cast<uint64_t>(_current_time);
|
||||
|
||||
uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime();
|
||||
|
||||
s = BuildTable(
|
||||
dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_,
|
||||
optimized_env_options, cfd_->table_cache(), iter.get(),
|
||||
@ -311,7 +313,8 @@ Status FlushJob::WriteLevel0Table() {
|
||||
cfd_->ioptions()->compression_opts,
|
||||
mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
|
||||
TableFileCreationReason::kFlush, event_logger_, job_context_->job_id,
|
||||
Env::IO_HIGH, &table_properties_, 0 /* level */, current_time);
|
||||
Env::IO_HIGH, &table_properties_, 0 /* level */, current_time,
|
||||
oldest_key_time);
|
||||
LogFlush(db_options_.info_log);
|
||||
}
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
|
@ -13,8 +13,9 @@
|
||||
#endif
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "db/column_family.h"
|
||||
@ -243,6 +244,7 @@ static const std::string num_running_flushes = "num-running-flushes";
|
||||
static const std::string actual_delayed_write_rate =
|
||||
"actual-delayed-write-rate";
|
||||
static const std::string is_write_stopped = "is-write-stopped";
|
||||
static const std::string estimate_oldest_key_time = "estimate-oldest-key-time";
|
||||
|
||||
const std::string DB::Properties::kNumFilesAtLevelPrefix =
|
||||
rocksdb_prefix + num_files_at_level_prefix;
|
||||
@ -316,6 +318,8 @@ const std::string DB::Properties::kActualDelayedWriteRate =
|
||||
rocksdb_prefix + actual_delayed_write_rate;
|
||||
const std::string DB::Properties::kIsWriteStopped =
|
||||
rocksdb_prefix + is_write_stopped;
|
||||
const std::string DB::Properties::kEstimateOldestKeyTime =
|
||||
rocksdb_prefix + estimate_oldest_key_time;
|
||||
|
||||
const std::unordered_map<std::string, DBPropertyInfo>
|
||||
InternalStats::ppt_name_to_info = {
|
||||
@ -414,6 +418,9 @@ const std::unordered_map<std::string, DBPropertyInfo>
|
||||
nullptr}},
|
||||
{DB::Properties::kIsWriteStopped,
|
||||
{false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr}},
|
||||
{DB::Properties::kEstimateOldestKeyTime,
|
||||
{false, nullptr, &InternalStats::HandleEstimateOldestKeyTime,
|
||||
nullptr}},
|
||||
};
|
||||
|
||||
const DBPropertyInfo* GetPropertyInfo(const Slice& property) {
|
||||
@ -775,6 +782,35 @@ bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/,
|
||||
Version* /*version*/) {
|
||||
// TODO(yiwu): The property is currently available for fifo compaction
|
||||
// with allow_compaction = false. This is because we don't propagate
|
||||
// oldest_key_time on compaction.
|
||||
if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO ||
|
||||
cfd_->ioptions()->compaction_options_fifo.allow_compaction) {
|
||||
return false;
|
||||
}
|
||||
|
||||
TablePropertiesCollection collection;
|
||||
auto s = cfd_->current()->GetPropertiesOfAllTables(&collection);
|
||||
if (!s.ok()) {
|
||||
return false;
|
||||
}
|
||||
*value = std::numeric_limits<uint64_t>::max();
|
||||
for (auto& p : collection) {
|
||||
*value = std::min(*value, p.second->oldest_key_time);
|
||||
if (*value == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (*value > 0) {
|
||||
*value = std::min({cfd_->mem()->ApproximateOldestKeyTime(),
|
||||
cfd_->imm()->ApproximateOldestKeyTime(), *value});
|
||||
}
|
||||
return *value > 0 && *value < std::numeric_limits<uint64_t>::max();
|
||||
}
|
||||
|
||||
void InternalStats::DumpDBStats(std::string* value) {
|
||||
char buf[1000];
|
||||
// DB-level stats, only available from default column family
|
||||
|
@ -475,6 +475,8 @@ class InternalStats {
|
||||
bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
|
||||
Version* version);
|
||||
bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
|
||||
bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
|
||||
Version* version);
|
||||
|
||||
// Total number of background errors encountered. Every time a flush task
|
||||
// or compaction task fails, this counter is incremented. The failure can
|
||||
|
@ -9,8 +9,9 @@
|
||||
|
||||
#include "db/memtable.h"
|
||||
|
||||
#include <memory>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "db/merge_context.h"
|
||||
@ -37,10 +38,10 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
MemTableOptions::MemTableOptions(const ImmutableCFOptions& ioptions,
|
||||
const MutableCFOptions& mutable_cf_options)
|
||||
: write_buffer_size(mutable_cf_options.write_buffer_size),
|
||||
arena_block_size(mutable_cf_options.arena_block_size),
|
||||
ImmutableMemTableOptions::ImmutableMemTableOptions(
|
||||
const ImmutableCFOptions& ioptions,
|
||||
const MutableCFOptions& mutable_cf_options)
|
||||
: arena_block_size(mutable_cf_options.arena_block_size),
|
||||
memtable_prefix_bloom_bits(
|
||||
static_cast<uint32_t>(
|
||||
static_cast<double>(mutable_cf_options.write_buffer_size) *
|
||||
@ -81,6 +82,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
|
||||
data_size_(0),
|
||||
num_entries_(0),
|
||||
num_deletes_(0),
|
||||
write_buffer_size_(mutable_cf_options.write_buffer_size),
|
||||
flush_in_progress_(false),
|
||||
flush_completed_(false),
|
||||
file_number_(0),
|
||||
@ -96,7 +98,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
|
||||
flush_state_(FLUSH_NOT_REQUESTED),
|
||||
env_(ioptions.env),
|
||||
insert_with_hint_prefix_extractor_(
|
||||
ioptions.memtable_insert_with_hint_prefix_extractor) {
|
||||
ioptions.memtable_insert_with_hint_prefix_extractor),
|
||||
oldest_key_time_(std::numeric_limits<uint64_t>::max()) {
|
||||
UpdateFlushState();
|
||||
// something went wrong if we need to flush before inserting anything
|
||||
assert(!ShouldScheduleFlush());
|
||||
@ -133,6 +136,7 @@ size_t MemTable::ApproximateMemoryUsage() {
|
||||
}
|
||||
|
||||
bool MemTable::ShouldFlushNow() const {
|
||||
size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
|
||||
// Often we cannot allocate arena blocks that exactly match the
|
||||
// buffer size. Thus we have to decide if we should over-allocate or
|
||||
// under-allocate.
|
||||
@ -150,16 +154,14 @@ bool MemTable::ShouldFlushNow() const {
|
||||
// if we can still allocate one more block without exceeding the
|
||||
// over-allocation ratio, then we should not flush.
|
||||
if (allocated_memory + kArenaBlockSize <
|
||||
moptions_.write_buffer_size +
|
||||
kArenaBlockSize * kAllowOverAllocationRatio) {
|
||||
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// if user keeps adding entries that exceeds moptions.write_buffer_size,
|
||||
// we need to flush earlier even though we still have much available
|
||||
// memory left.
|
||||
if (allocated_memory > moptions_.write_buffer_size +
|
||||
kArenaBlockSize * kAllowOverAllocationRatio) {
|
||||
// if user keeps adding entries that exceed write_buffer_size, we need to
|
||||
// flush earlier even though we still have much available memory left.
|
||||
if (allocated_memory >
|
||||
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -202,6 +204,21 @@ void MemTable::UpdateFlushState() {
|
||||
}
|
||||
}
|
||||
|
||||
void MemTable::UpdateOldestKeyTime() {
|
||||
uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
|
||||
if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
|
||||
int64_t current_time = 0;
|
||||
auto s = env_->GetCurrentTime(&current_time);
|
||||
if (s.ok()) {
|
||||
assert(current_time >= 0);
|
||||
// If the exchange fails, the timestamp has already been set.
|
||||
oldest_key_time_.compare_exchange_strong(
|
||||
oldest_key_time, static_cast<uint64_t>(current_time),
|
||||
std::memory_order_relaxed, std::memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
|
||||
const char* prefix_len_key2) const {
|
||||
// Internal keys are encoded as length-prefixed strings.
|
||||
@ -247,7 +264,8 @@ class MemTableIterator : public InternalIterator {
|
||||
comparator_(mem.comparator_),
|
||||
valid_(false),
|
||||
arena_mode_(arena != nullptr),
|
||||
value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) {
|
||||
value_pinned_(
|
||||
!mem.GetImmutableMemTableOptions()->inplace_update_support) {
|
||||
if (use_range_del_table) {
|
||||
iter_ = mem.range_del_table_->GetIterator(arena);
|
||||
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
|
||||
@ -516,6 +534,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
|
||||
if (is_range_del_table_empty_ && type == kTypeRangeDeletion) {
|
||||
is_range_del_table_empty_ = false;
|
||||
}
|
||||
UpdateOldestKeyTime();
|
||||
}
|
||||
|
||||
// Callback from MemTable::Get()
|
||||
@ -537,6 +556,7 @@ struct Saver {
|
||||
Statistics* statistics;
|
||||
bool inplace_update_support;
|
||||
Env* env_;
|
||||
bool* is_blob_index;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
@ -566,11 +586,26 @@ static bool SaveValue(void* arg, const char* entry) {
|
||||
ValueType type;
|
||||
UnPackSequenceAndType(tag, &s->seq, &type);
|
||||
|
||||
if ((type == kTypeValue || type == kTypeMerge) &&
|
||||
if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
|
||||
range_del_agg->ShouldDelete(Slice(key_ptr, key_length))) {
|
||||
type = kTypeRangeDeletion;
|
||||
}
|
||||
switch (type) {
|
||||
case kTypeBlobIndex:
|
||||
if (s->is_blob_index == nullptr) {
|
||||
ROCKS_LOG_ERROR(s->logger, "Encounter unexpected blob index.");
|
||||
*(s->status) = Status::NotSupported(
|
||||
"Encounter unsupported blob value. Please open DB with "
|
||||
"rocksdb::blob_db::BlobDB instead.");
|
||||
} else if (*(s->merge_in_progress)) {
|
||||
*(s->status) =
|
||||
Status::NotSupported("Blob DB does not support merge operator.");
|
||||
}
|
||||
if (!s->status->ok()) {
|
||||
*(s->found_final_value) = true;
|
||||
return false;
|
||||
}
|
||||
// intentional fallthrough
|
||||
case kTypeValue: {
|
||||
if (s->inplace_update_support) {
|
||||
s->mem->GetLock(s->key->user_key())->ReadLock();
|
||||
@ -589,6 +624,9 @@ static bool SaveValue(void* arg, const char* entry) {
|
||||
s->mem->GetLock(s->key->user_key())->ReadUnlock();
|
||||
}
|
||||
*(s->found_final_value) = true;
|
||||
if (s->is_blob_index != nullptr) {
|
||||
*(s->is_blob_index) = (type == kTypeBlobIndex);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
case kTypeDeletion:
|
||||
@ -635,7 +673,7 @@ static bool SaveValue(void* arg, const char* entry) {
|
||||
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg, SequenceNumber* seq,
|
||||
const ReadOptions& read_opts) {
|
||||
const ReadOptions& read_opts, bool* is_blob_index) {
|
||||
// The sequence number is updated synchronously in version_set.h
|
||||
if (IsEmpty()) {
|
||||
// Avoiding recording stats for speed.
|
||||
@ -681,6 +719,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
|
||||
saver.inplace_update_support = moptions_.inplace_update_support;
|
||||
saver.statistics = moptions_.statistics;
|
||||
saver.env_ = env_;
|
||||
saver.is_blob_index = is_blob_index;
|
||||
table_->Get(key, &saver, SaveValue);
|
||||
|
||||
*seq = saver.seq;
|
||||
|
@ -35,11 +35,9 @@ class MemTableIterator;
|
||||
class MergeContext;
|
||||
class InternalIterator;
|
||||
|
||||
struct MemTableOptions {
|
||||
explicit MemTableOptions(
|
||||
const ImmutableCFOptions& ioptions,
|
||||
const MutableCFOptions& mutable_cf_options);
|
||||
size_t write_buffer_size;
|
||||
struct ImmutableMemTableOptions {
|
||||
explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
|
||||
const MutableCFOptions& mutable_cf_options);
|
||||
size_t arena_block_size;
|
||||
uint32_t memtable_prefix_bloom_bits;
|
||||
size_t memtable_huge_page_size;
|
||||
@ -187,13 +185,15 @@ class MemTable {
|
||||
// status returned indicates a corruption or other unexpected error.
|
||||
bool Get(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext* merge_context, RangeDelAggregator* range_del_agg,
|
||||
SequenceNumber* seq, const ReadOptions& read_opts);
|
||||
SequenceNumber* seq, const ReadOptions& read_opts,
|
||||
bool* is_blob_index = nullptr);
|
||||
|
||||
bool Get(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext* merge_context, RangeDelAggregator* range_del_agg,
|
||||
const ReadOptions& read_opts) {
|
||||
const ReadOptions& read_opts, bool* is_blob_index = nullptr) {
|
||||
SequenceNumber seq;
|
||||
return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts);
|
||||
return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts,
|
||||
is_blob_index);
|
||||
}
|
||||
|
||||
// Attempts to update the new_value inplace, else does normal Add
|
||||
@ -258,6 +258,18 @@ class MemTable {
|
||||
return num_deletes_.load(std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
// Dynamically change the memtable's capacity. If set below the current usage,
|
||||
// the next key added will trigger a flush. Can only increase size when
|
||||
// memtable prefix bloom is disabled, since we can't easily allocate more
|
||||
// space.
|
||||
void UpdateWriteBufferSize(size_t new_write_buffer_size) {
|
||||
if (prefix_bloom_ == nullptr ||
|
||||
new_write_buffer_size < write_buffer_size_) {
|
||||
write_buffer_size_.store(new_write_buffer_size,
|
||||
std::memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the edits area that is needed for flushing the memtable
|
||||
VersionEdit* GetEdits() { return &edit_; }
|
||||
|
||||
@ -346,7 +358,13 @@ class MemTable {
|
||||
return comparator_.comparator;
|
||||
}
|
||||
|
||||
const MemTableOptions* GetMemTableOptions() const { return &moptions_; }
|
||||
const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
|
||||
return &moptions_;
|
||||
}
|
||||
|
||||
uint64_t ApproximateOldestKeyTime() const {
|
||||
return oldest_key_time_.load(std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
private:
|
||||
enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
|
||||
@ -356,7 +374,7 @@ class MemTable {
|
||||
friend class MemTableList;
|
||||
|
||||
KeyComparator comparator_;
|
||||
const MemTableOptions moptions_;
|
||||
const ImmutableMemTableOptions moptions_;
|
||||
int refs_;
|
||||
const size_t kArenaBlockSize;
|
||||
AllocTracker mem_tracker_;
|
||||
@ -370,6 +388,9 @@ class MemTable {
|
||||
std::atomic<uint64_t> num_entries_;
|
||||
std::atomic<uint64_t> num_deletes_;
|
||||
|
||||
// Dynamically changeable memtable option
|
||||
std::atomic<size_t> write_buffer_size_;
|
||||
|
||||
// These are used to manage memtable flushes to storage
|
||||
bool flush_in_progress_; // started the flush
|
||||
bool flush_completed_; // finished the flush
|
||||
@ -411,12 +432,17 @@ class MemTable {
|
||||
// Insert hints for each prefix.
|
||||
std::unordered_map<Slice, void*, SliceHasher> insert_hints_;
|
||||
|
||||
// Timestamp of oldest key
|
||||
std::atomic<uint64_t> oldest_key_time_;
|
||||
|
||||
// Returns a heuristic flush decision
|
||||
bool ShouldFlushNow() const;
|
||||
|
||||
// Updates flush_state_ using ShouldFlushNow()
|
||||
void UpdateFlushState();
|
||||
|
||||
void UpdateOldestKeyTime();
|
||||
|
||||
// No copying allowed
|
||||
MemTable(const MemTable&);
|
||||
MemTable& operator=(const MemTable&);
|
||||
|
@ -10,6 +10,7 @@
|
||||
#endif
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include "db/memtable.h"
|
||||
#include "db/version_set.h"
|
||||
@ -103,35 +104,31 @@ int MemTableList::NumFlushed() const {
|
||||
bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
|
||||
Status* s, MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg,
|
||||
SequenceNumber* seq,
|
||||
const ReadOptions& read_opts) {
|
||||
SequenceNumber* seq, const ReadOptions& read_opts,
|
||||
bool* is_blob_index) {
|
||||
return GetFromList(&memlist_, key, value, s, merge_context, range_del_agg,
|
||||
seq, read_opts);
|
||||
seq, read_opts, is_blob_index);
|
||||
}
|
||||
|
||||
bool MemTableListVersion::GetFromHistory(const LookupKey& key,
|
||||
std::string* value, Status* s,
|
||||
MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg,
|
||||
SequenceNumber* seq,
|
||||
const ReadOptions& read_opts) {
|
||||
bool MemTableListVersion::GetFromHistory(
|
||||
const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext* merge_context, RangeDelAggregator* range_del_agg,
|
||||
SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) {
|
||||
return GetFromList(&memlist_history_, key, value, s, merge_context,
|
||||
range_del_agg, seq, read_opts);
|
||||
range_del_agg, seq, read_opts, is_blob_index);
|
||||
}
|
||||
|
||||
bool MemTableListVersion::GetFromList(std::list<MemTable*>* list,
|
||||
const LookupKey& key, std::string* value,
|
||||
Status* s, MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg,
|
||||
SequenceNumber* seq,
|
||||
const ReadOptions& read_opts) {
|
||||
bool MemTableListVersion::GetFromList(
|
||||
std::list<MemTable*>* list, const LookupKey& key, std::string* value,
|
||||
Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg,
|
||||
SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) {
|
||||
*seq = kMaxSequenceNumber;
|
||||
|
||||
for (auto& memtable : *list) {
|
||||
SequenceNumber current_seq = kMaxSequenceNumber;
|
||||
|
||||
bool done = memtable->Get(key, value, s, merge_context, range_del_agg,
|
||||
&current_seq, read_opts);
|
||||
&current_seq, read_opts, is_blob_index);
|
||||
if (*seq == kMaxSequenceNumber) {
|
||||
// Store the most recent sequence number of any operation on this key.
|
||||
// Since we only care about the most recent change, we only need to
|
||||
@ -447,6 +444,13 @@ size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
|
||||
|
||||
size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
|
||||
|
||||
uint64_t MemTableList::ApproximateOldestKeyTime() const {
|
||||
if (!current_->memlist_.empty()) {
|
||||
return current_->memlist_.back()->ApproximateOldestKeyTime();
|
||||
}
|
||||
return std::numeric_limits<uint64_t>::max();
|
||||
}
|
||||
|
||||
void MemTableList::InstallNewVersion() {
|
||||
if (current_->refs_ == 1) {
|
||||
// we're the only one using the version, just keep using it
|
||||
|
@ -54,13 +54,15 @@ class MemTableListVersion {
|
||||
// returned). Otherwise, *seq will be set to kMaxSequenceNumber.
|
||||
bool Get(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext* merge_context, RangeDelAggregator* range_del_agg,
|
||||
SequenceNumber* seq, const ReadOptions& read_opts);
|
||||
SequenceNumber* seq, const ReadOptions& read_opts,
|
||||
bool* is_blob_index = nullptr);
|
||||
|
||||
bool Get(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext* merge_context, RangeDelAggregator* range_del_agg,
|
||||
const ReadOptions& read_opts) {
|
||||
const ReadOptions& read_opts, bool* is_blob_index = nullptr) {
|
||||
SequenceNumber seq;
|
||||
return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts);
|
||||
return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts,
|
||||
is_blob_index);
|
||||
}
|
||||
|
||||
// Similar to Get(), but searches the Memtable history of memtables that
|
||||
@ -70,14 +72,16 @@ class MemTableListVersion {
|
||||
bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg, SequenceNumber* seq,
|
||||
const ReadOptions& read_opts);
|
||||
const ReadOptions& read_opts,
|
||||
bool* is_blob_index = nullptr);
|
||||
bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
|
||||
MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg,
|
||||
const ReadOptions& read_opts) {
|
||||
const ReadOptions& read_opts,
|
||||
bool* is_blob_index = nullptr) {
|
||||
SequenceNumber seq;
|
||||
return GetFromHistory(key, value, s, merge_context, range_del_agg, &seq,
|
||||
read_opts);
|
||||
read_opts, is_blob_index);
|
||||
}
|
||||
|
||||
Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
|
||||
@ -117,7 +121,7 @@ class MemTableListVersion {
|
||||
bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
|
||||
std::string* value, Status* s, MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg, SequenceNumber* seq,
|
||||
const ReadOptions& read_opts);
|
||||
const ReadOptions& read_opts, bool* is_blob_index = nullptr);
|
||||
|
||||
void AddMemTable(MemTable* m);
|
||||
|
||||
@ -217,6 +221,9 @@ class MemTableList {
|
||||
// the unflushed mem-tables.
|
||||
size_t ApproximateUnflushedMemTablesMemoryUsage();
|
||||
|
||||
// Returns an estimate of the timestamp of the earliest key.
|
||||
uint64_t ApproximateOldestKeyTime() const;
|
||||
|
||||
// Request a flush of all existing memtables to storage. This will
|
||||
// cause future calls to IsFlushPending() to return true if this list is
|
||||
// non-empty (regardless of the min_write_buffer_number_to_merge
|
||||
|
@ -108,6 +108,22 @@ class SnapshotList {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Whether there is an active snapshot in range [lower_bound, upper_bound).
|
||||
bool HasSnapshotInRange(SequenceNumber lower_bound,
|
||||
SequenceNumber upper_bound) {
|
||||
if (empty()) {
|
||||
return false;
|
||||
}
|
||||
const SnapshotImpl* s = &list_;
|
||||
while (s->next_ != &list_) {
|
||||
if (s->next_->number_ >= lower_bound) {
|
||||
return s->next_->number_ < upper_bound;
|
||||
}
|
||||
s = s->next_;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// get the sequence number of the most recent snapshot
|
||||
SequenceNumber GetNewest() {
|
||||
if (empty()) {
|
||||
|
@ -965,7 +965,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
||||
PinnableSlice* value, Status* status,
|
||||
MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg, bool* value_found,
|
||||
bool* key_exists, SequenceNumber* seq) {
|
||||
bool* key_exists, SequenceNumber* seq, bool* is_blob) {
|
||||
Slice ikey = k.internal_key();
|
||||
Slice user_key = k.user_key();
|
||||
|
||||
@ -981,7 +981,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
||||
user_comparator(), merge_operator_, info_log_, db_statistics_,
|
||||
status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
|
||||
value, value_found, merge_context, range_del_agg, this->env_, seq,
|
||||
merge_operator_ ? &pinned_iters_mgr : nullptr);
|
||||
merge_operator_ ? &pinned_iters_mgr : nullptr, is_blob);
|
||||
|
||||
// Pin blocks that we read to hold merge operands
|
||||
if (merge_operator_) {
|
||||
@ -1030,6 +1030,12 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
|
||||
return;
|
||||
case GetContext::kMerge:
|
||||
break;
|
||||
case GetContext::kBlobIndex:
|
||||
ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
|
||||
*status = Status::NotSupported(
|
||||
"Encounter unexpected blob index. Please open DB with "
|
||||
"rocksdb::blob_db::BlobDB instead.");
|
||||
return;
|
||||
}
|
||||
f = fp.GetNextFile();
|
||||
}
|
||||
@ -1787,27 +1793,33 @@ void VersionStorageInfo::GetOverlappingInputs(
|
||||
void VersionStorageInfo::GetCleanInputsWithinInterval(
|
||||
int level, const InternalKey* begin, const InternalKey* end,
|
||||
std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
|
||||
if (level >= num_non_empty_levels_) {
|
||||
// this level is empty, no inputs within range
|
||||
return;
|
||||
}
|
||||
|
||||
inputs->clear();
|
||||
Slice user_begin, user_end;
|
||||
if (begin != nullptr) {
|
||||
user_begin = begin->user_key();
|
||||
}
|
||||
if (end != nullptr) {
|
||||
user_end = end->user_key();
|
||||
}
|
||||
if (file_index) {
|
||||
*file_index = -1;
|
||||
}
|
||||
if (begin != nullptr && end != nullptr && level > 0) {
|
||||
GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
|
||||
hint_index, file_index,
|
||||
true /* within_interval */);
|
||||
if (level >= num_non_empty_levels_ || level == 0 ||
|
||||
level_files_brief_[level].num_files == 0) {
|
||||
// this level is empty, no inputs within range
|
||||
// also don't support clean input interval within L0
|
||||
return;
|
||||
}
|
||||
|
||||
Slice user_begin, user_end;
|
||||
const auto& level_files = level_files_brief_[level];
|
||||
if (begin == nullptr) {
|
||||
user_begin = ExtractUserKey(level_files.files[0].smallest_key);
|
||||
} else {
|
||||
user_begin = begin->user_key();
|
||||
}
|
||||
if (end == nullptr) {
|
||||
user_end = ExtractUserKey(
|
||||
level_files.files[level_files.num_files - 1].largest_key);
|
||||
} else {
|
||||
user_end = end->user_key();
|
||||
}
|
||||
GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
|
||||
hint_index, file_index,
|
||||
true /* within_interval */);
|
||||
}
|
||||
|
||||
// Store in "*inputs" all files in "level" that overlap [begin,end]
|
||||
@ -1870,8 +1882,8 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
|
||||
} else {
|
||||
ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid,
|
||||
&start_index, &end_index);
|
||||
assert(end_index >= start_index);
|
||||
}
|
||||
assert(end_index >= start_index);
|
||||
// insert overlapping files into vector
|
||||
for (int i = start_index; i <= end_index; i++) {
|
||||
inputs->push_back(files_[level][i]);
|
||||
|
@ -485,7 +485,8 @@ class Version {
|
||||
void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
|
||||
Status* status, MergeContext* merge_context,
|
||||
RangeDelAggregator* range_del_agg, bool* value_found = nullptr,
|
||||
bool* key_exists = nullptr, SequenceNumber* seq = nullptr);
|
||||
bool* key_exists = nullptr, SequenceNumber* seq = nullptr,
|
||||
bool* is_blob = nullptr);
|
||||
|
||||
// Loads some stats information from files. Call without mutex held. It needs
|
||||
// to be called before applying the version to the version set.
|
||||
|
@ -67,6 +67,7 @@ enum ContentFlags : uint32_t {
|
||||
HAS_COMMIT = 1 << 7,
|
||||
HAS_ROLLBACK = 1 << 8,
|
||||
HAS_DELETE_RANGE = 1 << 9,
|
||||
HAS_BLOB_INDEX = 1 << 10,
|
||||
};
|
||||
|
||||
struct BatchContentClassifier : public WriteBatch::Handler {
|
||||
@ -97,6 +98,11 @@ struct BatchContentClassifier : public WriteBatch::Handler {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
|
||||
content_flags |= ContentFlags::HAS_BLOB_INDEX;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status MarkBeginPrepare() override {
|
||||
content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
|
||||
return Status::OK();
|
||||
@ -328,6 +334,17 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag,
|
||||
return Status::Corruption("bad WriteBatch Merge");
|
||||
}
|
||||
break;
|
||||
case kTypeColumnFamilyBlobIndex:
|
||||
if (!GetVarint32(input, column_family)) {
|
||||
return Status::Corruption("bad WriteBatch BlobIndex");
|
||||
}
|
||||
// intentional fallthrough
|
||||
case kTypeBlobIndex:
|
||||
if (!GetLengthPrefixedSlice(input, key) ||
|
||||
!GetLengthPrefixedSlice(input, value)) {
|
||||
return Status::Corruption("bad WriteBatch BlobIndex");
|
||||
}
|
||||
break;
|
||||
case kTypeLogData:
|
||||
assert(blob != nullptr);
|
||||
if (!GetLengthPrefixedSlice(input, blob)) {
|
||||
@ -414,6 +431,13 @@ Status WriteBatch::Iterate(Handler* handler) const {
|
||||
s = handler->MergeCF(column_family, key, value);
|
||||
found++;
|
||||
break;
|
||||
case kTypeColumnFamilyBlobIndex:
|
||||
case kTypeBlobIndex:
|
||||
assert(content_flags_.load(std::memory_order_relaxed) &
|
||||
(ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
|
||||
s = handler->PutBlobIndexCF(column_family, key, value);
|
||||
found++;
|
||||
break;
|
||||
case kTypeLogData:
|
||||
handler->LogData(blob);
|
||||
break;
|
||||
@ -759,6 +783,25 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
|
||||
value);
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::PutBlobIndex(WriteBatch* b,
|
||||
uint32_t column_family_id,
|
||||
const Slice& key, const Slice& value) {
|
||||
LocalSavePoint save(b);
|
||||
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
||||
if (column_family_id == 0) {
|
||||
b->rep_.push_back(static_cast<char>(kTypeBlobIndex));
|
||||
} else {
|
||||
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyBlobIndex));
|
||||
PutVarint32(&b->rep_, column_family_id);
|
||||
}
|
||||
PutLengthPrefixedSlice(&b->rep_, key);
|
||||
PutLengthPrefixedSlice(&b->rep_, value);
|
||||
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
||||
ContentFlags::HAS_BLOB_INDEX,
|
||||
std::memory_order_relaxed);
|
||||
return save.commit();
|
||||
}
|
||||
|
||||
Status WriteBatch::PutLogData(const Slice& blob) {
|
||||
LocalSavePoint save(this);
|
||||
rep_.push_back(static_cast<char>(kTypeLogData));
|
||||
@ -935,8 +978,8 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) override {
|
||||
Status PutCFImpl(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value, ValueType value_type) {
|
||||
if (rebuilding_trx_ != nullptr) {
|
||||
WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
|
||||
return Status::OK();
|
||||
@ -949,9 +992,9 @@ public:
|
||||
}
|
||||
|
||||
MemTable* mem = cf_mems_->GetMemTable();
|
||||
auto* moptions = mem->GetMemTableOptions();
|
||||
auto* moptions = mem->GetImmutableMemTableOptions();
|
||||
if (!moptions->inplace_update_support) {
|
||||
mem->Add(sequence_, kTypeValue, key, value, concurrent_memtable_writes_,
|
||||
mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_,
|
||||
get_post_process_info(mem));
|
||||
} else if (moptions->inplace_callback == nullptr) {
|
||||
assert(!concurrent_memtable_writes_);
|
||||
@ -986,11 +1029,11 @@ public:
|
||||
value, &merged_value);
|
||||
if (status == UpdateStatus::UPDATED_INPLACE) {
|
||||
// prev_value is updated in-place with final value.
|
||||
mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
|
||||
mem->Add(sequence_, value_type, key, Slice(prev_buffer, prev_size));
|
||||
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
|
||||
} else if (status == UpdateStatus::UPDATED) {
|
||||
// merged_value contains the final value.
|
||||
mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
|
||||
mem->Add(sequence_, value_type, key, Slice(merged_value));
|
||||
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
|
||||
}
|
||||
}
|
||||
@ -1003,6 +1046,11 @@ public:
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) override {
|
||||
return PutCFImpl(column_family_id, key, value, kTypeValue);
|
||||
}
|
||||
|
||||
Status DeleteImpl(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value, ValueType delete_type) {
|
||||
MemTable* mem = cf_mems_->GetMemTable();
|
||||
@ -1091,7 +1139,7 @@ public:
|
||||
}
|
||||
|
||||
MemTable* mem = cf_mems_->GetMemTable();
|
||||
auto* moptions = mem->GetMemTableOptions();
|
||||
auto* moptions = mem->GetImmutableMemTableOptions();
|
||||
bool perform_merge = false;
|
||||
|
||||
// If we pass DB through and options.max_successive_merges is hit
|
||||
@ -1159,6 +1207,12 @@ public:
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
|
||||
const Slice& value) override {
|
||||
// Same as PutCF except for value type.
|
||||
return PutCFImpl(column_family_id, key, value, kTypeBlobIndex);
|
||||
}
|
||||
|
||||
void CheckMemtableFull() {
|
||||
if (flush_scheduler_ != nullptr) {
|
||||
auto* cfd = cf_mems_->current();
|
||||
|
@ -99,6 +99,9 @@ class WriteBatchInternal {
|
||||
static Status Merge(WriteBatch* batch, uint32_t column_family_id,
|
||||
const SliceParts& key, const SliceParts& value);
|
||||
|
||||
static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
|
||||
const Slice& key, const Slice& value);
|
||||
|
||||
static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid);
|
||||
|
||||
static Status MarkRollback(WriteBatch* batch, const Slice& xid);
|
||||
|
@ -532,6 +532,11 @@ void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
|
||||
Writer* last_writer = write_group.last_writer;
|
||||
assert(leader->link_older == nullptr);
|
||||
|
||||
// Propagate memtable write error to the whole group.
|
||||
if (status.ok() && !write_group.status.ok()) {
|
||||
status = write_group.status;
|
||||
}
|
||||
|
||||
if (enable_pipelined_write_) {
|
||||
// Notify writers that don't write to memtable to exit.
|
||||
for (Writer* w = last_writer; w != leader;) {
|
||||
|
@ -25,6 +25,15 @@ class Cleanable {
|
||||
public:
|
||||
Cleanable();
|
||||
~Cleanable();
|
||||
|
||||
// No copy constructor and copy assignment allowed.
|
||||
Cleanable(Cleanable&) = delete;
|
||||
Cleanable& operator=(Cleanable&) = delete;
|
||||
|
||||
// Move constructor and move assignment are allowed.
|
||||
Cleanable(Cleanable&&);
|
||||
Cleanable& operator=(Cleanable&&);
|
||||
|
||||
// Clients are allowed to register function/arg1/arg2 triples that
|
||||
// will be invoked when this iterator is destroyed.
|
||||
//
|
||||
|
@ -36,6 +36,7 @@ class CompactionFilter {
|
||||
enum ValueType {
|
||||
kValue,
|
||||
kMergeOperand,
|
||||
kBlobIndex, // used internally by BlobDB.
|
||||
};
|
||||
|
||||
enum class Decision {
|
||||
@ -171,6 +172,8 @@ class CompactionFilter {
|
||||
bool rv = FilterMergeOperand(level, key, existing_value);
|
||||
return rv ? Decision::kRemove : Decision::kKeep;
|
||||
}
|
||||
case ValueType::kBlobIndex:
|
||||
return Decision::kKeep;
|
||||
}
|
||||
assert(false);
|
||||
return Decision::kKeep;
|
||||
|
@ -325,7 +325,8 @@ void CancelAllBackgroundWork(DB* db, bool wait = false);
|
||||
|
||||
// Delete files which are entirely in the given range
|
||||
// Could leave some keys in the range which are in files which are not
|
||||
// entirely in the range.
|
||||
// entirely in the range. Also leaves L0 files regardless of whether they're
|
||||
// in the range.
|
||||
// Snapshots before the delete might not see the data in the given range.
|
||||
Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
|
||||
const Slice* begin, const Slice* end);
|
||||
|
@ -582,6 +582,12 @@ class DB {
|
||||
|
||||
// "rocksdb.is-write-stopped" - Return 1 if write has been stopped.
|
||||
static const std::string kIsWriteStopped;
|
||||
|
||||
// "rocksdb.estimate-oldest-key-time" - returns an estimation of
|
||||
// oldest key timestamp in the DB. Currently only available for
|
||||
// FIFO compaction with
|
||||
// compaction_options_fifo.allow_compaction = false.
|
||||
static const std::string kEstimateOldestKeyTime;
|
||||
};
|
||||
#endif /* ROCKSDB_LITE */
|
||||
|
||||
@ -632,6 +638,7 @@ class DB {
|
||||
// "rocksdb.num-running-flushes"
|
||||
// "rocksdb.actual-delayed-write-rate"
|
||||
// "rocksdb.is-write-stopped"
|
||||
// "rocksdb.estimate-oldest-key-time"
|
||||
virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
|
||||
const Slice& property, uint64_t* value) = 0;
|
||||
virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
|
||||
|
@ -206,6 +206,7 @@ class CompactionEventListener {
|
||||
kDelete,
|
||||
kSingleDelete,
|
||||
kRangeDelete,
|
||||
kBlobIndex,
|
||||
kInvalid,
|
||||
};
|
||||
|
||||
|
@ -129,6 +129,10 @@ class PinnableSlice : public Slice, public Cleanable {
|
||||
PinnableSlice() { buf_ = &self_space_; }
|
||||
explicit PinnableSlice(std::string* buf) { buf_ = buf; }
|
||||
|
||||
// No copy constructor and copy assignment allowed.
|
||||
PinnableSlice(PinnableSlice&) = delete;
|
||||
PinnableSlice& operator=(PinnableSlice&) = delete;
|
||||
|
||||
inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1,
|
||||
void* arg2) {
|
||||
assert(!pinned_);
|
||||
|
@ -4,8 +4,8 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/types.h"
|
||||
|
||||
@ -49,6 +49,7 @@ struct TablePropertiesNames {
|
||||
static const std::string kPropertyCollectors;
|
||||
static const std::string kCompression;
|
||||
static const std::string kCreationTime;
|
||||
static const std::string kOldestKeyTime;
|
||||
};
|
||||
|
||||
extern const std::string kPropertiesBlock;
|
||||
@ -162,6 +163,8 @@ struct TableProperties {
|
||||
// The time when the SST file was created.
|
||||
// Since SST files are immutable, this is equivalent to last modified time.
|
||||
uint64_t creation_time = 0;
|
||||
// Timestamp of the earliest key. 0 means unknown.
|
||||
uint64_t oldest_key_time = 0;
|
||||
|
||||
// Name of the column family with which this SST file is associated.
|
||||
// If column family is unknown, `column_family_name` will be an empty string.
|
||||
|
@ -16,6 +16,8 @@ namespace rocksdb {
|
||||
// store multiple versions of a same user key due to snapshots, compaction not
|
||||
// happening yet, etc.
|
||||
struct KeyVersion {
|
||||
KeyVersion() : user_key(""), value(""), sequence(0), type(0) {}
|
||||
|
||||
KeyVersion(const std::string& _user_key, const std::string& _value,
|
||||
SequenceNumber _sequence, int _type)
|
||||
: user_key(_user_key), value(_value), sequence(_sequence), type(_type) {}
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
#define ROCKSDB_MAJOR 5
|
||||
#define ROCKSDB_MINOR 8
|
||||
#define ROCKSDB_PATCH 0
|
||||
#define ROCKSDB_PATCH 8
|
||||
|
||||
// Do not use these. We made the mistake of declaring macros starting with
|
||||
// double underscore. Now we have to live with our choice. We'll deprecate these
|
||||
|
@ -233,6 +233,12 @@ class WriteBatch : public WriteBatchBase {
|
||||
}
|
||||
virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {}
|
||||
|
||||
virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/,
|
||||
const Slice& /*key*/,
|
||||
const Slice& /*value*/) {
|
||||
return Status::InvalidArgument("PutBlobIndexCF not implemented");
|
||||
}
|
||||
|
||||
// The default implementation of LogData does nothing.
|
||||
virtual void LogData(const Slice& blob);
|
||||
|
||||
|
@ -278,7 +278,7 @@ struct InlineSkipList<Comparator>::Node {
|
||||
// next_[0]. This is used for passing data from AllocateKey to Insert.
|
||||
void StashHeight(const int height) {
|
||||
assert(sizeof(int) <= sizeof(next_[0]));
|
||||
memcpy(&next_[0], &height, sizeof(int));
|
||||
memcpy(static_cast<void*>(&next_[0]), &height, sizeof(int));
|
||||
}
|
||||
|
||||
// Retrieves the value passed to StashHeight. Undefined after a call
|
||||
@ -298,30 +298,30 @@ struct InlineSkipList<Comparator>::Node {
|
||||
assert(n >= 0);
|
||||
// Use an 'acquire load' so that we observe a fully initialized
|
||||
// version of the returned Node.
|
||||
return (next_[-n].load(std::memory_order_acquire));
|
||||
return ((&next_[0] - n)->load(std::memory_order_acquire));
|
||||
}
|
||||
|
||||
void SetNext(int n, Node* x) {
|
||||
assert(n >= 0);
|
||||
// Use a 'release store' so that anybody who reads through this
|
||||
// pointer observes a fully initialized version of the inserted node.
|
||||
next_[-n].store(x, std::memory_order_release);
|
||||
(&next_[0] - n)->store(x, std::memory_order_release);
|
||||
}
|
||||
|
||||
bool CASNext(int n, Node* expected, Node* x) {
|
||||
assert(n >= 0);
|
||||
return next_[-n].compare_exchange_strong(expected, x);
|
||||
return (&next_[0] - n)->compare_exchange_strong(expected, x);
|
||||
}
|
||||
|
||||
// No-barrier variants that can be safely used in a few locations.
|
||||
Node* NoBarrier_Next(int n) {
|
||||
assert(n >= 0);
|
||||
return next_[-n].load(std::memory_order_relaxed);
|
||||
return (&next_[0] - n)->load(std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
void NoBarrier_SetNext(int n, Node* x) {
|
||||
assert(n >= 0);
|
||||
next_[-n].store(x, std::memory_order_relaxed);
|
||||
(&next_[0] - n)->store(x, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
// Insert node after prev on specific level.
|
||||
|
@ -42,6 +42,9 @@ static std::map<CompactionStopStyle, std::string>
|
||||
{kCompactionStopStyleSimilarSize, "kCompactionStopStyleSimilarSize"},
|
||||
{kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}};
|
||||
|
||||
static std::unordered_map<std::string, ChecksumType> checksum_type_string_map =
|
||||
{{"kNoChecksum", kNoChecksum}, {"kCRC32c", kCRC32c}, {"kxxHash", kxxHash}};
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
Status GetMutableOptionsFromStrings(
|
||||
@ -600,9 +603,6 @@ static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
|
||||
static std::unordered_map<std::string, EncodingType> encoding_type_string_map =
|
||||
{{"kPlain", kPlain}, {"kPrefix", kPrefix}};
|
||||
|
||||
static std::unordered_map<std::string, ChecksumType> checksum_type_string_map =
|
||||
{{"kNoChecksum", kNoChecksum}, {"kCRC32c", kCRC32c}, {"kxxHash", kxxHash}};
|
||||
|
||||
static std::unordered_map<std::string, CompactionStyle>
|
||||
compaction_style_string_map = {
|
||||
{"kCompactionStyleLevel", kCompactionStyleLevel},
|
||||
|
@ -228,80 +228,3 @@ int GetMaxOpenFiles() { return -1; }
|
||||
|
||||
} // namespace port
|
||||
} // namespace rocksdb
|
||||
|
||||
#ifdef JEMALLOC
|
||||
|
||||
#include "jemalloc/jemalloc.h"
|
||||
|
||||
#ifndef JEMALLOC_NON_INIT
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace port {
|
||||
|
||||
__declspec(noinline) void WINAPI InitializeJemalloc() {
|
||||
je_init();
|
||||
atexit(je_uninit);
|
||||
}
|
||||
|
||||
} // port
|
||||
} // rocksdb
|
||||
|
||||
extern "C" {
|
||||
|
||||
#ifdef _WIN64
|
||||
|
||||
#pragma comment(linker, "/INCLUDE:p_rocksdb_init_jemalloc")
|
||||
|
||||
typedef void(WINAPI* CRT_Startup_Routine)(void);
|
||||
|
||||
// .CRT section is merged with .rdata on x64 so it must be constant data.
|
||||
// must be of external linkage
|
||||
// We put this into XCT since we want to run this earlier than C++ static
|
||||
// constructors
|
||||
// which are placed into XCU
|
||||
#pragma const_seg(".CRT$XCT")
|
||||
extern const CRT_Startup_Routine p_rocksdb_init_jemalloc;
|
||||
const CRT_Startup_Routine p_rocksdb_init_jemalloc =
|
||||
rocksdb::port::InitializeJemalloc;
|
||||
#pragma const_seg()
|
||||
|
||||
#else // _WIN64
|
||||
|
||||
// x86 untested
|
||||
|
||||
#pragma comment(linker, "/INCLUDE:_p_rocksdb_init_jemalloc")
|
||||
|
||||
#pragma section(".CRT$XCT", read)
|
||||
JEMALLOC_SECTION(".CRT$XCT") JEMALLOC_ATTR(used) static const void(
|
||||
WINAPI* p_rocksdb_init_jemalloc)(void) = rocksdb::port::InitializeJemalloc;
|
||||
|
||||
#endif // _WIN64
|
||||
|
||||
} // extern "C"
|
||||
|
||||
#endif // JEMALLOC_NON_INIT
|
||||
|
||||
// Global operators to be replaced by a linker
|
||||
|
||||
void* operator new(size_t size) {
|
||||
void* p = je_malloc(size);
|
||||
if (!p) {
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
void* operator new[](size_t size) {
|
||||
void* p = je_malloc(size);
|
||||
if (!p) {
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
void operator delete(void* p) { je_free(p); }
|
||||
|
||||
void operator delete[](void* p) { je_free(p); }
|
||||
|
||||
#endif // JEMALLOC
|
||||
|
@ -240,13 +240,31 @@ extern void InitOnce(OnceType* once, void (*initializer)());
|
||||
#define CACHE_LINE_SIZE 64U
|
||||
#endif
|
||||
|
||||
#ifdef ROCKSDB_JEMALLOC
|
||||
#include "jemalloc/jemalloc.h"
|
||||
// Separate inlines so they can be replaced if needed
|
||||
inline void* jemalloc_aligned_alloc( size_t size, size_t alignment) {
|
||||
return je_aligned_alloc(alignment, size);
|
||||
}
|
||||
// Releases a block through jemalloc's je_free; the counterpart of
// jemalloc_aligned_alloc.
inline void jemalloc_aligned_free(void* p) { je_free(p); }
|
||||
#endif
|
||||
|
||||
inline void *cacheline_aligned_alloc(size_t size) {
|
||||
return _aligned_malloc(CACHE_LINE_SIZE, size);
|
||||
#ifdef ROCKSDB_JEMALLOC
|
||||
return jemalloc_aligned_alloc(size, CACHE_LINE_SIZE);
|
||||
#else
|
||||
return _aligned_malloc(size, CACHE_LINE_SIZE);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void cacheline_aligned_free(void *memblock) {
|
||||
#ifdef ROCKSDB_JEMALLOC
|
||||
jemalloc_aligned_free(memblock);
|
||||
#else
|
||||
_aligned_free(memblock);
|
||||
#endif
|
||||
}
|
||||
|
||||
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52991 for MINGW32
|
||||
|
47
port/win/win_jemalloc.cc
Normal file
47
port/win/win_jemalloc.cc
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef ROCKSDB_JEMALLOC
|
||||
# error This file can only be part of jemalloc aware build
|
||||
#endif
|
||||
|
||||
#include <stdexcept>
|
||||
#include "jemalloc/jemalloc.h"
|
||||
|
||||
// Global operators to be replaced by a linker when this file is
|
||||
// a part of the build
|
||||
|
||||
void* operator new(size_t size) {
|
||||
void* p = je_malloc(size);
|
||||
if (!p) {
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
void* operator new[](size_t size) {
|
||||
void* p = je_malloc(size);
|
||||
if (!p) {
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
void operator delete(void* p) {
|
||||
if (p) {
|
||||
je_free(p);
|
||||
}
|
||||
}
|
||||
|
||||
void operator delete[](void* p) {
|
||||
if (p) {
|
||||
je_free(p);
|
||||
}
|
||||
}
|
||||
|
@ -17,10 +17,6 @@
|
||||
|
||||
#ifdef XPRESS
|
||||
|
||||
#ifdef JEMALLOC
|
||||
#include <jemalloc/jemalloc.h>
|
||||
#endif
|
||||
|
||||
// Put this under ifdef so windows systems w/o this
|
||||
// can still build
|
||||
#include <compressapi.h>
|
||||
@ -43,22 +39,6 @@ auto CloseDecompressorFun = [](void* h) {
|
||||
::CloseDecompressor(reinterpret_cast<DECOMPRESSOR_HANDLE>(h));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#ifdef JEMALLOC
|
||||
// Make sure compressors use our jemalloc if redirected
|
||||
PVOID CompressorAlloc(PVOID, SIZE_T size) {
|
||||
return je_malloc(size);
|
||||
}
|
||||
|
||||
VOID CompressorFree(PVOID, PVOID p) {
|
||||
if (p != NULL) {
|
||||
je_free(p);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
bool Compress(const char* input, size_t length, std::string* output) {
|
||||
@ -73,17 +53,6 @@ bool Compress(const char* input, size_t length, std::string* output) {
|
||||
|
||||
COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
|
||||
|
||||
#ifdef JEMALLOC
|
||||
COMPRESS_ALLOCATION_ROUTINES allocationRoutines;
|
||||
|
||||
// Init. allocation routines
|
||||
allocationRoutines.Allocate = CompressorAlloc;
|
||||
allocationRoutines.Free = CompressorFree;
|
||||
allocationRoutines.UserContext = NULL;
|
||||
|
||||
allocRoutinesPtr = &allocationRoutines;
|
||||
#endif
|
||||
|
||||
COMPRESSOR_HANDLE compressor = NULL;
|
||||
|
||||
BOOL success = CreateCompressor(
|
||||
@ -94,17 +63,17 @@ bool Compress(const char* input, size_t length, std::string* output) {
|
||||
if (!success) {
|
||||
#ifdef _DEBUG
|
||||
std::cerr << "XPRESS: Failed to create Compressor LastError: " <<
|
||||
GetLastError() << std::endl;
|
||||
GetLastError() << std::endl;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
std::unique_ptr<void, decltype(CloseCompressorFun)>
|
||||
compressorGuard(compressor, CloseCompressorFun);
|
||||
compressorGuard(compressor, CloseCompressorFun);
|
||||
|
||||
SIZE_T compressedBufferSize = 0;
|
||||
|
||||
// Query compressed buffer size.
|
||||
// Query compressed buffer size.
|
||||
success = ::Compress(
|
||||
compressor, // Compressor Handle
|
||||
const_cast<char*>(input), // Input buffer
|
||||
@ -123,8 +92,8 @@ bool Compress(const char* input, size_t length, std::string* output) {
|
||||
"XPRESS: Failed to estimate compressed buffer size LastError " <<
|
||||
lastError << std::endl;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
assert(compressedBufferSize > 0);
|
||||
@ -146,7 +115,7 @@ bool Compress(const char* input, size_t length, std::string* output) {
|
||||
if (!success) {
|
||||
#ifdef _DEBUG
|
||||
std::cerr << "XPRESS: Failed to compress LastError " <<
|
||||
GetLastError() << std::endl;
|
||||
GetLastError() << std::endl;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
@ -169,16 +138,6 @@ char* Decompress(const char* input_data, size_t input_length,
|
||||
|
||||
COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
|
||||
|
||||
#ifdef JEMALLOC
|
||||
COMPRESS_ALLOCATION_ROUTINES allocationRoutines;
|
||||
|
||||
// Init. allocation routines
|
||||
allocationRoutines.Allocate = CompressorAlloc;
|
||||
allocationRoutines.Free = CompressorFree;
|
||||
allocationRoutines.UserContext = NULL;
|
||||
allocRoutinesPtr = &allocationRoutines;
|
||||
#endif
|
||||
|
||||
DECOMPRESSOR_HANDLE decompressor = NULL;
|
||||
|
||||
BOOL success = CreateDecompressor(
|
||||
@ -190,7 +149,7 @@ char* Decompress(const char* input_data, size_t input_length,
|
||||
if (!success) {
|
||||
#ifdef _DEBUG
|
||||
std::cerr << "XPRESS: Failed to create Decompressor LastError "
|
||||
<< GetLastError() << std::endl;
|
||||
<< GetLastError() << std::endl;
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
@ -215,8 +174,8 @@ char* Decompress(const char* input_data, size_t input_length,
|
||||
if (lastError != ERROR_INSUFFICIENT_BUFFER) {
|
||||
#ifdef _DEBUG
|
||||
std::cerr
|
||||
<< "XPRESS: Failed to estimate decompressed buffer size LastError "
|
||||
<< lastError << std::endl;
|
||||
<< "XPRESS: Failed to estimate decompressed buffer size LastError "
|
||||
<< lastError << std::endl;
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
|
1
src.mk
1
src.mk
@ -301,6 +301,7 @@ MAIN_SOURCES = \
|
||||
options/options_test.cc \
|
||||
table/block_based_filter_block_test.cc \
|
||||
table/block_test.cc \
|
||||
table/cleanable_test.cc \
|
||||
table/cuckoo_table_builder_test.cc \
|
||||
table/cuckoo_table_reader_test.cc \
|
||||
table/full_filter_block_test.cc \
|
||||
|
@ -10,7 +10,6 @@
|
||||
#include "table/block_based_table_builder.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <list>
|
||||
@ -276,6 +275,7 @@ struct BlockBasedTableBuilder::Rep {
|
||||
uint32_t column_family_id;
|
||||
const std::string& column_family_name;
|
||||
uint64_t creation_time = 0;
|
||||
uint64_t oldest_key_time = 0;
|
||||
|
||||
std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors;
|
||||
|
||||
@ -288,7 +288,8 @@ struct BlockBasedTableBuilder::Rep {
|
||||
const CompressionType _compression_type,
|
||||
const CompressionOptions& _compression_opts,
|
||||
const std::string* _compression_dict, const bool skip_filters,
|
||||
const std::string& _column_family_name, const uint64_t _creation_time)
|
||||
const std::string& _column_family_name, const uint64_t _creation_time,
|
||||
const uint64_t _oldest_key_time)
|
||||
: ioptions(_ioptions),
|
||||
table_options(table_opt),
|
||||
internal_comparator(icomparator),
|
||||
@ -305,7 +306,8 @@ struct BlockBasedTableBuilder::Rep {
|
||||
table_options, data_block)),
|
||||
column_family_id(_column_family_id),
|
||||
column_family_name(_column_family_name),
|
||||
creation_time(_creation_time) {
|
||||
creation_time(_creation_time),
|
||||
oldest_key_time(_oldest_key_time) {
|
||||
if (table_options.index_type ==
|
||||
BlockBasedTableOptions::kTwoLevelIndexSearch) {
|
||||
p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
|
||||
@ -344,7 +346,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
|
||||
const CompressionType compression_type,
|
||||
const CompressionOptions& compression_opts,
|
||||
const std::string* compression_dict, const bool skip_filters,
|
||||
const std::string& column_family_name, const uint64_t creation_time) {
|
||||
const std::string& column_family_name, const uint64_t creation_time,
|
||||
const uint64_t oldest_key_time) {
|
||||
BlockBasedTableOptions sanitized_table_options(table_options);
|
||||
if (sanitized_table_options.format_version == 0 &&
|
||||
sanitized_table_options.checksum != kCRC32c) {
|
||||
@ -357,10 +360,11 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
|
||||
sanitized_table_options.format_version = 1;
|
||||
}
|
||||
|
||||
rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator,
|
||||
int_tbl_prop_collector_factories, column_family_id, file,
|
||||
compression_type, compression_opts, compression_dict,
|
||||
skip_filters, column_family_name, creation_time);
|
||||
rep_ =
|
||||
new Rep(ioptions, sanitized_table_options, internal_comparator,
|
||||
int_tbl_prop_collector_factories, column_family_id, file,
|
||||
compression_type, compression_opts, compression_dict,
|
||||
skip_filters, column_family_name, creation_time, oldest_key_time);
|
||||
|
||||
if (rep_->filter_builder != nullptr) {
|
||||
rep_->filter_builder->StartBlock(0);
|
||||
@ -738,6 +742,7 @@ Status BlockBasedTableBuilder::Finish() {
|
||||
r->p_index_builder_->EstimateTopLevelIndexSize(r->offset);
|
||||
}
|
||||
r->props.creation_time = r->creation_time;
|
||||
r->props.oldest_key_time = r->oldest_key_time;
|
||||
|
||||
// Add basic properties
|
||||
property_block_builder.AddTableProperty(r->props);
|
||||
|
@ -47,7 +47,8 @@ class BlockBasedTableBuilder : public TableBuilder {
|
||||
const CompressionType compression_type,
|
||||
const CompressionOptions& compression_opts,
|
||||
const std::string* compression_dict, const bool skip_filters,
|
||||
const std::string& column_family_name, const uint64_t creation_time = 0);
|
||||
const std::string& column_family_name, const uint64_t creation_time = 0,
|
||||
const uint64_t oldest_key_time = 0);
|
||||
|
||||
// REQUIRES: Either Finish() or Abandon() has been called.
|
||||
~BlockBasedTableBuilder();
|
||||
|
@ -79,7 +79,8 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder(
|
||||
table_builder_options.compression_dict,
|
||||
table_builder_options.skip_filters,
|
||||
table_builder_options.column_family_name,
|
||||
table_builder_options.creation_time);
|
||||
table_builder_options.creation_time,
|
||||
table_builder_options.oldest_key_time);
|
||||
|
||||
return table_builder;
|
||||
}
|
||||
|
@ -33,14 +33,12 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
|
||||
|
||||
} // namespace
|
||||
|
||||
GetContext::GetContext(const Comparator* ucmp,
|
||||
const MergeOperator* merge_operator, Logger* logger,
|
||||
Statistics* statistics, GetState init_state,
|
||||
const Slice& user_key, PinnableSlice* pinnable_val,
|
||||
bool* value_found, MergeContext* merge_context,
|
||||
RangeDelAggregator* _range_del_agg, Env* env,
|
||||
SequenceNumber* seq,
|
||||
PinnedIteratorsManager* _pinned_iters_mgr)
|
||||
GetContext::GetContext(
|
||||
const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
|
||||
Statistics* statistics, GetState init_state, const Slice& user_key,
|
||||
PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context,
|
||||
RangeDelAggregator* _range_del_agg, Env* env, SequenceNumber* seq,
|
||||
PinnedIteratorsManager* _pinned_iters_mgr, bool* is_blob_index)
|
||||
: ucmp_(ucmp),
|
||||
merge_operator_(merge_operator),
|
||||
logger_(logger),
|
||||
@ -54,7 +52,8 @@ GetContext::GetContext(const Comparator* ucmp,
|
||||
env_(env),
|
||||
seq_(seq),
|
||||
replay_log_(nullptr),
|
||||
pinned_iters_mgr_(_pinned_iters_mgr) {
|
||||
pinned_iters_mgr_(_pinned_iters_mgr),
|
||||
is_blob_index_(is_blob_index) {
|
||||
if (seq_) {
|
||||
*seq_ = kMaxSequenceNumber;
|
||||
}
|
||||
@ -99,13 +98,19 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
|
||||
|
||||
auto type = parsed_key.type;
|
||||
// Key matches. Process it
|
||||
if ((type == kTypeValue || type == kTypeMerge) &&
|
||||
if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
|
||||
range_del_agg_ != nullptr && range_del_agg_->ShouldDelete(parsed_key)) {
|
||||
type = kTypeRangeDeletion;
|
||||
}
|
||||
switch (type) {
|
||||
case kTypeValue:
|
||||
case kTypeBlobIndex:
|
||||
assert(state_ == kNotFound || state_ == kMerge);
|
||||
if (type == kTypeBlobIndex && is_blob_index_ == nullptr) {
|
||||
// Blob value not supported. Stop.
|
||||
state_ = kBlobIndex;
|
||||
return false;
|
||||
}
|
||||
if (kNotFound == state_) {
|
||||
state_ = kFound;
|
||||
if (LIKELY(pinnable_val_ != nullptr)) {
|
||||
@ -131,6 +136,9 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
|
||||
}
|
||||
}
|
||||
}
|
||||
if (is_blob_index_ != nullptr) {
|
||||
*is_blob_index_ = (type == kTypeBlobIndex);
|
||||
}
|
||||
return false;
|
||||
|
||||
case kTypeDeletion:
|
||||
|
@ -22,7 +22,8 @@ class GetContext {
|
||||
kFound,
|
||||
kDeleted,
|
||||
kCorrupt,
|
||||
kMerge // saver contains the current merge result (the operands)
|
||||
kMerge, // saver contains the current merge result (the operands)
|
||||
kBlobIndex,
|
||||
};
|
||||
|
||||
GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
|
||||
@ -30,7 +31,8 @@ class GetContext {
|
||||
const Slice& user_key, PinnableSlice* value, bool* value_found,
|
||||
MergeContext* merge_context, RangeDelAggregator* range_del_agg,
|
||||
Env* env, SequenceNumber* seq = nullptr,
|
||||
PinnedIteratorsManager* _pinned_iters_mgr = nullptr);
|
||||
PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
|
||||
bool* is_blob_index = nullptr);
|
||||
|
||||
void MarkKeyMayExist();
|
||||
|
||||
@ -83,6 +85,7 @@ class GetContext {
|
||||
// Used to temporarily pin blocks when state_ == GetContext::kMerge
|
||||
PinnedIteratorsManager* pinned_iters_mgr_;
|
||||
bool sample_;
|
||||
bool* is_blob_index_;
|
||||
};
|
||||
|
||||
void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
|
||||
|
@ -21,6 +21,19 @@ Cleanable::Cleanable() {
|
||||
|
||||
Cleanable::~Cleanable() { DoCleanup(); }
|
||||
|
||||
// Move constructor: delegates to the move assignment operator, which takes
// over other's cleanup_ and leaves other with a null function/next.
Cleanable::Cleanable(Cleanable&& other) {
  this->operator=(std::move(other));
}
|
||||
|
||||
// Move assignment: copies other's cleanup_ head into this object and then
// clears other's function/next so other no longer refers to those entries.
// Self-assignment is a no-op.
// NOTE(review): whatever cleanup_ held in *this beforehand is overwritten
// without being run here -- presumably callers only assign into an empty
// Cleanable; confirm against call sites.
Cleanable& Cleanable::operator=(Cleanable&& other) {
  if (this == &other) {
    return *this;
  }
  cleanup_ = other.cleanup_;
  other.cleanup_.function = nullptr;
  other.cleanup_.next = nullptr;
  return *this;
}
|
||||
|
||||
// If the entire linked list was on heap we could have simply attached one
|
||||
// linked list to another. However the head is an embedded object to avoid the cost
|
||||
// of creating objects for most of the use cases when the Cleanable has only one
|
||||
|
@ -77,6 +77,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
|
||||
Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
|
||||
Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id);
|
||||
Add(TablePropertiesNames::kCreationTime, props.creation_time);
|
||||
Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time);
|
||||
|
||||
if (!props.filter_policy_name.empty()) {
|
||||
Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name);
|
||||
@ -211,6 +212,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
|
||||
&new_table_properties->column_family_id},
|
||||
{TablePropertiesNames::kCreationTime,
|
||||
&new_table_properties->creation_time},
|
||||
{TablePropertiesNames::kOldestKeyTime,
|
||||
&new_table_properties->oldest_key_time},
|
||||
};
|
||||
|
||||
std::string last_key;
|
||||
|
@ -55,7 +55,7 @@ struct TableBuilderOptions {
|
||||
const CompressionOptions& _compression_opts,
|
||||
const std::string* _compression_dict, bool _skip_filters,
|
||||
const std::string& _column_family_name, int _level,
|
||||
const uint64_t _creation_time = 0)
|
||||
const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0)
|
||||
: ioptions(_ioptions),
|
||||
internal_comparator(_internal_comparator),
|
||||
int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories),
|
||||
@ -65,7 +65,8 @@ struct TableBuilderOptions {
|
||||
skip_filters(_skip_filters),
|
||||
column_family_name(_column_family_name),
|
||||
level(_level),
|
||||
creation_time(_creation_time) {}
|
||||
creation_time(_creation_time),
|
||||
oldest_key_time(_oldest_key_time) {}
|
||||
const ImmutableCFOptions& ioptions;
|
||||
const InternalKeyComparator& internal_comparator;
|
||||
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
|
||||
@ -78,6 +79,7 @@ struct TableBuilderOptions {
|
||||
const std::string& column_family_name;
|
||||
int level; // what level this table/file is on, -1 for "not set, don't know"
|
||||
const uint64_t creation_time;
|
||||
const int64_t oldest_key_time;
|
||||
};
|
||||
|
||||
// TableBuilder provides the interface used to build a Table
|
||||
|
@ -139,6 +139,9 @@ std::string TableProperties::ToString(
|
||||
|
||||
AppendProperty(result, "creation time", creation_time, prop_delim, kv_delim);
|
||||
|
||||
AppendProperty(result, "time stamp of earliest key", oldest_key_time,
|
||||
prop_delim, kv_delim);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -191,6 +194,8 @@ const std::string TablePropertiesNames::kPropertyCollectors =
|
||||
"rocksdb.property.collectors";
|
||||
const std::string TablePropertiesNames::kCompression = "rocksdb.compression";
|
||||
const std::string TablePropertiesNames::kCreationTime = "rocksdb.creation.time";
|
||||
const std::string TablePropertiesNames::kOldestKeyTime =
|
||||
"rocksdb.oldest.key.time";
|
||||
|
||||
extern const std::string kPropertiesBlock = "rocksdb.properties";
|
||||
// Old property block name for backward compatibility
|
||||
|
@ -8,8 +8,6 @@ set(USE_SNAPPY_DEFAULT 0) # SNAPPY is disabled by default, enable with -D
|
||||
set(USE_LZ4_DEFAULT 0) # LZ4 is disabled by default, enable with -DLZ4=1 cmake command line agrument
|
||||
set(USE_ZLIB_DEFAULT 0) # ZLIB is disabled by default, enable with -DZLIB=1 cmake command line agrument
|
||||
set(USE_XPRESS_DEFAULT 0) # XPRESS is disabled by default, enable with -DXPRESS=1 cmake command line agrument
|
||||
set(USE_JEMALLOC_DEFAULT 0) # JEMALLOC is disabled by default, enable with -DJEMALLOC=1 cmake command line agrument
|
||||
set(USE_JENONINIT_DEFAULT 1) # Default is enabled do not call je_init/je_uninit as the newer versions do not have it disable with -DJENONINIT=0
|
||||
|
||||
#
|
||||
# This example assumes all the libraries locate in directories under THIRDPARTY_HOME environment variable
|
||||
@ -219,15 +217,15 @@ set(JEMALLOC_LIB_RELEASE ${JEMALLOC_HOME}/bin/retail/amd64/jemalloc.lib)
|
||||
#
|
||||
# Don't touch these lines
|
||||
#
|
||||
if (DEFINED JEMALLOC)
|
||||
set(USE_JEMALLOC ${JEMALLOC})
|
||||
else ()
|
||||
set(USE_JEMALLOC ${USE_JEMALLOC_DEFAULT})
|
||||
endif ()
|
||||
|
||||
if (${USE_JEMALLOC} EQUAL 1)
|
||||
# For compatibility with the previous JEMALLOC option
|
||||
if(JEMALLOC)
|
||||
set(WITH_JEMALLOC ON)
|
||||
endif()
|
||||
|
||||
if (WITH_JEMALLOC)
|
||||
message(STATUS "JEMALLOC library is enabled")
|
||||
set(JEMALLOC_CXX_FLAGS "-DJEMALLOC -DJEMALLOC_EXPORT= ")
|
||||
set(JEMALLOC_CXX_FLAGS "-DROCKSDB_JEMALLOC -DJEMALLOC_EXPORT= ")
|
||||
|
||||
if(DEFINED ENV{JEMALLOC_INCLUDE})
|
||||
set(JEMALLOC_INCLUDE $ENV{JEMALLOC_INCLUDE})
|
||||
@ -248,16 +246,7 @@ if (${USE_JEMALLOC} EQUAL 1)
|
||||
set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${JEMALLOC_LIBS})
|
||||
set (ARTIFACT_SUFFIX "_je")
|
||||
|
||||
set(USE_JENONINIT USE_JENONINIT_DEFAULT)
|
||||
|
||||
if(JENONINIT)
|
||||
set(USE_JENONINIT ${JENONINIT})
|
||||
endif()
|
||||
|
||||
if(${USE_JENONINIT} EQUAL 1)
|
||||
add_definitions(-DJEMALLOC_NON_INIT)
|
||||
message(STATUS "JEMALLOC NONINIT version")
|
||||
endif()
|
||||
set(WITH_JEMALLOC ON)
|
||||
|
||||
else ()
|
||||
set (ARTIFACT_SUFFIX "")
|
||||
|
78
utilities/blob_db/blob_compaction_filter.h
Normal file
78
utilities/blob_db/blob_compaction_filter.h
Normal file
@ -0,0 +1,78 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "rocksdb/compaction_filter.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "utilities/blob_db/blob_index.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace blob_db {
|
||||
|
||||
// CompactionFilter to delete expired blob index from base DB.
|
||||
class BlobIndexCompactionFilter : public CompactionFilter {
|
||||
public:
|
||||
explicit BlobIndexCompactionFilter(uint64_t current_time)
|
||||
: current_time_(current_time) {}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "BlobIndexCompactionFilter";
|
||||
}
|
||||
|
||||
// Filter expired blob indexes regardless of snapshots.
|
||||
virtual bool IgnoreSnapshots() const override { return true; }
|
||||
|
||||
virtual Decision FilterV2(int /*level*/, const Slice& /*key*/,
|
||||
ValueType value_type, const Slice& value,
|
||||
std::string* /*new_value*/,
|
||||
std::string* /*skip_until*/) const override {
|
||||
if (value_type != kBlobIndex) {
|
||||
return Decision::kKeep;
|
||||
}
|
||||
BlobIndex blob_index;
|
||||
Status s = blob_index.DecodeFrom(value);
|
||||
if (!s.ok()) {
|
||||
// Unable to decode blob index. Keeping the value.
|
||||
return Decision::kKeep;
|
||||
}
|
||||
if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
|
||||
// Expired
|
||||
return Decision::kRemove;
|
||||
}
|
||||
return Decision::kKeep;
|
||||
}
|
||||
|
||||
private:
|
||||
const uint64_t current_time_;
|
||||
};
|
||||
|
||||
class BlobIndexCompactionFilterFactory : public CompactionFilterFactory {
|
||||
public:
|
||||
explicit BlobIndexCompactionFilterFactory(Env* env) : env_(env) {}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "BlobIndexCompactionFilterFactory";
|
||||
}
|
||||
|
||||
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
|
||||
const CompactionFilter::Context& /*context*/) override {
|
||||
int64_t current_time = 0;
|
||||
Status s = env_->GetCurrentTime(¤t_time);
|
||||
if (!s.ok()) {
|
||||
return nullptr;
|
||||
}
|
||||
assert(current_time >= 0);
|
||||
return std::unique_ptr<CompactionFilter>(
|
||||
new BlobIndexCompactionFilter(static_cast<uint64_t>(current_time)));
|
||||
}
|
||||
|
||||
private:
|
||||
Env* env_;
|
||||
};
|
||||
|
||||
} // namespace blob_db
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
@ -26,6 +26,7 @@
|
||||
#include "table/block_builder.h"
|
||||
#include "util/file_reader_writer.h"
|
||||
#include "util/filename.h"
|
||||
#include "utilities/blob_db/blob_compaction_filter.h"
|
||||
#include "utilities/blob_db/blob_db_impl.h"
|
||||
|
||||
namespace rocksdb {
|
||||
@ -45,6 +46,11 @@ Status BlobDB::OpenAndLoad(const Options& options,
|
||||
const BlobDBOptions& bdb_options,
|
||||
const std::string& dbname, BlobDB** blob_db,
|
||||
Options* changed_options) {
|
||||
if (options.compaction_filter != nullptr ||
|
||||
options.compaction_filter_factory != nullptr) {
|
||||
return Status::NotSupported("Blob DB doesn't support compaction filter.");
|
||||
}
|
||||
|
||||
*changed_options = options;
|
||||
*blob_db = nullptr;
|
||||
|
||||
@ -57,12 +63,18 @@ Status BlobDB::OpenAndLoad(const Options& options,
|
||||
{
|
||||
MutexLock l(&listener_mutex);
|
||||
all_blobdb_listeners.push_back(fblistener);
|
||||
all_blobdb_listeners.push_back(ce_listener);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
all_blobdb_listeners.push_back(ce_listener);
|
||||
}
|
||||
all_wal_filters.push_back(rw_filter);
|
||||
}
|
||||
|
||||
changed_options->compaction_filter_factory.reset(
|
||||
new BlobIndexCompactionFilterFactory(options.env));
|
||||
changed_options->listeners.emplace_back(fblistener);
|
||||
changed_options->listeners.emplace_back(ce_listener);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
changed_options->listeners.emplace_back(ce_listener);
|
||||
}
|
||||
changed_options->wal_filter = rw_filter.get();
|
||||
|
||||
DBOptions db_options(*changed_options);
|
||||
@ -71,7 +83,9 @@ Status BlobDB::OpenAndLoad(const Options& options,
|
||||
BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options);
|
||||
|
||||
fblistener->SetImplPtr(bdb);
|
||||
ce_listener->SetImplPtr(bdb);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
ce_listener->SetImplPtr(bdb);
|
||||
}
|
||||
rw_filter->SetImplPtr(bdb);
|
||||
|
||||
Status s = bdb->OpenPhase1();
|
||||
@ -106,6 +120,11 @@ Status BlobDB::Open(const DBOptions& db_options_input,
|
||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
std::vector<ColumnFamilyHandle*>* handles, BlobDB** blob_db,
|
||||
bool no_base_db) {
|
||||
if (column_families.size() != 1 ||
|
||||
column_families[0].name != kDefaultColumnFamilyName) {
|
||||
return Status::NotSupported(
|
||||
"Blob DB doesn't support non-default column family.");
|
||||
}
|
||||
*blob_db = nullptr;
|
||||
Status s;
|
||||
|
||||
@ -124,34 +143,52 @@ Status BlobDB::Open(const DBOptions& db_options_input,
|
||||
ReconcileWalFilter_t rw_filter = std::make_shared<BlobReconcileWalFilter>();
|
||||
|
||||
db_options.listeners.emplace_back(fblistener);
|
||||
db_options.listeners.emplace_back(ce_listener);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
db_options.listeners.emplace_back(ce_listener);
|
||||
}
|
||||
db_options.wal_filter = rw_filter.get();
|
||||
|
||||
{
|
||||
MutexLock l(&listener_mutex);
|
||||
all_blobdb_listeners.push_back(fblistener);
|
||||
all_blobdb_listeners.push_back(ce_listener);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
all_blobdb_listeners.push_back(ce_listener);
|
||||
}
|
||||
all_wal_filters.push_back(rw_filter);
|
||||
}
|
||||
|
||||
ColumnFamilyOptions cf_options(column_families[0].options);
|
||||
if (cf_options.compaction_filter != nullptr ||
|
||||
cf_options.compaction_filter_factory != nullptr) {
|
||||
return Status::NotSupported("Blob DB doesn't support compaction filter.");
|
||||
}
|
||||
cf_options.compaction_filter_factory.reset(
|
||||
new BlobIndexCompactionFilterFactory(db_options.env));
|
||||
ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options);
|
||||
|
||||
// we need to open blob db first so that recovery can happen
|
||||
BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options);
|
||||
fblistener->SetImplPtr(bdb);
|
||||
ce_listener->SetImplPtr(bdb);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
ce_listener->SetImplPtr(bdb);
|
||||
}
|
||||
rw_filter->SetImplPtr(bdb);
|
||||
|
||||
s = bdb->OpenPhase1();
|
||||
if (!s.ok()) {
|
||||
delete bdb;
|
||||
return s;
|
||||
}
|
||||
|
||||
if (no_base_db) {
|
||||
*blob_db = bdb;
|
||||
return s;
|
||||
}
|
||||
|
||||
DB* db = nullptr;
|
||||
s = DB::Open(db_options, dbname, column_families, handles, &db);
|
||||
s = DB::Open(db_options, dbname, {cf_descriptor}, handles, &db);
|
||||
if (!s.ok()) {
|
||||
delete bdb;
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -169,27 +206,27 @@ Status BlobDB::Open(const DBOptions& db_options_input,
|
||||
BlobDB::BlobDB(DB* db) : StackableDB(db) {}
|
||||
|
||||
void BlobDBOptions::Dump(Logger* log) const {
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir: %s",
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir: %s",
|
||||
blob_dir.c_str());
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.path_relative: %d",
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.path_relative: %d",
|
||||
path_relative);
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.is_fifo: %d",
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.is_fifo: %d",
|
||||
is_fifo);
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir_size: %" PRIu64,
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir_size: %" PRIu64,
|
||||
blob_dir_size);
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.ttl_range_secs: %" PRIu32,
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.ttl_range_secs: %" PRIu32,
|
||||
ttl_range_secs);
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.bytes_per_sync: %" PRIu64,
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.bytes_per_sync: %" PRIu64,
|
||||
bytes_per_sync);
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.blob_file_size: %" PRIu64,
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.blob_file_size: %" PRIu64,
|
||||
blob_file_size);
|
||||
ROCKS_LOG_HEADER(log, "blob_db_options.num_concurrent_simple_blobs: %" PRIu32,
|
||||
num_concurrent_simple_blobs);
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.ttl_extractor: %p",
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.ttl_extractor: %p",
|
||||
ttl_extractor.get());
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.compression: %d",
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.compression: %d",
|
||||
static_cast<int>(compression));
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.disable_background_tasks: %d",
|
||||
ROCKS_LOG_HEADER(log, "blob_db_options.enable_garbage_collection: %d",
|
||||
enable_garbage_collection);
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.disable_background_tasks: %d",
|
||||
disable_background_tasks);
|
||||
}
|
||||
|
||||
|
@ -52,6 +52,10 @@ struct BlobDBOptions {
|
||||
// and so on
|
||||
uint64_t ttl_range_secs = 3600;
|
||||
|
||||
// The smallest value to store in blob log. Values smaller than this threshold
|
||||
// will be inlined in base DB together with the key.
|
||||
uint64_t min_blob_size = 0;
|
||||
|
||||
// at what bytes will the blob files be synced to blob log.
|
||||
uint64_t bytes_per_sync = 0;
|
||||
|
||||
@ -59,24 +63,20 @@ struct BlobDBOptions {
|
||||
// after it exceeds that size
|
||||
uint64_t blob_file_size = 256 * 1024 * 1024;
|
||||
|
||||
// how many files to use for simple blobs at one time
|
||||
uint32_t num_concurrent_simple_blobs = 1;
|
||||
|
||||
// Instead of setting TTL explicitly by calling PutWithTTL or PutUntil,
|
||||
// applications can set a TTLExtractor which can extract TTL from key-value
|
||||
// pairs.
|
||||
std::shared_ptr<TTLExtractor> ttl_extractor = nullptr;
|
||||
|
||||
// eviction callback.
|
||||
// this function will be called for every blob that is getting
|
||||
// evicted.
|
||||
std::function<void(const ColumnFamilyHandle*, const Slice&, const Slice&)>
|
||||
gc_evict_cb_fn;
|
||||
|
||||
// what compression to use for Blob's
|
||||
CompressionType compression = kNoCompression;
|
||||
|
||||
// Disable all background job.
|
||||
// If enabled, blob DB periodically cleans up stale data by rewriting remaining
|
||||
// live data in blob files to new files. If garbage collection is not enabled,
|
||||
// blob files will be cleaned up based on TTL.
|
||||
bool enable_garbage_collection = false;
|
||||
|
||||
// Disable all background jobs. Used for test only.
|
||||
bool disable_background_tasks = false;
|
||||
|
||||
void Dump(Logger* log) const;
|
||||
@ -85,34 +85,55 @@ struct BlobDBOptions {
|
||||
class BlobDB : public StackableDB {
|
||||
public:
|
||||
using rocksdb::StackableDB::Put;
|
||||
|
||||
virtual Status Put(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value) override = 0;
|
||||
virtual Status Put(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) override = 0;
|
||||
const Slice& value) override {
|
||||
if (column_family != DefaultColumnFamily()) {
|
||||
return Status::NotSupported(
|
||||
"Blob DB doesn't support non-default column family.");
|
||||
}
|
||||
return Put(options, key, value);
|
||||
}
|
||||
|
||||
using rocksdb::StackableDB::Delete;
|
||||
virtual Status Delete(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) override = 0;
|
||||
|
||||
virtual Status PutWithTTL(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value, uint64_t ttl) = 0;
|
||||
virtual Status Delete(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) override {
|
||||
if (column_family != DefaultColumnFamily()) {
|
||||
return Status::NotSupported(
|
||||
"Blob DB doesn't support non-default column family.");
|
||||
}
|
||||
return Delete(options, key);
|
||||
}
|
||||
|
||||
virtual Status PutWithTTL(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value, uint64_t ttl) = 0;
|
||||
virtual Status PutWithTTL(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value, uint64_t ttl) {
|
||||
return PutWithTTL(options, DefaultColumnFamily(), key, value, ttl);
|
||||
if (column_family != DefaultColumnFamily()) {
|
||||
return Status::NotSupported(
|
||||
"Blob DB doesn't support non-default column family.");
|
||||
}
|
||||
return PutWithTTL(options, key, value, ttl);
|
||||
}
|
||||
|
||||
// Put with expiration. Key with expiration time equal to
|
||||
// std::numeric_limits<uint64_t>::max() means the key doesn't expire.
|
||||
virtual Status PutUntil(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value, uint64_t expiration) = 0;
|
||||
virtual Status PutUntil(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value, uint64_t expiration) = 0;
|
||||
|
||||
virtual Status PutUntil(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value, uint64_t expiration) {
|
||||
return PutUntil(options, DefaultColumnFamily(), key, value, expiration);
|
||||
if (column_family != DefaultColumnFamily()) {
|
||||
return Status::NotSupported(
|
||||
"Blob DB doesn't support non-default column family.");
|
||||
}
|
||||
return PutUntil(options, key, value, expiration);
|
||||
}
|
||||
|
||||
using rocksdb::StackableDB::Get;
|
||||
@ -123,25 +144,52 @@ class BlobDB : public StackableDB {
|
||||
using rocksdb::StackableDB::MultiGet;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) override = 0;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) override {
|
||||
for (auto column_family : column_families) {
|
||||
if (column_family != DefaultColumnFamily()) {
|
||||
return std::vector<Status>(
|
||||
column_families.size(),
|
||||
Status::NotSupported(
|
||||
"Blob DB doesn't support non-default column family."));
|
||||
}
|
||||
}
|
||||
return MultiGet(options, keys, values);
|
||||
}
|
||||
|
||||
using rocksdb::StackableDB::SingleDelete;
|
||||
virtual Status SingleDelete(const WriteOptions& wopts,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) override = 0;
|
||||
virtual Status SingleDelete(const WriteOptions& /*wopts*/,
|
||||
ColumnFamilyHandle* /*column_family*/,
|
||||
const Slice& /*key*/) override {
|
||||
return Status::NotSupported("Not supported operation in blob db.");
|
||||
}
|
||||
|
||||
using rocksdb::StackableDB::Merge;
|
||||
virtual Status Merge(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value) override {
|
||||
virtual Status Merge(const WriteOptions& /*options*/,
|
||||
ColumnFamilyHandle* /*column_family*/,
|
||||
const Slice& /*key*/, const Slice& /*value*/) override {
|
||||
return Status::NotSupported("Not supported operation in blob db.");
|
||||
}
|
||||
|
||||
virtual Status Write(const WriteOptions& opts,
|
||||
WriteBatch* updates) override = 0;
|
||||
|
||||
using rocksdb::StackableDB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& options) override = 0;
|
||||
virtual Iterator* NewIterator(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family) override {
|
||||
if (column_family != DefaultColumnFamily()) {
|
||||
// Blob DB doesn't support non-default column family.
|
||||
return nullptr;
|
||||
}
|
||||
return NewIterator(options);
|
||||
}
|
||||
|
||||
// Starting point for opening a Blob DB.
|
||||
// changed_options - critical. Blob DB loads and inserts listeners
|
||||
// into options which are necessary for recovery and atomicity
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -9,26 +9,27 @@
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <ctime>
|
||||
#include <limits>
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "db/db_iter.h"
|
||||
#include "rocksdb/compaction_filter.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/listener.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/wal_filter.h"
|
||||
#include "util/file_reader_writer.h"
|
||||
#include "util/mpsc.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/timer_queue.h"
|
||||
#include "utilities/blob_db/blob_db.h"
|
||||
#include "utilities/blob_db/blob_file.h"
|
||||
#include "utilities/blob_db/blob_log_format.h"
|
||||
#include "utilities/blob_db/blob_log_reader.h"
|
||||
#include "utilities/blob_db/blob_log_writer.h"
|
||||
@ -38,7 +39,6 @@ namespace rocksdb {
|
||||
class DBImpl;
|
||||
class ColumnFamilyHandle;
|
||||
class ColumnFamilyData;
|
||||
class OptimisticTransactionDBImpl;
|
||||
struct FlushJobInfo;
|
||||
|
||||
namespace blob_db {
|
||||
@ -205,45 +205,53 @@ class BlobDBImpl : public BlobDB {
|
||||
// how often to schedule check seq files period
|
||||
static constexpr uint32_t kCheckSeqFilesPeriodMillisecs = 10 * 1000;
|
||||
|
||||
using rocksdb::StackableDB::Put;
|
||||
Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
||||
const Slice& key, const Slice& value) override;
|
||||
// when should oldest file be evicted:
|
||||
// on reaching 90% of blob_dir_size
|
||||
static constexpr double kEvictOldestFileAtSize = 0.9;
|
||||
|
||||
using rocksdb::StackableDB::Delete;
|
||||
Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
||||
const Slice& key) override;
|
||||
using BlobDB::Put;
|
||||
Status Put(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value) override;
|
||||
|
||||
using rocksdb::StackableDB::SingleDelete;
|
||||
virtual Status SingleDelete(const WriteOptions& wopts,
|
||||
ColumnFamilyHandle* column_family,
|
||||
const Slice& key) override;
|
||||
using BlobDB::Delete;
|
||||
Status Delete(const WriteOptions& options, const Slice& key) override;
|
||||
|
||||
using rocksdb::StackableDB::Get;
|
||||
using BlobDB::Get;
|
||||
Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family,
|
||||
const Slice& key, PinnableSlice* value) override;
|
||||
|
||||
using rocksdb::StackableDB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& read_options,
|
||||
ColumnFamilyHandle* column_family) override;
|
||||
using BlobDB::NewIterator;
|
||||
virtual Iterator* NewIterator(const ReadOptions& read_options) override;
|
||||
|
||||
using rocksdb::StackableDB::MultiGet;
|
||||
using BlobDB::NewIterators;
|
||||
virtual Status NewIterators(
|
||||
const ReadOptions& read_options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||
std::vector<Iterator*>* iterators) override {
|
||||
return Status::NotSupported("Not implemented");
|
||||
}
|
||||
|
||||
using BlobDB::MultiGet;
|
||||
virtual std::vector<Status> MultiGet(
|
||||
const ReadOptions& read_options,
|
||||
const std::vector<ColumnFamilyHandle*>& column_family,
|
||||
const std::vector<Slice>& keys,
|
||||
std::vector<std::string>* values) override;
|
||||
|
||||
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
|
||||
|
||||
virtual Status GetLiveFiles(std::vector<std::string>&,
|
||||
uint64_t* manifest_file_size,
|
||||
bool flush_memtable = true) override;
|
||||
virtual void GetLiveFilesMetaData(
|
||||
std::vector<LiveFileMetaData>* ) override;
|
||||
|
||||
using BlobDB::PutWithTTL;
|
||||
Status PutWithTTL(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
Status PutWithTTL(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value, uint64_t ttl) override;
|
||||
|
||||
using BlobDB::PutUntil;
|
||||
Status PutUntil(const WriteOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& value_unc, uint64_t expiration) override;
|
||||
Status PutUntil(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value, uint64_t expiration) override;
|
||||
|
||||
Status LinkToBaseDB(DB* db) override;
|
||||
|
||||
@ -257,7 +265,8 @@ class BlobDBImpl : public BlobDB {
|
||||
~BlobDBImpl();
|
||||
|
||||
#ifndef NDEBUG
|
||||
Status TEST_GetSequenceNumber(const Slice& key, SequenceNumber* sequence);
|
||||
Status TEST_GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
PinnableSlice* value);
|
||||
|
||||
std::vector<std::shared_ptr<BlobFile>> TEST_GetBlobFiles() const;
|
||||
|
||||
@ -270,21 +279,21 @@ class BlobDBImpl : public BlobDB {
|
||||
|
||||
void TEST_RunGC();
|
||||
|
||||
void TEST_ObsoleteFile(std::shared_ptr<BlobFile>& bfile);
|
||||
|
||||
void TEST_DeleteObsoleteFiles();
|
||||
#endif // !NDEBUG
|
||||
|
||||
private:
|
||||
class GarbageCollectionWriteCallback;
|
||||
class BlobInserter;
|
||||
|
||||
Status OpenPhase1();
|
||||
|
||||
// Create a snapshot if there isn't one in read options.
|
||||
// Return true if a snapshot is created.
|
||||
bool SetSnapshotIfNeeded(ReadOptions* read_options);
|
||||
|
||||
Status CommonGet(const ColumnFamilyData* cfd, const Slice& key,
|
||||
const std::string& index_entry, std::string* value,
|
||||
SequenceNumber* sequence = nullptr);
|
||||
Status GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
PinnableSlice* value);
|
||||
|
||||
Slice GetCompressedSlice(const Slice& raw,
|
||||
std::string* compression_output) const;
|
||||
@ -298,7 +307,7 @@ class BlobDBImpl : public BlobDB {
|
||||
// tt - current time
|
||||
// last_id - the id of the non-TTL file to evict
|
||||
bool ShouldGCFile(std::shared_ptr<BlobFile> bfile, uint64_t now,
|
||||
bool is_oldest_simple_blob_file, std::string* reason);
|
||||
bool is_oldest_non_ttl_file, std::string* reason);
|
||||
|
||||
// collect all the blob log files from the blob directory
|
||||
Status GetAllLogFiles(std::set<std::pair<uint64_t, std::string>>* file_nums);
|
||||
@ -312,12 +321,14 @@ class BlobDBImpl : public BlobDB {
|
||||
uint64_t ExtractExpiration(const Slice& key, const Slice& value,
|
||||
Slice* value_slice, std::string* new_value);
|
||||
|
||||
Status PutBlobValue(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value, uint64_t expiration,
|
||||
SequenceNumber sequence, WriteBatch* batch);
|
||||
|
||||
Status AppendBlob(const std::shared_ptr<BlobFile>& bfile,
|
||||
const std::string& headerbuf, const Slice& key,
|
||||
const Slice& value, std::string* index_entry);
|
||||
|
||||
Status AppendSN(const std::shared_ptr<BlobFile>& bfile,
|
||||
const SequenceNumber& sn);
|
||||
const Slice& value, uint64_t expiration,
|
||||
std::string* index_entry);
|
||||
|
||||
// find an existing blob log file based on the expiration unix epoch
|
||||
// if such a file does not exist, return nullptr
|
||||
@ -328,8 +339,6 @@ class BlobDBImpl : public BlobDB {
|
||||
|
||||
std::shared_ptr<BlobFile> FindBlobFileLocked(uint64_t expiration) const;
|
||||
|
||||
void UpdateWriteOptions(const WriteOptions& options);
|
||||
|
||||
void Shutdown();
|
||||
|
||||
// periodic sanity check. Bunch of checks
|
||||
@ -363,14 +372,8 @@ class BlobDBImpl : public BlobDB {
|
||||
|
||||
std::pair<bool, int64_t> EvictCompacted(bool aborted);
|
||||
|
||||
bool CallbackEvictsImpl(std::shared_ptr<BlobFile> bfile);
|
||||
|
||||
std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
|
||||
|
||||
std::pair<bool, int64_t> CallbackEvicts(TimerQueue* tq,
|
||||
std::shared_ptr<BlobFile> bfile,
|
||||
bool aborted);
|
||||
|
||||
// Adds the background tasks to the timer queue
|
||||
void StartBackgroundTasks();
|
||||
|
||||
@ -406,6 +409,7 @@ class BlobDBImpl : public BlobDB {
|
||||
|
||||
// checks if there is no snapshot which is referencing the
|
||||
// blobs
|
||||
bool VisibleToActiveSnapshot(const std::shared_ptr<BlobFile>& file);
|
||||
bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
|
||||
|
||||
bool MarkBlobDeleted(const Slice& key, const Slice& lsmValue);
|
||||
@ -413,7 +417,9 @@ class BlobDBImpl : public BlobDB {
|
||||
bool FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
|
||||
uint64_t blob_offset, uint64_t blob_size);
|
||||
|
||||
void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy);
|
||||
void CopyBlobFiles(
|
||||
std::vector<std::shared_ptr<BlobFile>>* bfiles_copy,
|
||||
std::function<bool(const std::shared_ptr<BlobFile>&)> predicate = {});
|
||||
|
||||
void FilterSubsetOfFiles(
|
||||
const std::vector<std::shared_ptr<BlobFile>>& blob_files,
|
||||
@ -422,19 +428,17 @@ class BlobDBImpl : public BlobDB {
|
||||
|
||||
uint64_t EpochNow() { return env_->NowMicros() / 1000000; }
|
||||
|
||||
Status CheckSize(size_t blob_size);
|
||||
|
||||
std::shared_ptr<BlobFile> GetOldestBlobFile();
|
||||
|
||||
bool EvictOldestBlobFile();
|
||||
|
||||
// the base DB
|
||||
DBImpl* db_impl_;
|
||||
Env* env_;
|
||||
TTLExtractor* ttl_extractor_;
|
||||
|
||||
// Optimistic Transaction DB used during Garbage collection
|
||||
// for atomicity
|
||||
std::unique_ptr<OptimisticTransactionDBImpl> opt_db_;
|
||||
|
||||
// a boolean to capture whether write_options has been set
|
||||
std::atomic<bool> wo_set_;
|
||||
WriteOptions write_options_;
|
||||
|
||||
// the options that govern the behavior of Blob Storage
|
||||
BlobDBOptions bdb_options_;
|
||||
DBOptions db_options_;
|
||||
@ -468,12 +472,12 @@ class BlobDBImpl : public BlobDB {
|
||||
// epoch or version of the open files.
|
||||
std::atomic<uint64_t> epoch_of_;
|
||||
|
||||
// All opened non-TTL blob files.
|
||||
std::vector<std::shared_ptr<BlobFile>> open_simple_files_;
|
||||
// opened non-TTL blob file.
|
||||
std::shared_ptr<BlobFile> open_non_ttl_file_;
|
||||
|
||||
// all the blob files which are currently being appended to based
|
||||
// on variety of incoming TTL's
|
||||
std::multiset<std::shared_ptr<BlobFile>, blobf_compare_ttl> open_blob_files_;
|
||||
std::multiset<std::shared_ptr<BlobFile>, blobf_compare_ttl> open_ttl_files_;
|
||||
|
||||
// packet of information to put in lockless delete(s) queue
|
||||
struct delete_packet_t {
|
||||
@ -506,9 +510,6 @@ class BlobDBImpl : public BlobDB {
|
||||
// timer based queue to execute tasks
|
||||
TimerQueue tqueue_;
|
||||
|
||||
// timer queues to call eviction callbacks.
|
||||
std::vector<std::shared_ptr<TimerQueue>> cb_threads_;
|
||||
|
||||
// only accessed in GC thread, hence not atomic. The epoch of the
|
||||
// GC task. Each execution is one epoch. Helps us in allocating
|
||||
// files to one execution
|
||||
@ -536,223 +537,8 @@ class BlobDBImpl : public BlobDB {
|
||||
bool open_p1_done_;
|
||||
|
||||
uint32_t debug_level_;
|
||||
};
|
||||
|
||||
class BlobFile {
|
||||
friend class BlobDBImpl;
|
||||
friend struct blobf_compare_ttl;
|
||||
|
||||
private:
|
||||
// access to parent
|
||||
const BlobDBImpl* parent_;
|
||||
|
||||
// path to blob directory
|
||||
std::string path_to_dir_;
|
||||
|
||||
// the id of the file.
|
||||
// the above 2 are created during file creation and never changed
|
||||
// after that
|
||||
uint64_t file_number_;
|
||||
|
||||
// number of blobs in the file
|
||||
std::atomic<uint64_t> blob_count_;
|
||||
|
||||
// the file will be selected for GC in this future epoch
|
||||
std::atomic<int64_t> gc_epoch_;
|
||||
|
||||
// size of the file
|
||||
std::atomic<uint64_t> file_size_;
|
||||
|
||||
// number of blobs in this particular file which have been evicted
|
||||
uint64_t deleted_count_;
|
||||
|
||||
// size of deleted blobs (used by heuristic to select file for GC)
|
||||
uint64_t deleted_size_;
|
||||
|
||||
BlobLogHeader header_;
|
||||
|
||||
// closed_ = true implies the file is no more mutable
|
||||
// no more blobs will be appended and the footer has been written out
|
||||
std::atomic<bool> closed_;
|
||||
|
||||
// has a pass of garbage collection successfully finished on this file
|
||||
// can_be_deleted_ still needs to do iterator/snapshot checks
|
||||
std::atomic<bool> can_be_deleted_;
|
||||
|
||||
// should this file been gc'd once to reconcile lost deletes/compactions
|
||||
std::atomic<bool> gc_once_after_open_;
|
||||
|
||||
// et - lt of the blobs
|
||||
ttlrange_t ttl_range_;
|
||||
|
||||
// et - lt of the timestamp of the KV pairs.
|
||||
tsrange_t time_range_;
|
||||
|
||||
// ESN - LSN of the blobs
|
||||
snrange_t sn_range_;
|
||||
|
||||
// Sequential/Append writer for blobs
|
||||
std::shared_ptr<Writer> log_writer_;
|
||||
|
||||
// random access file reader for GET calls
|
||||
std::shared_ptr<RandomAccessFileReader> ra_file_reader_;
|
||||
|
||||
// This Read-Write mutex is per file specific and protects
|
||||
// all the datastructures
|
||||
mutable port::RWMutex mutex_;
|
||||
|
||||
// time when the random access reader was last created.
|
||||
std::atomic<std::time_t> last_access_;
|
||||
|
||||
// last time file was fsync'd/fdatasyncd
|
||||
std::atomic<uint64_t> last_fsync_;
|
||||
|
||||
bool header_valid_;
|
||||
|
||||
public:
|
||||
BlobFile();
|
||||
|
||||
BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum);
|
||||
|
||||
~BlobFile();
|
||||
|
||||
ColumnFamilyHandle* GetColumnFamily(DB* db);
|
||||
|
||||
// Returns log file's pathname relative to the main db dir
|
||||
// Eg. For a live-log-file = blob_dir/000003.blob
|
||||
std::string PathName() const;
|
||||
|
||||
// Primary identifier for blob file.
|
||||
// once the file is created, this never changes
|
||||
uint64_t BlobFileNumber() const { return file_number_; }
|
||||
|
||||
// the following functions are atomic, and don't need
|
||||
// read lock
|
||||
uint64_t BlobCount() const {
|
||||
return blob_count_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
std::string DumpState() const;
|
||||
|
||||
// if the file has gone through GC and blobs have been relocated
|
||||
bool Obsolete() const { return can_be_deleted_.load(); }
|
||||
|
||||
// if the file is not taking any more appends.
|
||||
bool Immutable() const { return closed_.load(); }
|
||||
|
||||
// we will assume this is atomic
|
||||
bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const;
|
||||
|
||||
uint64_t GetFileSize() const {
|
||||
return file_size_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
// All Get functions which are not atomic, will need ReadLock on the mutex
|
||||
tsrange_t GetTimeRange() const {
|
||||
assert(HasTimestamp());
|
||||
return time_range_;
|
||||
}
|
||||
|
||||
ttlrange_t GetTTLRange() const { return ttl_range_; }
|
||||
|
||||
snrange_t GetSNRange() const { return sn_range_; }
|
||||
|
||||
bool HasTTL() const {
|
||||
assert(header_valid_);
|
||||
return header_.HasTTL();
|
||||
}
|
||||
|
||||
bool HasTimestamp() const {
|
||||
assert(header_valid_);
|
||||
return header_.HasTimestamp();
|
||||
}
|
||||
|
||||
std::shared_ptr<Writer> GetWriter() const { return log_writer_; }
|
||||
|
||||
void Fsync();
|
||||
|
||||
private:
|
||||
std::shared_ptr<Reader> OpenSequentialReader(
|
||||
Env* env, const DBOptions& db_options,
|
||||
const EnvOptions& env_options) const;
|
||||
|
||||
Status ReadFooter(BlobLogFooter* footer);
|
||||
|
||||
Status WriteFooterAndCloseLocked();
|
||||
|
||||
std::shared_ptr<RandomAccessFileReader> GetOrOpenRandomAccessReader(
|
||||
Env* env, const EnvOptions& env_options, bool* fresh_open);
|
||||
|
||||
void CloseRandomAccessLocked();
|
||||
|
||||
// this is used, when you are reading only the footer of a
|
||||
// previously closed file
|
||||
Status SetFromFooterLocked(const BlobLogFooter& footer);
|
||||
|
||||
void set_time_range(const tsrange_t& tr) { time_range_ = tr; }
|
||||
|
||||
void set_ttl_range(const ttlrange_t& ttl) { ttl_range_ = ttl; }
|
||||
|
||||
void SetSNRange(const snrange_t& snr) { sn_range_ = snr; }
|
||||
|
||||
// The following functions are atomic, and don't need locks
|
||||
void SetFileSize(uint64_t fs) { file_size_ = fs; }
|
||||
|
||||
void SetBlobCount(uint64_t bc) { blob_count_ = bc; }
|
||||
|
||||
void SetCanBeDeleted() { can_be_deleted_ = true; }
|
||||
};
|
||||
|
||||
class BlobDBIterator : public Iterator {
|
||||
public:
|
||||
explicit BlobDBIterator(Iterator* iter, ColumnFamilyHandle* column_family,
|
||||
BlobDBImpl* impl, bool own_snapshot,
|
||||
const Snapshot* snapshot)
|
||||
: iter_(iter),
|
||||
cfh_(column_family),
|
||||
db_impl_(impl),
|
||||
own_snapshot_(own_snapshot),
|
||||
snapshot_(snapshot) {
|
||||
assert(iter != nullptr);
|
||||
assert(snapshot != nullptr);
|
||||
}
|
||||
|
||||
~BlobDBIterator() {
|
||||
if (own_snapshot_) {
|
||||
db_impl_->ReleaseSnapshot(snapshot_);
|
||||
}
|
||||
delete iter_;
|
||||
}
|
||||
|
||||
bool Valid() const override { return iter_->Valid(); }
|
||||
|
||||
void SeekToFirst() override { iter_->SeekToFirst(); }
|
||||
|
||||
void SeekToLast() override { iter_->SeekToLast(); }
|
||||
|
||||
void Seek(const Slice& target) override { iter_->Seek(target); }
|
||||
|
||||
void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); }
|
||||
|
||||
void Next() override { iter_->Next(); }
|
||||
|
||||
void Prev() override { iter_->Prev(); }
|
||||
|
||||
Slice key() const override { return iter_->key(); }
|
||||
|
||||
Slice value() const override;
|
||||
|
||||
Status status() const override { return iter_->status(); }
|
||||
|
||||
// Iterator::Refresh() not supported.
|
||||
|
||||
private:
|
||||
Iterator* iter_;
|
||||
ColumnFamilyHandle* cfh_;
|
||||
BlobDBImpl* db_impl_;
|
||||
bool own_snapshot_;
|
||||
const Snapshot* snapshot_;
|
||||
mutable std::string vpart_;
|
||||
std::atomic<bool> oldest_file_evicted_;
|
||||
};
|
||||
|
||||
} // namespace blob_db
|
||||
|
104
utilities/blob_db/blob_db_iterator.h
Normal file
104
utilities/blob_db/blob_db_iterator.h
Normal file
@ -0,0 +1,104 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "utilities/blob_db/blob_db_impl.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace blob_db {
|
||||
|
||||
using rocksdb::ManagedSnapshot;
|
||||
|
||||
class BlobDBIterator : public Iterator {
|
||||
public:
|
||||
BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter,
|
||||
BlobDBImpl* blob_db)
|
||||
: snapshot_(snapshot), iter_(iter), blob_db_(blob_db) {}
|
||||
|
||||
virtual ~BlobDBIterator() = default;
|
||||
|
||||
bool Valid() const override {
|
||||
if (!iter_->Valid()) {
|
||||
return false;
|
||||
}
|
||||
return status_.ok();
|
||||
}
|
||||
|
||||
Status status() const override {
|
||||
if (!iter_->status().ok()) {
|
||||
return iter_->status();
|
||||
}
|
||||
return status_;
|
||||
}
|
||||
|
||||
void SeekToFirst() override {
|
||||
iter_->SeekToFirst();
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void SeekToLast() override {
|
||||
iter_->SeekToLast();
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void Seek(const Slice& target) override {
|
||||
iter_->Seek(target);
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void SeekForPrev(const Slice& target) override {
|
||||
iter_->SeekForPrev(target);
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void Next() override {
|
||||
assert(Valid());
|
||||
iter_->Next();
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void Prev() override {
|
||||
assert(Valid());
|
||||
iter_->Prev();
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
Slice key() const override {
|
||||
assert(Valid());
|
||||
return iter_->key();
|
||||
}
|
||||
|
||||
Slice value() const override {
|
||||
assert(Valid());
|
||||
if (!iter_->IsBlob()) {
|
||||
return iter_->value();
|
||||
}
|
||||
return value_;
|
||||
}
|
||||
|
||||
// Iterator::Refresh() not supported.
|
||||
|
||||
private:
|
||||
void UpdateBlobValue() {
|
||||
TEST_SYNC_POINT("BlobDBIterator::UpdateBlobValue:Start:1");
|
||||
TEST_SYNC_POINT("BlobDBIterator::UpdateBlobValue:Start:2");
|
||||
value_.Reset();
|
||||
if (iter_->Valid() && iter_->IsBlob()) {
|
||||
status_ = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<ManagedSnapshot> snapshot_;
|
||||
std::unique_ptr<ArenaWrappedDBIter> iter_;
|
||||
BlobDBImpl* blob_db_;
|
||||
Status status_;
|
||||
PinnableSlice value_;
|
||||
};
|
||||
} // namespace blob_db
|
||||
} // namespace rocksdb
|
||||
#endif // !ROCKSDB_LITE
|
File diff suppressed because it is too large
Load Diff
@ -18,7 +18,6 @@
|
||||
#include "rocksdb/convenience.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
#include "util/string_util.h"
|
||||
|
||||
namespace rocksdb {
|
||||
@ -92,7 +91,7 @@ Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) {
|
||||
|
||||
Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset) {
|
||||
Slice slice;
|
||||
Status s = Read(0, BlobLogHeader::kHeaderSize, &slice);
|
||||
Status s = Read(0, BlobLogHeader::kSize, &slice);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
@ -102,20 +101,19 @@ Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset) {
|
||||
return s;
|
||||
}
|
||||
fprintf(stdout, "Blob log header:\n");
|
||||
fprintf(stdout, " Magic Number : %" PRIu32 "\n", header.magic_number());
|
||||
fprintf(stdout, " Version : %" PRIu32 "\n", header.version());
|
||||
CompressionType compression = header.compression();
|
||||
fprintf(stdout, " Version : %" PRIu32 "\n", header.version);
|
||||
fprintf(stdout, " Column Family ID : %" PRIu32 "\n",
|
||||
header.column_family_id);
|
||||
std::string compression_str;
|
||||
if (!GetStringFromCompressionType(&compression_str, compression).ok()) {
|
||||
if (!GetStringFromCompressionType(&compression_str, header.compression)
|
||||
.ok()) {
|
||||
compression_str = "Unrecongnized compression type (" +
|
||||
ToString((int)header.compression()) + ")";
|
||||
ToString((int)header.compression) + ")";
|
||||
}
|
||||
fprintf(stdout, " Compression : %s\n", compression_str.c_str());
|
||||
fprintf(stdout, " TTL Range : %s\n",
|
||||
GetString(header.ttl_range()).c_str());
|
||||
fprintf(stdout, " Timestamp Range: %s\n",
|
||||
GetString(header.ts_range()).c_str());
|
||||
*offset = BlobLogHeader::kHeaderSize;
|
||||
fprintf(stdout, " Compression : %s\n", compression_str.c_str());
|
||||
fprintf(stdout, " Expiration range : %s\n",
|
||||
GetString(header.expiration_range).c_str());
|
||||
*offset = BlobLogHeader::kSize;
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -126,20 +124,12 @@ Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size,
|
||||
fprintf(stdout, "No blob log footer.\n");
|
||||
return Status::OK();
|
||||
};
|
||||
if (file_size < BlobLogHeader::kHeaderSize + BlobLogFooter::kFooterSize) {
|
||||
if (file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) {
|
||||
return no_footer();
|
||||
}
|
||||
Slice slice;
|
||||
Status s = Read(file_size - 4, 4, &slice);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
uint32_t magic_number = DecodeFixed32(slice.data());
|
||||
if (magic_number != kMagicNumber) {
|
||||
return no_footer();
|
||||
}
|
||||
*footer_offset = file_size - BlobLogFooter::kFooterSize;
|
||||
s = Read(*footer_offset, BlobLogFooter::kFooterSize, &slice);
|
||||
*footer_offset = file_size - BlobLogFooter::kSize;
|
||||
Status s = Read(*footer_offset, BlobLogFooter::kSize, &slice);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
@ -149,13 +139,11 @@ Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size,
|
||||
return s;
|
||||
}
|
||||
fprintf(stdout, "Blob log footer:\n");
|
||||
fprintf(stdout, " Blob count : %" PRIu64 "\n", footer.GetBlobCount());
|
||||
fprintf(stdout, " TTL Range : %s\n",
|
||||
GetString(footer.GetTTLRange()).c_str());
|
||||
fprintf(stdout, " Time Range : %s\n",
|
||||
GetString(footer.GetTimeRange()).c_str());
|
||||
fprintf(stdout, " Sequence Range : %s\n",
|
||||
GetString(footer.GetSNRange()).c_str());
|
||||
fprintf(stdout, " Blob count : %" PRIu64 "\n", footer.blob_count);
|
||||
fprintf(stdout, " Expiration Range : %s\n",
|
||||
GetString(footer.expiration_range).c_str());
|
||||
fprintf(stdout, " Sequence Range : %s\n",
|
||||
GetString(footer.sequence_range).c_str());
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -173,49 +161,25 @@ Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
uint32_t key_size = record.GetKeySize();
|
||||
uint64_t blob_size = record.GetBlobSize();
|
||||
fprintf(stdout, " key size : %" PRIu32 "\n", key_size);
|
||||
fprintf(stdout, " blob size : %" PRIu64 "\n", record.GetBlobSize());
|
||||
fprintf(stdout, " TTL : %" PRIu64 "\n", record.GetTTL());
|
||||
fprintf(stdout, " time : %" PRIu64 "\n", record.GetTimeVal());
|
||||
fprintf(stdout, " type : %d, %d\n", record.type(), record.subtype());
|
||||
fprintf(stdout, " header CRC : %" PRIu32 "\n", record.header_checksum());
|
||||
fprintf(stdout, " CRC : %" PRIu32 "\n", record.checksum());
|
||||
uint32_t header_crc =
|
||||
crc32c::Extend(0, slice.data(), slice.size() - 2 * sizeof(uint32_t));
|
||||
uint64_t key_size = record.key_size;
|
||||
uint64_t value_size = record.value_size;
|
||||
fprintf(stdout, " key size : %" PRIu64 "\n", key_size);
|
||||
fprintf(stdout, " value size : %" PRIu64 "\n", value_size);
|
||||
fprintf(stdout, " expiration : %" PRIu64 "\n", record.expiration);
|
||||
*offset += BlobLogRecord::kHeaderSize;
|
||||
s = Read(*offset, key_size + blob_size + BlobLogRecord::kFooterSize, &slice);
|
||||
s = Read(*offset, key_size + value_size, &slice);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
header_crc = crc32c::Extend(header_crc, slice.data(), key_size);
|
||||
header_crc = crc32c::Mask(header_crc);
|
||||
if (header_crc != record.header_checksum()) {
|
||||
return Status::Corruption("Record header checksum mismatch.");
|
||||
}
|
||||
uint32_t blob_crc = crc32c::Extend(0, slice.data() + key_size, blob_size);
|
||||
blob_crc = crc32c::Mask(blob_crc);
|
||||
if (blob_crc != record.checksum()) {
|
||||
return Status::Corruption("Blob checksum mismatch.");
|
||||
}
|
||||
if (show_key != DisplayType::kNone) {
|
||||
fprintf(stdout, " key : ");
|
||||
DumpSlice(Slice(slice.data(), key_size), show_key);
|
||||
if (show_blob != DisplayType::kNone) {
|
||||
fprintf(stdout, " blob : ");
|
||||
DumpSlice(Slice(slice.data() + key_size, blob_size), show_blob);
|
||||
DumpSlice(Slice(slice.data() + key_size, value_size), show_blob);
|
||||
}
|
||||
}
|
||||
Slice footer_slice(slice.data() + record.GetKeySize() + record.GetBlobSize(),
|
||||
BlobLogRecord::kFooterSize);
|
||||
s = record.DecodeFooterFrom(footer_slice);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
fprintf(stdout, " footer CRC : %" PRIu32 "\n", record.footer_checksum());
|
||||
fprintf(stdout, " sequence : %" PRIu64 "\n", record.GetSN());
|
||||
*offset += key_size + blob_size + BlobLogRecord::kFooterSize;
|
||||
*offset += key_size + value_size;
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -3,15 +3,24 @@
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
#ifndef ROCKSDB_LITE
|
||||
#include "utilities/blob_db/blob_file.h"
|
||||
|
||||
#ifndef __STDC_FORMAT_MACROS
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#endif
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <chrono>
|
||||
#include <cinttypes>
|
||||
#include <memory>
|
||||
#include "utilities/blob_db/blob_db_impl.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
|
||||
#include "db/column_family.h"
|
||||
#include "db/db_impl.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "util/filename.h"
|
||||
#include "util/logging.h"
|
||||
#include "utilities/blob_db/blob_db_impl.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -20,17 +29,18 @@ namespace blob_db {
|
||||
BlobFile::BlobFile()
|
||||
: parent_(nullptr),
|
||||
file_number_(0),
|
||||
has_ttl_(false),
|
||||
compression_(kNoCompression),
|
||||
blob_count_(0),
|
||||
gc_epoch_(-1),
|
||||
file_size_(0),
|
||||
deleted_count_(0),
|
||||
deleted_size_(0),
|
||||
closed_(false),
|
||||
can_be_deleted_(false),
|
||||
obsolete_(false),
|
||||
gc_once_after_open_(false),
|
||||
ttl_range_(std::make_pair(0, 0)),
|
||||
time_range_(std::make_pair(0, 0)),
|
||||
sn_range_(std::make_pair(0, 0)),
|
||||
expiration_range_({0, 0}),
|
||||
sequence_range_({kMaxSequenceNumber, 0}),
|
||||
last_access_(-1),
|
||||
last_fsync_(0),
|
||||
header_valid_(false) {}
|
||||
@ -39,23 +49,24 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn)
|
||||
: parent_(p),
|
||||
path_to_dir_(bdir),
|
||||
file_number_(fn),
|
||||
has_ttl_(false),
|
||||
compression_(kNoCompression),
|
||||
blob_count_(0),
|
||||
gc_epoch_(-1),
|
||||
file_size_(0),
|
||||
deleted_count_(0),
|
||||
deleted_size_(0),
|
||||
closed_(false),
|
||||
can_be_deleted_(false),
|
||||
obsolete_(false),
|
||||
gc_once_after_open_(false),
|
||||
ttl_range_(std::make_pair(0, 0)),
|
||||
time_range_(std::make_pair(0, 0)),
|
||||
sn_range_(std::make_pair(0, 0)),
|
||||
expiration_range_({0, 0}),
|
||||
sequence_range_({kMaxSequenceNumber, 0}),
|
||||
last_access_(-1),
|
||||
last_fsync_(0),
|
||||
header_valid_(false) {}
|
||||
|
||||
BlobFile::~BlobFile() {
|
||||
if (can_be_deleted_) {
|
||||
if (obsolete_) {
|
||||
std::string pn(PathName());
|
||||
Status s = Env::Default()->DeleteFile(PathName());
|
||||
if (!s.ok()) {
|
||||
@ -65,6 +76,13 @@ BlobFile::~BlobFile() {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t BlobFile::column_family_id() const {
|
||||
// TODO(yiwu): Should return column family id encoded in blob file after
|
||||
// we add blob db column family support.
|
||||
return reinterpret_cast<ColumnFamilyHandle*>(parent_->DefaultColumnFamily())
|
||||
->GetID();
|
||||
}
|
||||
|
||||
std::string BlobFile::PathName() const {
|
||||
return BlobFileName(path_to_dir_, file_number_);
|
||||
}
|
||||
@ -94,16 +112,21 @@ std::string BlobFile::DumpState() const {
|
||||
"path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " gc_epoch: %" PRIu64
|
||||
" file_size: %" PRIu64 " deleted_count: %" PRIu64
|
||||
" deleted_size: %" PRIu64
|
||||
" closed: %d can_be_deleted: %d ttl_range: (%" PRIu64 ", %" PRIu64
|
||||
") sn_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d",
|
||||
" closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64
|
||||
") sequence_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d",
|
||||
path_to_dir_.c_str(), file_number_, blob_count_.load(),
|
||||
gc_epoch_.load(), file_size_.load(), deleted_count_, deleted_size_,
|
||||
closed_.load(), can_be_deleted_.load(), ttl_range_.first,
|
||||
ttl_range_.second, sn_range_.first, sn_range_.second,
|
||||
(!!log_writer_), (!!ra_file_reader_));
|
||||
closed_.load(), obsolete_.load(), expiration_range_.first,
|
||||
expiration_range_.second, sequence_range_.first,
|
||||
sequence_range_.second, (!!log_writer_), (!!ra_file_reader_));
|
||||
return str;
|
||||
}
|
||||
|
||||
// Flags the file as obsolete and remembers the sequence number at which that
// happened.
void BlobFile::MarkObsolete(SequenceNumber sequence) {
  // Record the sequence first, then raise the flag: GetObsoleteSequence()
  // asserts Obsolete() before it reads the sequence, so the value has to be
  // in place by the time the flag becomes visible.
  obsolete_sequence_ = sequence;
  obsolete_ = true;
}
|
||||
|
||||
bool BlobFile::NeedsFsync(bool hard, uint64_t bytes_per_sync) const {
|
||||
assert(last_fsync_ <= file_size_);
|
||||
return (hard) ? file_size_ > last_fsync_
|
||||
@ -115,17 +138,18 @@ Status BlobFile::WriteFooterAndCloseLocked() {
|
||||
"File is being closed after footer %s", PathName().c_str());
|
||||
|
||||
BlobLogFooter footer;
|
||||
footer.blob_count_ = blob_count_;
|
||||
if (HasTTL()) footer.set_ttl_range(ttl_range_);
|
||||
footer.blob_count = blob_count_;
|
||||
if (HasTTL()) {
|
||||
footer.expiration_range = expiration_range_;
|
||||
}
|
||||
|
||||
footer.sn_range_ = sn_range_;
|
||||
if (HasTimestamp()) footer.set_time_range(time_range_);
|
||||
footer.sequence_range = sequence_range_;
|
||||
|
||||
// this will close the file and reset the Writable File Pointer.
|
||||
Status s = log_writer_->AppendFooter(footer);
|
||||
if (s.ok()) {
|
||||
closed_ = true;
|
||||
file_size_ += BlobLogFooter::kFooterSize;
|
||||
file_size_ += BlobLogFooter::kSize;
|
||||
} else {
|
||||
ROCKS_LOG_ERROR(parent_->db_options_.info_log,
|
||||
"Failure to read Header for blob-file %s",
|
||||
@ -137,20 +161,20 @@ Status BlobFile::WriteFooterAndCloseLocked() {
|
||||
}
|
||||
|
||||
Status BlobFile::ReadFooter(BlobLogFooter* bf) {
|
||||
if (file_size_ < (BlobLogHeader::kHeaderSize + BlobLogFooter::kFooterSize)) {
|
||||
if (file_size_ < (BlobLogHeader::kSize + BlobLogFooter::kSize)) {
|
||||
return Status::IOError("File does not have footer", PathName());
|
||||
}
|
||||
|
||||
uint64_t footer_offset = file_size_ - BlobLogFooter::kFooterSize;
|
||||
uint64_t footer_offset = file_size_ - BlobLogFooter::kSize;
|
||||
// assume that ra_file_reader_ is valid before we enter this
|
||||
assert(ra_file_reader_);
|
||||
|
||||
Slice result;
|
||||
char scratch[BlobLogFooter::kFooterSize + 10];
|
||||
Status s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kFooterSize,
|
||||
&result, scratch);
|
||||
char scratch[BlobLogFooter::kSize + 10];
|
||||
Status s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kSize, &result,
|
||||
scratch);
|
||||
if (!s.ok()) return s;
|
||||
if (result.size() != BlobLogFooter::kFooterSize) {
|
||||
if (result.size() != BlobLogFooter::kSize) {
|
||||
// should not happen
|
||||
return Status::IOError("EOF reached before footer");
|
||||
}
|
||||
@ -160,21 +184,12 @@ Status BlobFile::ReadFooter(BlobLogFooter* bf) {
|
||||
}
|
||||
|
||||
Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) {
|
||||
if (footer.HasTTL() != header_.HasTTL()) {
|
||||
return Status::Corruption("has_ttl mismatch");
|
||||
}
|
||||
if (footer.HasTimestamp() != header_.HasTimestamp()) {
|
||||
return Status::Corruption("has_ts mismatch");
|
||||
}
|
||||
|
||||
// assume that file has been fully fsync'd
|
||||
last_fsync_.store(file_size_);
|
||||
blob_count_ = footer.GetBlobCount();
|
||||
ttl_range_ = footer.GetTTLRange();
|
||||
time_range_ = footer.GetTimeRange();
|
||||
sn_range_ = footer.GetSNRange();
|
||||
blob_count_ = footer.blob_count;
|
||||
expiration_range_ = footer.expiration_range;
|
||||
sequence_range_ = footer.sequence_range;
|
||||
closed_ = true;
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -193,8 +208,10 @@ void BlobFile::CloseRandomAccessLocked() {
|
||||
std::shared_ptr<RandomAccessFileReader> BlobFile::GetOrOpenRandomAccessReader(
|
||||
Env* env, const EnvOptions& env_options, bool* fresh_open) {
|
||||
*fresh_open = false;
|
||||
last_access_ =
|
||||
std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
|
||||
int64_t current_time = 0;
|
||||
env->GetCurrentTime(¤t_time);
|
||||
last_access_.store(current_time);
|
||||
|
||||
{
|
||||
ReadLock lockbfile_r(&mutex_);
|
||||
if (ra_file_reader_) return ra_file_reader_;
|
||||
@ -220,10 +237,6 @@ std::shared_ptr<RandomAccessFileReader> BlobFile::GetOrOpenRandomAccessReader(
|
||||
return ra_file_reader_;
|
||||
}
|
||||
|
||||
ColumnFamilyHandle* BlobFile::GetColumnFamily(DB* db) {
|
||||
return db->DefaultColumnFamily();
|
||||
}
|
||||
|
||||
} // namespace blob_db
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
|
216
utilities/blob_db/blob_file.h
Normal file
216
utilities/blob_db/blob_file.h
Normal file
@ -0,0 +1,216 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <atomic>
|
||||
#include <memory>
|
||||
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "util/file_reader_writer.h"
|
||||
#include "utilities/blob_db/blob_log_format.h"
|
||||
#include "utilities/blob_db/blob_log_reader.h"
|
||||
#include "utilities/blob_db/blob_log_writer.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace blob_db {
|
||||
|
||||
class BlobDBImpl;
|
||||
|
||||
class BlobFile {
|
||||
friend class BlobDBImpl;
|
||||
friend struct blobf_compare_ttl;
|
||||
|
||||
private:
|
||||
// access to parent
|
||||
const BlobDBImpl* parent_;
|
||||
|
||||
// path to blob directory
|
||||
std::string path_to_dir_;
|
||||
|
||||
// the id of the file.
|
||||
// the above 2 are created during file creation and never changed
|
||||
// after that
|
||||
uint64_t file_number_;
|
||||
|
||||
// If true, the keys in this file all has TTL. Otherwise all keys don't
|
||||
// have TTL.
|
||||
bool has_ttl_;
|
||||
|
||||
// Compression type of blobs in the file
|
||||
CompressionType compression_;
|
||||
|
||||
// number of blobs in the file
|
||||
std::atomic<uint64_t> blob_count_;
|
||||
|
||||
// the file will be selected for GC in this future epoch
|
||||
std::atomic<int64_t> gc_epoch_;
|
||||
|
||||
// size of the file
|
||||
std::atomic<uint64_t> file_size_;
|
||||
|
||||
// number of blobs in this particular file which have been evicted
|
||||
uint64_t deleted_count_;
|
||||
|
||||
// size of deleted blobs (used by heuristic to select file for GC)
|
||||
uint64_t deleted_size_;
|
||||
|
||||
BlobLogHeader header_;
|
||||
|
||||
// closed_ = true implies the file is no more mutable
|
||||
// no more blobs will be appended and the footer has been written out
|
||||
std::atomic<bool> closed_;
|
||||
|
||||
// has a pass of garbage collection successfully finished on this file
|
||||
// obsolete_ still needs to do iterator/snapshot checks
|
||||
std::atomic<bool> obsolete_;
|
||||
|
||||
// The last sequence number by the time the file marked as obsolete.
|
||||
// Data in this file is visible to a snapshot taken before the sequence.
|
||||
SequenceNumber obsolete_sequence_;
|
||||
|
||||
// should this file been gc'd once to reconcile lost deletes/compactions
|
||||
std::atomic<bool> gc_once_after_open_;
|
||||
|
||||
ExpirationRange expiration_range_;
|
||||
|
||||
SequenceRange sequence_range_;
|
||||
|
||||
// Sequential/Append writer for blobs
|
||||
std::shared_ptr<Writer> log_writer_;
|
||||
|
||||
// random access file reader for GET calls
|
||||
std::shared_ptr<RandomAccessFileReader> ra_file_reader_;
|
||||
|
||||
// This Read-Write mutex is per file specific and protects
|
||||
// all the datastructures
|
||||
mutable port::RWMutex mutex_;
|
||||
|
||||
// time when the random access reader was last created.
|
||||
std::atomic<std::int64_t> last_access_;
|
||||
|
||||
// last time file was fsync'd/fdatasyncd
|
||||
std::atomic<uint64_t> last_fsync_;
|
||||
|
||||
bool header_valid_;
|
||||
|
||||
SequenceNumber garbage_collection_finish_sequence_;
|
||||
|
||||
public:
|
||||
BlobFile();
|
||||
|
||||
BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum);
|
||||
|
||||
~BlobFile();
|
||||
|
||||
uint32_t column_family_id() const;
|
||||
|
||||
// Returns log file's pathname relative to the main db dir
|
||||
// Eg. For a live-log-file = blob_dir/000003.blob
|
||||
std::string PathName() const;
|
||||
|
||||
// Primary identifier for blob file.
|
||||
// once the file is created, this never changes
|
||||
uint64_t BlobFileNumber() const { return file_number_; }
|
||||
|
||||
// the following functions are atomic, and don't need
|
||||
// read lock
|
||||
uint64_t BlobCount() const {
|
||||
return blob_count_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
std::string DumpState() const;
|
||||
|
||||
// if the file has gone through GC and blobs have been relocated
|
||||
bool Obsolete() const {
|
||||
assert(Immutable() || !obsolete_.load());
|
||||
return obsolete_.load();
|
||||
}
|
||||
|
||||
// Mark file as obsolete by garbage collection. The file is not visible to
|
||||
// snapshots with sequence greater or equal to the given sequence.
|
||||
void MarkObsolete(SequenceNumber sequence);
|
||||
|
||||
SequenceNumber GetObsoleteSequence() const {
|
||||
assert(Obsolete());
|
||||
return obsolete_sequence_;
|
||||
}
|
||||
|
||||
// if the file is not taking any more appends.
|
||||
bool Immutable() const { return closed_.load(); }
|
||||
|
||||
// we will assume this is atomic
|
||||
bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const;
|
||||
|
||||
void Fsync();
|
||||
|
||||
uint64_t GetFileSize() const {
|
||||
return file_size_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
// All Get functions which are not atomic, will need ReadLock on the mutex
|
||||
|
||||
ExpirationRange GetExpirationRange() const { return expiration_range_; }
|
||||
|
||||
void ExtendExpirationRange(uint64_t expiration) {
|
||||
expiration_range_.first = std::min(expiration_range_.first, expiration);
|
||||
expiration_range_.second = std::max(expiration_range_.second, expiration);
|
||||
}
|
||||
|
||||
SequenceRange GetSequenceRange() const { return sequence_range_; }
|
||||
|
||||
void SetSequenceRange(SequenceRange sequence_range) {
|
||||
sequence_range_ = sequence_range;
|
||||
}
|
||||
|
||||
void ExtendSequenceRange(SequenceNumber sequence) {
|
||||
sequence_range_.first = std::min(sequence_range_.first, sequence);
|
||||
sequence_range_.second = std::max(sequence_range_.second, sequence);
|
||||
}
|
||||
|
||||
bool HasTTL() const { return has_ttl_; }
|
||||
|
||||
void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; }
|
||||
|
||||
CompressionType compression() const { return compression_; }
|
||||
|
||||
void SetCompression(CompressionType c) {
|
||||
compression_ = c;
|
||||
}
|
||||
|
||||
std::shared_ptr<Writer> GetWriter() const { return log_writer_; }
|
||||
|
||||
private:
|
||||
std::shared_ptr<Reader> OpenSequentialReader(
|
||||
Env* env, const DBOptions& db_options,
|
||||
const EnvOptions& env_options) const;
|
||||
|
||||
Status ReadFooter(BlobLogFooter* footer);
|
||||
|
||||
Status WriteFooterAndCloseLocked();
|
||||
|
||||
std::shared_ptr<RandomAccessFileReader> GetOrOpenRandomAccessReader(
|
||||
Env* env, const EnvOptions& env_options, bool* fresh_open);
|
||||
|
||||
void CloseRandomAccessLocked();
|
||||
|
||||
// this is used, when you are reading only the footer of a
|
||||
// previously closed file
|
||||
Status SetFromFooterLocked(const BlobLogFooter& footer);
|
||||
|
||||
void set_expiration_range(const ExpirationRange& expiration_range) {
|
||||
expiration_range_ = expiration_range;
|
||||
}
|
||||
|
||||
// The following functions are atomic, and don't need locks
|
||||
void SetFileSize(uint64_t fs) { file_size_ = fs; }
|
||||
|
||||
void SetBlobCount(uint64_t bc) { blob_count_ = bc; }
|
||||
};
|
||||
} // namespace blob_db
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
161
utilities/blob_db/blob_index.h
Normal file
161
utilities/blob_db/blob_index.h
Normal file
@ -0,0 +1,161 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "rocksdb/options.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/string_util.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace blob_db {
|
||||
|
||||
// BlobIndex is a pointer to the blob and metadata of the blob. The index is
|
||||
// stored in base DB as ValueType::kTypeBlobIndex.
|
||||
// There are three types of blob index:
|
||||
//
|
||||
// kInlinedTTL:
|
||||
// +------+------------+---------------+
|
||||
// | type | expiration | value |
|
||||
// +------+------------+---------------+
|
||||
// | char | varint64 | variable size |
|
||||
// +------+------------+---------------+
|
||||
//
|
||||
// kBlob:
|
||||
// +------+-------------+----------+----------+-------------+
|
||||
// | type | file number | offset | size | compression |
|
||||
// +------+-------------+----------+----------+-------------+
|
||||
// | char | varint64 | varint64 | varint64 | char |
|
||||
// +------+-------------+----------+----------+-------------+
|
||||
//
|
||||
// kBlobTTL:
|
||||
// +------+------------+-------------+----------+----------+-------------+
|
||||
// | type | expiration | file number | offset | size | compression |
|
||||
// +------+------------+-------------+----------+----------+-------------+
|
||||
// | char | varint64 | varint64 | varint64 | varint64 | char |
|
||||
// +------+------------+-------------+----------+----------+-------------+
|
||||
//
|
||||
// There isn't a kInlined (without TTL) type since we can store it as a plain
|
||||
// value (i.e. ValueType::kTypeValue).
|
||||
class BlobIndex {
|
||||
public:
|
||||
enum class Type : unsigned char {
|
||||
kInlinedTTL = 0,
|
||||
kBlob = 1,
|
||||
kBlobTTL = 2,
|
||||
kUnknown = 3,
|
||||
};
|
||||
|
||||
BlobIndex() : type_(Type::kUnknown) {}
|
||||
|
||||
bool IsInlined() const { return type_ == Type::kInlinedTTL; }
|
||||
|
||||
bool HasTTL() const {
|
||||
return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL;
|
||||
}
|
||||
|
||||
uint64_t expiration() const {
|
||||
assert(HasTTL());
|
||||
return expiration_;
|
||||
}
|
||||
|
||||
const Slice& value() const {
|
||||
assert(IsInlined());
|
||||
return value_;
|
||||
}
|
||||
|
||||
uint64_t file_number() const {
|
||||
assert(!IsInlined());
|
||||
return file_number_;
|
||||
}
|
||||
|
||||
uint64_t offset() const {
|
||||
assert(!IsInlined());
|
||||
return offset_;
|
||||
}
|
||||
|
||||
uint64_t size() const {
|
||||
assert(!IsInlined());
|
||||
return size_;
|
||||
}
|
||||
|
||||
Status DecodeFrom(Slice slice) {
|
||||
static const std::string kErrorMessage = "Error while decoding blob index";
|
||||
assert(slice.size() > 0);
|
||||
type_ = static_cast<Type>(*slice.data());
|
||||
if (type_ >= Type::kUnknown) {
|
||||
return Status::Corruption(
|
||||
kErrorMessage,
|
||||
"Unknown blob index type: " + ToString(static_cast<char>(type_)));
|
||||
}
|
||||
slice = Slice(slice.data() + 1, slice.size() - 1);
|
||||
if (HasTTL()) {
|
||||
if (!GetVarint64(&slice, &expiration_)) {
|
||||
return Status::Corruption(kErrorMessage, "Corrupted expiration");
|
||||
}
|
||||
}
|
||||
if (IsInlined()) {
|
||||
value_ = slice;
|
||||
} else {
|
||||
if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) &&
|
||||
GetVarint64(&slice, &size_) && slice.size() == 1) {
|
||||
compression_ = static_cast<CompressionType>(*slice.data());
|
||||
} else {
|
||||
return Status::Corruption(kErrorMessage, "Corrupted blob offset");
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
|
||||
const Slice& value) {
|
||||
assert(dst != nullptr);
|
||||
dst->clear();
|
||||
dst->reserve(1 + kMaxVarint64Length + value.size());
|
||||
dst->push_back(static_cast<char>(Type::kInlinedTTL));
|
||||
PutVarint64(dst, expiration);
|
||||
dst->append(value.data(), value.size());
|
||||
}
|
||||
|
||||
static void EncodeBlob(std::string* dst, uint64_t file_number,
|
||||
uint64_t offset, uint64_t size,
|
||||
CompressionType compression) {
|
||||
assert(dst != nullptr);
|
||||
dst->clear();
|
||||
dst->reserve(kMaxVarint64Length * 3 + 2);
|
||||
dst->push_back(static_cast<char>(Type::kBlob));
|
||||
PutVarint64(dst, file_number);
|
||||
PutVarint64(dst, offset);
|
||||
PutVarint64(dst, size);
|
||||
dst->push_back(static_cast<char>(compression));
|
||||
}
|
||||
|
||||
static void EncodeBlobTTL(std::string* dst, uint64_t expiration,
|
||||
uint64_t file_number, uint64_t offset,
|
||||
uint64_t size, CompressionType compression) {
|
||||
assert(dst != nullptr);
|
||||
dst->clear();
|
||||
dst->reserve(kMaxVarint64Length * 4 + 2);
|
||||
dst->push_back(static_cast<char>(Type::kBlobTTL));
|
||||
PutVarint64(dst, expiration);
|
||||
PutVarint64(dst, file_number);
|
||||
PutVarint64(dst, offset);
|
||||
PutVarint64(dst, size);
|
||||
dst->push_back(static_cast<char>(compression));
|
||||
}
|
||||
|
||||
private:
|
||||
Type type_ = Type::kUnknown;
|
||||
uint64_t expiration_ = 0;
|
||||
Slice value_;
|
||||
uint64_t file_number_ = 0;
|
||||
uint64_t offset_ = 0;
|
||||
uint64_t size_ = 0;
|
||||
CompressionType compression_ = kNoCompression;
|
||||
};
|
||||
|
||||
} // namespace blob_db
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
@ -6,310 +6,145 @@
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "utilities/blob_db/blob_log_format.h"
|
||||
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace blob_db {
|
||||
|
||||
const uint32_t kMagicNumber = 2395959;
|
||||
const uint32_t kVersion1 = 1;
|
||||
const size_t kBlockSize = 32768;
|
||||
|
||||
BlobLogHeader::BlobLogHeader()
|
||||
: magic_number_(kMagicNumber), compression_(kNoCompression) {}
|
||||
|
||||
BlobLogHeader& BlobLogHeader::operator=(BlobLogHeader&& in) noexcept {
|
||||
if (this != &in) {
|
||||
magic_number_ = in.magic_number_;
|
||||
version_ = in.version_;
|
||||
ttl_guess_ = std::move(in.ttl_guess_);
|
||||
ts_guess_ = std::move(in.ts_guess_);
|
||||
compression_ = in.compression_;
|
||||
}
|
||||
return *this;
|
||||
void BlobLogHeader::EncodeTo(std::string* dst) {
|
||||
assert(dst != nullptr);
|
||||
dst->clear();
|
||||
dst->reserve(BlobLogHeader::kSize);
|
||||
PutFixed32(dst, kMagicNumber);
|
||||
PutFixed32(dst, version);
|
||||
PutFixed32(dst, column_family_id);
|
||||
unsigned char flags = (has_ttl ? 1 : 0);
|
||||
dst->push_back(flags);
|
||||
dst->push_back(compression);
|
||||
PutFixed64(dst, expiration_range.first);
|
||||
PutFixed64(dst, expiration_range.second);
|
||||
}
|
||||
|
||||
BlobLogFooter::BlobLogFooter() : magic_number_(kMagicNumber), blob_count_(0) {}
|
||||
|
||||
Status BlobLogFooter::DecodeFrom(const Slice& input) {
|
||||
Slice slice(input);
|
||||
uint32_t val;
|
||||
if (!GetFixed32(&slice, &val)) {
|
||||
return Status::Corruption("Invalid Blob Footer: flags");
|
||||
Status BlobLogHeader::DecodeFrom(Slice src) {
|
||||
static const std::string kErrorMessage =
|
||||
"Error while decoding blob log header";
|
||||
if (src.size() != BlobLogHeader::kSize) {
|
||||
return Status::Corruption(kErrorMessage,
|
||||
"Unexpected blob file header size");
|
||||
}
|
||||
|
||||
bool has_ttl = false;
|
||||
bool has_ts = false;
|
||||
val >>= 8;
|
||||
RecordSubType st = static_cast<RecordSubType>(val);
|
||||
switch (st) {
|
||||
case kRegularType:
|
||||
break;
|
||||
case kTTLType:
|
||||
has_ttl = true;
|
||||
break;
|
||||
case kTimestampType:
|
||||
has_ts = true;
|
||||
break;
|
||||
default:
|
||||
return Status::Corruption("Invalid Blob Footer: flags_val");
|
||||
uint32_t magic_number;
|
||||
unsigned char flags;
|
||||
if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) ||
|
||||
!GetFixed32(&src, &column_family_id)) {
|
||||
return Status::Corruption(
|
||||
kErrorMessage,
|
||||
"Error decoding magic number, version and column family id");
|
||||
}
|
||||
|
||||
if (!GetFixed64(&slice, &blob_count_)) {
|
||||
return Status::Corruption("Invalid Blob Footer: blob_count");
|
||||
if (magic_number != kMagicNumber) {
|
||||
return Status::Corruption(kErrorMessage, "Magic number mismatch");
|
||||
}
|
||||
|
||||
ttlrange_t temp_ttl;
|
||||
if (!GetFixed64(&slice, &temp_ttl.first) ||
|
||||
!GetFixed64(&slice, &temp_ttl.second)) {
|
||||
return Status::Corruption("Invalid Blob Footer: ttl_range");
|
||||
if (version != kVersion1) {
|
||||
return Status::Corruption(kErrorMessage, "Unknown header version");
|
||||
}
|
||||
if (has_ttl) {
|
||||
ttl_range_.reset(new ttlrange_t(temp_ttl));
|
||||
flags = src.data()[0];
|
||||
compression = static_cast<CompressionType>(src.data()[1]);
|
||||
has_ttl = (flags & 1) == 1;
|
||||
src.remove_prefix(2);
|
||||
if (!GetFixed64(&src, &expiration_range.first) ||
|
||||
!GetFixed64(&src, &expiration_range.second)) {
|
||||
return Status::Corruption(kErrorMessage, "Error decoding expiration range");
|
||||
}
|
||||
|
||||
if (!GetFixed64(&slice, &sn_range_.first) ||
|
||||
!GetFixed64(&slice, &sn_range_.second)) {
|
||||
return Status::Corruption("Invalid Blob Footer: sn_range");
|
||||
}
|
||||
|
||||
tsrange_t temp_ts;
|
||||
if (!GetFixed64(&slice, &temp_ts.first) ||
|
||||
!GetFixed64(&slice, &temp_ts.second)) {
|
||||
return Status::Corruption("Invalid Blob Footer: ts_range");
|
||||
}
|
||||
if (has_ts) {
|
||||
ts_range_.reset(new tsrange_t(temp_ts));
|
||||
}
|
||||
|
||||
if (!GetFixed32(&slice, &magic_number_) || magic_number_ != kMagicNumber) {
|
||||
return Status::Corruption("Invalid Blob Footer: magic");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void BlobLogFooter::EncodeTo(std::string* dst) const {
|
||||
dst->reserve(kFooterSize);
|
||||
|
||||
RecordType rt = kFullType;
|
||||
RecordSubType st = kRegularType;
|
||||
if (HasTTL()) {
|
||||
st = kTTLType;
|
||||
} else if (HasTimestamp()) {
|
||||
st = kTimestampType;
|
||||
}
|
||||
uint32_t val = static_cast<uint32_t>(rt) | (static_cast<uint32_t>(st) << 8);
|
||||
PutFixed32(dst, val);
|
||||
|
||||
PutFixed64(dst, blob_count_);
|
||||
bool has_ttl = HasTTL();
|
||||
bool has_ts = HasTimestamp();
|
||||
|
||||
if (has_ttl) {
|
||||
PutFixed64(dst, ttl_range_.get()->first);
|
||||
PutFixed64(dst, ttl_range_.get()->second);
|
||||
} else {
|
||||
PutFixed64(dst, 0);
|
||||
PutFixed64(dst, 0);
|
||||
}
|
||||
PutFixed64(dst, sn_range_.first);
|
||||
PutFixed64(dst, sn_range_.second);
|
||||
|
||||
if (has_ts) {
|
||||
PutFixed64(dst, ts_range_.get()->first);
|
||||
PutFixed64(dst, ts_range_.get()->second);
|
||||
} else {
|
||||
PutFixed64(dst, 0);
|
||||
PutFixed64(dst, 0);
|
||||
}
|
||||
|
||||
PutFixed32(dst, magic_number_);
|
||||
// Serializes this footer into *dst, discarding whatever *dst held before.
//
// Fields are emitted in this order: magic number (Fixed32), blob count,
// expiration range and sequence range (Fixed64 each), and finally a checksum
// (Fixed32) computed by crc32c::Value over all bytes written so far and then
// passed through crc32c::Mask.
//
// Side effect: the computed checksum is stored in the `crc` member.
// dst must not be null.
void BlobLogFooter::EncodeTo(std::string* dst) {
  assert(dst != nullptr);

  dst->clear();
  dst->reserve(BlobLogFooter::kSize);

  PutFixed32(dst, kMagicNumber);
  PutFixed64(dst, blob_count);
  PutFixed64(dst, expiration_range.first);
  PutFixed64(dst, expiration_range.second);
  PutFixed64(dst, sequence_range.first);
  PutFixed64(dst, sequence_range.second);

  // The checksum covers everything emitted above and is appended last.
  crc = crc32c::Mask(crc32c::Value(dst->c_str(), dst->size()));
  PutFixed32(dst, crc);
}
|
||||
|
||||
void BlobLogHeader::EncodeTo(std::string* dst) const {
|
||||
dst->reserve(kHeaderSize);
|
||||
|
||||
PutFixed32(dst, magic_number_);
|
||||
|
||||
PutFixed32(dst, version_);
|
||||
|
||||
RecordSubType st = kRegularType;
|
||||
bool has_ttl = HasTTL();
|
||||
bool has_ts = HasTimestamp();
|
||||
|
||||
if (has_ttl) {
|
||||
st = kTTLType;
|
||||
} else if (has_ts) {
|
||||
st = kTimestampType;
|
||||
Status BlobLogFooter::DecodeFrom(Slice src) {
|
||||
static const std::string kErrorMessage =
|
||||
"Error while decoding blob log footer";
|
||||
if (src.size() != BlobLogFooter::kSize) {
|
||||
return Status::Corruption(kErrorMessage,
|
||||
"Unexpected blob file footer size");
|
||||
}
|
||||
uint32_t val =
|
||||
static_cast<uint32_t>(st) | (static_cast<uint32_t>(compression_) << 8);
|
||||
PutFixed32(dst, val);
|
||||
|
||||
if (has_ttl) {
|
||||
PutFixed64(dst, ttl_guess_.get()->first);
|
||||
PutFixed64(dst, ttl_guess_.get()->second);
|
||||
} else {
|
||||
PutFixed64(dst, 0);
|
||||
PutFixed64(dst, 0);
|
||||
uint32_t src_crc = 0;
|
||||
src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - 4);
|
||||
src_crc = crc32c::Mask(src_crc);
|
||||
uint32_t magic_number;
|
||||
if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) ||
|
||||
!GetFixed64(&src, &expiration_range.first) ||
|
||||
!GetFixed64(&src, &expiration_range.second) ||
|
||||
!GetFixed64(&src, &sequence_range.first) ||
|
||||
!GetFixed64(&src, &sequence_range.second) || !GetFixed32(&src, &crc)) {
|
||||
return Status::Corruption(kErrorMessage, "Error decoding content");
|
||||
}
|
||||
|
||||
if (has_ts) {
|
||||
PutFixed64(dst, ts_guess_.get()->first);
|
||||
PutFixed64(dst, ts_guess_.get()->second);
|
||||
} else {
|
||||
PutFixed64(dst, 0);
|
||||
PutFixed64(dst, 0);
|
||||
if (magic_number != kMagicNumber) {
|
||||
return Status::Corruption(kErrorMessage, "Magic number mismatch");
|
||||
}
|
||||
}
|
||||
|
||||
Status BlobLogHeader::DecodeFrom(const Slice& input) {
|
||||
Slice slice(input);
|
||||
if (!GetFixed32(&slice, &magic_number_) || magic_number_ != kMagicNumber) {
|
||||
return Status::Corruption("Invalid Blob Log Header: magic");
|
||||
if (src_crc != crc) {
|
||||
return Status::Corruption(kErrorMessage, "CRC mismatch");
|
||||
}
|
||||
|
||||
// as of today, we only support 1 version
|
||||
if (!GetFixed32(&slice, &version_) || version_ != kVersion1) {
|
||||
return Status::Corruption("Invalid Blob Log Header: version");
|
||||
}
|
||||
|
||||
uint32_t val;
|
||||
if (!GetFixed32(&slice, &val)) {
|
||||
return Status::Corruption("Invalid Blob Log Header: subtype");
|
||||
}
|
||||
|
||||
bool has_ttl = false;
|
||||
bool has_ts = false;
|
||||
RecordSubType st = static_cast<RecordSubType>(val & 0xff);
|
||||
compression_ = static_cast<CompressionType>((val >> 8) & 0xff);
|
||||
switch (st) {
|
||||
case kRegularType:
|
||||
break;
|
||||
case kTTLType:
|
||||
has_ttl = true;
|
||||
break;
|
||||
case kTimestampType:
|
||||
has_ts = true;
|
||||
break;
|
||||
default:
|
||||
return Status::Corruption("Invalid Blob Log Header: subtype_2");
|
||||
}
|
||||
|
||||
ttlrange_t temp_ttl;
|
||||
if (!GetFixed64(&slice, &temp_ttl.first) ||
|
||||
!GetFixed64(&slice, &temp_ttl.second)) {
|
||||
return Status::Corruption("Invalid Blob Log Header: ttl");
|
||||
}
|
||||
if (has_ttl) {
|
||||
set_ttl_guess(temp_ttl);
|
||||
}
|
||||
|
||||
tsrange_t temp_ts;
|
||||
if (!GetFixed64(&slice, &temp_ts.first) ||
|
||||
!GetFixed64(&slice, &temp_ts.second)) {
|
||||
return Status::Corruption("Invalid Blob Log Header: timestamp");
|
||||
}
|
||||
if (has_ts) set_ts_guess(temp_ts);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Default constructor: zero-initializes every scalar field of the record.
//
// footer_cksum_ is included so that footer_checksum() never reads an
// indeterminate value on a record whose footer has not been decoded yet.
// Initializers are listed in member declaration order.
BlobLogRecord::BlobLogRecord()
    : checksum_(0),
      header_cksum_(0),
      key_size_(0),
      blob_size_(0),
      time_val_(0),
      ttl_val_(0),
      sn_(0),
      footer_cksum_(0),
      type_(0),
      subtype_(0) {}
|
||||
|
||||
// Nothing to release explicitly; members clean up after themselves.
BlobLogRecord::~BlobLogRecord() = default;
|
||||
|
||||
void BlobLogRecord::ResizeKeyBuffer(size_t kbs) {
|
||||
if (kbs > key_buffer_.size()) {
|
||||
key_buffer_.resize(kbs);
|
||||
}
|
||||
// Serializes the record header into *dst, discarding whatever *dst held
// before.
//
// Fields are emitted in this order: key length, value length, expiration
// (Fixed64 each), header checksum, blob checksum (Fixed32 each).
//   - header checksum: crc32c over the three Fixed64 fields just written,
//     passed through crc32c::Mask.
//   - blob checksum: crc32c over the key bytes extended with the value bytes,
//     passed through crc32c::Mask.
//
// Side effects: header_crc and blob_crc are updated with the computed values.
// dst must not be null.
void BlobLogRecord::EncodeHeaderTo(std::string* dst) {
  assert(dst != nullptr);

  dst->clear();
  // Capacity is requested for header + key + value although only the header
  // is appended here (presumably so the caller can append key and value
  // without a reallocation -- confirm against the writer).
  dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size());

  PutFixed64(dst, key.size());
  PutFixed64(dst, value.size());
  PutFixed64(dst, expiration);

  header_crc = crc32c::Mask(crc32c::Value(dst->c_str(), dst->size()));
  PutFixed32(dst, header_crc);

  blob_crc = crc32c::Mask(crc32c::Extend(
      crc32c::Value(key.data(), key.size()), value.data(), value.size()));
  PutFixed32(dst, blob_crc);
}
|
||||
|
||||
void BlobLogRecord::ResizeBlobBuffer(size_t bbs) {
|
||||
if (bbs > blob_buffer_.size()) {
|
||||
blob_buffer_.resize(bbs);
|
||||
Status BlobLogRecord::DecodeHeaderFrom(Slice src) {
|
||||
static const std::string kErrorMessage = "Error while decoding blob record";
|
||||
if (src.size() != BlobLogRecord::kHeaderSize) {
|
||||
return Status::Corruption(kErrorMessage,
|
||||
"Unexpected blob record header size");
|
||||
}
|
||||
}
|
||||
|
||||
void BlobLogRecord::Clear() {
|
||||
checksum_ = 0;
|
||||
header_cksum_ = 0;
|
||||
key_size_ = 0;
|
||||
blob_size_ = 0;
|
||||
time_val_ = 0;
|
||||
ttl_val_ = 0;
|
||||
sn_ = 0;
|
||||
type_ = subtype_ = 0;
|
||||
key_.clear();
|
||||
blob_.clear();
|
||||
}
|
||||
|
||||
Status BlobLogRecord::DecodeHeaderFrom(const Slice& hdrslice) {
|
||||
Slice input = hdrslice;
|
||||
if (input.size() < kHeaderSize) {
|
||||
return Status::Corruption("Invalid Blob Record Header: size");
|
||||
uint32_t src_crc = 0;
|
||||
src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8);
|
||||
src_crc = crc32c::Mask(src_crc);
|
||||
if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) ||
|
||||
!GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) ||
|
||||
!GetFixed32(&src, &blob_crc)) {
|
||||
return Status::Corruption(kErrorMessage, "Error decoding content");
|
||||
}
|
||||
|
||||
if (!GetFixed32(&input, &key_size_)) {
|
||||
return Status::Corruption("Invalid Blob Record Header: key_size");
|
||||
if (src_crc != header_crc) {
|
||||
return Status::Corruption(kErrorMessage, "Header CRC mismatch");
|
||||
}
|
||||
if (!GetFixed64(&input, &blob_size_)) {
|
||||
return Status::Corruption("Invalid Blob Record Header: blob_size");
|
||||
}
|
||||
if (!GetFixed64(&input, &ttl_val_)) {
|
||||
return Status::Corruption("Invalid Blob Record Header: ttl_val");
|
||||
}
|
||||
if (!GetFixed64(&input, &time_val_)) {
|
||||
return Status::Corruption("Invalid Blob Record Header: time_val");
|
||||
}
|
||||
|
||||
type_ = *(input.data());
|
||||
input.remove_prefix(1);
|
||||
subtype_ = *(input.data());
|
||||
input.remove_prefix(1);
|
||||
|
||||
if (!GetFixed32(&input, &header_cksum_)) {
|
||||
return Status::Corruption("Invalid Blob Record Header: header_cksum");
|
||||
}
|
||||
if (!GetFixed32(&input, &checksum_)) {
|
||||
return Status::Corruption("Invalid Blob Record Header: checksum");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status BlobLogRecord::DecodeFooterFrom(const Slice& footerslice) {
|
||||
Slice input = footerslice;
|
||||
if (input.size() < kFooterSize) {
|
||||
return Status::Corruption("Invalid Blob Record Footer: size");
|
||||
Status BlobLogRecord::CheckBlobCRC() const {
|
||||
uint32_t expected_crc = 0;
|
||||
expected_crc = crc32c::Value(key.data(), key.size());
|
||||
expected_crc = crc32c::Extend(expected_crc, value.data(), value.size());
|
||||
expected_crc = crc32c::Mask(expected_crc);
|
||||
if (expected_crc != blob_crc) {
|
||||
return Status::Corruption("Blob CRC mismatch");
|
||||
}
|
||||
|
||||
uint32_t f_crc = crc32c::Extend(0, input.data(), 8);
|
||||
f_crc = crc32c::Mask(f_crc);
|
||||
|
||||
if (!GetFixed64(&input, &sn_)) {
|
||||
return Status::Corruption("Invalid Blob Record Footer: sn");
|
||||
}
|
||||
|
||||
if (!GetFixed32(&input, &footer_cksum_)) {
|
||||
return Status::Corruption("Invalid Blob Record Footer: cksum");
|
||||
}
|
||||
|
||||
if (f_crc != footer_cksum_) {
|
||||
return Status::Corruption("Record Checksum mismatch: footer_cksum");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -9,253 +9,113 @@
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/types.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace blob_db {
|
||||
class BlobFile;
|
||||
class BlobDBImpl;
|
||||
|
||||
constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37
|
||||
constexpr uint32_t kVersion1 = 1;
|
||||
constexpr uint64_t kNoExpiration = std::numeric_limits<uint64_t>::max();
|
||||
|
||||
enum RecordType : uint8_t {
|
||||
// Zero is reserved for preallocated files
|
||||
kFullType = 0,
|
||||
using ExpirationRange = std::pair<uint64_t, uint64_t>;
|
||||
using SequenceRange = std::pair<uint64_t, uint64_t>;
|
||||
|
||||
// For fragments
|
||||
kFirstType = 1,
|
||||
kMiddleType = 2,
|
||||
kLastType = 3,
|
||||
kMaxRecordType = kLastType
|
||||
// Format of blob log file header (30 bytes):
|
||||
//
|
||||
// +--------------+---------+---------+-------+-------------+-------------------+
|
||||
// | magic number | version | cf id | flags | compression | expiration range |
|
||||
// +--------------+---------+---------+-------+-------------+-------------------+
|
||||
// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 |
|
||||
// +--------------+---------+---------+-------+-------------+-------------------+
|
||||
//
|
||||
// List of flags:
|
||||
// has_ttl: Whether the file contains TTL data.
|
||||
//
|
||||
// Expiration range in the header is a rough range based on
|
||||
// blob_db_options.ttl_range_secs.
|
||||
struct BlobLogHeader {
|
||||
static constexpr size_t kSize = 30;
|
||||
|
||||
uint32_t version = kVersion1;
|
||||
uint32_t column_family_id = 0;
|
||||
CompressionType compression = kNoCompression;
|
||||
bool has_ttl = false;
|
||||
ExpirationRange expiration_range = std::make_pair(0, 0);
|
||||
|
||||
void EncodeTo(std::string* dst);
|
||||
|
||||
Status DecodeFrom(Slice slice);
|
||||
};
|
||||
|
||||
enum RecordSubType : uint8_t {
|
||||
kRegularType = 0,
|
||||
kTTLType = 1,
|
||||
kTimestampType = 2,
|
||||
// Format of blob log file footer (48 bytes):
|
||||
//
|
||||
// +--------------+------------+-------------------+-------------------+------------+
|
||||
// | magic number | blob count | expiration range | sequence range | footer CRC |
|
||||
// +--------------+------------+-------------------+-------------------+------------+
|
||||
// | Fixed32 | Fixed64 | Fixed64 + Fixed64 | Fixed64 + Fixed64 | Fixed32 |
|
||||
// +--------------+------------+-------------------+-------------------+------------+
|
||||
//
|
||||
// The footer is present only when the blob file is properly closed.
|
||||
//
|
||||
// Unlike the same field in file header, expiration range in the footer is the
|
||||
// range of smallest and largest expiration of the data in this file.
|
||||
struct BlobLogFooter {
|
||||
static constexpr size_t kSize = 48;
|
||||
|
||||
uint64_t blob_count = 0;
|
||||
ExpirationRange expiration_range = std::make_pair(0, 0);
|
||||
SequenceRange sequence_range = std::make_pair(0, 0);
|
||||
uint32_t crc = 0;
|
||||
|
||||
void EncodeTo(std::string* dst);
|
||||
|
||||
Status DecodeFrom(Slice slice);
|
||||
};
|
||||
|
||||
extern const uint32_t kMagicNumber;
|
||||
// Blob record format (32 bytes header + key + value):
|
||||
//
|
||||
// +------------+--------------+------------+------------+----------+---------+-----------+
|
||||
// | key length | value length | expiration | header CRC | blob CRC | key | value |
|
||||
// +------------+--------------+------------+------------+----------+---------+-----------+
|
||||
// | Fixed64 | Fixed64 | Fixed64 | Fixed32 | Fixed32 | key len | value len |
|
||||
// +------------+--------------+------------+------------+----------+---------+-----------+
|
||||
//
|
||||
// If file has has_ttl = false, expiration field is always 0, and the blob
|
||||
// doesn't have an expiration.
|
||||
//
|
||||
// Also note that if compression is used, value is compressed value and value
|
||||
// length is compressed value length.
|
||||
//
|
||||
// Header CRC is the checksum of (key_len + val_len + expiration), while
|
||||
// blob CRC is the checksum of (key + value).
|
||||
//
|
||||
// We could use variable length encoding (Varint64) to save more space, but it
|
||||
// makes the reader more complicated.
|
||||
struct BlobLogRecord {
|
||||
// header include fields up to blob CRC
|
||||
static constexpr size_t kHeaderSize = 32;
|
||||
|
||||
class Reader;
|
||||
uint64_t key_size = 0;
|
||||
uint64_t value_size = 0;
|
||||
uint64_t expiration = 0;
|
||||
uint32_t header_crc = 0;
|
||||
uint32_t blob_crc = 0;
|
||||
Slice key;
|
||||
Slice value;
|
||||
std::string key_buf;
|
||||
std::string value_buf;
|
||||
|
||||
using ttlrange_t = std::pair<uint64_t, uint64_t>;
|
||||
using tsrange_t = std::pair<uint64_t, uint64_t>;
|
||||
using snrange_t = std::pair<rocksdb::SequenceNumber, rocksdb::SequenceNumber>;
|
||||
void EncodeHeaderTo(std::string* dst);
|
||||
|
||||
class BlobLogHeader {
|
||||
friend class BlobFile;
|
||||
friend class BlobDBImpl;
|
||||
Status DecodeHeaderFrom(Slice src);
|
||||
|
||||
private:
|
||||
uint32_t magic_number_ = 0;
|
||||
uint32_t version_ = 1;
|
||||
CompressionType compression_;
|
||||
std::unique_ptr<ttlrange_t> ttl_guess_;
|
||||
std::unique_ptr<tsrange_t> ts_guess_;
|
||||
|
||||
private:
|
||||
void set_ttl_guess(const ttlrange_t& ttl) {
|
||||
ttl_guess_.reset(new ttlrange_t(ttl));
|
||||
}
|
||||
|
||||
void set_version(uint32_t v) { version_ = v; }
|
||||
|
||||
void set_ts_guess(const tsrange_t& ts) { ts_guess_.reset(new tsrange_t(ts)); }
|
||||
|
||||
public:
|
||||
// magic number + version + flags + ttl guess + timestamp range = 44
|
||||
static const size_t kHeaderSize = 4 + 4 + 4 + 8 * 2 + 8 * 2;
|
||||
|
||||
void EncodeTo(std::string* dst) const;
|
||||
|
||||
Status DecodeFrom(const Slice& input);
|
||||
|
||||
BlobLogHeader();
|
||||
|
||||
uint32_t magic_number() const { return magic_number_; }
|
||||
|
||||
uint32_t version() const { return version_; }
|
||||
|
||||
CompressionType compression() const { return compression_; }
|
||||
|
||||
ttlrange_t ttl_range() const {
|
||||
if (!ttl_guess_) {
|
||||
return {0, 0};
|
||||
}
|
||||
return *ttl_guess_;
|
||||
}
|
||||
|
||||
tsrange_t ts_range() const {
|
||||
if (!ts_guess_) {
|
||||
return {0, 0};
|
||||
}
|
||||
return *ts_guess_;
|
||||
}
|
||||
|
||||
bool HasTTL() const { return ttl_guess_ != nullptr; }
|
||||
|
||||
bool HasTimestamp() const { return ts_guess_ != nullptr; }
|
||||
|
||||
BlobLogHeader& operator=(BlobLogHeader&& in) noexcept;
|
||||
};
|
||||
|
||||
// Footer encapsulates the fixed information stored at the tail
|
||||
// end of every blob log file.
|
||||
class BlobLogFooter {
|
||||
friend class BlobFile;
|
||||
|
||||
public:
|
||||
// Use this constructor when you plan to write out the footer using
|
||||
// EncodeTo(). Never use this constructor with DecodeFrom().
|
||||
BlobLogFooter();
|
||||
|
||||
uint32_t magic_number() const { return magic_number_; }
|
||||
|
||||
void EncodeTo(std::string* dst) const;
|
||||
|
||||
Status DecodeFrom(const Slice& input);
|
||||
|
||||
// convert this object to a human readable form
|
||||
std::string ToString() const;
|
||||
|
||||
// footer size = 4 byte magic number
|
||||
// 8 bytes count
|
||||
// 8, 8 - ttl range
|
||||
// 8, 8 - sn range
|
||||
// 8, 8 - ts range
|
||||
// = 64
|
||||
static const size_t kFooterSize = 4 + 4 + 8 + (8 * 2) + (8 * 2) + (8 * 2);
|
||||
|
||||
bool HasTTL() const { return !!ttl_range_; }
|
||||
|
||||
bool HasTimestamp() const { return !!ts_range_; }
|
||||
|
||||
uint64_t GetBlobCount() const { return blob_count_; }
|
||||
|
||||
ttlrange_t GetTTLRange() const {
|
||||
if (ttl_range_) {
|
||||
*ttl_range_;
|
||||
}
|
||||
return {0, 0};
|
||||
}
|
||||
|
||||
tsrange_t GetTimeRange() const {
|
||||
if (ts_range_) {
|
||||
return *ts_range_;
|
||||
}
|
||||
return {0, 0};
|
||||
}
|
||||
|
||||
const snrange_t& GetSNRange() const { return sn_range_; }
|
||||
|
||||
private:
|
||||
uint32_t magic_number_ = 0;
|
||||
uint64_t blob_count_ = 0;
|
||||
|
||||
std::unique_ptr<ttlrange_t> ttl_range_;
|
||||
std::unique_ptr<tsrange_t> ts_range_;
|
||||
snrange_t sn_range_;
|
||||
|
||||
private:
|
||||
void set_ttl_range(const ttlrange_t& ttl) {
|
||||
ttl_range_.reset(new ttlrange_t(ttl));
|
||||
}
|
||||
void set_time_range(const tsrange_t& ts) {
|
||||
ts_range_.reset(new tsrange_t(ts));
|
||||
}
|
||||
};
|
||||
|
||||
extern const size_t kBlockSize;
|
||||
|
||||
class BlobLogRecord {
|
||||
friend class Reader;
|
||||
|
||||
private:
|
||||
// this might not be set.
|
||||
uint32_t checksum_;
|
||||
uint32_t header_cksum_;
|
||||
uint32_t key_size_;
|
||||
uint64_t blob_size_;
|
||||
uint64_t time_val_;
|
||||
uint64_t ttl_val_;
|
||||
SequenceNumber sn_;
|
||||
uint32_t footer_cksum_;
|
||||
char type_;
|
||||
char subtype_;
|
||||
Slice key_;
|
||||
Slice blob_;
|
||||
std::string key_buffer_;
|
||||
std::string blob_buffer_;
|
||||
|
||||
private:
|
||||
void Clear();
|
||||
|
||||
char* GetKeyBuffer() { return &(key_buffer_[0]); }
|
||||
|
||||
char* GetBlobBuffer() { return &(blob_buffer_[0]); }
|
||||
|
||||
void ResizeKeyBuffer(size_t kbs);
|
||||
|
||||
void ResizeBlobBuffer(size_t bbs);
|
||||
|
||||
public:
|
||||
// Header is
|
||||
// Key Length ( 4 bytes ),
|
||||
// Blob Length ( 8 bytes),
|
||||
// ttl (8 bytes), timestamp (8 bytes),
|
||||
// type (1 byte), subtype (1 byte)
|
||||
// header checksum (4 bytes), blob checksum (4 bytes),
|
||||
// = 38
|
||||
static const size_t kHeaderSize = 4 + 4 + 8 + 8 + 4 + 8 + 1 + 1;
|
||||
|
||||
static const size_t kFooterSize = 8 + 4;
|
||||
|
||||
public:
|
||||
BlobLogRecord();
|
||||
|
||||
~BlobLogRecord();
|
||||
|
||||
const Slice& Key() const { return key_; }
|
||||
|
||||
const Slice& Blob() const { return blob_; }
|
||||
|
||||
uint32_t GetKeySize() const { return key_size_; }
|
||||
|
||||
uint64_t GetBlobSize() const { return blob_size_; }
|
||||
|
||||
bool HasTTL() const {
|
||||
return ttl_val_ != std::numeric_limits<uint32_t>::max();
|
||||
}
|
||||
|
||||
uint64_t GetTTL() const { return ttl_val_; }
|
||||
|
||||
uint64_t GetTimeVal() const { return time_val_; }
|
||||
|
||||
char type() const { return type_; }
|
||||
|
||||
char subtype() const { return subtype_; }
|
||||
|
||||
SequenceNumber GetSN() const { return sn_; }
|
||||
|
||||
uint32_t header_checksum() const { return header_cksum_; }
|
||||
|
||||
uint32_t checksum() const { return checksum_; }
|
||||
|
||||
uint32_t footer_checksum() const { return footer_cksum_; }
|
||||
|
||||
Status DecodeHeaderFrom(const Slice& hdrslice);
|
||||
|
||||
Status DecodeFooterFrom(const Slice& footerslice);
|
||||
Status CheckBlobCRC() const;
|
||||
};
|
||||
|
||||
} // namespace blob_db
|
||||
|
@ -7,10 +7,8 @@
|
||||
|
||||
#include "utilities/blob_db/blob_log_reader.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
#include <algorithm>
|
||||
|
||||
#include "util/file_reader_writer.h"
|
||||
|
||||
namespace rocksdb {
|
||||
@ -18,149 +16,79 @@ namespace blob_db {
|
||||
|
||||
Reader::Reader(std::shared_ptr<Logger> info_log,
|
||||
unique_ptr<SequentialFileReader>&& _file)
|
||||
: info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) {
|
||||
backing_store_.resize(kBlockSize);
|
||||
}
|
||||
: info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) {}
|
||||
|
||||
// Nothing to release explicitly; members clean up after themselves.
Reader::~Reader() = default;
|
||||
// Reads exactly `size` bytes from the underlying file.
//
// Parameters:
//   size  - number of bytes to read.
//   slice - receives the bytes that were read.
//   buf   - scratch storage handed to the file reader; it is resized to
//           `size` bytes.
//
// Returns the reader's error if the read fails, Status::Corruption if fewer
// than `size` bytes were available, and the (OK) read status otherwise.
//
// next_byte_ is advanced by `size` regardless of the outcome.
Status Reader::ReadSlice(uint64_t size, Slice* slice, std::string* buf) {
  // resize() rather than reserve(): the reader writes through &(*buf)[0], and
  // writing past the string's size (even within its capacity) is undefined
  // behaviour.
  buf->resize(static_cast<size_t>(size));
  Status s = file_->Read(static_cast<size_t>(size), slice, &(*buf)[0]);
  next_byte_ += size;
  if (!s.ok()) {
    return s;
  }
  if (slice->size() != size) {
    return Status::Corruption("EOF reached while reading record");
  }
  return s;
}
|
||||
|
||||
Status Reader::ReadHeader(BlobLogHeader* header) {
|
||||
assert(file_.get() != nullptr);
|
||||
assert(next_byte_ == 0);
|
||||
Status status =
|
||||
file_->Read(BlobLogHeader::kHeaderSize, &buffer_, GetReadBuffer());
|
||||
next_byte_ += buffer_.size();
|
||||
if (!status.ok()) return status;
|
||||
|
||||
if (buffer_.size() != BlobLogHeader::kHeaderSize) {
|
||||
return Status::IOError("EOF reached before file header");
|
||||
Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, &backing_store_);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
status = header->DecodeFrom(buffer_);
|
||||
return status;
|
||||
if (buffer_.size() != BlobLogHeader::kSize) {
|
||||
return Status::Corruption("EOF reached before file header");
|
||||
}
|
||||
|
||||
return header->DecodeFrom(buffer_);
|
||||
}
|
||||
|
||||
Status Reader::ReadRecord(BlobLogRecord* record, ReadLevel level,
|
||||
uint64_t* blob_offset) {
|
||||
record->Clear();
|
||||
buffer_.clear();
|
||||
backing_store_[0] = '\0';
|
||||
|
||||
Status status =
|
||||
file_->Read(BlobLogRecord::kHeaderSize, &buffer_, GetReadBuffer());
|
||||
next_byte_ += buffer_.size();
|
||||
if (!status.ok()) return status;
|
||||
Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, &backing_store_);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
if (buffer_.size() != BlobLogRecord::kHeaderSize) {
|
||||
return Status::IOError("EOF reached before record header");
|
||||
return Status::Corruption("EOF reached before record header");
|
||||
}
|
||||
|
||||
status = record->DecodeHeaderFrom(buffer_);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
s = record->DecodeHeaderFrom(buffer_);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
uint32_t header_crc = 0;
|
||||
uint32_t blob_crc = 0;
|
||||
size_t crc_data_size = BlobLogRecord::kHeaderSize - 2 * sizeof(uint32_t);
|
||||
header_crc = crc32c::Extend(header_crc, buffer_.data(), crc_data_size);
|
||||
|
||||
uint64_t kb_size = record->GetKeySize() + record->GetBlobSize();
|
||||
uint64_t kb_size = record->key_size + record->value_size;
|
||||
if (blob_offset != nullptr) {
|
||||
*blob_offset = next_byte_ + record->GetKeySize();
|
||||
*blob_offset = next_byte_ + record->key_size;
|
||||
}
|
||||
|
||||
switch (level) {
|
||||
case kReadHdrFooter:
|
||||
file_->Skip(kb_size);
|
||||
case kReadHeader:
|
||||
file_->Skip(record->key_size + record->value_size);
|
||||
next_byte_ += kb_size;
|
||||
status =
|
||||
file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer());
|
||||
next_byte_ += buffer_.size();
|
||||
if (!status.ok()) return status;
|
||||
if (buffer_.size() != BlobLogRecord::kFooterSize) {
|
||||
return Status::IOError("EOF reached before record footer");
|
||||
break;
|
||||
|
||||
case kReadHeaderKey:
|
||||
s = ReadSlice(record->key_size, &record->key, &record->key_buf);
|
||||
file_->Skip(record->value_size);
|
||||
next_byte_ += record->value_size;
|
||||
break;
|
||||
|
||||
case kReadHeaderKeyBlob:
|
||||
s = ReadSlice(record->key_size, &record->key, &record->key_buf);
|
||||
if (s.ok()) {
|
||||
s = ReadSlice(record->value_size, &record->value, &record->value_buf);
|
||||
}
|
||||
|
||||
status = record->DecodeFooterFrom(buffer_);
|
||||
return status;
|
||||
|
||||
case kReadHdrKeyFooter:
|
||||
record->ResizeKeyBuffer(record->GetKeySize());
|
||||
status = file_->Read(record->GetKeySize(), &record->key_,
|
||||
record->GetKeyBuffer());
|
||||
next_byte_ += record->key_.size();
|
||||
if (!status.ok()) return status;
|
||||
if (record->key_.size() != record->GetKeySize()) {
|
||||
return Status::IOError("EOF reached before key read");
|
||||
if (s.ok()) {
|
||||
s = record->CheckBlobCRC();
|
||||
}
|
||||
|
||||
header_crc =
|
||||
crc32c::Extend(header_crc, record->key_.data(), record->GetKeySize());
|
||||
header_crc = crc32c::Mask(header_crc);
|
||||
if (header_crc != record->header_cksum_) {
|
||||
return Status::Corruption("Record Checksum mismatch: header_cksum");
|
||||
}
|
||||
|
||||
file_->Skip(record->GetBlobSize());
|
||||
next_byte_ += record->GetBlobSize();
|
||||
|
||||
status =
|
||||
file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer());
|
||||
next_byte_ += buffer_.size();
|
||||
if (!status.ok()) return status;
|
||||
if (buffer_.size() != BlobLogRecord::kFooterSize) {
|
||||
return Status::IOError("EOF reached during footer read");
|
||||
}
|
||||
|
||||
status = record->DecodeFooterFrom(buffer_);
|
||||
return status;
|
||||
|
||||
case kReadHdrKeyBlobFooter:
|
||||
record->ResizeKeyBuffer(record->GetKeySize());
|
||||
status = file_->Read(record->GetKeySize(), &record->key_,
|
||||
record->GetKeyBuffer());
|
||||
next_byte_ += record->key_.size();
|
||||
if (!status.ok()) return status;
|
||||
if (record->key_.size() != record->GetKeySize()) {
|
||||
return Status::IOError("EOF reached before key read");
|
||||
}
|
||||
|
||||
header_crc =
|
||||
crc32c::Extend(header_crc, record->key_.data(), record->GetKeySize());
|
||||
header_crc = crc32c::Mask(header_crc);
|
||||
if (header_crc != record->header_cksum_) {
|
||||
return Status::Corruption("Record Checksum mismatch: header_cksum");
|
||||
}
|
||||
|
||||
record->ResizeBlobBuffer(record->GetBlobSize());
|
||||
status = file_->Read(record->GetBlobSize(), &record->blob_,
|
||||
record->GetBlobBuffer());
|
||||
next_byte_ += record->blob_.size();
|
||||
if (!status.ok()) return status;
|
||||
if (record->blob_.size() != record->GetBlobSize()) {
|
||||
return Status::IOError("EOF reached during blob read");
|
||||
}
|
||||
|
||||
blob_crc =
|
||||
crc32c::Extend(blob_crc, record->blob_.data(), record->blob_.size());
|
||||
blob_crc = crc32c::Mask(blob_crc);
|
||||
if (blob_crc != record->checksum_) {
|
||||
return Status::Corruption("Blob Checksum mismatch");
|
||||
}
|
||||
|
||||
status =
|
||||
file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer());
|
||||
next_byte_ += buffer_.size();
|
||||
if (!status.ok()) return status;
|
||||
if (buffer_.size() != BlobLogRecord::kFooterSize) {
|
||||
return Status::IOError("EOF reached during blob footer read");
|
||||
}
|
||||
|
||||
status = record->DecodeFooterFrom(buffer_);
|
||||
return status;
|
||||
default:
|
||||
assert(0);
|
||||
return status;
|
||||
break;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
} // namespace blob_db
|
||||
|
@ -7,11 +7,9 @@
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "utilities/blob_db/blob_log_format.h"
|
||||
@ -32,9 +30,9 @@ namespace blob_db {
|
||||
class Reader {
|
||||
public:
|
||||
enum ReadLevel {
|
||||
kReadHdrFooter,
|
||||
kReadHdrKeyFooter,
|
||||
kReadHdrKeyBlobFooter,
|
||||
kReadHeader,
|
||||
kReadHeaderKey,
|
||||
kReadHeaderKeyBlob,
|
||||
};
|
||||
|
||||
// Create a reader that will return log records from "*file".
|
||||
@ -51,7 +49,11 @@ class Reader {
|
||||
Reader(std::shared_ptr<Logger> info_log,
|
||||
std::unique_ptr<SequentialFileReader>&& file);
|
||||
|
||||
~Reader();
|
||||
~Reader() = default;
|
||||
|
||||
// No copying allowed
|
||||
Reader(const Reader&) = delete;
|
||||
Reader& operator=(const Reader&) = delete;
|
||||
|
||||
Status ReadHeader(BlobLogHeader* header);
|
||||
|
||||
@ -61,9 +63,11 @@ class Reader {
|
||||
// will only be valid until the next mutating operation on this
|
||||
// reader or the next mutation to *scratch.
|
||||
// If blob_offset is non-null, return offset of the blob through it.
|
||||
Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHdrFooter,
|
||||
Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader,
|
||||
uint64_t* blob_offset = nullptr);
|
||||
|
||||
Status ReadSlice(uint64_t size, Slice* slice, std::string* buf);
|
||||
|
||||
SequentialFileReader* file() { return file_.get(); }
|
||||
|
||||
// Resets the next_byte_ position counter to 0.
void ResetNextByte() { next_byte_ = 0; }
|
||||
@ -72,9 +76,6 @@ class Reader {
|
||||
|
||||
// Const accessor: returns the raw pointer held by file_ as pointer-to-const.
const SequentialFileReader* file_reader() const { return file_.get(); }
|
||||
|
||||
private:
|
||||
char* GetReadBuffer() { return &(backing_store_[0]); }
|
||||
|
||||
private:
|
||||
std::shared_ptr<Logger> info_log_;
|
||||
const std::unique_ptr<SequentialFileReader> file_;
|
||||
@ -84,10 +85,6 @@ class Reader {
|
||||
|
||||
// which byte to read next. For asserting proper usage
|
||||
uint64_t next_byte_;
|
||||
|
||||
// No copying allowed
|
||||
Reader(const Reader&) = delete;
|
||||
Reader& operator=(const Reader&) = delete;
|
||||
};
|
||||
|
||||
} // namespace blob_db
|
||||
|
@ -2,7 +2,6 @@
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "utilities/blob_db/blob_log_writer.h"
|
||||
@ -11,8 +10,8 @@
|
||||
#include <string>
|
||||
#include "rocksdb/env.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
#include "util/file_reader_writer.h"
|
||||
#include "utilities/blob_db/blob_log_format.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace blob_db {
|
||||
@ -25,18 +24,11 @@ Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
|
||||
bytes_per_sync_(bpsync),
|
||||
next_sync_offset_(0),
|
||||
use_fsync_(use_fs),
|
||||
last_elem_type_(kEtNone) {
|
||||
for (int i = 0; i <= kMaxRecordType; i++) {
|
||||
char t = static_cast<char>(i);
|
||||
type_crc_[i] = crc32c::Value(&t, 1);
|
||||
}
|
||||
}
|
||||
|
||||
Writer::~Writer() {}
|
||||
last_elem_type_(kEtNone) {}
|
||||
|
||||
// Calls Sync() on the destination file, passing the use_fsync_ flag.
// This function returns void, so whatever dest_->Sync() reports is not
// propagated to the caller.
void Writer::Sync() { dest_->Sync(use_fsync_); }
|
||||
|
||||
Status Writer::WriteHeader(const BlobLogHeader& header) {
|
||||
Status Writer::WriteHeader(BlobLogHeader& header) {
|
||||
assert(block_offset_ == 0);
|
||||
assert(last_elem_type_ == kEtNone);
|
||||
std::string str;
|
||||
@ -51,9 +43,9 @@ Status Writer::WriteHeader(const BlobLogHeader& header) {
|
||||
return s;
|
||||
}
|
||||
|
||||
Status Writer::AppendFooter(const BlobLogFooter& footer) {
|
||||
Status Writer::AppendFooter(BlobLogFooter& footer) {
|
||||
assert(block_offset_ != 0);
|
||||
assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
|
||||
assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
|
||||
|
||||
std::string str;
|
||||
footer.EncodeTo(&str);
|
||||
@ -70,13 +62,13 @@ Status Writer::AppendFooter(const BlobLogFooter& footer) {
|
||||
}
|
||||
|
||||
Status Writer::AddRecord(const Slice& key, const Slice& val,
|
||||
uint64_t* key_offset, uint64_t* blob_offset,
|
||||
uint64_t ttl) {
|
||||
uint64_t expiration, uint64_t* key_offset,
|
||||
uint64_t* blob_offset) {
|
||||
assert(block_offset_ != 0);
|
||||
assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
|
||||
assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
|
||||
|
||||
std::string buf;
|
||||
ConstructBlobHeader(&buf, key, val, ttl, -1);
|
||||
ConstructBlobHeader(&buf, key, val, expiration);
|
||||
|
||||
Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
|
||||
return s;
|
||||
@ -85,47 +77,22 @@ Status Writer::AddRecord(const Slice& key, const Slice& val,
|
||||
// AddRecord overload that takes no ttl/expiration argument.
// Builds the record header into a local string with ConstructBlobHeader(),
// then forwards header, key, val and the two offset out-pointers to
// EmitPhysicalRecord() and returns its Status.
// Preconditions (assert-only): block_offset_ is non-zero and last_elem_type_
// is one of the values checked below.
// NOTE(review): this listing is a rendered diff (hunk "-85,47 +77,22") without
// +/- markers; the two last_elem_type_ asserts and the two
// ConstructBlobHeader() calls appear to be the before/after sides of changed
// lines, not consecutive statements -- confirm against the real source.
Status Writer::AddRecord(const Slice& key, const Slice& val,
|
||||
uint64_t* key_offset, uint64_t* blob_offset) {
|
||||
assert(block_offset_ != 0);
|
||||
assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
|
||||
assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
|
||||
|
||||
std::string buf;
|
||||
// Five-argument form passes -1 for both trailing arguments; four-argument
// form passes 0 as the expiration.
ConstructBlobHeader(&buf, key, val, -1, -1);
|
||||
ConstructBlobHeader(&buf, key, val, 0);
|
||||
|
||||
Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
|
||||
return s;
|
||||
}
|
||||
|
||||
// Appends the header bytes for one blob record to the output string.
// NOTE(review): two implementations are interleaved in this rendered diff and
// share the final closing brace: first the (headerbuf, key, val, ttl, ts)
// form that encodes every field by hand, then the (buf, key, val, expiration)
// form that delegates to BlobLogRecord::EncodeHeaderTo(). They appear to be
// the before/after sides of the change -- confirm against the real source.
void Writer::ConstructBlobHeader(std::string* headerbuf, const Slice& key,
|
||||
const Slice& val, uint64_t ttl, int64_t ts) {
|
||||
headerbuf->reserve(BlobLogRecord::kHeaderSize);
|
||||
|
||||
// Key length is narrowed to 32 bits; value length is written as 64 bits.
uint32_t key_size = static_cast<uint32_t>(key.size());
|
||||
PutFixed32(headerbuf, key_size);
|
||||
PutFixed64(headerbuf, val.size());
|
||||
|
||||
PutFixed64(headerbuf, ttl);
|
||||
PutFixed64(headerbuf, ts);
|
||||
|
||||
// One byte for the record type, always kFullType here.
RecordType t = kFullType;
|
||||
headerbuf->push_back(static_cast<char>(t));
|
||||
|
||||
// One byte for the sub-type: kTTLType unless ttl equals kNoExpiration.
RecordSubType st = kRegularType;
|
||||
if (ttl != kNoExpiration) {
|
||||
st = kTTLType;
|
||||
}
|
||||
headerbuf->push_back(static_cast<char>(st));
|
||||
|
||||
// Header crc covers everything appended to headerbuf so far, extended with
// the key bytes, then masked before being stored.
uint32_t header_crc = 0;
|
||||
header_crc =
|
||||
crc32c::Extend(header_crc, headerbuf->c_str(), headerbuf->size());
|
||||
header_crc = crc32c::Extend(header_crc, key.data(), key.size());
|
||||
header_crc = crc32c::Mask(header_crc);
|
||||
PutFixed32(headerbuf, header_crc);
|
||||
|
||||
uint32_t crc = 0;
|
||||
// Compute the crc of the value payload only (no record-type byte is fed in
// here).
|
||||
crc = crc32c::Extend(crc, val.data(), val.size());
|
||||
crc = crc32c::Mask(crc); // Adjust for storage
|
||||
PutFixed32(headerbuf, crc);
|
||||
// Second form: copy key, value and expiration into a BlobLogRecord and let it
// encode its own header into buf.
void Writer::ConstructBlobHeader(std::string* buf, const Slice& key,
|
||||
const Slice& val, uint64_t expiration) {
|
||||
BlobLogRecord record;
|
||||
record.key = key;
|
||||
record.value = val;
|
||||
record.expiration = expiration;
|
||||
record.EncodeHeaderTo(buf);
|
||||
}
|
||||
|
||||
Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
|
||||
@ -134,7 +101,12 @@ Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
|
||||
Status s = dest_->Append(Slice(headerbuf));
|
||||
if (s.ok()) {
|
||||
s = dest_->Append(key);
|
||||
if (s.ok()) s = dest_->Append(val);
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = dest_->Append(val);
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = dest_->Flush();
|
||||
}
|
||||
|
||||
*key_offset = block_offset_ + BlobLogRecord::kHeaderSize;
|
||||
@ -144,25 +116,6 @@ Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
|
||||
return s;
|
||||
}
|
||||
|
||||
// Appends a per-record footer to dest_: the sequence number as a fixed64
// followed by a masked crc32c of those bytes as a fixed32. Returns the Status
// of the Append() call.
// Precondition (assert-only): last_elem_type_ == kEtRecord.
// NOTE(review): hunk header "-144,25 +116,6" suggests this whole function is
// on the removed side of the diff -- confirm before relying on it.
Status Writer::AddRecordFooter(const SequenceNumber& seq) {
|
||||
assert(last_elem_type_ == kEtRecord);
|
||||
|
||||
std::string buf;
|
||||
PutFixed64(&buf, seq);
|
||||
|
||||
uint32_t footer_crc = crc32c::Extend(0, buf.c_str(), buf.size());
|
||||
footer_crc = crc32c::Mask(footer_crc);
|
||||
PutFixed32(&buf, footer_crc);
|
||||
|
||||
Status s = dest_->Append(Slice(buf));
|
||||
// Offset is advanced whether or not the Append() above succeeded.
block_offset_ += BlobLogRecord::kFooterSize;
|
||||
|
||||
// Whatever Flush() returns is not captured; only the Append() status is
// returned.
if (s.ok()) dest_->Flush();
|
||||
|
||||
// State is updated unconditionally, including when s is not ok.
last_elem_type_ = kEtFooter;
|
||||
return s;
|
||||
}
|
||||
|
||||
} // namespace blob_db
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
|
@ -2,7 +2,6 @@
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
@ -38,26 +37,29 @@ class Writer {
|
||||
explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
|
||||
uint64_t log_number, uint64_t bpsync, bool use_fsync,
|
||||
uint64_t boffset = 0);
|
||||
~Writer();
|
||||
|
||||
static void ConstructBlobHeader(std::string* headerbuf, const Slice& key,
|
||||
const Slice& val, uint64_t ttl, int64_t ts);
|
||||
~Writer() = default;
|
||||
|
||||
// No copying allowed
|
||||
Writer(const Writer&) = delete;
|
||||
Writer& operator=(const Writer&) = delete;
|
||||
|
||||
static void ConstructBlobHeader(std::string* buf, const Slice& key,
|
||||
const Slice& val, uint64_t expiration);
|
||||
|
||||
Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
|
||||
uint64_t* blob_offset);
|
||||
|
||||
Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
|
||||
uint64_t* blob_offset, uint64_t ttl);
|
||||
Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration,
|
||||
uint64_t* key_offset, uint64_t* blob_offset);
|
||||
|
||||
Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
|
||||
const Slice& val, uint64_t* key_offset,
|
||||
uint64_t* blob_offset);
|
||||
|
||||
Status AddRecordFooter(const SequenceNumber& sn);
|
||||
Status AppendFooter(BlobLogFooter& footer);
|
||||
|
||||
Status AppendFooter(const BlobLogFooter& footer);
|
||||
|
||||
Status WriteHeader(const BlobLogHeader& header);
|
||||
Status WriteHeader(BlobLogHeader& header);
|
||||
|
||||
// Returns the raw pointer held by dest_ (via get()); no ownership is
// transferred by this call.
WritableFileWriter* file() { return dest_.get(); }
|
||||
|
||||
@ -79,17 +81,8 @@ class Writer {
|
||||
uint64_t next_sync_offset_;
|
||||
bool use_fsync_;
|
||||
|
||||
// crc32c values for all supported record types. These are
|
||||
// pre-computed to reduce the overhead of computing the crc of the
|
||||
// record type stored in the header.
|
||||
uint32_t type_crc_[kMaxRecordType + 1];
|
||||
|
||||
// No copying allowed
|
||||
Writer(const Writer&) = delete;
|
||||
Writer& operator=(const Writer&) = delete;
|
||||
|
||||
public:
|
||||
enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFooter, kEtFileFooter };
|
||||
enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter };
|
||||
ElemType last_elem_type_;
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user