Merge branch 'master' into wal_filter
This commit is contained in:
commit
4ce117c4d5
@ -14,10 +14,15 @@
|
||||
# 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries.
|
||||
# See thirdparty.inc for more information.
|
||||
# sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 ..
|
||||
# 4. Then build the project in debug mode (you may want to add /m:<N> flag to run msbuild in <N> parallel threads)
|
||||
# msbuild ALL_BUILD.vcxproj
|
||||
# 4. Then build the project in debug mode (you may want to add /m[:<N>] flag to run msbuild in <N> parallel threads
|
||||
# or simply /m ot use all avail cores)
|
||||
# msbuild rocksdb.sln
|
||||
#
|
||||
# rocksdb.sln build features exclusions of test only code in Release. If you build ALL_BUILD then everything
|
||||
# will be attempted but test only code does not build in Release mode.
|
||||
#
|
||||
# 5. And release mode (/m[:<N>] is also supported)
|
||||
# msbuild ALL_BUILD.vcxproj /p:Configuration=Release
|
||||
# msbuild rocksdb.sln /p:Configuration=Release
|
||||
#
|
||||
|
||||
cmake_minimum_required(VERSION 2.6)
|
||||
@ -83,6 +88,7 @@ set(LIBS ${ROCKSDB_LIBS} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS})
|
||||
|
||||
add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest)
|
||||
|
||||
# Main library source code
|
||||
set(SOURCES
|
||||
db/builder.cc
|
||||
db/c.cc
|
||||
@ -100,7 +106,6 @@ set(SOURCES
|
||||
db/db_impl_experimental.cc
|
||||
db/db_impl_readonly.cc
|
||||
db/db_iter.cc
|
||||
db/db_test_util.cc
|
||||
db/event_helpers.cc
|
||||
db/experimental.cc
|
||||
db/filename.cc
|
||||
@ -252,6 +257,12 @@ set(SOURCES
|
||||
utilities/write_batch_with_index/write_batch_with_index_internal.cc
|
||||
)
|
||||
|
||||
# For test util library that is build only in DEBUG mode
|
||||
# and linked to tests. Add test only code that is not #ifdefed for Release here.
|
||||
set(TESTUTIL_SOURCE
|
||||
db/db_test_util.cc
|
||||
)
|
||||
|
||||
add_library(rocksdblib${ARTIFACT_SUFFIX} ${SOURCES})
|
||||
set_target_properties(rocksdblib${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/rocksdblib${ARTIFACT_SUFFIX}.pdb")
|
||||
add_dependencies(rocksdblib${ARTIFACT_SUFFIX} GenerateBuildVersion)
|
||||
@ -367,7 +378,7 @@ set(TESTS
|
||||
utilities/write_batch_with_index/write_batch_with_index_test.cc
|
||||
)
|
||||
|
||||
set(EXES ${APPS} ${TESTS})
|
||||
set(EXES ${APPS})
|
||||
|
||||
foreach(sourcefile ${EXES})
|
||||
string(REPLACE ".cc" "" exename ${sourcefile})
|
||||
@ -376,12 +387,42 @@ foreach(sourcefile ${EXES})
|
||||
target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${LIBS})
|
||||
endforeach(sourcefile ${EXES})
|
||||
|
||||
# C executables must link to a shared object
|
||||
set(C_EXES ${C_TESTS})
|
||||
# test utilities are only build in debug
|
||||
set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX})
|
||||
add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE})
|
||||
set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb")
|
||||
set_target_properties(${TESTUTILLIB}
|
||||
PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1
|
||||
EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1
|
||||
EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1
|
||||
)
|
||||
|
||||
foreach(sourcefile ${C_EXES})
|
||||
# Tests are excluded from Release builds
|
||||
set(TEST_EXES ${TESTS})
|
||||
|
||||
foreach(sourcefile ${TEST_EXES})
|
||||
string(REPLACE ".cc" "" exename ${sourcefile})
|
||||
string(REGEX REPLACE "^((.+)/)+" "" exename ${exename})
|
||||
add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile})
|
||||
set_target_properties(${exename}${ARTIFACT_SUFFIX}
|
||||
PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1
|
||||
EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1
|
||||
EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1
|
||||
)
|
||||
target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${LIBS} testutillib${ARTIFACT_SUFFIX})
|
||||
endforeach(sourcefile ${TEST_EXES})
|
||||
|
||||
# C executables must link to a shared object
|
||||
set(C_TEST_EXES ${C_TESTS})
|
||||
|
||||
foreach(sourcefile ${C_TEST_EXES})
|
||||
string(REPLACE ".c" "" exename ${sourcefile})
|
||||
string(REGEX REPLACE "^((.+)/)+" "" exename ${exename})
|
||||
add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile})
|
||||
target_link_libraries(${exename}${ARTIFACT_SUFFIX} rocksdb${ARTIFACT_SUFFIX})
|
||||
endforeach(sourcefile ${C_TESTS})
|
||||
set_target_properties(${exename}${ARTIFACT_SUFFIX}
|
||||
PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1
|
||||
EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1
|
||||
EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1
|
||||
)
|
||||
target_link_libraries(${exename}${ARTIFACT_SUFFIX} rocksdb${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX})
|
||||
endforeach(sourcefile ${C_TEST_EXES})
|
||||
|
10
Makefile
10
Makefile
@ -41,6 +41,14 @@ ifeq ($(MAKECMDGOALS),dbg)
|
||||
DEBUG_LEVEL=2
|
||||
endif
|
||||
|
||||
ifeq ($(MAKECMDGOALS),clean)
|
||||
DEBUG_LEVEL=0
|
||||
endif
|
||||
|
||||
ifeq ($(MAKECMDGOALS),release)
|
||||
DEBUG_LEVEL=0
|
||||
endif
|
||||
|
||||
ifeq ($(MAKECMDGOALS),shared_lib)
|
||||
DEBUG_LEVEL=0
|
||||
endif
|
||||
@ -404,7 +412,7 @@ dbg: $(LIBRARY) $(BENCHMARKS) tools $(TESTS)
|
||||
# creates static library and programs
|
||||
release:
|
||||
$(MAKE) clean
|
||||
OPT="-DNDEBUG -O2" $(MAKE) static_lib tools db_bench
|
||||
DEBUG_LEVEL=0 $(MAKE) static_lib tools db_bench
|
||||
|
||||
coverage:
|
||||
$(MAKE) clean
|
||||
|
@ -5,7 +5,7 @@ before_build:
|
||||
- cmake -G "Visual Studio 12 Win64" ..
|
||||
- cd ..
|
||||
build:
|
||||
project: build\ALL_BUILD.vcxproj
|
||||
project: build\rocksdb.sln
|
||||
parallel: true
|
||||
verbosity: minimal
|
||||
test: off
|
||||
|
@ -5,7 +5,7 @@ before_build:
|
||||
- cmake -G "Visual Studio 12 Win64" -DOPTDBG=1 ..
|
||||
- cd ..
|
||||
build:
|
||||
project: build\ALL_BUILD.vcxproj
|
||||
project: build\rocksdb.sln
|
||||
parallel: true
|
||||
verbosity: minimal
|
||||
test:
|
||||
|
@ -88,6 +88,9 @@ class FbcodeCppLinter extends ArcanistLinter {
|
||||
}
|
||||
|
||||
private function getCppLintOutput($path) {
|
||||
if (!array_key_exists($path, $this->rawLintOutput)) {
|
||||
return array();
|
||||
}
|
||||
list($output) = $this->rawLintOutput[$path];
|
||||
|
||||
$msgs = array();
|
||||
|
@ -45,7 +45,7 @@ fi
|
||||
# we depend on C++11
|
||||
PLATFORM_CXXFLAGS="-std=c++11"
|
||||
# we currently depend on POSIX platform
|
||||
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
|
||||
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX"
|
||||
|
||||
# Default to fbcode gcc on internal fb machines
|
||||
if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
|
||||
|
@ -116,7 +116,7 @@ else
|
||||
fi
|
||||
|
||||
CFLAGS+=" $DEPS_INCLUDE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
|
||||
CXXFLAGS+=" $CFLAGS"
|
||||
|
||||
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB"
|
||||
|
@ -91,7 +91,7 @@ else
|
||||
fi
|
||||
|
||||
CFLAGS+=" $DEPS_INCLUDE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
|
||||
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DZSTD -DNUMA"
|
||||
CXXFLAGS+=" $CFLAGS"
|
||||
|
||||
|
@ -70,6 +70,13 @@ TSAN="COMPILE_WITH_TSAN=1"
|
||||
DISABLE_JEMALLOC="DISABLE_JEMALLOC=1"
|
||||
PARSER="'parser':'egrep \'Failure|^#|Abort\''"
|
||||
|
||||
ARTIFACTS=" 'artifacts': [
|
||||
{
|
||||
'name':'database',
|
||||
'paths':[ '/dev/shm/rocksdb' ],
|
||||
}
|
||||
]"
|
||||
|
||||
#
|
||||
# A mechanism to disable tests temporarily
|
||||
#
|
||||
@ -256,6 +263,26 @@ LITE_BUILD_COMMANDS="[
|
||||
}
|
||||
]"
|
||||
|
||||
#
|
||||
# RocksDB lite tests
|
||||
#
|
||||
LITE_UNIT_TEST_COMMANDS="[
|
||||
{
|
||||
'name':'Rocksdb Lite Unit Test',
|
||||
'oncall':'$ONCALL',
|
||||
'steps': [
|
||||
$CLEANUP_ENV,
|
||||
{
|
||||
'name':'Build RocksDB debug version',
|
||||
'shell':'$SHM $LITE $DEBUG make J=1 check',
|
||||
'user':'root',
|
||||
$PARSER
|
||||
},
|
||||
],
|
||||
$REPORT
|
||||
}
|
||||
]"
|
||||
|
||||
#
|
||||
# RocksDB stress/crash test
|
||||
#
|
||||
@ -280,6 +307,7 @@ STRESS_CRASH_TEST_COMMANDS="[
|
||||
$PARSER
|
||||
}
|
||||
],
|
||||
$ARTIFACTS,
|
||||
$REPORT
|
||||
}
|
||||
]"
|
||||
@ -567,6 +595,9 @@ case $1 in
|
||||
lite)
|
||||
echo $LITE_BUILD_COMMANDS
|
||||
;;
|
||||
lite_test)
|
||||
echo $LITE_UNIT_TEST_COMMANDS
|
||||
;;
|
||||
stress_crash)
|
||||
echo $STRESS_CRASH_TEST_COMMANDS
|
||||
;;
|
||||
|
5
db/c.cc
5
db/c.cc
@ -1687,6 +1687,11 @@ void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
|
||||
opt->rep.keep_log_file_num = v;
|
||||
}
|
||||
|
||||
void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
|
||||
size_t v) {
|
||||
opt->rep.recycle_log_file_num = v;
|
||||
}
|
||||
|
||||
void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) {
|
||||
opt->rep.soft_rate_limit = v;
|
||||
}
|
||||
|
@ -23,8 +23,6 @@
|
||||
#include "util/sync_point.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
@ -1262,13 +1260,8 @@ TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) {
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
@ -64,7 +64,7 @@
|
||||
#include "util/xfunc.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
|
||||
#if !defined(IOS_CROSS_COMPILE) && (!defined(NDEBUG) || !defined(OS_WIN))
|
||||
#if !defined(IOS_CROSS_COMPILE)
|
||||
#ifndef ROCKSDB_LITE
|
||||
namespace rocksdb {
|
||||
|
||||
|
@ -198,7 +198,7 @@ class CompactionJobTest : public testing::Test {
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(file), env_options_));
|
||||
{
|
||||
log::Writer log(std::move(file_writer));
|
||||
log::Writer log(std::move(file_writer), 0, false);
|
||||
std::string record;
|
||||
new_db.EncodeTo(&record);
|
||||
s = log.AddRecord(record);
|
||||
|
@ -625,11 +625,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) {
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
@ -14,7 +14,7 @@
|
||||
namespace rocksdb {
|
||||
|
||||
// SYNC_POINT is not supported in released Windows mode.
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
|
||||
class DBCompactionTest : public DBTestBase {
|
||||
public:
|
||||
@ -1227,6 +1227,7 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
|
||||
TEST_P(DBCompactionTestWithParam, ManualCompaction) {
|
||||
Options options = CurrentOptions();
|
||||
options.max_subcompactions = max_subcompactions_;
|
||||
options.statistics = rocksdb::CreateDBStatistics();
|
||||
CreateAndReopenWithCF({"pikachu"}, options);
|
||||
|
||||
// iter - 0 with 7 levels
|
||||
@ -1258,7 +1259,14 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) {
|
||||
// Compact all
|
||||
MakeTables(1, "a", "z", 1);
|
||||
ASSERT_EQ("1,0,2", FilesPerLevel(1));
|
||||
|
||||
uint64_t prev_block_cache_add =
|
||||
options.statistics->getTickerCount(BLOCK_CACHE_ADD);
|
||||
db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
|
||||
// Verify manual compaction doesn't fill block cache
|
||||
ASSERT_EQ(prev_block_cache_add,
|
||||
options.statistics->getTickerCount(BLOCK_CACHE_ADD));
|
||||
|
||||
ASSERT_EQ("0,0,1", FilesPerLevel(1));
|
||||
|
||||
if (iter == 0) {
|
||||
@ -1266,6 +1274,7 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) {
|
||||
options.max_background_flushes = 0;
|
||||
options.num_levels = 3;
|
||||
options.create_if_missing = true;
|
||||
options.statistics = rocksdb::CreateDBStatistics();
|
||||
DestroyAndReopen(options);
|
||||
CreateAndReopenWithCF({"pikachu"}, options);
|
||||
}
|
||||
@ -1843,11 +1852,11 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
|
||||
::testing::Values(1, 4));
|
||||
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#endif // !defined(ROCKSDB_LITE)
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
@ -10,7 +10,7 @@
|
||||
// Introduction of SyncPoint effectively disabled building and running this test
|
||||
// in Release build.
|
||||
// which is a pity, it is a good test
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
|
||||
#include "db/db_test_util.h"
|
||||
#include "port/stack_trace.h"
|
||||
@ -484,10 +484,10 @@ TEST_F(DBTestDynamicLevel, MigrateToDynamicLevelMaxBytesBase) {
|
||||
}
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#endif // !defined(ROCKSDB_LITE)
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
@ -151,6 +151,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
|
||||
}
|
||||
}
|
||||
|
||||
if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
|
||||
result.recycle_log_file_num = false;
|
||||
}
|
||||
|
||||
if (result.wal_dir.empty()) {
|
||||
// Use dbname as default
|
||||
result.wal_dir = dbname;
|
||||
@ -417,7 +421,7 @@ Status DBImpl::NewDB() {
|
||||
file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size);
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(file), env_options));
|
||||
log::Writer log(std::move(file_writer));
|
||||
log::Writer log(std::move(file_writer), 0, false);
|
||||
std::string record;
|
||||
new_db.EncodeTo(&record);
|
||||
s = log.AddRecord(record);
|
||||
@ -602,7 +606,13 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
||||
// find newly obsoleted log files
|
||||
while (alive_log_files_.begin()->number < min_log_number) {
|
||||
auto& earliest = *alive_log_files_.begin();
|
||||
job_context->log_delete_files.push_back(earliest.number);
|
||||
if (db_options_.recycle_log_file_num > log_recycle_files.size()) {
|
||||
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
||||
"adding log %" PRIu64 " to recycle list\n", earliest.number);
|
||||
log_recycle_files.push_back(earliest.number);
|
||||
} else {
|
||||
job_context->log_delete_files.push_back(earliest.number);
|
||||
}
|
||||
total_log_size_ -= earliest.size;
|
||||
alive_log_files_.pop_front();
|
||||
// Current log should always stay alive since it can't have
|
||||
@ -1110,28 +1120,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
||||
// paranoid_checks==false so that corruptions cause entire commits
|
||||
// to be skipped instead of propagating bad information (like overly
|
||||
// large sequence numbers).
|
||||
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
|
||||
0 /*initial_offset*/);
|
||||
log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
|
||||
true /*checksum*/, 0 /*initial_offset*/, log_number);
|
||||
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
||||
"Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number,
|
||||
db_options_.wal_recovery_mode, !continue_replay_log);
|
||||
|
||||
// Determine if we should tolerate incomplete records at the tail end of the
|
||||
// log
|
||||
bool report_eof_inconsistency;
|
||||
if (db_options_.wal_recovery_mode ==
|
||||
WALRecoveryMode::kAbsoluteConsistency) {
|
||||
// in clean shutdown we don't expect any error in the log files
|
||||
report_eof_inconsistency = true;
|
||||
} else {
|
||||
// for other modes ignore only incomplete records in the last log file
|
||||
// which is presumably due to write in progress during restart
|
||||
report_eof_inconsistency = false;
|
||||
|
||||
// TODO krad: Evaluate if we need to move to a more strict mode where we
|
||||
// restrict the inconsistency to only the last log
|
||||
}
|
||||
|
||||
// Read all the records and add to a memtable
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
@ -1146,9 +1141,10 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
||||
}
|
||||
}
|
||||
|
||||
while (continue_replay_log &&
|
||||
reader.ReadRecord(&record, &scratch, report_eof_inconsistency) &&
|
||||
status.ok()) {
|
||||
while (
|
||||
continue_replay_log &&
|
||||
reader.ReadRecord(&record, &scratch, db_options_.wal_recovery_mode) &&
|
||||
status.ok()) {
|
||||
if (record.size() < 12) {
|
||||
reporter.Corruption(record.size(),
|
||||
Status::Corruption("log record too small"));
|
||||
@ -4142,6 +4138,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
||||
// Do this without holding the dbmutex lock.
|
||||
assert(versions_->prev_log_number() == 0);
|
||||
bool creating_new_log = !log_empty_;
|
||||
uint64_t recycle_log_number = 0;
|
||||
if (creating_new_log && db_options_.recycle_log_file_num &&
|
||||
!log_recycle_files.empty()) {
|
||||
recycle_log_number = log_recycle_files.front();
|
||||
log_recycle_files.pop_front();
|
||||
}
|
||||
uint64_t new_log_number =
|
||||
creating_new_log ? versions_->NewFileNumber() : logfile_number_;
|
||||
SuperVersion* new_superversion = nullptr;
|
||||
@ -4152,17 +4154,28 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
||||
if (creating_new_log) {
|
||||
EnvOptions opt_env_opt =
|
||||
env_->OptimizeForLogWrite(env_options_, db_options_);
|
||||
s = NewWritableFile(env_,
|
||||
LogFileName(db_options_.wal_dir, new_log_number),
|
||||
&lfile, opt_env_opt);
|
||||
if (recycle_log_number) {
|
||||
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
||||
"reusing log %" PRIu64 " from recycle list\n", recycle_log_number);
|
||||
s = env_->ReuseWritableFile(
|
||||
LogFileName(db_options_.wal_dir, new_log_number),
|
||||
LogFileName(db_options_.wal_dir, recycle_log_number), &lfile,
|
||||
opt_env_opt);
|
||||
} else {
|
||||
s = NewWritableFile(env_,
|
||||
LogFileName(db_options_.wal_dir, new_log_number),
|
||||
&lfile, opt_env_opt);
|
||||
}
|
||||
if (s.ok()) {
|
||||
// Our final size should be less than write_buffer_size
|
||||
// (compression, etc) but err on the side of caution.
|
||||
lfile->SetPreallocationBlockSize(
|
||||
1.1 * mutable_cf_options.write_buffer_size);
|
||||
lfile->SetPreallocationBlockSize(1.1 *
|
||||
mutable_cf_options.write_buffer_size);
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(lfile), opt_env_opt));
|
||||
new_log = new log::Writer(std::move(file_writer));
|
||||
new_log = new log::Writer(std::move(file_writer),
|
||||
new_log_number,
|
||||
db_options_.recycle_log_file_num > 0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4801,8 +4814,11 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
|
||||
impl->logfile_number_ = new_log_number;
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(lfile), opt_env_options));
|
||||
impl->logs_.emplace_back(new_log_number,
|
||||
new log::Writer(std::move(file_writer)));
|
||||
impl->logs_.emplace_back(
|
||||
new_log_number,
|
||||
new log::Writer(std::move(file_writer),
|
||||
new_log_number,
|
||||
impl->db_options_.recycle_log_file_num > 0));
|
||||
|
||||
// set column family handles
|
||||
for (auto cf : column_families) {
|
||||
|
@ -556,6 +556,8 @@ class DBImpl : public DB {
|
||||
// * whenever there is an error in background flush or compaction
|
||||
InstrumentedCondVar bg_cv_;
|
||||
uint64_t logfile_number_;
|
||||
std::deque<uint64_t>
|
||||
log_recycle_files; // a list of log files that we can recycle
|
||||
bool log_dir_synced_;
|
||||
bool log_empty_;
|
||||
ColumnFamilyHandleImpl* default_cf_handle_;
|
||||
|
@ -1943,8 +1943,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
|
||||
ASSERT_EQ(db_iter_->value().ToString(), "4");
|
||||
}
|
||||
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
|
||||
TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
|
||||
// Test Prev() when one child iterator is at its end but more rows
|
||||
// are added.
|
||||
@ -2295,7 +2293,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
|
||||
|
||||
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
|
||||
}
|
||||
#endif // #if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
@ -10,7 +10,7 @@
|
||||
// Introduction of SyncPoint effectively disabled building and running this test
|
||||
// in Release build.
|
||||
// which is a pity, it is a good test
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
|
||||
#include "db/db_test_util.h"
|
||||
#include "port/stack_trace.h"
|
||||
@ -277,10 +277,10 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
|
||||
}
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#endif // !defined(ROCKSDB_LITE)
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
@ -207,11 +207,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) {
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
@ -10,7 +10,7 @@
|
||||
// Introduction of SyncPoint effectively disabled building and running this test
|
||||
// in Release build.
|
||||
// which is a pity, it is a good test
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
|
||||
#include "db/db_test_util.h"
|
||||
#include "db/forward_iterator.h"
|
||||
@ -646,10 +646,10 @@ TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToSame) {
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#endif // !defined(ROCKSDB_LITE)
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
@ -10,8 +10,6 @@
|
||||
// Introduction of SyncPoint effectively disabled building and running this test
|
||||
// in Release build.
|
||||
// which is a pity, it is a good test
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
@ -5075,7 +5073,9 @@ class RecoveryTestHelper {
|
||||
ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options));
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(file), env_options));
|
||||
current_log_writer.reset(new log::Writer(std::move(file_writer)));
|
||||
current_log_writer.reset(new log::Writer(
|
||||
std::move(file_writer), current_log_number,
|
||||
db_options.recycle_log_file_num > 0));
|
||||
|
||||
for (int i = 0; i < kKeysPerWALFile; i++) {
|
||||
std::string key = "key" + ToString(count++);
|
||||
@ -10445,14 +10445,9 @@ INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam,
|
||||
#endif // ROCKSDB_LITE
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
@ -58,12 +58,9 @@ DBTestBase::DBTestBase(const std::string path)
|
||||
}
|
||||
|
||||
DBTestBase::~DBTestBase() {
|
||||
// SyncPoint is not supported in Released Windows Mode.
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
|
||||
rocksdb::SyncPoint::GetInstance()->LoadDependency({});
|
||||
rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
|
||||
#endif // !(defined NDEBUG) || !defined(OS_WIN)
|
||||
Close();
|
||||
Options options;
|
||||
options.db_paths.emplace_back(dbname_, 0);
|
||||
@ -343,6 +340,10 @@ Options DBTestBase::CurrentOptions(
|
||||
options.row_cache = NewLRUCache(1024 * 1024);
|
||||
break;
|
||||
}
|
||||
case kRecycleLogFiles: {
|
||||
options.recycle_log_file_num = 2;
|
||||
break;
|
||||
}
|
||||
case kLevelSubcompactions: {
|
||||
options.max_subcompactions = 4;
|
||||
break;
|
||||
|
@ -437,9 +437,10 @@ class DBTestBase : public testing::Test {
|
||||
kFIFOCompaction = 26,
|
||||
kOptimizeFiltersForHits = 27,
|
||||
kRowCache = 28,
|
||||
kLevelSubcompactions = 29,
|
||||
kUniversalSubcompactions = 30,
|
||||
kEnd = 29
|
||||
kRecycleLogFiles = 29,
|
||||
kLevelSubcompactions = 30,
|
||||
kUniversalSubcompactions = 31,
|
||||
kEnd = 30
|
||||
};
|
||||
int option_config_;
|
||||
|
||||
|
@ -9,7 +9,7 @@
|
||||
|
||||
#include "db/db_test_util.h"
|
||||
#include "port/stack_trace.h"
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
#include "util/sync_point.h"
|
||||
|
||||
namespace rocksdb {
|
||||
@ -1210,10 +1210,10 @@ INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId,
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#endif // !defined(ROCKSDB_LITE)
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
|
||||
#if !defined(ROCKSDB_LITE)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
@ -9,9 +9,7 @@
|
||||
|
||||
#include "db/db_test_util.h"
|
||||
#include "port/stack_trace.h"
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
#include "util/sync_point.h"
|
||||
#endif
|
||||
|
||||
namespace rocksdb {
|
||||
class DBWALTest : public DBTestBase {
|
||||
@ -70,7 +68,6 @@ TEST_F(DBWALTest, RollLog) {
|
||||
} while (ChangeOptions());
|
||||
}
|
||||
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
TEST_F(DBWALTest, SyncWALNotBlockWrite) {
|
||||
Options options = CurrentOptions();
|
||||
options.max_write_buffer_number = 4;
|
||||
@ -130,15 +127,10 @@ TEST_F(DBWALTest, SyncWALNotWaitWrite) {
|
||||
ASSERT_EQ(Get("foo2"), "bar2");
|
||||
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
|
||||
}
|
||||
#endif
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
@ -11,8 +11,6 @@
|
||||
// the last "sync". It then checks for data loss errors by purposely dropping
|
||||
// file data (or entire files) not protected by a "sync".
|
||||
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include "db/db_impl.h"
|
||||
@ -902,13 +900,7 @@ INSTANTIATE_TEST_CASE_P(FaultTest, FaultInjectionTest, ::testing::Bool());
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // #if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if !(defined NDEBUG) || !defined(OS_WIN)
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
@ -61,7 +61,7 @@ class FlushJobTest : public testing::Test {
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(file), EnvOptions()));
|
||||
{
|
||||
log::Writer log(std::move(file_writer));
|
||||
log::Writer log(std::move(file_writer), 0, false);
|
||||
std::string record;
|
||||
new_db.EncodeTo(&record);
|
||||
s = log.AddRecord(record);
|
||||
|
@ -22,14 +22,24 @@ enum RecordType {
|
||||
// For fragments
|
||||
kFirstType = 2,
|
||||
kMiddleType = 3,
|
||||
kLastType = 4
|
||||
kLastType = 4,
|
||||
|
||||
// For recycled log files
|
||||
kRecyclableFullType = 5,
|
||||
kRecyclableFirstType = 6,
|
||||
kRecyclableMiddleType = 7,
|
||||
kRecyclableLastType = 8,
|
||||
};
|
||||
static const int kMaxRecordType = kLastType;
|
||||
static const int kMaxRecordType = kRecyclableLastType;
|
||||
|
||||
static const unsigned int kBlockSize = 32768;
|
||||
|
||||
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
|
||||
static const int kHeaderSize = 4 + 1 + 2;
|
||||
|
||||
// Recyclable header is checksum (4 bytes), type (1 byte), log number
|
||||
// (4 bytes), length (2 bytes).
|
||||
static const int kRecyclableHeaderSize = 4 + 1 + 4 + 2;
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
||||
|
161
db/log_reader.cc
161
db/log_reader.cc
@ -21,9 +21,12 @@ namespace log {
|
||||
Reader::Reporter::~Reporter() {
|
||||
}
|
||||
|
||||
Reader::Reader(unique_ptr<SequentialFileReader>&& _file, Reporter* reporter,
|
||||
bool checksum, uint64_t initial_offset)
|
||||
: file_(std::move(_file)),
|
||||
Reader::Reader(std::shared_ptr<Logger> info_log,
|
||||
unique_ptr<SequentialFileReader>&& _file,
|
||||
Reporter* reporter, bool checksum, uint64_t initial_offset,
|
||||
uint64_t log_num)
|
||||
: info_log_(info_log),
|
||||
file_(std::move(_file)),
|
||||
reporter_(reporter),
|
||||
checksum_(checksum),
|
||||
backing_store_(new char[kBlockSize]),
|
||||
@ -33,7 +36,8 @@ Reader::Reader(unique_ptr<SequentialFileReader>&& _file, Reporter* reporter,
|
||||
eof_offset_(0),
|
||||
last_record_offset_(0),
|
||||
end_of_buffer_offset_(0),
|
||||
initial_offset_(initial_offset) {}
|
||||
initial_offset_(initial_offset),
|
||||
log_number_(log_num) {}
|
||||
|
||||
Reader::~Reader() {
|
||||
delete[] backing_store_;
|
||||
@ -62,8 +66,15 @@ bool Reader::SkipToInitialBlock() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// For kAbsoluteConsistency, on clean shutdown we don't expect any error
|
||||
// in the log files. For other modes, we can ignore only incomplete records
|
||||
// in the last log file, which are presumably due to a write in progress
|
||||
// during restart (or from log recycling).
|
||||
//
|
||||
// TODO krad: Evaluate if we need to move to a more strict mode where we
|
||||
// restrict the inconsistency to only the last log
|
||||
bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
||||
const bool report_eof_inconsistency) {
|
||||
WALRecoveryMode wal_recovery_mode) {
|
||||
if (last_record_offset_ < initial_offset_) {
|
||||
if (!SkipToInitialBlock()) {
|
||||
return false;
|
||||
@ -80,10 +91,11 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
||||
Slice fragment;
|
||||
while (true) {
|
||||
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
|
||||
const unsigned int record_type =
|
||||
ReadPhysicalRecord(&fragment, report_eof_inconsistency);
|
||||
size_t drop_size;
|
||||
const unsigned int record_type = ReadPhysicalRecord(&fragment, &drop_size);
|
||||
switch (record_type) {
|
||||
case kFullType:
|
||||
case kRecyclableFullType:
|
||||
if (in_fragmented_record && !scratch->empty()) {
|
||||
// Handle bug in earlier versions of log::Writer where
|
||||
// it could emit an empty kFirstType record at the tail end
|
||||
@ -98,6 +110,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
||||
return true;
|
||||
|
||||
case kFirstType:
|
||||
case kRecyclableFirstType:
|
||||
if (in_fragmented_record && !scratch->empty()) {
|
||||
// Handle bug in earlier versions of log::Writer where
|
||||
// it could emit an empty kFirstType record at the tail end
|
||||
@ -111,6 +124,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
||||
break;
|
||||
|
||||
case kMiddleType:
|
||||
case kRecyclableMiddleType:
|
||||
if (!in_fragmented_record) {
|
||||
ReportCorruption(fragment.size(),
|
||||
"missing start of fragmented record(1)");
|
||||
@ -120,6 +134,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
||||
break;
|
||||
|
||||
case kLastType:
|
||||
case kRecyclableLastType:
|
||||
if (!in_fragmented_record) {
|
||||
ReportCorruption(fragment.size(),
|
||||
"missing start of fragmented record(2)");
|
||||
@ -131,9 +146,17 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
||||
}
|
||||
break;
|
||||
|
||||
case kBadHeader:
|
||||
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
|
||||
// in clean shutdown we don't expect any error in the log files
|
||||
ReportCorruption(drop_size, "truncated header");
|
||||
}
|
||||
// fall-thru
|
||||
|
||||
case kEof:
|
||||
if (in_fragmented_record) {
|
||||
if (report_eof_inconsistency) {
|
||||
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
|
||||
// in clean shutdown we don't expect any error in the log files
|
||||
ReportCorruption(scratch->size(), "error reading trailing data");
|
||||
}
|
||||
// This can be caused by the writer dying immediately after
|
||||
@ -143,6 +166,23 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
|
||||
}
|
||||
return false;
|
||||
|
||||
case kOldRecord:
|
||||
if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
|
||||
// Treat a record from a previous instance of the log as EOF.
|
||||
if (in_fragmented_record) {
|
||||
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
|
||||
// in clean shutdown we don't expect any error in the log files
|
||||
ReportCorruption(scratch->size(), "error reading trailing data");
|
||||
}
|
||||
// This can be caused by the writer dying immediately after
|
||||
// writing a physical record but before completing the next; don't
|
||||
// treat it as a corruption, just ignore the entire logical record.
|
||||
scratch->clear();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// fall-thru
|
||||
|
||||
case kBadRecord:
|
||||
if (in_fragmented_record) {
|
||||
ReportCorruption(scratch->size(), "error in middle of record");
|
||||
@ -244,36 +284,49 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) {
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int Reader::ReadPhysicalRecord(Slice* result,
|
||||
const bool report_eof_inconsistency) {
|
||||
bool Reader::ReadMore(size_t* drop_size, int *error) {
|
||||
if (!eof_ && !read_error_) {
|
||||
// Last read was a full read, so this is a trailer to skip
|
||||
buffer_.clear();
|
||||
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
|
||||
end_of_buffer_offset_ += buffer_.size();
|
||||
if (!status.ok()) {
|
||||
buffer_.clear();
|
||||
ReportDrop(kBlockSize, status);
|
||||
read_error_ = true;
|
||||
*error = kEof;
|
||||
return false;
|
||||
} else if (buffer_.size() < (size_t)kBlockSize) {
|
||||
eof_ = true;
|
||||
eof_offset_ = buffer_.size();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
// Note that if buffer_ is non-empty, we have a truncated header at the
|
||||
// end of the file, which can be caused by the writer crashing in the
|
||||
// middle of writing the header. Unless explicitly requested we don't
|
||||
// considering this an error, just report EOF.
|
||||
if (buffer_.size()) {
|
||||
*drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
*error = kBadHeader;
|
||||
return false;
|
||||
}
|
||||
buffer_.clear();
|
||||
*error = kEof;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
|
||||
while (true) {
|
||||
// We need at least the minimum header size
|
||||
if (buffer_.size() < (size_t)kHeaderSize) {
|
||||
if (!eof_ && !read_error_) {
|
||||
// Last read was a full read, so this is a trailer to skip
|
||||
buffer_.clear();
|
||||
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
|
||||
end_of_buffer_offset_ += buffer_.size();
|
||||
if (!status.ok()) {
|
||||
buffer_.clear();
|
||||
ReportDrop(kBlockSize, status);
|
||||
read_error_ = true;
|
||||
return kEof;
|
||||
} else if (buffer_.size() < (size_t)kBlockSize) {
|
||||
eof_ = true;
|
||||
eof_offset_ = buffer_.size();
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
// Note that if buffer_ is non-empty, we have a truncated header at the
|
||||
// end of the file, which can be caused by the writer crashing in the
|
||||
// middle of writing the header. Unless explicitly requested we don't
|
||||
// considering this an error, just report EOF.
|
||||
if (buffer_.size() && report_eof_inconsistency) {
|
||||
ReportCorruption(buffer_.size(), "truncated header");
|
||||
}
|
||||
buffer_.clear();
|
||||
return kEof;
|
||||
int r;
|
||||
if (!ReadMore(drop_size, &r)) {
|
||||
return r;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Parse the header
|
||||
@ -282,18 +335,34 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result,
|
||||
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
|
||||
const unsigned int type = header[6];
|
||||
const uint32_t length = a | (b << 8);
|
||||
if (kHeaderSize + length > buffer_.size()) {
|
||||
size_t drop_size = buffer_.size();
|
||||
int header_size = kHeaderSize;
|
||||
if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
|
||||
header_size = kRecyclableHeaderSize;
|
||||
// We need enough for the larger header
|
||||
if (buffer_.size() < (size_t)kRecyclableHeaderSize) {
|
||||
int r;
|
||||
if (!ReadMore(drop_size, &r)) {
|
||||
return r;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
const uint32_t log_num = DecodeFixed32(header + 7);
|
||||
if (log_num != log_number_) {
|
||||
return kOldRecord;
|
||||
}
|
||||
}
|
||||
if (header_size + length > buffer_.size()) {
|
||||
*drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
if (!eof_) {
|
||||
ReportCorruption(drop_size, "bad record length");
|
||||
ReportCorruption(*drop_size, "bad record length");
|
||||
return kBadRecord;
|
||||
}
|
||||
// If the end of the file has been reached without reading |length| bytes
|
||||
// of payload, assume the writer died in the middle of writing the record.
|
||||
// Don't report a corruption unless requested.
|
||||
if (drop_size && report_eof_inconsistency) {
|
||||
ReportCorruption(drop_size, "truncated header");
|
||||
if (*drop_size) {
|
||||
return kBadHeader;
|
||||
}
|
||||
return kEof;
|
||||
}
|
||||
@ -311,29 +380,29 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result,
|
||||
// Check crc
|
||||
if (checksum_) {
|
||||
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
|
||||
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
|
||||
uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
|
||||
if (actual_crc != expected_crc) {
|
||||
// Drop the rest of the buffer since "length" itself may have
|
||||
// been corrupted and if we trust it, we could find some
|
||||
// fragment of a real log record that just happens to look
|
||||
// like a valid log record.
|
||||
size_t drop_size = buffer_.size();
|
||||
*drop_size = buffer_.size();
|
||||
buffer_.clear();
|
||||
ReportCorruption(drop_size, "checksum mismatch");
|
||||
ReportCorruption(*drop_size, "checksum mismatch");
|
||||
return kBadRecord;
|
||||
}
|
||||
}
|
||||
|
||||
buffer_.remove_prefix(kHeaderSize + length);
|
||||
buffer_.remove_prefix(header_size + length);
|
||||
|
||||
// Skip physical record that started before initial_offset_
|
||||
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
|
||||
if (end_of_buffer_offset_ - buffer_.size() - header_size - length <
|
||||
initial_offset_) {
|
||||
result->clear();
|
||||
return kBadRecord;
|
||||
}
|
||||
|
||||
*result = Slice(header + kHeaderSize, length);
|
||||
*result = Slice(header + header_size, length);
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
@ -14,10 +14,12 @@
|
||||
#include "db/log_format.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class SequentialFileReader;
|
||||
class Logger;
|
||||
using std::unique_ptr;
|
||||
|
||||
namespace log {
|
||||
@ -51,8 +53,10 @@ class Reader {
|
||||
//
|
||||
// The Reader will start reading at the first record located at physical
|
||||
// position >= initial_offset within the file.
|
||||
Reader(unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
|
||||
bool checksum, uint64_t initial_offset);
|
||||
Reader(std::shared_ptr<Logger> info_log,
|
||||
unique_ptr<SequentialFileReader>&& file,
|
||||
Reporter* reporter, bool checksum, uint64_t initial_offset,
|
||||
uint64_t log_num);
|
||||
|
||||
~Reader();
|
||||
|
||||
@ -62,7 +66,8 @@ class Reader {
|
||||
// will only be valid until the next mutating operation on this
|
||||
// reader or the next mutation to *scratch.
|
||||
bool ReadRecord(Slice* record, std::string* scratch,
|
||||
bool report_eof_inconsistency = false);
|
||||
WALRecoveryMode wal_recovery_mode =
|
||||
WALRecoveryMode::kTolerateCorruptedTailRecords);
|
||||
|
||||
// Returns the physical offset of the last record returned by ReadRecord.
|
||||
//
|
||||
@ -84,6 +89,7 @@ class Reader {
|
||||
SequentialFileReader* file() { return file_.get(); }
|
||||
|
||||
private:
|
||||
std::shared_ptr<Logger> info_log_;
|
||||
const unique_ptr<SequentialFileReader> file_;
|
||||
Reporter* const reporter_;
|
||||
bool const checksum_;
|
||||
@ -104,6 +110,9 @@ class Reader {
|
||||
// Offset at which to start looking for the first record to return
|
||||
uint64_t const initial_offset_;
|
||||
|
||||
// which log number this is
|
||||
uint64_t const log_number_;
|
||||
|
||||
// Extend record types with the following special values
|
||||
enum {
|
||||
kEof = kMaxRecordType + 1,
|
||||
@ -112,7 +121,11 @@ class Reader {
|
||||
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
|
||||
// * The record is a 0-length record (No drop is reported)
|
||||
// * The record is below constructor's initial_offset (No drop is reported)
|
||||
kBadRecord = kMaxRecordType + 2
|
||||
kBadRecord = kMaxRecordType + 2,
|
||||
// Returned when we fail to read a valid header.
|
||||
kBadHeader = kMaxRecordType + 3,
|
||||
// Returned when we read an old record from a previous user of the log.
|
||||
kOldRecord = kMaxRecordType + 4,
|
||||
};
|
||||
|
||||
// Skips all blocks that are completely before "initial_offset_".
|
||||
@ -121,8 +134,10 @@ class Reader {
|
||||
bool SkipToInitialBlock();
|
||||
|
||||
// Return type, or one of the preceding special values
|
||||
unsigned int ReadPhysicalRecord(Slice* result,
|
||||
bool report_eof_inconsistency = false);
|
||||
unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size);
|
||||
|
||||
// Read some more
|
||||
bool ReadMore(size_t* drop_size, int *error);
|
||||
|
||||
// Reports dropped bytes to the reporter.
|
||||
// buffer_ must be updated to remove the dropped bytes prior to invocation.
|
||||
|
216
db/log_test.cc
216
db/log_test.cc
@ -43,7 +43,7 @@ static std::string RandomSkewedString(int i, Random* rnd) {
|
||||
return BigString(NumberString(i), rnd->Skewed(17));
|
||||
}
|
||||
|
||||
class LogTest : public testing::Test {
|
||||
class LogTest : public ::testing::TestWithParam<int> {
|
||||
private:
|
||||
class StringSource : public SequentialFile {
|
||||
public:
|
||||
@ -153,19 +153,26 @@ class LogTest : public testing::Test {
|
||||
|
||||
// Record metadata for testing initial offset functionality
|
||||
static size_t initial_offset_record_sizes_[];
|
||||
static uint64_t initial_offset_last_record_offsets_[];
|
||||
uint64_t initial_offset_last_record_offsets_[4];
|
||||
|
||||
public:
|
||||
LogTest()
|
||||
: reader_contents_(),
|
||||
dest_holder_(
|
||||
test::GetWritableFileWriter(
|
||||
new test::StringSink(&reader_contents_))),
|
||||
dest_holder_(test::GetWritableFileWriter(
|
||||
new test::StringSink(&reader_contents_))),
|
||||
source_holder_(
|
||||
test::GetSequentialFileReader(new StringSource(reader_contents_))),
|
||||
writer_(std::move(dest_holder_)),
|
||||
reader_(std::move(source_holder_), &report_, true /*checksum*/,
|
||||
0 /*initial_offset*/) {}
|
||||
writer_(std::move(dest_holder_), 123, GetParam()),
|
||||
reader_(NULL, std::move(source_holder_), &report_, true /*checksum*/,
|
||||
0 /*initial_offset*/, 123) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
initial_offset_last_record_offsets_[0] = 0;
|
||||
initial_offset_last_record_offsets_[1] = header_size + 10000;
|
||||
initial_offset_last_record_offsets_[2] = 2 * (header_size + 10000);
|
||||
initial_offset_last_record_offsets_[3] = 2 * (header_size + 10000) +
|
||||
(2 * log::kBlockSize - 1000) +
|
||||
3 * header_size;
|
||||
}
|
||||
|
||||
void Write(const std::string& msg) {
|
||||
writer_.AddRecord(Slice(msg));
|
||||
@ -175,10 +182,11 @@ class LogTest : public testing::Test {
|
||||
return dest_contents().size();
|
||||
}
|
||||
|
||||
std::string Read(const bool report_eof_inconsistency = false) {
|
||||
std::string Read(const WALRecoveryMode wal_recovery_mode =
|
||||
WALRecoveryMode::kTolerateCorruptedTailRecords) {
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
if (reader_.ReadRecord(&record, &scratch, report_eof_inconsistency)) {
|
||||
if (reader_.ReadRecord(&record, &scratch, wal_recovery_mode)) {
|
||||
return record.ToString();
|
||||
} else {
|
||||
return "EOF";
|
||||
@ -200,9 +208,11 @@ class LogTest : public testing::Test {
|
||||
dest->Drop(bytes);
|
||||
}
|
||||
|
||||
void FixChecksum(int header_offset, int len) {
|
||||
void FixChecksum(int header_offset, int len, bool recyclable) {
|
||||
// Compute crc of type/len/data
|
||||
uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
|
||||
int header_size = recyclable ? kRecyclableHeaderSize : kHeaderSize;
|
||||
uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6],
|
||||
header_size - 6 + len);
|
||||
crc = crc32c::Mask(crc);
|
||||
EncodeFixed32(&dest_contents()[header_offset], crc);
|
||||
}
|
||||
@ -259,8 +269,8 @@ class LogTest : public testing::Test {
|
||||
unique_ptr<SequentialFileReader> file_reader(
|
||||
test::GetSequentialFileReader(new StringSource(reader_contents_)));
|
||||
unique_ptr<Reader> offset_reader(
|
||||
new Reader(std::move(file_reader), &report_, true /*checksum*/,
|
||||
WrittenBytes() + offset_past_end));
|
||||
new Reader(NULL, std::move(file_reader), &report_,
|
||||
true /*checksum*/, WrittenBytes() + offset_past_end, 123));
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
|
||||
@ -271,8 +281,9 @@ class LogTest : public testing::Test {
|
||||
WriteInitialOffsetLog();
|
||||
unique_ptr<SequentialFileReader> file_reader(
|
||||
test::GetSequentialFileReader(new StringSource(reader_contents_)));
|
||||
unique_ptr<Reader> offset_reader(new Reader(
|
||||
std::move(file_reader), &report_, true /*checksum*/, initial_offset));
|
||||
unique_ptr<Reader> offset_reader(
|
||||
new Reader(NULL, std::move(file_reader), &report_,
|
||||
true /*checksum*/, initial_offset, 123));
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
|
||||
@ -291,16 +302,9 @@ size_t LogTest::initial_offset_record_sizes_[] =
|
||||
2 * log::kBlockSize - 1000, // Span three blocks
|
||||
1};
|
||||
|
||||
uint64_t LogTest::initial_offset_last_record_offsets_[] =
|
||||
{0,
|
||||
kHeaderSize + 10000,
|
||||
2 * (kHeaderSize + 10000),
|
||||
2 * (kHeaderSize + 10000) +
|
||||
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
|
||||
TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
|
||||
|
||||
TEST_F(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
|
||||
|
||||
TEST_F(LogTest, ReadWrite) {
|
||||
TEST_P(LogTest, ReadWrite) {
|
||||
Write("foo");
|
||||
Write("bar");
|
||||
Write("");
|
||||
@ -313,7 +317,7 @@ TEST_F(LogTest, ReadWrite) {
|
||||
ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ManyBlocks) {
|
||||
TEST_P(LogTest, ManyBlocks) {
|
||||
for (int i = 0; i < 100000; i++) {
|
||||
Write(NumberString(i));
|
||||
}
|
||||
@ -323,7 +327,7 @@ TEST_F(LogTest, ManyBlocks) {
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, Fragmentation) {
|
||||
TEST_P(LogTest, Fragmentation) {
|
||||
Write("small");
|
||||
Write(BigString("medium", 50000));
|
||||
Write(BigString("large", 100000));
|
||||
@ -333,11 +337,12 @@ TEST_F(LogTest, Fragmentation) {
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, MarginalTrailer) {
|
||||
TEST_P(LogTest, MarginalTrailer) {
|
||||
// Make a trailer that is exactly the same length as an empty record.
|
||||
const int n = kBlockSize - 2*kHeaderSize;
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
const int n = kBlockSize - 2 * header_size;
|
||||
Write(BigString("foo", n));
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
|
||||
Write("");
|
||||
Write("bar");
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
@ -346,11 +351,12 @@ TEST_F(LogTest, MarginalTrailer) {
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, MarginalTrailer2) {
|
||||
TEST_P(LogTest, MarginalTrailer2) {
|
||||
// Make a trailer that is exactly the same length as an empty record.
|
||||
const int n = kBlockSize - 2*kHeaderSize;
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
const int n = kBlockSize - 2 * header_size;
|
||||
Write(BigString("foo", n));
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
|
||||
Write("bar");
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
ASSERT_EQ("bar", Read());
|
||||
@ -359,10 +365,11 @@ TEST_F(LogTest, MarginalTrailer2) {
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ShortTrailer) {
|
||||
const int n = kBlockSize - 2*kHeaderSize + 4;
|
||||
TEST_P(LogTest, ShortTrailer) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
const int n = kBlockSize - 2 * header_size + 4;
|
||||
Write(BigString("foo", n));
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
|
||||
Write("");
|
||||
Write("bar");
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
@ -371,15 +378,16 @@ TEST_F(LogTest, ShortTrailer) {
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, AlignedEof) {
|
||||
const int n = kBlockSize - 2*kHeaderSize + 4;
|
||||
TEST_P(LogTest, AlignedEof) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
const int n = kBlockSize - 2 * header_size + 4;
|
||||
Write(BigString("foo", n));
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
|
||||
ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, RandomRead) {
|
||||
TEST_P(LogTest, RandomRead) {
|
||||
const int N = 500;
|
||||
Random write_rnd(301);
|
||||
for (int i = 0; i < N; i++) {
|
||||
@ -394,7 +402,7 @@ TEST_F(LogTest, RandomRead) {
|
||||
|
||||
// Tests of all the error paths in log_reader.cc follow:
|
||||
|
||||
TEST_F(LogTest, ReadError) {
|
||||
TEST_P(LogTest, ReadError) {
|
||||
Write("foo");
|
||||
ForceError();
|
||||
ASSERT_EQ("EOF", Read());
|
||||
@ -402,17 +410,17 @@ TEST_F(LogTest, ReadError) {
|
||||
ASSERT_EQ("OK", MatchError("read error"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, BadRecordType) {
|
||||
TEST_P(LogTest, BadRecordType) {
|
||||
Write("foo");
|
||||
// Type is stored in header[6]
|
||||
IncrementByte(6, 100);
|
||||
FixChecksum(0, 3);
|
||||
FixChecksum(0, 3, false);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("unknown record type"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) {
|
||||
TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) {
|
||||
Write("foo");
|
||||
ShrinkSize(4); // Drop all payload as well as a header byte
|
||||
ASSERT_EQ("EOF", Read());
|
||||
@ -421,17 +429,18 @@ TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) {
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, TruncatedTrailingRecordIsNotIgnored) {
|
||||
TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
|
||||
Write("foo");
|
||||
ShrinkSize(4); // Drop all payload as well as a header byte
|
||||
ASSERT_EQ("EOF", Read(/*report_eof_inconsistency*/ true));
|
||||
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
|
||||
// Truncated last record is ignored, not treated as an error
|
||||
ASSERT_GT(DroppedBytes(), 0U);
|
||||
ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, BadLength) {
|
||||
const int kPayloadSize = kBlockSize - kHeaderSize;
|
||||
TEST_P(LogTest, BadLength) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
const int kPayloadSize = kBlockSize - header_size;
|
||||
Write(BigString("bar", kPayloadSize));
|
||||
Write("foo");
|
||||
// Least significant size byte is stored in header[4].
|
||||
@ -441,7 +450,7 @@ TEST_F(LogTest, BadLength) {
|
||||
ASSERT_EQ("OK", MatchError("bad record length"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, BadLengthAtEndIsIgnored) {
|
||||
TEST_P(LogTest, BadLengthAtEndIsIgnored) {
|
||||
Write("foo");
|
||||
ShrinkSize(1);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
@ -449,63 +458,63 @@ TEST_F(LogTest, BadLengthAtEndIsIgnored) {
|
||||
ASSERT_EQ("", ReportMessage());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, BadLengthAtEndIsNotIgnored) {
|
||||
TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
|
||||
Write("foo");
|
||||
ShrinkSize(1);
|
||||
ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
|
||||
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
|
||||
ASSERT_GT(DroppedBytes(), 0U);
|
||||
ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ChecksumMismatch) {
|
||||
Write("foo");
|
||||
IncrementByte(0, 10);
|
||||
TEST_P(LogTest, ChecksumMismatch) {
|
||||
Write("foooooo");
|
||||
IncrementByte(0, 14);
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(10U, DroppedBytes());
|
||||
ASSERT_EQ(14U + 4 * !!GetParam(), DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("checksum mismatch"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, UnexpectedMiddleType) {
|
||||
TEST_P(LogTest, UnexpectedMiddleType) {
|
||||
Write("foo");
|
||||
SetByte(6, kMiddleType);
|
||||
FixChecksum(0, 3);
|
||||
SetByte(6, GetParam() ? kRecyclableMiddleType : kMiddleType);
|
||||
FixChecksum(0, 3, !!GetParam());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("missing start"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, UnexpectedLastType) {
|
||||
TEST_P(LogTest, UnexpectedLastType) {
|
||||
Write("foo");
|
||||
SetByte(6, kLastType);
|
||||
FixChecksum(0, 3);
|
||||
SetByte(6, GetParam() ? kRecyclableLastType : kLastType);
|
||||
FixChecksum(0, 3, !!GetParam());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("missing start"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, UnexpectedFullType) {
|
||||
TEST_P(LogTest, UnexpectedFullType) {
|
||||
Write("foo");
|
||||
Write("bar");
|
||||
SetByte(6, kFirstType);
|
||||
FixChecksum(0, 3);
|
||||
SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType);
|
||||
FixChecksum(0, 3, !!GetParam());
|
||||
ASSERT_EQ("bar", Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("partial record without end"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, UnexpectedFirstType) {
|
||||
TEST_P(LogTest, UnexpectedFirstType) {
|
||||
Write("foo");
|
||||
Write(BigString("bar", 100000));
|
||||
SetByte(6, kFirstType);
|
||||
FixChecksum(0, 3);
|
||||
SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType);
|
||||
FixChecksum(0, 3, !!GetParam());
|
||||
ASSERT_EQ(BigString("bar", 100000), Read());
|
||||
ASSERT_EQ("EOF", Read());
|
||||
ASSERT_EQ(3U, DroppedBytes());
|
||||
ASSERT_EQ("OK", MatchError("partial record without end"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, MissingLastIsIgnored) {
|
||||
TEST_P(LogTest, MissingLastIsIgnored) {
|
||||
Write(BigString("bar", kBlockSize));
|
||||
// Remove the LAST block, including header.
|
||||
ShrinkSize(14);
|
||||
@ -514,16 +523,16 @@ TEST_F(LogTest, MissingLastIsIgnored) {
|
||||
ASSERT_EQ(0U, DroppedBytes());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, MissingLastIsNotIgnored) {
|
||||
TEST_P(LogTest, MissingLastIsNotIgnored) {
|
||||
Write(BigString("bar", kBlockSize));
|
||||
// Remove the LAST block, including header.
|
||||
ShrinkSize(14);
|
||||
ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
|
||||
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
|
||||
ASSERT_GT(DroppedBytes(), 0U);
|
||||
ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, PartialLastIsIgnored) {
|
||||
TEST_P(LogTest, PartialLastIsIgnored) {
|
||||
Write(BigString("bar", kBlockSize));
|
||||
// Cause a bad record length in the LAST block.
|
||||
ShrinkSize(1);
|
||||
@ -532,18 +541,18 @@ TEST_F(LogTest, PartialLastIsIgnored) {
|
||||
ASSERT_EQ(0U, DroppedBytes());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, PartialLastIsNotIgnored) {
|
||||
TEST_P(LogTest, PartialLastIsNotIgnored) {
|
||||
Write(BigString("bar", kBlockSize));
|
||||
// Cause a bad record length in the LAST block.
|
||||
ShrinkSize(1);
|
||||
ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
|
||||
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
|
||||
ASSERT_GT(DroppedBytes(), 0U);
|
||||
ASSERT_EQ("OK", MatchError(
|
||||
"Corruption: truncated headerCorruption: "
|
||||
"error reading trailing data"));
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ErrorJoinsRecords) {
|
||||
TEST_P(LogTest, ErrorJoinsRecords) {
|
||||
// Consider two fragmented records:
|
||||
// first(R1) last(R1) first(R2) last(R2)
|
||||
// where the middle two fragments disappear. We do not want
|
||||
@ -566,46 +575,60 @@ TEST_F(LogTest, ErrorJoinsRecords) {
|
||||
ASSERT_GE(dropped, 2 * kBlockSize);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); }
|
||||
TEST_P(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); }
|
||||
|
||||
TEST_F(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); }
|
||||
TEST_P(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); }
|
||||
|
||||
TEST_F(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); }
|
||||
TEST_P(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); }
|
||||
|
||||
TEST_F(LogTest, ReadSecondStart) { CheckInitialOffsetRecord(10007, 1); }
|
||||
TEST_P(LogTest, ReadSecondStart) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
CheckInitialOffsetRecord(10000 + header_size, 1);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadThirdOneOff) { CheckInitialOffsetRecord(10008, 2); }
|
||||
TEST_P(LogTest, ReadThirdOneOff) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
CheckInitialOffsetRecord(10000 + header_size + 1, 2);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadThirdStart) { CheckInitialOffsetRecord(20014, 2); }
|
||||
TEST_P(LogTest, ReadThirdStart) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
CheckInitialOffsetRecord(20000 + 2 * header_size, 2);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadFourthOneOff) { CheckInitialOffsetRecord(20015, 3); }
|
||||
TEST_P(LogTest, ReadFourthOneOff) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
CheckInitialOffsetRecord(20000 + 2 * header_size + 1, 3);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadFourthFirstBlockTrailer) {
|
||||
TEST_P(LogTest, ReadFourthFirstBlockTrailer) {
|
||||
CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadFourthMiddleBlock) {
|
||||
TEST_P(LogTest, ReadFourthMiddleBlock) {
|
||||
CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadFourthLastBlock) {
|
||||
TEST_P(LogTest, ReadFourthLastBlock) {
|
||||
CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadFourthStart) {
|
||||
TEST_P(LogTest, ReadFourthStart) {
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
CheckInitialOffsetRecord(
|
||||
2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
|
||||
2 * (header_size + 1000) + (2 * log::kBlockSize - 1000) + 3 * header_size,
|
||||
3);
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
|
||||
TEST_P(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
|
||||
|
||||
TEST_F(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
|
||||
TEST_P(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
|
||||
|
||||
TEST_F(LogTest, ClearEofSingleBlock) {
|
||||
TEST_P(LogTest, ClearEofSingleBlock) {
|
||||
Write("foo");
|
||||
Write("bar");
|
||||
ForceEOF(3 + kHeaderSize + 2);
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
ForceEOF(3 + header_size + 2);
|
||||
ASSERT_EQ("foo", Read());
|
||||
UnmarkEOF();
|
||||
ASSERT_EQ("bar", Read());
|
||||
@ -617,12 +640,13 @@ TEST_F(LogTest, ClearEofSingleBlock) {
|
||||
ASSERT_TRUE(IsEOF());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ClearEofMultiBlock) {
|
||||
TEST_P(LogTest, ClearEofMultiBlock) {
|
||||
size_t num_full_blocks = 5;
|
||||
size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25;
|
||||
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
|
||||
size_t n = (kBlockSize - header_size) * num_full_blocks + 25;
|
||||
Write(BigString("foo", n));
|
||||
Write(BigString("bar", n));
|
||||
ForceEOF(n + num_full_blocks * kHeaderSize + 10);
|
||||
ForceEOF(n + num_full_blocks * header_size + header_size + 3);
|
||||
ASSERT_EQ(BigString("foo", n), Read());
|
||||
ASSERT_TRUE(IsEOF());
|
||||
UnmarkEOF();
|
||||
@ -634,7 +658,7 @@ TEST_F(LogTest, ClearEofMultiBlock) {
|
||||
ASSERT_TRUE(IsEOF());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ClearEofError) {
|
||||
TEST_P(LogTest, ClearEofError) {
|
||||
// If an error occurs during Read() in UnmarkEOF(), the records contained
|
||||
// in the buffer should be returned on subsequent calls of ReadRecord()
|
||||
// until no more full records are left, whereafter ReadRecord() should return
|
||||
@ -652,7 +676,7 @@ TEST_F(LogTest, ClearEofError) {
|
||||
ASSERT_EQ("EOF", Read());
|
||||
}
|
||||
|
||||
TEST_F(LogTest, ClearEofError2) {
|
||||
TEST_P(LogTest, ClearEofError2) {
|
||||
Write("foo");
|
||||
Write("bar");
|
||||
UnmarkEOF();
|
||||
@ -666,6 +690,8 @@ TEST_F(LogTest, ClearEofError2) {
|
||||
ASSERT_EQ("OK", MatchError("read error"));
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(bool, LogTest, ::testing::Values(0, 2));
|
||||
|
||||
} // namespace log
|
||||
} // namespace rocksdb
|
||||
|
||||
|
@ -18,8 +18,12 @@
|
||||
namespace rocksdb {
|
||||
namespace log {
|
||||
|
||||
Writer::Writer(unique_ptr<WritableFileWriter>&& dest)
|
||||
: dest_(std::move(dest)), block_offset_(0) {
|
||||
Writer::Writer(unique_ptr<WritableFileWriter>&& dest,
|
||||
uint64_t log_number, bool recycle_log_files)
|
||||
: dest_(std::move(dest)),
|
||||
block_offset_(0),
|
||||
log_number_(log_number),
|
||||
recycle_log_files_(recycle_log_files) {
|
||||
for (int i = 0; i <= kMaxRecordType; i++) {
|
||||
char t = static_cast<char>(i);
|
||||
type_crc_[i] = crc32c::Value(&t, 1);
|
||||
@ -33,6 +37,10 @@ Status Writer::AddRecord(const Slice& slice) {
|
||||
const char* ptr = slice.data();
|
||||
size_t left = slice.size();
|
||||
|
||||
// Header size varies depending on whether we are recycling or not.
|
||||
const int header_size =
|
||||
recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize;
|
||||
|
||||
// Fragment the record if necessary and emit it. Note that if slice
|
||||
// is empty, we still want to iterate once to emit a single
|
||||
// zero-length record
|
||||
@ -41,32 +49,34 @@ Status Writer::AddRecord(const Slice& slice) {
|
||||
do {
|
||||
const int leftover = kBlockSize - block_offset_;
|
||||
assert(leftover >= 0);
|
||||
if (leftover < kHeaderSize) {
|
||||
if (leftover < header_size) {
|
||||
// Switch to a new block
|
||||
if (leftover > 0) {
|
||||
// Fill the trailer (literal below relies on kHeaderSize being 7)
|
||||
assert(kHeaderSize == 7);
|
||||
dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
|
||||
// Fill the trailer (literal below relies on kHeaderSize and
|
||||
// kRecyclableHeaderSize being <= 11)
|
||||
assert(header_size <= 11);
|
||||
dest_->Append(
|
||||
Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", leftover));
|
||||
}
|
||||
block_offset_ = 0;
|
||||
}
|
||||
|
||||
// Invariant: we never leave < kHeaderSize bytes in a block.
|
||||
assert(static_cast<int>(kBlockSize) - block_offset_ >= kHeaderSize);
|
||||
// Invariant: we never leave < header_size bytes in a block.
|
||||
assert(static_cast<int>(kBlockSize) - block_offset_ >= header_size);
|
||||
|
||||
const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
|
||||
const size_t avail = kBlockSize - block_offset_ - header_size;
|
||||
const size_t fragment_length = (left < avail) ? left : avail;
|
||||
|
||||
RecordType type;
|
||||
const bool end = (left == fragment_length);
|
||||
if (begin && end) {
|
||||
type = kFullType;
|
||||
type = recycle_log_files_ ? kRecyclableFullType : kFullType;
|
||||
} else if (begin) {
|
||||
type = kFirstType;
|
||||
type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
|
||||
} else if (end) {
|
||||
type = kLastType;
|
||||
type = recycle_log_files_ ? kRecyclableLastType : kLastType;
|
||||
} else {
|
||||
type = kMiddleType;
|
||||
type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
|
||||
}
|
||||
|
||||
s = EmitPhysicalRecord(type, ptr, fragment_length);
|
||||
@ -79,28 +89,48 @@ Status Writer::AddRecord(const Slice& slice) {
|
||||
|
||||
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
|
||||
assert(n <= 0xffff); // Must fit in two bytes
|
||||
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
|
||||
|
||||
size_t header_size;
|
||||
char buf[kRecyclableHeaderSize];
|
||||
|
||||
// Format the header
|
||||
char buf[kHeaderSize];
|
||||
buf[4] = static_cast<char>(n & 0xff);
|
||||
buf[5] = static_cast<char>(n >> 8);
|
||||
buf[6] = static_cast<char>(t);
|
||||
|
||||
uint32_t crc = type_crc_[t];
|
||||
if (t < kRecyclableFullType) {
|
||||
// Legacy record format
|
||||
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
|
||||
header_size = kHeaderSize;
|
||||
} else {
|
||||
// Recyclable record format
|
||||
assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize);
|
||||
header_size = kRecyclableHeaderSize;
|
||||
|
||||
// Only encode low 32-bits of the 64-bit log number. This means
|
||||
// we will fail to detect an old record if we recycled a log from
|
||||
// ~4 billion logs ago, but that is effectively impossible, and
|
||||
// even if it were we'dbe far more likely to see a false positive
|
||||
// on the 32-bit CRC.
|
||||
EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
|
||||
crc = crc32c::Extend(crc, buf + 7, 4);
|
||||
}
|
||||
|
||||
// Compute the crc of the record type and the payload.
|
||||
uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
|
||||
crc = crc32c::Mask(crc); // Adjust for storage
|
||||
crc = crc32c::Extend(crc, ptr, n);
|
||||
crc = crc32c::Mask(crc); // Adjust for storage
|
||||
EncodeFixed32(buf, crc);
|
||||
|
||||
// Write the header and the payload
|
||||
Status s = dest_->Append(Slice(buf, kHeaderSize));
|
||||
Status s = dest_->Append(Slice(buf, header_size));
|
||||
if (s.ok()) {
|
||||
s = dest_->Append(Slice(ptr, n));
|
||||
if (s.ok()) {
|
||||
s = dest_->Flush();
|
||||
}
|
||||
}
|
||||
block_offset_ += kHeaderSize + n;
|
||||
block_offset_ += header_size + n;
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -43,7 +43,7 @@ namespace log {
|
||||
* Data is written out in kBlockSize chunks. If next record does not fit
|
||||
* into the space left, the leftover space will be padded with \0.
|
||||
*
|
||||
* Record format:
|
||||
* Legacy record format:
|
||||
*
|
||||
* +---------+-----------+-----------+--- ... ---+
|
||||
* |CRC (4B) | Size (2B) | Type (1B) | Payload |
|
||||
@ -57,13 +57,23 @@ namespace log {
|
||||
* blocks that are larger than kBlockSize
|
||||
* Payload = Byte stream as long as specified by the payload size
|
||||
*
|
||||
* Recyclable record format:
|
||||
*
|
||||
* +---------+-----------+-----------+----------------+--- ... ---+
|
||||
* |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload |
|
||||
* +---------+-----------+-----------+----------------+--- ... ---+
|
||||
*
|
||||
* Same as above, with the addition of
|
||||
* Log number = 32bit log file number, so that we can distinguish between
|
||||
* records written by the most recent log writer vs a previous one.
|
||||
*/
|
||||
class Writer {
|
||||
public:
|
||||
// Create a writer that will append data to "*dest".
|
||||
// "*dest" must be initially empty.
|
||||
// "*dest" must remain live while this Writer is in use.
|
||||
explicit Writer(unique_ptr<WritableFileWriter>&& dest);
|
||||
explicit Writer(unique_ptr<WritableFileWriter>&& dest,
|
||||
uint64_t log_number, bool recycle_log_files);
|
||||
~Writer();
|
||||
|
||||
Status AddRecord(const Slice& slice);
|
||||
@ -74,6 +84,8 @@ class Writer {
|
||||
private:
|
||||
unique_ptr<WritableFileWriter> dest_;
|
||||
int block_offset_; // Current offset in block
|
||||
uint64_t log_number_;
|
||||
bool recycle_log_files_;
|
||||
|
||||
// crc32c values for all supported record types. These are
|
||||
// pre-computed to reduce the overhead of computing the crc of the
|
||||
|
@ -249,8 +249,8 @@ class Repairer {
|
||||
// corruptions cause entire commits to be skipped instead of
|
||||
// propagating bad information (like overly large sequence
|
||||
// numbers).
|
||||
log::Reader reader(std::move(lfile_reader), &reporter,
|
||||
true /*enable checksum*/, 0 /*initial_offset*/);
|
||||
log::Reader reader(options_.info_log, std::move(lfile_reader), &reporter,
|
||||
true /*enable checksum*/, 0 /*initial_offset*/, log);
|
||||
|
||||
// Read all the records and add to a memtable
|
||||
std::string scratch;
|
||||
@ -413,7 +413,7 @@ class Repairer {
|
||||
{
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(file), env_options));
|
||||
log::Writer log(std::move(file_writer));
|
||||
log::Writer log(std::move(file_writer), 0, false);
|
||||
std::string record;
|
||||
edit_->EncodeTo(&record);
|
||||
status = log.AddRecord(record);
|
||||
|
@ -262,8 +262,10 @@ Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
|
||||
return s;
|
||||
}
|
||||
assert(file);
|
||||
currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,
|
||||
read_options_.verify_checksums_, 0));
|
||||
currentLogReader_.reset(new log::Reader(options_->info_log,
|
||||
std::move(file), &reporter_,
|
||||
read_options_.verify_checksums_, 0,
|
||||
logFile->LogNumber()));
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace rocksdb
|
||||
|
@ -82,6 +82,7 @@ class VersionBuilder::Rep {
|
||||
};
|
||||
|
||||
const EnvOptions& env_options_;
|
||||
Logger* info_log_;
|
||||
TableCache* table_cache_;
|
||||
VersionStorageInfo* base_vstorage_;
|
||||
LevelState* levels_;
|
||||
@ -89,9 +90,10 @@ class VersionBuilder::Rep {
|
||||
FileComparator level_nonzero_cmp_;
|
||||
|
||||
public:
|
||||
Rep(const EnvOptions& env_options, TableCache* table_cache,
|
||||
Rep(const EnvOptions& env_options, Logger* info_log, TableCache* table_cache,
|
||||
VersionStorageInfo* base_vstorage)
|
||||
: env_options_(env_options),
|
||||
info_log_(info_log),
|
||||
table_cache_(table_cache),
|
||||
base_vstorage_(base_vstorage) {
|
||||
levels_ = new LevelState[base_vstorage_->num_levels()];
|
||||
@ -335,15 +337,16 @@ class VersionBuilder::Rep {
|
||||
if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
|
||||
// File is deleted: do nothing
|
||||
} else {
|
||||
vstorage->AddFile(level, f);
|
||||
vstorage->AddFile(level, f, info_log_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
VersionBuilder::VersionBuilder(const EnvOptions& env_options,
|
||||
TableCache* table_cache,
|
||||
VersionStorageInfo* base_vstorage)
|
||||
: rep_(new Rep(env_options, table_cache, base_vstorage)) {}
|
||||
VersionStorageInfo* base_vstorage,
|
||||
Logger* info_log)
|
||||
: rep_(new Rep(env_options, info_log, table_cache, base_vstorage)) {}
|
||||
VersionBuilder::~VersionBuilder() { delete rep_; }
|
||||
void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
|
||||
rep_->CheckConsistency(vstorage);
|
||||
|
@ -24,7 +24,7 @@ class InternalStats;
|
||||
class VersionBuilder {
|
||||
public:
|
||||
VersionBuilder(const EnvOptions& env_options, TableCache* table_cache,
|
||||
VersionStorageInfo* base_vstorage);
|
||||
VersionStorageInfo* base_vstorage, Logger* info_log = nullptr);
|
||||
~VersionBuilder();
|
||||
void CheckConsistency(VersionStorageInfo* vstorage);
|
||||
void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
|
||||
|
@ -524,7 +524,7 @@ class BaseReferencedVersionBuilder {
|
||||
explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd)
|
||||
: version_builder_(new VersionBuilder(
|
||||
cfd->current()->version_set()->env_options(), cfd->table_cache(),
|
||||
cfd->current()->storage_info())),
|
||||
cfd->current()->storage_info(), cfd->ioptions()->info_log)),
|
||||
version_(cfd->current()) {
|
||||
version_->Ref();
|
||||
}
|
||||
@ -1262,13 +1262,27 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
|
||||
void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) {
|
||||
auto* level_files = &files_[level];
|
||||
// Must not overlap
|
||||
assert(level <= 0 || level_files->empty() ||
|
||||
internal_comparator_->Compare(
|
||||
(*level_files)[level_files->size() - 1]->largest, f->smallest) <
|
||||
0);
|
||||
#ifndef NDEBUG
|
||||
if (level > 0 && !level_files->empty() &&
|
||||
internal_comparator_->Compare(
|
||||
(*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) {
|
||||
auto* f2 = (*level_files)[level_files->size() - 1];
|
||||
if (info_log != nullptr) {
|
||||
Error(info_log, "Adding new file %" PRIu64
|
||||
" range (%s, %s) to level %d but overlapping "
|
||||
"with existing file %" PRIu64 " %s %s",
|
||||
f->fd.GetNumber(), f->smallest.DebugString(true).c_str(),
|
||||
f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(),
|
||||
f2->smallest.DebugString(true).c_str(),
|
||||
f2->largest.DebugString(true).c_str());
|
||||
LogFlush(info_log);
|
||||
}
|
||||
assert(false);
|
||||
}
|
||||
#endif
|
||||
f->refs++;
|
||||
level_files->push_back(f);
|
||||
}
|
||||
@ -2112,7 +2126,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(descriptor_file), opt_env_opts));
|
||||
descriptor_log_.reset(new log::Writer(std::move(file_writer)));
|
||||
descriptor_log_.reset(new log::Writer(std::move(file_writer), 0, false));
|
||||
s = WriteSnapshot(descriptor_log_.get());
|
||||
}
|
||||
}
|
||||
@ -2376,8 +2390,8 @@ Status VersionSet::Recover(
|
||||
{
|
||||
VersionSet::LogReporter reporter;
|
||||
reporter.status = &s;
|
||||
log::Reader reader(std::move(manifest_file_reader), &reporter,
|
||||
true /*checksum*/, 0 /*initial_offset*/);
|
||||
log::Reader reader(NULL, std::move(manifest_file_reader), &reporter,
|
||||
true /*checksum*/, 0 /*initial_offset*/, 0);
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
|
||||
@ -2629,8 +2643,8 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
|
||||
column_family_names.insert({0, kDefaultColumnFamilyName});
|
||||
VersionSet::LogReporter reporter;
|
||||
reporter.status = &s;
|
||||
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
|
||||
0 /*initial_offset*/);
|
||||
log::Reader reader(NULL, std::move(file_reader), &reporter, true /*checksum*/,
|
||||
0 /*initial_offset*/, 0);
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
|
||||
@ -2787,8 +2801,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
|
||||
{
|
||||
VersionSet::LogReporter reporter;
|
||||
reporter.status = &s;
|
||||
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
|
||||
0 /*initial_offset*/);
|
||||
log::Reader reader(NULL, std::move(file_reader), &reporter,
|
||||
true /*checksum*/, 0 /*initial_offset*/, 0);
|
||||
Slice record;
|
||||
std::string scratch;
|
||||
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
|
||||
@ -3040,7 +3054,8 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num,
|
||||
}
|
||||
file_reader.reset(new SequentialFileReader(std::move(file)));
|
||||
}
|
||||
log::Reader reader(std::move(file_reader), nullptr, true /*checksum*/, 0);
|
||||
log::Reader reader(NULL, std::move(file_reader), nullptr,
|
||||
true /*checksum*/, 0, 0);
|
||||
Slice r;
|
||||
std::string scratch;
|
||||
bool result = false;
|
||||
|
@ -97,7 +97,7 @@ class VersionStorageInfo {
|
||||
|
||||
void Reserve(int level, size_t size) { files_[level].reserve(size); }
|
||||
|
||||
void AddFile(int level, FileMetaData* f);
|
||||
void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr);
|
||||
|
||||
void SetFinalized();
|
||||
|
||||
|
@ -448,8 +448,8 @@ Status WalManager::ReadFirstLine(const std::string& fname,
|
||||
reporter.fname = fname.c_str();
|
||||
reporter.status = &status;
|
||||
reporter.ignore_error = !db_options_.paranoid_checks;
|
||||
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
|
||||
0 /*initial_offset*/);
|
||||
log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
|
||||
true /*checksum*/, 0 /*initial_offset*/, *sequence);
|
||||
std::string scratch;
|
||||
Slice record;
|
||||
|
||||
|
@ -77,7 +77,7 @@ class WalManagerTest : public testing::Test {
|
||||
ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_));
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(file), env_options_));
|
||||
current_log_writer_.reset(new log::Writer(std::move(file_writer)));
|
||||
current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false));
|
||||
}
|
||||
|
||||
void CreateArchiveLogs(int num_logs, int entries_per_log) {
|
||||
@ -127,7 +127,8 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) {
|
||||
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(file), EnvOptions()));
|
||||
log::Writer writer(std::move(file_writer));
|
||||
log::Writer writer(std::move(file_writer), 1,
|
||||
db_options_.recycle_log_file_num > 0);
|
||||
WriteBatch batch;
|
||||
batch.Put("foo", "bar");
|
||||
WriteBatchInternal::SetSequence(&batch, 10);
|
||||
|
180
include/posix/io_posix.h
Normal file
180
include/posix/io_posix.h
Normal file
@ -0,0 +1,180 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#pragma once
|
||||
#include <unistd.h>
|
||||
#include "rocksdb/env.h"
|
||||
|
||||
// For non linux platform, the following macros are used only as place
|
||||
// holder.
|
||||
#if !(defined OS_LINUX) && !(defined CYGWIN)
|
||||
#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
|
||||
#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
|
||||
#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
|
||||
#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
|
||||
#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */
|
||||
#endif
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
static Status IOError(const std::string& context, int err_number) {
|
||||
return Status::IOError(context, strerror(err_number));
|
||||
}
|
||||
|
||||
class PosixSequentialFile : public SequentialFile {
|
||||
private:
|
||||
std::string filename_;
|
||||
FILE* file_;
|
||||
int fd_;
|
||||
bool use_os_buffer_;
|
||||
|
||||
public:
|
||||
PosixSequentialFile(const std::string& fname, FILE* f,
|
||||
const EnvOptions& options);
|
||||
virtual ~PosixSequentialFile();
|
||||
|
||||
virtual Status Read(size_t n, Slice* result, char* scratch) override;
|
||||
virtual Status Skip(uint64_t n) override;
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override;
|
||||
};
|
||||
|
||||
class PosixRandomAccessFile : public RandomAccessFile {
|
||||
private:
|
||||
std::string filename_;
|
||||
int fd_;
|
||||
bool use_os_buffer_;
|
||||
|
||||
public:
|
||||
PosixRandomAccessFile(const std::string& fname, int fd,
|
||||
const EnvOptions& options);
|
||||
virtual ~PosixRandomAccessFile();
|
||||
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const override;
|
||||
#ifdef OS_LINUX
|
||||
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
|
||||
#endif
|
||||
virtual void Hint(AccessPattern pattern) override;
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override;
|
||||
};
|
||||
|
||||
class PosixWritableFile : public WritableFile {
|
||||
private:
|
||||
const std::string filename_;
|
||||
int fd_;
|
||||
uint64_t filesize_;
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
bool allow_fallocate_;
|
||||
bool fallocate_with_keep_size_;
|
||||
#endif
|
||||
|
||||
public:
|
||||
PosixWritableFile(const std::string& fname, int fd,
|
||||
const EnvOptions& options);
|
||||
~PosixWritableFile();
|
||||
|
||||
// Means Close() will properly take care of truncate
|
||||
// and it does not need any additional information
|
||||
virtual Status Truncate(uint64_t size) override { return Status::OK(); }
|
||||
virtual Status Close() override;
|
||||
virtual Status Append(const Slice& data) override;
|
||||
virtual Status Flush() override;
|
||||
virtual Status Sync() override;
|
||||
virtual Status Fsync() override;
|
||||
virtual bool IsSyncThreadSafe() const override;
|
||||
virtual uint64_t GetFileSize() override;
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override;
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
virtual Status Allocate(off_t offset, off_t len) override;
|
||||
virtual Status RangeSync(off_t offset, off_t nbytes) override;
|
||||
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
|
||||
#endif
|
||||
};
|
||||
|
||||
class PosixMmapReadableFile : public RandomAccessFile {
|
||||
private:
|
||||
int fd_;
|
||||
std::string filename_;
|
||||
void* mmapped_region_;
|
||||
size_t length_;
|
||||
|
||||
public:
|
||||
PosixMmapReadableFile(const int fd, const std::string& fname, void* base,
|
||||
size_t length, const EnvOptions& options);
|
||||
virtual ~PosixMmapReadableFile();
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const override;
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override;
|
||||
};
|
||||
|
||||
class PosixMmapFile : public WritableFile {
|
||||
private:
|
||||
std::string filename_;
|
||||
int fd_;
|
||||
size_t page_size_;
|
||||
size_t map_size_; // How much extra memory to map at a time
|
||||
char* base_; // The mapped region
|
||||
char* limit_; // Limit of the mapped region
|
||||
char* dst_; // Where to write next (in range [base_,limit_])
|
||||
char* last_sync_; // Where have we synced up to
|
||||
uint64_t file_offset_; // Offset of base_ in file
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
bool allow_fallocate_; // If false, fallocate calls are bypassed
|
||||
bool fallocate_with_keep_size_;
|
||||
#endif
|
||||
|
||||
// Roundup x to a multiple of y
|
||||
static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
|
||||
|
||||
size_t TruncateToPageBoundary(size_t s) {
|
||||
s -= (s & (page_size_ - 1));
|
||||
assert((s % page_size_) == 0);
|
||||
return s;
|
||||
}
|
||||
|
||||
Status MapNewRegion();
|
||||
Status UnmapCurrentRegion();
|
||||
Status Msync();
|
||||
|
||||
public:
|
||||
PosixMmapFile(const std::string& fname, int fd, size_t page_size,
|
||||
const EnvOptions& options);
|
||||
~PosixMmapFile();
|
||||
|
||||
// Means Close() will properly take care of truncate
|
||||
// and it does not need any additional information
|
||||
virtual Status Truncate(uint64_t size) override { return Status::OK(); }
|
||||
virtual Status Close() override;
|
||||
virtual Status Append(const Slice& data) override;
|
||||
virtual Status Flush() override;
|
||||
virtual Status Sync() override;
|
||||
virtual Status Fsync() override;
|
||||
virtual uint64_t GetFileSize() override;
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override;
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
virtual Status Allocate(off_t offset, off_t len) override;
|
||||
#endif
|
||||
};
|
||||
|
||||
class PosixDirectory : public Directory {
|
||||
public:
|
||||
explicit PosixDirectory(int fd) : fd_(fd) {}
|
||||
~PosixDirectory() { close(fd_); }
|
||||
|
||||
virtual Status Fsync() override {
|
||||
if (fsync(fd_) == -1) {
|
||||
return IOError("directory", errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
int fd_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
@ -572,6 +572,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll(
|
||||
rocksdb_options_t*, size_t);
|
||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num(
|
||||
rocksdb_options_t*, size_t);
|
||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num(
|
||||
rocksdb_options_t*, size_t);
|
||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit(
|
||||
rocksdb_options_t*, double);
|
||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_rate_limit(
|
||||
|
@ -139,6 +139,12 @@ class Env {
|
||||
unique_ptr<WritableFile>* result,
|
||||
const EnvOptions& options) = 0;
|
||||
|
||||
// Reuse an existing file by renaming it and opening it as writable.
|
||||
virtual Status ReuseWritableFile(const std::string& fname,
|
||||
const std::string& old_fname,
|
||||
unique_ptr<WritableFile>* result,
|
||||
const EnvOptions& options);
|
||||
|
||||
// Create an object that represents a directory. Will fail if directory
|
||||
// doesn't exist. If the directory exists, it will open the directory
|
||||
// and create a new Directory object.
|
||||
|
@ -958,6 +958,16 @@ struct DBOptions {
|
||||
// Default: 1000
|
||||
size_t keep_log_file_num;
|
||||
|
||||
// Recycle log files.
|
||||
// If non-zero, we will reuse previously written log files for new
|
||||
// logs, overwriting the old data. The value indicates how many
|
||||
// such files we will keep around at any point in time for later
|
||||
// use. This is more efficient because the blocks are already
|
||||
// allocated and fdatasync does not need to update the inode after
|
||||
// each write.
|
||||
// Default: 0
|
||||
size_t recycle_log_file_num;
|
||||
|
||||
// manifest file is rolled over on reaching this limit.
|
||||
// The older manifest file be deleted.
|
||||
// The default value is MAX_INT so that roll-over does not take place.
|
||||
|
@ -148,7 +148,8 @@ class StackableDB : public DB {
|
||||
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
||||
const Range* r, int n, uint64_t* sizes,
|
||||
bool include_memtable = false) override {
|
||||
return db_->GetApproximateSizes(column_family, r, n, sizes);
|
||||
return db_->GetApproximateSizes(column_family, r, n, sizes,
|
||||
include_memtable);
|
||||
}
|
||||
|
||||
using DB::CompactRange;
|
||||
|
@ -5,7 +5,7 @@
|
||||
#pragma once
|
||||
|
||||
#define ROCKSDB_MAJOR 4
|
||||
#define ROCKSDB_MINOR 1
|
||||
#define ROCKSDB_MINOR 2
|
||||
#define ROCKSDB_PATCH 0
|
||||
|
||||
// Do not use these. We made the mistake of declaring macros starting with
|
||||
|
@ -576,6 +576,33 @@ void Java_org_rocksdb_Options_setKeepLogFileNum(
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_Options
|
||||
* Method: recycleLogFiles
|
||||
* Signature: (J)J
|
||||
*/
|
||||
jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv* env, jobject jobj,
|
||||
jlong jhandle) {
|
||||
return reinterpret_cast<rocksdb::Options*>(jhandle)->recycle_log_file_num;
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_Options
|
||||
* Method: setRecycleLogFiles
|
||||
* Signature: (JJ)V
|
||||
*/
|
||||
void Java_org_rocksdb_Options_setRecycleLogFiles(JNIEnv* env, jobject jobj,
|
||||
jlong jhandle,
|
||||
jlong recycle_log_file_num) {
|
||||
rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num);
|
||||
if (s.ok()) {
|
||||
reinterpret_cast<rocksdb::Options*>(jhandle)->recycle_log_file_num =
|
||||
recycle_log_file_num;
|
||||
} else {
|
||||
rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_Options
|
||||
* Method: maxManifestFileSize
|
||||
@ -3533,6 +3560,32 @@ jlong Java_org_rocksdb_DBOptions_keepLogFileNum(
|
||||
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->keep_log_file_num;
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_DBOptions
|
||||
* Method: setRecycleLogFiles
|
||||
* Signature: (JJ)V
|
||||
*/
|
||||
void Java_org_rocksdb_DBOptions_setRecycleLogFileNum(
|
||||
JNIEnv* env, jobject jobj, jlong jhandle, jlong recycle_log_file_num) {
|
||||
rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num);
|
||||
if (s.ok()) {
|
||||
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->recycle_log_file_num =
|
||||
recycle_log_file_num;
|
||||
} else {
|
||||
rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_DBOptions
|
||||
* Method: recycleLogFiles
|
||||
* Signature: (J)J
|
||||
*/
|
||||
jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv* env, jobject jobj,
|
||||
jlong jhandle) {
|
||||
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->recycle_log_file_num;
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_DBOptions
|
||||
* Method: setMaxManifestFileSize
|
||||
|
1
src.mk
1
src.mk
@ -95,6 +95,7 @@ LIB_SOURCES = \
|
||||
util/env.cc \
|
||||
util/env_hdfs.cc \
|
||||
util/env_posix.cc \
|
||||
util/io_posix.cc \
|
||||
util/file_util.cc \
|
||||
util/file_reader_writer.cc \
|
||||
util/filter_policy.cc \
|
||||
|
@ -57,7 +57,6 @@ def get_dbname(test_name):
|
||||
return dbname
|
||||
|
||||
blackbox_default_params = {
|
||||
'db': lambda: get_dbname('blackbox'),
|
||||
# total time for this script to test db_stress
|
||||
"duration": 6000,
|
||||
# time for one db_stress instance to run
|
||||
@ -69,7 +68,6 @@ blackbox_default_params = {
|
||||
}
|
||||
|
||||
whitebox_default_params = {
|
||||
'db': lambda: get_dbname('whitebox'),
|
||||
"duration": 10000,
|
||||
"log2_keys_per_lock": 10,
|
||||
"nooverwritepercent": 1,
|
||||
@ -110,7 +108,6 @@ simple_default_params = {
|
||||
}
|
||||
|
||||
blackbox_simple_default_params = {
|
||||
'db': lambda: get_dbname('blackbox'),
|
||||
"duration": 6000,
|
||||
"interval": 120,
|
||||
"open_files": -1,
|
||||
@ -120,7 +117,6 @@ blackbox_simple_default_params = {
|
||||
}
|
||||
|
||||
whitebox_simple_default_params = {
|
||||
'db': lambda: get_dbname('whitebox'),
|
||||
"duration": 10000,
|
||||
"log2_keys_per_lock": 10,
|
||||
"nooverwritepercent": 1,
|
||||
@ -166,7 +162,7 @@ def gen_cmd(params):
|
||||
# in case of unsafe crashes in RocksDB.
|
||||
def blackbox_crash_main(args):
|
||||
cmd_params = gen_cmd_params(args)
|
||||
|
||||
dbname = get_dbname('blackbox')
|
||||
exit_time = time.time() + cmd_params['duration']
|
||||
|
||||
print("Running blackbox-crash-test with \n"
|
||||
@ -180,7 +176,7 @@ def blackbox_crash_main(args):
|
||||
run_had_errors = False
|
||||
killtime = time.time() + cmd_params['interval']
|
||||
|
||||
cmd = gen_cmd(cmd_params)
|
||||
cmd = gen_cmd(dict(cmd_params.items() + {'db': dbname}.items()))
|
||||
|
||||
child = subprocess.Popen([cmd],
|
||||
stderr=subprocess.PIPE, shell=True)
|
||||
@ -226,6 +222,7 @@ def blackbox_crash_main(args):
|
||||
# kill_random_test that causes rocksdb to crash at various points in code.
|
||||
def whitebox_crash_main(args):
|
||||
cmd_params = gen_cmd_params(args)
|
||||
dbname = get_dbname('whitebox')
|
||||
|
||||
cur_time = time.time()
|
||||
exit_time = cur_time + cmd_params['duration']
|
||||
@ -285,7 +282,8 @@ def whitebox_crash_main(args):
|
||||
"ops_per_thread": cmd_params['ops_per_thread'],
|
||||
}
|
||||
|
||||
cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()))
|
||||
cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()
|
||||
+ {'db': dbname}.items()))
|
||||
|
||||
print "Running:" + cmd + "\n"
|
||||
|
||||
|
@ -1438,7 +1438,21 @@ void DumpWalFile(std::string wal_file, bool print_header, bool print_values,
|
||||
}
|
||||
} else {
|
||||
StdErrReporter reporter;
|
||||
log::Reader reader(move(wal_file_reader), &reporter, true, 0);
|
||||
uint64_t log_number;
|
||||
FileType type;
|
||||
|
||||
// we need the log number, but ParseFilename expects dbname/NNN.log.
|
||||
string sanitized = wal_file;
|
||||
size_t lastslash = sanitized.rfind('/');
|
||||
if (lastslash != std::string::npos)
|
||||
sanitized = sanitized.substr(lastslash + 1);
|
||||
if (!ParseFileName(sanitized, &log_number, &type)) {
|
||||
// bogus input, carry on as best we can
|
||||
log_number = 0;
|
||||
}
|
||||
DBOptions db_options;
|
||||
log::Reader reader(db_options.info_log, move(wal_file_reader), &reporter,
|
||||
true, 0, log_number);
|
||||
string scratch;
|
||||
WriteBatch batch;
|
||||
Slice record;
|
||||
|
11
util/env.cc
11
util/env.cc
@ -27,6 +27,17 @@ uint64_t Env::GetThreadID() const {
|
||||
return hasher(std::this_thread::get_id());
|
||||
}
|
||||
|
||||
Status Env::ReuseWritableFile(const std::string& fname,
|
||||
const std::string& old_fname,
|
||||
unique_ptr<WritableFile>* result,
|
||||
const EnvOptions& options) {
|
||||
Status s = RenameFile(old_fname, fname);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
return NewWritableFile(fname, result, options);
|
||||
}
|
||||
|
||||
SequentialFile::~SequentialFile() {
|
||||
}
|
||||
|
||||
|
@ -6,13 +6,14 @@
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include <deque>
|
||||
#include <set>
|
||||
#include <dirent.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#if defined(OS_LINUX)
|
||||
#include <linux/fs.h>
|
||||
#endif
|
||||
#include <pthread.h>
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -26,25 +27,7 @@
|
||||
#include <sys/time.h>
|
||||
#include <sys/types.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#if defined(OS_LINUX)
|
||||
#include <linux/fs.h>
|
||||
#endif
|
||||
#include <signal.h>
|
||||
#include <algorithm>
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "port/port.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/posix_logger.h"
|
||||
#include "util/random.h"
|
||||
#include "util/iostats_context_imp.h"
|
||||
#include "util/string_util.h"
|
||||
#include "util/sync_point.h"
|
||||
#include "util/thread_status_updater.h"
|
||||
#include "util/thread_status_util.h"
|
||||
|
||||
// Get nano time includes
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#elif defined(__MACH__)
|
||||
@ -53,6 +36,19 @@
|
||||
#else
|
||||
#include <chrono>
|
||||
#endif
|
||||
#include <deque>
|
||||
#include <set>
|
||||
#include "port/port.h"
|
||||
#include "posix/io_posix.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/iostats_context_imp.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/posix_logger.h"
|
||||
#include "util/string_util.h"
|
||||
#include "util/sync_point.h"
|
||||
#include "util/thread_status_updater.h"
|
||||
#include "util/thread_status_util.h"
|
||||
|
||||
#if !defined(TMPFS_MAGIC)
|
||||
#define TMPFS_MAGIC 0x01021994
|
||||
@ -64,31 +60,10 @@
|
||||
#define EXT4_SUPER_MAGIC 0xEF53
|
||||
#endif
|
||||
|
||||
// For non linux platform, the following macros are used only as place
|
||||
// holder.
|
||||
#if !(defined OS_LINUX) && !(defined CYGWIN)
|
||||
#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
|
||||
#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
|
||||
#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
|
||||
#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
|
||||
#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */
|
||||
#endif
|
||||
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
|
||||
// A wrapper for fadvise, if the platform doesn't support fadvise,
|
||||
// it will simply return Status::NotSupport.
|
||||
int Fadvise(int fd, off_t offset, size_t len, int advice) {
|
||||
#ifdef OS_LINUX
|
||||
return posix_fadvise(fd, offset, len, advice);
|
||||
#else
|
||||
return 0; // simply do nothing.
|
||||
#endif
|
||||
}
|
||||
|
||||
ThreadStatusUpdater* CreateThreadStatusUpdater() {
|
||||
return new ThreadStatusUpdater();
|
||||
}
|
||||
@ -97,677 +72,6 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() {
|
||||
static std::set<std::string> lockedFiles;
|
||||
static port::Mutex mutex_lockedFiles;
|
||||
|
||||
static Status IOError(const std::string& context, int err_number) {
|
||||
return Status::IOError(context, strerror(err_number));
|
||||
}
|
||||
|
||||
#if defined(OS_LINUX)
|
||||
namespace {
|
||||
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
|
||||
if (max_size < kMaxVarint64Length*3) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct stat buf;
|
||||
int result = fstat(fd, &buf);
|
||||
if (result == -1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
long version = 0;
|
||||
result = ioctl(fd, FS_IOC_GETVERSION, &version);
|
||||
if (result == -1) {
|
||||
return 0;
|
||||
}
|
||||
uint64_t uversion = (uint64_t)version;
|
||||
|
||||
char* rid = id;
|
||||
rid = EncodeVarint64(rid, buf.st_dev);
|
||||
rid = EncodeVarint64(rid, buf.st_ino);
|
||||
rid = EncodeVarint64(rid, uversion);
|
||||
assert(rid >= id);
|
||||
return static_cast<size_t>(rid-id);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
class PosixSequentialFile: public SequentialFile {
|
||||
private:
|
||||
std::string filename_;
|
||||
FILE* file_;
|
||||
int fd_;
|
||||
bool use_os_buffer_;
|
||||
|
||||
public:
|
||||
PosixSequentialFile(const std::string& fname, FILE* f,
|
||||
const EnvOptions& options)
|
||||
: filename_(fname), file_(f), fd_(fileno(f)),
|
||||
use_os_buffer_(options.use_os_buffer) {
|
||||
}
|
||||
virtual ~PosixSequentialFile() { fclose(file_); }
|
||||
|
||||
virtual Status Read(size_t n, Slice* result, char* scratch) override {
|
||||
Status s;
|
||||
size_t r = 0;
|
||||
do {
|
||||
r = fread_unlocked(scratch, 1, n, file_);
|
||||
} while (r == 0 && ferror(file_) && errno == EINTR);
|
||||
*result = Slice(scratch, r);
|
||||
if (r < n) {
|
||||
if (feof(file_)) {
|
||||
// We leave status as ok if we hit the end of the file
|
||||
// We also clear the error so that the reads can continue
|
||||
// if a new data is written to the file
|
||||
clearerr(file_);
|
||||
} else {
|
||||
// A partial read with an error: return a non-ok status
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
if (!use_os_buffer_) {
|
||||
// we need to fadvise away the entire range of pages because
|
||||
// we do not want readahead pages to be cached.
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
virtual Status Skip(uint64_t n) override {
|
||||
if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// pread() based random-access
|
||||
class PosixRandomAccessFile: public RandomAccessFile {
|
||||
private:
|
||||
std::string filename_;
|
||||
int fd_;
|
||||
bool use_os_buffer_;
|
||||
|
||||
public:
|
||||
PosixRandomAccessFile(const std::string& fname, int fd,
|
||||
const EnvOptions& options)
|
||||
: filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
|
||||
assert(!options.use_mmap_reads || sizeof(void*) < 8);
|
||||
}
|
||||
virtual ~PosixRandomAccessFile() { close(fd_); }
|
||||
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const override {
|
||||
Status s;
|
||||
ssize_t r = -1;
|
||||
size_t left = n;
|
||||
char* ptr = scratch;
|
||||
while (left > 0) {
|
||||
r = pread(fd_, ptr, left, static_cast<off_t>(offset));
|
||||
|
||||
if (r <= 0) {
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
ptr += r;
|
||||
offset += r;
|
||||
left -= r;
|
||||
}
|
||||
|
||||
*result = Slice(scratch, (r < 0) ? 0 : n - left);
|
||||
if (r < 0) {
|
||||
// An error: return a non-ok status
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
if (!use_os_buffer_) {
|
||||
// we need to fadvise away the entire range of pages because
|
||||
// we do not want readahead pages to be cached.
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
virtual size_t GetUniqueId(char* id, size_t max_size) const override {
|
||||
return GetUniqueIdFromFile(fd_, id, max_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
virtual void Hint(AccessPattern pattern) override {
|
||||
switch(pattern) {
|
||||
case NORMAL:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
|
||||
break;
|
||||
case RANDOM:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
|
||||
break;
|
||||
case SEQUENTIAL:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
|
||||
break;
|
||||
case WILLNEED:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
|
||||
break;
|
||||
case DONTNEED:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// mmap() based random-access
|
||||
class PosixMmapReadableFile: public RandomAccessFile {
|
||||
private:
|
||||
int fd_;
|
||||
std::string filename_;
|
||||
void* mmapped_region_;
|
||||
size_t length_;
|
||||
|
||||
public:
|
||||
// base[0,length-1] contains the mmapped contents of the file.
|
||||
PosixMmapReadableFile(const int fd, const std::string& fname,
|
||||
void* base, size_t length,
|
||||
const EnvOptions& options)
|
||||
: fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
|
||||
fd_ = fd_ + 0; // suppress the warning for used variables
|
||||
assert(options.use_mmap_reads);
|
||||
assert(options.use_os_buffer);
|
||||
}
|
||||
virtual ~PosixMmapReadableFile() {
|
||||
int ret = munmap(mmapped_region_, length_);
|
||||
if (ret != 0) {
|
||||
fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
|
||||
mmapped_region_, length_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const override {
|
||||
Status s;
|
||||
if (offset > length_) {
|
||||
*result = Slice();
|
||||
return IOError(filename_, EINVAL);
|
||||
} else if (offset + n > length_) {
|
||||
n = static_cast<size_t>(length_ - offset);
|
||||
}
|
||||
*result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
|
||||
return s;
|
||||
}
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// We preallocate up to an extra megabyte and use memcpy to append new
|
||||
// data to the file. This is safe since we either properly close the
|
||||
// file before reading from it, or for log files, the reading code
|
||||
// knows enough to skip zero suffixes.
|
||||
class PosixMmapFile : public WritableFile {
|
||||
private:
|
||||
std::string filename_;
|
||||
int fd_;
|
||||
size_t page_size_;
|
||||
size_t map_size_; // How much extra memory to map at a time
|
||||
char* base_; // The mapped region
|
||||
char* limit_; // Limit of the mapped region
|
||||
char* dst_; // Where to write next (in range [base_,limit_])
|
||||
char* last_sync_; // Where have we synced up to
|
||||
uint64_t file_offset_; // Offset of base_ in file
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
bool allow_fallocate_; // If false, fallocate calls are bypassed
|
||||
bool fallocate_with_keep_size_;
|
||||
#endif
|
||||
|
||||
// Roundup x to a multiple of y
|
||||
static size_t Roundup(size_t x, size_t y) {
|
||||
return ((x + y - 1) / y) * y;
|
||||
}
|
||||
|
||||
size_t TruncateToPageBoundary(size_t s) {
|
||||
s -= (s & (page_size_ - 1));
|
||||
assert((s % page_size_) == 0);
|
||||
return s;
|
||||
}
|
||||
|
||||
Status UnmapCurrentRegion() {
|
||||
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
|
||||
if (base_ != nullptr) {
|
||||
int munmap_status = munmap(base_, limit_ - base_);
|
||||
if (munmap_status != 0) {
|
||||
return IOError(filename_, munmap_status);
|
||||
}
|
||||
file_offset_ += limit_ - base_;
|
||||
base_ = nullptr;
|
||||
limit_ = nullptr;
|
||||
last_sync_ = nullptr;
|
||||
dst_ = nullptr;
|
||||
|
||||
// Increase the amount we map the next time, but capped at 1MB
|
||||
if (map_size_ < (1<<20)) {
|
||||
map_size_ *= 2;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status MapNewRegion() {
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
assert(base_ == nullptr);
|
||||
|
||||
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
|
||||
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
|
||||
if (allow_fallocate_) {
|
||||
IOSTATS_TIMER_GUARD(allocate_nanos);
|
||||
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
|
||||
if (alloc_status != 0) {
|
||||
// fallback to posix_fallocate
|
||||
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
|
||||
}
|
||||
if (alloc_status != 0) {
|
||||
return Status::IOError("Error allocating space to file : " + filename_ +
|
||||
"Error : " + strerror(alloc_status));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
|
||||
void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
|
||||
fd_, file_offset_);
|
||||
if (ptr == MAP_FAILED) {
|
||||
return Status::IOError("MMap failed on " + filename_);
|
||||
}
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);
|
||||
|
||||
base_ = reinterpret_cast<char*>(ptr);
|
||||
limit_ = base_ + map_size_;
|
||||
dst_ = base_;
|
||||
last_sync_ = base_;
|
||||
return Status::OK();
|
||||
#else
|
||||
return Status::NotSupported("This platform doesn't support fallocate()");
|
||||
#endif
|
||||
}
|
||||
|
||||
Status Msync() {
|
||||
if (dst_ == last_sync_) {
|
||||
return Status::OK();
|
||||
}
|
||||
// Find the beginnings of the pages that contain the first and last
|
||||
// bytes to be synced.
|
||||
size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
|
||||
size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
|
||||
last_sync_ = dst_;
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
|
||||
if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
public:
|
||||
PosixMmapFile(const std::string& fname, int fd, size_t page_size,
|
||||
const EnvOptions& options)
|
||||
: filename_(fname),
|
||||
fd_(fd),
|
||||
page_size_(page_size),
|
||||
map_size_(Roundup(65536, page_size)),
|
||||
base_(nullptr),
|
||||
limit_(nullptr),
|
||||
dst_(nullptr),
|
||||
last_sync_(nullptr),
|
||||
file_offset_(0) {
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
allow_fallocate_ = options.allow_fallocate;
|
||||
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
|
||||
#endif
|
||||
assert((page_size & (page_size - 1)) == 0);
|
||||
assert(options.use_mmap_writes);
|
||||
}
|
||||
|
||||
|
||||
~PosixMmapFile() {
|
||||
if (fd_ >= 0) {
|
||||
PosixMmapFile::Close();
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status Append(const Slice& data) override {
|
||||
const char* src = data.data();
|
||||
size_t left = data.size();
|
||||
while (left > 0) {
|
||||
assert(base_ <= dst_);
|
||||
assert(dst_ <= limit_);
|
||||
size_t avail = limit_ - dst_;
|
||||
if (avail == 0) {
|
||||
Status s = UnmapCurrentRegion();
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
s = MapNewRegion();
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
|
||||
}
|
||||
|
||||
size_t n = (left <= avail) ? left : avail;
|
||||
memcpy(dst_, src, n);
|
||||
dst_ += n;
|
||||
src += n;
|
||||
left -= n;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Means Close() will properly take care of truncate
|
||||
// and it does not need any additional information
|
||||
virtual Status Truncate(uint64_t size) override {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status Close() override {
|
||||
Status s;
|
||||
size_t unused = limit_ - dst_;
|
||||
|
||||
s = UnmapCurrentRegion();
|
||||
if (!s.ok()) {
|
||||
s = IOError(filename_, errno);
|
||||
} else if (unused > 0) {
|
||||
// Trim the extra space at the end of the file
|
||||
if (ftruncate(fd_, file_offset_ - unused) < 0) {
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
|
||||
if (close(fd_) < 0) {
|
||||
if (s.ok()) {
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
|
||||
fd_ = -1;
|
||||
base_ = nullptr;
|
||||
limit_ = nullptr;
|
||||
return s;
|
||||
}
|
||||
|
||||
virtual Status Flush() override {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status Sync() override {
|
||||
if (fdatasync(fd_) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
|
||||
return Msync();
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush data as well as metadata to stable storage.
|
||||
*/
|
||||
virtual Status Fsync() override {
|
||||
if (fsync(fd_) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
|
||||
return Msync();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the size of valid data in the file. This will not match the
|
||||
* size that is returned from the filesystem because we use mmap
|
||||
* to extend file by map_size every time.
|
||||
*/
|
||||
virtual uint64_t GetFileSize() override {
|
||||
size_t used = dst_ - base_;
|
||||
return file_offset_ + used;
|
||||
}
|
||||
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
virtual Status Allocate(off_t offset, off_t len) override {
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
|
||||
int alloc_status = 0;
|
||||
if (allow_fallocate_) {
|
||||
alloc_status =
|
||||
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
|
||||
offset, len);
|
||||
}
|
||||
if (alloc_status == 0) {
|
||||
return Status::OK();
|
||||
} else {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// Use posix write to write data to a file.
|
||||
class PosixWritableFile : public WritableFile {
|
||||
private:
|
||||
const std::string filename_;
|
||||
int fd_;
|
||||
uint64_t filesize_;
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
bool allow_fallocate_;
|
||||
bool fallocate_with_keep_size_;
|
||||
#endif
|
||||
|
||||
public:
|
||||
PosixWritableFile(const std::string& fname, int fd, const EnvOptions& options)
|
||||
: filename_(fname), fd_(fd), filesize_(0) {
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
allow_fallocate_ = options.allow_fallocate;
|
||||
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
|
||||
#endif
|
||||
assert(!options.use_mmap_writes);
|
||||
}
|
||||
|
||||
~PosixWritableFile() {
|
||||
if (fd_ >= 0) {
|
||||
PosixWritableFile::Close();
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status Append(const Slice& data) override {
|
||||
const char* src = data.data();
|
||||
size_t left = data.size();
|
||||
while (left != 0) {
|
||||
ssize_t done = write(fd_, src, left);
|
||||
if (done < 0) {
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
left -= done;
|
||||
src += done;
|
||||
}
|
||||
filesize_ += data.size();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Means Close() will properly take care of truncate
|
||||
// and it does not need any additional information
|
||||
virtual Status Truncate(uint64_t size) override {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status Close() override {
|
||||
Status s;
|
||||
|
||||
size_t block_size;
|
||||
size_t last_allocated_block;
|
||||
GetPreallocationStatus(&block_size, &last_allocated_block);
|
||||
if (last_allocated_block > 0) {
|
||||
// trim the extra space preallocated at the end of the file
|
||||
// NOTE(ljin): we probably don't want to surface failure as an IOError,
|
||||
// but it will be nice to log these errors.
|
||||
int dummy __attribute__((unused));
|
||||
dummy = ftruncate(fd_, filesize_);
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
// in some file systems, ftruncate only trims trailing space if the
|
||||
// new file size is smaller than the current size. Calling fallocate
|
||||
// with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
|
||||
// blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
|
||||
// filesystems:
|
||||
// XFS (since Linux 2.6.38)
|
||||
// ext4 (since Linux 3.0)
|
||||
// Btrfs (since Linux 3.7)
|
||||
// tmpfs (since Linux 3.5)
|
||||
// We ignore error since failure of this operation does not affect
|
||||
// correctness.
|
||||
IOSTATS_TIMER_GUARD(allocate_nanos);
|
||||
if (allow_fallocate_) {
|
||||
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
|
||||
block_size * last_allocated_block - filesize_);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if (close(fd_) < 0) {
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
fd_ = -1;
|
||||
return s;
|
||||
}
|
||||
|
||||
// write out the cached data to the OS cache
|
||||
virtual Status Flush() override {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status Sync() override {
|
||||
if (fdatasync(fd_) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status Fsync() override {
|
||||
if (fsync(fd_) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual bool IsSyncThreadSafe() const override {
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual uint64_t GetFileSize() override { return filesize_; }
|
||||
|
||||
virtual Status InvalidateCache(size_t offset, size_t length) override {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
virtual Status Allocate(off_t offset, off_t len) override {
|
||||
TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
|
||||
IOSTATS_TIMER_GUARD(allocate_nanos);
|
||||
int alloc_status = 0;
|
||||
if (allow_fallocate_) {
|
||||
alloc_status =
|
||||
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
|
||||
offset, len);
|
||||
}
|
||||
if (alloc_status == 0) {
|
||||
return Status::OK();
|
||||
} else {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status RangeSync(off_t offset, off_t nbytes) override {
|
||||
if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
|
||||
return Status::OK();
|
||||
} else {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
virtual size_t GetUniqueId(char* id, size_t max_size) const override {
|
||||
return GetUniqueIdFromFile(fd_, id, max_size);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
class PosixDirectory : public Directory {
|
||||
public:
|
||||
explicit PosixDirectory(int fd) : fd_(fd) {}
|
||||
~PosixDirectory() {
|
||||
close(fd_);
|
||||
}
|
||||
|
||||
virtual Status Fsync() override {
|
||||
if (fsync(fd_) == -1) {
|
||||
return IOError("directory", errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
int fd_;
|
||||
};
|
||||
|
||||
static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
|
||||
mutex_lockedFiles.Lock();
|
||||
if (lock) {
|
||||
@ -806,12 +110,6 @@ static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
|
||||
return value;
|
||||
}
|
||||
|
||||
class PosixFileLock : public FileLock {
|
||||
public:
|
||||
int fd_;
|
||||
std::string filename;
|
||||
};
|
||||
|
||||
void PthreadCall(const char* label, int result) {
|
||||
if (result != 0) {
|
||||
fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
|
||||
@ -819,6 +117,12 @@ void PthreadCall(const char* label, int result) {
|
||||
}
|
||||
}
|
||||
|
||||
class PosixFileLock : public FileLock {
|
||||
public:
|
||||
int fd_;
|
||||
std::string filename;
|
||||
};
|
||||
|
||||
class PosixEnv : public Env {
|
||||
public:
|
||||
PosixEnv();
|
||||
@ -933,6 +237,50 @@ class PosixEnv : public Env {
|
||||
return s;
|
||||
}
|
||||
|
||||
virtual Status ReuseWritableFile(const std::string& fname,
|
||||
const std::string& old_fname,
|
||||
unique_ptr<WritableFile>* result,
|
||||
const EnvOptions& options) override {
|
||||
result->reset();
|
||||
Status s;
|
||||
int fd = -1;
|
||||
do {
|
||||
IOSTATS_TIMER_GUARD(open_nanos);
|
||||
fd = open(old_fname.c_str(), O_RDWR, 0644);
|
||||
} while (fd < 0 && errno == EINTR);
|
||||
if (fd < 0) {
|
||||
s = IOError(fname, errno);
|
||||
} else {
|
||||
SetFD_CLOEXEC(fd, &options);
|
||||
// rename into place
|
||||
if (rename(old_fname.c_str(), fname.c_str()) != 0) {
|
||||
Status r = IOError(old_fname, errno);
|
||||
close(fd);
|
||||
return r;
|
||||
}
|
||||
if (options.use_mmap_writes) {
|
||||
if (!checkedDiskForMmap_) {
|
||||
// this will be executed once in the program's lifetime.
|
||||
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
|
||||
if (!SupportsFastAllocate(fname)) {
|
||||
forceMmapOff = true;
|
||||
}
|
||||
checkedDiskForMmap_ = true;
|
||||
}
|
||||
}
|
||||
if (options.use_mmap_writes && !forceMmapOff) {
|
||||
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
|
||||
} else {
|
||||
// disable mmap writes
|
||||
EnvOptions no_mmap_writes_options = options;
|
||||
no_mmap_writes_options.use_mmap_writes = false;
|
||||
|
||||
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
virtual Status NewDirectory(const std::string& name,
|
||||
unique_ptr<Directory>* result) override {
|
||||
result->reset();
|
||||
|
637
util/io_posix.cc
Normal file
637
util/io_posix.cc
Normal file
@ -0,0 +1,637 @@
|
||||
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifdef ROCKSDB_LIB_IO_POSIX
|
||||
|
||||
#include "posix/io_posix.h"
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#if defined(OS_LINUX)
|
||||
#include <linux/fs.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#ifdef OS_LINUX
|
||||
#include <sys/statfs.h>
|
||||
#include <sys/syscall.h>
|
||||
#endif
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/iostats_context_imp.h"
|
||||
#include "util/posix_logger.h"
|
||||
#include "util/string_util.h"
|
||||
#include "util/sync_point.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// A wrapper for fadvise, if the platform doesn't support fadvise,
|
||||
// it will simply return Status::NotSupport.
|
||||
int Fadvise(int fd, off_t offset, size_t len, int advice) {
|
||||
#ifdef OS_LINUX
|
||||
return posix_fadvise(fd, offset, len, advice);
|
||||
#else
|
||||
return 0; // simply do nothing.
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* PosixSequentialFile
|
||||
*/
|
||||
PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* f,
|
||||
const EnvOptions& options)
|
||||
: filename_(fname),
|
||||
file_(f),
|
||||
fd_(fileno(f)),
|
||||
use_os_buffer_(options.use_os_buffer) {}
|
||||
|
||||
PosixSequentialFile::~PosixSequentialFile() { fclose(file_); }
|
||||
|
||||
Status PosixSequentialFile::Read(size_t n, Slice* result, char* scratch) {
|
||||
Status s;
|
||||
size_t r = 0;
|
||||
do {
|
||||
r = fread_unlocked(scratch, 1, n, file_);
|
||||
} while (r == 0 && ferror(file_) && errno == EINTR);
|
||||
*result = Slice(scratch, r);
|
||||
if (r < n) {
|
||||
if (feof(file_)) {
|
||||
// We leave status as ok if we hit the end of the file
|
||||
// We also clear the error so that the reads can continue
|
||||
// if a new data is written to the file
|
||||
clearerr(file_);
|
||||
} else {
|
||||
// A partial read with an error: return a non-ok status
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
if (!use_os_buffer_) {
|
||||
// we need to fadvise away the entire range of pages because
|
||||
// we do not want readahead pages to be cached.
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status PosixSequentialFile::Skip(uint64_t n) {
|
||||
if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(OS_LINUX)
|
||||
namespace {
|
||||
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
|
||||
if (max_size < kMaxVarint64Length * 3) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct stat buf;
|
||||
int result = fstat(fd, &buf);
|
||||
if (result == -1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
long version = 0;
|
||||
result = ioctl(fd, FS_IOC_GETVERSION, &version);
|
||||
if (result == -1) {
|
||||
return 0;
|
||||
}
|
||||
uint64_t uversion = (uint64_t)version;
|
||||
|
||||
char* rid = id;
|
||||
rid = EncodeVarint64(rid, buf.st_dev);
|
||||
rid = EncodeVarint64(rid, buf.st_ino);
|
||||
rid = EncodeVarint64(rid, uversion);
|
||||
assert(rid >= id);
|
||||
return static_cast<size_t>(rid - id);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* PosixRandomAccessFile
|
||||
*
|
||||
* pread() based random-access
|
||||
*/
|
||||
PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd,
|
||||
const EnvOptions& options)
|
||||
: filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
|
||||
assert(!options.use_mmap_reads || sizeof(void*) < 8);
|
||||
}
|
||||
|
||||
PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
|
||||
|
||||
Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const {
|
||||
Status s;
|
||||
ssize_t r = -1;
|
||||
size_t left = n;
|
||||
char* ptr = scratch;
|
||||
while (left > 0) {
|
||||
r = pread(fd_, ptr, left, static_cast<off_t>(offset));
|
||||
|
||||
if (r <= 0) {
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
ptr += r;
|
||||
offset += r;
|
||||
left -= r;
|
||||
}
|
||||
|
||||
*result = Slice(scratch, (r < 0) ? 0 : n - left);
|
||||
if (r < 0) {
|
||||
// An error: return a non-ok status
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
if (!use_os_buffer_) {
|
||||
// we need to fadvise away the entire range of pages because
|
||||
// we do not want readahead pages to be cached.
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
|
||||
return GetUniqueIdFromFile(fd_, id, max_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
void PosixRandomAccessFile::Hint(AccessPattern pattern) {
|
||||
switch (pattern) {
|
||||
case NORMAL:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
|
||||
break;
|
||||
case RANDOM:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
|
||||
break;
|
||||
case SEQUENTIAL:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
|
||||
break;
|
||||
case WILLNEED:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
|
||||
break;
|
||||
case DONTNEED:
|
||||
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Status PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* PosixMmapReadableFile
|
||||
*
|
||||
* mmap() based random-access
|
||||
*/
|
||||
// base[0,length-1] contains the mmapped contents of the file.
|
||||
PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
|
||||
const std::string& fname,
|
||||
void* base, size_t length,
|
||||
const EnvOptions& options)
|
||||
: fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
|
||||
fd_ = fd_ + 0; // suppress the warning for used variables
|
||||
assert(options.use_mmap_reads);
|
||||
assert(options.use_os_buffer);
|
||||
}
|
||||
|
||||
PosixMmapReadableFile::~PosixMmapReadableFile() {
|
||||
int ret = munmap(mmapped_region_, length_);
|
||||
if (ret != 0) {
|
||||
fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
|
||||
mmapped_region_, length_);
|
||||
}
|
||||
}
|
||||
|
||||
Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
|
||||
char* scratch) const {
|
||||
Status s;
|
||||
if (offset > length_) {
|
||||
*result = Slice();
|
||||
return IOError(filename_, EINVAL);
|
||||
} else if (offset + n > length_) {
|
||||
n = static_cast<size_t>(length_ - offset);
|
||||
}
|
||||
*result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
|
||||
return s;
|
||||
}
|
||||
|
||||
Status PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* PosixMmapFile
|
||||
*
|
||||
* We preallocate up to an extra megabyte and use memcpy to append new
|
||||
* data to the file. This is safe since we either properly close the
|
||||
* file before reading from it, or for log files, the reading code
|
||||
* knows enough to skip zero suffixes.
|
||||
*/
|
||||
Status PosixMmapFile::UnmapCurrentRegion() {
|
||||
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
|
||||
if (base_ != nullptr) {
|
||||
int munmap_status = munmap(base_, limit_ - base_);
|
||||
if (munmap_status != 0) {
|
||||
return IOError(filename_, munmap_status);
|
||||
}
|
||||
file_offset_ += limit_ - base_;
|
||||
base_ = nullptr;
|
||||
limit_ = nullptr;
|
||||
last_sync_ = nullptr;
|
||||
dst_ = nullptr;
|
||||
|
||||
// Increase the amount we map the next time, but capped at 1MB
|
||||
if (map_size_ < (1 << 20)) {
|
||||
map_size_ *= 2;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status PosixMmapFile::MapNewRegion() {
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
assert(base_ == nullptr);
|
||||
|
||||
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
|
||||
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
|
||||
if (allow_fallocate_) {
|
||||
IOSTATS_TIMER_GUARD(allocate_nanos);
|
||||
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
|
||||
if (alloc_status != 0) {
|
||||
// fallback to posix_fallocate
|
||||
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
|
||||
}
|
||||
if (alloc_status != 0) {
|
||||
return Status::IOError("Error allocating space to file : " + filename_ +
|
||||
"Error : " + strerror(alloc_status));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
|
||||
void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
|
||||
file_offset_);
|
||||
if (ptr == MAP_FAILED) {
|
||||
return Status::IOError("MMap failed on " + filename_);
|
||||
}
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);
|
||||
|
||||
base_ = reinterpret_cast<char*>(ptr);
|
||||
limit_ = base_ + map_size_;
|
||||
dst_ = base_;
|
||||
last_sync_ = base_;
|
||||
return Status::OK();
|
||||
#else
|
||||
return Status::NotSupported("This platform doesn't support fallocate()");
|
||||
#endif
|
||||
}
|
||||
|
||||
Status PosixMmapFile::Msync() {
|
||||
if (dst_ == last_sync_) {
|
||||
return Status::OK();
|
||||
}
|
||||
// Find the beginnings of the pages that contain the first and last
|
||||
// bytes to be synced.
|
||||
size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
|
||||
size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
|
||||
last_sync_ = dst_;
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
|
||||
if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
|
||||
const EnvOptions& options)
|
||||
: filename_(fname),
|
||||
fd_(fd),
|
||||
page_size_(page_size),
|
||||
map_size_(Roundup(65536, page_size)),
|
||||
base_(nullptr),
|
||||
limit_(nullptr),
|
||||
dst_(nullptr),
|
||||
last_sync_(nullptr),
|
||||
file_offset_(0) {
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
allow_fallocate_ = options.allow_fallocate;
|
||||
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
|
||||
#endif
|
||||
assert((page_size & (page_size - 1)) == 0);
|
||||
assert(options.use_mmap_writes);
|
||||
}
|
||||
|
||||
PosixMmapFile::~PosixMmapFile() {
|
||||
if (fd_ >= 0) {
|
||||
PosixMmapFile::Close();
|
||||
}
|
||||
}
|
||||
|
||||
Status PosixMmapFile::Append(const Slice& data) {
|
||||
const char* src = data.data();
|
||||
size_t left = data.size();
|
||||
while (left > 0) {
|
||||
assert(base_ <= dst_);
|
||||
assert(dst_ <= limit_);
|
||||
size_t avail = limit_ - dst_;
|
||||
if (avail == 0) {
|
||||
Status s = UnmapCurrentRegion();
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
s = MapNewRegion();
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
|
||||
}
|
||||
|
||||
size_t n = (left <= avail) ? left : avail;
|
||||
memcpy(dst_, src, n);
|
||||
dst_ += n;
|
||||
src += n;
|
||||
left -= n;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status PosixMmapFile::Close() {
|
||||
Status s;
|
||||
size_t unused = limit_ - dst_;
|
||||
|
||||
s = UnmapCurrentRegion();
|
||||
if (!s.ok()) {
|
||||
s = IOError(filename_, errno);
|
||||
} else if (unused > 0) {
|
||||
// Trim the extra space at the end of the file
|
||||
if (ftruncate(fd_, file_offset_ - unused) < 0) {
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
|
||||
if (close(fd_) < 0) {
|
||||
if (s.ok()) {
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
|
||||
fd_ = -1;
|
||||
base_ = nullptr;
|
||||
limit_ = nullptr;
|
||||
return s;
|
||||
}
|
||||
|
||||
Status PosixMmapFile::Flush() { return Status::OK(); }
|
||||
|
||||
Status PosixMmapFile::Sync() {
|
||||
if (fdatasync(fd_) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
|
||||
return Msync();
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush data as well as metadata to stable storage.
|
||||
*/
|
||||
Status PosixMmapFile::Fsync() {
|
||||
if (fsync(fd_) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
|
||||
return Msync();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the size of valid data in the file. This will not match the
|
||||
* size that is returned from the filesystem because we use mmap
|
||||
* to extend file by map_size every time.
|
||||
*/
|
||||
uint64_t PosixMmapFile::GetFileSize() {
|
||||
size_t used = dst_ - base_;
|
||||
return file_offset_ + used;
|
||||
}
|
||||
|
||||
Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
Status PosixMmapFile::Allocate(off_t offset, off_t len) {
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
|
||||
int alloc_status = 0;
|
||||
if (allow_fallocate_) {
|
||||
alloc_status = fallocate(
|
||||
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
|
||||
}
|
||||
if (alloc_status == 0) {
|
||||
return Status::OK();
|
||||
} else {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* PosixWritableFile
|
||||
*
|
||||
* Use posix write to write data to a file.
|
||||
*/
|
||||
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
|
||||
const EnvOptions& options)
|
||||
: filename_(fname), fd_(fd), filesize_(0) {
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
allow_fallocate_ = options.allow_fallocate;
|
||||
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
|
||||
#endif
|
||||
assert(!options.use_mmap_writes);
|
||||
}
|
||||
|
||||
PosixWritableFile::~PosixWritableFile() {
|
||||
if (fd_ >= 0) {
|
||||
PosixWritableFile::Close();
|
||||
}
|
||||
}
|
||||
|
||||
Status PosixWritableFile::Append(const Slice& data) {
|
||||
const char* src = data.data();
|
||||
size_t left = data.size();
|
||||
while (left != 0) {
|
||||
ssize_t done = write(fd_, src, left);
|
||||
if (done < 0) {
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
left -= done;
|
||||
src += done;
|
||||
}
|
||||
filesize_ += data.size();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status PosixWritableFile::Close() {
|
||||
Status s;
|
||||
|
||||
size_t block_size;
|
||||
size_t last_allocated_block;
|
||||
GetPreallocationStatus(&block_size, &last_allocated_block);
|
||||
if (last_allocated_block > 0) {
|
||||
// trim the extra space preallocated at the end of the file
|
||||
// NOTE(ljin): we probably don't want to surface failure as an IOError,
|
||||
// but it will be nice to log these errors.
|
||||
int dummy __attribute__((unused));
|
||||
dummy = ftruncate(fd_, filesize_);
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
// in some file systems, ftruncate only trims trailing space if the
|
||||
// new file size is smaller than the current size. Calling fallocate
|
||||
// with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
|
||||
// blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
|
||||
// filesystems:
|
||||
// XFS (since Linux 2.6.38)
|
||||
// ext4 (since Linux 3.0)
|
||||
// Btrfs (since Linux 3.7)
|
||||
// tmpfs (since Linux 3.5)
|
||||
// We ignore error since failure of this operation does not affect
|
||||
// correctness.
|
||||
IOSTATS_TIMER_GUARD(allocate_nanos);
|
||||
if (allow_fallocate_) {
|
||||
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
|
||||
block_size * last_allocated_block - filesize_);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if (close(fd_) < 0) {
|
||||
s = IOError(filename_, errno);
|
||||
}
|
||||
fd_ = -1;
|
||||
return s;
|
||||
}
|
||||
|
||||
// write out the cached data to the OS cache
|
||||
Status PosixWritableFile::Flush() { return Status::OK(); }
|
||||
|
||||
Status PosixWritableFile::Sync() {
|
||||
if (fdatasync(fd_) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status PosixWritableFile::Fsync() {
|
||||
if (fsync(fd_) < 0) {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
|
||||
|
||||
uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
|
||||
|
||||
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
|
||||
#ifndef OS_LINUX
|
||||
return Status::OK();
|
||||
#else
|
||||
// free OS pages
|
||||
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
||||
if (ret == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
return IOError(filename_, errno);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
Status PosixWritableFile::Allocate(off_t offset, off_t len) {
|
||||
TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
|
||||
IOSTATS_TIMER_GUARD(allocate_nanos);
|
||||
int alloc_status = 0;
|
||||
if (allow_fallocate_) {
|
||||
alloc_status = fallocate(
|
||||
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
|
||||
}
|
||||
if (alloc_status == 0) {
|
||||
return Status::OK();
|
||||
} else {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
|
||||
Status PosixWritableFile::RangeSync(off_t offset, off_t nbytes) {
|
||||
if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
|
||||
return Status::OK();
|
||||
} else {
|
||||
return IOError(filename_, errno);
|
||||
}
|
||||
}
|
||||
|
||||
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
|
||||
return GetUniqueIdFromFile(fd_, id, max_size);
|
||||
}
|
||||
#endif
|
||||
} // namespace rocksdb
|
||||
#endif
|
@ -232,6 +232,7 @@ DBOptions::DBOptions()
|
||||
max_log_file_size(0),
|
||||
log_file_time_to_roll(0),
|
||||
keep_log_file_num(1000),
|
||||
recycle_log_file_num(0),
|
||||
max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
|
||||
table_cache_numshardbits(4),
|
||||
WAL_ttl_seconds(0),
|
||||
@ -290,6 +291,7 @@ DBOptions::DBOptions(const Options& options)
|
||||
max_log_file_size(options.max_log_file_size),
|
||||
log_file_time_to_roll(options.log_file_time_to_roll),
|
||||
keep_log_file_num(options.keep_log_file_num),
|
||||
recycle_log_file_num(options.recycle_log_file_num),
|
||||
max_manifest_file_size(options.max_manifest_file_size),
|
||||
table_cache_numshardbits(options.table_cache_numshardbits),
|
||||
WAL_ttl_seconds(options.WAL_ttl_seconds),
|
||||
@ -348,6 +350,8 @@ void DBOptions::Dump(Logger* log) const {
|
||||
log_file_time_to_roll);
|
||||
Header(log, " Options.keep_log_file_num: %" ROCKSDB_PRIszt,
|
||||
keep_log_file_num);
|
||||
Header(log, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
|
||||
recycle_log_file_num);
|
||||
Header(log, " Options.allow_os_buffer: %d", allow_os_buffer);
|
||||
Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
|
||||
Header(log, " Options.allow_fallocate: %d", allow_fallocate);
|
||||
|
@ -207,6 +207,9 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
|
||||
{"keep_log_file_num",
|
||||
{offsetof(struct DBOptions, keep_log_file_num), OptionType::kSizeT,
|
||||
OptionVerificationType::kNormal}},
|
||||
{"recycle_log_file_num",
|
||||
{offsetof(struct DBOptions, recycle_log_file_num), OptionType::kSizeT,
|
||||
OptionVerificationType::kNormal}},
|
||||
{"log_file_time_to_roll",
|
||||
{offsetof(struct DBOptions, log_file_time_to_roll), OptionType::kSizeT,
|
||||
OptionVerificationType::kNormal}},
|
||||
|
@ -323,6 +323,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
|
||||
{"max_log_file_size", "37"},
|
||||
{"log_file_time_to_roll", "38"},
|
||||
{"keep_log_file_num", "39"},
|
||||
{"recycle_log_file_num", "5"},
|
||||
{"max_manifest_file_size", "40"},
|
||||
{"table_cache_numshardbits", "41"},
|
||||
{"WAL_ttl_seconds", "43"},
|
||||
@ -339,7 +340,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
|
||||
{"new_table_reader_for_compaction_inputs", "true"},
|
||||
{"compaction_readahead_size", "100"},
|
||||
{"bytes_per_sync", "47"},
|
||||
{"wal_bytes_per_sync", "48"}, };
|
||||
{"wal_bytes_per_sync", "48"},
|
||||
};
|
||||
|
||||
ColumnFamilyOptions base_cf_opt;
|
||||
ColumnFamilyOptions new_cf_opt;
|
||||
@ -431,6 +433,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
|
||||
ASSERT_EQ(new_db_opt.max_log_file_size, 37U);
|
||||
ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
|
||||
ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
|
||||
ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
|
||||
ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
|
||||
ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
|
||||
ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
|
||||
@ -692,6 +695,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
|
||||
db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
|
||||
db_opt->use_adaptive_mutex = rnd->Uniform(2);
|
||||
db_opt->use_fsync = rnd->Uniform(2);
|
||||
db_opt->recycle_log_file_num = rnd->Uniform(2);
|
||||
|
||||
// int options
|
||||
db_opt->max_background_compactions = rnd->Uniform(100);
|
||||
|
@ -10,8 +10,6 @@
|
||||
// Syncpoint prevents us building and running tests in release
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#if !defined(NDEBUG) || !defined(OS_WIN)
|
||||
|
||||
#ifndef OS_WIN
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
@ -350,16 +348,10 @@ TEST_F(DBTest, CheckpointCF) {
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if !defined(NDEBUG) || !defined(OS_WIN)
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
|
@ -588,8 +588,8 @@ JSONDocument::const_item_iterator::~const_item_iterator() {
|
||||
|
||||
JSONDocument::const_item_iterator::value_type
|
||||
JSONDocument::const_item_iterator::operator*() {
|
||||
return {std::string(it_->getKeyStr(), it_->klen()),
|
||||
JSONDocument(it_->value(), false)};
|
||||
return JSONDocument::const_item_iterator::value_type(std::string(it_->getKeyStr(), it_->klen()),
|
||||
JSONDocument(it_->value(), false));
|
||||
}
|
||||
|
||||
JSONDocument::ItemsIteratorGenerator::ItemsIteratorGenerator(
|
||||
|
Loading…
Reference in New Issue
Block a user