Merge branch 'master' into columnfamilies

Conflicts:
	HISTORY.md
	db/db_impl.cc
	db/db_impl.h
	db/db_iter.cc
	db/db_test.cc
	db/dbformat.h
	db/memtable.cc
	db/memtable_list.cc
	db/memtable_list.h
	db/table_cache.cc
	db/table_cache.h
	db/version_edit.h
	db/version_set.cc
	db/version_set.h
	db/write_batch.cc
	db/write_batch_test.cc
	include/rocksdb/options.h
	util/options.cc

commit 0143abdbb0
@@ -6,8 +6,12 @@
   executed in high priority thread pool.
 
 ## Unreleased (will be relased in 2.8)
-* By default, checksums are verified on every read from database
+## Unreleased
+
+### Public API changes
 
+* Removed arena.h from public header files.
+* By default, checksums are verified on every read from database
 
 ## 2.7.0 (01/28/2014)
 
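The changelog entry above makes checksum verification the default for every read. As a hedged illustration only (not part of this commit), the per-read switch is the standard `ReadOptions::verify_checksums` field, which a caller can still clear to opt out:

```cpp
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Sketch only: the new default verifies checksums on every read; a caller can
// still opt out per ReadOptions if it wants the old behaviour.
void ReadWithAndWithoutChecksum(rocksdb::DB* db) {
  std::string value;

  rocksdb::ReadOptions verified;        // verify_checksums now defaults to true
  db->Get(verified, "some-key", &value);

  rocksdb::ReadOptions unverified;
  unverified.verify_checksums = false;  // explicit opt-out for this read only
  db->Get(unverified, "some-key", &value);
}
```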
Makefile (21 lines changed)

@@ -6,11 +6,7 @@
 INSTALL_PATH ?= $(CURDIR)
 
 #-----------------------------------------------
-# Uncomment exactly one of the lines labelled (A), (B), and (C) below
-# to switch between compilation modes.
-
-# OPT ?= -DNDEBUG # (A) Production use (optimized mode)
-OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
+OPT += -fno-omit-frame-pointer -momit-leaf-frame-pointer
 #-----------------------------------------------
 
 # detect what platform we're building on
@@ -57,6 +53,7 @@ TESTS = \
 	auto_roll_logger_test \
 	block_test \
 	bloom_test \
+	dynamic_bloom_test \
 	c_test \
 	cache_test \
 	coding_test \
@@ -75,6 +72,7 @@ TESTS = \
 	merge_test \
 	redis_test \
 	reduce_levels_test \
+	plain_table_db_test \
 	simple_table_db_test \
 	skiplist_test \
 	stringappend_test \
@@ -93,6 +91,7 @@ TOOLS = \
 	db_repl_stress \
 	blob_store_bench
 
+
 PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS)
 BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench
 
@@ -143,11 +142,11 @@ all: $(LIBRARY) $(PROGRAMS)
 # Will also generate shared libraries.
 release:
 	$(MAKE) clean
-	OPT=-DNDEBUG $(MAKE) all -j32
+	OPT="-DNDEBUG -O2" $(MAKE) all -j32
 
 coverage:
 	$(MAKE) clean
-	COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check
+	COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check -j32
 	(cd coverage; ./coverage_test.sh)
 	# Delete intermediate files
 	find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
@@ -248,6 +247,9 @@ table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJEC
 bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
 
+dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
 
@@ -278,11 +280,14 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
 db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
 
+plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
 
 table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
 
 perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
@@ -47,7 +47,6 @@ fi
 #   ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
 # fi
 # fi
-
 set -e
 
 uncommitted_code=`git diff HEAD`
@@ -55,7 +54,6 @@ uncommitted_code=`git diff HEAD`
 # If there's no uncommitted changes, we assume user are doing post-commit
 # format check, in which case we'll check the modified lines from latest commit.
 # Otherwise, we'll check format of the uncommitted code only.
-format_last_commit=0
 if [ -z "$uncommitted_code" ]
 then
   # Check the format of last commit
@@ -44,6 +44,11 @@ $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
 tee -a $RECENT_REPORT &&
 echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
 
+# Unless otherwise specified, we'll not generate html report by default
+if [ -z "$HTML" ]; then
+  exit 0
+fi
+
 # Generate the html report. If we cannot find lcov in this machine, we'll simply
 # skip this step.
 echo "Generating the html coverage report..."
@@ -9,16 +9,16 @@
 
 #include "db/builder.h"
 
-#include "db/filename.h"
 #include "db/dbformat.h"
+#include "db/filename.h"
 #include "db/merge_helper.h"
 #include "db/table_cache.h"
 #include "db/version_edit.h"
 #include "rocksdb/db.h"
-#include "rocksdb/table.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
+#include "rocksdb/table.h"
 #include "table/block_based_table_builder.h"
 #include "util/stop_watch.h"
 
@@ -26,20 +26,18 @@ namespace rocksdb {
 
 class TableFactory;
 
-TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+TableBuilder* NewTableBuilder(const Options& options,
+                              const InternalKeyComparator& internal_comparator,
+                              WritableFile* file,
                               CompressionType compression_type) {
-  return options.table_factory->GetTableBuilder(options, file,
-                                                compression_type);
+  return options.table_factory->NewTableBuilder(options, internal_comparator,
+                                                file, compression_type);
 }
 
-Status BuildTable(const std::string& dbname,
-                  Env* env,
-                  const Options& options,
-                  const EnvOptions& soptions,
-                  TableCache* table_cache,
-                  Iterator* iter,
-                  FileMetaData* meta,
-                  const Comparator* user_comparator,
+Status BuildTable(const std::string& dbname, Env* env, const Options& options,
+                  const EnvOptions& soptions, TableCache* table_cache,
+                  Iterator* iter, FileMetaData* meta,
+                  const InternalKeyComparator& internal_comparator,
                   const SequenceNumber newest_snapshot,
                   const SequenceNumber earliest_seqno_in_memtable,
                   const CompressionType compression) {
@@ -64,8 +62,8 @@ Status BuildTable(const std::string& dbname,
     return s;
   }
 
-  TableBuilder* builder = GetTableBuilder(options, file.get(),
-                                          compression);
+  TableBuilder* builder =
+      NewTableBuilder(options, internal_comparator, file.get(), compression);
 
   // the first key is the smallest key
   Slice key = iter->key();
@@ -73,8 +71,8 @@ Status BuildTable(const std::string& dbname,
   meta->smallest_seqno = GetInternalKeySeqno(key);
   meta->largest_seqno = meta->smallest_seqno;
 
-  MergeHelper merge(user_comparator, options.merge_operator.get(),
-                    options.info_log.get(),
+  MergeHelper merge(internal_comparator.user_comparator(),
+                    options.merge_operator.get(), options.info_log.get(),
                     true /* internal key corruption is not ok */);
 
   if (purge) {
@@ -103,8 +101,8 @@ Status BuildTable(const std::string& dbname,
       // If the key is the same as the previous key (and it is not the
       // first key), then we skip it, since it is an older version.
      // Otherwise we output the key and mark it as the "new" previous key.
-      if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key,
-                                                     this_ikey.user_key)) {
+      if (!is_first_key && !internal_comparator.user_comparator()->Compare(
+                               prev_ikey.user_key, this_ikey.user_key)) {
        // seqno within the same key are in decreasing order
        assert(this_ikey.sequence < prev_ikey.sequence);
      } else {
@@ -202,10 +200,8 @@ Status BuildTable(const std::string& dbname,
 
   if (s.ok()) {
     // Verify that the table is usable
-    Iterator* it = table_cache->NewIterator(ReadOptions(),
-                                            soptions,
-                                            meta->number,
-                                            meta->file_size);
+    Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
+                                            internal_comparator, *meta);
     s = it->status();
     delete it;
   }
db/builder.h (17 lines changed)

@@ -24,23 +24,20 @@ class VersionEdit;
 class TableBuilder;
 class WritableFile;
 
-extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
-                                     CompressionType compression_type);
+extern TableBuilder* NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type);
 
 // Build a Table file from the contents of *iter. The generated file
 // will be named according to meta->number. On success, the rest of
 // *meta will be filled with metadata about the generated table.
 // If no data is present in *iter, meta->file_size will be set to
 // zero, and no Table file will be produced.
-extern Status BuildTable(const std::string& dbname,
-                         Env* env,
-                         const Options& options,
-                         const EnvOptions& soptions,
-                         TableCache* table_cache,
-                         Iterator* iter,
+extern Status BuildTable(const std::string& dbname, Env* env,
+                         const Options& options, const EnvOptions& soptions,
+                         TableCache* table_cache, Iterator* iter,
                          FileMetaData* meta,
-                         const Comparator* user_comparator,
+                         const InternalKeyComparator& internal_comparator,
                          const SequenceNumber newest_snapshot,
                          const SequenceNumber earliest_seqno_in_memtable,
                          const CompressionType compression);
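The header above renames `GetTableBuilder` to `NewTableBuilder` and threads an `InternalKeyComparator` through `BuildTable` instead of a bare user comparator. A rough engine-internal sketch of a call site under the new signature; the helper name, setup, and zero sequence numbers are illustrative assumptions, and this only compiles inside the RocksDB source tree:

```cpp
#include <string>
#include "db/builder.h"
#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"

// Illustrative helper: build one table file from an iterator using the new
// BuildTable signature shown in this diff.
rocksdb::Status WriteOneTable(const std::string& dbname, rocksdb::Env* env,
                              const rocksdb::Options& options,
                              const rocksdb::EnvOptions& soptions,
                              rocksdb::TableCache* table_cache,
                              rocksdb::Iterator* iter,
                              rocksdb::FileMetaData* meta) {
  // The comparator is now passed as an InternalKeyComparator; BuildTable pulls
  // the user comparator back out via internal_comparator.user_comparator().
  rocksdb::InternalKeyComparator internal_comparator(options.comparator);
  return rocksdb::BuildTable(dbname, env, options, soptions, table_cache, iter,
                             meta, internal_comparator,
                             /*newest_snapshot=*/0,
                             /*earliest_seqno_in_memtable=*/0,
                             options.compression);
}
```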
@@ -17,6 +17,7 @@
 #include "db/internal_stats.h"
 #include "db/compaction_picker.h"
 #include "db/table_properties_collector.h"
+#include "util/autovector.h"
 #include "util/hash_skiplist_rep.h"
 
 namespace rocksdb {
@@ -184,7 +185,7 @@ ColumnFamilyData::~ColumnFamilyData() {
   if (mem_ != nullptr) {
     delete mem_->Unref();
   }
-  std::vector<MemTable*> to_delete;
+  autovector<MemTable*> to_delete;
   imm_.current()->Unref(&to_delete);
   for (MemTable* m : to_delete) {
     delete m;
@@ -16,7 +16,7 @@
 
 #include "rocksdb/options.h"
 #include "rocksdb/env.h"
-#include "db/memtablelist.h"
+#include "db/memtable_list.h"
 #include "db/write_batch_internal.h"
 #include "db/table_cache.h"
 
@@ -40,7 +40,7 @@ struct SuperVersion {
   // We need to_delete because during Cleanup(), imm->Unref() returns
   // all memtables that we need to free through this vector. We then
   // delete all those memtables outside of mutex, during destruction
-  std::vector<MemTable*> to_delete;
+  autovector<MemTable*> to_delete;
 
   // should be called outside the mutex
   SuperVersion();
@@ -24,6 +24,7 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/statistics.h"
+#include "rocksdb/perf_context.h"
 #include "port/port.h"
 #include "util/bit_set.h"
 #include "util/crc32c.h"
@@ -389,6 +390,8 @@ DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
 DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
              " this is greater than 0.");
 
+DEFINE_int32(perf_level, 0, "Level of perf collection");
+
 static bool ValidateRateLimit(const char* flagname, double value) {
   static constexpr double EPSILON = 1e-10;
   if ( value < -EPSILON ) {
@@ -728,6 +731,7 @@ struct SharedState {
   port::Mutex mu;
   port::CondVar cv;
   int total;
+  int perf_level;
 
   // Each thread goes through the following states:
   //    (1) initializing
@@ -739,7 +743,7 @@
   long num_done;
   bool start;
 
-  SharedState() : cv(&mu) { }
+  SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
 };
 
 // Per-thread state for concurrent executions of the same benchmark.
@@ -847,6 +851,7 @@ class Benchmark {
         fprintf(stdout, "Memtablerep: vector\n");
         break;
     }
+    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
 
     PrintWarnings();
     fprintf(stdout, "------------------------------------------------\n");
@@ -1202,6 +1207,7 @@ class Benchmark {
       }
     }
 
+    SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
     thread->stats.Start(thread->tid);
     (arg->bm->*(arg->method))(thread);
     thread->stats.Stop();
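The db_bench hunks above add a `--perf_level` flag and call `SetPerfLevel()` in each worker thread so per-operation timers such as `get_snapshot_time` accumulate in `perf_context`. A hedged sketch of the same pattern in application code; it assumes the `PerfLevel` enum, the thread-local `perf_context` object, and `PerfContext::Reset()` exposed by `rocksdb/perf_context.h`:

```cpp
#include <cstdio>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"

// Sketch only: time the internals of a single Get() with timing enabled.
void TimedGet(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::SetPerfLevel(rocksdb::kEnableTime);  // same call db_bench now makes per thread
  rocksdb::perf_context.Reset();                // clear counters from earlier operations

  std::string value;
  db->Get(rocksdb::ReadOptions(), key, &value);

  std::printf("get_snapshot_time = %llu ns\n",
              (unsigned long long)rocksdb::perf_context.get_snapshot_time);
}
```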
db/db_impl.cc (335 lines changed)

@@ -22,13 +22,13 @@
 #include <vector>
 
 #include "db/builder.h"
-#include "db/dbformat.h"
 #include "db/db_iter.h"
+#include "db/dbformat.h"
 #include "db/filename.h"
 #include "db/log_reader.h"
 #include "db/log_writer.h"
 #include "db/memtable.h"
-#include "db/memtablelist.h"
+#include "db/memtable_list.h"
 #include "db/merge_context.h"
 #include "db/merge_helper.h"
 #include "db/prefix_filter_iterator.h"
@@ -48,12 +48,13 @@
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
-#include "port/port.h"
 #include "table/block.h"
 #include "table/block_based_table_factory.h"
 #include "table/merger.h"
+#include "table/table_builder.h"
 #include "table/two_level_iterator.h"
 #include "util/auto_roll_logger.h"
+#include "util/autovector.h"
 #include "util/build_version.h"
 #include "util/coding.h"
 #include "util/hash_skiplist_rep.h"
@@ -61,13 +62,12 @@
 #include "util/mutexlock.h"
 #include "util/perf_context_imp.h"
 #include "util/stop_watch.h"
-#include "util/autovector.h"
 
 namespace rocksdb {
 
 const std::string default_column_family_name("default");
 
-void dumpLeveldbBuildVersion(Logger * log);
+void DumpLeveldbBuildVersion(Logger * log);
 
 // Information kept for every waiting writer
 struct DBImpl::Writer {
@@ -141,7 +141,10 @@ Options SanitizeOptions(const std::string& dbname,
 
 DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
   DBOptions result = src;
-  ClipToRange(&result.max_open_files, 20, 1000000);
+  // result.max_open_files means an "infinite" open files.
+  if (result.max_open_files != -1) {
+    ClipToRange(&result.max_open_files, 20, 1000000);
+  }
   if (result.max_background_flushes == 0) {
     result.max_background_flushes = 1;
   }
@@ -210,10 +213,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
     : env_(options.env),
       dbname_(dbname),
       options_(SanitizeOptions(dbname, options)),
-      // Reserve ten files or so for other uses and give the rest to TableCache.
-      table_cache_(NewLRUCache(options_.max_open_files - 10,
-                               options_.table_cache_numshardbits,
-                               options_.table_cache_remove_scan_count_limit)),
       db_lock_(nullptr),
       mutex_(options.use_adaptive_mutex),
       shutting_down_(nullptr),
@@ -239,18 +238,27 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
 
   env_->GetAbsolutePath(dbname, &db_absolute_path_);
 
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Give a large number for setting of "infinite" open files.
+  const int table_cache_size =
+      (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10;
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  table_cache_ =
+      NewLRUCache(table_cache_size, options_.table_cache_numshardbits,
+                  options_.table_cache_remove_scan_count_limit);
+
   versions_.reset(
       new VersionSet(dbname_, &options_, storage_options_, table_cache_.get()));
   column_family_memtables_.reset(
       new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
 
-  dumpLeveldbBuildVersion(options_.info_log.get());
+  DumpLeveldbBuildVersion(options_.info_log.get());
   // TODO(icanadi) dump DBOptions and ColumnFamilyOptions separately
   // options_.Dump(options_.info_log.get());
 
   char name[100];
-  Status st = env_->GetHostName(name, 100L);
-  if (st.ok()) {
+  Status s = env_->GetHostName(name, 100L);
+  if (s.ok()) {
     host_name_ = name;
   } else {
     Log(options_.info_log, "Can't get hostname, use localhost as host name.");
@@ -283,6 +291,10 @@ DBImpl::~DBImpl() {
     env_->UnlockFile(db_lock_);
   }
 
+  // versions need to be destroyed before table_cache since it can hold
+  // references to table_cache.
+  versions_.reset();
+
   LogFlush(options_.info_log);
 }
 
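The SanitizeOptions and constructor hunks above give `max_open_files == -1` a new meaning: the 20..1000000 clipping is skipped and the table cache is sized to a large fixed value, so table files effectively stay open without limit. A minimal configuration sketch under that assumption (path and option values are illustrative):

```cpp
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Sketch only: open a DB with an effectively unbounded table cache.
rocksdb::Status OpenWithUnlimitedOpenFiles(const std::string& path,
                                           rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // -1 now means "infinite" open files: SanitizeOptions no longer clips it and
  // the table cache is sized to a large fixed value instead.
  options.max_open_files = -1;
  return rocksdb::DB::Open(options, path, db);
}
```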
@@ -396,7 +408,7 @@ void DBImpl::MaybeDumpStats() {
 }
 
 // Returns the list of live files in 'sst_live' and the list
-// of all files in the filesystem in 'all_files'.
+// of all files in the filesystem in 'candidate_files'.
 // no_full_scan = true -- never do the full scan using GetChildren()
 // force = false -- don't force the full scan, except every
 //  options_.delete_obsolete_files_period_micros
@@ -448,15 +460,18 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state,
   versions_->AddLiveFiles(&deletion_state.sst_live);
 
   if (doing_the_full_scan) {
-    // set of all files in the directory
-    env_->GetChildren(dbname_, &deletion_state.all_files); // Ignore errors
+    // set of all files in the directory. We'll exclude files that are still
+    // alive in the subsequent processings.
+    env_->GetChildren(
+        dbname_, &deletion_state.candidate_files
+    ); // Ignore errors
 
     //Add log files in wal_dir
     if (options_.wal_dir != dbname_) {
       std::vector<std::string> log_files;
       env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors
-      deletion_state.all_files.insert(
-        deletion_state.all_files.end(),
+      deletion_state.candidate_files.insert(
+        deletion_state.candidate_files.end(),
         log_files.begin(),
         log_files.end()
       );
@@ -469,11 +484,10 @@
 // files in sst_delete_files and log_delete_files.
 // It is not necessary to hold the mutex when invoking this method.
 void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
-
   // check if there is anything to do
-  if (!state.all_files.size() &&
-      !state.sst_delete_files.size() &&
-      !state.log_delete_files.size()) {
+  if (state.candidate_files.empty() &&
+      state.sst_delete_files.empty() &&
+      state.log_delete_files.empty()) {
     return;
   }
 
@@ -483,100 +497,114 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) {
   if (state.manifest_file_number == 0) {
     return;
   }
 
-  uint64_t number;
-  FileType type;
   std::vector<std::string> old_log_files;
 
   // Now, convert live list to an unordered set, WITHOUT mutex held;
   // set is slow.
-  std::unordered_set<uint64_t> live_set(state.sst_live.begin(),
-                                        state.sst_live.end());
+  std::unordered_set<uint64_t> sst_live(
+      state.sst_live.begin(), state.sst_live.end()
+  );
 
-  state.all_files.reserve(state.all_files.size() +
-                          state.sst_delete_files.size());
+  auto& candidate_files = state.candidate_files;
+  candidate_files.reserve(
+      candidate_files.size() +
+      state.sst_delete_files.size() +
+      state.log_delete_files.size());
+  // We may ignore the dbname when generating the file names.
+  const char* kDumbDbName = "";
   for (auto file : state.sst_delete_files) {
-    state.all_files.push_back(TableFileName("", file->number).substr(1));
+    candidate_files.push_back(
+        TableFileName(kDumbDbName, file->number).substr(1)
+    );
     delete file;
   }
 
-  state.all_files.reserve(state.all_files.size() +
-                          state.log_delete_files.size());
-  for (auto filenum : state.log_delete_files) {
-    if (filenum > 0) {
-      state.all_files.push_back(LogFileName("", filenum).substr(1));
+  for (auto file_num : state.log_delete_files) {
+    if (file_num > 0) {
+      candidate_files.push_back(
+          LogFileName(kDumbDbName, file_num).substr(1)
+      );
     }
   }
 
-  // dedup state.all_files so we don't try to delete the same
+  // dedup state.candidate_files so we don't try to delete the same
   // file twice
-  sort(state.all_files.begin(), state.all_files.end());
-  auto unique_end = unique(state.all_files.begin(), state.all_files.end());
+  sort(candidate_files.begin(), candidate_files.end());
+  candidate_files.erase(
+      unique(candidate_files.begin(), candidate_files.end()),
+      candidate_files.end()
+  );
 
-  for (size_t i = 0; state.all_files.begin() + i < unique_end; i++) {
-    if (ParseFileName(state.all_files[i], &number, &type)) {
-      bool keep = true;
-      switch (type) {
-        case kLogFile:
-          keep = ((number >= state.log_number) ||
-                  (number == state.prev_log_number));
-          break;
-        case kDescriptorFile:
-          // Keep my manifest file, and any newer incarnations'
-          // (in case there is a race that allows other incarnations)
-          keep = (number >= state.manifest_file_number);
-          break;
-        case kTableFile:
-          keep = (live_set.find(number) != live_set.end());
-          break;
-        case kTempFile:
-          // Any temp files that are currently being written to must
-          // be recorded in pending_outputs_, which is inserted into "live"
-          keep = (live_set.find(number) != live_set.end());
-          break;
-        case kInfoLogFile:
-          keep = true;
-          if (number != 0) {
-            old_log_files.push_back(state.all_files[i]);
-          }
-          break;
-        case kCurrentFile:
-        case kDBLockFile:
-        case kIdentityFile:
-        case kMetaDatabase:
-          keep = true;
-          break;
-      }
-
-      if (!keep) {
-        if (type == kTableFile) {
-          // evict from cache
-          TableCache::Evict(table_cache_.get(), number);
-        }
-        std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) +
-            "/" + state.all_files[i];
-        Log(options_.info_log,
-            "Delete type=%d #%lu",
-            int(type),
-            (unsigned long)number);
-        Status st;
-        if (type == kLogFile && (options_.WAL_ttl_seconds > 0 ||
-              options_.WAL_size_limit_MB > 0)) {
-          st = env_->RenameFile(fname,
-              ArchivedLogFileName(options_.wal_dir, number));
-          if (!st.ok()) {
-            Log(options_.info_log,
-                "RenameFile logfile #%lu FAILED -- %s\n",
-                (unsigned long)number, st.ToString().c_str());
-          }
-        } else {
-          st = env_->DeleteFile(fname);
-          if (!st.ok()) {
-            Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n",
-                int(type), (unsigned long)number, st.ToString().c_str());
-          }
-        }
-      }
-    }
+  for (const auto& to_delete : candidate_files) {
+    uint64_t number;
+    FileType type;
+    // Ignore file if we cannot recognize it.
+    if (!ParseFileName(to_delete, &number, &type)) {
+      continue;
+    }
+
+    bool keep = true;
+    switch (type) {
+      case kLogFile:
+        keep = ((number >= state.log_number) ||
+                (number == state.prev_log_number));
+        break;
+      case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations'
+        // (in case there is a race that allows other incarnations)
+        keep = (number >= state.manifest_file_number);
+        break;
+      case kTableFile:
+        keep = (sst_live.find(number) != sst_live.end());
+        break;
+      case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into "live"
+        keep = (sst_live.find(number) != sst_live.end());
+        break;
+      case kInfoLogFile:
+        keep = true;
+        if (number != 0) {
+          old_log_files.push_back(to_delete);
+        }
+        break;
+      case kCurrentFile:
+      case kDBLockFile:
+      case kIdentityFile:
+      case kMetaDatabase:
+        keep = true;
+        break;
+    }
+
+    if (keep) {
+      continue;
+    }
+
+    if (type == kTableFile) {
+      // evict from cache
+      TableCache::Evict(table_cache_.get(), number);
+    }
+    std::string fname = ((type == kLogFile) ? options_.wal_dir : dbname_) +
+        "/" + to_delete;
+    Log(options_.info_log,
+        "Delete type=%d #%lu",
+        int(type),
+        (unsigned long)number);
+
+    if (type == kLogFile &&
+        (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) {
+      Status s = env_->RenameFile(fname,
+          ArchivedLogFileName(options_.wal_dir, number));
+      if (!s.ok()) {
+        Log(options_.info_log,
+            "RenameFile logfile #%lu FAILED -- %s\n",
+            (unsigned long)number, s.ToString().c_str());
+      }
+    } else {
+      Status s = env_->DeleteFile(fname);
+      if (!s.ok()) {
+        Log(options_.info_log, "Delete type=%d #%lu FAILED -- %s\n",
+            int(type), (unsigned long)number, s.ToString().c_str());
+      }
+    }
   }
 }
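The rewritten `PurgeObsoleteFiles` above dedups `candidate_files` with the standard sort/unique/erase idiom before deciding what to delete. A small self-contained illustration of that idiom, using hypothetical file names:

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothetical candidate list with a duplicate entry.
  std::vector<std::string> candidate_files = {"000012.sst", "000007.log",
                                              "000012.sst", "MANIFEST-000005"};

  // sort + unique + erase: std::unique only collapses *adjacent* duplicates,
  // so the list must be sorted first; erase() then drops the leftover tail.
  std::sort(candidate_files.begin(), candidate_files.end());
  candidate_files.erase(
      std::unique(candidate_files.begin(), candidate_files.end()),
      candidate_files.end());

  for (const auto& f : candidate_files) {
    std::cout << f << "\n";  // each name printed exactly once
  }
  return 0;
}
```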
@@ -805,10 +833,11 @@ Status DBImpl::Recover(
     if (!s.ok()) {
       return s;
     }
-    uint64_t number;
-    FileType type;
     std::vector<uint64_t> logs;
     for (size_t i = 0; i < filenames.size(); i++) {
+      uint64_t number;
+      FileType type;
       if (ParseFileName(filenames[i], &number, &type)
           && type == kLogFile
           && ((number >= min_log) || (number == prev_log))) {
@@ -824,12 +853,12 @@ Status DBImpl::Recover(
 
     // Recover in the order in which the logs were generated
     std::sort(logs.begin(), logs.end());
-    for (size_t i = 0; s.ok() && i < logs.size(); i++) {
+    for (const auto& log : logs) {
       // The previous incarnation may not have written any MANIFEST
       // records after allocating this log number. So we manually
       // update the file number allocation counter in VersionSet.
-      versions_->MarkFileNumberUsed(logs[i]);
-      s = RecoverLogFile(logs[i], &max_sequence, read_only);
+      versions_->MarkFileNumberUsed(log);
+      s = RecoverLogFile(log, &max_sequence, read_only);
     }
 
     if (s.ok()) {
@@ -1011,7 +1040,7 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
   {
     mutex_.Unlock();
     s = BuildTable(dbname_, env_, *cfd->full_options(), storage_options_,
-                   cfd->table_cache(), iter, &meta, cfd->user_comparator(),
+                   cfd->table_cache(), iter, &meta, cfd->internal_comparator(),
                    newest_snapshot, earliest_seqno_in_memtable,
                    GetCompressionFlush(*cfd->full_options()));
     LogFlush(options_.info_log);
@@ -1045,7 +1074,7 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
 }
 
 Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
-                                std::vector<MemTable*>& mems, VersionEdit* edit,
+                                autovector<MemTable*>& mems, VersionEdit* edit,
                                 uint64_t* filenumber) {
   mutex_.AssertHeld();
   const uint64_t start_micros = env_->NowMicros();
@@ -1062,21 +1091,20 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
   Status s;
   {
     mutex_.Unlock();
-    std::vector<Iterator*> list;
+    std::vector<Iterator*> memtables;
     for (MemTable* m : mems) {
       Log(options_.info_log,
           "Flushing memtable with log file: %lu\n",
           (unsigned long)m->GetLogNumber());
-      list.push_back(m->NewIterator());
+      memtables.push_back(m->NewIterator());
     }
-    Iterator* iter =
-        NewMergingIterator(&cfd->internal_comparator(), &list[0], list.size());
-    Log(options_.info_log,
-        "Level-0 flush table #%lu: started",
+    Iterator* iter = NewMergingIterator(env_, &cfd->internal_comparator(),
+                                        &memtables[0], memtables.size());
+    Log(options_.info_log, "Level-0 flush table #%lu: started",
         (unsigned long)meta.number);
 
     s = BuildTable(dbname_, env_, *cfd->full_options(), storage_options_,
-                   cfd->table_cache(), iter, &meta, cfd->user_comparator(),
+                   cfd->table_cache(), iter, &meta, cfd->internal_comparator(),
                    newest_snapshot, earliest_seqno_in_memtable,
                    GetCompressionFlush(*cfd->full_options()));
     LogFlush(options_.info_log);
@@ -1092,7 +1120,6 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
   }
   base->Unref();
 
-
   // re-acquire the most current version
   base = cfd->current();
 
@@ -1145,7 +1172,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd,
 
   // Save the contents of the earliest memtable as a new Table
   uint64_t file_number;
-  std::vector<MemTable*> mems;
+  autovector<MemTable*> mems;
   cfd->imm()->PickMemtablesToFlush(&mems);
   if (mems.empty()) {
     Log(options_.info_log, "Nothing in memstore to flush");
@@ -1763,8 +1790,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress,
 
 void DBImpl::BackgroundCallFlush() {
   bool madeProgress = false;
-  DeletionState deletion_state(default_cfd_->options()->max_write_buffer_number,
-                               true);
+  DeletionState deletion_state(true);
   assert(bg_flush_scheduled_);
   MutexLock l(&mutex_);
 
@@ -1815,8 +1841,7 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() {
 
 void DBImpl::BackgroundCallCompaction() {
   bool madeProgress = false;
-  DeletionState deletion_state(default_cfd_->options()->max_write_buffer_number,
-                               true);
+  DeletionState deletion_state(true);
 
   MaybeDumpStats();
 
@@ -2077,8 +2102,9 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
       *cfd->full_options(), compact->compaction->output_level(),
       compact->compaction->enable_compression());
 
-  compact->builder.reset(GetTableBuilder(
-      *cfd->full_options(), compact->outfile.get(), compression_type));
+  compact->builder.reset(
+      NewTableBuilder(*cfd->full_options(), cfd->internal_comparator(),
+                      compact->outfile.get(), compression_type));
   }
   LogFlush(options_.info_log);
   return s;
@@ -2126,8 +2152,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
   if (s.ok() && current_entries > 0) {
     // Verify that the table is usable
     ColumnFamilyData* cfd = compact->compaction->column_family_data();
+    FileMetaData meta(output_number, current_bytes);
     Iterator* iter = cfd->table_cache()->NewIterator(
-        ReadOptions(), storage_options_, output_number, current_bytes);
+        ReadOptions(), storage_options_, cfd->internal_comparator(), meta);
     s = iter->status();
     delete iter;
     if (s.ok()) {
@@ -2641,8 +2668,9 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
   // Collect iterators for files in L0 - Ln
   super_version->current->AddIterators(options, storage_options_,
                                        &iterator_list);
-  Iterator* internal_iter = NewMergingIterator(
-      &cfd->internal_comparator(), &iterator_list[0], iterator_list.size());
+  Iterator* internal_iter =
+      NewMergingIterator(env_, &cfd->internal_comparator(), &iterator_list[0],
+                         iterator_list.size());
 
   IterState* cleanup = new IterState(this, &mutex_, super_version);
   internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
@@ -2677,8 +2705,8 @@ std::pair<Iterator*, Iterator*> DBImpl::GetTailingIteratorPair(
   std::vector<Iterator*> list;
   super_version->imm->AddIterators(options, &list);
   super_version->current->AddIterators(options, storage_options_, &list);
-  Iterator* immutable_iter =
-      NewMergingIterator(&cfd->internal_comparator(), &list[0], list.size());
+  Iterator* immutable_iter = NewMergingIterator(
+      env_, &cfd->internal_comparator(), &list[0], list.size());
 
   // create a DBIter that only uses memtable content; see NewIterator()
   immutable_iter =
@@ -2739,6 +2767,8 @@ Status DBImpl::GetImpl(const ReadOptions& options,
                        const Slice& key, std::string* value,
                        bool* value_found) {
   StopWatch sw(env_, options_.statistics.get(), DB_GET, false);
+  StopWatchNano snapshot_timer(env_, false);
+  StartPerfTimer(&snapshot_timer);
 
   mutex_.Lock();
   auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family.id);
@@ -2766,6 +2796,7 @@ Status DBImpl::GetImpl(const ReadOptions& options,
   // s is both in/out. When in, s could either be OK or MergeInProgress.
   // merge_operands will contain the sequence of merges in the latter case.
   LookupKey lkey(key, snapshot);
+  BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
   if (get_version->mem->Get(lkey, value, &s, merge_context,
                             *cfd->full_options())) {
     // Done
@@ -2775,12 +2806,19 @@ Status DBImpl::GetImpl(const ReadOptions& options,
     // Done
     RecordTick(options_.statistics.get(), MEMTABLE_HIT);
   } else {
+    StopWatchNano from_files_timer(env_, false);
+    StartPerfTimer(&from_files_timer);
+
     get_version->current->Get(options, lkey, value, &s, &merge_context, &stats,
                               *cfd->full_options(), value_found);
     have_stat_update = true;
+    BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
     RecordTick(options_.statistics.get(), MEMTABLE_MISS);
   }
 
+  StopWatchNano post_process_timer(env_, false);
+  StartPerfTimer(&post_process_timer);
+
   bool delete_get_version = false;
   if (!cfd->options()->disable_seek_compaction && have_stat_update) {
     mutex_.Lock();
@@ -2805,8 +2843,10 @@ Status DBImpl::GetImpl(const ReadOptions& options,
   }
 
   // Note, tickers are atomic now - no lock protection needed any more.
+
   RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
   RecordTick(options_.statistics.get(), BYTES_READ, value->size());
+  BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
   return s;
 }
 
@@ -2816,6 +2856,9 @@ std::vector<Status> DBImpl::MultiGet(
     const std::vector<Slice>& keys, std::vector<std::string>* values) {
 
   StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false);
+  StopWatchNano snapshot_timer(env_, false);
+  StartPerfTimer(&snapshot_timer);
+
   SequenceNumber snapshot;
 
   struct MultiGetColumnFamilyData {
@@ -2856,6 +2899,7 @@ std::vector<Status> DBImpl::MultiGet(
 
   // Keep track of bytes that we read for statistics-recording later
   uint64_t bytes_read = 0;
+  BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
 
   // For each of the given keys, apply the entire "get" process as follows:
   // First look in the memtable, then in the immutable memtable (if any).
@@ -2889,6 +2933,9 @@ std::vector<Status> DBImpl::MultiGet(
     }
   }
 
+  // Post processing (decrement reference counts and record statistics)
+  StopWatchNano post_process_timer(env_, false);
+  StartPerfTimer(&post_process_timer);
   autovector<SuperVersion*> superversions_to_delete;
 
   bool schedule_flush_or_compaction = false;
@@ -2921,6 +2968,7 @@ std::vector<Status> DBImpl::MultiGet(
   RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS);
   RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, num_keys);
   RecordTick(options_.statistics.get(), NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
 
   return stat_list;
 }
@@ -3080,6 +3128,8 @@ Status DBImpl::Delete(const WriteOptions& options,
 }
 
 Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+  StopWatchNano pre_post_process_timer(env_, false);
+  StartPerfTimer(&pre_post_process_timer);
   Writer w(&mutex_);
   w.batch = my_batch;
   w.sync = options.sync;
@@ -3148,6 +3198,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
     if (options.disableWAL) {
       flush_on_destroy_ = true;
     }
+    BumpPerfTime(&perf_context.write_pre_and_post_process_time,
+                 &pre_post_process_timer);
 
     if (!options.disableWAL) {
       StopWatchNano timer(env_);
@@ -3156,7 +3208,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
       status = log_->AddRecord(log_entry);
       RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1);
       RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size());
-      BumpPerfTime(&perf_context.wal_write_time, &timer);
       if (status.ok() && options.sync) {
         if (options_.use_fsync) {
           StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS);
@@ -3166,12 +3217,17 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
           status = log_->file()->Sync();
         }
       }
+      BumpPerfTime(&perf_context.write_wal_time, &timer);
     }
     if (status.ok()) {
+      StopWatchNano write_memtable_timer(env_, false);
+
       // reading the column family set outside of DB mutex -- should lock
       versions_->GetColumnFamilySet()->Lock();
+      StartPerfTimer(&write_memtable_timer);
       status = WriteBatchInternal::InsertInto(
           updates, column_family_memtables_.get(), 0, this, false);
+      BumpPerfTime(&perf_context.write_memtable_time, &write_memtable_timer);
       versions_->GetColumnFamilySet()->Unlock();
 
       if (!status.ok()) {
@@ -3184,6 +3240,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
       SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER,
                      last_sequence);
     }
+    StartPerfTimer(&pre_post_process_timer);
     if (updates == &tmp_batch_) tmp_batch_.Clear();
     mutex_.Lock();
     if (status.ok()) {
@@ -3211,6 +3268,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
     writers_.front()->cv.Signal();
   }
   mutex_.Unlock();
+  BumpPerfTime(&perf_context.write_pre_and_post_process_time,
+               &pre_post_process_timer);
   return status;
 }
 
@ -3420,7 +3479,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
unique_ptr<WritableFile> lfile;
|
unique_ptr<WritableFile> lfile;
|
||||||
MemTable* memtmp = nullptr;
|
MemTable* new_mem = nullptr;
|
||||||
|
|
||||||
// Attempt to switch to a new memtable and trigger compaction of old.
|
// Attempt to switch to a new memtable and trigger compaction of old.
|
||||||
// Do this without holding the dbmutex lock.
|
// Do this without holding the dbmutex lock.
|
||||||
@ -3439,7 +3498,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
|
|||||||
// (compression, etc) but err on the side of caution.
|
// (compression, etc) but err on the side of caution.
|
||||||
lfile->SetPreallocationBlockSize(1.1 *
|
lfile->SetPreallocationBlockSize(1.1 *
|
||||||
cfd->options()->write_buffer_size);
|
cfd->options()->write_buffer_size);
|
||||||
memtmp = new MemTable(cfd->internal_comparator(), *cfd->options());
|
new_mem = new MemTable(cfd->internal_comparator(), *cfd->options());
|
||||||
new_superversion = new SuperVersion();
|
new_superversion = new SuperVersion();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -3447,7 +3506,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
|
|||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
// Avoid chewing through file number space in a tight loop.
|
// Avoid chewing through file number space in a tight loop.
|
||||||
versions_->ReuseFileNumber(new_log_number);
|
versions_->ReuseFileNumber(new_log_number);
|
||||||
assert (!memtmp);
|
assert (!new_mem);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
logfile_number_ = new_log_number;
|
logfile_number_ = new_log_number;
|
||||||
@ -3457,12 +3516,12 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
|
|||||||
if (force) {
|
if (force) {
|
||||||
cfd->imm()->FlushRequested();
|
cfd->imm()->FlushRequested();
|
||||||
}
|
}
|
||||||
memtmp->Ref();
|
new_mem->Ref();
|
||||||
memtmp->SetLogNumber(logfile_number_);
|
new_mem->SetLogNumber(logfile_number_);
|
||||||
cfd->SetMemtable(memtmp);
|
cfd->SetMemtable(new_mem);
|
||||||
Log(options_.info_log, "New memtable created with log file: #%lu\n",
|
Log(options_.info_log, "New memtable created with log file: #%lu\n",
|
||||||
(unsigned long)logfile_number_);
|
(unsigned long)logfile_number_);
|
||||||
force = false; // Do not force another compaction if have room
|
force = false; // Do not force another compaction if have room
|
||||||
MaybeScheduleFlushOrCompaction();
|
MaybeScheduleFlushOrCompaction();
|
||||||
delete cfd->InstallSuperVersion(new_superversion);
|
delete cfd->InstallSuperVersion(new_superversion);
|
||||||
}
|
}
|
||||||
@ -3552,10 +3611,10 @@ Status DBImpl::DeleteFile(std::string name) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int level;
|
int level;
|
||||||
FileMetaData metadata;
|
FileMetaData *metadata;
|
||||||
ColumnFamilyData* cfd;
|
ColumnFamilyData* cfd;
|
||||||
VersionEdit edit;
|
VersionEdit edit;
|
||||||
DeletionState deletion_state(0, true);
|
DeletionState deletion_state(true);
|
||||||
{
|
{
|
||||||
MutexLock l(&mutex_);
|
MutexLock l(&mutex_);
|
||||||
status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
|
status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
|
||||||
@ -3567,7 +3626,7 @@ Status DBImpl::DeleteFile(std::string name) {
|
|||||||
assert((level > 0) && (level < cfd->NumberLevels()));
|
assert((level > 0) && (level < cfd->NumberLevels()));
|
||||||
|
|
||||||
// If the file is being compacted no need to delete.
|
// If the file is being compacted no need to delete.
|
||||||
if (metadata.being_compacted) {
|
if (metadata->being_compacted) {
|
||||||
Log(options_.info_log,
|
Log(options_.info_log,
|
||||||
"DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
|
"DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
@ -3866,7 +3925,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
|
|||||||
|
|
||||||
//
|
//
|
||||||
// A global method that can dump out the build version
|
// A global method that can dump out the build version
|
||||||
void dumpLeveldbBuildVersion(Logger * log) {
|
void DumpLeveldbBuildVersion(Logger * log) {
|
||||||
Log(log, "Git sha %s", rocksdb_build_git_sha);
|
Log(log, "Git sha %s", rocksdb_build_git_sha);
|
||||||
Log(log, "Compile time %s %s",
|
Log(log, "Compile time %s %s",
|
||||||
rocksdb_build_compile_time, rocksdb_build_compile_date);
|
rocksdb_build_compile_time, rocksdb_build_compile_date);
|
||||||
|

22  db/db_impl.h
@@ -7,24 +7,26 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #pragma once
 
 #include <atomic>
 #include <deque>
 #include <set>
 #include <utility>
 #include <vector>
 
 #include "db/dbformat.h"
 #include "db/log_writer.h"
 #include "db/snapshot.h"
 #include "db/column_family.h"
 #include "db/version_edit.h"
+#include "memtable_list.h"
+#include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/transaction_log.h"
-#include "port/port.h"
-#include "util/stats_logger.h"
-#include "memtablelist.h"
 #include "util/autovector.h"
+#include "util/stats_logger.h"
 #include "db/internal_stats.h"
 
 namespace rocksdb {
@@ -178,7 +180,7 @@ class DBImpl : public DB {
  // needed for CleanupIteratorState
  struct DeletionState {
    inline bool HaveSomethingToDelete() const {
-      return all_files.size() ||
+      return candidate_files.size() ||
        sst_delete_files.size() ||
        log_delete_files.size();
    }
@@ -186,7 +188,7 @@ class DBImpl : public DB {
    // a list of all files that we'll consider deleting
    // (every once in a while this is filled up with all files
    // in the DB directory)
-    std::vector<std::string> all_files;
+    std::vector<std::string> candidate_files;

    // the list of all live sst files that cannot be deleted
    std::vector<uint64_t> sst_live;
@@ -198,7 +200,7 @@ class DBImpl : public DB {
    std::vector<uint64_t> log_delete_files;

    // a list of memtables to be free
-    std::vector<MemTable *> memtables_to_free;
+    autovector<MemTable*> memtables_to_free;

    SuperVersion* superversion_to_free; // if nullptr nothing to free

@@ -208,12 +210,10 @@ class DBImpl : public DB {
    // that corresponds to the set of files in 'live'.
    uint64_t manifest_file_number, log_number, prev_log_number;

-    explicit DeletionState(const int num_memtables = 0,
-                           bool create_superversion = false) {
+    explicit DeletionState(bool create_superversion = false) {
      manifest_file_number = 0;
      log_number = 0;
      prev_log_number = 0;
-      memtables_to_free.reserve(num_memtables);
      superversion_to_free = nullptr;
      new_superversion = create_superversion ? new SuperVersion() : nullptr;
    }
@@ -232,7 +232,7 @@ class DBImpl : public DB {
  };

  // Returns the list of live files in 'live' and the list
-  // of all files in the filesystem in 'all_files'.
+  // of all files in the filesystem in 'candidate_files'.
  // If force == false and the last call was less than
  // options_.delete_obsolete_files_period_micros microseconds ago,
  // it will not fill up the deletion_state
@@ -291,7 +291,7 @@ class DBImpl : public DB {
  // concurrent flush memtables to storage.
  Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
                                     VersionEdit* edit);
-  Status WriteLevel0Table(ColumnFamilyData* cfd, std::vector<MemTable*>& mems,
+  Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
                          VersionEdit* edit, uint64_t* filenumber);

  uint64_t SlowdownAmount(int n, double bottom, double top);

db/db_iter.cc
@@ -102,7 +102,8 @@ class DBIter: public Iterator {
  virtual void SeekToLast();

 private:
-  void FindNextUserEntry(bool skipping);
+  inline void FindNextUserEntry(bool skipping);
+  void FindNextUserEntryInternal(bool skipping);
  void FindPrevUserEntry();
  bool ParseKey(ParsedInternalKey* key);
  void MergeValuesNewToOld();
@@ -191,7 +192,15 @@ void DBIter::Next() {
//
// NOTE: In between, saved_key_ can point to a user key that has
// a delete marker
-void DBIter::FindNextUserEntry(bool skipping) {
+inline void DBIter::FindNextUserEntry(bool skipping) {
+  StopWatchNano timer(env_, false);
+  StartPerfTimer(&timer);
+  FindNextUserEntryInternal(skipping);
+  BumpPerfTime(&perf_context.find_next_user_entry_time, &timer);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+void DBIter::FindNextUserEntryInternal(bool skipping) {
  // Loop until we hit an acceptable entry to yield
  assert(iter_->Valid());
  assert(direction_ == kForward);
@@ -226,10 +235,7 @@ void DBIter::FindNextUserEntry(bool skipping) {
        valid_ = true;
        MergeValuesNewToOld();  // Go to a different state machine
        return;
-      case kTypeColumnFamilyDeletion:
-      case kTypeColumnFamilyValue:
-      case kTypeColumnFamilyMerge:
-      case kTypeLogData:
+      default:
        assert(false);
        break;
    }
@@ -429,13 +435,16 @@ void DBIter::FindPrevUserEntry() {
}

void DBIter::Seek(const Slice& target) {
-  direction_ = kForward;
-  ClearSavedValue();
  saved_key_.clear();
  AppendInternalKey(
      &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+  StopWatchNano internal_seek_timer(env_, false);
+  StartPerfTimer(&internal_seek_timer);
  iter_->Seek(saved_key_);
+  BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
  if (iter_->Valid()) {
+    direction_ = kForward;
+    ClearSavedValue();
    FindNextUserEntry(false /*not skipping */);
  } else {
    valid_ = false;
@@ -445,7 +454,10 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() {
  direction_ = kForward;
  ClearSavedValue();
+  StopWatchNano internal_seek_timer(env_, false);
+  StartPerfTimer(&internal_seek_timer);
  iter_->SeekToFirst();
+  BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
  if (iter_->Valid()) {
    FindNextUserEntry(false /* not skipping */);
  } else {
@@ -464,7 +476,10 @@ void DBIter::SeekToLast() {

  direction_ = kReverse;
  ClearSavedValue();
+  StopWatchNano internal_seek_timer(env_, false);
+  StartPerfTimer(&internal_seek_timer);
  iter_->SeekToLast();
+  BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer);
  FindPrevUserEntry();
}

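
The same timing pattern recurs through this diff: a StopWatchNano is started just before the measured call and folded into a perf_context counter right after, and the tests read the counters back under SetPerfLevel(kEnableTime). Below is a minimal sketch of reading those counters from application code, assuming only the rocksdb/perf_context.h header already shown in the includes; the function name and database handle are illustrative, not part of this change.

#include <cstdint>
#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/perf_context.h"

// Illustrative only: read back the per-thread counters populated by the
// timers above (seek_internal_seek_time, find_next_user_entry_time).
// Assumes `db` is an already-open rocksdb::DB*; error handling is omitted.
void ReportSeekCost(rocksdb::DB* db) {
  rocksdb::SetPerfLevel(rocksdb::kEnableTime);  // enable nanosecond timers
  rocksdb::perf_context.Reset();                // clear thread-local counters

  rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions());
  iter->Seek("some_key");                       // exercises DBIter::Seek above

  std::printf("internal seek: %llu ns, next-entry scan: %llu ns\n",
              (unsigned long long)rocksdb::perf_context.seek_internal_seek_time,
              (unsigned long long)rocksdb::perf_context.find_next_user_entry_time);

  delete iter;
  rocksdb::SetPerfLevel(rocksdb::kDisable);
}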

405  db/db_test.cc
@@ -11,25 +11,29 @@
 #include <set>
 #include <unistd.h>
 
-#include "rocksdb/db.h"
-#include "rocksdb/filter_policy.h"
+#include "db/dbformat.h"
 #include "db/db_impl.h"
 #include "db/filename.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
-#include "table/block_based_table_factory.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "table/plain_table_factory.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
+#include "table/block_based_table_factory.h"
 #include "util/hash.h"
+#include "util/hash_linklist_rep.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
+#include "util/statistics.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
-#include "util/statistics.h"
 #include "utilities/merge_operators.h"
 
 namespace rocksdb {
@@ -241,12 +245,17 @@ class SpecialEnv : public EnvWrapper {
class DBTest {
 private:
  const FilterPolicy* filter_policy_;
+  static std::unique_ptr<const SliceTransform> prefix_1_transform;
+  static std::unique_ptr<const SliceTransform> noop_transform;

 protected:
  // Sequence of option configurations to try
  enum OptionConfig {
    kDefault,
+    kPlainTableFirstBytePrefix,
+    kPlainTableAllBytesPrefix,
    kVectorRep,
+    kHashLinkList,
    kMergePut,
    kFilter,
    kUncompressed,
@@ -260,6 +269,7 @@ class DBTest {
    kHashSkipList,
    kUniversalCompaction,
    kCompressedBlockCache,
+    kInfiniteMaxOpenFiles,
    kEnd
  };
  int option_config_;
@@ -277,7 +287,8 @@ class DBTest {
    kNoSkip = 0,
    kSkipDeletesFilterFirst = 1,
    kSkipUniversalCompaction = 2,
-    kSkipMergePut = 4
+    kSkipMergePut = 4,
+    kSkipPlainTable = 8
  };

  DBTest() : option_config_(kDefault),
@@ -299,20 +310,27 @@ class DBTest {
  // Switch to a fresh database with the next option configuration to
  // test. Return false if there are no more configurations to test.
  bool ChangeOptions(int skip_mask = kNoSkip) {
-    option_config_++;
-
    // skip some options
-    if (skip_mask & kSkipDeletesFilterFirst &&
-        option_config_ == kDeletesFilterFirst) {
-      option_config_++;
-    }
-    if (skip_mask & kSkipUniversalCompaction &&
-        option_config_ == kUniversalCompaction) {
-      option_config_++;
-    }
-    if (skip_mask & kSkipMergePut && option_config_ == kMergePut) {
-      option_config_++;
+    for(option_config_++; option_config_ < kEnd; option_config_++) {
+      if ((skip_mask & kSkipDeletesFilterFirst) &&
+          option_config_ == kDeletesFilterFirst) {
+        continue;
+      }
+      if ((skip_mask & kSkipUniversalCompaction) &&
+          option_config_ == kUniversalCompaction) {
+        continue;
+      }
+      if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) {
+        continue;
+      }
+      if ((skip_mask & kSkipPlainTable)
+          && (option_config_ == kPlainTableAllBytesPrefix
+              || option_config_ == kPlainTableFirstBytePrefix)) {
+        continue;
+      }
+      break;
    }

    if (option_config_ >= kEnd) {
      Destroy(&last_options_);
      return false;
@@ -345,6 +363,18 @@ class DBTest {
        options.memtable_factory.reset(
            NewHashSkipListRepFactory(NewFixedPrefixTransform(1)));
        break;
+      case kPlainTableFirstBytePrefix:
+        options.table_factory.reset(new PlainTableFactory());
+        options.prefix_extractor = prefix_1_transform.get();
+        options.allow_mmap_reads = true;
+        options.max_sequential_skip_in_iterations = 999999;
+        break;
+      case kPlainTableAllBytesPrefix:
+        options.table_factory.reset(new PlainTableFactory());
+        options.prefix_extractor = noop_transform.get();
+        options.allow_mmap_reads = true;
+        options.max_sequential_skip_in_iterations = 999999;
+        break;
      case kMergePut:
        options.merge_operator = MergeOperators::CreatePutOperator();
        break;
@@ -380,12 +410,19 @@ class DBTest {
      case kVectorRep:
        options.memtable_factory.reset(new VectorRepFactory(100));
        break;
+      case kHashLinkList:
+        options.memtable_factory.reset(
+            NewHashLinkListRepFactory(NewFixedPrefixTransform(1), 4));
+        break;
      case kUniversalCompaction:
        options.compaction_style = kCompactionStyleUniversal;
        break;
      case kCompressedBlockCache:
        options.block_cache_compressed = NewLRUCache(8*1024*1024);
        break;
+      case kInfiniteMaxOpenFiles:
+        options.max_open_files = -1;
+        break;
      default:
        break;
    }
@@ -526,10 +563,7 @@ class DBTest {
      case kTypeDeletion:
        result += "DEL";
        break;
-      case kTypeColumnFamilyDeletion:
-      case kTypeColumnFamilyValue:
-      case kTypeColumnFamilyMerge:
-      case kTypeLogData:
+      default:
        assert(false);
        break;
    }
@@ -680,6 +714,72 @@ class DBTest {
    delete iter;
  }

+  // Used to test InplaceUpdate
+
+  // If previous value is nullptr or delta is > than previous value,
+  // sets newValue with delta
+  // If previous value is not empty,
+  // updates previous value with 'b' string of previous value size - 1.
+  static UpdateStatus
+      updateInPlaceSmallerSize(char* prevValue, uint32_t* prevSize,
+                               Slice delta, std::string* newValue) {
+    if (prevValue == nullptr) {
+      *newValue = std::string(delta.size(), 'c');
+      return UpdateStatus::UPDATED;
+    } else {
+      *prevSize = *prevSize - 1;
+      std::string str_b = std::string(*prevSize, 'b');
+      memcpy(prevValue, str_b.c_str(), str_b.size());
+      return UpdateStatus::UPDATED_INPLACE;
+    }
+  }
+
+  static UpdateStatus
+      updateInPlaceSmallerVarintSize(char* prevValue, uint32_t* prevSize,
+                                     Slice delta, std::string* newValue) {
+    if (prevValue == nullptr) {
+      *newValue = std::string(delta.size(), 'c');
+      return UpdateStatus::UPDATED;
+    } else {
+      *prevSize = 1;
+      std::string str_b = std::string(*prevSize, 'b');
+      memcpy(prevValue, str_b.c_str(), str_b.size());
+      return UpdateStatus::UPDATED_INPLACE;
+    }
+  }
+
+  static UpdateStatus
+      updateInPlaceLargerSize(char* prevValue, uint32_t* prevSize,
+                              Slice delta, std::string* newValue) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  }
+
+  static UpdateStatus
+      updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+                            Slice delta, std::string* newValue) {
+    return UpdateStatus::UPDATE_FAILED;
+  }
+
+  // Utility method to test InplaceUpdate
+  void validateNumberOfEntries(int numValues) {
+    Iterator* iter = dbfull()->TEST_NewInternalIterator();
+    iter->SeekToFirst();
+    ASSERT_EQ(iter->status().ok(), true);
+    int seq = numValues;
+    while (iter->Valid()) {
+      ParsedInternalKey ikey;
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+
+      // checks sequence number for updates
+      ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+      iter->Next();
+    }
+    delete iter;
+    ASSERT_EQ(0, seq);
+  }
+
  void CopyFile(const std::string& source, const std::string& destination,
                uint64_t size = 0) {
    const EnvOptions soptions;
@@ -705,6 +805,10 @@ class DBTest {
  }

};
+std::unique_ptr<const SliceTransform> DBTest::prefix_1_transform(
+    NewFixedPrefixTransform(1));
+std::unique_ptr<const SliceTransform> DBTest::noop_transform(
+    NewNoopTransform());

static std::string Key(int i) {
  char buf[100];
@@ -718,19 +822,19 @@ static long TestGetTickerCount(const Options& options, Tickers ticker_type) {

TEST(DBTest, Empty) {
  do {
-    ASSERT_TRUE(db_ != nullptr);
-    ASSERT_EQ("NOT_FOUND", Get("foo"));
-  } while (ChangeOptions());
-}
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    Reopen(&options);

-TEST(DBTest, ReadWrite) {
-  do {
    ASSERT_OK(Put("foo", "v1"));
    ASSERT_EQ("v1", Get("foo"));
-    ASSERT_OK(Put("bar", "v2"));
-    ASSERT_OK(Put("foo", "v3"));
-    ASSERT_EQ("v3", Get("foo"));
-    ASSERT_EQ("v2", Get("bar"));
+    env_->delay_sstable_sync_.Release_Store(env_);      // Block sync calls
+    Put("k1", std::string(100000, 'x'));                // Fill memtable
+    Put("k2", std::string(100000, 'y'));                // Trigger compaction
+    ASSERT_EQ("v1", Get("foo"));
+    env_->delay_sstable_sync_.Release_Store(nullptr);   // Release sync calls
  } while (ChangeOptions());
}

@@ -769,7 +873,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {

  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
  // Create a new talbe.
-  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));

  // index/filter blocks added to block cache right after table creation.
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
@@ -1051,7 +1155,10 @@ TEST(DBTest, KeyMayExist) {
    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));

    delete options.filter_policy;
-  } while (ChangeOptions());
+
+    // KeyMayExist function only checks data in block caches, which is not used
+    // by plain table format.
+  } while (ChangeOptions(kSkipPlainTable));
}

TEST(DBTest, NonBlockingIteration) {
@@ -1111,7 +1218,9 @@ TEST(DBTest, NonBlockingIteration) {
    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
    delete iter;

-  } while (ChangeOptions());
+    // This test verifies block cache behaviors, which is not used by plain
+    // table format.
+  } while (ChangeOptions(kSkipPlainTable));
}

// A delete is skipped for key if KeyMayExist(key) returns False
@@ -1250,7 +1359,13 @@ TEST(DBTest, IterMulti) {
  ASSERT_EQ(IterStatus(iter), "a->va");
  iter->Seek("ax");
  ASSERT_EQ(IterStatus(iter), "b->vb");
+
+  SetPerfLevel(kEnableTime);
+  perf_context.Reset();
  iter->Seek("b");
+  ASSERT_TRUE((int) perf_context.seek_internal_seek_time > 0);
+  ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0);
+  SetPerfLevel(kDisable);
  ASSERT_EQ(IterStatus(iter), "b->vb");
  iter->Seek("z");
  ASSERT_EQ(IterStatus(iter), "(invalid)");
@@ -1265,7 +1380,12 @@ TEST(DBTest, IterMulti) {
  // Switch from forward to reverse
  iter->SeekToFirst();
  iter->Next();
+  SetPerfLevel(kEnableTime);
+  perf_context.Reset();
  iter->Next();
+  ASSERT_EQ(0, (int) perf_context.seek_internal_seek_time);
+  ASSERT_TRUE((int) perf_context.find_next_user_entry_time > 0);
+  SetPerfLevel(kDisable);
  iter->Prev();
  ASSERT_EQ(IterStatus(iter), "b->vb");

@@ -1696,22 +1816,42 @@ TEST(DBTest, NumImmutableMemTable) {

    std::string big_value(1000000, 'x');
    std::string num;
+    SetPerfLevel(kEnableTime);;

    ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
    ASSERT_EQ(num, "0");
+    perf_context.Reset();
+    Get("k1");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);

    ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
    ASSERT_EQ(num, "1");
+    perf_context.Reset();
+    Get("k1");
+    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get("k2");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);

    ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
    ASSERT_EQ(num, "2");
+    perf_context.Reset();
+    Get("k2");
+    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get("k3");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get("k1");
+    ASSERT_EQ(3, (int) perf_context.get_from_memtable_count);

    dbfull()->Flush(FlushOptions());
    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
    ASSERT_EQ(num, "0");
+    SetPerfLevel(kDisable);
  } while (ChangeCompactOptions());
}

@@ -1720,11 +1860,16 @@ TEST(DBTest, FLUSH) {
    Options options = CurrentOptions();
    WriteOptions writeOpt = WriteOptions();
    writeOpt.disableWAL = true;
+    SetPerfLevel(kEnableTime);;
    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
    // this will now also flush the last 2 writes
    dbfull()->Flush(FlushOptions());
    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v1"));

+    perf_context.Reset();
+    Get("foo");
+    ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
+
    Reopen();
    ASSERT_EQ("v1", Get("foo"));
    ASSERT_EQ("v1", Get("bar"));
@@ -1736,7 +1881,9 @@ TEST(DBTest, FLUSH) {

    Reopen();
    ASSERT_EQ("v2", Get("bar"));
+    perf_context.Reset();
    ASSERT_EQ("v2", Get("foo"));
+    ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);

    writeOpt.disableWAL = false;
    ASSERT_OK(dbfull()->Put(writeOpt, "bar", "v3"));
@@ -1748,6 +1895,8 @@ TEST(DBTest, FLUSH) {
    // has WAL enabled.
    ASSERT_EQ("v3", Get("foo"));
    ASSERT_EQ("v3", Get("bar"));
+
+    SetPerfLevel(kDisable);
  } while (ChangeCompactOptions());
}

@@ -2559,9 +2708,9 @@ TEST(DBTest, InPlaceUpdate) {
    options.inplace_update_support = true;
    options.env = env_;
    options.write_buffer_size = 100000;
+    Reopen(&options);

    // Update key with values of smaller size
-    Reopen(&options);
    int numValues = 10;
    for (int i = numValues; i > 0; i--) {
      std::string value = DummyString(i, 'a');
@@ -2569,50 +2718,133 @@ TEST(DBTest, InPlaceUpdate) {
      ASSERT_EQ(value, Get("key"));
    }

-    int count = 0;
-    Iterator* iter = dbfull()->TEST_NewInternalIterator();
-    iter->SeekToFirst();
-    ASSERT_EQ(iter->status().ok(), true);
-    while (iter->Valid()) {
-      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-      ikey.sequence = -1;
-      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-      count++;
-      // All updates with the same sequence number.
-      ASSERT_EQ(ikey.sequence, (unsigned)1);
-      iter->Next();
-    }
    // Only 1 instance for that key.
-    ASSERT_EQ(count, 1);
-    delete iter;
+    validateNumberOfEntries(1);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateLargeNewValue) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    Reopen(&options);

    // Update key with values of larger size
-    DestroyAndReopen(&options);
-    numValues = 10;
+    int numValues = 10;
    for (int i = 0; i < numValues; i++) {
      std::string value = DummyString(i, 'a');
      ASSERT_OK(Put("key", value));
      ASSERT_EQ(value, Get("key"));
    }

-    count = 0;
-    iter = dbfull()->TEST_NewInternalIterator();
-    iter->SeekToFirst();
-    ASSERT_EQ(iter->status().ok(), true);
-    int seq = numValues;
-    while (iter->Valid()) {
-      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-      ikey.sequence = -1;
-      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-      count++;
-      // No inplace updates. All updates are puts with new seq number
-      ASSERT_EQ(ikey.sequence, (unsigned)seq--);
-      iter->Next();
-    }
    // All 10 updates exist in the internal iterator
-    ASSERT_EQ(count, numValues);
-    delete iter;
+    validateNumberOfEntries(numValues);

+  } while (ChangeCompactOptions());
+}
+
+
+TEST(DBTest, InPlaceUpdateCallbackSmallerSize) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTest::updateInPlaceSmallerSize;
+    Reopen(&options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    ASSERT_OK(Put("key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get("key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put("key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i - 1, 'b'), Get("key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTest::updateInPlaceSmallerVarintSize;
+    Reopen(&options);
+
+    // Update key with values of smaller varint size
+    int numValues = 265;
+    ASSERT_OK(Put("key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get("key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put("key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(1, 'b'), Get("key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTest::updateInPlaceLargerSize;
+    Reopen(&options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      ASSERT_OK(Put("key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i, 'c'), Get("key"));
+    }
+
+    // No inplace updates. All updates are puts with new seq number
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues);
+
+  } while (ChangeCompactOptions());
+}
+
+TEST(DBTest, InPlaceUpdateCallbackNoAction) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTest::updateInPlaceNoAction;
+    Reopen(&options);
+
+    // Callback function requests no actions from db
+    ASSERT_OK(Put("key", DummyString(1, 'a')));
+    ASSERT_EQ(Get("key"), "NOT_FOUND");
+
  } while (ChangeCompactOptions());
}
@@ -2653,9 +2885,7 @@ class DeleteFilter : public CompactionFilter {

class ChangeFilter : public CompactionFilter {
 public:
-  explicit ChangeFilter(int argv) {
-    assert(argv == 100);
-  }
+  explicit ChangeFilter() {}

  virtual bool Filter(int level, const Slice& key,
                      const Slice& value, std::string* new_value,
@@ -2697,19 +2927,16 @@ class DeleteFilterFactory : public CompactionFilterFactory {

class ChangeFilterFactory : public CompactionFilterFactory {
 public:
-  explicit ChangeFilterFactory(int argv) : argv_(argv) {}
+  explicit ChangeFilterFactory() {}

  virtual std::unique_ptr<CompactionFilter>
  CreateCompactionFilter(const CompactionFilter::Context& context) override {
-    return std::unique_ptr<CompactionFilter>(new ChangeFilter(argv_));
+    return std::unique_ptr<CompactionFilter>(new ChangeFilter());
  }

  virtual const char* Name() const override {
    return "ChangeFilterFactory";
  }
-
- private:
-  const int argv_;
};

TEST(DBTest, CompactionFilter) {
@@ -2856,7 +3083,7 @@ TEST(DBTest, CompactionFilterWithValueChange) {
    options.num_levels = 3;
    options.max_mem_compaction_level = 0;
    options.compaction_filter_factory =
-      std::make_shared<ChangeFilterFactory>(100);
+      std::make_shared<ChangeFilterFactory>();
    Reopen(&options);

    // Write 100K+1 keys, these are written to a few files
@@ -3000,7 +3227,8 @@ TEST(DBTest, ApproximateSizes) {
      ASSERT_EQ(NumTableFilesAtLevel(0), 0);
      ASSERT_GT(NumTableFilesAtLevel(1), 0);
    }
-  } while (ChangeOptions(kSkipUniversalCompaction));
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable));
}

TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
@@ -3038,7 +3266,8 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {

      dbfull()->TEST_CompactRange(0, nullptr, nullptr);
    }
-  } while (ChangeOptions());
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipPlainTable));
}

TEST(DBTest, IteratorPinsRef) {
@@ -3122,7 +3351,9 @@ TEST(DBTest, HiddenValuesAreRemoved) {
    ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");

    ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
-  } while (ChangeOptions(kSkipUniversalCompaction));
+    // ApproximateOffsetOf() is not yet implemented in plain table format,
+    // which is used by Size().
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipPlainTable));
}

TEST(DBTest, CompactBetweenSnapshots) {
@@ -4790,7 +5021,9 @@ TEST(DBTest, Randomized) {
    // TODO(sanjay): Test Get() works
    int p = rnd.Uniform(100);
    int minimum = 0;
-    if (option_config_ == kHashSkipList) {
+    if (option_config_ == kHashSkipList ||
+        option_config_ == kHashLinkList ||
+        option_config_ == kPlainTableFirstBytePrefix) {
      minimum = 1;
    }
    if (p < 45) {                               // Put
@@ -4969,20 +5202,22 @@ TEST(DBTest, PrefixScan) {
  snprintf(buf, sizeof(buf), "03______:");
  prefix = Slice(buf, 8);
  key = Slice(buf, 9);
-  auto prefix_extractor = NewFixedPrefixTransform(8);
  // db configs
  env_->count_random_reads_ = true;
  Options options = CurrentOptions();
  options.env = env_;
  options.no_block_cache = true;
  options.filter_policy = NewBloomFilterPolicy(10);
-  options.prefix_extractor = prefix_extractor;
+  options.prefix_extractor = NewFixedPrefixTransform(8);
  options.whole_key_filtering = false;
  options.disable_auto_compactions = true;
  options.max_background_compactions = 2;
  options.create_if_missing = true;
  options.disable_seek_compaction = true;
-  options.memtable_factory.reset(NewHashSkipListRepFactory(prefix_extractor));
+  // Tricky: options.prefix_extractor will be released by
+  // NewHashSkipListRepFactory after use.
+  options.memtable_factory.reset(
+      NewHashSkipListRepFactory(options.prefix_extractor));

  // prefix specified, with blooms: 2 RAND I/Os
  // SeekToFirst
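
The in-place update tests above all wire a static callback into Options the same way. A condensed sketch of that wiring from application code follows; the option fields (inplace_update_support, inplace_callback) and the UpdateStatus values are the ones the tests use, while the function and path names are purely illustrative.

#include <cstdint>
#include <cstring>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Illustrative callback with the same signature as the test helpers: it may
// rewrite the existing value buffer in place (shrinking *prevSize), ask the
// DB to store *newValue instead, or refuse and fall back to a normal Put.
rocksdb::UpdateStatus MyInPlaceUpdate(char* prevValue, uint32_t* prevSize,
                                      rocksdb::Slice delta,
                                      std::string* newValue) {
  if (prevValue == nullptr) {
    *newValue = delta.ToString();               // first insert: store delta
    return rocksdb::UpdateStatus::UPDATED;
  }
  if (delta.size() <= *prevSize) {
    memcpy(prevValue, delta.data(), delta.size());
    *prevSize = delta.size();                   // shrink the value in place
    return rocksdb::UpdateStatus::UPDATED_INPLACE;
  }
  return rocksdb::UpdateStatus::UPDATE_FAILED;  // let the write path handle it
}

void ConfigureInPlace(rocksdb::Options* options) {
  options->inplace_update_support = true;
  options->inplace_callback = MyInPlaceUpdate;
}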

db/dbformat.cc
@@ -6,9 +6,9 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h"
 
 #include <stdio.h>
-#include "db/dbformat.h"
 #include "port/port.h"
 #include "util/coding.h"
 #include "util/perf_context_imp.h"
@@ -72,6 +72,28 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
  return r;
}

+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+                                   const ParsedInternalKey& b) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(a.user_key, b.user_key);
+  BumpPerfCount(&perf_context.user_key_comparison_count);
+  if (r == 0) {
+    if (a.sequence > b.sequence) {
+      r = -1;
+    } else if (a.sequence < b.sequence) {
+      r = +1;
+    } else if (a.type > b.type) {
+      r = -1;
+    } else if (a.type < b.type) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
void InternalKeyComparator::FindShortestSeparator(
    std::string* start,
    const Slice& limit) const {

db/dbformat.h
@@ -25,7 +25,9 @@ class InternalKey;
 // Value types encoded as the last component of internal keys.
 // DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
 // data structures.
-enum ValueType {
+// The highest bit of the value type needs to be reserved to SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
  kTypeDeletion = 0x0,
  kTypeValue = 0x1,
  kTypeMerge = 0x2,
@@ -33,7 +35,9 @@ enum ValueType {
  kTypeColumnFamilyDeletion = 0x4,
  kTypeColumnFamilyValue = 0x5,
  kTypeColumnFamilyMerge = 0x6,
+  kMaxValue = 0x7F
};

// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
@@ -99,6 +103,7 @@ class InternalKeyComparator : public Comparator {
        name_("rocksdb.InternalKeyComparator:" +
              std::string(user_comparator_->Name())) {
  }
+  virtual ~InternalKeyComparator() {}

  virtual const char* Name() const;
  virtual int Compare(const Slice& a, const Slice& b) const;
@@ -110,6 +115,7 @@ class InternalKeyComparator : public Comparator {
  const Comparator* user_comparator() const { return user_comparator_; }

  int Compare(const InternalKey& a, const InternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
};

// Filter policy wrapper that converts from internal keys to user keys
@@ -166,6 +172,7 @@ inline bool ParseInternalKey(const Slice& internal_key,
  unsigned char c = num & 0xff;
  result->sequence = num >> 8;
  result->type = static_cast<ValueType>(c);
+  assert(result->type <= ValueType::kMaxValue);
  result->user_key = Slice(internal_key.data(), n - 8);
  return (c <= static_cast<unsigned char>(kValueTypeForSeek));
}
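
For readers unfamiliar with the layout that ParseInternalKey() above decodes: an internal key is the user key followed by an 8-byte trailer that packs the sequence number into the upper 56 bits and the value type into the low byte. A minimal standalone sketch of that packing follows; the names (PackTrailer, kSketchValue, and so on) are illustrative and not part of the RocksDB headers, but the arithmetic is exactly what the function above reverses.

#include <cassert>
#include <cstdint>

// Sketch of the trailer packing: trailer = (sequence << 8) | type,
// appended after the user key bytes.
enum ValueTypeSketch : unsigned char { kSketchDeletion = 0x0, kSketchValue = 0x1 };

uint64_t PackTrailer(uint64_t sequence, ValueTypeSketch t) {
  return (sequence << 8) | t;   // sequence occupies the upper 56 bits
}

void UnpackTrailer(uint64_t trailer, uint64_t* sequence, unsigned char* type) {
  *sequence = trailer >> 8;     // matches "result->sequence = num >> 8"
  *type = trailer & 0xff;       // matches "unsigned char c = num & 0xff"
}

int main() {
  uint64_t trailer = PackTrailer(42, kSketchValue);
  uint64_t seq = 0;
  unsigned char type = 0;
  UnpackTrailer(trailer, &seq, &type);
  assert(seq == 42 && type == kSketchValue);
  return 0;
}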

db/log_format.h
@@ -17,7 +17,6 @@ namespace log {
 enum RecordType {
   // Zero is reserved for preallocated files
   kZeroType = 0,
-
   kFullType = 1,
 
   // For fragments
258
db/memtable.cc
258
db/memtable.cc
@ -17,10 +17,14 @@
|
|||||||
#include "rocksdb/env.h"
|
#include "rocksdb/env.h"
|
||||||
#include "rocksdb/iterator.h"
|
#include "rocksdb/iterator.h"
|
||||||
#include "rocksdb/merge_operator.h"
|
#include "rocksdb/merge_operator.h"
|
||||||
|
#include "rocksdb/slice_transform.h"
|
||||||
|
#include "util/arena.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
#include "util/mutexlock.h"
|
|
||||||
#include "util/murmurhash.h"
|
#include "util/murmurhash.h"
|
||||||
|
#include "util/mutexlock.h"
|
||||||
|
#include "util/perf_context_imp.h"
|
||||||
#include "util/statistics.h"
|
#include "util/statistics.h"
|
||||||
|
#include "util/stop_watch.h"
|
||||||
|
|
||||||
namespace std {
|
namespace std {
|
||||||
template <>
|
template <>
|
||||||
@ -37,9 +41,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
                    const ColumnFamilyOptions& options)
     : comparator_(cmp),
       refs_(0),
-      arena_impl_(options.arena_block_size),
-      table_(options.memtable_factory->CreateMemTableRep(comparator_,
-                                                         &arena_impl_)),
+      arena_(options.arena_block_size),
+      table_(options.memtable_factory->CreateMemTableRep(comparator_, &arena_)),
       flush_in_progress_(false),
       flush_completed_(false),
       file_number_(0),
@ -47,23 +50,36 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       mem_next_logfile_number_(0),
       mem_logfile_number_(0),
       locks_(options.inplace_update_support ? options.inplace_update_num_locks
-                                             : 0) {}
+                                             : 0),
+      prefix_extractor_(options.prefix_extractor) {
+  if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
+    prefix_bloom_.reset(new DynamicBloom(options.memtable_prefix_bloom_bits,
+                                         options.memtable_prefix_bloom_probes));
+  }
+}

 MemTable::~MemTable() {
   assert(refs_ == 0);
 }

 size_t MemTable::ApproximateMemoryUsage() {
-  return arena_impl_.ApproximateMemoryUsage() +
-         table_->ApproximateMemoryUsage();
+  return arena_.ApproximateMemoryUsage() + table_->ApproximateMemoryUsage();
 }

-int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+                                        const char* prefix_len_key2) const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+  Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+  return comparator.Compare(k1, k2);
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key,
+                                        const Slice& key)
     const {
   // Internal keys are encoded as length-prefixed strings.
-  Slice a = GetLengthPrefixedSlice(aptr);
-  Slice b = GetLengthPrefixedSlice(bptr);
-  return comparator.Compare(a, b);
+  Slice a = GetLengthPrefixedSlice(prefix_len_key);
+  return comparator.Compare(a, key);
 }

 Slice MemTableRep::UserKey(const char* key) const {
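The new constructor arms a DynamicBloom sized by memtable_prefix_bloom_bits/memtable_prefix_bloom_probes and keyed on the configured prefix extractor: every Add() will feed the key's prefix into the filter, so later lookups can skip the memtable when the prefix was never inserted. A minimal sketch of that filter idea (a simplified stand-in, not RocksDB's util/dynamic_bloom.h; the class name and hashing scheme below are illustrative assumptions, and total_bits must be non-zero):

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Simplified prefix bloom: set num_probes bits per prefix via double hashing.
// False positives are possible, false negatives are not -- which is all the
// memtable needs to safely skip a lookup.
class PrefixBloom {
 public:
  PrefixBloom(size_t total_bits, int num_probes)
      : bits_(total_bits, false), num_probes_(num_probes) {}

  void Add(const std::string& prefix) {
    uint64_t h = std::hash<std::string>{}(prefix);
    uint64_t delta = (h >> 17) | (h << 47);  // second hash derived from the first
    for (int i = 0; i < num_probes_; ++i) {
      bits_[h % bits_.size()] = true;
      h += delta;
    }
  }

  bool MayContain(const std::string& prefix) const {
    uint64_t h = std::hash<std::string>{}(prefix);
    uint64_t delta = (h >> 17) | (h << 47);
    for (int i = 0; i < num_probes_; ++i) {
      if (!bits_[h % bits_.size()]) return false;  // definitely absent
      h += delta;
    }
    return true;  // possibly present
  }

 private:
  std::vector<bool> bits_;
  int num_probes_;
};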
@ -74,7 +90,7 @@ Slice MemTableRep::UserKey(const char* key) const {
 // Encode a suitable internal key target for "target" and return it.
 // Uses *scratch as scratch space, and the returned pointer will point
 // into this scratch space.
-static const char* EncodeKey(std::string* scratch, const Slice& target) {
+const char* EncodeKey(std::string* scratch, const Slice& target) {
   scratch->clear();
   PutVarint32(scratch, target.size());
   scratch->append(target.data(), target.size());
@ -83,27 +99,53 @@ static const char* EncodeKey(std::string* scratch, const Slice& target) {

 class MemTableIterator: public Iterator {
  public:
-  MemTableIterator(MemTableRep* table, const ReadOptions& options)
-      : iter_() {
+  MemTableIterator(const MemTable& mem, const ReadOptions& options)
+      : mem_(mem), iter_(), dynamic_prefix_seek_(false), valid_(false) {
     if (options.prefix) {
-      iter_.reset(table->GetPrefixIterator(*options.prefix));
+      iter_.reset(mem_.table_->GetPrefixIterator(*options.prefix));
     } else if (options.prefix_seek) {
-      iter_.reset(table->GetDynamicPrefixIterator());
+      dynamic_prefix_seek_ = true;
+      iter_.reset(mem_.table_->GetDynamicPrefixIterator());
     } else {
-      iter_.reset(table->GetIterator());
+      iter_.reset(mem_.table_->GetIterator());
     }
   }

-  virtual bool Valid() const { return iter_->Valid(); }
-  virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
-  virtual void SeekToFirst() { iter_->SeekToFirst(); }
-  virtual void SeekToLast() { iter_->SeekToLast(); }
-  virtual void Next() { iter_->Next(); }
-  virtual void Prev() { iter_->Prev(); }
+  virtual bool Valid() const { return valid_; }
+  virtual void Seek(const Slice& k) {
+    if (dynamic_prefix_seek_ && mem_.prefix_bloom_ &&
+        !mem_.prefix_bloom_->MayContain(
+            mem_.prefix_extractor_->Transform(ExtractUserKey(k)))) {
+      valid_ = false;
+      return;
+    }
+    iter_->Seek(k, nullptr);
+    valid_ = iter_->Valid();
+  }
+  virtual void SeekToFirst() {
+    iter_->SeekToFirst();
+    valid_ = iter_->Valid();
+  }
+  virtual void SeekToLast() {
+    iter_->SeekToLast();
+    valid_ = iter_->Valid();
+  }
+  virtual void Next() {
+    assert(Valid());
+    iter_->Next();
+    valid_ = iter_->Valid();
+  }
+  virtual void Prev() {
+    assert(Valid());
+    iter_->Prev();
+    valid_ = iter_->Valid();
+  }
   virtual Slice key() const {
+    assert(Valid());
     return GetLengthPrefixedSlice(iter_->key());
   }
   virtual Slice value() const {
+    assert(Valid());
     Slice key_slice = GetLengthPrefixedSlice(iter_->key());
     return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
   }
@ -111,8 +153,10 @@ class MemTableIterator: public Iterator {
   virtual Status status() const { return Status::OK(); }

  private:
-  std::unique_ptr<MemTableRep::Iterator> iter_;
-  std::string tmp_;       // For passing to EncodeKey
+  const MemTable& mem_;
+  std::shared_ptr<MemTableRep::Iterator> iter_;
+  bool dynamic_prefix_seek_;
+  bool valid_;

   // No copying allowed
   MemTableIterator(const MemTableIterator&);
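The rewritten iterator no longer forwards Valid() straight to the underlying MemTableRep iterator; it keeps its own valid_ flag so that a bloom-filter miss in Seek() can mark the iterator invalid without ever positioning the inner iterator. A stripped-down sketch of that wrapping pattern (interface names here are illustrative stand-ins, not the RocksDB classes):

#include <cassert>
#include <memory>

// Hypothetical minimal cursor interface, standing in for MemTableRep::Iterator.
struct Cursor {
  virtual ~Cursor() {}
  virtual bool Valid() const = 0;
  virtual void Next() = 0;
};

// Wrapper that caches validity after every move, so Valid() can also be forced
// to false by an external check (e.g. a bloom-filter miss) without touching
// the underlying cursor.
class CachingCursor {
 public:
  explicit CachingCursor(std::unique_ptr<Cursor> inner)
      : inner_(std::move(inner)), valid_(false) {}

  bool Valid() const { return valid_; }

  void Invalidate() { valid_ = false; }  // what the bloom miss does in Seek()

  void Next() {
    assert(Valid());
    inner_->Next();
    valid_ = inner_->Valid();  // refresh the cached flag after the move
  }

 private:
  std::unique_ptr<Cursor> inner_;
  bool valid_;
};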
@ -120,7 +164,7 @@ class MemTableIterator: public Iterator {
 };

 Iterator* MemTable::NewIterator(const ReadOptions& options) {
-  return new MemTableIterator(table_.get(), options);
+  return new MemTableIterator(*this, options);
 }

 port::RWMutex* MemTable::GetLock(const Slice& key) {
@ -128,7 +172,7 @@ port::RWMutex* MemTable::GetLock(const Slice& key) {
 }

 void MemTable::Add(SequenceNumber s, ValueType type,
-                   const Slice& key,
+                   const Slice& key, /* user key */
                    const Slice& value) {
   // Format of an entry is concatenation of:
   //  key_size     : varint32 of internal_key.size()
@ -141,7 +185,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   const size_t encoded_len =
       VarintLength(internal_key_size) + internal_key_size +
       VarintLength(val_size) + val_size;
-  char* buf = arena_impl_.Allocate(encoded_len);
+  char* buf = arena_.Allocate(encoded_len);
   char* p = EncodeVarint32(buf, internal_key_size);
   memcpy(p, key.data(), key_size);
   p += key_size;
@ -152,6 +196,11 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
   table_->Insert(buf);

+  if (prefix_bloom_) {
+    assert(prefix_extractor_);
+    prefix_bloom_->Add(prefix_extractor_->Transform(key));
+  }
+
   // The first sequence number inserted into the memtable
   assert(first_seqno_ == 0 || s > first_seqno_);
   if (first_seqno_ == 0) {
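Each Add() serializes one self-describing record into arena memory; the 8-byte tag packs the sequence number and value type so a later Get() can recover both from the same bytes. A standalone sketch of that layout (helper names are mine; RocksDB's own util/coding.h provides the real PutVarint32/EncodeFixed64):

#include <cstdint>
#include <string>

// Little-endian fixed64, as used for the (sequence << 8 | type) tag.
static void PutFixed64LE(std::string* dst, uint64_t v) {
  char buf[8];
  for (int i = 0; i < 8; ++i) buf[i] = static_cast<char>((v >> (8 * i)) & 0xff);
  dst->append(buf, 8);
}

// LEB128-style varint32, as used for the length prefixes.
static void PutVarint32Sketch(std::string* dst, uint32_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>(v | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Layout mirrored from the comment in MemTable::Add():
//   varint32(user_key.size() + 8) | user_key | fixed64(seq << 8 | type)
//   | varint32(value.size()) | value
std::string EncodeMemTableEntry(const std::string& user_key, uint64_t seq,
                                uint8_t type, const std::string& value) {
  std::string buf;
  PutVarint32Sketch(&buf, static_cast<uint32_t>(user_key.size() + 8));
  buf.append(user_key);
  PutFixed64LE(&buf, (seq << 8) | type);
  PutVarint32Sketch(&buf, static_cast<uint32_t>(value.size()));
  buf.append(value);
  return buf;
}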
@ -161,17 +210,28 @@ void MemTable::Add(SequenceNumber s, ValueType type,

 bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
                    MergeContext& merge_context, const Options& options) {
-  Slice memkey = key.memtable_key();
-  std::unique_ptr<MemTableRep::Iterator> iter(
-      table_->GetIterator(key.user_key()));
-  iter->Seek(memkey.data());
+  StopWatchNano memtable_get_timer(options.env, false);
+  StartPerfTimer(&memtable_get_timer);
+
+  Slice mem_key = key.memtable_key();
+  Slice user_key = key.user_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter;
+  if (prefix_bloom_ &&
+      !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
+    // iter is null if prefix bloom says the key does not exist
+  } else {
+    iter.reset(table_->GetIterator(user_key));
+    iter->Seek(key.internal_key(), mem_key.data());
+  }

   bool merge_in_progress = s->IsMergeInProgress();
   auto merge_operator = options.merge_operator.get();
   auto logger = options.info_log;
   std::string merge_result;

-  for (; iter->Valid(); iter->Next()) {
+  bool found_final_value = false;
+  for (; !found_final_value && iter && iter->Valid(); iter->Next()) {
     // entry format is:
     //  klength  varint32
     //  userkey  char[klength-8]
@ -182,7 +242,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
     // sequence number since the Seek() call above should have skipped
     // all entries with overly large sequence numbers.
     const char* entry = iter->key();
-    uint32_t key_length;
+    uint32_t key_length = 0;
     const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
     if (comparator_.comparator.user_comparator()->Compare(
             Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
@ -209,7 +269,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
         if (options.inplace_update_support) {
           GetLock(key.user_key())->Unlock();
         }
-        return true;
+        found_final_value = true;
+        break;
       }
       case kTypeDeletion: {
         if (merge_in_progress) {
@ -224,7 +285,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
         } else {
          *s = Status::NotFound();
         }
-        return true;
+        found_final_value = true;
+        break;
       }
       case kTypeMerge: {
         Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
@ -244,10 +306,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
         }
         break;
       }
-      case kTypeColumnFamilyDeletion:
-      case kTypeColumnFamilyValue:
-      case kTypeColumnFamilyMerge:
-      case kTypeLogData:
+      default:
         assert(false);
         break;
     }
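The switch above dispatches on the low byte of the tag that DecodeFixed64 pulls from the end of the internal key; the sequence number and the value type come out of the same 8 bytes. A minimal sketch of that unpacking (assuming a little-endian host, which is what the memcpy shortcut relies on):

#include <cstdint>
#include <cstring>
#include <utility>

// The low byte of the 8-byte tag is the ValueType, the upper 7 bytes are the
// sequence number -- the inverse of the (seq << 8 | type) packing shown earlier.
std::pair<uint64_t, uint8_t> DecodeTag(const char* tag_ptr) {
  uint64_t tag = 0;
  std::memcpy(&tag, tag_ptr, sizeof(tag));  // little-endian host assumed,
                                            // matching DecodeFixed64's fast path
  uint64_t sequence = tag >> 8;
  uint8_t type = static_cast<uint8_t>(tag & 0xff);
  return {sequence, type};
}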
@ -259,25 +318,27 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,

   // No change to value, since we have not yet found a Put/Delete

-  if (merge_in_progress) {
+  if (!found_final_value && merge_in_progress) {
     *s = Status::MergeInProgress("");
   }
-  return false;
+  BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer);
+  BumpPerfCount(&perf_context.get_from_memtable_count);
+  return found_final_value;
 }

-bool MemTable::Update(SequenceNumber seq, ValueType type,
+void MemTable::Update(SequenceNumber seq,
                       const Slice& key,
                       const Slice& value) {
   LookupKey lkey(key, seq);
-  Slice memkey = lkey.memtable_key();
+  Slice mem_key = lkey.memtable_key();

   std::unique_ptr<MemTableRep::Iterator> iter(
       table_->GetIterator(lkey.user_key()));
-  iter->Seek(memkey.data());
+  iter->Seek(lkey.internal_key(), mem_key.data());

   if (iter->Valid()) {
     // entry format is:
-    //  klength  varint32
+    //  key_length  varint32
     //  userkey  char[klength-8]
     //  tag      uint64
     //  vlength  varint32
@ -286,7 +347,7 @@ bool MemTable::Update(SequenceNumber seq, ValueType type,
     // sequence number since the Seek() call above should have skipped
     // all entries with overly large sequence numbers.
     const char* entry = iter->key();
-    uint32_t key_length;
+    uint32_t key_length = 0;
     const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
     if (comparator_.comparator.user_comparator()->Compare(
             Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
@ -294,32 +355,105 @@ bool MemTable::Update(SequenceNumber seq, ValueType type,
       const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
       switch (static_cast<ValueType>(tag & 0xff)) {
         case kTypeValue: {
-          uint32_t vlength;
-          GetVarint32Ptr(key_ptr + key_length,
-                         key_ptr + key_length+5, &vlength);
-          // Update value, if newValue size <= curValue size
-          if (value.size() <= vlength) {
+          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+          uint32_t prev_size = prev_value.size();
+          uint32_t new_size = value.size();
+
+          // Update value, if new value size <= previous value size
+          if (new_size <= prev_size ) {
             char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
-                                     value.size());
+                                     new_size);
             WriteLock wl(GetLock(lkey.user_key()));
             memcpy(p, value.data(), value.size());
             assert((unsigned)((p + value.size()) - entry) ==
                    (unsigned)(VarintLength(key_length) + key_length +
                               VarintLength(value.size()) + value.size()));
-            return true;
+            return;
           }
         }
         default:
           // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
-          // then we probably don't have enough space to update in-place
-          // Maybe do something later
-          // Return false, and do normal Add()
-          return false;
+          // we don't have enough space for update inplace
+          Add(seq, kTypeValue, key, value);
+          return;
       }
     }
   }

-  // Key doesn't exist
+  // key doesn't exist
+  Add(seq, kTypeValue, key, value);
+}
+
+bool MemTable::UpdateCallback(SequenceNumber seq,
+                              const Slice& key,
+                              const Slice& delta,
+                              const Options& options) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::shared_ptr<MemTableRep::Iterator> iter(
+      table_->GetIterator(lkey.user_key()));
+  iter->Seek(lkey.internal_key(), memkey.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //  key_length  varint32
+    //  userkey  char[klength-8]
+    //  tag      uint64
+    //  vlength  varint32
+    //  value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+            Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+          uint32_t prev_size = prev_value.size();
+
+          char* prev_buffer = const_cast<char*>(prev_value.data());
+          uint32_t new_prev_size = prev_size;
+
+          std::string str_value;
+          WriteLock wl(GetLock(lkey.user_key()));
+          auto status = options.inplace_callback(prev_buffer, &new_prev_size,
+                                                 delta, &str_value);
+          if (status == UpdateStatus::UPDATED_INPLACE) {
+            // Value already updated by callback.
+            assert(new_prev_size <= prev_size);
+            if (new_prev_size < prev_size) {
+              // overwrite the new prev_size
+              char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                       new_prev_size);
+              if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+                // shift the value buffer as well.
+                memcpy(p, prev_buffer, new_prev_size);
+              }
+            }
+            RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
+            return true;
+          } else if (status == UpdateStatus::UPDATED) {
+            Add(seq, kTypeValue, key, Slice(str_value));
+            RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
+            return true;
+          } else if (status == UpdateStatus::UPDATE_FAILED) {
+            // No action required. Return.
+            return true;
+          }
+        }
+        default:
+          break;
+      }
+    }
+  }
+  // If the latest value is not kTypeValue
+  // or key doesn't exist
   return false;
 }

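UpdateCallback() above delegates the merge decision to options.inplace_callback and only patches the varint length prefix when the callback shrank the value in place. A sketch of what such a callback can look like under that contract (the enum and string parameter below are local stand-ins; the real signature in rocksdb/options.h takes a Slice delta and a plain enum):

#include <cstdint>
#include <cstring>
#include <string>

// Local stand-in for the status values used by the memtable code above.
enum class UpdateStatus { UPDATE_FAILED, UPDATED_INPLACE, UPDATED };

// Example callback: overwrite the stored value with `delta` when it fits in
// the space the old value already occupies, otherwise hand back a merged
// value and let the memtable Add() a fresh entry.
UpdateStatus ReplaceIfFitsCallback(char* existing_value,
                                   uint32_t* existing_value_size,
                                   const std::string& delta,
                                   std::string* merged_value) {
  if (delta.size() <= *existing_value_size) {
    std::memcpy(existing_value, delta.data(), delta.size());
    *existing_value_size = static_cast<uint32_t>(delta.size());
    return UpdateStatus::UPDATED_INPLACE;  // caller patches the length prefix
  }
  *merged_value = delta;   // too big: ask the memtable to Add() this value
  return UpdateStatus::UPDATED;
}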
@ -331,13 +465,13 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
   // The iterator only needs to be ordered within the same user key.
   std::unique_ptr<MemTableRep::Iterator> iter(
       table_->GetIterator(key.user_key()));
-  iter->Seek(memkey.data());
+  iter->Seek(key.internal_key(), memkey.data());

   size_t num_successive_merges = 0;

   for (; iter->Valid(); iter->Next()) {
     const char* entry = iter->key();
-    uint32_t key_length;
+    uint32_t key_length = 0;
     const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
     if (!comparator_.comparator.user_comparator()->Compare(
             Slice(iter_key_ptr, key_length - 8), key.user_key()) == 0) {
db/memtable.h
@ -16,7 +16,8 @@
 #include "db/version_edit.h"
 #include "rocksdb/db.h"
 #include "rocksdb/memtablerep.h"
-#include "util/arena_impl.h"
+#include "util/arena.h"
+#include "util/dynamic_bloom.h"

 namespace rocksdb {

@ -29,7 +30,10 @@ class MemTable {
   struct KeyComparator : public MemTableRep::KeyComparator {
     const InternalKeyComparator comparator;
     explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
-    virtual int operator()(const char* a, const char* b) const;
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const;
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const override;
   };

   // MemTables are reference counted.  The initial reference count
@ -94,16 +98,31 @@ class MemTable {
   bool Get(const LookupKey& key, std::string* value, Status* s,
            MergeContext& merge_context, const Options& options);

-  // Update the value and return status ok,
-  //   if key exists in current memtable
-  //     if new sizeof(new_value) <= sizeof(old_value) &&
-  //       old_value for that key is a put i.e. kTypeValue
-  //     else return false, and status - NotUpdatable()
-  //   else return false, and status - NotFound()
-  bool Update(SequenceNumber seq, ValueType type,
+  // Attempts to update the new_value inplace, else does normal Add
+  // Pseudocode
+  //   if key exists in current memtable && prev_value is of type kTypeValue
+  //     if new sizeof(new_value) <= sizeof(prev_value)
+  //       update inplace
+  //     else add(key, new_value)
+  //   else add(key, new_value)
+  void Update(SequenceNumber seq,
               const Slice& key,
               const Slice& value);

+  // If prev_value for key exits, attempts to update it inplace.
+  // else returns false
+  // Pseudocode
+  //   if key exists in current memtable && prev_value is of type kTypeValue
+  //     new_value = delta(prev_value)
+  //     if sizeof(new_value) <= sizeof(prev_value)
+  //       update inplace
+  //     else add(key, new_value)
+  //   else return false
+  bool UpdateCallback(SequenceNumber seq,
+                      const Slice& key,
+                      const Slice& delta,
+                      const Options& options);
+
   // Returns the number of successive merge entries starting from the newest
   // entry for the key up to the last non-merge entry or last entry for the
   // key in the memtable.
@ -142,7 +161,7 @@ class MemTable {

   KeyComparator comparator_;
   int refs_;
-  ArenaImpl arena_impl_;
+  Arena arena_;
   unique_ptr<MemTableRep> table_;

   // These are used to manage memtable flushes to storage
@ -150,7 +169,7 @@ class MemTable {
   bool flush_completed_;     // finished the flush
   uint64_t file_number_;     // filled up after flush is complete

-  // The udpates to be applied to the transaction log when this
+  // The updates to be applied to the transaction log when this
   // memtable is flushed to storage.
   VersionEdit edit_;

@ -173,6 +192,11 @@ class MemTable {

   // Get the lock associated for the key
   port::RWMutex* GetLock(const Slice& key);
+
+  const SliceTransform* const prefix_extractor_;
+  std::unique_ptr<DynamicBloom> prefix_bloom_;
 };

+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
 }  // namespace rocksdb
db/memtable_list.cc
@ -3,7 +3,7 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 //
-#include "db/memtablelist.h"
+#include "db/memtable_list.h"

 #include <string>
 #include "rocksdb/db.h"
@ -31,7 +31,7 @@ MemTableListVersion::MemTableListVersion(MemTableListVersion* old) {

 void MemTableListVersion::Ref() { ++refs_; }

-void MemTableListVersion::Unref(std::vector<MemTable*>* to_delete) {
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
   assert(refs_ >= 1);
   --refs_;
   if (refs_ == 0) {
@ -103,7 +103,7 @@ bool MemTableList::IsFlushPending() {
 }

 // Returns the memtables that need to be flushed.
-void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
+void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
   const auto& memlist = current_->memlist_;
   for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
     MemTable* m = *it;
@ -113,18 +113,18 @@ void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
       if (num_flush_not_started_ == 0) {
         imm_flush_needed.Release_Store(nullptr);
       }
       m->flush_in_progress_ = true;  // flushing will start very soon
       ret->push_back(m);
     }
   }
   flush_requested_ = false;  // start-flush request is complete
 }

 // Record a successful flush in the manifest file
 Status MemTableList::InstallMemtableFlushResults(
-    ColumnFamilyData* cfd, const std::vector<MemTable*>& mems, VersionSet* vset,
+    ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
     Status flushStatus, port::Mutex* mu, Logger* info_log, uint64_t file_number,
-    std::set<uint64_t>& pending_outputs, std::vector<MemTable*>* to_delete,
+    std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
     Directory* db_directory) {
   mu->AssertHeld();

db/memtable_list.h
@ -3,18 +3,25 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 //

 #pragma once

 #include <string>
 #include <list>
 #include <vector>
 #include <set>
-#include "rocksdb/db.h"
-#include "rocksdb/options.h"
-#include "rocksdb/iterator.h"
-
-#include "db/dbformat.h"
-#include "db/skiplist.h"
-#include "db/memtable.h"
+#include <deque>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/skiplist.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "util/autovector.h"

 namespace rocksdb {

@ -30,7 +37,7 @@ class MemTableListVersion {
   explicit MemTableListVersion(MemTableListVersion* old = nullptr);

   void Ref();
-  void Unref(std::vector<MemTable*>* to_delete = nullptr);
+  void Unref(autovector<MemTable*>* to_delete = nullptr);

   int size() const;

@ -89,14 +96,14 @@ class MemTableList {

   // Returns the earliest memtables that needs to be flushed. The returned
   // memtables are guaranteed to be in the ascending order of created time.
-  void PickMemtablesToFlush(std::vector<MemTable*>* mems);
+  void PickMemtablesToFlush(autovector<MemTable*>* mems);

   // Commit a successful flush in the manifest file
   Status InstallMemtableFlushResults(
-      ColumnFamilyData* cfd, const std::vector<MemTable*>& m, VersionSet* vset,
+      ColumnFamilyData* cfd, const autovector<MemTable*>& m, VersionSet* vset,
       Status flushStatus, port::Mutex* mu, Logger* info_log,
       uint64_t file_number, std::set<uint64_t>& pending_outputs,
-      std::vector<MemTable*>* to_delete, Directory* db_directory);
+      autovector<MemTable*>* to_delete, Directory* db_directory);

   // New memtables are inserted at the front of the list.
   // Takes ownership of the referenced held on *m by the caller of Add().
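The std::vector-to-autovector swaps above are about avoiding heap allocation for the typically tiny lists of memtables involved in a flush. A rough sketch of the small-buffer idea behind util/autovector.h (the SmallVector name and inline capacity are illustrative, not the real class, which is also iterator-compatible with std::vector):

#include <cstddef>
#include <vector>

// Keep the first few elements in an inline array so the common "a handful of
// memtables to flush" case never touches the heap; spill to a std::vector
// only past that point.
template <typename T, size_t kInline = 8>
class SmallVector {
 public:
  void push_back(const T& value) {
    if (inline_size_ < kInline) {
      inline_[inline_size_++] = value;
    } else {
      overflow_.push_back(value);
    }
  }

  size_t size() const { return inline_size_ + overflow_.size(); }

  const T& operator[](size_t i) const {
    return i < inline_size_ ? inline_[i] : overflow_[i - inline_size_];
  }

 private:
  T inline_[kInline] = {};
  size_t inline_size_ = 0;
  std::vector<T> overflow_;
};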
db/perf_context_test.cc
@ -174,6 +174,13 @@ void ProfileKeyComparison() {

   HistogramImpl hist_put;
   HistogramImpl hist_get;
+  HistogramImpl hist_get_snapshot;
+  HistogramImpl hist_get_memtable;
+  HistogramImpl hist_get_post_process;
+  HistogramImpl hist_num_memtable_checked;
+  HistogramImpl hist_write_pre_post;
+  HistogramImpl hist_write_wal_time;
+  HistogramImpl hist_write_memtable_time;

   std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";

@ -192,16 +199,37 @@ void ProfileKeyComparison() {

     perf_context.Reset();
     db->Put(write_options, key, value);
+    hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
+    hist_write_wal_time.Add(perf_context.write_wal_time);
+    hist_write_memtable_time.Add(perf_context.write_memtable_time);
     hist_put.Add(perf_context.user_key_comparison_count);

     perf_context.Reset();
     db->Get(read_options, key, &value);
+    hist_get_snapshot.Add(perf_context.get_snapshot_time);
+    hist_get_memtable.Add(perf_context.get_from_memtable_time);
+    hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_get_post_process.Add(perf_context.get_post_process_time);
     hist_get.Add(perf_context.user_key_comparison_count);
   }

   std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
             << "Get uesr key comparison: \n" << hist_get.ToString();
+  std::cout << "Put(): Pre and Post Process Time: \n"
+            << hist_write_pre_post.ToString()
+            << " Writing WAL time: \n"
+            << hist_write_wal_time.ToString() << "\n"
+            << " Writing Mem Table time: \n"
+            << hist_write_memtable_time.ToString() << "\n";
+
+  std::cout << "Get(): Time to get snapshot: \n"
+            << hist_get_snapshot.ToString()
+            << " Time to get value from memtables: \n"
+            << hist_get_memtable.ToString() << "\n"
+            << " Number of memtables checked: \n"
+            << hist_num_memtable_checked.ToString() << "\n"
+            << " Time to post process: \n"
+            << hist_get_post_process.ToString() << "\n";
 }

 TEST(PerfContextTest, KeyComparisonCount) {
@ -259,8 +287,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
     db->Put(write_options, key, value);
     auto put_time = timer.ElapsedNanos();
     hist_put_time.Add(put_time);
-    hist_wal_time.Add(perf_context.wal_write_time);
-    hist_time_diff.Add(put_time - perf_context.wal_write_time);
+    hist_wal_time.Add(perf_context.write_wal_time);
+    hist_time_diff.Add(put_time - perf_context.write_wal_time);
   }

   std::cout << "Put time:\n" << hist_put_time.ToString()
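The test above always follows the same reset-operate-read pattern around the thread-local perf_context counters. A stripped-down sketch of that measurement loop (the FakePerfContext fields are illustrative stand-ins; the real counters are the perf_context members read in the diff above):

#include <cstdint>
#include <vector>

// Hypothetical stand-in for the handful of counters the test reads.
struct FakePerfContext {
  uint64_t write_wal_time = 0;
  uint64_t get_from_memtable_time = 0;
  uint64_t user_key_comparison_count = 0;
  void Reset() { *this = FakePerfContext(); }
};

// Reset before the operation, read the counters right after it, and
// accumulate them into per-metric series (the test feeds HistogramImpl).
template <typename Op>
void MeasureOne(FakePerfContext* ctx, Op op,
                std::vector<uint64_t>* wal_times,
                std::vector<uint64_t>* comparisons) {
  ctx->Reset();
  op();  // e.g. db->Put(...) or db->Get(...)
  wal_times->push_back(ctx->write_wal_time);
  comparisons->push_back(ctx->user_key_comparison_count);
}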
337
db/plain_table_db_test.cc
Normal file
@ -0,0 +1,337 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <set>

#include "db/db_impl.h"
#include "db/filename.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "table/plain_table_factory.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "utilities/merge_operators.h"

using std::unique_ptr;

namespace rocksdb {

class PlainTableDBTest {
 protected:
 private:
  std::string dbname_;
  Env* env_;
  DB* db_;

  Options last_options_;
  static std::unique_ptr<const SliceTransform> prefix_transform;

 public:
  PlainTableDBTest() : env_(Env::Default()) {
    dbname_ = test::TmpDir() + "/plain_table_db_test";
    ASSERT_OK(DestroyDB(dbname_, Options()));
    db_ = nullptr;
    Reopen();
  }

  ~PlainTableDBTest() {
    delete db_;
    ASSERT_OK(DestroyDB(dbname_, Options()));
  }

  // Return the current option configuration.
  Options CurrentOptions() {
    Options options;
    options.table_factory.reset(new PlainTableFactory(16, 2, 0.8));
    options.prefix_extractor = prefix_transform.get();
    options.allow_mmap_reads = true;
    return options;
  }

  DBImpl* dbfull() {
    return reinterpret_cast<DBImpl*>(db_);
  }

  void Reopen(Options* options = nullptr) {
    ASSERT_OK(TryReopen(options));
  }

  void Close() {
    delete db_;
    db_ = nullptr;
  }

  void DestroyAndReopen(Options* options = nullptr) {
    //Destroy using last options
    Destroy(&last_options_);
    ASSERT_OK(TryReopen(options));
  }

  void Destroy(Options* options) {
    delete db_;
    db_ = nullptr;
    ASSERT_OK(DestroyDB(dbname_, *options));
  }

  Status PureReopen(Options* options, DB** db) {
    return DB::Open(*options, dbname_, db);
  }

  Status TryReopen(Options* options = nullptr) {
    delete db_;
    db_ = nullptr;
    Options opts;
    if (options != nullptr) {
      opts = *options;
    } else {
      opts = CurrentOptions();
      opts.create_if_missing = true;
    }
    last_options_ = opts;

    return DB::Open(opts, dbname_, &db_);
  }

  Status Put(const Slice& k, const Slice& v) {
    return db_->Put(WriteOptions(), k, v);
  }

  Status Delete(const std::string& k) {
    return db_->Delete(WriteOptions(), k);
  }

  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
    ReadOptions options;
    options.snapshot = snapshot;
    std::string result;
    Status s = db_->Get(options, k, &result);
    if (s.IsNotFound()) {
      result = "NOT_FOUND";
    } else if (!s.ok()) {
      result = s.ToString();
    }
    return result;
  }


  int NumTableFilesAtLevel(int level) {
    std::string property;
    ASSERT_TRUE(
        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
                         &property));
    return atoi(property.c_str());
  }

  // Return spread of files per level
  std::string FilesPerLevel() {
    std::string result;
    int last_non_zero_offset = 0;
    for (int level = 0; level < db_->NumberLevels(); level++) {
      int f = NumTableFilesAtLevel(level);
      char buf[100];
      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
      result += buf;
      if (f > 0) {
        last_non_zero_offset = result.size();
      }
    }
    result.resize(last_non_zero_offset);
    return result;
  }

  std::string IterStatus(Iterator* iter) {
    std::string result;
    if (iter->Valid()) {
      result = iter->key().ToString() + "->" + iter->value().ToString();
    } else {
      result = "(invalid)";
    }
    return result;
  }
};

std::unique_ptr<const SliceTransform> PlainTableDBTest::prefix_transform(
    NewFixedPrefixTransform(8));

TEST(PlainTableDBTest, Empty) {
  ASSERT_TRUE(dbfull() != nullptr);
  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
}

TEST(PlainTableDBTest, ReadWrite) {
  ASSERT_OK(Put("1000000000000foo", "v1"));
  ASSERT_EQ("v1", Get("1000000000000foo"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("1000000000000foo", "v3"));
  ASSERT_EQ("v3", Get("1000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}

TEST(PlainTableDBTest, Flush) {
  ASSERT_OK(Put("1000000000000foo", "v1"));
  ASSERT_OK(Put("0000000000000bar", "v2"));
  ASSERT_OK(Put("1000000000000foo", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("1000000000000foo"));
  ASSERT_EQ("v2", Get("0000000000000bar"));
}

TEST(PlainTableDBTest, Iterator) {
  ASSERT_OK(Put("1000000000foo002", "v_2"));
  ASSERT_OK(Put("0000000000000bar", "random"));
  ASSERT_OK(Put("1000000000foo001", "v1"));
  ASSERT_OK(Put("3000000000000bar", "bar_v"));
  ASSERT_OK(Put("1000000000foo003", "v__3"));
  ASSERT_OK(Put("1000000000foo004", "v__4"));
  ASSERT_OK(Put("1000000000foo005", "v__5"));
  ASSERT_OK(Put("1000000000foo007", "v__7"));
  ASSERT_OK(Put("1000000000foo008", "v__8"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v1", Get("1000000000foo001"));
  ASSERT_EQ("v__3", Get("1000000000foo003"));
  ReadOptions ro;
  Iterator* iter = dbfull()->NewIterator(ro);
  iter->Seek("1000000000foo001");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo001", iter->key().ToString());
  ASSERT_EQ("v1", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo002", iter->key().ToString());
  ASSERT_EQ("v_2", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo003", iter->key().ToString());
  ASSERT_EQ("v__3", iter->value().ToString());

  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo004", iter->key().ToString());
  ASSERT_EQ("v__4", iter->value().ToString());

  iter->Seek("3000000000000bar");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("3000000000000bar", iter->key().ToString());
  ASSERT_EQ("bar_v", iter->value().ToString());

  iter->Seek("1000000000foo000");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo001", iter->key().ToString());
  ASSERT_EQ("v1", iter->value().ToString());

  iter->Seek("1000000000foo005");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo005", iter->key().ToString());
  ASSERT_EQ("v__5", iter->value().ToString());

  iter->Seek("1000000000foo006");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo007", iter->key().ToString());
  ASSERT_EQ("v__7", iter->value().ToString());

  iter->Seek("1000000000foo008");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("1000000000foo008", iter->key().ToString());
  ASSERT_EQ("v__8", iter->value().ToString());

  iter->Seek("1000000000foo009");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("3000000000000bar", iter->key().ToString());


  delete iter;
}

TEST(PlainTableDBTest, Flush2) {
  ASSERT_OK(Put("0000000000000bar", "b"));
  ASSERT_OK(Put("1000000000000foo", "v1"));
  dbfull()->TEST_FlushMemTable();

  ASSERT_OK(Put("1000000000000foo", "v2"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v2", Get("1000000000000foo"));

  ASSERT_OK(Put("0000000000000eee", "v3"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v3", Get("0000000000000eee"));

  ASSERT_OK(Delete("0000000000000bar"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));

  ASSERT_OK(Put("0000000000000eee", "v5"));
  dbfull()->TEST_FlushMemTable();
  ASSERT_EQ("v5", Get("0000000000000eee"));
}

static std::string Key(int i) {
  char buf[100];
  snprintf(buf, sizeof(buf), "key_______%06d", i);
  return std::string(buf);
}

static std::string RandomString(Random* rnd, int len) {
  std::string r;
  test::RandomString(rnd, len, &r);
  return r;
}

TEST(PlainTableDBTest, CompactionTrigger) {
  Options options = CurrentOptions();
  options.write_buffer_size = 100 << 10; //100KB
  options.num_levels = 3;
  options.max_mem_compaction_level = 0;
  options.level0_file_num_compaction_trigger = 3;
  Reopen(&options);

  Random rnd(301);

  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
      num++) {
    std::vector<std::string> values;
    // Write 120KB (12 values, each 10K)
    for (int i = 0; i < 12; i++) {
      values.push_back(RandomString(&rnd, 10000));
      ASSERT_OK(Put(Key(i), values[i]));
    }
    dbfull()->TEST_WaitForFlushMemTable();
    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
  }

  //generate one more file in level-0, and should trigger level-0 compaction
  std::vector<std::string> values;
  for (int i = 0; i < 12; i++) {
    values.push_back(RandomString(&rnd, 10000));
    ASSERT_OK(Put(Key(i), values[i]));
  }
  dbfull()->TEST_WaitForCompact();

  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
}

}  // namespace rocksdb

int main(int argc, char** argv) {
  return rocksdb::test::RunAllTests();
}
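For reference, the fixture's CurrentOptions()/TryReopen() path boils down to a handful of option assignments. A condensed sketch of that open sequence, using only the calls shown in the test above (the function name and dbname are illustrative; assumes the same 2014-era option fields and PlainTableFactory(user_key_len, bloom_bits_per_key, hash_table_ratio) constructor):

#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/slice_transform.h"
#include "table/plain_table_factory.h"

// Opens a database the way the fixture does: PlainTable with 16-byte user
// keys, 2 bloom bits per key, 0.8 hash load factor, a fixed 8-byte prefix
// extractor, and mmap reads enabled.
rocksdb::Status OpenPlainTableDb(const std::string& dbname, rocksdb::DB** db) {
  static std::unique_ptr<const rocksdb::SliceTransform> prefix(
      rocksdb::NewFixedPrefixTransform(8));

  rocksdb::Options options;
  options.create_if_missing = true;
  options.table_factory.reset(new rocksdb::PlainTableFactory(16, 2, 0.8));
  options.prefix_extractor = prefix.get();
  options.allow_mmap_reads = true;
  return rocksdb::DB::Open(options, dbname, db);
}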
db/prefix_test.cc
@ -16,11 +16,15 @@ DEFINE_bool(trigger_deadlock, false,
 DEFINE_uint64(bucket_count, 100000, "number of buckets");
 DEFINE_uint64(num_locks, 10001, "number of locks");
 DEFINE_bool(random_prefix, false, "randomize prefix");
-DEFINE_uint64(total_prefixes, 1000, "total number of prefixes");
-DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix");
-DEFINE_int64(write_buffer_size, 1000000000, "");
-DEFINE_int64(max_write_buffer_number, 8, "");
-DEFINE_int64(min_write_buffer_number_to_merge, 7, "");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int64(max_write_buffer_number, 2, "");
+DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
+DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
+DEFINE_int32(value_size, 40, "");

 // Path to the database on file system
 const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
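The new flags feed the option wiring exercised by the test in the hunk below: a prefix-aware memtable rep plus the memtable prefix bloom knobs. A sketch of that configuration, using the same factory calls that appear in the test (assumes the factory declarations from rocksdb/memtablerep.h as in this era of the code; the constants mirror the flag defaults rather than tuned recommendations):

#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

// Fixed 8-byte prefix extractor, hash-skiplist memtable over that prefix, and
// a memtable prefix bloom sized by bits/probes.
rocksdb::Options MakePrefixSeekOptions() {
  rocksdb::Options options;
  options.prefix_extractor = rocksdb::NewFixedPrefixTransform(8);
  options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
      options.prefix_extractor, /*bucket_count=*/100000,
      /*skiplist_height=*/4));
  options.memtable_prefix_bloom_bits = 10000000;
  options.memtable_prefix_bloom_probes = 10;
  return options;
}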
@ -104,218 +108,265 @@ class PrefixTest {
     options.min_write_buffer_number_to_merge =
         FLAGS_min_write_buffer_number_to_merge;

-    options.comparator = new TestKeyComparator();
-    if (FLAGS_use_prefix_hash_memtable) {
-      auto prefix_extractor = NewFixedPrefixTransform(8);
-      options.prefix_extractor = prefix_extractor;
-      options.memtable_factory.reset(NewHashSkipListRepFactory(
-          prefix_extractor, FLAGS_bucket_count));
-    }
+    options.memtable_prefix_bloom_bits = FLAGS_memtable_prefix_bloom_bits;
+    options.memtable_prefix_bloom_probes = FLAGS_memtable_prefix_bloom_probes;

     Status s = DB::Open(options, kDbName, &db);
     ASSERT_OK(s);
     return std::shared_ptr<DB>(db);
   }

+  bool NextOptions() {
+    // skip some options
+    option_config_++;
+    if (option_config_ < kEnd) {
+      auto prefix_extractor = NewFixedPrefixTransform(8);
+      options.prefix_extractor = prefix_extractor;
+      switch(option_config_) {
+        case kHashSkipList:
+          options.memtable_factory.reset(
+              NewHashSkipListRepFactory(options.prefix_extractor,
+                                        FLAGS_bucket_count,
+                                        FLAGS_skiplist_height));
+          return true;
+        case kHashLinkList:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(options.prefix_extractor,
+                                        FLAGS_bucket_count));
+          return true;
+        default:
+          return false;
+      }
+    }
+    return false;
+  }
+
+  PrefixTest() : option_config_(kBegin) {
+    options.comparator = new TestKeyComparator();
+  }
   ~PrefixTest() {
     delete options.comparator;
   }
  protected:
+  enum OptionConfig {
+    kBegin,
+    kHashSkipList,
+    kHashLinkList,
+    kEnd
+  };
+  int option_config_;
   Options options;
 };

 TEST(PrefixTest, DynamicPrefixIterator) {
-
-  DestroyDB(kDbName, Options());
-  auto db = OpenDb();
-  WriteOptions write_options;
-  ReadOptions read_options;
-
-  std::vector<uint64_t> prefixes;
-  for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
-    prefixes.push_back(i);
-  }
-
-  if (FLAGS_random_prefix) {
-    std::random_shuffle(prefixes.begin(), prefixes.end());
-  }
-
-  // insert x random prefix, each with y continuous element.
-  for (auto prefix : prefixes) {
-    for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
-      TestKey test_key(prefix, sorted);
-
-      Slice key = TestKeyToSlice(test_key);
-      std::string value = "v" + std::to_string(sorted);
-
-      ASSERT_OK(db->Put(write_options, key, value));
-    }
-  }
-
-  // test seek existing keys
-  HistogramImpl hist_seek_time;
-  HistogramImpl hist_seek_comparison;
-
-  if (FLAGS_use_prefix_hash_memtable) {
-    read_options.prefix_seek = true;
-  }
-  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
-
-  for (auto prefix : prefixes) {
-    TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
-    Slice key = TestKeyToSlice(test_key);
-    std::string value = "v" + std::to_string(0);
-
-    perf_context.Reset();
-    StopWatchNano timer(Env::Default(), true);
-    uint64_t total_keys = 0;
-    for (iter->Seek(key); iter->Valid(); iter->Next()) {
-      if (FLAGS_trigger_deadlock) {
-        std::cout << "Behold the deadlock!\n";
-        db->Delete(write_options, iter->key());
-      }
-      auto test_key = SliceToTestKey(iter->key());
-      if (test_key->prefix != prefix) break;
-      total_keys++;
-    }
-    hist_seek_time.Add(timer.ElapsedNanos());
-    hist_seek_comparison.Add(perf_context.user_key_comparison_count);
-    ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
-  }
-
-  std::cout << "Seek key comparison: \n"
-            << hist_seek_comparison.ToString()
-            << "Seek time: \n"
-            << hist_seek_time.ToString();
-
-  // test non-existing keys
-  HistogramImpl hist_no_seek_time;
-  HistogramImpl hist_no_seek_comparison;
-
-  for (auto prefix = FLAGS_total_prefixes;
-       prefix < FLAGS_total_prefixes + 100;
-       prefix++) {
-    TestKey test_key(prefix, 0);
-    Slice key = TestKeyToSlice(test_key);
-
-    perf_context.Reset();
-    StopWatchNano timer(Env::Default(), true);
-    iter->Seek(key);
-    hist_no_seek_time.Add(timer.ElapsedNanos());
-    hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
-    ASSERT_TRUE(!iter->Valid());
-  }
-
-  std::cout << "non-existing Seek key comparison: \n"
-            << hist_no_seek_comparison.ToString()
-            << "non-existing Seek time: \n"
-            << hist_no_seek_time.ToString();
-}
-
-TEST(PrefixTest, PrefixHash) {
-
-  DestroyDB(kDbName, Options());
-  auto db = OpenDb();
-  WriteOptions write_options;
-  ReadOptions read_options;
-
-  std::vector<uint64_t> prefixes;
-  for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
-    prefixes.push_back(i);
-  }
-
-  if (FLAGS_random_prefix) {
-    std::random_shuffle(prefixes.begin(), prefixes.end());
-  }
-
-  // insert x random prefix, each with y continuous element.
-  HistogramImpl hist_put_time;
-  HistogramImpl hist_put_comparison;
-
-  for (auto prefix : prefixes) {
-    for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
-      TestKey test_key(prefix, sorted);
-      Slice key = TestKeyToSlice(test_key);
-      std::string value = "v" + std::to_string(sorted);
-
-      perf_context.Reset();
-      StopWatchNano timer(Env::Default(), true);
-      ASSERT_OK(db->Put(write_options, key, value));
-      hist_put_time.Add(timer.ElapsedNanos());
-      hist_put_comparison.Add(perf_context.user_key_comparison_count);
-    }
-  }
-
-  std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
-            << "Put time: \n" << hist_put_time.ToString();
-
-  // test seek existing keys
-  HistogramImpl hist_seek_time;
-  HistogramImpl hist_seek_comparison;
-
-  for (auto prefix : prefixes) {
-    TestKey test_key(prefix, 0);
-    Slice key = TestKeyToSlice(test_key);
-    std::string value = "v" + std::to_string(0);
-
-    Slice key_prefix;
-    if (FLAGS_use_prefix_hash_memtable) {
-      key_prefix = options.prefix_extractor->Transform(key);
-      read_options.prefix = &key_prefix;
-    }
-    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
-
-    perf_context.Reset();
-    StopWatchNano timer(Env::Default(), true);
-    uint64_t total_keys = 0;
-    for (iter->Seek(key); iter->Valid(); iter->Next()) {
-      if (FLAGS_trigger_deadlock) {
-        std::cout << "Behold the deadlock!\n";
-        db->Delete(write_options, iter->key());
-      }
-      auto test_key = SliceToTestKey(iter->key());
-      if (test_key->prefix != prefix) break;
-      total_keys++;
-    }
-    hist_seek_time.Add(timer.ElapsedNanos());
-    hist_seek_comparison.Add(perf_context.user_key_comparison_count);
-    ASSERT_EQ(total_keys, FLAGS_items_per_prefix);
-  }
+  while (NextOptions()) {
+    std::cout << "*** Mem table: " << options.memtable_factory->Name()
+              << std::endl;
+    DestroyDB(kDbName, Options());
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+
+    std::vector<uint64_t> prefixes;
+    for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+      prefixes.push_back(i);
+    }
+
+    if (FLAGS_random_prefix) {
+      std::random_shuffle(prefixes.begin(), prefixes.end());
+    }
+
+    HistogramImpl hist_put_time;
+    HistogramImpl hist_put_comparison;
+
+    // insert x random prefix, each with y continuous element.
+    for (auto prefix : prefixes) {
+      for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+        TestKey test_key(prefix, sorted);
+
+        Slice key = TestKeyToSlice(test_key);
+        std::string value(FLAGS_value_size, 0);
+
+        perf_context.Reset();
+        StopWatchNano timer(Env::Default(), true);
+        ASSERT_OK(db->Put(write_options, key, value));
+        hist_put_time.Add(timer.ElapsedNanos());
+        hist_put_comparison.Add(perf_context.user_key_comparison_count);
+      }
+    }
+
+    std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
+              << "Put time: \n" << hist_put_time.ToString();
+
+    // test seek existing keys
+    HistogramImpl hist_seek_time;
+    HistogramImpl hist_seek_comparison;
+
+    if (FLAGS_use_prefix_hash_memtable) {
+      read_options.prefix_seek = true;
+    }
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    for (auto prefix : prefixes) {
+      TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+      Slice key = TestKeyToSlice(test_key);
+      std::string value = "v" + std::to_string(0);
+
+      perf_context.Reset();
+      StopWatchNano timer(Env::Default(), true);
+      uint64_t total_keys = 0;
+      for (iter->Seek(key); iter->Valid(); iter->Next()) {
+        if (FLAGS_trigger_deadlock) {
+          std::cout << "Behold the deadlock!\n";
+          db->Delete(write_options, iter->key());
+        }
+        auto test_key = SliceToTestKey(iter->key());
+        if (test_key->prefix != prefix) break;
+        total_keys++;
+      }
+      hist_seek_time.Add(timer.ElapsedNanos());
+      hist_seek_comparison.Add(perf_context.user_key_comparison_count);
+      ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
+    }

     std::cout << "Seek key comparison: \n"
|
||||||
<< hist_seek_comparison.ToString()
|
<< hist_seek_comparison.ToString()
|
||||||
<< "Seek time: \n"
|
<< "Seek time: \n"
|
||||||
<< hist_seek_time.ToString();
|
<< hist_seek_time.ToString();
|
||||||
|
|
||||||
// test non-existing keys
|
// test non-existing keys
|
||||||
HistogramImpl hist_no_seek_time;
|
HistogramImpl hist_no_seek_time;
|
||||||
HistogramImpl hist_no_seek_comparison;
|
HistogramImpl hist_no_seek_comparison;
|
||||||
|
|
||||||
for (auto prefix = FLAGS_total_prefixes;
|
for (auto prefix = FLAGS_total_prefixes;
|
||||||
prefix < FLAGS_total_prefixes + 100;
|
prefix < FLAGS_total_prefixes + 10000;
|
||||||
prefix++) {
|
prefix++) {
|
||||||
TestKey test_key(prefix, 0);
|
TestKey test_key(prefix, 0);
|
||||||
Slice key = TestKeyToSlice(test_key);
|
Slice key = TestKeyToSlice(test_key);
|
||||||
|
|
||||||
if (FLAGS_use_prefix_hash_memtable) {
|
perf_context.Reset();
|
||||||
Slice key_prefix = options.prefix_extractor->Transform(key);
|
StopWatchNano timer(Env::Default(), true);
|
||||||
read_options.prefix = &key_prefix;
|
iter->Seek(key);
|
||||||
|
hist_no_seek_time.Add(timer.ElapsedNanos());
|
||||||
|
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||||
|
ASSERT_TRUE(!iter->Valid());
|
||||||
}
|
}
|
||||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
|
||||||
|
|
||||||
perf_context.Reset();
|
std::cout << "non-existing Seek key comparison: \n"
|
||||||
StopWatchNano timer(Env::Default(), true);
|
<< hist_no_seek_comparison.ToString()
|
||||||
iter->Seek(key);
|
<< "non-existing Seek time: \n"
|
||||||
hist_no_seek_time.Add(timer.ElapsedNanos());
|
<< hist_no_seek_time.ToString();
|
||||||
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
|
|
||||||
ASSERT_TRUE(!iter->Valid());
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::cout << "non-existing Seek key comparison: \n"
|
TEST(PrefixTest, PrefixHash) {
|
||||||
<< hist_no_seek_comparison.ToString()
|
while (NextOptions()) {
|
||||||
<< "non-existing Seek time: \n"
|
std::cout << "*** Mem table: " << options.memtable_factory->Name()
|
||||||
<< hist_no_seek_time.ToString();
|
<< std::endl;
|
||||||
|
DestroyDB(kDbName, Options());
|
||||||
|
auto db = OpenDb();
|
||||||
|
WriteOptions write_options;
|
||||||
|
ReadOptions read_options;
|
||||||
|
|
||||||
|
std::vector<uint64_t> prefixes;
|
||||||
|
for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
|
||||||
|
prefixes.push_back(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (FLAGS_random_prefix) {
|
||||||
|
std::random_shuffle(prefixes.begin(), prefixes.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
// insert x random prefix, each with y continuous element.
|
||||||
|
HistogramImpl hist_put_time;
|
||||||
|
HistogramImpl hist_put_comparison;
|
||||||
|
|
||||||
|
for (auto prefix : prefixes) {
|
||||||
|
for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
|
||||||
|
TestKey test_key(prefix, sorted);
|
||||||
|
|
||||||
|
Slice key = TestKeyToSlice(test_key);
|
||||||
|
std::string value = "v" + std::to_string(sorted);
|
||||||
|
|
||||||
|
perf_context.Reset();
|
||||||
|
StopWatchNano timer(Env::Default(), true);
|
||||||
|
ASSERT_OK(db->Put(write_options, key, value));
|
||||||
|
hist_put_time.Add(timer.ElapsedNanos());
|
||||||
|
hist_put_comparison.Add(perf_context.user_key_comparison_count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
|
||||||
|
<< "Put time: \n" << hist_put_time.ToString();
|
||||||
|
|
||||||
|
|
||||||
|
// test seek existing keys
|
||||||
|
HistogramImpl hist_seek_time;
|
||||||
|
HistogramImpl hist_seek_comparison;
|
||||||
|
|
||||||
|
for (auto prefix : prefixes) {
|
||||||
|
TestKey test_key(prefix, 0);
|
||||||
|
Slice key = TestKeyToSlice(test_key);
|
||||||
|
std::string value = "v" + std::to_string(0);
|
||||||
|
|
||||||
|
Slice key_prefix;
|
||||||
|
if (FLAGS_use_prefix_hash_memtable) {
|
||||||
|
key_prefix = options.prefix_extractor->Transform(key);
|
||||||
|
read_options.prefix = &key_prefix;
|
||||||
|
}
|
||||||
|
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||||
|
|
||||||
|
perf_context.Reset();
|
||||||
|
StopWatchNano timer(Env::Default(), true);
|
||||||
|
uint64_t total_keys = 0;
|
||||||
|
for (iter->Seek(key); iter->Valid(); iter->Next()) {
|
||||||
|
if (FLAGS_trigger_deadlock) {
|
||||||
|
std::cout << "Behold the deadlock!\n";
|
||||||
|
db->Delete(write_options, iter->key());
|
||||||
|
}
|
||||||
|
auto test_key = SliceToTestKey(iter->key());
|
||||||
|
if (test_key->prefix != prefix) break;
|
||||||
|
total_keys++;
|
||||||
|
}
|
||||||
|
hist_seek_time.Add(timer.ElapsedNanos());
|
||||||
|
hist_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||||
|
ASSERT_EQ(total_keys, FLAGS_items_per_prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Seek key comparison: \n"
|
||||||
|
<< hist_seek_comparison.ToString()
|
||||||
|
<< "Seek time: \n"
|
||||||
|
<< hist_seek_time.ToString();
|
||||||
|
|
||||||
|
// test non-existing keys
|
||||||
|
HistogramImpl hist_no_seek_time;
|
||||||
|
HistogramImpl hist_no_seek_comparison;
|
||||||
|
|
||||||
|
for (auto prefix = FLAGS_total_prefixes;
|
||||||
|
prefix < FLAGS_total_prefixes + 100;
|
||||||
|
prefix++) {
|
||||||
|
TestKey test_key(prefix, 0);
|
||||||
|
Slice key = TestKeyToSlice(test_key);
|
||||||
|
|
||||||
|
if (FLAGS_use_prefix_hash_memtable) {
|
||||||
|
Slice key_prefix = options.prefix_extractor->Transform(key);
|
||||||
|
read_options.prefix = &key_prefix;
|
||||||
|
}
|
||||||
|
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||||
|
|
||||||
|
perf_context.Reset();
|
||||||
|
StopWatchNano timer(Env::Default(), true);
|
||||||
|
iter->Seek(key);
|
||||||
|
hist_no_seek_time.Add(timer.ElapsedNanos());
|
||||||
|
hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
|
||||||
|
ASSERT_TRUE(!iter->Valid());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "non-existing Seek key comparison: \n"
|
||||||
|
<< hist_no_seek_comparison.ToString()
|
||||||
|
<< "non-existing Seek time: \n"
|
||||||
|
<< hist_no_seek_time.ToString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
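For readers skimming the test above: the prefix-seek behaviour it benchmarks amounts to enabling `ReadOptions::prefix_seek` (or supplying `ReadOptions::prefix`), seeking to a key with the desired prefix, and stopping once the iterator leaves that prefix. Below is a minimal usage sketch under those assumptions; it is not part of this commit, and the helper name `ScanPrefix` is invented. It assumes the database was opened with a `prefix_extractor` configured in its `Options`.

```cpp
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"

// Sketch only: iterate every key that shares `prefix`, the same access
// pattern PrefixTest times with its seek histograms.
void ScanPrefix(rocksdb::DB* db, const rocksdb::Slice& prefix) {
  rocksdb::ReadOptions ro;
  ro.prefix_seek = true;  // take the prefix-hash memtable/index path
  std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(ro));
  for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
    if (!iter->key().starts_with(prefix)) {
      break;  // left the prefix range
    }
    // consume iter->key() / iter->value() here
  }
}
```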
@ -231,10 +231,8 @@ class Repairer {
     FileMetaData meta;
     meta.number = next_file_number_++;
     Iterator* iter = mem->NewIterator();
-    status = BuildTable(dbname_, env_, options_, storage_options_,
-                        table_cache_, iter, &meta,
-                        icmp_.user_comparator(), 0, 0,
-                        kNoCompression);
+    status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
+                        iter, &meta, icmp_, 0, 0, kNoCompression);
     delete iter;
     delete mem->Unref();
     delete cf_mems_default;
@ -275,8 +273,9 @@ class Repairer {
     int counter = 0;
     Status status = env_->GetFileSize(fname, &t->meta.file_size);
     if (status.ok()) {
+      FileMetaData dummy_meta(t->meta.number, t->meta.file_size);
       Iterator* iter = table_cache_->NewIterator(
-          ReadOptions(), storage_options_, t->meta.number, t->meta.file_size);
+          ReadOptions(), storage_options_, icmp_, dummy_meta);
       bool empty = true;
       ParsedInternalKey parsed;
       t->min_sequence = 0;
@ -22,6 +22,8 @@
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/env.h"
 #include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_builder.h"
 #include "util/hash.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
@ -31,6 +33,7 @@

 using std::unique_ptr;

+// IS THIS FILE STILL NEEDED?
 namespace rocksdb {

 // SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
@ -84,15 +87,13 @@ public:

  Iterator* NewIterator(const ReadOptions&) override;

- Status Get(
-     const ReadOptions&, const Slice& key, void* arg,
-     bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
+ Status Get(const ReadOptions&, const Slice& key, void* arg,
+            bool (*handle_result)(void* arg, const ParsedInternalKey& k,
+                                  const Slice& v, bool),
      void (*mark_key_may_exist)(void*) = nullptr) override;

  uint64_t ApproximateOffsetOf(const Slice& key) override;

- bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
-
  void SetupForCompaction() override;

  TableProperties& GetTableProperties() override;
@ -244,7 +245,8 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
     return s;
   }

-  int compare_result = rep_->options.comparator->Compare(tmp_slice, target);
+  InternalKeyComparator ikc(rep_->options.comparator);
+  int compare_result = ikc.Compare(tmp_slice, target);

   if (compare_result < 0) {
     if (left == right) {
@ -279,14 +281,20 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
   return s;
 }

-Status SimpleTableReader::Get(
-    const ReadOptions& options, const Slice& k, void* arg,
-    bool (*saver)(void*, const Slice&, const Slice&, bool),
-    void (*mark_key_may_exist)(void*)) {
+Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
+                              void* arg,
+                              bool (*saver)(void*, const ParsedInternalKey&,
+                                            const Slice&, bool),
+                              void (*mark_key_may_exist)(void*)) {
   Status s;
   SimpleTableIterator* iter = new SimpleTableIterator(this);
   for (iter->Seek(k); iter->Valid(); iter->Next()) {
-    if (!(*saver)(arg, iter->key(), iter->value(), true)) {
+    ParsedInternalKey parsed_key;
+    if (!ParseInternalKey(iter->key(), &parsed_key)) {
+      return Status::Corruption(Slice());
+    }
+
+    if (!(*saver)(arg, parsed_key, iter->value(), true)) {
       break;
     }
   }
@ -295,11 +303,6 @@ Status SimpleTableReader::Get(
   return s;
 }

-bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options,
-                                        const Slice& key) {
-  return false;
-}
-
 uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
   return 0;
 }
@ -540,27 +543,30 @@ public:
   const char* Name() const override {
     return "SimpleTable";
   }
-  Status GetTableReader(const Options& options, const EnvOptions& soptions,
-                        unique_ptr<RandomAccessFile> && file,
-                        uint64_t file_size,
+  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_key,
+                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                         unique_ptr<TableReader>* table_reader) const;

-  TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+  TableBuilder* NewTableBuilder(const Options& options,
+                                const InternalKeyComparator& internal_key,
+                                WritableFile* file,
                                 CompressionType compression_type) const;
 };

-Status SimpleTableFactory::GetTableReader(
+Status SimpleTableFactory::NewTableReader(
     const Options& options, const EnvOptions& soptions,
-    unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+    const InternalKeyComparator& internal_key,
+    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
     unique_ptr<TableReader>* table_reader) const {

   return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
                                  table_reader);
 }

-TableBuilder* SimpleTableFactory::GetTableBuilder(
-    const Options& options, WritableFile* file,
-    CompressionType compression_type) const {
+TableBuilder* SimpleTableFactory::NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_key,
+    WritableFile* file, CompressionType compression_type) const {
   return new SimpleTableBuilder(options, file, compression_type);
 }

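The hunk above renames the factory entry points from `GetTableReader`/`GetTableBuilder` to `NewTableReader`/`NewTableBuilder` and threads an `InternalKeyComparator` through both. A shape-only sketch of what a custom factory looks like after this change; the signatures are copied from the hunk, while the class name, `override` markers, and includes are assumptions, and the method bodies are omitted.

```cpp
#include <memory>
#include "db/dbformat.h"   // InternalKeyComparator
#include "rocksdb/table.h" // TableFactory and friends

// Shape sketch only; bodies and error handling omitted.
class MyTableFactory : public rocksdb::TableFactory {
 public:
  const char* Name() const override { return "MyTable"; }

  rocksdb::Status NewTableReader(
      const rocksdb::Options& options, const rocksdb::EnvOptions& soptions,
      const rocksdb::InternalKeyComparator& internal_key,
      std::unique_ptr<rocksdb::RandomAccessFile>&& file, uint64_t file_size,
      std::unique_ptr<rocksdb::TableReader>* table_reader) const override;

  rocksdb::TableBuilder* NewTableBuilder(
      const rocksdb::Options& options,
      const rocksdb::InternalKeyComparator& internal_key,
      rocksdb::WritableFile* file,
      rocksdb::CompressionType compression_type) const override;
};
```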
@ -34,8 +34,8 @@
 #include <assert.h>
 #include <stdlib.h>
 #include "port/port.h"
+#include "util/arena.h"
 #include "util/random.h"
-#include "rocksdb/arena.h"

 namespace rocksdb {

@ -48,7 +48,8 @@ class SkipList {
  // Create a new SkipList object that will use "cmp" for comparing keys,
  // and will allocate memory using "*arena". Objects allocated in the arena
  // must remain allocated for the lifetime of the skiplist object.
- explicit SkipList(Comparator cmp, Arena* arena);
+ explicit SkipList(Comparator cmp, Arena* arena,
+                   int32_t max_height = 12, int32_t branching_factor = 4);

  // Insert key into the list.
  // REQUIRES: nothing that compares equal to key is currently in the list.
@ -102,7 +103,8 @@ class SkipList {
  };

 private:
- enum { kMaxHeight = 12 };
+ const int32_t kMaxHeight_;
+ const int32_t kBranching_;

  // Immutable after construction
  Comparator const compare_;
@ -115,8 +117,8 @@ class SkipList {
  port::AtomicPointer max_height_; // Height of the entire list

  // Used for optimizing sequential insert patterns
- Node* prev_[kMaxHeight];
- int prev_height_;
+ Node** prev_;
+ int32_t prev_height_;

  inline int GetMaxHeight() const {
    return static_cast<int>(
@ -258,13 +260,12 @@ inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
 template<typename Key, class Comparator>
 int SkipList<Key,Comparator>::RandomHeight() {
   // Increase height with probability 1 in kBranching
-  static const unsigned int kBranching = 4;
   int height = 1;
-  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+  while (height < kMaxHeight_ && ((rnd_.Next() % kBranching_) == 0)) {
     height++;
   }
   assert(height > 0);
-  assert(height <= kMaxHeight);
+  assert(height <= kMaxHeight_);
   return height;
 }

@ -354,14 +355,24 @@ typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
 }

 template<typename Key, class Comparator>
-SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
-    : compare_(cmp),
+SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena,
+                                   int32_t max_height,
+                                   int32_t branching_factor)
+    : kMaxHeight_(max_height),
+      kBranching_(branching_factor),
+      compare_(cmp),
       arena_(arena),
-      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      head_(NewNode(0 /* any key will do */, max_height)),
       max_height_(reinterpret_cast<void*>(1)),
       prev_height_(1),
       rnd_(0xdeadbeef) {
-  for (int i = 0; i < kMaxHeight; i++) {
+  assert(kMaxHeight_ > 0);
+  assert(kBranching_ > 0);
+  // Allocate the prev_ Node* array, directly from the passed-in arena.
+  // prev_ does not need to be freed, as its life cycle is tied up with
+  // the arena as a whole.
+  prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
+  for (int i = 0; i < kMaxHeight_; i++) {
     head_->SetNext(i, nullptr);
     prev_[i] = head_;
   }
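With the constructor change above, the skiplist's height limit and branching factor become per-instance parameters, and the `prev_` array is carved out of the same arena as the nodes. A hedged usage sketch, mirroring the `Key` and `TestComparator` types used by skiplist_test.cc; the helper name `BuildTunedSkipList` and the chosen parameter values are illustrative only.

```cpp
#include <cstdint>
#include "db/skiplist.h"
#include "util/arena.h"

typedef uint64_t Key;

// Same comparator shape as skiplist_test.cc uses.
struct TestComparator {
  int operator()(const Key& a, const Key& b) const {
    if (a < b) return -1;
    if (a > b) return +1;
    return 0;
  }
};

void BuildTunedSkipList() {
  rocksdb::Arena arena;  // owns every node and the prev_ array
  // Taller towers and wider branching than the old hard-coded
  // kMaxHeight = 12 / kBranching = 4; the two trailing arguments are the
  // new per-instance knobs.
  rocksdb::SkipList<Key, TestComparator> list(TestComparator(), &arena,
                                              /*max_height=*/16,
                                              /*branching_factor=*/8);
  for (Key k = 0; k < 100; k++) {
    list.Insert(k);  // keys must be unique, per the REQUIRES comment above
  }
}
```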
@ -10,7 +10,7 @@
 #include "db/skiplist.h"
 #include <set>
 #include "rocksdb/env.h"
-#include "util/arena_impl.h"
+#include "util/arena.h"
 #include "util/hash.h"
 #include "util/random.h"
 #include "util/testharness.h"
@ -34,9 +34,9 @@ struct TestComparator {
 class SkipTest { };

 TEST(SkipTest, Empty) {
-  ArenaImpl arena_impl;
+  Arena arena;
   TestComparator cmp;
-  SkipList<Key, TestComparator> list(cmp, &arena_impl);
+  SkipList<Key, TestComparator> list(cmp, &arena);
   ASSERT_TRUE(!list.Contains(10));

   SkipList<Key, TestComparator>::Iterator iter(&list);
@ -54,9 +54,9 @@ TEST(SkipTest, InsertAndLookup) {
   const int R = 5000;
   Random rnd(1000);
   std::set<Key> keys;
-  ArenaImpl arena_impl;
+  Arena arena;
   TestComparator cmp;
-  SkipList<Key, TestComparator> list(cmp, &arena_impl);
+  SkipList<Key, TestComparator> list(cmp, &arena);
   for (int i = 0; i < N; i++) {
     Key key = rnd.Next() % R;
     if (keys.insert(key).second) {
@ -209,14 +209,14 @@ class ConcurrentTest {
   // Current state of the test
   State current_;

-  ArenaImpl arena_impl_;
+  Arena arena_;

   // SkipList is not protected by mu_. We just use a single writer
   // thread to modify it.
   SkipList<Key, TestComparator> list_;

  public:
-  ConcurrentTest() : list_(TestComparator(), &arena_impl_) { }
+  ConcurrentTest() : list_(TestComparator(), &arena_) {}

   // REQUIRES: External synchronization
   void WriteStep(Random* rnd) {
@ -10,9 +10,10 @@
 #include "db/table_cache.h"

 #include "db/filename.h"
+#include "db/version_edit.h"

 #include "rocksdb/statistics.h"
-#include "rocksdb/table.h"
+#include "table/table_reader.h"
 #include "util/coding.h"
 #include "util/stop_watch.h"

@ -34,7 +35,6 @@ static Slice GetSliceForFileNumber(uint64_t* file_number) {
                sizeof(*file_number));
 }

-// TODO(icanadi) Options -> DBOptions
 TableCache::TableCache(const std::string& dbname, const Options* options,
                        const EnvOptions& storage_options, Cache* const cache)
     : env_(options->env),
@ -46,7 +46,16 @@ TableCache::TableCache(const std::string& dbname, const Options* options,
 TableCache::~TableCache() {
 }

+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+  return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+  cache_->Release(handle);
+}
+
 Status TableCache::FindTable(const EnvOptions& toptions,
+                             const InternalKeyComparator& internal_comparator,
                              uint64_t file_number, uint64_t file_size,
                              Cache::Handle** handle, bool* table_io,
                              const bool no_io) {
@ -70,8 +79,9 @@ Status TableCache::FindTable(const EnvOptions& toptions,
       file->Hint(RandomAccessFile::RANDOM);
     }
     StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
-    s = options_->table_factory->GetTableReader(
-        *options_, toptions, std::move(file), file_size, &table_reader);
+    s = options_->table_factory->NewTableReader(
+        *options_, toptions, internal_comparator, std::move(file), file_size,
+        &table_reader);
   }

   if (!s.ok()) {
@ -89,25 +99,28 @@ Status TableCache::FindTable(const EnvOptions& toptions,

 Iterator* TableCache::NewIterator(const ReadOptions& options,
                                   const EnvOptions& toptions,
-                                  uint64_t file_number,
-                                  uint64_t file_size,
+                                  const InternalKeyComparator& icomparator,
+                                  const FileMetaData& file_meta,
                                   TableReader** table_reader_ptr,
                                   bool for_compaction) {
   if (table_reader_ptr != nullptr) {
     *table_reader_ptr = nullptr;
   }
-  Cache::Handle* handle = nullptr;
-  Status s = FindTable(toptions, file_number, file_size, &handle,
-                       nullptr, options.read_tier == kBlockCacheTier);
+  Cache::Handle* handle = file_meta.table_reader_handle;
+  Status s;
+  if (!handle) {
+    s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size,
+                  &handle, nullptr, options.read_tier == kBlockCacheTier);
+  }
   if (!s.ok()) {
     return NewErrorIterator(s);
   }

-  TableReader* table_reader =
-      reinterpret_cast<TableReader*>(cache_->Value(handle));
+  TableReader* table_reader = GetTableReaderFromHandle(handle);
   Iterator* result = table_reader->NewIterator(options);
-  result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  if (!file_meta.table_reader_handle) {
+    result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  }
   if (table_reader_ptr != nullptr) {
     *table_reader_ptr = table_reader;
   }
@ -120,22 +133,24 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
 }

 Status TableCache::Get(const ReadOptions& options,
-                       uint64_t file_number,
-                       uint64_t file_size,
-                       const Slice& k,
-                       void* arg,
-                       bool (*saver)(void*, const Slice&, const Slice&, bool),
-                       bool* table_io,
-                       void (*mark_key_may_exist)(void*)) {
-  Cache::Handle* handle = nullptr;
-  Status s = FindTable(storage_options_, file_number, file_size,
-                       &handle, table_io,
-                       options.read_tier == kBlockCacheTier);
+                       const InternalKeyComparator& internal_comparator,
+                       const FileMetaData& file_meta, const Slice& k, void* arg,
+                       bool (*saver)(void*, const ParsedInternalKey&,
+                                     const Slice&, bool),
+                       bool* table_io, void (*mark_key_may_exist)(void*)) {
+  Cache::Handle* handle = file_meta.table_reader_handle;
+  Status s;
+  if (!handle) {
+    s = FindTable(storage_options_, internal_comparator, file_meta.number,
+                  file_meta.file_size, &handle, table_io,
+                  options.read_tier == kBlockCacheTier);
+  }
   if (s.ok()) {
-    TableReader* t =
-        reinterpret_cast<TableReader*>(cache_->Value(handle));
+    TableReader* t = GetTableReaderFromHandle(handle);
     s = t->Get(options, k, arg, saver, mark_key_may_exist);
-    cache_->Release(handle);
+    if (!file_meta.table_reader_handle) {
+      ReleaseHandle(handle);
+    }
   } else if (options.read_tier && s.IsIncomplete()) {
     // Couldnt find Table in cache but treat as kFound if no_io set
     (*mark_key_may_exist)(arg);
@ -145,19 +160,17 @@ Status TableCache::Get(const ReadOptions& options,
 }

 bool TableCache::PrefixMayMatch(const ReadOptions& options,
-                                uint64_t file_number,
-                                uint64_t file_size,
-                                const Slice& internal_prefix,
-                                bool* table_io) {
+                                const InternalKeyComparator& icomparator,
+                                uint64_t file_number, uint64_t file_size,
+                                const Slice& internal_prefix, bool* table_io) {
   Cache::Handle* handle = nullptr;
-  Status s = FindTable(storage_options_, file_number,
-                       file_size, &handle, table_io);
+  Status s = FindTable(storage_options_, icomparator, file_number, file_size,
+                       &handle, table_io);
   bool may_match = true;
   if (s.ok()) {
-    TableReader* t =
-        reinterpret_cast<TableReader*>(cache_->Value(handle));
+    TableReader* t = GetTableReaderFromHandle(handle);
     may_match = t->PrefixMayMatch(internal_prefix);
-    cache_->Release(handle);
+    ReleaseHandle(handle);
   }
   return may_match;
 }
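The TableCache changes above replace the raw (file number, file size) pair with a `FileMetaData`, so a caller that has already pinned a reader in `file_meta.table_reader_handle` skips the LRU lookup entirely, and the handle is only released when the lookup was done locally. A hedged sketch of a caller against the new `Get` signature; the `SaveValue` and `LookupInFile` names are invented for illustration and are not the actual RocksDB call sites.

```cpp
#include "db/table_cache.h"
#include "db/version_edit.h"  // FileMetaData

// Illustrative saver callback matching the new handle_result signature.
static bool SaveValue(void* arg, const rocksdb::ParsedInternalKey& parsed_key,
                      const rocksdb::Slice& value, bool did_io) {
  // Copy `value` out via `arg`; return false to stop scanning.
  (void)arg; (void)parsed_key; (void)value; (void)did_io;
  return false;
}

rocksdb::Status LookupInFile(rocksdb::TableCache* table_cache,
                             const rocksdb::ReadOptions& options,
                             const rocksdb::InternalKeyComparator& icmp,
                             const rocksdb::FileMetaData& file_meta,
                             const rocksdb::Slice& internal_key, void* arg) {
  bool table_io = false;
  // If file_meta.table_reader_handle is already set, TableCache::Get reuses
  // it and never touches the cache; otherwise it calls FindTable itself and
  // releases the handle when done.
  return table_cache->Get(options, icmp, file_meta, internal_key, arg,
                          &SaveValue, &table_io);
}
```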
@ -12,15 +12,18 @@
 #pragma once
 #include <string>
 #include <stdint.h>

 #include "db/dbformat.h"
-#include "rocksdb/env.h"
-#include "rocksdb/cache.h"
 #include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
 #include "rocksdb/table.h"
+#include "table/table_reader.h"

 namespace rocksdb {

 class Env;
+struct FileMetaData;

 class TableCache {
  public:
@ -35,10 +38,9 @@ class TableCache {
  // the returned iterator. The returned "*tableptr" object is owned by
  // the cache and should not be deleted, and is valid for as long as the
  // returned iterator is live.
- Iterator* NewIterator(const ReadOptions& options,
-                       const EnvOptions& toptions,
-                       uint64_t file_number,
-                       uint64_t file_size,
+ Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions,
+                       const InternalKeyComparator& internal_comparator,
+                       const FileMetaData& file_meta,
                        TableReader** table_reader_ptr = nullptr,
                        bool for_compaction = false);

@ -46,33 +48,40 @@ class TableCache {
  // call (*handle_result)(arg, found_key, found_value) repeatedly until
  // it returns false.
  Status Get(const ReadOptions& options,
-            uint64_t file_number,
-            uint64_t file_size,
-            const Slice& k,
-            void* arg,
-            bool (*handle_result)(void*, const Slice&, const Slice&, bool),
-            bool* table_io,
-            void (*mark_key_may_exist)(void*) = nullptr);
+            const InternalKeyComparator& internal_comparator,
+            const FileMetaData& file_meta, const Slice& k, void* arg,
+            bool (*handle_result)(void*, const ParsedInternalKey&,
+                                  const Slice&, bool),
+            bool* table_io, void (*mark_key_may_exist)(void*) = nullptr);

  // Determine whether the table may contain the specified prefix. If
- // the table index of blooms are not in memory, this may cause an I/O
- bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
-                     uint64_t file_size, const Slice& internal_prefix,
-                     bool* table_io);
+ // the table index or blooms are not in memory, this may cause an I/O
+ bool PrefixMayMatch(const ReadOptions& options,
+                     const InternalKeyComparator& internal_comparator,
+                     uint64_t file_number, uint64_t file_size,
+                     const Slice& internal_prefix, bool* table_io);

  // Evict any entry for the specified file number
  static void Evict(Cache* cache, uint64_t file_number);

+ // Find table reader
+ Status FindTable(const EnvOptions& toptions,
+                  const InternalKeyComparator& internal_comparator,
+                  uint64_t file_number, uint64_t file_size, Cache::Handle**,
+                  bool* table_io = nullptr, const bool no_io = false);
+
+ // Get TableReader from a cache handle.
+ TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+ // Release the handle from a cache
+ void ReleaseHandle(Cache::Handle* handle);
+
  private:
  Env* const env_;
  const std::string dbname_;
  const Options* options_;
  const EnvOptions& storage_options_;
  Cache* const cache_;

- Status FindTable(const EnvOptions& toptions, uint64_t file_number,
-                  uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
-                  const bool no_io = false);
 };

 } // namespace rocksdb
@ -10,87 +10,6 @@

 namespace rocksdb {

-namespace {
-  void AppendProperty(
-      std::string& props,
-      const std::string& key,
-      const std::string& value,
-      const std::string& prop_delim,
-      const std::string& kv_delim) {
-    props.append(key);
-    props.append(kv_delim);
-    props.append(value);
-    props.append(prop_delim);
-  }
-
-  template <class TValue>
-  void AppendProperty(
-      std::string& props,
-      const std::string& key,
-      const TValue& value,
-      const std::string& prop_delim,
-      const std::string& kv_delim) {
-    AppendProperty(
-        props, key, std::to_string(value), prop_delim, kv_delim
-    );
-  }
-}
-
-std::string TableProperties::ToString(
-    const std::string& prop_delim,
-    const std::string& kv_delim) const {
-  std::string result;
-  result.reserve(1024);
-
-  // Basic Info
-  AppendProperty(
-      result, "# data blocks", num_data_blocks, prop_delim, kv_delim
-  );
-  AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
-
-  AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
-  AppendProperty(
-      result,
-      "raw average key size",
-      num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0,
-      prop_delim,
-      kv_delim
-  );
-  AppendProperty(
-      result, "raw value size", raw_value_size, prop_delim, kv_delim
-  );
-  AppendProperty(
-      result,
-      "raw average value size",
-      num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0,
-      prop_delim,
-      kv_delim
-  );
-
-  AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
-  AppendProperty(result, "index block size", index_size, prop_delim, kv_delim);
-  AppendProperty(
-      result, "filter block size", filter_size, prop_delim, kv_delim
-  );
-  AppendProperty(
-      result,
-      "(estimated) table size",
-      data_size + index_size + filter_size,
-      prop_delim,
-      kv_delim
-  );
-
-  AppendProperty(
-      result,
-      "filter policy name",
-      filter_policy_name.empty() ? std::string("N/A") : filter_policy_name,
-      prop_delim,
-      kv_delim
-  );
-
-  return result;
-}
-
 Status InternalKeyPropertiesCollector::Add(
     const Slice& key, const Slice& value) {
   ParsedInternalKey ikey;
@ -106,7 +25,7 @@ Status InternalKeyPropertiesCollector::Add(
 }

 Status InternalKeyPropertiesCollector::Finish(
-    TableProperties::UserCollectedProperties* properties) {
+    UserCollectedProperties* properties) {
   assert(properties);
   assert(properties->find(
         InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end());
@ -118,7 +37,7 @@ Status InternalKeyPropertiesCollector::Finish(
   return Status::OK();
 }

-TableProperties::UserCollectedProperties
+UserCollectedProperties
 InternalKeyPropertiesCollector::GetReadableProperties() const {
   return {
     { "kDeletedKeys", std::to_string(deleted_keys_) }
@ -137,11 +56,11 @@ Status UserKeyTablePropertiesCollector::Add(
 }

 Status UserKeyTablePropertiesCollector::Finish(
-    TableProperties::UserCollectedProperties* properties) {
+    UserCollectedProperties* properties) {
   return collector_->Finish(properties);
 }

-TableProperties::UserCollectedProperties
+UserCollectedProperties
 UserKeyTablePropertiesCollector::GetReadableProperties() const {
   return collector_->GetReadableProperties();
 }
@ -151,7 +70,7 @@ const std::string InternalKeyTablePropertiesNames::kDeletedKeys
   = "rocksdb.deleted.keys";

 uint64_t GetDeletedKeys(
-    const TableProperties::UserCollectedProperties& props) {
+    const UserCollectedProperties& props) {
   auto pos = props.find(InternalKeyTablePropertiesNames::kDeletedKeys);
   if (pos == props.end()) {
     return 0;
@ -24,15 +24,13 @@ class InternalKeyPropertiesCollector : public TablePropertiesCollector {
  public:
   virtual Status Add(const Slice& key, const Slice& value) override;

-  virtual Status Finish(
-      TableProperties::UserCollectedProperties* properties) override;
+  virtual Status Finish(UserCollectedProperties* properties) override;

   virtual const char* Name() const override {
     return "InternalKeyPropertiesCollector";
   }

-  TableProperties::UserCollectedProperties
-  GetReadableProperties() const override;
+  UserCollectedProperties GetReadableProperties() const override;

  private:
   uint64_t deleted_keys_ = 0;
@ -61,13 +59,11 @@ class UserKeyTablePropertiesCollector : public TablePropertiesCollector {

   virtual Status Add(const Slice& key, const Slice& value) override;

-  virtual Status Finish(
-      TableProperties::UserCollectedProperties* properties) override;
+  virtual Status Finish(UserCollectedProperties* properties) override;

   virtual const char* Name() const override { return collector_->Name(); }

-  TableProperties::UserCollectedProperties
-  GetReadableProperties() const override;
+  UserCollectedProperties GetReadableProperties() const override;

  protected:
   std::shared_ptr<TablePropertiesCollector> collector_;
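The two collector files above boil down to one API change: collectors now take the free-standing `UserCollectedProperties` type instead of the nested `TableProperties::UserCollectedProperties`. A hedged sketch of a user-defined collector written against the new spelling; the class name `CountingCollector` and the property key `"rocksdb.my.count"` are invented for illustration.

```cpp
#include <string>
#include "rocksdb/table_properties.h"

// Sketch only: counts entries and publishes the count as a user property.
class CountingCollector : public rocksdb::TablePropertiesCollector {
 public:
  const char* Name() const override { return "CountingCollector"; }

  rocksdb::Status Add(const rocksdb::Slice& /*key*/,
                      const rocksdb::Slice& /*value*/) override {
    ++count_;
    return rocksdb::Status::OK();
  }

  rocksdb::Status Finish(rocksdb::UserCollectedProperties* properties) override {
    properties->insert({"rocksdb.my.count", std::to_string(count_)});
    return rocksdb::Status::OK();
  }

  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return {{"rocksdb.my.count", std::to_string(count_)}};
  }

 private:
  uint64_t count_ = 0;
};
```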
@ -7,12 +7,14 @@
 #include <memory>
 #include <string>

-#include "db/dbformat.h"
 #include "db/db_impl.h"
+#include "db/dbformat.h"
 #include "db/table_properties_collector.h"
-#include "rocksdb/table_properties.h"
 #include "rocksdb/table.h"
 #include "table/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain_table_factory.h"
+#include "table/table_builder.h"
 #include "util/coding.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@ -20,8 +22,6 @@
 namespace rocksdb {

 class TablePropertiesTest {
- private:
-  unique_ptr<TableReader> table_reader_;
 };

 // TODO(kailiu) the following classes should be moved to some more general
@ -83,30 +83,13 @@ class DumbLogger : public Logger {
 };

 // Utilities test functions
-void MakeBuilder(
-    const Options& options,
+void MakeBuilder(const Options& options,
+                 const InternalKeyComparator& internal_comparator,
     std::unique_ptr<FakeWritableFile>* writable,
     std::unique_ptr<TableBuilder>* builder) {
   writable->reset(new FakeWritableFile);
-  builder->reset(
-      options.table_factory->GetTableBuilder(options, writable->get(),
-                                             options.compression));
-}
-
-void OpenTable(
-    const Options& options,
-    const std::string& contents,
-    std::unique_ptr<TableReader>* table_reader) {
-
-  std::unique_ptr<RandomAccessFile> file(new FakeRandomeAccessFile(contents));
-  auto s = options.table_factory->GetTableReader(
-      options,
-      EnvOptions(),
-      std::move(file),
-      contents.size(),
-      table_reader
-  );
-  ASSERT_OK(s);
+  builder->reset(options.table_factory->NewTableBuilder(
+      options, internal_comparator, writable->get(), options.compression));
 }

 // Collects keys that starts with "A" in a table.
@ -114,10 +97,10 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
  public:
   const char* Name() const { return "RegularKeysStartWithA"; }

-  Status Finish(TableProperties::UserCollectedProperties* properties) {
+  Status Finish(UserCollectedProperties* properties) {
     std::string encoded;
     PutVarint32(&encoded, count_);
-    *properties = TableProperties::UserCollectedProperties {
+    *properties = UserCollectedProperties {
       { "TablePropertiesTest", "Rocksdb" },
       { "Count", encoded }
     };
@ -132,8 +115,7 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
     return Status::OK();
   }

-  virtual TableProperties::UserCollectedProperties
-  GetReadableProperties() const {
+  virtual UserCollectedProperties GetReadableProperties() const {
     return {};
   }

@ -142,23 +124,65 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
   uint32_t count_ = 0;
 };

-TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
-  Options options;
+extern uint64_t kBlockBasedTableMagicNumber;
+extern uint64_t kPlainTableMagicNumber;
+void TestCustomizedTablePropertiesCollector(
+    uint64_t magic_number, bool encode_as_internal, const Options& options,
+    const InternalKeyComparator& internal_comparator) {
   // make sure the entries will be inserted with order.
   std::map<std::string, std::string> kvs = {
-      {"About", "val5"},  // starts with 'A'
+      {"About ", "val5"},  // starts with 'A'
       {"Abstract", "val2"},  // starts with 'A'
-      {"Around", "val7"},  // starts with 'A'
-      {"Beyond", "val3"},
-      {"Builder", "val1"},
-      {"Cancel", "val4"},
-      {"Find", "val6"},
+      {"Around ", "val7"},  // starts with 'A'
+      {"Beyond ", "val3"},
+      {"Builder ", "val1"},
+      {"Cancel ", "val4"},
+      {"Find ", "val6"},
   };

+  // -- Step 1: build table
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<FakeWritableFile> writable;
+  MakeBuilder(options, internal_comparator, &writable, &builder);
+
+  for (const auto& kv : kvs) {
+    if (encode_as_internal) {
+      InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
+      builder->Add(ikey.Encode(), kv.second);
+    } else {
+      builder->Add(kv.first, kv.second);
+    }
+  }
+  ASSERT_OK(builder->Finish());
+
+  // -- Step 2: Read properties
+  FakeRandomeAccessFile readable(writable->contents());
+  TableProperties props;
+  Status s = ReadTableProperties(
+      &readable,
+      writable->contents().size(),
+      magic_number,
+      Env::Default(),
+      nullptr,
+      &props
+  );
+  ASSERT_OK(s);
+
+  auto user_collected = props.user_collected_properties;
+
+  ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+  uint32_t starts_with_A = 0;
+  Slice key(user_collected.at("Count"));
+  ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+  ASSERT_EQ(3u, starts_with_A);
+}
+
+TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
   // Test properties collectors with internal keys or regular keys
+  // for block based table
   for (bool encode_as_internal : { true, false }) {
-    // -- Step 1: build table
+    Options options;
     auto collector = new RegularKeysStartWithA();
     if (encode_as_internal) {
       options.table_properties_collectors = {
@ -168,95 +192,109 @@ TEST(TablePropertiesTest, CustomizedTablePropertiesCollector) {
       options.table_properties_collectors.resize(1);
       options.table_properties_collectors[0].reset(collector);
     }
-    std::unique_ptr<TableBuilder> builder;
-    std::unique_ptr<FakeWritableFile> writable;
-    MakeBuilder(options, &writable, &builder);
+    test::PlainInternalKeyComparator ikc(options.comparator);
+    TestCustomizedTablePropertiesCollector(kBlockBasedTableMagicNumber,
+                                           encode_as_internal, options, ikc);
+  }

-    for (const auto& kv : kvs) {
-      if (encode_as_internal) {
-        InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
-        builder->Add(ikey.Encode(), kv.second);
-      } else {
-        builder->Add(kv.first, kv.second);
-      }
-    }
-    ASSERT_OK(builder->Finish());
+  // test plain table
+  Options options;
+  options.table_properties_collectors.push_back(
+      std::make_shared<RegularKeysStartWithA>()
+  );
+  options.table_factory = std::make_shared<PlainTableFactory>(8, 8, 0);
+  test::PlainInternalKeyComparator ikc(options.comparator);
+  TestCustomizedTablePropertiesCollector(kPlainTableMagicNumber, true, options,
+                                         ikc);
+}

-    // -- Step 2: Open table
-    std::unique_ptr<TableReader> table_reader;
-    OpenTable(options, writable->contents(), &table_reader);
-    const auto& properties =
-        table_reader->GetTableProperties().user_collected_properties;
+void TestInternalKeyPropertiesCollector(
+    uint64_t magic_number,
+    bool sanitized,
+    std::shared_ptr<TableFactory> table_factory) {
+  InternalKey keys[] = {
+      InternalKey("A ", 0, ValueType::kTypeValue),
+      InternalKey("B ", 0, ValueType::kTypeValue),
+      InternalKey("C ", 0, ValueType::kTypeValue),
+      InternalKey("W ", 0, ValueType::kTypeDeletion),
+      InternalKey("X ", 0, ValueType::kTypeDeletion),
+      InternalKey("Y ", 0, ValueType::kTypeDeletion),
+      InternalKey("Z ", 0, ValueType::kTypeDeletion),
+  };

-    ASSERT_EQ("Rocksdb", properties.at("TablePropertiesTest"));
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<FakeWritableFile> writable;
+  Options options;
+  test::PlainInternalKeyComparator pikc(options.comparator);
+
+  options.table_factory = table_factory;
+  if (sanitized) {
+    options.table_properties_collectors = {
+      std::make_shared<RegularKeysStartWithA>()
+    };
+    // with sanitization, even regular properties collector will be able to
+    // handle internal keys.
+    auto comparator = options.comparator;
+    // HACK: Set options.info_log to avoid writing log in
+    // SanitizeOptions().
+    options.info_log = std::make_shared<DumbLogger>();
+    options = SanitizeOptions("db",            // just a place holder
+                              &pikc, nullptr,  // don't care filter policy
+                              options);
+    options.comparator = comparator;
+  } else {
+    options.table_properties_collectors = {
+      std::make_shared<InternalKeyPropertiesCollector>()
+    };
+  }
+
+  MakeBuilder(options, pikc, &writable, &builder);
+  for (const auto& k : keys) {
+    builder->Add(k.Encode(), "val");
+  }
+
+  ASSERT_OK(builder->Finish());
+
+  FakeRandomeAccessFile readable(writable->contents());
+  TableProperties props;
+  Status s = ReadTableProperties(
+      &readable,
+      writable->contents().size(),
+      magic_number,
+      Env::Default(),
+      nullptr,
+      &props
+  );
+  ASSERT_OK(s);
+
+  auto user_collected = props.user_collected_properties;
+  uint64_t deleted = GetDeletedKeys(user_collected);
+  ASSERT_EQ(4u, deleted);
+
+  if (sanitized) {
     uint32_t starts_with_A = 0;
-    Slice key(properties.at("Count"));
+    Slice key(user_collected.at("Count"));
     ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
-    ASSERT_EQ(3u, starts_with_A);
+    ASSERT_EQ(1u, starts_with_A);
   }
 }

 TEST(TablePropertiesTest, InternalKeyPropertiesCollector) {
-  InternalKey keys[] = {
-    InternalKey("A", 0, ValueType::kTypeValue),
-    InternalKey("B", 0, ValueType::kTypeValue),
-    InternalKey("C", 0, ValueType::kTypeValue),
-    InternalKey("W", 0, ValueType::kTypeDeletion),
-    InternalKey("X", 0, ValueType::kTypeDeletion),
-    InternalKey("Y", 0, ValueType::kTypeDeletion),
-    InternalKey("Z", 0, ValueType::kTypeDeletion),
-  };
-
-  for (bool sanitized : { false, true }) {
-    std::unique_ptr<TableBuilder> builder;
-    std::unique_ptr<FakeWritableFile> writable;
-    Options options;
-    if (sanitized) {
-      options.table_properties_collectors = {
-        std::make_shared<RegularKeysStartWithA>()
-      };
-      // with sanitization, even regular properties collector will be able to
-      // handle internal keys.
-      auto comparator = options.comparator;
-      // HACK: Set options.info_log to avoid writing log in
-      // SanitizeOptions().
-      options.info_log = std::make_shared<DumbLogger>();
-      options = SanitizeOptions(
-          "db",  // just a place holder
-          nullptr,  // with skip internal key comparator
-          nullptr,  // don't care filter policy
-          options
-      );
-      options.comparator = comparator;
-    } else {
-      options.table_properties_collectors = {
-        std::make_shared<InternalKeyPropertiesCollector>()
-      };
-    }
-
-    MakeBuilder(options, &writable, &builder);
-    for (const auto& k : keys) {
-      builder->Add(k.Encode(), "val");
-    }
-
-    ASSERT_OK(builder->Finish());
-
-    std::unique_ptr<TableReader> table_reader;
-    OpenTable(options, writable->contents(), &table_reader);
-    const auto& properties =
-        table_reader->GetTableProperties().user_collected_properties;
-
-    uint64_t deleted = GetDeletedKeys(properties);
-    ASSERT_EQ(4u, deleted);
-
-    if (sanitized) {
-      uint32_t starts_with_A = 0;
-      Slice key(properties.at("Count"));
-      ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
-      ASSERT_EQ(1u, starts_with_A);
-    }
-  }
+  TestInternalKeyPropertiesCollector(
+      kBlockBasedTableMagicNumber,
+      true /* sanitize */,
+      std::make_shared<BlockBasedTableFactory>()
+  );
+  TestInternalKeyPropertiesCollector(
+      kBlockBasedTableMagicNumber,
+      true /* not sanitize */,
+      std::make_shared<BlockBasedTableFactory>()
  );
+  TestInternalKeyPropertiesCollector(
+      kPlainTableMagicNumber,
+      false /* not sanitize */,
+      std::make_shared<PlainTableFactory>(8, 8, 0)
+  );
 }

 } // namespace rocksdb
|
@ -78,12 +78,10 @@ void VersionEdit::EncodeTo(std::string* dst) const {
     PutVarint64(dst, last_sequence_);
   }
 
-  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
-       iter != deleted_files_.end();
-       ++iter) {
+  for (const auto& deleted : deleted_files_) {
     PutVarint32(dst, kDeletedFile);
-    PutVarint32(dst, iter->first);   // level
-    PutVarint64(dst, iter->second);  // file number
+    PutVarint32(dst, deleted.first /* level */);
+    PutVarint64(dst, deleted.second /* file number */);
   }
 
   for (size_t i = 0; i < new_files_.size(); i++) {
@ -12,6 +12,7 @@
 #include <utility>
 #include <vector>
 #include <string>
+#include "rocksdb/cache.h"
 #include "db/dbformat.h"
 
 namespace rocksdb {
@ -29,8 +30,17 @@ struct FileMetaData {
   SequenceNumber smallest_seqno;// The smallest seqno in this file
   SequenceNumber largest_seqno; // The largest seqno in this file
 
-  FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0),
-                   being_compacted(false) {}
+  // Needs to be disposed when refs becomes 0.
+  Cache::Handle* table_reader_handle;
+
+  FileMetaData(uint64_t number, uint64_t file_size)
+      : refs(0),
+        allowed_seeks(1 << 30),
+        number(number),
+        file_size(file_size),
+        being_compacted(false),
+        table_reader_handle(nullptr) {}
+  FileMetaData() : FileMetaData(0, 0) {}
 };
 
 class VersionEdit {
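The FileMetaData change above introduces a C++11 delegating constructor so the default constructor and the (number, file_size) constructor share one member-initializer list, and it adds a cache handle that must be released before the struct is discarded. A minimal standalone sketch of the same pattern, using a simplified stand-in struct rather than the real RocksDB type:

    #include <cassert>
    #include <cstdint>

    struct FileInfoSketch {
      int refs;
      uint64_t number;
      uint64_t file_size;
      void* table_reader_handle;  // stand-in for Cache::Handle*, released elsewhere

      FileInfoSketch(uint64_t n, uint64_t size)
          : refs(0), number(n), file_size(size), table_reader_handle(nullptr) {}
      // Delegating constructor: the default case funnels through the one above,
      // so new members only need to be initialized in a single place.
      FileInfoSketch() : FileInfoSketch(0, 0) {}
    };

    int main() {
      FileInfoSketch f;  // equivalent to FileInfoSketch(0, 0)
      assert(f.table_reader_handle == nullptr);
      return 0;
    }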
@ -70,6 +80,7 @@ class VersionEdit {
                const InternalKey& largest,
                const SequenceNumber& smallest_seqno,
                const SequenceNumber& largest_seqno) {
+    assert(smallest_seqno <= largest_seqno);
     FileMetaData f;
     f.number = file;
     f.file_size = file_size;
@ -77,13 +88,12 @@ class VersionEdit {
     f.largest = largest;
     f.smallest_seqno = smallest_seqno;
     f.largest_seqno = largest_seqno;
-    assert(smallest_seqno <= largest_seqno);
     new_files_.push_back(std::make_pair(level, f));
   }
 
   // Delete the specified "file" from the specified "level".
   void DeleteFile(int level, uint64_t file) {
-    deleted_files_.insert(std::make_pair(level, file));
+    deleted_files_.insert({level, file});
   }
 
   // Number of edits
@ -120,7 +130,7 @@ class VersionEdit {
  private:
   friend class VersionSet;
 
-  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+  typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
 
   bool GetLevel(Slice* input, int* level, const char** msg);
 
@ -14,6 +14,7 @@
 #include <set>
 #include <climits>
 #include <stdio.h>
 
 #include "db/filename.h"
 #include "db/log_reader.h"
 #include "db/log_writer.h"
@ -23,7 +24,7 @@
 #include "db/compaction.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
-#include "rocksdb/table.h"
+#include "table/table_reader.h"
 #include "table/merger.h"
 #include "table/two_level_iterator.h"
 #include "util/coding.h"
@ -54,6 +55,10 @@ Version::~Version() {
       assert(f->refs > 0);
       f->refs--;
       if (f->refs <= 0) {
+        if (f->table_reader_handle) {
+          cfd_->table_cache()->ReleaseHandle(f->table_reader_handle);
+          f->table_reader_handle = nullptr;
+        }
         vset_->obsolete_files_.push_back(f);
       }
     }
@ -188,11 +193,10 @@ class Version::LevelFileNumIterator : public Iterator {
   mutable char value_buf_[16];
 };
 
-static Iterator* GetFileIterator(void* arg,
-                                 const ReadOptions& options,
+static Iterator* GetFileIterator(void* arg, const ReadOptions& options,
                                  const EnvOptions& soptions,
-                                 const Slice& file_value,
-                                 bool for_compaction) {
+                                 const InternalKeyComparator& icomparator,
+                                 const Slice& file_value, bool for_compaction) {
   TableCache* cache = reinterpret_cast<TableCache*>(arg);
   if (file_value.size() != 16) {
     return NewErrorIterator(
@ -205,12 +209,11 @@ static Iterator* GetFileIterator(void* arg,
       options_copy = options;
       options_copy.prefix = nullptr;
     }
-    return cache->NewIterator(options.prefix ? options_copy : options,
-                              soptions,
-                              DecodeFixed64(file_value.data()),
-                              DecodeFixed64(file_value.data() + 8),
-                              nullptr /* don't need reference to table*/,
-                              for_compaction);
+    FileMetaData meta(DecodeFixed64(file_value.data()),
+                      DecodeFixed64(file_value.data() + 8));
+    return cache->NewIterator(
+        options.prefix ? options_copy : options, soptions, icomparator, meta,
+        nullptr /* don't need reference to table*/, for_compaction);
   }
 }
 
@ -230,7 +233,8 @@ bool Version::PrefixMayMatch(const ReadOptions& options,
     may_match = true;
   } else {
     may_match = cfd_->table_cache()->PrefixMayMatch(
-        options, DecodeFixed64(level_iter->value().data()),
+        options, cfd_->internal_comparator(),
+        DecodeFixed64(level_iter->value().data()),
         DecodeFixed64(level_iter->value().data() + 8), internal_prefix,
         nullptr);
   }
@ -252,7 +256,7 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
     }
   }
   return NewTwoLevelIterator(level_iter, &GetFileIterator, cfd_->table_cache(),
-                             options, soptions);
+                             options, soptions, cfd_->internal_comparator());
 }
 
 void Version::AddIterators(const ReadOptions& options,
@ -261,7 +265,7 @@ void Version::AddIterators(const ReadOptions& options,
   // Merge all level zero files together since they may overlap
   for (const FileMetaData* file : files_[0]) {
     iters->push_back(cfd_->table_cache()->NewIterator(
-        options, soptions, file->number, file->file_size));
+        options, soptions, cfd_->internal_comparator(), *file));
   }
 
   // For levels > 0, we can use a concatenating iterator that sequentially
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){
|
static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
|
||||||
|
const Slice& v, bool didIO) {
|
||||||
Saver* s = reinterpret_cast<Saver*>(arg);
|
Saver* s = reinterpret_cast<Saver*>(arg);
|
||||||
MergeContext* merge_contex = s->merge_context;
|
MergeContext* merge_contex = s->merge_context;
|
||||||
std::string merge_result; // temporary area for merge results later
|
std::string merge_result; // temporary area for merge results later
|
||||||
|
|
||||||
assert(s != nullptr && merge_contex != nullptr);
|
assert(s != nullptr && merge_contex != nullptr);
|
||||||
|
|
||||||
ParsedInternalKey parsed_key;
|
|
||||||
// TODO: didIO and Merge?
|
// TODO: didIO and Merge?
|
||||||
s->didIO = didIO;
|
s->didIO = didIO;
|
||||||
if (!ParseInternalKey(ikey, &parsed_key)) {
|
if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
|
||||||
// TODO: what about corrupt during Merge?
|
// Key matches. Process it
|
||||||
s->state = kCorrupt;
|
switch (parsed_key.type) {
|
||||||
} else {
|
case kTypeValue:
|
||||||
if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
|
if (kNotFound == s->state) {
|
||||||
// Key matches. Process it
|
s->state = kFound;
|
||||||
switch (parsed_key.type) {
|
s->value->assign(v.data(), v.size());
|
||||||
case kTypeValue:
|
} else if (kMerge == s->state) {
|
||||||
if (kNotFound == s->state) {
|
assert(s->merge_operator != nullptr);
|
||||||
s->state = kFound;
|
s->state = kFound;
|
||||||
s->value->assign(v.data(), v.size());
|
if (!s->merge_operator->FullMerge(s->user_key, &v,
|
||||||
} else if (kMerge == s->state) {
|
merge_contex->GetOperands(),
|
||||||
assert(s->merge_operator != nullptr);
|
s->value, s->logger)) {
|
||||||
s->state = kFound;
|
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
|
||||||
if (!s->merge_operator->FullMerge(s->user_key, &v,
|
s->state = kCorrupt;
|
||||||
merge_contex->GetOperands(),
|
|
||||||
s->value, s->logger)) {
|
|
||||||
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
|
|
||||||
s->state = kCorrupt;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
assert(false);
|
|
||||||
}
|
}
|
||||||
return false;
|
} else {
|
||||||
|
assert(false);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
|
||||||
case kTypeDeletion:
|
case kTypeDeletion:
|
||||||
if (kNotFound == s->state) {
|
if (kNotFound == s->state) {
|
||||||
s->state = kDeleted;
|
s->state = kDeleted;
|
||||||
} else if (kMerge == s->state) {
|
} else if (kMerge == s->state) {
|
||||||
s->state = kFound;
|
s->state = kFound;
|
||||||
if (!s->merge_operator->FullMerge(s->user_key, nullptr,
|
if (!s->merge_operator->FullMerge(s->user_key, nullptr,
|
||||||
merge_contex->GetOperands(),
|
merge_contex->GetOperands(),
|
||||||
s->value, s->logger)) {
|
s->value, s->logger)) {
|
||||||
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
|
RecordTick(s->statistics, NUMBER_MERGE_FAILURES);
|
||||||
s->state = kCorrupt;
|
s->state = kCorrupt;
|
||||||
}
|
|
||||||
} else {
|
|
||||||
assert(false);
|
|
||||||
}
|
}
|
||||||
return false;
|
} else {
|
||||||
|
|
||||||
case kTypeMerge:
|
|
||||||
assert(s->state == kNotFound || s->state == kMerge);
|
|
||||||
s->state = kMerge;
|
|
||||||
merge_contex->PushOperand(v);
|
|
||||||
while (merge_contex->GetNumOperands() >= 2) {
|
|
||||||
// Attempt to merge operands together via user associateive merge
|
|
||||||
if (s->merge_operator->PartialMerge(s->user_key,
|
|
||||||
merge_contex->GetOperand(0),
|
|
||||||
merge_contex->GetOperand(1),
|
|
||||||
&merge_result,
|
|
||||||
s->logger)) {
|
|
||||||
merge_contex->PushPartialMergeResult(merge_result);
|
|
||||||
} else {
|
|
||||||
// Associative merge returns false ==> stack the operands
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
|
|
||||||
case kTypeColumnFamilyDeletion:
|
|
||||||
case kTypeColumnFamilyValue:
|
|
||||||
case kTypeColumnFamilyMerge:
|
|
||||||
case kTypeLogData:
|
|
||||||
assert(false);
|
assert(false);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
|
||||||
|
case kTypeMerge:
|
||||||
|
assert(s->state == kNotFound || s->state == kMerge);
|
||||||
|
s->state = kMerge;
|
||||||
|
merge_contex->PushOperand(v);
|
||||||
|
while (merge_contex->GetNumOperands() >= 2) {
|
||||||
|
// Attempt to merge operands together via user associateive merge
|
||||||
|
if (s->merge_operator->PartialMerge(
|
||||||
|
s->user_key, merge_contex->GetOperand(0),
|
||||||
|
merge_contex->GetOperand(1), &merge_result, s->logger)) {
|
||||||
|
merge_contex->PushPartialMergeResult(merge_result);
|
||||||
|
} else {
|
||||||
|
// Associative merge returns false ==> stack the operands
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
|
|
||||||
|
default:
|
||||||
|
assert(false);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
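In the kTypeMerge branch above, SaveValue stacks merge operands and keeps collapsing the two most recent ones while the user-supplied PartialMerge succeeds. A self-contained sketch of that collapsing loop, with std::string operands and a caller-supplied combine function standing in for the merge operator (illustrative of the control flow only, not the RocksDB API):

    #include <functional>
    #include <string>
    #include <vector>

    using PartialMergeFn =
        std::function<bool(const std::string&, const std::string&, std::string*)>;

    // Push a new operand; while the two newest operands can be combined,
    // replace them with their combination, otherwise leave them stacked.
    void PushOperand(std::vector<std::string>* operands, std::string v,
                     const PartialMergeFn& partial_merge) {
      operands->push_back(std::move(v));
      while (operands->size() >= 2) {
        std::string merged;
        const std::string& older = (*operands)[operands->size() - 2];
        const std::string& newer = operands->back();
        if (!partial_merge(older, newer, &merged)) {
          break;  // cannot combine: keep both operands for FullMerge later
        }
        operands->pop_back();
        operands->back() = std::move(merged);
      }
    }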
@ -524,8 +518,8 @@ void Version::Get(const ReadOptions& options,
     prev_file = f;
 #endif
     bool tableIO = false;
-    *status = cfd_->table_cache()->Get(options, f->number, f->file_size, ikey,
-                                       &saver, SaveValue, &tableIO,
+    *status = cfd_->table_cache()->Get(options, cfd_->internal_comparator(),
+                                       *f, ikey, &saver, SaveValue, &tableIO,
                                        MarkKeyMayExist);
     // TODO: examine the behavior for corrupted key
     if (!status->ok()) {
@ -707,7 +701,7 @@ bool CompareSeqnoDescending(const Version::Fsize& first,
   return false;
 }
 
 }  // anonymous namespace
 
 void Version::UpdateFilesBySize() {
   // No need to sort the highest level because it is never compacted.
@ -756,12 +750,14 @@ void Version::Ref() {
   ++refs_;
 }
 
-void Version::Unref() {
+bool Version::Unref() {
   assert(refs_ >= 1);
   --refs_;
   if (refs_ == 0) {
     delete this;
+    return true;
   }
+  return false;
 }
 
 bool Version::NeedsCompaction() const {
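Version::Unref() now reports whether this call released the last reference and therefore deleted the object. A hedged sketch of how a call site might use the new return value (hypothetical caller; the real call sites are in the DB implementation):

    // `v` is a Version* that this caller previously Ref()'d.
    // If Unref() returns true the Version has been deleted and must not be
    // touched again; clearing the pointer guards against accidental reuse.
    if (v->Unref()) {
      v = nullptr;
    }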
@ -1200,10 +1196,15 @@ class VersionSet::Builder {
         FileMetaData* f = to_unref[i];
         f->refs--;
         if (f->refs <= 0) {
+          if (f->table_reader_handle) {
+            cfd_->table_cache()->ReleaseHandle(f->table_reader_handle);
+            f->table_reader_handle = nullptr;
+          }
           delete f;
         }
       }
     }
 
     delete[] levels_;
     base_->Unref();
   }
@ -1280,19 +1281,17 @@ class VersionSet::Builder {
 
     // Delete files
     const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
-    for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
-         iter != del.end();
-         ++iter) {
-      const int level = iter->first;
-      const uint64_t number = iter->second;
+    for (const auto& del_file : del) {
+      const auto level = del_file.first;
+      const auto number = del_file.second;
       levels_[level].deleted_files.insert(number);
       CheckConsistencyForDeletes(edit, number, level);
     }
 
     // Add new files
-    for (size_t i = 0; i < edit->new_files_.size(); i++) {
-      const int level = edit->new_files_[i].first;
-      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+    for (const auto& new_file : edit->new_files_) {
+      const int level = new_file.first;
+      FileMetaData* f = new FileMetaData(new_file.second);
       f->refs = 1;
 
       // We arrange to automatically compact this file after
@ -1325,23 +1324,21 @@ class VersionSet::Builder {
     for (int level = 0; level < base_->NumberLevels(); level++) {
       // Merge the set of added files with the set of pre-existing files.
       // Drop any deleted files. Store the result in *v.
-      const std::vector<FileMetaData*>& base_files = base_->files_[level];
-      std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
-      std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
-      const FileSet* added = levels_[level].added_files;
-      v->files_[level].reserve(base_files.size() + added->size());
-      for (FileSet::const_iterator added_iter = added->begin();
-           added_iter != added->end();
-           ++added_iter) {
+      const auto& base_files = base_->files_[level];
+      auto base_iter = base_files.begin();
+      auto base_end = base_files.end();
+      const auto& added_files = *levels_[level].added_files;
+      v->files_[level].reserve(base_files.size() + added_files.size());
+      for (const auto& added : added_files) {
         // Add all smaller files listed in base_
-        for (std::vector<FileMetaData*>::const_iterator bpos
-                 = std::upper_bound(base_iter, base_end, *added_iter, cmp);
+        for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp);
              base_iter != bpos;
              ++base_iter) {
           MaybeAddFile(v, level, *base_iter);
         }
 
-        MaybeAddFile(v, level, *added_iter);
+        MaybeAddFile(v, level, added);
       }
 
       // Add remaining base files
@ -1353,11 +1350,24 @@ class VersionSet::Builder {
     CheckConsistency(v);
   }
 
+  void LoadTableHandlers() {
+    for (int level = 0; level < cfd_->NumberLevels(); level++) {
+      for (auto& file_meta : *(levels_[level].added_files)) {
+        assert (!file_meta->table_reader_handle);
+        bool table_io;
+        cfd_->table_cache()->FindTable(
+            base_->vset_->storage_options_, cfd_->internal_comparator(),
+            file_meta->number, file_meta->file_size,
+            &file_meta->table_reader_handle, &table_io, false);
+      }
+    }
+  }
+
   void MaybeAddFile(Version* v, int level, FileMetaData* f) {
     if (levels_[level].deleted_files.count(f->number) > 0) {
       // File is deleted: do nothing
     } else {
-      std::vector<FileMetaData*>* files = &v->files_[level];
+      auto* files = &v->files_[level];
       if (level > 0 && !files->empty()) {
         // Must not overlap
         assert(cfd_->internal_comparator().Compare(
|
|||||||
ManifestWriter* last_writer = &w;
|
ManifestWriter* last_writer = &w;
|
||||||
assert(!manifest_writers_.empty());
|
assert(!manifest_writers_.empty());
|
||||||
assert(manifest_writers_.front() == &w);
|
assert(manifest_writers_.front() == &w);
|
||||||
std::deque<ManifestWriter*>::iterator iter = manifest_writers_.begin();
|
for (const auto& writer : manifest_writers_) {
|
||||||
for (; iter != manifest_writers_.end(); ++iter) {
|
if (writer->cfd->GetID() != column_family_data->GetID()) {
|
||||||
if ((*iter)->cfd->GetID() != column_family_data->GetID()) {
|
|
||||||
// group commits across column families are not yet supported
|
// group commits across column families are not yet supported
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
last_writer = *iter;
|
last_writer = writer;
|
||||||
LogAndApplyHelper(column_family_data, &builder, v, last_writer->edit, mu);
|
LogAndApplyHelper(column_family_data, &builder, v, last_writer->edit, mu);
|
||||||
batch_edits.push_back(last_writer->edit);
|
batch_edits.push_back(last_writer->edit);
|
||||||
}
|
}
|
||||||
@ -1456,7 +1465,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
|||||||
|
|
||||||
// Initialize new descriptor log file if necessary by creating
|
// Initialize new descriptor log file if necessary by creating
|
||||||
// a temporary file that contains a snapshot of the current version.
|
// a temporary file that contains a snapshot of the current version.
|
||||||
std::string new_manifest_file;
|
std::string new_manifest_filename;
|
||||||
uint64_t new_manifest_file_size = 0;
|
uint64_t new_manifest_file_size = 0;
|
||||||
Status s;
|
Status s;
|
||||||
// we will need this if we are creating new manifest
|
// we will need this if we are creating new manifest
|
||||||
@ -1470,11 +1479,11 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (new_descriptor_log) {
|
if (new_descriptor_log) {
|
||||||
new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
|
new_manifest_filename = DescriptorFileName(dbname_, manifest_file_number_);
|
||||||
edit->SetNextFile(next_file_number_);
|
edit->SetNextFile(next_file_number_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unlock during expensive MANIFEST log write. New writes cannot get here
|
// Unlock during expensive operations. New writes cannot get here
|
||||||
// because &w is ensuring that all new writes get queued.
|
// because &w is ensuring that all new writes get queued.
|
||||||
{
|
{
|
||||||
// calculate the amount of data being compacted at every level
|
// calculate the amount of data being compacted at every level
|
||||||
@ -1484,11 +1493,18 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
|||||||
|
|
||||||
mu->Unlock();
|
mu->Unlock();
|
||||||
|
|
||||||
|
if (options_->max_open_files == -1) {
|
||||||
|
// unlimited table cache. Pre-load table handle now.
|
||||||
|
// Need to do it out of the mutex.
|
||||||
|
builder.LoadTableHandlers();
|
||||||
|
}
|
||||||
|
|
||||||
// This is fine because everything inside of this block is serialized --
|
// This is fine because everything inside of this block is serialized --
|
||||||
// only one thread can be here at the same time
|
// only one thread can be here at the same time
|
||||||
if (!new_manifest_file.empty()) {
|
if (!new_manifest_filename.empty()) {
|
||||||
unique_ptr<WritableFile> descriptor_file;
|
unique_ptr<WritableFile> descriptor_file;
|
||||||
s = env_->NewWritableFile(new_manifest_file, &descriptor_file,
|
s = env_->NewWritableFile(new_manifest_filename,
|
||||||
|
&descriptor_file,
|
||||||
storage_options_);
|
storage_options_);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
|
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
|
||||||
@ -1536,7 +1552,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
|||||||
|
|
||||||
// If we just created a new descriptor file, install it by writing a
|
// If we just created a new descriptor file, install it by writing a
|
||||||
// new CURRENT file that points to it.
|
// new CURRENT file that points to it.
|
||||||
if (s.ok() && !new_manifest_file.empty()) {
|
if (s.ok() && !new_manifest_filename.empty()) {
|
||||||
s = SetCurrentFile(env_, dbname_, manifest_file_number_);
|
s = SetCurrentFile(env_, dbname_, manifest_file_number_);
|
||||||
if (s.ok() && old_manifest_file_number < manifest_file_number_) {
|
if (s.ok() && old_manifest_file_number < manifest_file_number_) {
|
||||||
// delete old manifest file
|
// delete old manifest file
|
||||||
@ -1573,9 +1589,9 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
|||||||
Log(options_->info_log, "Error in committing version %lu",
|
Log(options_->info_log, "Error in committing version %lu",
|
||||||
(unsigned long)v->GetVersionNumber());
|
(unsigned long)v->GetVersionNumber());
|
||||||
delete v;
|
delete v;
|
||||||
if (!new_manifest_file.empty()) {
|
if (!new_manifest_filename.empty()) {
|
||||||
descriptor_log_.reset();
|
descriptor_log_.reset();
|
||||||
env_->DeleteFile(new_manifest_file);
|
env_->DeleteFile(new_manifest_filename);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1631,27 +1647,33 @@ Status VersionSet::Recover(
|
|||||||
std::set<int> column_families_not_found;
|
std::set<int> column_families_not_found;
|
||||||
|
|
||||||
// Read "CURRENT" file, which contains a pointer to the current manifest file
|
// Read "CURRENT" file, which contains a pointer to the current manifest file
|
||||||
std::string current;
|
std::string manifest_filename;
|
||||||
Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t);
|
Status s = ReadFileToString(
|
||||||
|
env_, CurrentFileName(dbname_), &manifest_filename
|
||||||
|
);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
if (current.empty() || current[current.size()-1] != '\n') {
|
if (manifest_filename.empty() ||
|
||||||
|
manifest_filename.back() != '\n') {
|
||||||
return Status::Corruption("CURRENT file does not end with newline");
|
return Status::Corruption("CURRENT file does not end with newline");
|
||||||
}
|
}
|
||||||
current.resize(current.size() - 1);
|
// remove the trailing '\n'
|
||||||
|
manifest_filename.resize(manifest_filename.size() - 1);
|
||||||
|
|
||||||
Log(options_->info_log, "Recovering from manifest file:%s\n",
|
Log(options_->info_log, "Recovering from manifest file:%s\n",
|
||||||
current.c_str());
|
manifest_filename.c_str());
|
||||||
|
|
||||||
std::string dscname = dbname_ + "/" + current;
|
manifest_filename = dbname_ + "/" + manifest_filename;
|
||||||
unique_ptr<SequentialFile> file;
|
unique_ptr<SequentialFile> manifest_file;
|
||||||
s = env_->NewSequentialFile(dscname, &file, storage_options_);
|
s = env_->NewSequentialFile(
|
||||||
|
manifest_filename, &manifest_file, storage_options_
|
||||||
|
);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
uint64_t manifest_file_size;
|
uint64_t manifest_file_size;
|
||||||
s = env_->GetFileSize(dscname, &manifest_file_size);
|
s = env_->GetFileSize(manifest_filename, &manifest_file_size);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@ -1682,8 +1704,8 @@ Status VersionSet::Recover(
|
|||||||
{
|
{
|
||||||
VersionSet::LogReporter reporter;
|
VersionSet::LogReporter reporter;
|
||||||
reporter.status = &s;
|
reporter.status = &s;
|
||||||
log::Reader reader(std::move(file), &reporter, true/*checksum*/,
|
log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/,
|
||||||
0/*initial_offset*/);
|
0 /*initial_offset*/);
|
||||||
Slice record;
|
Slice record;
|
||||||
std::string scratch;
|
std::string scratch;
|
||||||
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
|
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
|
||||||
@ -1797,7 +1819,6 @@ Status VersionSet::Recover(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
file.reset();
|
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
if (!have_next_file) {
|
if (!have_next_file) {
|
||||||
@ -1846,7 +1867,7 @@ Status VersionSet::Recover(
|
|||||||
"manifest_file_number is %lu, next_file_number is %lu, "
|
"manifest_file_number is %lu, next_file_number is %lu, "
|
||||||
"last_sequence is %lu, log_number is %lu,"
|
"last_sequence is %lu, log_number is %lu,"
|
||||||
"prev_log_number is %lu\n",
|
"prev_log_number is %lu\n",
|
||||||
current.c_str(),
|
manifest_filename.c_str(),
|
||||||
(unsigned long)manifest_file_number_,
|
(unsigned long)manifest_file_number_,
|
||||||
(unsigned long)next_file_number_,
|
(unsigned long)next_file_number_,
|
||||||
(unsigned long)last_sequence_,
|
(unsigned long)last_sequence_,
|
||||||
@ -2229,8 +2250,8 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
|
|||||||
// approximate offset of "ikey" within the table.
|
// approximate offset of "ikey" within the table.
|
||||||
TableReader* table_reader_ptr;
|
TableReader* table_reader_ptr;
|
||||||
Iterator* iter = v->cfd_->table_cache()->NewIterator(
|
Iterator* iter = v->cfd_->table_cache()->NewIterator(
|
||||||
ReadOptions(), storage_options_, files[i]->number,
|
ReadOptions(), storage_options_, v->cfd_->internal_comparator(),
|
||||||
files[i]->file_size, &table_reader_ptr);
|
*(files[i]), &table_reader_ptr);
|
||||||
if (table_reader_ptr != nullptr) {
|
if (table_reader_ptr != nullptr) {
|
||||||
result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
|
result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
|
||||||
}
|
}
|
||||||
@ -2285,8 +2306,9 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
|
|||||||
if (c->level() + which == 0) {
|
if (c->level() + which == 0) {
|
||||||
for (const auto& file : *c->inputs(which)) {
|
for (const auto& file : *c->inputs(which)) {
|
||||||
list[num++] = c->column_family_data()->table_cache()->NewIterator(
|
list[num++] = c->column_family_data()->table_cache()->NewIterator(
|
||||||
options, storage_options_compactions_, file->number,
|
options, storage_options_compactions_,
|
||||||
file->file_size, nullptr, true /* for compaction */);
|
c->column_family_data()->internal_comparator(), *file, nullptr,
|
||||||
|
true /* for compaction */);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Create concatenating iterator for the files from this level
|
// Create concatenating iterator for the files from this level
|
||||||
@ -2295,13 +2317,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
|
|||||||
c->column_family_data()->internal_comparator(),
|
c->column_family_data()->internal_comparator(),
|
||||||
c->inputs(which)),
|
c->inputs(which)),
|
||||||
&GetFileIterator, c->column_family_data()->table_cache(), options,
|
&GetFileIterator, c->column_family_data()->table_cache(), options,
|
||||||
storage_options_, true /* for compaction */);
|
storage_options_, c->column_family_data()->internal_comparator(),
|
||||||
|
true /* for compaction */);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert(num <= space);
|
assert(num <= space);
|
||||||
Iterator* result = NewMergingIterator(
|
Iterator* result = NewMergingIterator(
|
||||||
&c->column_family_data()->internal_comparator(), list, num);
|
env_, &c->column_family_data()->internal_comparator(), list, num);
|
||||||
delete[] list;
|
delete[] list;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -2356,14 +2379,14 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
|
Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
|
||||||
FileMetaData* meta,
|
FileMetaData** meta,
|
||||||
ColumnFamilyData** cfd) {
|
ColumnFamilyData** cfd) {
|
||||||
for (auto cfd_iter : *column_family_set_) {
|
for (auto cfd_iter : *column_family_set_) {
|
||||||
Version* version = cfd_iter->current();
|
Version* version = cfd_iter->current();
|
||||||
for (int level = 0; level < version->NumberLevels(); level++) {
|
for (int level = 0; level < version->NumberLevels(); level++) {
|
||||||
for (const auto& file : version->files_[level]) {
|
for (const auto& file : version->files_[level]) {
|
||||||
if (file->number == number) {
|
if (file->number == number) {
|
||||||
*meta = *file;
|
*meta = file;
|
||||||
*filelevel = level;
|
*filelevel = level;
|
||||||
*cfd = cfd_iter;
|
*cfd = cfd_iter;
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
|
@ -85,8 +85,8 @@ class Version {
   };
   void Get(const ReadOptions&, const LookupKey& key, std::string* val,
            Status* status, MergeContext* merge_context,
-           GetStats* stats, const Options& db_option, bool* value_found =
-           nullptr);
+           GetStats* stats, const Options& db_option,
+           bool* value_found = nullptr);
 
   // Adds "stats" into the current state.  Returns true if a new
   // compaction may need to be triggered, false otherwise.
@ -101,7 +101,9 @@ class Version {
   // Reference count management (so Versions do not disappear out from
   // under live iterators)
   void Ref();
-  void Unref();
+  // Decrease reference count. Delete the object if no reference left
+  // and return true. Otherwise, return false.
+  bool Unref();
 
   // Returns true iff some level needs a compaction.
   bool NeedsCompaction() const;
@ -384,7 +386,7 @@ class VersionSet {
   bool VerifyCompactionFileConsistency(Compaction* c);
 
   Status GetMetadataForFile(uint64_t number, int* filelevel,
-                            FileMetaData* metadata, ColumnFamilyData** cfd);
+                            FileMetaData** metadata, ColumnFamilyData** cfd);
 
   void GetLiveFilesMetaData(
     std::vector<LiveFileMetaData> *metadata);
@ -146,7 +146,7 @@ Status WriteBatch::Iterate(Handler* handler) const {
         return Status::Corruption("unknown WriteBatch tag");
     }
   }
   if (found != WriteBatchInternal::Count(this)) {
     return Status::Corruption("WriteBatch has wrong count");
   } else {
     return Status::OK();
|
|||||||
}
|
}
|
||||||
MemTable* mem = cf_mems_->GetMemTable();
|
MemTable* mem = cf_mems_->GetMemTable();
|
||||||
const Options* options = cf_mems_->GetFullOptions();
|
const Options* options = cf_mems_->GetFullOptions();
|
||||||
if (options->inplace_update_support &&
|
if (!options->inplace_update_support) {
|
||||||
mem->Update(sequence_, kTypeValue, key, value)) {
|
mem->Add(sequence_, kTypeValue, key, value);
|
||||||
|
} else if (options->inplace_callback == nullptr) {
|
||||||
|
mem->Update(sequence_, key, value);
|
||||||
RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED);
|
RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED);
|
||||||
} else {
|
} else {
|
||||||
mem->Add(sequence_, kTypeValue, key, value);
|
if (mem->UpdateCallback(sequence_, key, value, *options)) {
|
||||||
|
} else {
|
||||||
|
// key not found in memtable. Do sst get, update, add
|
||||||
|
SnapshotImpl read_from_snapshot;
|
||||||
|
read_from_snapshot.number_ = sequence_;
|
||||||
|
ReadOptions ropts;
|
||||||
|
ropts.snapshot = &read_from_snapshot;
|
||||||
|
|
||||||
|
std::string prev_value;
|
||||||
|
std::string merged_value;
|
||||||
|
Status s = db_->Get(ropts, key, &prev_value);
|
||||||
|
char* prev_buffer = const_cast<char*>(prev_value.c_str());
|
||||||
|
uint32_t prev_size = prev_value.size();
|
||||||
|
auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr,
|
||||||
|
s.ok() ? &prev_size : nullptr,
|
||||||
|
value, &merged_value);
|
||||||
|
if (status == UpdateStatus::UPDATED_INPLACE) {
|
||||||
|
// prev_value is updated in-place with final value.
|
||||||
|
mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
|
||||||
|
RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
|
||||||
|
} else if (status == UpdateStatus::UPDATED) {
|
||||||
|
// merged_value contains the final value.
|
||||||
|
mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
|
||||||
|
RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// Since all Puts are logged in trasaction logs (if enabled), always bump
|
||||||
|
// sequence number. Even if the update eventually fails and does not result
|
||||||
|
// in memtable add/update.
|
||||||
sequence_++;
|
sequence_++;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void MergeCF(uint32_t column_family_id, const Slice& key,
|
virtual void MergeCF(uint32_t column_family_id, const Slice& key,
|
||||||
const Slice& value) {
|
const Slice& value) {
|
||||||
bool found = cf_mems_->Seek(column_family_id);
|
bool found = cf_mems_->Seek(column_family_id);
|
||||||
@ -333,6 +364,7 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||||||
|
|
||||||
sequence_++;
|
sequence_++;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void DeleteCF(uint32_t column_family_id, const Slice& key) {
|
virtual void DeleteCF(uint32_t column_family_id, const Slice& key) {
|
||||||
bool found = cf_mems_->Seek(column_family_id);
|
bool found = cf_mems_->Seek(column_family_id);
|
||||||
if (!found || IgnoreUpdate()) {
|
if (!found || IgnoreUpdate()) {
|
||||||
|
@ -58,10 +58,7 @@ static std::string PrintContents(WriteBatch* b) {
         state.append(")");
         count++;
         break;
-      case kTypeColumnFamilyDeletion:
-      case kTypeColumnFamilyValue:
-      case kTypeColumnFamilyMerge:
-      case kTypeLogData:
+      default:
         assert(false);
         break;
     }
@ -1,45 +0,0 @@
-// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
-// Arena class defines memory allocation methods. It's used by memtable and
-// skiplist.
-
-#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
-#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_
-
-#include <limits>
-#include <memory>
-
-namespace rocksdb {
-
-class Arena {
- public:
-  Arena() {};
-  virtual ~Arena() {};
-
-  // Return a pointer to a newly allocated memory block of "bytes" bytes.
-  virtual char* Allocate(size_t bytes) = 0;
-
-  // Allocate memory with the normal alignment guarantees provided by malloc.
-  virtual char* AllocateAligned(size_t bytes) = 0;
-
-  // Returns an estimate of the total memory used by arena.
-  virtual const size_t ApproximateMemoryUsage() = 0;
-
-  // Returns the total number of bytes in all blocks allocated so far.
-  virtual const size_t MemoryAllocatedBytes() = 0;
-
- private:
-  // No copying allowed
-  Arena(const Arena&);
-  void operator=(const Arena&);
-};
-
-}  // namespace rocksdb
-
-#endif  // STORAGE_ROCKSDB_INCLUDE_ARENA_H_
@ -102,7 +102,10 @@ class Cache {
   virtual uint64_t NewId() = 0;
 
   // returns the maximum configured capacity of the cache
-  virtual size_t GetCapacity() = 0;
+  virtual size_t GetCapacity() const = 0;
+
+  // returns the memory size for the entries residing in the cache.
+  virtual size_t GetUsage() const = 0;
 
   // Call this on shutdown if you want to speed it up. Cache will disown
   // any underlying data and will not free it on delete. This call will leak
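The new GetUsage() accessor above lets an application see how much memory the cache's resident entries are charged against its configured capacity. A hedged usage sketch (assumes the standard rocksdb::NewLRUCache factory; exact headers may vary by release):

    #include <cstdio>
    #include <memory>
    #include "rocksdb/cache.h"

    void ReportBlockCacheFill(const std::shared_ptr<rocksdb::Cache>& cache) {
      // GetUsage() = memory charged by entries currently in the cache;
      // GetCapacity() = configured upper bound.
      std::printf("block cache: %zu of %zu bytes used\n",
                  cache->GetUsage(), cache->GetCapacity());
    }

    // Usage sketch:
    //   std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(64 << 20);
    //   ReportBlockCacheFill(cache);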
@ -438,7 +438,7 @@ class WritableFile {
   // This asks the OS to initiate flushing the cached data to disk,
   // without waiting for completion.
   // Default implementation does nothing.
-  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
+  virtual Status RangeSync(off_t offset, off_t nbytes) {
     return Status::OK();
   }
 
@ -33,8 +33,7 @@
 // iteration over the entire collection is rare since doing so requires all the
 // keys to be copied into a sorted data structure.
 
-#ifndef STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
-#define STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
+#pragma once
 
 #include <memory>
 
@ -52,7 +51,11 @@ class MemTableRep {
   public:
    // Compare a and b. Return a negative value if a is less than b, 0 if they
    // are equal, and a positive value if a is greater than b
-   virtual int operator()(const char* a, const char* b) const = 0;
+   virtual int operator()(const char* prefix_len_key1,
+                          const char* prefix_len_key2) const = 0;
+
+   virtual int operator()(const char* prefix_len_key,
+                          const Slice& key) const = 0;
 
    virtual ~KeyComparator() { }
  };
@ -100,7 +103,7 @@ class MemTableRep {
     virtual void Prev() = 0;
 
     // Advance to the first entry with a key >= target
-    virtual void Seek(const char* target) = 0;
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
 
     // Position at the first entry in collection.
     // Final state of iterator is Valid() iff collection is not empty.
@ -175,26 +178,22 @@ public:
   }
 };
 
-// HashSkipListRep is backed by hash map of buckets. Each bucket is a skip
-// list. All the keys with the same prefix will be in the same bucket.
-// The prefix is determined using user supplied SliceTransform. It has
-// to match prefix_extractor in options.prefix_extractor.
-//
-// Iteration over the entire collection is implemented by dumping all the keys
-// into a separate skip list. Thus, these data structures are best used when
-// iteration over the entire collection is rare.
-//
-// Parameters:
-//   transform: The prefix extractor that returns prefix when supplied a user
-//              key. Has to match options.prefix_extractor
-//   bucket_count: Number of buckets in a hash_map. Each bucket needs
-//                 8 bytes. By default, we set buckets to one million, which
-//                 will take 8MB of memory. If you know the number of keys you'll
-//                 keep in hash map, set bucket count to be approximately twice
-//                 the number of keys
+// This class contains a fixed array of buckets, each
+// pointing to a skiplist (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+// skiplist_height: the max height of the skiplist
+// skiplist_branching_factor: probabilistic size ratio between adjacent
+//                            link lists in the skiplist
 extern MemTableRepFactory* NewHashSkipListRepFactory(
-    const SliceTransform* transform, size_t bucket_count = 1000000);
+    const SliceTransform* transform, size_t bucket_count = 1000000,
+    int32_t skiplist_height = 4, int32_t skiplist_branching_factor = 4
+);
+
+// The factory is to create memtables with a hashed linked list:
+// it contains a fixed array of buckets, each pointing to a sorted single
+// linked list (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+    const SliceTransform* transform, size_t bucket_count = 50000);
 
 }
 
-#endif  // STORAGE_ROCKSDB_DB_MEMTABLEREP_H_
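As a usage sketch, one of the hash-based memtable factories declared above is selected through Options::memtable_factory together with a prefix extractor. This is a hedged example: it assumes NewFixedPrefixTransform from rocksdb/slice_transform.h and the raw-pointer prefix_extractor field of this era of the code.

    #include "rocksdb/memtablerep.h"
    #include "rocksdb/options.h"
    #include "rocksdb/slice_transform.h"

    rocksdb::Options MakeHashSkipListOptions() {
      rocksdb::Options options;
      // Keys sharing the same 8-byte prefix land in the same skiplist bucket.
      options.prefix_extractor = rocksdb::NewFixedPrefixTransform(8);
      options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
          options.prefix_extractor, 1000000 /* bucket_count */,
          4 /* skiplist_height */, 4 /* skiplist_branching_factor */));
      return options;
    }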
@ -34,6 +34,7 @@ class TablePropertiesCollector;
 class Slice;
 class SliceTransform;
 class Statistics;
+class InternalKeyComparator;
 
 using std::shared_ptr;
 
@ -65,6 +66,12 @@ struct CompressionOptions {
       : window_bits(wbits), level(lev), strategy(strategy) {}
 };
 
+enum UpdateStatus {    // Return status For inplace update callback
+  UPDATE_FAILED   = 0, // Nothing to update
+  UPDATED_INPLACE = 1, // Value updated inplace
+  UPDATED         = 2, // No inplace update. Merged value set
+};
+
 struct Options;
 
 struct ColumnFamilyOptions {
@ -410,13 +417,17 @@ struct ColumnFamilyOptions {
   // the tables.
   // Default: emtpy vector -- no user-defined statistics collection will be
   // performed.
-  std::vector<std::shared_ptr<TablePropertiesCollector>>
-      table_properties_collectors;
+  typedef std::vector<std::shared_ptr<TablePropertiesCollector>>
+      TablePropertiesCollectors;
+  TablePropertiesCollectors table_properties_collectors;
 
-  // Allows thread-safe inplace updates. Requires Updates iff
-  //  * key exists in current memtable
-  //  * new sizeof(new_value) <= sizeof(old_value)
-  //  * old_value for that key is a put i.e. kTypeValue
+  // Allows thread-safe inplace updates.
+  // If inplace_callback function is not set,
+  //   Put(key, new_value) will update inplace the existing_value iff
+  //   * key exists in current memtable
+  //   * new sizeof(new_value) <= sizeof(existing_value)
+  //   * existing_value for that key is a put i.e. kTypeValue
+  // If inplace_callback function is set, check doc for inplace_callback.
   // Default: false.
   bool inplace_update_support;
 
|
|||||||
// Default: 10000, if inplace_update_support = true, else 0.
|
// Default: 10000, if inplace_update_support = true, else 0.
|
||||||
size_t inplace_update_num_locks;
|
size_t inplace_update_num_locks;
|
||||||
|
|
||||||
|
// existing_value - pointer to previous value (from both memtable and sst).
|
||||||
|
// nullptr if key doesn't exist
|
||||||
|
// existing_value_size - pointer to size of existing_value).
|
||||||
|
// nullptr if key doesn't exist
|
||||||
|
// delta_value - Delta value to be merged with the existing_value.
|
||||||
|
// Stored in transaction logs.
|
||||||
|
// merged_value - Set when delta is applied on the previous value.
|
||||||
|
|
||||||
|
// Applicable only when inplace_update_support is true,
|
||||||
|
// this callback function is called at the time of updating the memtable
|
||||||
|
// as part of a Put operation, lets say Put(key, delta_value). It allows the
|
||||||
|
// 'delta_value' specified as part of the Put operation to be merged with
|
||||||
|
// an 'existing_value' of the key in the database.
|
||||||
|
|
||||||
|
// If the merged value is smaller in size that the 'existing_value',
|
||||||
|
// then this function can update the 'existing_value' buffer inplace and
|
||||||
|
// the corresponding 'existing_value'_size pointer, if it wishes to.
|
||||||
|
// The callback should return UpdateStatus::UPDATED_INPLACE.
|
||||||
|
// In this case. (In this case, the snapshot-semantics of the rocksdb
|
||||||
|
// Iterator is not atomic anymore).
|
||||||
|
|
||||||
|
// If the merged value is larger in size than the 'existing_value' or the
|
||||||
|
// application does not wish to modify the 'existing_value' buffer inplace,
|
||||||
|
// then the merged value should be returned via *merge_value. It is set by
|
||||||
|
// merging the 'existing_value' and the Put 'delta_value'. The callback should
|
||||||
|
// return UpdateStatus::UPDATED in this case. This merged value will be added
|
||||||
|
// to the memtable.
|
||||||
|
|
||||||
|
// If merging fails or the application does not wish to take any action,
|
||||||
|
// then the callback should return UpdateStatus::UPDATE_FAILED.
|
||||||
|
|
||||||
|
// Please remember that the original call from the application is Put(key,
|
||||||
|
// delta_value). So the transaction log (if enabled) will still contain (key,
|
||||||
|
// delta_value). The 'merged_value' is not stored in the transaction log.
|
||||||
|
// Hence the inplace_callback function should be consistent across db reopens.
|
||||||
|
|
||||||
|
// Default: nullptr
|
||||||
|
UpdateStatus (*inplace_callback)(char* existing_value,
|
||||||
|
uint32_t* existing_value_size,
|
||||||
|
Slice delta_value,
|
||||||
|
std::string* merged_value);
|
||||||
|
|
||||||
|
// if prefix_extractor is set and bloom_bits is not 0, create prefix bloom
|
||||||
|
// for memtable
|
||||||
|
uint32_t memtable_prefix_bloom_bits;
|
||||||
|
|
||||||
|
// number of hash probes per key
|
||||||
|
uint32_t memtable_prefix_bloom_probes;
|
||||||
|
|
||||||
// Maximum number of successive merge operations on a key in the memtable.
|
// Maximum number of successive merge operations on a key in the memtable.
|
||||||
//
|
//
|
||||||
// When a merge operation is added to the memtable and the maximum number of
|
// When a merge operation is added to the memtable and the maximum number of
|
||||||
@ -473,9 +533,10 @@ struct DBOptions {
|
|||||||
shared_ptr<Logger> info_log;
|
shared_ptr<Logger> info_log;
|
||||||
|
|
||||||
// Number of open files that can be used by the DB. You may need to
|
// Number of open files that can be used by the DB. You may need to
|
||||||
// increase this if your database has a large working set (budget
|
// increase this if your database has a large working set. Value -1 means
|
||||||
// one open file per 2MB of working set).
|
// files opened are always kept open. You can estimate number of files based
|
||||||
//
|
// on target_file_size_base and target_file_size_multiplier for level-based
|
||||||
|
// compaction. For universal-style compaction, you can usually set it to -1.
|
||||||
// Default: 1000
|
// Default: 1000
|
||||||
int max_open_files;
|
int max_open_files;
|
||||||
|
|
||||||
|
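A short hedged sketch of the new -1 setting documented above, which keeps every table file open so the table cache never evicts handles (and, per the LoadTableHandlers change earlier in this diff, table readers can be pre-loaded when a version is installed):

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    rocksdb::DB* OpenWithUnlimitedFileHandles(const std::string& path) {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.max_open_files = -1;  // keep all SST files open; no table-cache eviction
      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
      return s.ok() ? db : nullptr;
    }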
@@ -38,7 +38,27 @@ struct PerfContext {
 uint64_t internal_key_skipped_count;
 // total number of deletes skipped over during iteration
 uint64_t internal_delete_skipped_count;
-uint64_t wal_write_time; // total time spent on writing to WAL
+
+uint64_t get_snapshot_time; // total time spent on getting snapshot
+uint64_t get_from_memtable_time; // total time spent on querying memtables
+uint64_t get_from_memtable_count; // number of mem tables queried
+// total time spent after Get() finds a key
+uint64_t get_post_process_time;
+uint64_t get_from_output_files_time; // total time reading from output files
+// total time spent on seeking child iters
+uint64_t seek_child_seek_time;
+// number of seek issued in child iterators
+uint64_t seek_child_seek_count;
+uint64_t seek_min_heap_time; // total time spent on the merge heap
+// total time spent on seeking the internal entries
+uint64_t seek_internal_seek_time;
+// total time spent on iterating internal entries to find the next user entry
+uint64_t find_next_user_entry_time;
+// total time spent on pre or post processing when writing a record
+uint64_t write_pre_and_post_process_time;
+uint64_t write_wal_time; // total time spent on writing to WAL
+// total time spent on writing to mem tables
+uint64_t write_memtable_time;
 };

 extern __thread PerfContext perf_context;
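A small usage sketch for the counters added above. It only reads fields this header declares; the DB handle plumbing and any perf-level switch needed for the *_time counters to be populated are assumptions.

    #include <cstdio>
    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/perf_context.h"

    void ReportGetBreakdown(rocksdb::DB* db, const rocksdb::Slice& key) {
      uint64_t memtable_before = rocksdb::perf_context.get_from_memtable_time;
      uint64_t files_before = rocksdb::perf_context.get_from_output_files_time;

      std::string value;
      db->Get(rocksdb::ReadOptions(), key, &value);  // status ignored in this sketch

      std::fprintf(stderr, "memtable: %llu, output files: %llu\n",
                   (unsigned long long)(rocksdb::perf_context.get_from_memtable_time - memtable_before),
                   (unsigned long long)(rocksdb::perf_context.get_from_output_files_time - files_before));
    }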
@@ -7,7 +7,6 @@
 #define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_

 #include <atomic>
-#include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <string>
@@ -18,10 +17,8 @@ namespace rocksdb {

 /**
 * Keep adding ticker's here.
-* Any ticker should have a value less than TICKER_ENUM_MAX.
-* Add a new ticker by assigning it the current value of TICKER_ENUM_MAX
-* Add a string representation in TickersNameMap below.
-* And incrementing TICKER_ENUM_MAX.
+* 1. Any ticker should be added before TICKER_ENUM_MAX.
+* 2. Add a readable string in TickersNameMap below for the newly added ticker.
 */
 enum Tickers {
 // total block cache misses
@@ -252,7 +249,7 @@ class Statistics {
 virtual void setTickerCount(Tickers tickerType, uint64_t count) = 0;
 virtual void measureTime(Histograms histogramType, uint64_t time) = 0;

-virtual void histogramData(Histograms type, HistogramData * const data) = 0;
+virtual void histogramData(Histograms type, HistogramData* const data) = 0;
 // String representation of the statistic object.
 std::string ToString();
 };
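As a usage note for the ticker interface shown above, reading a counter off the statistics object might look like the following sketch; CreateDBStatistics(), getTickerCount() and the BLOCK_CACHE_MISS ticker are assumed to be available unchanged in this version.

    #include <cstdio>
    #include <memory>
    #include "rocksdb/statistics.h"

    void ReportBlockCacheMisses(const std::shared_ptr<rocksdb::Statistics>& stats) {
      if (stats) {
        uint64_t misses = stats->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
        std::fprintf(stderr, "block cache misses: %llu\n", (unsigned long long)misses);
      }
    }

    // Assumed wiring: options.statistics = rocksdb::CreateDBStatistics();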
@@ -1,127 +1,81 @@
-// Copyright (c) 2013, Facebook, Inc. All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-//
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+// 1. Block-based table: this is the default table type that we inherited from
+//    LevelDB, which was designed for storing data in hard disk or flash
+//    device.
+// 2. Plain table: it is one of RocksDB's SST file format optimized
+//    for low query latency on pure-memory or really low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples

 #pragma once
 #include <memory>
-#include <stdint.h>
+#include <string>
+#include <unordered_map>

 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
-#include "rocksdb/table_properties.h"
 #include "rocksdb/options.h"
+#include "rocksdb/status.h"

 namespace rocksdb {

-struct Options;
+// -- Block-based Table
+class FlushBlockPolicyFactory;
 class RandomAccessFile;
-struct ReadOptions;
-class TableCache;
+class TableBuilder;
+class TableReader;
 class WritableFile;
+struct EnvOptions;
+struct Options;

 using std::unique_ptr;

-// TableBuilder provides the interface used to build a Table
-// (an immutable and sorted map from keys to values).
-//
-// Multiple threads can invoke const methods on a TableBuilder without
-// external synchronization, but if any of the threads may call a
-// non-const method, all threads accessing the same TableBuilder must use
-// external synchronization.
-class TableBuilder {
- public:
-  // REQUIRES: Either Finish() or Abandon() has been called.
-  virtual ~TableBuilder() {}
-
-  // Add key,value to the table being constructed.
-  // REQUIRES: key is after any previously added key according to comparator.
-  // REQUIRES: Finish(), Abandon() have not been called
-  virtual void Add(const Slice& key, const Slice& value) = 0;
-
-  // Return non-ok iff some error has been detected.
-  virtual Status status() const = 0;
-
-  // Finish building the table.
-  // REQUIRES: Finish(), Abandon() have not been called
-  virtual Status Finish() = 0;
-
-  // Indicate that the contents of this builder should be abandoned.
-  // If the caller is not going to call Finish(), it must call Abandon()
-  // before destroying this builder.
-  // REQUIRES: Finish(), Abandon() have not been called
-  virtual void Abandon() = 0;
-
-  // Number of calls to Add() so far.
-  virtual uint64_t NumEntries() const = 0;
-
-  // Size of the file generated so far. If invoked after a successful
-  // Finish() call, returns the size of the final generated file.
-  virtual uint64_t FileSize() const = 0;
-};
-
-// A Table is a sorted map from strings to strings. Tables are
-// immutable and persistent. A Table may be safely accessed from
-// multiple threads without external synchronization.
-class TableReader {
- public:
-  virtual ~TableReader() {}
-
-  // Determine whether there is a chance that the current table file
-  // contains the key a key starting with iternal_prefix. The specific
-  // table implementation can use bloom filter and/or other heuristic
-  // to filter out this table as a whole.
-  virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;
-
-  // Returns a new iterator over the table contents.
-  // The result of NewIterator() is initially invalid (caller must
-  // call one of the Seek methods on the iterator before using it).
-  virtual Iterator* NewIterator(const ReadOptions&) = 0;
-
-  // Given a key, return an approximate byte offset in the file where
-  // the data for that key begins (or would begin if the key were
-  // present in the file). The returned value is in terms of file
-  // bytes, and so includes effects like compression of the underlying data.
-  // E.g., the approximate offset of the last key in the table will
-  // be close to the file length.
-  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
-
-  // Returns true if the block for the specified key is in cache.
-  // REQUIRES: key is in this table.
-  virtual bool TEST_KeyInCache(const ReadOptions& options,
-                               const Slice& key) = 0;
-
-  // Set up the table for Compaction. Might change some parameters with
-  // posix_fadvise
-  virtual void SetupForCompaction() = 0;
-
-  virtual TableProperties& GetTableProperties() = 0;
-
-  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
-  // the entry found after a call to Seek(key), until result_handler returns
-  // false, where k is the actual internal key for a row found and v as the
-  // value of the key. didIO is true if I/O is involved in the operation. May
-  // not make such a call if filter policy says that key is not present.
-  //
-  // mark_key_may_exist_handler needs to be called when it is configured to be
-  // memory only and the key is not found in the block cache, with
-  // the parameter to be handle_context.
-  //
-  // readOptions is the options for the read
-  // key is the key to search for
-  virtual Status Get(
-      const ReadOptions& readOptions,
-      const Slice& key,
-      void* handle_context,
-      bool (*result_handler)(void* handle_context, const Slice& k,
-                             const Slice& v, bool didIO),
-      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
-};
-
-// A base class for table factories
+// For advanced user only
+struct BlockBasedTableOptions {
+  // @flush_block_policy_factory creates the instances of flush block policy.
+  // which provides a configurable way to determine when to flush a block in
+  // the block based tables. If not set, table builder will use the default
+  // block flush policy, which cut blocks by block size (please refer to
+  // `FlushBlockBySizePolicy`).
+  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+
+  // TODO(kailiu) Temporarily disable this feature by making the default value
+  // to be false.
+  //
+  // Indicating if we'd put index/filter blocks to the block cache.
+  // If not specified, each "table reader" object will pre-load index/filter
+  // block during table initialization.
+  bool cache_index_and_filter_blocks = false;
+};
+
+// Create default block based table factory.
+extern TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+// -- Plain Table
+// @user_key_len: plain table has optimization for fix-sized keys, which can be
+//                specified via user_key_len. Alternatively, you can pass
+//                `kPlainTableVariableLength` if your keys have variable
+//                lengths.
+// @bloom_bits_per_key: the number of bits used for bloom filer per key. You may
+//                disable it by passing a zero.
+// @hash_table_ratio: the desired utilization of the hash table used for prefix
+//                hashing. hash_table_ratio = number of prefixes / #buckets
+//                in the hash table
+const uint32_t kPlainTableVariableLength = 0;
+extern TableFactory* NewPlainTableFactory(
+    uint32_t user_key_len = kPlainTableVariableLength,
+    int bloom_bits_per_key = 10, double hash_table_ratio = 0.75);
+
+// A base class for table factories.
 class TableFactory {
  public:
  virtual ~TableFactory() {}
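The new header above documents two ready-made factories. The sketch below wires each into Options; the function names, the 8-byte prefix, the cache_index_and_filter_blocks setting, and the pairing of plain table with a prefix extractor are illustrative assumptions rather than requirements stated in this diff.

    #include "rocksdb/options.h"
    #include "rocksdb/slice_transform.h"
    #include "rocksdb/table.h"

    rocksdb::Options UseBlockBasedTable() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.cache_index_and_filter_blocks = true;  // opt in; default is false
      rocksdb::Options options;
      options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }

    rocksdb::Options UsePlainTable() {
      rocksdb::Options options;
      options.prefix_extractor = rocksdb::NewFixedPrefixTransform(8);
      // variable-length keys, 10 bloom bits per key, 0.75 hash-table utilization
      options.table_factory.reset(rocksdb::NewPlainTableFactory(
          rocksdb::kPlainTableVariableLength, 10, 0.75));
      return options;
    }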
@@ -139,7 +93,7 @@ class TableFactory {
 // in parameter file. It's the caller's responsibility to make sure
 // file is in the correct format.
 //
-// GetTableReader() is called in two places:
+// NewTableReader() is called in two places:
 // (1) TableCache::FindTable() calls the function when table cache miss
 // and cache the table object returned.
 // (1) SstFileReader (for SST Dump) opens the table and dump the table
@@ -150,9 +104,10 @@ class TableFactory {
 // file is a file handler to handle the file for the table
 // file_size is the physical file size of the file
 // table_reader is the output table reader
-virtual Status GetTableReader(
+virtual Status NewTableReader(
     const Options& options, const EnvOptions& soptions,
-    unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+    const InternalKeyComparator& internal_comparator,
+    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
     unique_ptr<TableReader>* table_reader) const = 0;

 // Return a table builder to write to a file for this table type.
@@ -173,8 +128,9 @@ class TableFactory {
 // file is a handle of a writable file. It is the caller's responsibility to
 // keep the file open and close the file after closing the table builder.
 // compression_type is the compression type to use in this table.
-virtual TableBuilder* GetTableBuilder(
-    const Options& options, WritableFile* file,
-    CompressionType compression_type) const = 0;
+virtual TableBuilder* NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type) const = 0;
 };

 } // namespace rocksdb
@@ -1,28 +1,25 @@
-// Copyright (c) 2013, Facebook, Inc. All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
 #pragma once

 #include <string>
 #include <unordered_map>

 #include "rocksdb/status.h"

 namespace rocksdb {

+// -- Table Properties
+// Other than basic table properties, each table may also have the user
+// collected properties.
+// The value of the user-collected properties are encoded as raw bytes --
+// users have to interprete these values by themselves.
+typedef std::unordered_map<std::string, std::string> UserCollectedProperties;
+
 // TableProperties contains a bunch of read-only properties of its associated
 // table.
 struct TableProperties {
  public:
-  // Other than basic table properties, each table may also have the user
-  // collected properties.
-  // The value of the user-collected properties are encoded as raw bytes --
-  // users have to interprete these values by themselves.
-  typedef
-      std::unordered_map<std::string, std::string>
-          UserCollectedProperties;
-
  // the total size of all data blocks.
  uint64_t data_size = 0;
  // the size of index block.
@@ -37,6 +34,10 @@ struct TableProperties {
 uint64_t num_data_blocks = 0;
 // the number of entries in this table
 uint64_t num_entries = 0;
+// format version, reserved for backward compatibility
+uint64_t format_version = 0;
+// If 0, key is variable length. Otherwise number of bytes for each key.
+uint64_t fixed_key_len = 0;

 // The name of the filter policy used in this table.
 // If no filter policy is used, `filter_policy_name` will be an empty string.
@@ -47,17 +48,32 @@ struct TableProperties {

 // convert this object to a human readable form
 // @prop_delim: delimiter for each property.
-std::string ToString(
-    const std::string& prop_delim = "; ",
-    const std::string& kv_delim = "=") const;
+std::string ToString(const std::string& prop_delim = "; ",
+                     const std::string& kv_delim = "=") const;
 };

+// table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+  static const std::string kDataSize;
+  static const std::string kIndexSize;
+  static const std::string kFilterSize;
+  static const std::string kRawKeySize;
+  static const std::string kRawValueSize;
+  static const std::string kNumDataBlocks;
+  static const std::string kNumEntries;
+  static const std::string kFormatVersion;
+  static const std::string kFixedKeyLen;
+  static const std::string kFilterPolicy;
+};
+
+extern const std::string kPropertiesBlock;
+
 // `TablePropertiesCollector` provides the mechanism for users to collect
 // their own interested properties. This class is essentially a collection
 // of callback functions that will be invoked during table building.
 class TablePropertiesCollector {
  public:
-  virtual ~TablePropertiesCollector() { }
+  virtual ~TablePropertiesCollector() {}

 // Add() will be called when a new key/value pair is inserted into the table.
 // @params key the original key that is inserted into the table.
@@ -68,23 +84,20 @@ class TablePropertiesCollector {
 // for writing the properties block.
 // @params properties User will add their collected statistics to
 // `properties`.
-virtual Status Finish(
-    TableProperties::UserCollectedProperties* properties) = 0;
+virtual Status Finish(UserCollectedProperties* properties) = 0;

 // The name of the properties collector can be used for debugging purpose.
 virtual const char* Name() const = 0;

 // Return the human-readable properties, where the key is property name and
 // the value is the human-readable form of value.
-virtual TableProperties::UserCollectedProperties
-    GetReadableProperties() const = 0;
+virtual UserCollectedProperties GetReadableProperties() const = 0;
 };

 // Extra properties
 // Below is a list of non-basic properties that are collected by database
 // itself. Especially some properties regarding to the internal keys (which
 // is unknown to `table`).
-extern uint64_t GetDeletedKeys(
-    const TableProperties::UserCollectedProperties& props);
+extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);

 } // namespace rocksdb
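A collector against the interface above might look like the following sketch. The class name, the "tmp:" key prefix, and the property name are invented for illustration, and registration through Options::table_properties_collectors is assumed.

    #include <string>
    #include "rocksdb/slice.h"
    #include "rocksdb/table_properties.h"

    class TmpKeyCountCollector : public rocksdb::TablePropertiesCollector {
     public:
      rocksdb::Status Add(const rocksdb::Slice& key,
                          const rocksdb::Slice& /*value*/) override {
        if (key.starts_with("tmp:")) {
          ++count_;
        }
        return rocksdb::Status::OK();
      }

      rocksdb::Status Finish(rocksdb::UserCollectedProperties* properties) override {
        (*properties)["example.tmp-key-count"] = std::to_string(count_);
        return rocksdb::Status::OK();
      }

      const char* Name() const override { return "TmpKeyCountCollector"; }

      rocksdb::UserCollectedProperties GetReadableProperties() const override {
        return {{"example.tmp-key-count", std::to_string(count_)}};
      }

     private:
      uint64_t count_ = 0;
    };

    // Assumed registration:
    //   options.table_properties_collectors.push_back(
    //       std::make_shared<TmpKeyCountCollector>());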
@@ -396,7 +396,6 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length,
 _stream.next_out = (char *)(output + old_sz);
 _stream.avail_out = output_len - old_sz;
 break;
-case Z_BUF_ERROR:
 default:
 delete[] output;
 BZ2_bzDecompressEnd(&_stream);
@@ -17,15 +17,17 @@
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/comparator.h"
-#include "rocksdb/table.h"
+#include "table/table_builder.h"
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/options.h"
+#include "db/dbformat.h"
 #include "table/block_based_table_reader.h"
 #include "table/block.h"
 #include "table/block_builder.h"
 #include "table/filter_block.h"
 #include "table/format.h"
+#include "table/meta_blocks.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 #include "util/stop_watch.h"
@@ -34,51 +36,24 @@ namespace rocksdb {

 namespace {

-struct BytewiseLessThan {
-  bool operator()(const std::string& key1, const std::string& key2) const {
-    // smaller entries will be placed in front.
-    return comparator->Compare(key1, key2) <= 0;
-  }
-  const Comparator* comparator = BytewiseComparator();
-};
-
-// When writing to a block that requires entries to be sorted by
-// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap`
-// before writng to store.
-typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;
-
-void AddProperties(BytewiseSortedMap& props, std::string name, uint64_t val) {
-  assert(props.find(name) == props.end());
-
-  std::string dst;
-  PutVarint64(&dst, val);
-
-  props.insert(
-      std::make_pair(name, dst)
-  );
-}
-
 static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
 // Check to see if compressed less than 12.5%
 return compressed_size < raw_size - (raw_size / 8u);
 }

-// Were we encounter any error occurs during user-defined statistics collection,
-// we'll write the warning message to info log.
-void LogPropertiesCollectionError(
-    Logger* info_log, const std::string& method, const std::string& name) {
-  assert(method == "Add" || method == "Finish");
-
-  std::string msg =
-      "[Warning] encountered error when calling TablePropertiesCollector::" +
-      method + "() with collector name: " + name;
-  Log(info_log, "%s", msg.c_str());
-}
-
 } // anonymous namespace

+// kBlockBasedTableMagicNumber was picked by running
+// echo http://code.google.com/p/leveldb/ | sha1sum
+// and taking the leading 64 bits.
+// Please note that kBlockBasedTableMagicNumber may also be accessed by
+// other .cc files so it have to be explicitly declared with "extern".
+extern const uint64_t kBlockBasedTableMagicNumber
+    = 0xdb4775248b80fb57ull;
+
 struct BlockBasedTableBuilder::Rep {
 Options options;
+const InternalKeyComparator& internal_comparator;
 WritableFile* file;
 uint64_t offset = 0;
 Status status;
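The GoodCompressionRatio() helper kept in the hunk above encodes a plain 12.5% rule; a worked instance of the arithmetic, as a sanity check:

    // For a 4096-byte raw block the cutoff is 4096 - 4096/8 = 3584 bytes, so a
    // block compressing to 3600 bytes is stored uncompressed, one at 3500 is kept.
    static_assert(4096u - (4096u / 8u) == 3584u, "12.5% saving threshold for a 4KB block");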
@@ -98,31 +73,30 @@ struct BlockBasedTableBuilder::Rep {
 std::string compressed_output;
 std::unique_ptr<FlushBlockPolicy> flush_block_policy;

-Rep(const Options& opt,
-    WritableFile* f,
-    FlushBlockPolicyFactory* flush_block_policy_factory,
+Rep(const Options& opt, const InternalKeyComparator& icomparator,
+    WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory,
     CompressionType compression_type)
     : options(opt),
+      internal_comparator(icomparator),
       file(f),
-      data_block(options),
+      data_block(options, &internal_comparator),
       // To avoid linear scan, we make the block_restart_interval to be `1`
       // in index block builder
-      index_block(1 /* block_restart_interval */, options.comparator),
+      index_block(1 /* block_restart_interval */, &internal_comparator),
       compression_type(compression_type),
-      filter_block(opt.filter_policy == nullptr ? nullptr
-                   : new FilterBlockBuilder(opt)),
+      filter_block(opt.filter_policy == nullptr
+                   ? nullptr
+                   : new FilterBlockBuilder(opt, &internal_comparator)),
       flush_block_policy(
-          flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {
-}
+          flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {}
 };

 BlockBasedTableBuilder::BlockBasedTableBuilder(
-    const Options& options,
-    WritableFile* file,
-    FlushBlockPolicyFactory* flush_block_policy_factory,
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory,
     CompressionType compression_type)
-    : rep_(new Rep(options,
-                   file, flush_block_policy_factory, compression_type)) {
+    : rep_(new Rep(options, internal_comparator, file,
+                   flush_block_policy_factory, compression_type)) {
 if (rep_->filter_block != nullptr) {
 rep_->filter_block->StartBlock(0);
 }
@@ -145,7 +119,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
 assert(!r->closed);
 if (!ok()) return;
 if (r->props.num_entries > 0) {
-assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
+assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
 }

 auto should_flush = r->flush_block_policy->Update(key, value);
@@ -162,7 +136,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
 // entries in the first block and < all entries in subsequent
 // blocks.
 if (ok()) {
-r->options.comparator->FindShortestSeparator(&r->last_key, key);
+r->internal_comparator.FindShortestSeparator(&r->last_key, key);
 std::string handle_encoding;
 r->pending_handle.EncodeTo(&handle_encoding);
 r->index_block.Add(r->last_key, Slice(handle_encoding));
@@ -179,16 +153,12 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
 r->props.raw_key_size += key.size();
 r->props.raw_value_size += value.size();

-for (auto collector : r->options.table_properties_collectors) {
-  Status s = collector->Add(key, value);
-  if (!s.ok()) {
-    LogPropertiesCollectionError(
-        r->options.info_log.get(),
-        "Add", /* method */
-        collector->Name()
-    );
-  }
-}
+NotifyCollectTableCollectorsOnAdd(
+    key,
+    value,
+    r->options.table_properties_collectors,
+    r->options.info_log.get()
+);
 }

 void BlockBasedTableBuilder::Flush() {
@@ -370,7 +340,7 @@ Status BlockBasedTableBuilder::Finish() {
 // block, we will finish writing all index entries here and flush them
 // to storage after metaindex block is written.
 if (ok() && !empty_data_block) {
-r->options.comparator->FindShortSuccessor(&r->last_key);
+r->internal_comparator.FindShortSuccessor(&r->last_key);

 std::string handle_encoding;
 r->pending_handle.EncodeTo(&handle_encoding);
@@ -382,14 +352,7 @@ Status BlockBasedTableBuilder::Finish() {
 // 2. [meta block: properties]
 // 3. [metaindex block]
 if (ok()) {
-// We use `BytewiseComparator` as the comparator for meta block.
-BlockBuilder meta_index_block(
-    r->options.block_restart_interval,
-    BytewiseComparator()
-);
-// Key: meta block name
-// Value: block handle to that meta block
-BytewiseSortedMap meta_block_handles;
-
+MetaIndexBuilder meta_index_builer;
 // Write filter block.
 if (r->filter_block != nullptr) {
@@ -397,104 +360,43 @@ Status BlockBasedTableBuilder::Finish() {
 // of filter data.
 std::string key = BlockBasedTable::kFilterBlockPrefix;
 key.append(r->options.filter_policy->Name());
-std::string handle_encoding;
-filter_block_handle.EncodeTo(&handle_encoding);
-meta_block_handles.insert(
-    std::make_pair(key, handle_encoding)
-);
+meta_index_builer.Add(key, filter_block_handle);
 }

 // Write properties block.
 {
-BlockBuilder properties_block(
-    r->options.block_restart_interval,
-    BytewiseComparator()
-);
-
-BytewiseSortedMap properties;
-
-// Add basic properties
-AddProperties(
-    properties,
-    BlockBasedTablePropertiesNames::kRawKeySize,
-    r->props.raw_key_size
-);
-AddProperties(
-    properties,
-    BlockBasedTablePropertiesNames::kRawValueSize,
-    r->props.raw_value_size
-);
-AddProperties(
-    properties,
-    BlockBasedTablePropertiesNames::kDataSize,
-    r->props.data_size
-);
+PropertyBlockBuilder property_block_builder;
+std::vector<std::string> failed_user_prop_collectors;
+r->props.filter_policy_name = r->options.filter_policy != nullptr ?
+    r->options.filter_policy->Name() : "";
+
 r->props.index_size =
     r->index_block.CurrentSizeEstimate() + kBlockTrailerSize;
-AddProperties(
-    properties,
-    BlockBasedTablePropertiesNames::kIndexSize,
-    r->props.index_size
-);
-AddProperties(
-    properties,
-    BlockBasedTablePropertiesNames::kNumEntries,
-    r->props.num_entries
-);
-AddProperties(
-    properties,
-    BlockBasedTablePropertiesNames::kNumDataBlocks,
-    r->props.num_data_blocks);
-if (r->filter_block != nullptr) {
-  properties.insert({
-      BlockBasedTablePropertiesNames::kFilterPolicy,
-      r->options.filter_policy->Name()
-  });
-}
-AddProperties(
-    properties,
-    BlockBasedTablePropertiesNames::kFilterSize,
-    r->props.filter_size
-);
-
-for (auto collector : r->options.table_properties_collectors) {
-  TableProperties::UserCollectedProperties user_collected_properties;
-  Status s =
-      collector->Finish(&user_collected_properties);
-
-  if (!s.ok()) {
-    LogPropertiesCollectionError(
-        r->options.info_log.get(),
-        "Finish", /* method */
-        collector->Name()
-    );
-  } else {
-    properties.insert(
-        user_collected_properties.begin(),
-        user_collected_properties.end()
-    );
-  }
-}
-
-for (const auto& stat : properties) {
-  properties_block.Add(stat.first, stat.second);
-}
-
+// Add basic properties
+property_block_builder.AddTableProperty(r->props);
+
+NotifyCollectTableCollectorsOnFinish(
+    r->options.table_properties_collectors,
+    r->options.info_log.get(),
+    &property_block_builder
+);
+
 BlockHandle properties_block_handle;
-WriteBlock(&properties_block, &properties_block_handle);
-
-std::string handle_encoding;
-properties_block_handle.EncodeTo(&handle_encoding);
-meta_block_handles.insert(
-    { BlockBasedTable::kPropertiesBlock, handle_encoding }
+WriteRawBlock(
+    property_block_builder.Finish(),
+    kNoCompression,
+    &properties_block_handle
 );
+
+meta_index_builer.Add(kPropertiesBlock,
+                      properties_block_handle);
 } // end of properties block writing

-for (const auto& metablock : meta_block_handles) {
-  meta_index_block.Add(metablock.first, metablock.second);
-}
-
-WriteBlock(&meta_index_block, &metaindex_block_handle);
+WriteRawBlock(
+    meta_index_builer.Finish(),
+    kNoCompression,
+    &metaindex_block_handle
+);
 } // meta blocks and metaindex block.

 // Write index block
@@ -504,7 +406,7 @@ Status BlockBasedTableBuilder::Finish() {

 // Write footer
 if (ok()) {
-Footer footer;
+Footer footer(kBlockBasedTableMagicNumber);
 footer.set_metaindex_handle(metaindex_block_handle);
 footer.set_index_handle(index_block_handle);
 std::string footer_encoding;
@@ -556,4 +458,7 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
 return rep_->offset;
 }

+const std::string BlockBasedTable::kFilterBlockPrefix =
+    "filter.";
+
 } // namespace rocksdb
@@ -12,7 +12,7 @@
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
-#include "rocksdb/table.h"
+#include "table/table_builder.h"

 namespace rocksdb {

@@ -20,13 +20,13 @@ class BlockBuilder;
 class BlockHandle;
 class WritableFile;


 class BlockBasedTableBuilder : public TableBuilder {
  public:
 // Create a builder that will store the contents of the table it is
 // building in *file. Does not close the file. It is up to the
 // caller to close the file after calling Finish().
 BlockBasedTableBuilder(const Options& options,
+                       const InternalKeyComparator& internal_comparator,
                        WritableFile* file,
                        FlushBlockPolicyFactory* flush_block_policy_factory,
                        CompressionType compression_type);
@@ -18,17 +18,19 @@

 namespace rocksdb {

-Status BlockBasedTableFactory::GetTableReader(
+Status BlockBasedTableFactory::NewTableReader(
     const Options& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator,
     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
     unique_ptr<TableReader>* table_reader) const {
 return BlockBasedTable::Open(options, soptions, table_options_,
-                             std::move(file), file_size, table_reader);
+                             internal_comparator, std::move(file), file_size,
+                             table_reader);
 }

-TableBuilder* BlockBasedTableFactory::GetTableBuilder(
-    const Options& options, WritableFile* file,
-    CompressionType compression_type) const {
+TableBuilder* BlockBasedTableFactory::NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type) const {
 auto flush_block_policy_factory =
     table_options_.flush_block_policy_factory.get();

@@ -45,11 +47,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder(
 options.block_size_deviation);
 }

-auto table_builder = new BlockBasedTableBuilder(
-    options,
-    file,
-    flush_block_policy_factory,
-    compression_type);
+auto table_builder =
+    new BlockBasedTableBuilder(options, internal_comparator, file,
+                               flush_block_policy_factory, compression_type);

 // Delete flush_block_policy_factory only when it's just created from the
 // options.
@@ -63,4 +63,9 @@ TableBuilder* BlockBasedTableFactory::GetTableBuilder(
 return table_builder;
 }

+TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options) {
+  return new BlockBasedTableFactory(table_options);
+}
+
 } // namespace rocksdb
@@ -14,7 +14,6 @@
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/options.h"
 #include "rocksdb/table.h"
-#include "table/block_based_table_options.h"

 namespace rocksdb {

@@ -22,31 +21,26 @@ struct Options;
 struct EnvOptions;

 using std::unique_ptr;
-class Status;
-class RandomAccessFile;
-class WritableFile;
-class Table;
-class TableBuilder;
-class BlockBasedTable;
 class BlockBasedTableBuilder;

-class BlockBasedTableFactory: public TableFactory {
+class BlockBasedTableFactory : public TableFactory {
  public:
-  BlockBasedTableFactory() : BlockBasedTableFactory(BlockBasedTableOptions()) {}
-  explicit BlockBasedTableFactory(const BlockBasedTableOptions& table_options)
+  explicit BlockBasedTableFactory(
+      const BlockBasedTableOptions& table_options = BlockBasedTableOptions())
       : table_options_(table_options) {}

 ~BlockBasedTableFactory() {}

 const char* Name() const override { return "BlockBasedTable"; }

-Status GetTableReader(const Options& options, const EnvOptions& soptions,
+Status NewTableReader(const Options& options, const EnvOptions& soptions,
+                      const InternalKeyComparator& internal_comparator,
                       unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                       unique_ptr<TableReader>* table_reader) const override;

-TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
-                              CompressionType compression_type)
-    const override;
+TableBuilder* NewTableBuilder(
+    const Options& options, const InternalKeyComparator& internal_comparator,
+    WritableFile* file, CompressionType compression_type) const override;

  private:
 BlockBasedTableOptions table_options_;
@@ -1,31 +0,0 @@
-// Copyright (c) 2013, Facebook, Inc. All rights reserved.
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree. An additional grant
-// of patent rights can be found in the PATENTS file in the same directory.
-
-#pragma once
-#include <memory>
-
-namespace rocksdb {
-
-class FlushBlockPolicyFactory;
-
-struct BlockBasedTableOptions {
-  // @flush_block_policy_factory creates the instances of flush block policy.
-  // which provides a configurable way to determine when to flush a block in
-  // the block based tables. If not set, table builder will use the default
-  // block flush policy, which cut blocks by block size (please refer to
-  // `FlushBlockBySizePolicy`).
-  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
-
-  // TODO(kailiu) Temporarily disable this feature by making the default value
-  // to be false. Also in master branch, this file is non-public so no user
-  // will be able to change the value of `cache_index_and_filter_blocks`.
-  //
-  // Indicating if we'd put index/filter blocks to the block cache.
-  // If not specified, each "table reader" object will pre-load index/filter
-  // block during table initialization.
-  bool cache_index_and_filter_blocks = false;
-};
-
-} // namespace rocksdb
@@ -21,15 +21,17 @@
 #include "table/block.h"
 #include "table/filter_block.h"
 #include "table/format.h"
+#include "table/meta_blocks.h"
 #include "table/two_level_iterator.h"

 #include "util/coding.h"
 #include "util/perf_context_imp.h"
 #include "util/stop_watch.h"
-#include "table/block_based_table_options.h"

 namespace rocksdb {

+extern uint64_t kBlockBasedTableMagicNumber;
+
 // The longest the prefix of the cache key used to identify blocks can be.
 // We are using the fact that we know for Posix files the unique ID is three
 // varints.
@@ -37,12 +39,13 @@ const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
 using std::unique_ptr;

 struct BlockBasedTable::Rep {
-Rep(const EnvOptions& storage_options) :
-    soptions(storage_options) {
-}
+Rep(const EnvOptions& storage_options,
+    const InternalKeyComparator& internal_comparator)
+    : soptions(storage_options), internal_comparator_(internal_comparator) {}

 Options options;
 const EnvOptions& soptions;
+const InternalKeyComparator& internal_comparator_;
 Status status;
 unique_ptr<RandomAccessFile> file;
 char cache_key_prefix[kMaxCacheKeyPrefixSize];
@@ -223,34 +226,19 @@ Cache::Handle* GetFromBlockCache(

 Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
                              const BlockBasedTableOptions& table_options,
+                             const InternalKeyComparator& internal_comparator,
                              unique_ptr<RandomAccessFile>&& file,
                              uint64_t file_size,
                              unique_ptr<TableReader>* table_reader) {
 table_reader->reset();

-if (file_size < Footer::kEncodedLength) {
-  return Status::InvalidArgument("file is too short to be an sstable");
-}
-
-char footer_space[Footer::kEncodedLength];
-Slice footer_input;
-Status s = file->Read(file_size - Footer::kEncodedLength,
-                      Footer::kEncodedLength, &footer_input, footer_space);
-if (!s.ok()) return s;
-
-// Check that we actually read the whole footer from the file. It may be
-// that size isn't correct.
-if (footer_input.size() != Footer::kEncodedLength) {
-  return Status::InvalidArgument("file is too short to be an sstable");
-}
-
-Footer footer;
-s = footer.DecodeFrom(&footer_input);
+Footer footer(kBlockBasedTableMagicNumber);
+auto s = ReadFooterFromFile(file.get(), file_size, &footer);
 if (!s.ok()) return s;

 // We've successfully read the footer and the index block: we're
 // ready to serve requests.
-Rep* rep = new BlockBasedTable::Rep(soptions);
+Rep* rep = new BlockBasedTable::Rep(soptions, internal_comparator);
 rep->options = options;
 rep->file = std::move(file);
 rep->metaindex_handle = footer.metaindex_handle();
|
|||||||
|
|
||||||
// Read the properties
|
// Read the properties
|
||||||
meta_iter->Seek(kPropertiesBlock);
|
meta_iter->Seek(kPropertiesBlock);
|
||||||
if (meta_iter->Valid() && meta_iter->key() == Slice(kPropertiesBlock)) {
|
if (meta_iter->Valid() && meta_iter->key() == kPropertiesBlock) {
|
||||||
s = meta_iter->status();
|
s = meta_iter->status();
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
s = ReadProperties(meta_iter->value(), rep, &rep->table_properties);
|
s = ReadProperties(meta_iter->value(), rep->file.get(), rep->options.env,
|
||||||
|
rep->options.info_log.get(), &rep->table_properties);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
@ -350,7 +339,7 @@ void BlockBasedTable::SetupForCompaction() {
|
|||||||
compaction_optimized_ = true;
|
compaction_optimized_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
TableProperties& BlockBasedTable::GetTableProperties() {
|
const TableProperties& BlockBasedTable::GetTableProperties() {
|
||||||
return rep_->table_properties;
|
return rep_->table_properties;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -415,96 +404,6 @@ FilterBlockReader* BlockBasedTable::ReadFilter (
 rep->options, block.data, block.heap_allocated);
 }

-Status BlockBasedTable::ReadProperties(
-    const Slice& handle_value, Rep* rep, TableProperties* table_properties) {
-  assert(table_properties);
-
-  Slice v = handle_value;
-  BlockHandle handle;
-  if (!handle.DecodeFrom(&v).ok()) {
-    return Status::InvalidArgument("Failed to decode properties block handle");
-  }
-
-  BlockContents block_contents;
-  Status s = ReadBlockContents(
-      rep->file.get(),
-      ReadOptions(),
-      handle,
-      &block_contents,
-      rep->options.env,
-      false
-  );
-
-  if (!s.ok()) {
-    return s;
-  }
-
-  Block properties_block(block_contents);
-  std::unique_ptr<Iterator> iter(
-      properties_block.NewIterator(BytewiseComparator())
-  );
-
-  // All pre-defined properties of type uint64_t
-  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
-      { BlockBasedTablePropertiesNames::kDataSize,
-        &table_properties->data_size },
-      { BlockBasedTablePropertiesNames::kIndexSize,
-        &table_properties->index_size },
-      { BlockBasedTablePropertiesNames::kFilterSize,
-        &table_properties->filter_size },
-      { BlockBasedTablePropertiesNames::kRawKeySize,
-        &table_properties->raw_key_size },
-      { BlockBasedTablePropertiesNames::kRawValueSize,
-        &table_properties->raw_value_size },
-      { BlockBasedTablePropertiesNames::kNumDataBlocks,
-        &table_properties->num_data_blocks },
-      { BlockBasedTablePropertiesNames::kNumEntries,
-        &table_properties->num_entries },
-  };
-
-  std::string last_key;
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    s = iter->status();
-    if (!s.ok()) {
-      break;
-    }
-
-    auto key = iter->key().ToString();
-    // properties block is strictly sorted with no duplicate key.
-    assert(
-        last_key.empty() ||
-        BytewiseComparator()->Compare(key, last_key) > 0
-    );
-    last_key = key;
-
-    auto raw_val = iter->value();
-    auto pos = predefined_uint64_properties.find(key);
-
-    if (pos != predefined_uint64_properties.end()) {
-      // handle predefined rocksdb properties
-      uint64_t val;
-      if (!GetVarint64(&raw_val, &val)) {
-        // skip malformed value
-        auto error_msg =
-            "[Warning] detect malformed value in properties meta-block:"
-            "\tkey: " + key + "\tval: " + raw_val.ToString();
-        Log(rep->options.info_log, "%s", error_msg.c_str());
-        continue;
-      }
-      *(pos->second) = val;
-    } else if (key == BlockBasedTablePropertiesNames::kFilterPolicy) {
-      table_properties->filter_policy_name = raw_val.ToString();
-    } else {
-      // handle user-collected
-      table_properties->user_collected_properties.insert(
-          std::make_pair(key, raw_val.ToString())
-      );
-    }
-  }
-
-  return s;
-}
-
 Status BlockBasedTable::GetBlock(
     const BlockBasedTable* table,
     const BlockHandle& handle,
@ -764,7 +663,7 @@ Iterator* BlockBasedTable::BlockReader(void* arg,
|
|||||||
|
|
||||||
Iterator* iter;
|
Iterator* iter;
|
||||||
if (block != nullptr) {
|
if (block != nullptr) {
|
||||||
iter = block->NewIterator(table->rep_->options.comparator);
|
iter = block->NewIterator(&(table->rep_->internal_comparator_));
|
||||||
if (cache_handle != nullptr) {
|
if (cache_handle != nullptr) {
|
||||||
iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
|
iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
|
||||||
} else {
|
} else {
|
||||||
@ -837,7 +736,7 @@ BlockBasedTable::GetFilter(bool no_io) const {
 // Get the iterator from the index block.
 Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {
   if (rep_->index_block) {
-    return rep_->index_block->NewIterator(rep_->options.comparator);
+    return rep_->index_block->NewIterator(&(rep_->internal_comparator_));
   }

   // get index block from cache
@ -858,7 +757,7 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {
   Iterator* iter;
   if (entry.value != nullptr) {
-    iter = entry.value->NewIterator(rep_->options.comparator);
+    iter = entry.value->NewIterator(&(rep_->internal_comparator_));
     if (entry.cache_handle) {
       iter->RegisterCleanup(
           &ReleaseBlock, rep_->options.block_cache.get(), entry.cache_handle
@ -872,9 +771,9 @@ Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const {
   return iter;
 }

-Iterator* BlockBasedTable::BlockReader(void* arg,
-                                       const ReadOptions& options,
+Iterator* BlockBasedTable::BlockReader(void* arg, const ReadOptions& options,
                                        const EnvOptions& soptions,
+                                       const InternalKeyComparator& icomparator,
                                        const Slice& index_value,
                                        bool for_compaction) {
   return BlockReader(arg, options, index_value, nullptr, for_compaction);
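The three hunks above replace `rep_->options.comparator` with the table's `internal_comparator_` when opening block iterators, and the private `BlockReader` overload now receives the `InternalKeyComparator` explicitly. A minimal sketch of the distinction, assuming the usual `InternalKeyComparator` from db/dbformat.h; the helper below is illustrative, not part of this change:

#include "db/dbformat.h"
#include "rocksdb/comparator.h"

// Blocks inside an sst file hold internal keys (user key + sequence number +
// value type), so their iterators must be ordered by an InternalKeyComparator
// wrapping the user comparator, not by the raw user comparator itself.
rocksdb::InternalKeyComparator MakeInternalComparator(
    const rocksdb::Comparator* user_comparator) {
  return rocksdb::InternalKeyComparator(user_comparator);
}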
@ -965,20 +864,15 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& options) {
     }
   }

-  return NewTwoLevelIterator(
-      IndexBlockReader(options),
-      &BlockBasedTable::BlockReader,
-      const_cast<BlockBasedTable*>(this),
-      options,
-      rep_->soptions
-  );
+  return NewTwoLevelIterator(IndexBlockReader(options),
+                             &BlockBasedTable::BlockReader,
+                             const_cast<BlockBasedTable*>(this), options,
+                             rep_->soptions, rep_->internal_comparator_);
 }

 Status BlockBasedTable::Get(
-    const ReadOptions& readOptions,
-    const Slice& key,
-    void* handle_context,
-    bool (*result_handler)(void* handle_context, const Slice& k,
+    const ReadOptions& readOptions, const Slice& key, void* handle_context,
+    bool (*result_handler)(void* handle_context, const ParsedInternalKey& k,
                            const Slice& v, bool didIO),
     void (*mark_key_may_exist_handler)(void* handle_context)) {
   Status s;
@ -1016,8 +910,13 @@ Status BlockBasedTable::Get(

   // Call the *saver function on each entry/block until it returns false
   for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) {
-    if (!(*result_handler)(handle_context, block_iter->key(),
-                           block_iter->value(), didIO)) {
+    ParsedInternalKey parsed_key;
+    if (!ParseInternalKey(block_iter->key(), &parsed_key)) {
+      s = Status::Corruption(Slice());
+    }
+
+    if (!(*result_handler)(handle_context, parsed_key, block_iter->value(),
+                           didIO)) {
       done = true;
       break;
     }
@ -1034,7 +933,8 @@ Status BlockBasedTable::Get(
   return s;
 }

-bool SaveDidIO(void* arg, const Slice& key, const Slice& value, bool didIO) {
+bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
+               bool didIO) {
   *reinterpret_cast<bool*>(arg) = didIO;
   return false;
 }
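`Get()` now hands the block iterator's key to the callback as a `ParsedInternalKey` (see the updated `SaveDidIO` above), so a saver sees the user key, sequence number, and value type separately. A minimal sketch of a callback matching the new signature; `SaverState` and `SaveValue` are illustrative names, not part of this change:

#include <string>
#include "db/dbformat.h"
#include "rocksdb/slice.h"

struct SaverState {       // hypothetical caller-side state
  std::string value;
  bool found = false;
};

static bool SaveValue(void* arg, const rocksdb::ParsedInternalKey& key,
                      const rocksdb::Slice& value, bool /*didIO*/) {
  auto* state = reinterpret_cast<SaverState*>(arg);
  if (key.type == rocksdb::kTypeValue) {   // ignore deletions/merges here
    state->value.assign(value.data(), value.size());
    state->found = true;
  }
  return false;  // stop scanning, mirroring SaveDidIO above
}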
@ -1075,25 +975,4 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
   return result;
 }

-const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";
-const std::string BlockBasedTable::kPropertiesBlock = "rocksdb.properties";
-const std::string BlockBasedTablePropertiesNames::kDataSize = "rocksdb.data.size";
-const std::string BlockBasedTablePropertiesNames::kIndexSize = "rocksdb.index.size";
-const std::string BlockBasedTablePropertiesNames::kFilterSize = "rocksdb.filter.size";
-const std::string BlockBasedTablePropertiesNames::kRawKeySize = "rocksdb.raw.key.size";
-const std::string BlockBasedTablePropertiesNames::kRawValueSize = "rocksdb.raw.value.size";
-const std::string BlockBasedTablePropertiesNames::kNumDataBlocks = "rocksdb.num.data.blocks";
-const std::string BlockBasedTablePropertiesNames::kNumEntries = "rocksdb.num.entries";
-const std::string BlockBasedTablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy";
-
 } // namespace rocksdb

@ -14,8 +14,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/statistics.h"
-#include "rocksdb/table_properties.h"
-#include "rocksdb/table.h"
+#include "table/table_reader.h"
 #include "util/coding.h"

 namespace rocksdb {
@ -39,7 +38,6 @@ using std::unique_ptr;
 class BlockBasedTable : public TableReader {
  public:
  static const std::string kFilterBlockPrefix;
-  static const std::string kPropertiesBlock;

  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
@ -53,6 +51,7 @@ class BlockBasedTable : public TableReader {
  // *file must remain live while this Table is in use.
  static Status Open(const Options& db_options, const EnvOptions& env_options,
                     const BlockBasedTableOptions& table_options,
+                    const InternalKeyComparator& internal_key_comparator,
                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                     unique_ptr<TableReader>* table_reader);

@ -63,14 +62,13 @@ class BlockBasedTable : public TableReader {
  // call one of the Seek methods on the iterator before using it).
  Iterator* NewIterator(const ReadOptions&) override;

-  Status Get(
-      const ReadOptions& readOptions,
-      const Slice& key,
-      void* handle_context,
-      bool (*result_handler)(void* handle_context, const Slice& k,
-                             const Slice& v, bool didIO),
-      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr)
-      override;
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             void* handle_context,
+             bool (*result_handler)(void* handle_context,
+                                    const ParsedInternalKey& k, const Slice& v,
+                                    bool didIO),
+             void (*mark_key_may_exist_handler)(void* handle_context) =
+                 nullptr) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
@ -82,13 +80,13 @@ class BlockBasedTable : public TableReader {

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table.
-  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for Compaction. Might change some parameters with
  // posix_fadvise
  void SetupForCompaction() override;

-  TableProperties& GetTableProperties() override;
+  const TableProperties& GetTableProperties() override;

  ~BlockBasedTable();

@ -101,8 +99,9 @@ class BlockBasedTable : public TableReader {
  bool compaction_optimized_;

  static Iterator* BlockReader(void*, const ReadOptions&,
-                               const EnvOptions& soptions, const Slice&,
-                               bool for_compaction);
+                               const EnvOptions& soptions,
+                               const InternalKeyComparator& icomparator,
+                               const Slice&, bool for_compaction);

  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
                               bool* didIO, bool for_compaction = false);
@ -142,7 +141,6 @@ class BlockBasedTable : public TableReader {

  void ReadMeta(const Footer& footer);
  void ReadFilter(const Slice& filter_handle_value);
-  static Status ReadProperties(const Slice& handle_value, Rep* rep);

  // Read the meta block from sst.
  static Status ReadMetaBlock(
@ -156,10 +154,6 @@ class BlockBasedTable : public TableReader {
      Rep* rep,
      size_t* filter_size = nullptr);

-  // Read the table properties from properties block.
-  static Status ReadProperties(
-      const Slice& handle_value, Rep* rep, TableProperties* properties);
-
  static void SetupCacheKeyPrefix(Rep* rep);

  explicit BlockBasedTable(Rep* rep) :
@ -181,15 +175,4 @@ class BlockBasedTable : public TableReader {
  void operator=(const TableReader&) = delete;
 };

-struct BlockBasedTablePropertiesNames {
-  static const std::string kDataSize;
-  static const std::string kIndexSize;
-  static const std::string kFilterSize;
-  static const std::string kRawKeySize;
-  static const std::string kRawValueSize;
-  static const std::string kNumDataBlocks;
-  static const std::string kNumEntries;
-  static const std::string kFilterPolicy;
-};
-
 } // namespace rocksdb
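`BlockBasedTable::Open()` now takes the `InternalKeyComparator` alongside the option structs, and the comparator has to outlive the returned reader. A minimal sketch of a caller, assuming the header path table/block_based_table_reader.h; `OpenTable` is an illustrative wrapper, not part of this change:

#include <memory>
#include "db/dbformat.h"
#include "table/block_based_table_reader.h"   // assumed header path

rocksdb::Status OpenTable(const rocksdb::Options& options,
                          const rocksdb::EnvOptions& env_options,
                          const rocksdb::BlockBasedTableOptions& table_options,
                          const rocksdb::InternalKeyComparator& internal_cmp,
                          std::unique_ptr<rocksdb::RandomAccessFile>&& file,
                          uint64_t file_size,
                          std::unique_ptr<rocksdb::TableReader>* reader) {
  // The comparator is taken by reference rather than built locally so that it
  // outlives the TableReader, which keeps referring to it via Rep.
  return rocksdb::BlockBasedTable::Open(options, env_options, table_options,
                                        internal_cmp, std::move(file),
                                        file_size, reader);
}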

@ -36,6 +36,7 @@
 #include <algorithm>
 #include <assert.h>
 #include "rocksdb/comparator.h"
+#include "db/dbformat.h"
 #include "util/coding.h"

 namespace rocksdb {
@ -51,9 +52,8 @@ BlockBuilder::BlockBuilder(int block_restart_interval,
   restarts_.push_back(0);       // First restart point is at offset 0
 }

-BlockBuilder::BlockBuilder(const Options& options)
-    : BlockBuilder(options.block_restart_interval, options.comparator) {
-}
+BlockBuilder::BlockBuilder(const Options& options, const Comparator* comparator)
+    : BlockBuilder(options.block_restart_interval, comparator) {}

 void BlockBuilder::Reset() {
   buffer_.clear();
@ -21,7 +21,7 @@ class Comparator;
 class BlockBuilder {
  public:
  BlockBuilder(int block_builder, const Comparator* comparator);
-  explicit BlockBuilder(const Options& options);
+  explicit BlockBuilder(const Options& options, const Comparator* comparator);

  // Reset the contents as if the BlockBuilder was just constructed.
  void Reset();

@ -32,9 +32,12 @@ class BlockTest {};
 TEST(BlockTest, SimpleTest) {
   Random rnd(301);
   Options options = Options();
+  std::unique_ptr<InternalKeyComparator> ic;
+  ic.reset(new test::PlainInternalKeyComparator(options.comparator));
+
   std::vector<std::string> keys;
   std::vector<std::string> values;
-  BlockBuilder builder(options);
+  BlockBuilder builder(options, ic.get());
   int num_records = 100000;
   char buf[10];
   char* p = &buf[0];

@ -21,11 +21,12 @@ namespace rocksdb {
 static const size_t kFilterBaseLg = 11;
 static const size_t kFilterBase = 1 << kFilterBaseLg;

-FilterBlockBuilder::FilterBlockBuilder(const Options& opt)
-    : policy_(opt.filter_policy),
-      prefix_extractor_(opt.prefix_extractor),
-      whole_key_filtering_(opt.whole_key_filtering),
-      comparator_(opt.comparator){}
+FilterBlockBuilder::FilterBlockBuilder(const Options& opt,
+                                       const Comparator* internal_comparator)
+    : policy_(opt.filter_policy),
+      prefix_extractor_(opt.prefix_extractor),
+      whole_key_filtering_(opt.whole_key_filtering),
+      comparator_(internal_comparator) {}

 void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
   uint64_t filter_index = (block_offset / kFilterBase);
@ -35,7 +35,8 @@ class FilterPolicy;
 // (StartBlock AddKey*)* Finish
 class FilterBlockBuilder {
  public:
-  explicit FilterBlockBuilder(const Options& opt);
+  explicit FilterBlockBuilder(const Options& opt,
+                              const Comparator* internal_comparator);

  void StartBlock(uint64_t block_offset);
  void AddKey(const Slice& key);
@ -55,7 +55,7 @@ class FilterBlockTest {
 };

 TEST(FilterBlockTest, EmptyBuilder) {
-  FilterBlockBuilder builder(options_);
+  FilterBlockBuilder builder(options_, options_.comparator);
   Slice block = builder.Finish();
   ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
   FilterBlockReader reader(options_, block);
@ -64,7 +64,7 @@ TEST(FilterBlockTest, EmptyBuilder) {
 }

 TEST(FilterBlockTest, SingleChunk) {
-  FilterBlockBuilder builder(options_);
+  FilterBlockBuilder builder(options_, options_.comparator);
   builder.StartBlock(100);
   builder.AddKey("foo");
   builder.AddKey("bar");
@ -85,7 +85,7 @@ TEST(FilterBlockTest, SingleChunk) {
 }

 TEST(FilterBlockTest, MultiChunk) {
-  FilterBlockBuilder builder(options_);
+  FilterBlockBuilder builder(options_, options_.comparator);

  // First filter
  builder.StartBlock(0);
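`FilterBlockBuilder` no longer pulls the comparator out of `Options`; the updated tests simply pass `options_.comparator`. A minimal sketch of building and probing one filter partition the same way, assuming `options.filter_policy` is set and that `FilterBlockReader::KeyMayMatch(block_offset, key)` keeps its existing signature:

#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "table/filter_block.h"   // assumed header path

// Build a single filter partition starting at data-block offset 0 and ask it
// about a key that was added; with a working filter policy this returns true.
bool BuildAndProbe(const rocksdb::Options& options) {
  rocksdb::FilterBlockBuilder builder(options, options.comparator);
  builder.StartBlock(0);
  builder.AddKey("foo");
  builder.AddKey("bar");
  rocksdb::Slice block = builder.Finish();

  rocksdb::FilterBlockReader reader(options, block);
  return reader.KeyMayMatch(0, "foo");
}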

@ -34,6 +34,7 @@ Status BlockHandle::DecodeFrom(Slice* input) {
     return Status::Corruption("bad block handle");
   }
 }
+const BlockHandle BlockHandle::kNullBlockHandle(0, 0);

 void Footer::EncodeTo(std::string* dst) const {
 #ifndef NDEBUG
@ -72,6 +73,30 @@ Status Footer::DecodeFrom(Slice* input) {
   return result;
 }

+Status ReadFooterFromFile(RandomAccessFile* file,
+                          uint64_t file_size,
+                          Footer* footer) {
+  if (file_size < Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  char footer_space[Footer::kEncodedLength];
+  Slice footer_input;
+  Status s = file->Read(file_size - Footer::kEncodedLength,
+                        Footer::kEncodedLength,
+                        &footer_input,
+                        footer_space);
+  if (!s.ok()) return s;
+
+  // Check that we actually read the whole footer from the file. It may be
+  // that size isn't correct.
+  if (footer_input.size() != Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  return footer->DecodeFrom(&footer_input);
+}
+
 Status ReadBlockContents(RandomAccessFile* file,
                          const ReadOptions& options,
                          const BlockHandle& handle,
@ -26,6 +26,7 @@ struct ReadOptions;
 class BlockHandle {
  public:
  BlockHandle();
+  BlockHandle(uint64_t offset, uint64_t size);

  // The offset of the block in the file.
  uint64_t offset() const { return offset_; }
@ -38,19 +39,36 @@ class BlockHandle {
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* input);

+  // if the block handle's offset and size are both "0", we will view it
+  // as a null block handle that points to nowhere.
+  bool IsNull() const {
+    return offset_ == 0 && size_ == 0;
+  }
+
+  static const BlockHandle& NullBlockHandle() {
+    return kNullBlockHandle;
+  }
+
  // Maximum encoding length of a BlockHandle
  enum { kMaxEncodedLength = 10 + 10 };

 private:
-  uint64_t offset_;
-  uint64_t size_;
+  uint64_t offset_ = 0;
+  uint64_t size_ = 0;
+
+  static const BlockHandle kNullBlockHandle;
 };

 // Footer encapsulates the fixed information stored at the tail
 // end of every table file.
 class Footer {
  public:
-  Footer() { }
+  // @table_magic_number serves two purposes:
+  //  1. Identify different types of the tables.
+  //  2. Help us to identify if a given file is a valid sst.
+  Footer(uint64_t table_magic_number) :
+      kTableMagicNumber(table_magic_number) {
+  }

  // The block handle for the metaindex block of the table
  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
@ -77,12 +95,13 @@ class Footer {
 private:
  BlockHandle metaindex_handle_;
  BlockHandle index_handle_;
+  const uint64_t kTableMagicNumber;
 };

-// kTableMagicNumber was picked by running
-//    echo http://code.google.com/p/leveldb/ | sha1sum
-// and taking the leading 64 bits.
-static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
+// Read the footer from file
+Status ReadFooterFromFile(RandomAccessFile* file,
+                          uint64_t file_size,
+                          Footer* footer);

 // 1-byte type + 32-bit crc
 static const size_t kBlockTrailerSize = 5;
@ -115,8 +134,13 @@ extern Status UncompressBlockContents(const char* data,
 // Implementation details follow.  Clients should ignore,

 inline BlockHandle::BlockHandle()
-    : offset_(~static_cast<uint64_t>(0)),
-      size_(~static_cast<uint64_t>(0)) {
+    : BlockHandle(~static_cast<uint64_t>(0),
+                  ~static_cast<uint64_t>(0)) {
+}
+
+inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size)
+    : offset_(offset),
+      size_(size) {
 }

 } // namespace rocksdb
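With the footer now carrying a table magic number, a reader opens a file by constructing a `Footer` with the magic number it expects and decoding it through the new `ReadFooterFromFile()`. A minimal sketch, assuming an already-opened `RandomAccessFile`; `CheckFooter` is an illustrative helper, not part of this change:

#include "rocksdb/env.h"
#include "rocksdb/status.h"
#include "table/format.h"   // assumed header path

rocksdb::Status CheckFooter(rocksdb::RandomAccessFile* file,
                            uint64_t file_size,
                            uint64_t expected_magic_number) {
  rocksdb::Footer footer(expected_magic_number);
  rocksdb::Status s = rocksdb::ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) {
    return s;
  }
  // A null handle (offset == 0 && size == 0) is how formats without a
  // separate index block, such as the plain table below, mark it as absent.
  if (footer.index_handle().IsNull()) {
    // fall back to whatever the format prescribes instead of an index block
  }
  return s;
}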
|
@ -11,8 +11,11 @@

 #include "rocksdb/comparator.h"
 #include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
 #include "table/iter_heap.h"
 #include "table/iterator_wrapper.h"
+#include "util/stop_watch.h"
+#include "util/perf_context_imp.h"

 #include <vector>

@ -22,10 +25,13 @@ namespace {

 class MergingIterator : public Iterator {
  public:
-  MergingIterator(const Comparator* comparator, Iterator** children, int n)
+  MergingIterator(Env* const env, const Comparator* comparator,
+                  Iterator** children, int n)
      : comparator_(comparator),
        children_(n),
        current_(nullptr),
+        use_heap_(true),
+        env_(env),
        direction_(kForward),
        maxHeap_(NewMaxIterHeap(comparator_)),
        minHeap_ (NewMinIterHeap(comparator_)) {
@ -70,15 +76,52 @@ class MergingIterator : public Iterator {
   }

   virtual void Seek(const Slice& target) {
-    ClearHeaps();
+    // Invalidate the heap.
+    use_heap_ = false;
+    IteratorWrapper* first_child = nullptr;
+    StopWatchNano child_seek_timer(env_, false);
+    StopWatchNano min_heap_timer(env_, false);
     for (auto& child : children_) {
+      StartPerfTimer(&child_seek_timer);
       child.Seek(target);
+      BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer);
+      BumpPerfCount(&perf_context.seek_child_seek_count);
+
       if (child.Valid()) {
-        minHeap_.push(&child);
+        // This child has a valid key
+        if (!use_heap_) {
+          if (first_child == nullptr) {
+            // It's the first child that has a valid key. Only put it into
+            // current_. Now the values in the heap should be invalid.
+            first_child = &child;
+          } else {
+            // We have more than one child with valid keys. Initialize
+            // the heap and put the first child into the heap.
+            StartPerfTimer(&min_heap_timer);
+            ClearHeaps();
+            BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
+            StartPerfTimer(&min_heap_timer);
+            minHeap_.push(first_child);
+            BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
+          }
+        }
+        if (use_heap_) {
+          StartPerfTimer(&min_heap_timer);
+          minHeap_.push(&child);
+          BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
+        }
       }
     }
-    FindSmallest();
-    direction_ = kForward;
+    if (use_heap_) {
+      // If the heap is valid, we need to put the smallest key into current_.
+      StartPerfTimer(&min_heap_timer);
+      FindSmallest();
+      BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
+    } else {
+      // The heap is not valid, so current_ is the first child's iterator,
+      // or null if there is no first child.
+      current_ = first_child;
+    }
   }

   virtual void Next() {
@ -109,10 +152,14 @@ class MergingIterator : public Iterator {
     // as the current points to the current record. move the iterator forward.
     // and if it is valid add it to the heap.
     current_->Next();
-    if (current_->Valid()){
-      minHeap_.push(current_);
+    if (use_heap_) {
+      if (current_->Valid()) {
+        minHeap_.push(current_);
+      }
+      FindSmallest();
+    } else if (!current_->Valid()) {
+      current_ = nullptr;
     }
-    FindSmallest();
   }

   virtual void Prev() {
@ -178,6 +225,11 @@ class MergingIterator : public Iterator {
   const Comparator* comparator_;
   std::vector<IteratorWrapper> children_;
   IteratorWrapper* current_;
+  // If the value is true, both the iterators in the heap and current_
+  // contain valid rows. If it is false, only current_ can possibly contain
+  // valid rows.
+  bool use_heap_;
+  Env* const env_;
   // Which direction is the iterator moving?
   enum Direction {
     kForward,
@ -189,6 +241,7 @@ class MergingIterator : public Iterator {
 };

 void MergingIterator::FindSmallest() {
+  assert(use_heap_);
   if (minHeap_.empty()) {
     current_ = nullptr;
   } else {
@ -199,6 +252,7 @@ void MergingIterator::FindSmallest() {
 }

 void MergingIterator::FindLargest() {
+  assert(use_heap_);
   if (maxHeap_.empty()) {
     current_ = nullptr;
   } else {
@ -209,19 +263,21 @@ void MergingIterator::FindLargest() {
 }

 void MergingIterator::ClearHeaps() {
+  use_heap_ = true;
   maxHeap_ = NewMaxIterHeap(comparator_);
   minHeap_ = NewMinIterHeap(comparator_);
 }
 }  // namespace

-Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
+Iterator* NewMergingIterator(Env* const env, const Comparator* cmp,
+                             Iterator** list, int n) {
   assert(n >= 0);
   if (n == 0) {
     return NewEmptyIterator();
   } else if (n == 1) {
     return list[0];
   } else {
-    return new MergingIterator(cmp, list, n);
+    return new MergingIterator(env, cmp, list, n);
   }
 }

@ -13,6 +13,7 @@ namespace rocksdb {

 class Comparator;
 class Iterator;
+class Env;

 // Return an iterator that provided the union of the data in
 // children[0,n-1]. Takes ownership of the child iterators and
@ -22,7 +23,8 @@ class Iterator;
 // key is present in K child iterators, it will be yielded K times.
 //
 // REQUIRES: n >= 0
-extern Iterator* NewMergingIterator(
-    const Comparator* comparator, Iterator** children, int n);
+extern Iterator* NewMergingIterator(Env* const env,
+                                    const Comparator* comparator,
+                                    Iterator** children, int n);

 } // namespace rocksdb
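Callers of `NewMergingIterator()` now also supply an `Env*`, which feeds the `StopWatchNano` perf-context timers added to `Seek()`. A minimal sketch, assuming the header path table/merger.h; `MergeChildren` is an illustrative wrapper, not part of this change:

#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "table/merger.h"   // assumed header path

rocksdb::Iterator* MergeChildren(rocksdb::Iterator** children, int n) {
  // Env::Default() supplies the clock used by the perf timers inside Seek().
  return rocksdb::NewMergingIterator(rocksdb::Env::Default(),
                                     rocksdb::BytewiseComparator(),
                                     children, n);
}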
table/meta_blocks.cc  (new file, 286 lines)
@ -0,0 +1,286 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include "table/meta_blocks.h"

#include <map>

#include "rocksdb/table.h"
#include "table/block.h"
#include "table/format.h"
#include "util/coding.h"

namespace rocksdb {

MetaIndexBuilder::MetaIndexBuilder()
    : meta_index_block_(
        new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
}

void MetaIndexBuilder::Add(const std::string& key,
                           const BlockHandle& handle) {
  std::string handle_encoding;
  handle.EncodeTo(&handle_encoding);
  meta_block_handles_.insert({key, handle_encoding});
}

Slice MetaIndexBuilder::Finish() {
  for (const auto& metablock : meta_block_handles_) {
    meta_index_block_->Add(metablock.first, metablock.second);
  }
  return meta_index_block_->Finish();
}

PropertyBlockBuilder::PropertyBlockBuilder()
    : properties_block_(
        new BlockBuilder(1 /* restart interval */, BytewiseComparator())) {
}

void PropertyBlockBuilder::Add(const std::string& name,
                               const std::string& val) {
  props_.insert({name, val});
}

void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
  assert(props_.find(name) == props_.end());

  std::string dst;
  PutVarint64(&dst, val);

  Add(name, dst);
}

void PropertyBlockBuilder::Add(
    const UserCollectedProperties& user_collected_properties) {
  for (const auto& prop : user_collected_properties) {
    Add(prop.first, prop.second);
  }
}

void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
  Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
  Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
  Add(TablePropertiesNames::kDataSize, props.data_size);
  Add(TablePropertiesNames::kIndexSize, props.index_size);
  Add(TablePropertiesNames::kNumEntries, props.num_entries);
  Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
  Add(TablePropertiesNames::kFilterSize, props.filter_size);
  Add(TablePropertiesNames::kFormatVersion, props.format_version);
  Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);

  if (!props.filter_policy_name.empty()) {
    Add(TablePropertiesNames::kFilterPolicy,
        props.filter_policy_name);
  }
}

Slice PropertyBlockBuilder::Finish() {
  for (const auto& prop : props_) {
    properties_block_->Add(prop.first, prop.second);
  }

  return properties_block_->Finish();
}

void LogPropertiesCollectionError(
    Logger* info_log, const std::string& method, const std::string& name) {
  assert(method == "Add" || method == "Finish");

  std::string msg =
      "[Warning] encountered error when calling TablePropertiesCollector::" +
      method + "() with collector name: " + name;
  Log(info_log, "%s", msg.c_str());
}

bool NotifyCollectTableCollectorsOnAdd(
    const Slice& key,
    const Slice& value,
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log) {
  bool all_succeeded = true;
  for (auto collector : collectors) {
    Status s = collector->Add(key, value);
    all_succeeded = all_succeeded && s.ok();
    if (!s.ok()) {
      LogPropertiesCollectionError(
          info_log, "Add" /* method */, collector->Name()
      );
    }
  }
  return all_succeeded;
}

bool NotifyCollectTableCollectorsOnFinish(
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log,
    PropertyBlockBuilder* builder) {
  bool all_succeeded = true;
  for (auto collector : collectors) {
    UserCollectedProperties user_collected_properties;
    Status s = collector->Finish(&user_collected_properties);

    all_succeeded = all_succeeded && s.ok();
    if (!s.ok()) {
      LogPropertiesCollectionError(
          info_log, "Finish" /* method */, collector->Name()
      );
    } else {
      builder->Add(user_collected_properties);
    }
  }

  return all_succeeded;
}

Status ReadProperties(
    const Slice& handle_value,
    RandomAccessFile* file,
    Env* env,
    Logger* logger,
    TableProperties* table_properties) {
  assert(table_properties);

  Slice v = handle_value;
  BlockHandle handle;
  if (!handle.DecodeFrom(&v).ok()) {
    return Status::InvalidArgument("Failed to decode properties block handle");
  }

  BlockContents block_contents;
  ReadOptions read_options;
  read_options.verify_checksums = false;
  Status s = ReadBlockContents(
      file,
      read_options,
      handle,
      &block_contents,
      env,
      false
  );

  if (!s.ok()) {
    return s;
  }

  Block properties_block(block_contents);
  std::unique_ptr<Iterator> iter(
      properties_block.NewIterator(BytewiseComparator())
  );

  // All pre-defined properties of type uint64_t
  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
    { TablePropertiesNames::kDataSize, &table_properties->data_size },
    { TablePropertiesNames::kIndexSize, &table_properties->index_size },
    { TablePropertiesNames::kFilterSize, &table_properties->filter_size },
    { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size },
    { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size },
    { TablePropertiesNames::kNumDataBlocks,
      &table_properties->num_data_blocks },
    { TablePropertiesNames::kNumEntries, &table_properties->num_entries },
    { TablePropertiesNames::kFormatVersion, &table_properties->format_version },
    { TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len },
  };

  std::string last_key;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    s = iter->status();
    if (!s.ok()) {
      break;
    }

    auto key = iter->key().ToString();
    // properties block is strictly sorted with no duplicate key.
    assert(
        last_key.empty() ||
        BytewiseComparator()->Compare(key, last_key) > 0
    );
    last_key = key;

    auto raw_val = iter->value();
    auto pos = predefined_uint64_properties.find(key);

    if (pos != predefined_uint64_properties.end()) {
      // handle predefined rocksdb properties
      uint64_t val;
      if (!GetVarint64(&raw_val, &val)) {
        // skip malformed value
        auto error_msg =
            "[Warning] detect malformed value in properties meta-block:"
            "\tkey: " + key + "\tval: " + raw_val.ToString();
        Log(logger, "%s", error_msg.c_str());
        continue;
      }
      *(pos->second) = val;
    } else if (key == TablePropertiesNames::kFilterPolicy) {
      table_properties->filter_policy_name = raw_val.ToString();
    } else {
      // handle user-collected properties
      table_properties->user_collected_properties.insert(
          std::make_pair(key, raw_val.ToString())
      );
    }
  }

  return s;
}

Status ReadTableProperties(
    RandomAccessFile* file,
    uint64_t file_size,
    uint64_t table_magic_number,
    Env* env,
    Logger* info_log,
    TableProperties* properties) {
  // -- Read metaindex block
  Footer footer(table_magic_number);
  auto s = ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) {
    return s;
  }

  auto metaindex_handle = footer.metaindex_handle();
  BlockContents metaindex_contents;
  ReadOptions read_options;
  read_options.verify_checksums = false;
  s = ReadBlockContents(
      file,
      read_options,
      metaindex_handle,
      &metaindex_contents,
      env,
      false
  );
  if (!s.ok()) {
    return s;
  }
  Block metaindex_block(metaindex_contents);
  std::unique_ptr<Iterator> meta_iter(
      metaindex_block.NewIterator(BytewiseComparator())
  );

  // -- Read property block
  meta_iter->Seek(kPropertiesBlock);
  TableProperties table_properties;
  if (meta_iter->Valid() &&
      meta_iter->key() == kPropertiesBlock &&
      meta_iter->status().ok()) {
    s = ReadProperties(
        meta_iter->value(),
        file,
        env,
        info_log,
        properties
    );
  } else {
    s = Status::Corruption(
        "Unable to read the property block from the plain table"
    );
  }

  return s;
}

} // namespace rocksdb
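The helpers above replace the old per-format `BlockBasedTable::ReadProperties()` and work for any table format, keyed by its magic number. A minimal sketch that reads one predefined property, reusing `kPlainTableMagicNumber` defined in table/plain_table_builder.cc further below; `DumpNumEntries` is an illustrative helper, not part of this change:

#include "rocksdb/env.h"
#include "table/meta_blocks.h"

namespace rocksdb {
extern const uint64_t kPlainTableMagicNumber;   // defined in plain_table_builder.cc
}

rocksdb::Status DumpNumEntries(rocksdb::RandomAccessFile* file,
                               uint64_t file_size, rocksdb::Env* env,
                               rocksdb::Logger* info_log,
                               uint64_t* num_entries) {
  rocksdb::TableProperties props;
  rocksdb::Status s = rocksdb::ReadTableProperties(
      file, file_size, rocksdb::kPlainTableMagicNumber, env, info_log, &props);
  if (s.ok()) {
    *num_entries = props.num_entries;
  }
  return s;
}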
table/meta_blocks.h  (new file, 121 lines)
@ -0,0 +1,121 @@
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once

#include <map>
#include <memory>
#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/table_properties.h"
#include "table/block_builder.h"

namespace rocksdb {

class BlockBuilder;
class BlockHandle;
class Env;
class Logger;
class RandomAccessFile;
struct TableProperties;

// An STL style comparator that does the bytewise comparison internally.
struct BytewiseLessThan {
  bool operator()(const std::string& key1, const std::string& key2) const {
    // smaller entries will be placed in front.
    return comparator->Compare(key1, key2) <= 0;
  }

  const Comparator* comparator = BytewiseComparator();
};

// When writing to a block that requires entries to be sorted by
// `BytewiseComparator`, we can buffer the content in a `BytewiseSortedMap`
// before writing it to the store.
typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;

class MetaIndexBuilder {
 public:
  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;

  MetaIndexBuilder();
  void Add(const std::string& key, const BlockHandle& handle);

  // Write all the added key/value pairs to the block and return the contents
  // of the block.
  Slice Finish();

 private:
  // store the sorted key/handle of the metablocks.
  BytewiseSortedMap meta_block_handles_;
  std::unique_ptr<BlockBuilder> meta_index_block_;
};

class PropertyBlockBuilder {
 public:
  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;

  PropertyBlockBuilder();

  void AddTableProperty(const TableProperties& props);
  void Add(const std::string& key, uint64_t value);
  void Add(const std::string& key, const std::string& value);
  void Add(const UserCollectedProperties& user_collected_properties);

  // Write all the added entries to the block and return the block contents
  Slice Finish();

 private:
  std::unique_ptr<BlockBuilder> properties_block_;
  BytewiseSortedMap props_;
};

// When we encounter an error during user-defined statistics collection,
// we write the warning message to the info log.
void LogPropertiesCollectionError(
    Logger* info_log, const std::string& method, const std::string& name);

// Utility functions that help the table builder to trigger batch events for
// user-defined property collectors.
// The return value indicates whether any error occurred; if an error occurred,
// the warning message will have been logged.
// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
// property collectors.
bool NotifyCollectTableCollectorsOnAdd(
    const Slice& key,
    const Slice& value,
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log);

// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
// property collectors. The collected properties will be added to `builder`.
bool NotifyCollectTableCollectorsOnFinish(
    const Options::TablePropertiesCollectors& collectors,
    Logger* info_log,
    PropertyBlockBuilder* builder);

// Read the properties from the table.
Status ReadProperties(
    const Slice& handle_value,
    RandomAccessFile* file,
    Env* env,
    Logger* logger,
    TableProperties* table_properties);

// Directly read the properties from the properties block of a plain table.
Status ReadTableProperties(
    RandomAccessFile* file,
    uint64_t file_size,
    uint64_t table_magic_number,
    Env* env,
    Logger* info_log,
    TableProperties* properties);

} // namespace rocksdb
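A table builder's `Finish()` is the intended consumer of these classes: properties are buffered in a `PropertyBlockBuilder`, the resulting block handle is registered with a `MetaIndexBuilder`, and the footer is written last (see `PlainTableBuilder::Finish()` below). A minimal sketch of just the property side; the property name "my.app.schema.version" is a made-up user property used only for illustration:

#include "table/meta_blocks.h"

rocksdb::Slice BuildPropertyBlock(const rocksdb::TableProperties& props,
                                  rocksdb::PropertyBlockBuilder* builder) {
  builder->AddTableProperty(props);            // the predefined uint64 fields
  builder->Add("my.app.schema.version", "7");  // hypothetical user property
  return builder->Finish();                    // bytewise-sorted, restart interval 1
}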
table/plain_table_builder.cc  (new file, 198 lines)
@ -0,0 +1,198 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/plain_table_builder.h"

#include <assert.h>
#include <map>

#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "table/plain_table_factory.h"
#include "db/dbformat.h"
#include "table/block_builder.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"

namespace rocksdb {

namespace {

// a utility that helps writing block content to the file
// @offset will advance if @block_contents was successfully written.
// @block_handle the block handle of this particular block.
Status WriteBlock(
    const Slice& block_contents,
    WritableFile* file,
    uint64_t* offset,
    BlockHandle* block_handle) {
  block_handle->set_offset(*offset);
  block_handle->set_size(block_contents.size());
  Status s = file->Append(block_contents);

  if (s.ok()) {
    *offset += block_contents.size();
  }
  return s;
}

}  // namespace

// kPlainTableMagicNumber was picked by running
//    echo rocksdb.plain.table | sha1sum
// and taking the leading 64 bits.
extern const uint64_t kPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;

PlainTableBuilder::PlainTableBuilder(const Options& options,
                                     WritableFile* file,
                                     uint32_t user_key_len) :
    options_(options), file_(file), user_key_len_(user_key_len) {
  properties_.fixed_key_len = user_key_len;

  // for plain table, we put all the data in a big chunk.
  properties_.num_data_blocks = 1;
  // emphasize that currently plain table doesn't have persistent index or
  // filter block.
  properties_.index_size = 0;
  properties_.filter_size = 0;
  properties_.format_version = 0;
}

PlainTableBuilder::~PlainTableBuilder() {
}

void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
  size_t user_key_size = key.size() - 8;
  assert(user_key_len_ == 0 || user_key_size == user_key_len_);

  if (!IsFixedLength()) {
    // Write key length
    key_size_str_.clear();
    PutVarint32(&key_size_str_, user_key_size);
    file_->Append(key_size_str_);
    offset_ += key_size_str_.length();
  }

  // Write key
  ParsedInternalKey parsed_key;
  if (!ParseInternalKey(key, &parsed_key)) {
    status_ = Status::Corruption(Slice());
    return;
  }
  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
    file_->Append(Slice(key.data(), user_key_size));
    char tmp_char = PlainTableFactory::kValueTypeSeqId0;
    file_->Append(Slice(&tmp_char, 1));
    offset_ += key.size() - 7;
  } else {
    file_->Append(key);
    offset_ += key.size();
  }

  // Write value length
  value_size_str_.clear();
  int value_size = value.size();
  PutVarint32(&value_size_str_, value_size);
  file_->Append(value_size_str_);

  // Write value
  file_->Append(value);
  offset_ += value_size + value_size_str_.length();

  properties_.num_entries++;
  properties_.raw_key_size += key.size();
  properties_.raw_value_size += value.size();

  // notify property collectors
  NotifyCollectTableCollectorsOnAdd(
      key,
      value,
      options_.table_properties_collectors,
      options_.info_log.get()
  );
}

Status PlainTableBuilder::status() const { return status_; }

Status PlainTableBuilder::Finish() {
  assert(!closed_);
  closed_ = true;

  properties_.data_size = offset_;

  //  Write the following blocks
  //  1. [meta block: properties]
  //  2. [metaindex block]
  //  3. [footer]
  MetaIndexBuilder meta_index_builer;

  PropertyBlockBuilder property_block_builder;
  // -- Add basic properties
  property_block_builder.AddTableProperty(properties_);

  // -- Add user collected properties
  NotifyCollectTableCollectorsOnFinish(
      options_.table_properties_collectors,
      options_.info_log.get(),
      &property_block_builder
  );

  // -- Write property block
  BlockHandle property_block_handle;
  auto s = WriteBlock(
      property_block_builder.Finish(),
      file_,
      &offset_,
      &property_block_handle
  );
  if (!s.ok()) {
    return s;
  }
  meta_index_builer.Add(kPropertiesBlock, property_block_handle);

  // -- write metaindex block
  BlockHandle metaindex_block_handle;
  s = WriteBlock(
      meta_index_builer.Finish(),
      file_,
      &offset_,
      &metaindex_block_handle
  );
  if (!s.ok()) {
    return s;
  }

  // Write Footer
  Footer footer(kPlainTableMagicNumber);
  footer.set_metaindex_handle(metaindex_block_handle);
  footer.set_index_handle(BlockHandle::NullBlockHandle());
  std::string footer_encoding;
  footer.EncodeTo(&footer_encoding);
  s = file_->Append(footer_encoding);
  if (s.ok()) {
    offset_ += footer_encoding.size();
  }

  return s;
}

void PlainTableBuilder::Abandon() {
  closed_ = true;
}

uint64_t PlainTableBuilder::NumEntries() const {
  return properties_.num_entries;
}

uint64_t PlainTableBuilder::FileSize() const {
  return offset_;
}

} // namespace rocksdb
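A worked example of the record encoding performed by `Add()` may help: with variable-length keys each record is laid out as `[varint32 user_key_len][key bytes][varint32 value_len][value bytes]`. The sketch below mirrors that path with the same coding helper; it deliberately skips the sequence-number-zero shortcut that `Add()` applies when `parsed_key.sequence == 0 && parsed_key.type == kTypeValue`, and `EncodePlainTableRecord` is an illustrative name, not part of this change:

#include <string>
#include "rocksdb/slice.h"
#include "util/coding.h"   // PutVarint32

// Encode one plain-table record into `dst`, mirroring the variable-key-length
// path of PlainTableBuilder::Add(). `internal_key` is user_key plus 8 trailing
// bytes (sequence number and type), as produced by db/dbformat.h.
void EncodePlainTableRecord(const rocksdb::Slice& internal_key,
                            const rocksdb::Slice& value, std::string* dst) {
  const uint32_t user_key_size =
      static_cast<uint32_t>(internal_key.size() - 8);
  rocksdb::PutVarint32(dst, user_key_size);                // key length
  dst->append(internal_key.data(), internal_key.size());   // key bytes
  rocksdb::PutVarint32(dst, static_cast<uint32_t>(value.size()));  // value length
  dst->append(value.data(), value.size());                 // value bytes
}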
table/plain_table_builder.h  (new file, 85 lines)
@ -0,0 +1,85 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// IndexedTable is a simple table format for UNIT TEST ONLY. It is not built
// as production quality.

#pragma once
#include <stdint.h>
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "table/table_builder.h"
#include "rocksdb/table_properties.h"

namespace rocksdb {

class BlockBuilder;
class BlockHandle;
class WritableFile;
class TableBuilder;

class PlainTableBuilder: public TableBuilder {
 public:
  // Create a builder that will store the contents of the table it is
  // building in *file.  Does not close the file.  It is up to the
  // caller to close the file after calling Finish(). The output file
  // will be part of level specified by 'level'.  A value of -1 means
  // that the caller does not know which level the output file will reside.
  PlainTableBuilder(const Options& options, WritableFile* file,
                    uint32_t user_key_size);

  // REQUIRES: Either Finish() or Abandon() has been called.
  ~PlainTableBuilder();

  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  void Add(const Slice& key, const Slice& value) override;

  // Return non-ok iff some error has been detected.
  Status status() const override;

  // Finish building the table.  Stops using the file passed to the
  // constructor after this function returns.
  // REQUIRES: Finish(), Abandon() have not been called
  Status Finish() override;

  // Indicate that the contents of this builder should be abandoned.  Stops
  // using the file passed to the constructor after this function returns.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  void Abandon() override;

  // Number of calls to Add() so far.
  uint64_t NumEntries() const override;

  // Size of the file generated so far.  If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  uint64_t FileSize() const override;

 private:
  Options options_;
  WritableFile* file_;
  uint64_t offset_ = 0;
  Status status_;
  TableProperties properties_;

  const size_t user_key_len_;
  bool closed_ = false;  // Either Finish() or Abandon() has been called.

  std::string key_size_str_;
  std::string value_size_str_;

  bool IsFixedLength() const {
    return user_key_len_ > 0;
  }

  // No copying allowed
  PlainTableBuilder(const PlainTableBuilder&) = delete;
  void operator=(const PlainTableBuilder&) = delete;
};

} // namespace rocksdb
40 table/plain_table_factory.cc Normal file
@@ -0,0 +1,40 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/plain_table_factory.h"

#include <memory>
#include <stdint.h>
#include "db/dbformat.h"
#include "table/plain_table_builder.h"
#include "table/plain_table_reader.h"
#include "port/port.h"

namespace rocksdb {

Status PlainTableFactory::NewTableReader(const Options& options,
                                         const EnvOptions& soptions,
                                         const InternalKeyComparator& icomp,
                                         unique_ptr<RandomAccessFile>&& file,
                                         uint64_t file_size,
                                         unique_ptr<TableReader>* table) const {
  return PlainTableReader::Open(options, soptions, icomp, std::move(file),
                                file_size, table, bloom_bits_per_key_,
                                hash_table_ratio_);
}

TableBuilder* PlainTableFactory::NewTableBuilder(
    const Options& options, const InternalKeyComparator& internal_comparator,
    WritableFile* file, CompressionType compression_type) const {
  return new PlainTableBuilder(options, file, user_key_len_);
}

extern TableFactory* NewPlainTableFactory(uint32_t user_key_len,
                                          int bloom_bits_per_key,
                                          double hash_table_ratio) {
  return new PlainTableFactory(user_key_len, bloom_bits_per_key,
                               hash_table_ratio);
}

}  // namespace rocksdb
76 table/plain_table_factory.h Normal file
@@ -0,0 +1,76 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <memory>
#include <stdint.h>

#include "rocksdb/options.h"
#include "rocksdb/table.h"

namespace rocksdb {

struct Options;
struct EnvOptions;

using std::unique_ptr;
class Status;
class RandomAccessFile;
class WritableFile;
class Table;
class TableBuilder;

// IndexedTable requires a fixed length key, configured as a constructor
// parameter of the factory class. Output file format:
// +-------------+-----------------+
// |  version    | user_key_length |
// +------------++------------------------------+  <= key1 offset
// | [key_size] |  key1      | value_size |      |
// +------------+------------+-------------+     |
// |  value1                                     |
// |                                             |
// +---------------------------------------+-----+  <= key2 offset
// | [key_size] |  key2      | value_size |      |
// +------------+------------+-------------+     |
// |  value2                                     |
// |                                             |
// |  ......                                     |
// +-----------------+---------------------------+
// If user_key_length = kPlainTableVariableLength, the key is variable length
// and an extra key-size field is encoded before every key.
class PlainTableFactory : public TableFactory {
 public:
  ~PlainTableFactory() {}
  // user_key_size is the length of the user key. If it is set to
  // kPlainTableVariableLength, keys are variable length. Otherwise, all
  // keys need to have the fixed length of this value. bloom_bits_per_key is
  // the number of bits used for the bloom filter per key. hash_table_ratio is
  // the desired utilization of the hash table used for prefix hashing:
  // hash_table_ratio = number of prefixes / #buckets in the hash table
  explicit PlainTableFactory(uint32_t user_key_len = kPlainTableVariableLength,
                             int bloom_bits_per_key = 0,
                             double hash_table_ratio = 0.75)
      : user_key_len_(user_key_len),
        bloom_bits_per_key_(bloom_bits_per_key),
        hash_table_ratio_(hash_table_ratio) {}
  const char* Name() const override { return "PlainTable"; }
  Status NewTableReader(const Options& options, const EnvOptions& soptions,
                        const InternalKeyComparator& internal_comparator,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table) const override;
  TableBuilder* NewTableBuilder(const Options& options,
                                const InternalKeyComparator& icomparator,
                                WritableFile* file,
                                CompressionType compression_type) const
      override;

  static const char kValueTypeSeqId0 = 0xFF;

 private:
  uint32_t user_key_len_;
  int bloom_bits_per_key_;
  double hash_table_ratio_;
};

}  // namespace rocksdb
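The factory comment above spells out the three tuning parameters. As a hedged illustration of how they are meant to be wired up (it mirrors what table_reader_bench.cc does later in this diff; the numeric values and the helper name are examples only, not part of the patch):

#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "table/plain_table_factory.h"

// Illustrative only: PlainTable needs mmap reads and a prefix extractor for
// its hash index, and the factory is installed through options.table_factory.
rocksdb::Options MakePlainTableOptions() {
  rocksdb::Options options;
  options.allow_mmap_reads = true;  // PlainTableReader::Open() asserts this
  options.prefix_extractor = rocksdb::NewFixedPrefixTransform(8);
  options.table_factory.reset(new rocksdb::PlainTableFactory(
      16 /* fixed user key length */, 8 /* bloom bits per key */,
      0.75 /* hash table ratio */));
  return options;
}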
695 table/plain_table_reader.cc Normal file
@@ -0,0 +1,695 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/plain_table_reader.h"

#include <string>

#include "db/dbformat.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

#include "table/block.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "table/two_level_iterator.h"
#include "table/plain_table_factory.h"

#include "util/coding.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"
#include "util/histogram.h"
#include "util/murmurhash.h"
#include "util/perf_context_imp.h"
#include "util/stop_watch.h"


namespace rocksdb {

namespace {

inline uint32_t GetSliceHash(Slice const& s) {
  return Hash(s.data(), s.size(), 397);
}

inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  return hash % num_buckets;
}

}  // namespace

// Iterator to iterate IndexedTable
class PlainTableIterator : public Iterator {
 public:
  explicit PlainTableIterator(PlainTableReader* table);
  ~PlainTableIterator();

  bool Valid() const;

  void SeekToFirst();

  void SeekToLast();

  void Seek(const Slice& target);

  void Next();

  void Prev();

  Slice key() const;

  Slice value() const;

  Status status() const;

 private:
  PlainTableReader* table_;
  uint32_t offset_;
  uint32_t next_offset_;
  Slice key_;
  Slice value_;
  Status status_;
  std::string tmp_str_;
  // No copying allowed
  PlainTableIterator(const PlainTableIterator&) = delete;
  void operator=(const Iterator&) = delete;
};

extern const uint64_t kPlainTableMagicNumber;
PlainTableReader::PlainTableReader(const EnvOptions& storage_options,
                                   const InternalKeyComparator& icomparator,
                                   uint64_t file_size, int bloom_bits_per_key,
                                   double hash_table_ratio,
                                   const TableProperties& table_properties)
    : soptions_(storage_options),
      internal_comparator_(icomparator),
      file_size_(file_size),
      kHashTableRatio(hash_table_ratio),
      kBloomBitsPerKey(bloom_bits_per_key),
      table_properties_(table_properties),
      data_end_offset_(table_properties_.data_size),
      user_key_len_(table_properties.fixed_key_len) {}

PlainTableReader::~PlainTableReader() {
  delete[] hash_table_;
  delete[] sub_index_;
  delete bloom_;
}

Status PlainTableReader::Open(const Options& options,
                              const EnvOptions& soptions,
                              const InternalKeyComparator& internal_comparator,
                              unique_ptr<RandomAccessFile>&& file,
                              uint64_t file_size,
                              unique_ptr<TableReader>* table_reader,
                              const int bloom_bits_per_key,
                              double hash_table_ratio) {
  assert(options.allow_mmap_reads);

  if (file_size > kMaxFileSize) {
    return Status::NotSupported("File is too large for PlainTableReader!");
  }

  TableProperties table_properties;
  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
                               options.env, options.info_log.get(),
                               &table_properties);
  if (!s.ok()) {
    return s;
  }

  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
      soptions, internal_comparator, file_size, bloom_bits_per_key,
      hash_table_ratio, table_properties));
  new_reader->file_ = std::move(file);
  new_reader->options_ = options;

  // -- Populate Index
  s = new_reader->PopulateIndex();
  if (!s.ok()) {
    return s;
  }

  *table_reader = std::move(new_reader);
  return s;
}

void PlainTableReader::SetupForCompaction() {
}

bool PlainTableReader::PrefixMayMatch(const Slice& internal_prefix) {
  return true;
}

Iterator* PlainTableReader::NewIterator(const ReadOptions& options) {
  return new PlainTableIterator(this);
}

struct PlainTableReader::IndexRecord {
  uint32_t hash;    // hash of the prefix
  uint32_t offset;  // offset of a row
  IndexRecord* next;
};

// Helper class to track all the index records
class PlainTableReader::IndexRecordList {
 public:
  explicit IndexRecordList(size_t num_records_per_group)
      : kNumRecordsPerGroup(num_records_per_group),
        current_group_(nullptr),
        num_records_in_current_group_(num_records_per_group) {}

  ~IndexRecordList() {
    for (size_t i = 0; i < groups_.size(); i++) {
      delete[] groups_[i];
    }
  }

  void AddRecord(murmur_t hash, uint32_t offset) {
    if (num_records_in_current_group_ == kNumRecordsPerGroup) {
      current_group_ = AllocateNewGroup();
      num_records_in_current_group_ = 0;
    }
    auto& new_record = current_group_[num_records_in_current_group_++];
    new_record.hash = hash;
    new_record.offset = offset;
    new_record.next = nullptr;
  }

  size_t GetNumRecords() const {
    return (groups_.size() - 1) * kNumRecordsPerGroup +
           num_records_in_current_group_;
  }
  IndexRecord* At(size_t index) {
    return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
  }

 private:
  IndexRecord* AllocateNewGroup() {
    IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
    groups_.push_back(result);
    return result;
  }

  const size_t kNumRecordsPerGroup;
  IndexRecord* current_group_;
  // List of arrays allocated
  std::vector<IndexRecord*> groups_;
  size_t num_records_in_current_group_;
};

int PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list) {
  Slice prev_key_prefix_slice;
  uint32_t prev_key_prefix_hash = 0;
  uint32_t pos = data_start_offset_;
  int key_index_within_prefix = 0;
  bool is_first_record = true;
  HistogramImpl keys_per_prefix_hist;
  // Need map to be ordered to make sure sub indexes generated
  // are in order.

  int num_prefixes = 0;
  while (pos < data_end_offset_) {
    uint32_t key_offset = pos;
    ParsedInternalKey key;
    Slice value_slice;
    status_ = Next(pos, &key, &value_slice, pos);
    Slice key_prefix_slice = GetPrefix(key);

    if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
      ++num_prefixes;
      if (!is_first_record) {
        keys_per_prefix_hist.Add(key_index_within_prefix);
      }
      key_index_within_prefix = 0;
      prev_key_prefix_slice = key_prefix_slice;
      prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
    }

    if (key_index_within_prefix++ % kIndexIntervalForSamePrefixKeys == 0) {
      // Add an index key for every kIndexIntervalForSamePrefixKeys keys
      record_list->AddRecord(prev_key_prefix_hash, key_offset);
    }
    is_first_record = false;
  }

  keys_per_prefix_hist.Add(key_index_within_prefix);
  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
      keys_per_prefix_hist.ToString().c_str());

  return num_prefixes;
}

void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
  delete[] hash_table_;

  if (kBloomBitsPerKey > 0) {
    bloom_ = new DynamicBloom(num_prefixes * kBloomBitsPerKey);
  }
  double hash_table_size_multipier =
      (kHashTableRatio > 1.0) ? 1.0 : 1.0 / kHashTableRatio;
  hash_table_size_ = num_prefixes * hash_table_size_multipier + 1;
  hash_table_ = new uint32_t[hash_table_size_];
}

size_t PlainTableReader::BucketizeIndexesAndFillBloom(
    IndexRecordList& record_list, int num_prefixes,
    std::vector<IndexRecord*>* hash_to_offsets,
    std::vector<uint32_t>* bucket_count) {
  size_t sub_index_size_needed = 0;
  bool first = true;
  uint32_t prev_hash = 0;
  size_t num_records = record_list.GetNumRecords();
  for (size_t i = 0; i < num_records; i++) {
    IndexRecord* index_record = record_list.At(i);
    uint32_t cur_hash = index_record->hash;
    if (first || prev_hash != cur_hash) {
      prev_hash = cur_hash;
      first = false;
      if (bloom_) {
        bloom_->AddHash(cur_hash);
      }
    }
    uint32_t bucket = GetBucketIdFromHash(cur_hash, hash_table_size_);
    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
    index_record->next = prev_bucket_head;
    (*hash_to_offsets)[bucket] = index_record;
    auto& item_count = (*bucket_count)[bucket];
    if (item_count > 0) {
      if (item_count == 1) {
        sub_index_size_needed += kOffsetLen + 1;
      }
      if (item_count == 127) {
        // Need more than one byte for length
        sub_index_size_needed++;
      }
      sub_index_size_needed += kOffsetLen;
    }
    item_count++;
  }
  return sub_index_size_needed;
}

void PlainTableReader::FillIndexes(
    size_t sub_index_size_needed,
    const std::vector<IndexRecord*>& hash_to_offsets,
    const std::vector<uint32_t>& bucket_count) {
  Log(options_.info_log, "Reserving %zu bytes for sub index",
      sub_index_size_needed);
  // 8 bytes buffer for variable length size
  size_t buffer_size = 8 * 8;
  size_t buffer_used = 0;
  sub_index_size_needed += buffer_size;
  sub_index_ = new char[sub_index_size_needed];
  size_t sub_index_offset = 0;
  char* prev_ptr;
  char* cur_ptr;
  uint32_t* sub_index_ptr;
  for (int i = 0; i < hash_table_size_; i++) {
    uint32_t num_keys_for_bucket = bucket_count[i];
    switch (num_keys_for_bucket) {
      case 0:
        // No key for bucket
        hash_table_[i] = data_end_offset_;
        break;
      case 1:
        // point directly to the file offset
        hash_table_[i] = hash_to_offsets[i]->offset;
        break;
      default:
        // point to second level indexes.
        hash_table_[i] = sub_index_offset | kSubIndexMask;
        prev_ptr = sub_index_ + sub_index_offset;
        cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
        sub_index_offset += (cur_ptr - prev_ptr);
        if (cur_ptr - prev_ptr > 2
            || (cur_ptr - prev_ptr == 2 && num_keys_for_bucket <= 127)) {
          // Need to resize sub_index. Exponentially grow buffer.
          buffer_used += cur_ptr - prev_ptr - 1;
          if (buffer_used + 4 > buffer_size) {
            Log(options_.info_log, "Recalculate suffix_map length to %zu",
                sub_index_size_needed);

            sub_index_size_needed += buffer_size;
            buffer_size *= 2;
            char* new_sub_index = new char[sub_index_size_needed];
            memcpy(new_sub_index, sub_index_, sub_index_offset);
            delete[] sub_index_;
            sub_index_ = new_sub_index;
          }
        }
        sub_index_ptr = (uint32_t*) (sub_index_ + sub_index_offset);
        IndexRecord* record = hash_to_offsets[i];
        int j;
        for (j = num_keys_for_bucket - 1; j >= 0 && record;
             j--, record = record->next) {
          sub_index_ptr[j] = record->offset;
        }
        assert(j == -1 && record == nullptr);
        sub_index_offset += kOffsetLen * num_keys_for_bucket;
        break;
    }
  }

  Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
      hash_table_size_, sub_index_size_needed);
}

Status PlainTableReader::PopulateIndex() {
  // Get mmapped memory to file_data_.
  Status s = file_->Read(0, file_size_, &file_data_, nullptr);
  if (!s.ok()) {
    return s;
  }

  IndexRecordList record_list(kRecordsPerGroup);
  // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
  // for a prefix (starting from the first one), generate a record of (hash,
  // offset) and append it to IndexRecordList, which is a data structure created
  // to store them.
  int num_prefixes = PopulateIndexRecordList(&record_list);
  // Calculated hash table and bloom filter size and allocate memory for indexes
  // and bloom filter based on the number of prefixes.
  AllocateIndexAndBloom(num_prefixes);

  // Bucketize all the index records to a temp data structure, in which for
  // each bucket, we generate a linked list of IndexRecord, in reversed order.
  std::vector<IndexRecord*> hash_to_offsets(hash_table_size_, nullptr);
  std::vector<uint32_t> bucket_count(hash_table_size_, 0);
  size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
      record_list, num_prefixes, &hash_to_offsets, &bucket_count);
  // From the temp data structure, populate indexes.
  FillIndexes(sub_index_size_needed, hash_to_offsets, bucket_count);

  return Status::OK();
}

Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
                                   uint32_t prefix_hash, bool& prefix_matched,
                                   uint32_t& ret_offset) {
  prefix_matched = false;
  int bucket = GetBucketIdFromHash(prefix_hash, hash_table_size_);
  uint32_t bucket_value = hash_table_[bucket];
  if (bucket_value == data_end_offset_) {
    ret_offset = data_end_offset_;
    return Status::OK();
  } else if ((bucket_value & kSubIndexMask) == 0) {
    // point directly to the file
    ret_offset = bucket_value;
    return Status::OK();
  }

  // point to sub-index, need to do a binary search
  uint32_t low = 0;
  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;

  const char* index_ptr = sub_index_ + prefix_index_offset;
  uint32_t upper_bound = 0;
  const uint32_t* base_ptr = (const uint32_t*) GetVarint32Ptr(index_ptr,
                                                              index_ptr + 4,
                                                              &upper_bound);
  uint32_t high = upper_bound;
  ParsedInternalKey mid_key;
  ParsedInternalKey parsed_target;
  if (!ParseInternalKey(target, &parsed_target)) {
    return Status::Corruption(Slice());
  }

  // The key is between [low, high). Do a binary search between it.
  while (high - low > 1) {
    uint32_t mid = (high + low) / 2;
    uint32_t file_offset = base_ptr[mid];
    size_t tmp;
    Status s = ReadKey(file_data_.data() + file_offset, &mid_key, tmp);
    if (!s.ok()) {
      return s;
    }
    int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
    if (cmp_result < 0) {
      low = mid;
    } else {
      if (cmp_result == 0) {
        // Happen to have found the exact key or target is smaller than the
        // first key after base_offset.
        prefix_matched = true;
        ret_offset = file_offset;
        return Status::OK();
      } else {
        high = mid;
      }
    }
  }
  // Both of the key at the position low or low+1 could share the same
  // prefix as target. We need to rule out one of them to avoid to go
  // to the wrong prefix.
  ParsedInternalKey low_key;
  size_t tmp;
  uint32_t low_key_offset = base_ptr[low];
  Status s = ReadKey(file_data_.data() + low_key_offset, &low_key, tmp);
  if (GetPrefix(low_key) == prefix) {
    prefix_matched = true;
    ret_offset = low_key_offset;
  } else if (low + 1 < upper_bound) {
    // There is possible a next prefix, return it
    prefix_matched = false;
    ret_offset = base_ptr[low + 1];
  } else {
    // target is larger than a key of the last prefix in this bucket
    // but with a different prefix. Key does not exist.
    ret_offset = data_end_offset_;
  }
  return Status::OK();
}

bool PlainTableReader::MayHavePrefix(uint32_t hash) {
  return bloom_ == nullptr || bloom_->MayContainHash(hash);
}

Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) {
  return options_.prefix_extractor->Transform(target.user_key);
}

Status PlainTableReader::ReadKey(const char* row_ptr, ParsedInternalKey* key,
                                 size_t& bytes_read) {
  const char* key_ptr = nullptr;
  bytes_read = 0;
  size_t user_key_size = 0;
  if (IsFixedLength()) {
    user_key_size = user_key_len_;
    key_ptr = row_ptr;
  } else {
    uint32_t tmp_size = 0;
    key_ptr = GetVarint32Ptr(row_ptr, file_data_.data() + data_end_offset_,
                             &tmp_size);
    if (key_ptr == nullptr) {
      return Status::Corruption("Unable to read the next key");
    }
    user_key_size = (size_t)tmp_size;
    bytes_read = key_ptr - row_ptr;
  }
  if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) {
    return Status::Corruption("Unable to read the next key");
  }

  if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
    // Special encoding for the row with seqID=0
    key->user_key = Slice(key_ptr, user_key_size);
    key->sequence = 0;
    key->type = kTypeValue;
    bytes_read += user_key_size + 1;
  } else {
    if (row_ptr + user_key_size + 8 >= file_data_.data() + data_end_offset_) {
      return Status::Corruption("Unable to read the next key");
    }
    if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) {
      return Status::Corruption(Slice());
    }
    bytes_read += user_key_size + 8;
  }

  return Status::OK();
}

Status PlainTableReader::Next(uint32_t offset, ParsedInternalKey* key,
                              Slice* value, uint32_t& next_offset) {
  if (offset == data_end_offset_) {
    next_offset = data_end_offset_;
    return Status::OK();
  }

  if (offset > data_end_offset_) {
    return Status::Corruption("Offset is out of file size");
  }

  const char* row_ptr = file_data_.data() + offset;
  size_t bytes_for_key;
  Status s = ReadKey(row_ptr, key, bytes_for_key);
  uint32_t value_size;
  const char* value_ptr = GetVarint32Ptr(row_ptr + bytes_for_key,
                                         file_data_.data() + data_end_offset_,
                                         &value_size);
  if (value_ptr == nullptr) {
    return Status::Corruption("Error reading value length.");
  }
  next_offset = offset + (value_ptr - row_ptr) + value_size;
  if (next_offset > data_end_offset_) {
    return Status::Corruption("Reach end of file when reading value");
  }
  *value = Slice(value_ptr, value_size);

  return Status::OK();
}

Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
                             void* arg,
                             bool (*saver)(void*, const ParsedInternalKey&,
                                           const Slice&, bool),
                             void (*mark_key_may_exist)(void*)) {
  // Check bloom filter first.
  Slice prefix_slice = GetPrefix(target);
  uint32_t prefix_hash = GetSliceHash(prefix_slice);
  if (!MayHavePrefix(prefix_hash)) {
    return Status::OK();
  }
  uint32_t offset;
  bool prefix_match;
  Status s = GetOffset(target, prefix_slice, prefix_hash, prefix_match, offset);
  if (!s.ok()) {
    return s;
  }
  ParsedInternalKey found_key;
  ParsedInternalKey parsed_target;
  if (!ParseInternalKey(target, &parsed_target)) {
    return Status::Corruption(Slice());
  }

  Slice found_value;
  while (offset < data_end_offset_) {
    Status s = Next(offset, &found_key, &found_value, offset);
    if (!s.ok()) {
      return s;
    }
    if (!prefix_match) {
      // Need to verify prefix for the first key found if it is not yet
      // checked.
      if (GetPrefix(found_key) != prefix_slice) {
        return Status::OK();
      }
      prefix_match = true;
    }
    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
      if (!(*saver)(arg, found_key, found_value, true)) {
        break;
      }
    }
  }
  return Status::OK();
}

uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) {
  return 0;
}

PlainTableIterator::PlainTableIterator(PlainTableReader* table) :
    table_(table) {
  next_offset_ = offset_ = table_->data_end_offset_;
}

PlainTableIterator::~PlainTableIterator() {
}

bool PlainTableIterator::Valid() const {
  return offset_ < table_->data_end_offset_
         && offset_ >= table_->data_start_offset_;
}

void PlainTableIterator::SeekToFirst() {
  next_offset_ = table_->data_start_offset_;
  if (next_offset_ >= table_->data_end_offset_) {
    next_offset_ = offset_ = table_->data_end_offset_;
  } else {
    Next();
  }
}

void PlainTableIterator::SeekToLast() {
  assert(false);
}

void PlainTableIterator::Seek(const Slice& target) {
  Slice prefix_slice = table_->GetPrefix(target);
  uint32_t prefix_hash = GetSliceHash(prefix_slice);
  if (!table_->MayHavePrefix(prefix_hash)) {
    offset_ = next_offset_ = table_->data_end_offset_;
    return;
  }
  bool prefix_match;
  status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match,
                              next_offset_);
  if (!status_.ok()) {
    offset_ = next_offset_ = table_->data_end_offset_;
    return;
  }

  if (next_offset_ < table_->data_end_offset_) {
    for (Next(); status_.ok() && Valid(); Next()) {
      if (!prefix_match) {
        // Need to verify the first key's prefix
        if (table_->GetPrefix(key()) != prefix_slice) {
          offset_ = next_offset_ = table_->data_end_offset_;
          break;
        }
        prefix_match = true;
      }
      if (table_->internal_comparator_.Compare(key(), target) >= 0) {
        break;
      }
    }
  } else {
    offset_ = table_->data_end_offset_;
  }
}

void PlainTableIterator::Next() {
  offset_ = next_offset_;
  if (offset_ < table_->data_end_offset_) {
    Slice tmp_slice;
    ParsedInternalKey parsed_key;
    status_ = table_->Next(next_offset_, &parsed_key, &value_, next_offset_);
    if (status_.ok()) {
      // Make a copy in this case. TODO optimize.
      tmp_str_.clear();
      AppendInternalKey(&tmp_str_, parsed_key);
      key_ = Slice(tmp_str_);
    } else {
      offset_ = next_offset_ = table_->data_end_offset_;
    }
  }
}

void PlainTableIterator::Prev() {
  assert(false);
}

Slice PlainTableIterator::key() const {
  assert(Valid());
  return key_;
}

Slice PlainTableIterator::value() const {
  assert(Valid());
  return value_;
}

Status PlainTableIterator::status() const {
  return status_;
}

}  // namespace rocksdb
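For readers following ReadKey() and Next() above, here is a sketch of the row layout those functions parse. It is reconstructed from the reader code rather than taken from PlainTableBuilder, so treat the field order as an inference and the helper name as hypothetical:

#include <string>
#include "db/dbformat.h"
#include "rocksdb/slice.h"
#include "util/coding.h"

namespace rocksdb {

// Illustrative encoder for one PlainTable row, mirroring what ReadKey() and
// Next() parse: an optional varint32 key length (variable-length keys only),
// the user key, either the 0xFF "seqID=0" marker or a full 8-byte internal-key
// footer, then a varint32 value length and the value bytes.
void AppendPlainTableRow(const Slice& user_key, const Slice& value,
                         SequenceNumber seq, bool fixed_length_key,
                         std::string* out) {
  if (!fixed_length_key) {
    PutVarint32(out, static_cast<uint32_t>(user_key.size()));
  }
  if (seq == 0) {
    // Special one-byte encoding used for rows with sequence id 0.
    out->append(user_key.data(), user_key.size());
    out->push_back(static_cast<char>(0xFF));  // PlainTableFactory::kValueTypeSeqId0
  } else {
    // Regular rows carry the full internal key (user key + 8-byte footer).
    AppendInternalKey(out, ParsedInternalKey(user_key, seq, kTypeValue));
  }
  PutVarint32(out, static_cast<uint32_t>(value.size()));
  out->append(value.data(), value.size());
}

}  // namespace rocksdb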
220 table/plain_table_reader.h Normal file
@@ -0,0 +1,220 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <unordered_map>
#include <memory>
#include <vector>
#include <string>
#include <stdint.h>

#include "db/dbformat.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/table_reader.h"
#include "table/plain_table_factory.h"

namespace rocksdb {

class Block;
class BlockHandle;
class Footer;
struct Options;
class RandomAccessFile;
struct ReadOptions;
class TableCache;
class TableReader;
class DynamicBloom;
class InternalKeyComparator;

using std::unique_ptr;
using std::unordered_map;
extern const uint32_t kPlainTableVariableLength;

// Based on the output file format shown in plain_table_factory.h.
// When opening the output file, IndexedTableReader creates a hash table
// from key prefixes to offsets in the output file. IndexedTable decides
// whether a bucket points to the data offset of the first key with that key
// prefix, or to a sub-index. If too many keys share the prefix, it creates a
// binary-searchable index from the suffix to the offset on disk.
//
// The implementation of IndexedTableReader requires the output file to be
// mmapped.
class PlainTableReader: public TableReader {
 public:
  static Status Open(const Options& options, const EnvOptions& soptions,
                     const InternalKeyComparator& internal_comparator,
                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                     unique_ptr<TableReader>* table,
                     const int bloom_bits_per_key, double hash_table_ratio);

  bool PrefixMayMatch(const Slice& internal_prefix);

  Iterator* NewIterator(const ReadOptions&);

  Status Get(const ReadOptions&, const Slice& key, void* arg,
             bool (*result_handler)(void* arg, const ParsedInternalKey& k,
                                    const Slice& v, bool),
             void (*mark_key_may_exist)(void*) = nullptr);

  uint64_t ApproximateOffsetOf(const Slice& key);

  void SetupForCompaction();

  const TableProperties& GetTableProperties() { return table_properties_; }

  PlainTableReader(const EnvOptions& storage_options,
                   const InternalKeyComparator& internal_comparator,
                   uint64_t file_size, int bloom_num_bits,
                   double hash_table_ratio,
                   const TableProperties& table_properties);
  ~PlainTableReader();

 private:
  struct IndexRecord;
  class IndexRecordList;

  uint32_t* hash_table_ = nullptr;
  int hash_table_size_ = 0;
  char* sub_index_ = nullptr;

  Options options_;
  const EnvOptions& soptions_;
  const InternalKeyComparator internal_comparator_;
  Status status_;
  unique_ptr<RandomAccessFile> file_;

  Slice file_data_;
  uint32_t version_;
  uint32_t file_size_;

  const double kHashTableRatio;
  const int kBloomBitsPerKey;
  DynamicBloom* bloom_ = nullptr;

  TableProperties table_properties_;
  const uint32_t data_start_offset_ = 0;
  const uint32_t data_end_offset_;
  const size_t user_key_len_;

  static const size_t kNumInternalBytes = 8;
  static const uint32_t kSubIndexMask = 0x80000000;
  static const size_t kOffsetLen = sizeof(uint32_t);
  static const uint64_t kMaxFileSize = 1u << 31;
  static const size_t kRecordsPerGroup = 256;
  // To speed up the search for keys with the same prefix, we add an index key
  // for every N keys, where N is determined by kIndexIntervalForSamePrefixKeys.
  static const size_t kIndexIntervalForSamePrefixKeys = 16;

  bool IsFixedLength() const {
    return user_key_len_ != kPlainTableVariableLength;
  }

  size_t GetFixedInternalKeyLength() const {
    return user_key_len_ + kNumInternalBytes;
  }

  friend class TableCache;
  friend class PlainTableIterator;

  // Internal helper function to generate an IndexRecordList object from all
  // the rows, which contains index records as a list.
  int PopulateIndexRecordList(IndexRecordList* record_list);

  // Internal helper function to allocate memory for indexes and bloom filters
  void AllocateIndexAndBloom(int num_prefixes);

  // Internal helper function to bucket the index record list into hash
  // buckets.
  // hash_to_offsets is sized of hash_table_size_; each entry contains a linked
  // list of offsets for the hash, in reversed order.
  // bucket_count is sized of hash_table_size_. The value is how many index
  // records are there in hash_to_offsets for the same bucket.
  size_t BucketizeIndexesAndFillBloom(
      IndexRecordList& record_list, int num_prefixes,
      std::vector<IndexRecord*>* hash_to_offsets,
      std::vector<uint32_t>* bucket_count);

  // Internal helper class to fill the indexes and bloom filters to internal
  // data structures. hash_to_offsets and bucket_count are bucketized indexes
  // and counts generated by BucketizeIndexesAndFillBloom().
  void FillIndexes(size_t sub_index_size_needed,
                   const std::vector<IndexRecord*>& hash_to_offsets,
                   const std::vector<uint32_t>& bucket_count);

  // PopulateIndex() builds index of keys. It must be called before any query
  // to the table.
  //
  // hash_table_ contains hash_table_size_ buckets, each a 32-bit integer. The
  // lower 31 bits contain an offset value (explained below) and the first bit
  // of the integer indicates the type of the offset.
  //
  // +--------------+------------------------------------------------------+
  // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
  // +--------------+------------------------------------------------------+
  //
  // Explanation for the "flag bit":
  //
  // 0 indicates that the bucket contains only one prefix (no conflict when
  //   hashing this prefix), whose first row starts from this offset of the
  //   file.
  // 1 indicates that the bucket contains more than one prefix, or there are
  //   too many rows for one prefix so we need a binary search for it. In this
  //   case, the offset indicates the offset into sub_index_ holding the
  //   binary search indexes of keys for those rows. Those binary search
  //   indexes are organized in this way:
  //
  // The first 4 bytes indicate how many indexes (N) are stored after it.
  // After it, there are N 32-bit integers, each pointing to an offset of the
  // file, which points to the start of a row. Those offsets need to be
  // guaranteed to be in ascending order, so the keys they point to are also
  // in ascending order, to make sure we can use them to do binary searches.
  // Below is a visual presentation of a bucket.
  //
  // <begin>
  //   number_of_records:  varint32
  //   record 1 file offset:  fixedint32
  //   record 2 file offset:  fixedint32
  //   ....
  //   record N file offset:  fixedint32
  // <end>
  Status PopulateIndex();

  // Check bloom filter to see whether it might contain this prefix.
  // The hash of the prefix is given, since it can be reused for index lookup
  // too.
  bool MayHavePrefix(uint32_t hash);

  Status ReadKey(const char* row_ptr, ParsedInternalKey* key,
                 size_t& bytes_read);
  // Read the key and value at offset to key and value.
  // tmp_slice is a tmp slice.
  // return next_offset as the offset for the next key.
  Status Next(uint32_t offset, ParsedInternalKey* key, Slice* value,
              uint32_t& next_offset);
  // Get file offset for key target.
  // return value prefix_matched is set to true if the offset is confirmed
  // for a key with the same prefix as target.
  Status GetOffset(const Slice& target, const Slice& prefix,
                   uint32_t prefix_hash, bool& prefix_matched,
                   uint32_t& ret_offset);

  Slice GetPrefix(const Slice& target) {
    assert(target.size() >= 8);  // target is internal key
    return options_.prefix_extractor->Transform(
        Slice(target.data(), target.size() - 8));
  }

  Slice GetPrefix(const ParsedInternalKey& target);

  // No copying allowed
  explicit PlainTableReader(const TableReader&) = delete;
  void operator=(const TableReader&) = delete;
};
}  // namespace rocksdb
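The PopulateIndex() comment above describes each hash_table_ entry as a flag bit plus a 31-bit offset. A small hedged decoding sketch, with kSubIndexMask copied from the class constants (the struct and function names are illustrative, not part of the patch):

#include <stdint.h>

// Sketch of how a hash_table_ entry is interpreted, per the comment above
// PopulateIndex(). If the top bit is set, the remaining 31 bits are an offset
// into sub_index_; otherwise they are a file offset of the first row.
struct BucketRef {
  bool points_to_sub_index;
  uint32_t offset;
};

inline BucketRef DecodeBucket(uint32_t bucket_value) {
  const uint32_t kSubIndexMask = 0x80000000;  // same constant as in the class
  BucketRef ref;
  ref.points_to_sub_index = (bucket_value & kSubIndexMask) != 0;
  ref.offset = bucket_value & ~kSubIndexMask;
  return ref;
}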
55 table/table_builder.h Normal file
@@ -0,0 +1,55 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

namespace rocksdb {

class Slice;
class Status;

// TableBuilder provides the interface used to build a Table
// (an immutable and sorted map from keys to values).
//
// Multiple threads can invoke const methods on a TableBuilder without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same TableBuilder must use
// external synchronization.
class TableBuilder {
 public:
  // REQUIRES: Either Finish() or Abandon() has been called.
  virtual ~TableBuilder() {}

  // Add key,value to the table being constructed.
  // REQUIRES: key is after any previously added key according to comparator.
  // REQUIRES: Finish(), Abandon() have not been called
  virtual void Add(const Slice& key, const Slice& value) = 0;

  // Return non-ok iff some error has been detected.
  virtual Status status() const = 0;

  // Finish building the table.
  // REQUIRES: Finish(), Abandon() have not been called
  virtual Status Finish() = 0;

  // Indicate that the contents of this builder should be abandoned.
  // If the caller is not going to call Finish(), it must call Abandon()
  // before destroying this builder.
  // REQUIRES: Finish(), Abandon() have not been called
  virtual void Abandon() = 0;

  // Number of calls to Add() so far.
  virtual uint64_t NumEntries() const = 0;

  // Size of the file generated so far. If invoked after a successful
  // Finish() call, returns the size of the final generated file.
  virtual uint64_t FileSize() const = 0;
};

}  // namespace rocksdb
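A minimal sketch of the calling contract this interface documents: Add() keys in comparator order, then call exactly one of Finish() or Abandon() before destroying the builder. The helper name and the vector-of-pairs input are hypothetical:

#include <string>
#include <utility>
#include <vector>
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "table/table_builder.h"

namespace rocksdb {

// Hypothetical helper illustrating the TableBuilder contract above.
Status WriteSortedEntries(
    TableBuilder* builder,
    const std::vector<std::pair<std::string, std::string>>& sorted_kvs) {
  for (const auto& kv : sorted_kvs) {
    builder->Add(kv.first, kv.second);  // keys must already be in order
    if (!builder->status().ok()) {
      builder->Abandon();               // give up without producing a table
      return builder->status();
    }
  }
  return builder->Finish();             // seal the table
}

}  // namespace rocksdb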
114 table/table_properties.cc Normal file
@@ -0,0 +1,114 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include "rocksdb/table_properties.h"

namespace rocksdb {

namespace {
void AppendProperty(
    std::string& props,
    const std::string& key,
    const std::string& value,
    const std::string& prop_delim,
    const std::string& kv_delim) {
  props.append(key);
  props.append(kv_delim);
  props.append(value);
  props.append(prop_delim);
}

template <class TValue>
void AppendProperty(
    std::string& props,
    const std::string& key,
    const TValue& value,
    const std::string& prop_delim,
    const std::string& kv_delim) {
  AppendProperty(
      props, key, std::to_string(value), prop_delim, kv_delim
  );
}
}

std::string TableProperties::ToString(
    const std::string& prop_delim,
    const std::string& kv_delim) const {
  std::string result;
  result.reserve(1024);

  // Basic Info
  AppendProperty(
      result, "# data blocks", num_data_blocks, prop_delim, kv_delim
  );
  AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);

  AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
  AppendProperty(
      result,
      "raw average key size",
      num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0,
      prop_delim,
      kv_delim
  );
  AppendProperty(
      result, "raw value size", raw_value_size, prop_delim, kv_delim
  );
  AppendProperty(
      result,
      "raw average value size",
      num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0,
      prop_delim,
      kv_delim
  );

  AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
  AppendProperty(result, "index block size", index_size, prop_delim, kv_delim);
  AppendProperty(
      result, "filter block size", filter_size, prop_delim, kv_delim
  );
  AppendProperty(
      result,
      "(estimated) table size",
      data_size + index_size + filter_size,
      prop_delim,
      kv_delim
  );

  AppendProperty(
      result,
      "filter policy name",
      filter_policy_name.empty() ? std::string("N/A") : filter_policy_name,
      prop_delim,
      kv_delim
  );

  return result;
}

const std::string TablePropertiesNames::kDataSize =
    "rocksdb.data.size";
const std::string TablePropertiesNames::kIndexSize =
    "rocksdb.index.size";
const std::string TablePropertiesNames::kFilterSize =
    "rocksdb.filter.size";
const std::string TablePropertiesNames::kRawKeySize =
    "rocksdb.raw.key.size";
const std::string TablePropertiesNames::kRawValueSize =
    "rocksdb.raw.value.size";
const std::string TablePropertiesNames::kNumDataBlocks =
    "rocksdb.num.data.blocks";
const std::string TablePropertiesNames::kNumEntries =
    "rocksdb.num.entries";
const std::string TablePropertiesNames::kFilterPolicy =
    "rocksdb.filter.policy";
const std::string TablePropertiesNames::kFormatVersion =
    "rocksdb.format.version";
const std::string TablePropertiesNames::kFixedKeyLen =
    "rocksdb.fixed.key.length";

extern const std::string kPropertiesBlock = "rocksdb.properties";

}  // namespace rocksdb
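A short usage sketch for ToString(); the delimiters are passed explicitly here, and the sample output values in the comment are made up:

#include <string>
#include "rocksdb/table_properties.h"

// Illustrative only: render a TableProperties object as a single line.
// With prop_delim = "; " and kv_delim = "=", the result looks like
//   "# data blocks=1; # entries=4; raw key size=64; ..."
// (the numbers are invented for the example).
std::string DescribeTable(const rocksdb::TableProperties& props) {
  return props.ToString("; ", "=");
}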
71 table/table_reader.h Normal file
@@ -0,0 +1,71 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

namespace rocksdb {

class Iterator;
struct ParsedInternalKey;
class Slice;
struct ReadOptions;
struct TableProperties;

// A Table is a sorted map from strings to strings. Tables are
// immutable and persistent. A Table may be safely accessed from
// multiple threads without external synchronization.
class TableReader {
 public:
  virtual ~TableReader() {}

  // Determine whether there is a chance that the current table file
  // contains a key starting with internal_prefix. The specific
  // table implementation can use a bloom filter and/or other heuristics
  // to filter out this table as a whole.
  virtual bool PrefixMayMatch(const Slice& internal_prefix) = 0;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  virtual Iterator* NewIterator(const ReadOptions&) = 0;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;

  // Set up the table for Compaction. Might change some parameters with
  // posix_fadvise
  virtual void SetupForCompaction() = 0;

  virtual const TableProperties& GetTableProperties() = 0;

  // Calls (*result_handler)(handle_context, ...) repeatedly, starting with
  // the entry found after a call to Seek(key), until result_handler returns
  // false, where k is the actual internal key for a row found and v is the
  // value of the key. didIO is true if I/O is involved in the operation. May
  // not make such a call if the filter policy says that the key is not
  // present.
  //
  // mark_key_may_exist_handler needs to be called when it is configured to be
  // memory only and the key is not found in the block cache, with
  // the parameter to be handle_context.
  //
  // readOptions is the options for the read
  // key is the key to search for
  virtual Status Get(
      const ReadOptions& readOptions, const Slice& key, void* handle_context,
      bool (*result_handler)(void* arg, const ParsedInternalKey& k,
                             const Slice& v, bool didIO),
      void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0;
};

}  // namespace rocksdb
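A hedged sketch of a result_handler callback for Get(), following the contract above (return false to stop the scan). The context struct and the stop-on-different-user-key policy are illustrative choices, not part of this patch:

#include <string>
#include "db/dbformat.h"
#include "rocksdb/slice.h"
#include "table/table_reader.h"

namespace rocksdb {

// Hypothetical handle_context and handler for TableReader::Get().
struct GetContext {
  Slice user_key;     // key the caller is looking for
  std::string value;  // filled in on a match
  bool found = false;
};

static bool SaveValue(void* arg, const ParsedInternalKey& k, const Slice& v,
                      bool /*didIO*/) {
  auto* ctx = static_cast<GetContext*>(arg);
  if (k.user_key != ctx->user_key) {
    return false;  // moved past the requested user key; stop scanning
  }
  ctx->found = true;
  ctx->value.assign(v.data(), v.size());
  return false;  // first match is enough for this sketch
}

}  // namespace rocksdb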
@@ -6,12 +6,13 @@
 #include <gflags/gflags.h>

 #include "rocksdb/db.h"
-#include "rocksdb/table.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
 #include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "port/atomic_pointer.h"
 #include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
 #include "util/histogram.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -33,8 +34,8 @@ static std::string MakeKey(int i, int j, bool through_db) {
   return key.Encode().ToString();
 }

-static bool DummySaveValue(void* arg, const Slice& ikey, const Slice& v,
-                           bool didIO) {
+static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey,
+                           const Slice& v, bool didIO) {
   return false;
 }

@@ -70,7 +71,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   Status s;
   if (!through_db) {
     env->NewWritableFile(file_name, &file, env_options);
-    tb = opts.table_factory->GetTableBuilder(opts, file.get(),
+    tb = opts.table_factory->NewTableBuilder(opts, file.get(),
                                              CompressionType::kNoCompression);
   } else {
     s = DB::Open(opts, dbname, &db);
@@ -101,7 +102,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
     Status s = env->NewRandomAccessFile(file_name, &raf, env_options);
     uint64_t file_size;
     env->GetFileSize(file_name, &file_size);
-    s = opts.table_factory->GetTableReader(opts, env_options, std::move(raf),
+    s = opts.table_factory->NewTableReader(opts, env_options, std::move(raf),
                                            file_size, &table_reader);
   }

@@ -218,6 +219,8 @@ DEFINE_bool(iterator, false, "For test iterator");
 DEFINE_bool(through_db, false, "If enable, a DB instance will be created and "
             "the query will be against DB. Otherwise, will be directly against "
             "a table reader.");
+DEFINE_bool(plain_table, false, "Use PlainTable");
+

 int main(int argc, char** argv) {
   google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
@@ -230,10 +233,23 @@ int main(int argc, char** argv) {
     options.prefix_extractor = rocksdb::NewFixedPrefixTransform(
                                    FLAGS_prefix_len);
   }
-  options.SetUpDefaultFlushBlockPolicyFactory();
   rocksdb::ReadOptions ro;
   rocksdb::EnvOptions env_options;
   options.create_if_missing = true;
+  options.compression = rocksdb::CompressionType::kNoCompression;
+  options.internal_comparator =
+      new rocksdb::InternalKeyComparator(options.comparator);
+
+  if (FLAGS_plain_table) {
+    options.allow_mmap_reads = true;
+    env_options.use_mmap_reads = true;
+    tf = new rocksdb::PlainTableFactory(16, (FLAGS_prefix_len == 16) ? 0 : 8,
+                                        0.75);
+    options.prefix_extractor = rocksdb::NewFixedPrefixTransform(
+        FLAGS_prefix_len);
+  } else {
+    tf = new rocksdb::BlockBasedTableFactory();
+  }
   options.table_factory =
       std::shared_ptr<rocksdb::TableFactory>(tf);
   TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
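Note on the benchmark change above: the new --plain_table flag switches the table factory between BlockBasedTableFactory and PlainTableFactory. The following is a minimal sketch of the same option wiring, using only the calls visible in the diff; MakeBenchOptions is a hypothetical helper, and the PlainTableFactory arguments (user key length 16, bloom bits, 0.75 hash table ratio) are simply taken from the benchmark at this revision.

// Sketch only; mirrors the flag handling in the benchmark above.
#include <memory>
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "table/block_based_table_factory.h"
#include "table/plain_table_factory.h"

rocksdb::Options MakeBenchOptions(bool use_plain_table, int prefix_len) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compression = rocksdb::CompressionType::kNoCompression;
  rocksdb::TableFactory* tf = nullptr;
  if (use_plain_table) {
    // PlainTable needs mmap reads and a fixed-prefix extractor.
    options.allow_mmap_reads = true;
    tf = new rocksdb::PlainTableFactory(16, (prefix_len == 16) ? 0 : 8, 0.75);
    options.prefix_extractor = rocksdb::NewFixedPrefixTransform(prefix_len);
  } else {
    tf = new rocksdb::BlockBasedTableFactory();
  }
  options.table_factory = std::shared_ptr<rocksdb::TableFactory>(tf);
  return options;
}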
File diff suppressed because it is too large
@@ -20,18 +20,17 @@ namespace rocksdb {
 namespace {

 typedef Iterator* (*BlockFunction)(void*, const ReadOptions&,
-                                   const EnvOptions& soptions, const Slice&,
-                                   bool for_compaction);
+                                   const EnvOptions& soptions,
+                                   const InternalKeyComparator& icomparator,
+                                   const Slice&, bool for_compaction);

 class TwoLevelIterator: public Iterator {
  public:
-  TwoLevelIterator(
-      Iterator* index_iter,
-      BlockFunction block_function,
-      void* arg,
-      const ReadOptions& options,
-      const EnvOptions& soptions,
-      bool for_compaction);
+  TwoLevelIterator(Iterator* index_iter, BlockFunction block_function,
+                   void* arg, const ReadOptions& options,
+                   const EnvOptions& soptions,
+                   const InternalKeyComparator& internal_comparator,
+                   bool for_compaction);

   virtual ~TwoLevelIterator();

@@ -76,6 +75,7 @@ class TwoLevelIterator: public Iterator {
   void* arg_;
   const ReadOptions options_;
   const EnvOptions& soptions_;
+  const InternalKeyComparator& internal_comparator_;
   Status status_;
   IteratorWrapper index_iter_;
   IteratorWrapper data_iter_; // May be nullptr
@@ -86,20 +86,17 @@ class TwoLevelIterator: public Iterator {
 };

 TwoLevelIterator::TwoLevelIterator(
-    Iterator* index_iter,
-    BlockFunction block_function,
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
-    bool for_compaction)
+    Iterator* index_iter, BlockFunction block_function, void* arg,
+    const ReadOptions& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator, bool for_compaction)
     : block_function_(block_function),
       arg_(arg),
       options_(options),
       soptions_(soptions),
+      internal_comparator_(internal_comparator),
       index_iter_(index_iter),
       data_iter_(nullptr),
-      for_compaction_(for_compaction) {
-}
+      for_compaction_(for_compaction) {}

 TwoLevelIterator::~TwoLevelIterator() {
 }
@@ -181,8 +178,9 @@ void TwoLevelIterator::InitDataBlock() {
       // data_iter_ is already constructed with this iterator, so
       // no need to change anything
     } else {
-      Iterator* iter = (*block_function_)(arg_, options_, soptions_, handle,
-                                          for_compaction_);
+      Iterator* iter =
+          (*block_function_)(arg_, options_, soptions_, internal_comparator_,
+                             handle, for_compaction_);
       data_block_handle_.assign(handle.data(), handle.size());
       SetDataIterator(iter);
     }
@@ -191,15 +189,14 @@ void TwoLevelIterator::InitDataBlock() {

 } // namespace

-Iterator* NewTwoLevelIterator(
-    Iterator* index_iter,
-    BlockFunction block_function,
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
-    bool for_compaction) {
-  return new TwoLevelIterator(index_iter, block_function, arg,
-                              options, soptions, for_compaction);
+Iterator* NewTwoLevelIterator(Iterator* index_iter,
+                              BlockFunction block_function, void* arg,
+                              const ReadOptions& options,
+                              const EnvOptions& soptions,
+                              const InternalKeyComparator& internal_comparator,
+                              bool for_compaction) {
+  return new TwoLevelIterator(index_iter, block_function, arg, options,
+                              soptions, internal_comparator, for_compaction);
 }

 } // namespace rocksdb
@@ -14,6 +14,7 @@
 namespace rocksdb {

 struct ReadOptions;
+class InternalKeyComparator;

 // Return a new two level iterator. A two-level iterator contains an
 // index iterator whose values point to a sequence of blocks where
@@ -27,14 +28,11 @@ struct ReadOptions;
 extern Iterator* NewTwoLevelIterator(
     Iterator* index_iter,
     Iterator* (*block_function)(
-        void* arg,
-        const ReadOptions& options,
-        const EnvOptions& soptions,
-        const Slice& index_value,
-        bool for_compaction),
-    void* arg,
-    const ReadOptions& options,
-    const EnvOptions& soptions,
+        void* arg, const ReadOptions& options, const EnvOptions& soptions,
+        const InternalKeyComparator& internal_comparator,
+        const Slice& index_value, bool for_compaction),
+    void* arg, const ReadOptions& options, const EnvOptions& soptions,
+    const InternalKeyComparator& internal_comparator,
     bool for_compaction = false);

 } // namespace rocksdb
@@ -15,6 +15,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
 #include "table/block.h"
 #include "table/block_builder.h"
 #include "table/format.h"
@@ -38,22 +39,50 @@ class SstFileReader {
                         bool has_to,
                         const std::string& to_key);

+  Status ReadTableProperties(TableProperties* table_properties);
   uint64_t GetReadNumber() { return read_num_; }

  private:
+  Status NewTableReader(const std::string& file_path);
+
   std::string file_name_;
   uint64_t read_num_;
   bool verify_checksum_;
   bool output_hex_;
   EnvOptions soptions_;

+  Status init_result_;
+  unique_ptr<TableReader> table_reader_;
+  unique_ptr<RandomAccessFile> file_;
+  // table_options_ and internal_comparator_ will also be used in
+  // ReadSequential internally (specifically, seek-related operations)
+  Options table_options_;
+  InternalKeyComparator internal_comparator_;
 };

 SstFileReader::SstFileReader(const std::string& file_path,
                              bool verify_checksum,
                              bool output_hex)
     :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum),
-    output_hex_(output_hex) {
-  std::cout << "Process " << file_path << "\n";
+    output_hex_(output_hex), internal_comparator_(BytewiseComparator()) {
+  fprintf(stdout, "Process %s\n", file_path.c_str());
+
+  init_result_ = NewTableReader(file_name_);
+}
+
+Status SstFileReader::NewTableReader(const std::string& file_path) {
+  Status s = table_options_.env->NewRandomAccessFile(file_path, &file_,
+                                                     soptions_);
+  if (!s.ok()) {
+    return s;
+  }
+  uint64_t file_size;
+  table_options_.env->GetFileSize(file_path, &file_size);
+  unique_ptr<TableFactory> table_factory;
+  s = table_options_.table_factory->NewTableReader(
+      table_options_, soptions_, internal_comparator_, std::move(file_),
+      file_size, &table_reader_);
+  return s;
 }

 Status SstFileReader::ReadSequential(bool print_kv,
@@ -61,29 +90,12 @@ Status SstFileReader::ReadSequential(bool print_kv,
                                      bool has_from,
                                      const std::string& from_key,
                                      bool has_to,
-                                     const std::string& to_key)
-{
-  unique_ptr<TableReader> table_reader;
-  InternalKeyComparator internal_comparator_(BytewiseComparator());
-  Options table_options;
-  table_options.comparator = &internal_comparator_;
-  unique_ptr<RandomAccessFile> file;
-  Status s = table_options.env->NewRandomAccessFile(file_name_, &file,
-                                                    soptions_);
-  if(!s.ok()) {
-    return s;
-  }
-  uint64_t file_size;
-  table_options.env->GetFileSize(file_name_, &file_size);
-  unique_ptr<TableFactory> table_factory;
-  s = table_options.table_factory->GetTableReader(table_options, soptions_,
-                                                  std::move(file), file_size,
-                                                  &table_reader);
-  if(!s.ok()) {
-    return s;
+                                     const std::string& to_key) {
+  if (!table_reader_) {
+    return init_result_;
   }

-  Iterator* iter = table_reader->NewIterator(ReadOptions(verify_checksum_,
+  Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_,
                                                          false));
   uint64_t i = 0;
   if (has_from) {
@@ -113,21 +125,29 @@ Status SstFileReader::ReadSequential(bool print_kv,
     }

     if (print_kv) {
-      std::cout << ikey.DebugString(output_hex_)
-                << " => "
-                << value.ToString(output_hex_) << "\n";
+      fprintf(stdout, "%s => %s\n",
+              ikey.DebugString(output_hex_).c_str(),
+              value.ToString(output_hex_).c_str());
     }
   }
-  read_num_ += i;
+
+  read_num_ += i;
+
   Status ret = iter->status();
   delete iter;
   return ret;
 }

+Status SstFileReader::ReadTableProperties(TableProperties* table_properties) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  *table_properties = table_reader_->GetTableProperties();
+  return init_result_;
+}
+
 } // namespace rocksdb

 static void print_help() {
   fprintf(stderr,
@@ -137,7 +157,8 @@ static void print_help() {
           " [--input_key_hex]"
           " [--from=<user_key>]"
           " [--to=<user_key>]"
-          " [--read_num=NUM]\n");
+          " [--read_num=NUM]"
+          " [--show_properties]\n");
 }

 string HexToString(const string& str) {
@@ -158,7 +179,6 @@ string HexToString(const string& str) {
 }

 int main(int argc, char** argv) {
-
   const char* dir_or_file = nullptr;
   uint64_t read_num = -1;
   std::string command;
@@ -170,10 +190,10 @@ int main(int argc, char** argv) {
   bool input_key_hex = false;
   bool has_from = false;
   bool has_to = false;
+  bool show_properties = false;
   std::string from_key;
   std::string to_key;
-  for (int i = 1; i < argc; i++)
-  {
+  for (int i = 1; i < argc; i++) {
     if (strncmp(argv[i], "--file=", 7) == 0) {
       dir_or_file = argv[i] + 7;
     } else if (strcmp(argv[i], "--output_hex") == 0) {
@@ -194,7 +214,9 @@ int main(int argc, char** argv) {
     } else if (strncmp(argv[i], "--to=", 5) == 0) {
       to_key = argv[i] + 5;
       has_to = true;
-    }else {
+    } else if (strcmp(argv[i], "--show_properties") == 0) {
+      show_properties = true;
+    } else {
       print_help();
       exit(1);
     }
@@ -210,7 +232,7 @@ int main(int argc, char** argv) {
     }
   }

-  if(dir_or_file == nullptr) {
+  if (dir_or_file == nullptr) {
     print_help();
     exit(1);
   }
@@ -225,18 +247,19 @@ int main(int argc, char** argv) {
     dir = false;
   }

-  std::cout << "from [" << rocksdb::Slice(from_key).ToString(true)
-            << "] to [" << rocksdb::Slice(to_key).ToString(true) << "]\n";
+  fprintf(stdout, "from [%s] to [%s]\n",
+          rocksdb::Slice(from_key).ToString(true).c_str(),
+          rocksdb::Slice(to_key).ToString(true).c_str());
   uint64_t total_read = 0;
   for (size_t i = 0; i < filenames.size(); i++) {
     std::string filename = filenames.at(i);
     if (filename.length() <= 4 ||
         filename.rfind(".sst") != filename.length() - 4) {
-      //ignore
+      // ignore
       continue;
     }
-    if(dir) {
+    if (dir) {
       filename = std::string(dir_or_file) + "/" + filename;
     }
     rocksdb::SstFileReader reader(filename, verify_checksum,
@@ -257,5 +280,20 @@ int main(int argc, char** argv) {
         break;
       }
     }
+    if (show_properties) {
+      rocksdb::TableProperties table_properties;
+      st = reader.ReadTableProperties(&table_properties);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+      } else {
+        fprintf(stdout,
+                "Table Properties:\n"
+                "------------------------------\n"
+                "  %s", table_properties.ToString("\n  ", ": ").c_str());
+        fprintf(stdout, "# deleted keys: %zd\n",
+                rocksdb::GetDeletedKeys(
+                    table_properties.user_collected_properties));
+      }
+    }
   }
 }
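For reference, the new --show_properties path added above boils down to the call sequence below. This is a sketch only: SstFileReader is a class internal to this tool at this revision (not a public RocksDB API), and the file path is a placeholder.

// Hypothetical snippet following the tool's own code path shown above.
rocksdb::SstFileReader reader("/path/to/000001.sst", /*verify_checksum=*/false,
                              /*output_hex=*/false);
rocksdb::TableProperties table_properties;
rocksdb::Status st = reader.ReadTableProperties(&table_properties);
if (st.ok()) {
  // Same pretty-printing arguments as the tool uses above.
  fprintf(stdout, "%s\n", table_properties.ToString("\n  ", ": ").c_str());
}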
|
@ -7,19 +7,19 @@
|
|||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
#include "util/arena_impl.h"
|
#include "util/arena.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
|
|
||||||
const size_t ArenaImpl::kMinBlockSize = 4096;
|
const size_t Arena::kMinBlockSize = 4096;
|
||||||
const size_t ArenaImpl::kMaxBlockSize = 2 << 30;
|
const size_t Arena::kMaxBlockSize = 2 << 30;
|
||||||
static const int kAlignUnit = sizeof(void*);
|
static const int kAlignUnit = sizeof(void*);
|
||||||
|
|
||||||
size_t OptimizeBlockSize(size_t block_size) {
|
size_t OptimizeBlockSize(size_t block_size) {
|
||||||
// Make sure block_size is in optimal range
|
// Make sure block_size is in optimal range
|
||||||
block_size = std::max(ArenaImpl::kMinBlockSize, block_size);
|
block_size = std::max(Arena::kMinBlockSize, block_size);
|
||||||
block_size = std::min(ArenaImpl::kMaxBlockSize, block_size);
|
block_size = std::min(Arena::kMaxBlockSize, block_size);
|
||||||
|
|
||||||
// make sure block_size is the multiple of kAlignUnit
|
// make sure block_size is the multiple of kAlignUnit
|
||||||
if (block_size % kAlignUnit != 0) {
|
if (block_size % kAlignUnit != 0) {
|
||||||
@ -29,19 +29,18 @@ size_t OptimizeBlockSize(size_t block_size) {
|
|||||||
return block_size;
|
return block_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
ArenaImpl::ArenaImpl(size_t block_size)
|
Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) {
|
||||||
: kBlockSize(OptimizeBlockSize(block_size)) {
|
|
||||||
assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
|
assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
|
||||||
kBlockSize % kAlignUnit == 0);
|
kBlockSize % kAlignUnit == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
ArenaImpl::~ArenaImpl() {
|
Arena::~Arena() {
|
||||||
for (const auto& block : blocks_) {
|
for (const auto& block : blocks_) {
|
||||||
delete[] block;
|
delete[] block;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) {
|
char* Arena::AllocateFallback(size_t bytes, bool aligned) {
|
||||||
if (bytes > kBlockSize / 4) {
|
if (bytes > kBlockSize / 4) {
|
||||||
// Object is more than a quarter of our block size. Allocate it separately
|
// Object is more than a quarter of our block size. Allocate it separately
|
||||||
// to avoid wasting too much space in leftover bytes.
|
// to avoid wasting too much space in leftover bytes.
|
||||||
@ -63,7 +62,7 @@ char* ArenaImpl::AllocateFallback(size_t bytes, bool aligned) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
char* ArenaImpl::AllocateAligned(size_t bytes) {
|
char* Arena::AllocateAligned(size_t bytes) {
|
||||||
assert((kAlignUnit & (kAlignUnit - 1)) ==
|
assert((kAlignUnit & (kAlignUnit - 1)) ==
|
||||||
0); // Pointer size should be a power of 2
|
0); // Pointer size should be a power of 2
|
||||||
size_t current_mod =
|
size_t current_mod =
|
||||||
@ -83,7 +82,7 @@ char* ArenaImpl::AllocateAligned(size_t bytes) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
char* ArenaImpl::AllocateNewBlock(size_t block_bytes) {
|
char* Arena::AllocateNewBlock(size_t block_bytes) {
|
||||||
char* block = new char[block_bytes];
|
char* block = new char[block_bytes];
|
||||||
blocks_memory_ += block_bytes;
|
blocks_memory_ += block_bytes;
|
||||||
blocks_.push_back(block);
|
blocks_.push_back(block);
|
@@ -7,7 +7,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

-// ArenaImpl is an implementation of Arena class. For a request of small size,
+// Arena is an implementation of Arena class. For a request of small size,
 // it allocates a block with pre-defined block size. For a request of big
 // size, it uses malloc to directly get the requested size.

@@ -16,37 +16,35 @@
 #include <vector>
 #include <assert.h>
 #include <stdint.h>
-#include "rocksdb/arena.h"
+#include "util/arena.h"

 namespace rocksdb {

-class ArenaImpl : public Arena {
+class Arena {
  public:
   // No copying allowed
-  ArenaImpl(const ArenaImpl&) = delete;
-  void operator=(const ArenaImpl&) = delete;
+  Arena(const Arena&) = delete;
+  void operator=(const Arena&) = delete;

   static const size_t kMinBlockSize;
   static const size_t kMaxBlockSize;

-  explicit ArenaImpl(size_t block_size = kMinBlockSize);
-  virtual ~ArenaImpl();
+  explicit Arena(size_t block_size = kMinBlockSize);
+  ~Arena();

-  virtual char* Allocate(size_t bytes) override;
+  char* Allocate(size_t bytes);

-  virtual char* AllocateAligned(size_t bytes) override;
+  char* AllocateAligned(size_t bytes);

   // Returns an estimate of the total memory usage of data allocated
   // by the arena (exclude the space allocated but not yet used for future
   // allocations).
-  virtual const size_t ApproximateMemoryUsage() {
+  const size_t ApproximateMemoryUsage() {
     return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
            alloc_bytes_remaining_;
   }

-  virtual const size_t MemoryAllocatedBytes() override {
-    return blocks_memory_;
-  }
+  const size_t MemoryAllocatedBytes() { return blocks_memory_; }

  private:
   // Number of bytes allocated in one block
@@ -72,7 +70,7 @@ class Arena {
   size_t blocks_memory_ = 0;
 };

-inline char* ArenaImpl::Allocate(size_t bytes) {
+inline char* Arena::Allocate(size_t bytes) {
   // The semantics of what to return are a bit messy if we allow
   // 0-byte allocations, so we disallow them here (we don't need
   // them for our internal use).
@@ -7,34 +7,32 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

-#include "util/arena_impl.h"
+#include "util/arena.h"
 #include "util/random.h"
 #include "util/testharness.h"

 namespace rocksdb {

-class ArenaImplTest { };
+class ArenaTest {};

-TEST(ArenaImplTest, Empty) {
-  ArenaImpl arena0;
-}
+TEST(ArenaTest, Empty) { Arena arena0; }

-TEST(ArenaImplTest, MemoryAllocatedBytes) {
+TEST(ArenaTest, MemoryAllocatedBytes) {
   const int N = 17;
-  size_t req_sz;  //requested size
+  size_t req_sz;  // requested size
   size_t bsz = 8192;  // block size
   size_t expected_memory_allocated;

-  ArenaImpl arena_impl(bsz);
+  Arena arena(bsz);

   // requested size > quarter of a block:
   // allocate requested size separately
   req_sz = 3001;
   for (int i = 0; i < N; i++) {
-    arena_impl.Allocate(req_sz);
+    arena.Allocate(req_sz);
   }
   expected_memory_allocated = req_sz * N;
-  ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
+  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);

   // requested size < quarter of a block:
   // allocate a block with the default size, then try to use unused part
@@ -42,28 +40,28 @@ TEST(ArenaTest, MemoryAllocatedBytes) {
   // Allocate(99) call. All the remaining calls won't lead to new allocation.
   req_sz = 99;
   for (int i = 0; i < N; i++) {
-    arena_impl.Allocate(req_sz);
+    arena.Allocate(req_sz);
   }
   expected_memory_allocated += bsz;
-  ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
+  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);

   // requested size > quarter of a block:
   // allocate requested size separately
   req_sz = 99999999;
   for (int i = 0; i < N; i++) {
-    arena_impl.Allocate(req_sz);
+    arena.Allocate(req_sz);
   }
   expected_memory_allocated += req_sz * N;
-  ASSERT_EQ(arena_impl.MemoryAllocatedBytes(), expected_memory_allocated);
+  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
 }

 // Make sure we didn't count the allocate but not used memory space in
 // Arena::ApproximateMemoryUsage()
-TEST(ArenaImplTest, ApproximateMemoryUsageTest) {
+TEST(ArenaTest, ApproximateMemoryUsageTest) {
   const size_t kBlockSize = 4096;
   const size_t kEntrySize = kBlockSize / 8;
   const size_t kZero = 0;
-  ArenaImpl arena(kBlockSize);
+  Arena arena(kBlockSize);
   ASSERT_EQ(kZero, arena.ApproximateMemoryUsage());

   auto num_blocks = kBlockSize / kEntrySize;
@@ -83,9 +81,9 @@ TEST(ArenaTest, ApproximateMemoryUsageTest) {
   ASSERT_GT(usage, mem_usage);
 }

-TEST(ArenaImplTest, Simple) {
+TEST(ArenaTest, Simple) {
   std::vector<std::pair<size_t, char*>> allocated;
-  ArenaImpl arena_impl;
+  Arena arena;
   const int N = 100000;
   size_t bytes = 0;
   Random rnd(301);
@@ -104,9 +102,9 @@ TEST(ArenaTest, Simple) {
     }
     char* r;
     if (rnd.OneIn(10)) {
-      r = arena_impl.AllocateAligned(s);
+      r = arena.AllocateAligned(s);
     } else {
-      r = arena_impl.Allocate(s);
+      r = arena.Allocate(s);
     }

     for (unsigned int b = 0; b < s; b++) {
@@ -115,9 +113,9 @@ TEST(ArenaTest, Simple) {
     }
     bytes += s;
     allocated.push_back(std::make_pair(s, r));
-    ASSERT_GE(arena_impl.ApproximateMemoryUsage(), bytes);
+    ASSERT_GE(arena.ApproximateMemoryUsage(), bytes);
     if (i > N / 10) {
-      ASSERT_LE(arena_impl.ApproximateMemoryUsage(), bytes * 1.10);
+      ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10);
     }
   }
   for (unsigned int i = 0; i < allocated.size(); i++) {
@@ -132,6 +130,4 @@ TEST(ArenaTest, Simple) {

 } // namespace rocksdb

-int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
-}
+int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
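With ArenaImpl renamed to a concrete Arena class (and arena.h removed from the public headers), callers construct it directly. A minimal usage sketch based only on the methods exercised by the test above; ArenaSketch is a hypothetical helper, not part of the patch.

#include "util/arena.h"  // internal header at this revision, no longer exported

void ArenaSketch() {
  rocksdb::Arena arena(8192);        // block size is adjusted by OptimizeBlockSize()
  char* small = arena.Allocate(99);  // small request: carved out of the current block
  char* aligned = arena.AllocateAligned(256);
  (void)small;
  (void)aligned;
  size_t used = arena.ApproximateMemoryUsage();
  size_t allocated = arena.MemoryAllocatedBytes();
  (void)used;
  (void)allocated;
}  // every block is freed when `arena` goes out of scope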
@@ -57,11 +57,9 @@ class autovector {
     typedef std::random_access_iterator_tag iterator_category;

     iterator_impl(TAutoVector* vect, size_t index)
-      : vect_(vect)
-      , index_(index) {
-    };
+      : vect_(vect), index_(index) {};
     iterator_impl(const iterator_impl&) = default;
-    ~iterator_impl() { }
+    ~iterator_impl() {}
     iterator_impl& operator=(const iterator_impl&) = default;

     // -- Advancement
@@ -130,9 +128,7 @@ class autovector {
       return index_ == other.index_;
     }

-    bool operator!=(const self_type& other) const {
-      return !(*this == other);
-    }
+    bool operator!=(const self_type& other) const { return !(*this == other); }

     bool operator>(const self_type& other) const {
       assert(vect_ == other.vect_);
@@ -174,13 +170,9 @@ class autovector {
     return vect_.capacity() == 0;
   }

-  size_type size() const {
-    return num_stack_items_ + vect_.size();
-  }
+  size_type size() const { return num_stack_items_ + vect_.size(); }

-  bool empty() const {
-    return size() == 0;
-  }
+  bool empty() const { return size() == 0; }

   // will not check boundry
   const_reference operator[](size_type n) const {
@@ -235,11 +227,9 @@ class autovector {
     }
   }

-  void push_back(const T& item) {
-    push_back(value_type(item));
-  }
+  void push_back(const T& item) { push_back(value_type(item)); }

-  template<class... Args>
+  template <class... Args>
   void emplace_back(Args&&... args) {
     push_back(value_type(args...));
   }
@@ -261,13 +251,9 @@ class autovector {
   // -- Copy and Assignment
   autovector& assign(const autovector& other);

-  autovector(const autovector& other) {
-    assign(other);
-  }
+  autovector(const autovector& other) { assign(other); }

-  autovector& operator=(const autovector& other) {
-    return assign(other);
-  }
+  autovector& operator=(const autovector& other) { return assign(other); }

   // move operation are disallowed since it is very hard to make sure both
   // autovectors are allocated from the same function stack.
@@ -275,41 +261,29 @@ class autovector {
   autovector(autovector&& other) = delete;

   // -- Iterator Operations
-  iterator begin() {
-    return iterator(this, 0);
-  }
+  iterator begin() { return iterator(this, 0); }

-  const_iterator begin() const {
-    return const_iterator(this, 0);
-  }
+  const_iterator begin() const { return const_iterator(this, 0); }

-  iterator end() {
-    return iterator(this, this->size());
-  }
+  iterator end() { return iterator(this, this->size()); }

-  const_iterator end() const {
-    return const_iterator(this, this->size());
-  }
+  const_iterator end() const { return const_iterator(this, this->size()); }

-  reverse_iterator rbegin() {
-    return reverse_iterator(end());
-  }
+  reverse_iterator rbegin() { return reverse_iterator(end()); }

   const_reverse_iterator rbegin() const {
     return const_reverse_iterator(end());
   }

-  reverse_iterator rend() {
-    return reverse_iterator(begin());
-  }
+  reverse_iterator rend() { return reverse_iterator(begin()); }

   const_reverse_iterator rend() const {
     return const_reverse_iterator(begin());
   }

  private:
   size_type num_stack_items_ = 0;  // current number of items
   value_type values_[kSize];       // the first `kSize` items
   // used only if there are more than `kSize` items.
   std::vector<T> vect_;
 };
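The autovector reflowed above keeps the first kSize elements in an on-stack array and spills the rest into an internal std::vector. A small usage sketch with the single-template-argument form that the cache.cc change later in this diff uses (so the stack capacity falls back to the header's default); AutovectorSketch is a hypothetical helper.

#include "util/autovector.h"

void AutovectorSketch() {
  rocksdb::autovector<int> v;
  for (int i = 0; i < 10; ++i) {
    v.push_back(i);  // first few items live in values_[], the rest in vect_
  }
  int sum = 0;
  for (auto it = v.begin(); it != v.end(); ++it) {
    sum += *it;
  }
  (void)sum;
  bool was_empty = v.empty();   // size() == num_stack_items_ + vect_.size()
  (void)was_empty;
}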
@@ -7,12 +7,16 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

+#include <gflags/gflags.h>
+
 #include "rocksdb/filter_policy.h"

 #include "util/logging.h"
 #include "util/testharness.h"
 #include "util/testutil.h"

+DEFINE_int32(bits_per_key, 10, "");
+
 namespace rocksdb {

 static const int kVerbose = 1;
@@ -29,7 +33,7 @@ class BloomTest {
   std::vector<std::string> keys_;

  public:
-  BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
+  BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { }

   ~BloomTest() {
     delete policy_;
@@ -160,5 +164,7 @@ TEST(BloomTest, VaryingLengths) {
 } // namespace rocksdb

 int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
   return rocksdb::test::RunAllTests();
 }
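The bloom test above now takes bits-per-key from a gflag instead of hard-coding 10, so the false-positive rate can be swept from the command line. A sketch of the same construction outside the test harness, using only calls that appear in this diff:

#include <gflags/gflags.h>
#include "rocksdb/filter_policy.h"

DEFINE_int32(bits_per_key, 10, "");

int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  // More bits per key lowers the bloom filter's false-positive rate.
  const rocksdb::FilterPolicy* policy =
      rocksdb::NewBloomFilterPolicy(FLAGS_bits_per_key);
  // ... build and query filters as the test does ...
  delete policy;
  return 0;
}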
@@ -10,10 +10,10 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <vector>

 #include "rocksdb/cache.h"
 #include "port/port.h"
+#include "util/autovector.h"
 #include "util/hash.h"
 #include "util/mutexlock.h"

@@ -156,6 +156,13 @@ class LRUCache {
   Cache::Handle* Lookup(const Slice& key, uint32_t hash);
   void Release(Cache::Handle* handle);
   void Erase(const Slice& key, uint32_t hash);
+  // Although in some platforms the update of size_t is atomic, to make sure
+  // GetUsage() works correctly under any platforms, we'll protect this
+  // function with mutex.
+  size_t GetUsage() const {
+    MutexLock l(&mutex_);
+    return usage_;
+  }

  private:
   void LRU_Remove(LRUHandle* e);
@@ -171,7 +178,9 @@ class LRUCache {
   uint32_t remove_scan_count_limit_;

   // mutex_ protects the following state.
-  port::Mutex mutex_;
+  // We don't count mutex_ as the cache's internal state so semantically we
+  // don't mind mutex_ invoking the non-const actions.
+  mutable port::Mutex mutex_;
   size_t usage_;

   // Dummy head of LRU list.
@@ -255,8 +264,7 @@ Cache::Handle* LRUCache::Insert(

   LRUHandle* e = reinterpret_cast<LRUHandle*>(
       malloc(sizeof(LRUHandle)-1 + key.size()));
-  std::vector<LRUHandle*> last_reference_list;
-  last_reference_list.reserve(1);
+  autovector<LRUHandle*> last_reference_list;

   e->value = value;
   e->deleter = deleter;
@@ -342,10 +350,10 @@ static int kRemoveScanCountLimit = 0; // default values, can be overridden

 class ShardedLRUCache : public Cache {
  private:
-  LRUCache* shard_;
+  LRUCache* shards_;
   port::Mutex id_mutex_;
   uint64_t last_id_;
-  int numShardBits;
+  int num_shard_bits_;
   size_t capacity_;

   static inline uint32_t HashSlice(const Slice& s) {
@@ -354,18 +362,18 @@ class ShardedLRUCache : public Cache {

   uint32_t Shard(uint32_t hash) {
     // Note, hash >> 32 yields hash in gcc, not the zero we expect!
-    return (numShardBits > 0) ? (hash >> (32 - numShardBits)) : 0;
+    return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0;
   }

   void init(size_t capacity, int numbits, int removeScanCountLimit) {
-    numShardBits = numbits;
+    num_shard_bits_ = numbits;
     capacity_ = capacity;
-    int numShards = 1 << numShardBits;
-    shard_ = new LRUCache[numShards];
-    const size_t per_shard = (capacity + (numShards - 1)) / numShards;
-    for (int s = 0; s < numShards; s++) {
-      shard_[s].SetCapacity(per_shard);
-      shard_[s].SetRemoveScanCountLimit(removeScanCountLimit);
+    int num_shards = 1 << num_shard_bits_;
+    shards_ = new LRUCache[num_shards];
+    const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
+    for (int s = 0; s < num_shards; s++) {
+      shards_[s].SetCapacity(per_shard);
+      shards_[s].SetRemoveScanCountLimit(removeScanCountLimit);
     }
   }

@@ -374,30 +382,30 @@ class ShardedLRUCache : public Cache {
       : last_id_(0) {
     init(capacity, kNumShardBits, kRemoveScanCountLimit);
   }
-  ShardedLRUCache(size_t capacity, int numShardBits,
+  ShardedLRUCache(size_t capacity, int num_shard_bits,
                   int removeScanCountLimit)
       : last_id_(0) {
-    init(capacity, numShardBits, removeScanCountLimit);
+    init(capacity, num_shard_bits, removeScanCountLimit);
   }
   virtual ~ShardedLRUCache() {
-    delete[] shard_;
+    delete[] shards_;
   }
   virtual Handle* Insert(const Slice& key, void* value, size_t charge,
                          void (*deleter)(const Slice& key, void* value)) {
     const uint32_t hash = HashSlice(key);
-    return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter);
+    return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter);
   }
   virtual Handle* Lookup(const Slice& key) {
     const uint32_t hash = HashSlice(key);
-    return shard_[Shard(hash)].Lookup(key, hash);
+    return shards_[Shard(hash)].Lookup(key, hash);
   }
   virtual void Release(Handle* handle) {
     LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
-    shard_[Shard(h->hash)].Release(handle);
+    shards_[Shard(h->hash)].Release(handle);
   }
   virtual void Erase(const Slice& key) {
     const uint32_t hash = HashSlice(key);
-    shard_[Shard(hash)].Erase(key, hash);
+    shards_[Shard(hash)].Erase(key, hash);
   }
   virtual void* Value(Handle* handle) {
     return reinterpret_cast<LRUHandle*>(handle)->value;
@@ -406,11 +414,23 @@ class ShardedLRUCache : public Cache {
     MutexLock l(&id_mutex_);
     return ++(last_id_);
   }
-  virtual size_t GetCapacity() {
+  virtual size_t GetCapacity() const {
     return capacity_;
   }

+  virtual size_t GetUsage() const {
+    // We will not lock the cache when getting the usage from shards.
+    // for (size_t i = 0; i < num_shard_bits_; ++i)
+    int num_shards = 1 << num_shard_bits_;
+    size_t usage = 0;
+    for (int s = 0; s < num_shards; s++) {
+      usage += shards_[s].GetUsage();
+    }
+    return usage;
+  }
+
   virtual void DisownData() {
-    shard_ = nullptr;
+    shards_ = nullptr;
   }
 };

@@ -420,17 +440,17 @@ shared_ptr<Cache> NewLRUCache(size_t capacity) {
   return NewLRUCache(capacity, kNumShardBits);
 }

-shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits) {
-  return NewLRUCache(capacity, numShardBits, kRemoveScanCountLimit);
+shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits) {
+  return NewLRUCache(capacity, num_shard_bits, kRemoveScanCountLimit);
 }

-shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits,
                               int removeScanCountLimit) {
-  if (numShardBits >= 20) {
+  if (num_shard_bits >= 20) {
     return nullptr;  // the cache cannot be sharded into too many fine pieces
   }
   return std::make_shared<ShardedLRUCache>(capacity,
-                                           numShardBits,
+                                           num_shard_bits,
                                            removeScanCountLimit);
 }

@@ -107,6 +107,39 @@ class CacheTest {
 };
 CacheTest* CacheTest::current_;

+void dumbDeleter(const Slice& key, void* value) { }
+
+TEST(CacheTest, UsageTest) {
+  // cache is shared_ptr and will be automatically cleaned up.
+  const uint64_t kCapacity = 100000;
+  auto cache = NewLRUCache(kCapacity, 8, 200);
+
+  size_t usage = 0;
+  const char* value = "abcdef";
+  // make sure everything will be cached
+  for (int i = 1; i < 100; ++i) {
+    std::string key(i, 'a');
+    auto kv_size = key.size() + 5;
+    cache->Release(
+        cache->Insert(key, (void*)value, kv_size, dumbDeleter)
+    );
+    usage += kv_size;
+    ASSERT_EQ(usage, cache->GetUsage());
+  }
+
+  // make sure the cache will be overloaded
+  for (uint64_t i = 1; i < kCapacity; ++i) {
+    auto key = std::to_string(i);
+    cache->Release(
+        cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter)
+    );
+  }
+
+  // the usage should be close to the capacity
+  ASSERT_GT(kCapacity, cache->GetUsage());
+  ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
+}
+
 TEST(CacheTest, HitAndMiss) {
   ASSERT_EQ(-1, Lookup(100));

@@ -353,7 +386,6 @@ void deleter(const Slice& key, void* value) {
   delete (Value *)value;
 }

-
 TEST(CacheTest, BadEviction) {
   int n = 10;
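The new Cache::GetUsage() above sums per-shard usage, with each LRUCache shard reading its counter under its own mutex. A small sketch mirroring the UsageTest added in the diff; DumbDeleter and CacheUsageSketch are illustrative names, and the NewLRUCache arguments (capacity, shard bits, remove-scan limit) follow the test.

#include <string>
#include "rocksdb/cache.h"
#include "rocksdb/slice.h"

static void DumbDeleter(const rocksdb::Slice& key, void* value) {}

void CacheUsageSketch() {
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(100000, 8, 200);
  static const char* value = "abcdef";
  std::string key = "some-key";
  size_t charge = key.size() + 5;
  // Insert returns a handle that must be released; the charge is what
  // GetUsage() accumulates.
  cache->Release(cache->Insert(key, (void*)value, charge, DumbDeleter));
  size_t usage = cache->GetUsage();  // roughly the sum of live charges
  (void)usage;
}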
196 util/coding.cc

@@ -9,131 +9,41 @@

 #include "util/coding.h"

+#include <algorithm>
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"

-
-#include <algorithm>
-
 namespace rocksdb {

-void EncodeFixed32(char* buf, uint32_t value) {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-  memcpy(buf, &value, sizeof(value));
-#else
-  buf[0] = value & 0xff;
-  buf[1] = (value >> 8) & 0xff;
-  buf[2] = (value >> 16) & 0xff;
-  buf[3] = (value >> 24) & 0xff;
-#endif
-}
-
-void EncodeFixed64(char* buf, uint64_t value) {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-  memcpy(buf, &value, sizeof(value));
-#else
-  buf[0] = value & 0xff;
-  buf[1] = (value >> 8) & 0xff;
-  buf[2] = (value >> 16) & 0xff;
-  buf[3] = (value >> 24) & 0xff;
-  buf[4] = (value >> 32) & 0xff;
-  buf[5] = (value >> 40) & 0xff;
-  buf[6] = (value >> 48) & 0xff;
-  buf[7] = (value >> 56) & 0xff;
-#endif
-}
-
-void PutFixed32(std::string* dst, uint32_t value) {
-  char buf[sizeof(value)];
-  EncodeFixed32(buf, value);
-  dst->append(buf, sizeof(buf));
-}
-
-void PutFixed64(std::string* dst, uint64_t value) {
-  char buf[sizeof(value)];
-  EncodeFixed64(buf, value);
-  dst->append(buf, sizeof(buf));
-}
-
 char* EncodeVarint32(char* dst, uint32_t v) {
   // Operate on characters as unsigneds
   unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
   static const int B = 128;
-  if (v < (1<<7)) {
+  if (v < (1 << 7)) {
     *(ptr++) = v;
-  } else if (v < (1<<14)) {
+  } else if (v < (1 << 14)) {
     *(ptr++) = v | B;
-    *(ptr++) = v>>7;
-  } else if (v < (1<<21)) {
+    *(ptr++) = v >> 7;
+  } else if (v < (1 << 21)) {
     *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = v>>14;
-  } else if (v < (1<<28)) {
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = v >> 14;
+  } else if (v < (1 << 28)) {
     *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = (v>>14) | B;
-    *(ptr++) = v>>21;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = v >> 21;
   } else {
     *(ptr++) = v | B;
-    *(ptr++) = (v>>7) | B;
-    *(ptr++) = (v>>14) | B;
-    *(ptr++) = (v>>21) | B;
-    *(ptr++) = v>>28;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = (v >> 21) | B;
+    *(ptr++) = v >> 28;
   }
   return reinterpret_cast<char*>(ptr);
 }

-void PutVarint32(std::string* dst, uint32_t v) {
-  char buf[5];
-  char* ptr = EncodeVarint32(buf, v);
-  dst->append(buf, ptr - buf);
-}
-
-char* EncodeVarint64(char* dst, uint64_t v) {
-  static const unsigned int B = 128;
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
-  while (v >= B) {
-    *(ptr++) = (v & (B-1)) | B;
-    v >>= 7;
-  }
-  *(ptr++) = static_cast<unsigned char>(v);
-  return reinterpret_cast<char*>(ptr);
-}
-
-void PutVarint64(std::string* dst, uint64_t v) {
-  char buf[10];
-  char* ptr = EncodeVarint64(buf, v);
-  dst->append(buf, ptr - buf);
-}
-
-void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
-  PutVarint32(dst, value.size());
-  dst->append(value.data(), value.size());
-}
-
-void PutLengthPrefixedSliceParts(std::string* dst,
-                                 const SliceParts& slice_parts) {
-  uint32_t total_bytes = 0;
-  for (int i = 0; i < slice_parts.num_parts; ++i) {
-    total_bytes += slice_parts.parts[i].size();
-  }
-  PutVarint32(dst, total_bytes);
-  for (int i = 0; i < slice_parts.num_parts; ++i) {
-    dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size());
-  }
-}
-
-int VarintLength(uint64_t v) {
-  int len = 1;
-  while (v >= 128) {
-    v >>= 7;
-    len++;
-  }
-  return len;
-}
-
-const char* GetVarint32PtrFallback(const char* p,
-                                   const char* limit,
+const char* GetVarint32PtrFallback(const char* p, const char* limit,
                                    uint32_t* value) {
   uint32_t result = 0;
   for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
@@ -151,18 +61,6 @@ const char* GetVarint32PtrFallback(const char* p, const char* limit,
   return nullptr;
 }

-bool GetVarint32(Slice* input, uint32_t* value) {
-  const char* p = input->data();
-  const char* limit = p + input->size();
-  const char* q = GetVarint32Ptr(p, limit, value);
-  if (q == nullptr) {
-    return false;
-  } else {
-    *input = Slice(q, limit - q);
-    return true;
-  }
-}
-
 const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
   uint64_t result = 0;
   for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
@@ -180,58 +78,6 @@ const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
   return nullptr;
 }

-bool GetVarint64(Slice* input, uint64_t* value) {
-  const char* p = input->data();
-  const char* limit = p + input->size();
-  const char* q = GetVarint64Ptr(p, limit, value);
-  if (q == nullptr) {
-    return false;
-  } else {
-    *input = Slice(q, limit - q);
-    return true;
-  }
-}
-
-const char* GetLengthPrefixedSlice(const char* p, const char* limit,
-                                   Slice* result) {
-  uint32_t len;
-  p = GetVarint32Ptr(p, limit, &len);
-  if (p == nullptr) return nullptr;
-  if (p + len > limit) return nullptr;
-  *result = Slice(p, len);
-  return p + len;
-}
-
-bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
-  uint32_t len;
-  if (GetVarint32(input, &len) &&
-      input->size() >= len) {
-    *result = Slice(input->data(), len);
-    input->remove_prefix(len);
-    return true;
-  } else {
-    return false;
-  }
-}
-
-Slice GetLengthPrefixedSlice(const char* data) {
-  uint32_t len;
-  const char* p = data;
-  p = GetVarint32Ptr(p, p + 5, &len);  // +5: we assume "p" is not corrupted
-  return Slice(p, len);
-}
-
-Slice GetSliceUntil(Slice* slice, char delimiter) {
-  uint32_t len;
-  for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) {
-    // nothing
-  }
-
-  Slice ret(slice->data(), len);
-  slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0));
-  return ret;
-}
-
 void BitStreamPutInt(char* dst, size_t dstlen, size_t offset,
                      uint32_t bits, uint64_t value) {
   assert((offset + bits + 7)/8 <= dstlen);
@@ -320,14 +166,4 @@ void BitStreamPutInt(std::string* dst, size_t offset, uint32_t bits,
                      BitStreamGetInt(dst, offset, bits));
 }

-uint64_t BitStreamGetInt(const std::string* src, size_t offset,
-                         uint32_t bits) {
-  return BitStreamGetInt(src->data(), src->size(), offset, bits);
-}
-
-uint64_t BitStreamGetInt(const Slice* src, size_t offset,
-                         uint32_t bits) {
-  return BitStreamGetInt(src->data(), src->size(), offset, bits);
-}
-
 } // namespace rocksdb
util/coding.h (153 lines changed)
@@ -13,6 +13,7 @@
 // * Strings are encoded prefixed by their length in varint format

 #pragma once
+#include <algorithm>
 #include <stdint.h>
 #include <string.h>
 #include <string>
@@ -40,6 +41,7 @@ extern void PutLengthPrefixedSliceParts(std::string* dst,
 extern bool GetVarint32(Slice* input, uint32_t* value);
 extern bool GetVarint64(Slice* input, uint64_t* value);
 extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+// This function assumes data is well-formed.
 extern Slice GetLengthPrefixedSlice(const char* data);

 extern Slice GetSliceUntil(Slice* slice, char delimiter);
@@ -138,4 +140,155 @@ extern uint64_t BitStreamGetInt(const std::string* src, size_t offset,
 extern uint64_t BitStreamGetInt(const Slice* src, size_t offset,
                                 uint32_t bits);

+// -- Implementation of the functions declared above
+inline void EncodeFixed32(char* buf, uint32_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+#endif
+}
+
+inline void EncodeFixed64(char* buf, uint64_t value) {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+  memcpy(buf, &value, sizeof(value));
+#else
+  buf[0] = value & 0xff;
+  buf[1] = (value >> 8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+  buf[3] = (value >> 24) & 0xff;
+  buf[4] = (value >> 32) & 0xff;
+  buf[5] = (value >> 40) & 0xff;
+  buf[6] = (value >> 48) & 0xff;
+  buf[7] = (value >> 56) & 0xff;
+#endif
+}
+
+inline void PutFixed32(std::string* dst, uint32_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed32(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+inline void PutFixed64(std::string* dst, uint64_t value) {
+  char buf[sizeof(value)];
+  EncodeFixed64(buf, value);
+  dst->append(buf, sizeof(buf));
+}
+
+inline void PutVarint32(std::string* dst, uint32_t v) {
+  char buf[5];
+  char* ptr = EncodeVarint32(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+inline char* EncodeVarint64(char* dst, uint64_t v) {
+  static const unsigned int B = 128;
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  while (v >= B) {
+    *(ptr++) = (v & (B - 1)) | B;
+    v >>= 7;
+  }
+  *(ptr++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(ptr);
+}
+
+inline void PutVarint64(std::string* dst, uint64_t v) {
+  char buf[10];
+  char* ptr = EncodeVarint64(buf, v);
+  dst->append(buf, ptr - buf);
+}
+
+inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  PutVarint32(dst, value.size());
+  dst->append(value.data(), value.size());
+}
+
+inline void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts) {
+  uint32_t total_bytes = 0;
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    total_bytes += slice_parts.parts[i].size();
+  }
+  PutVarint32(dst, total_bytes);
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size());
+  }
+}
+
+inline int VarintLength(uint64_t v) {
+  int len = 1;
+  while (v >= 128) {
+    v >>= 7;
+    len++;
+  }
+  return len;
+}
+
+inline bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint32Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+inline bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint64Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, limit - q);
+    return true;
+  }
+}
+
+inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
+  uint32_t len = 0;
+  if (GetVarint32(input, &len) && input->size() >= len) {
+    *result = Slice(input->data(), len);
+    input->remove_prefix(len);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+inline Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t len = 0;
+  // +5: we assume "data" is not corrupted
+  auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len);
+  return Slice(p, len);
+}
+
+inline Slice GetSliceUntil(Slice* slice, char delimiter) {
+  uint32_t len = 0;
+  for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) {
+    // nothing
+  }
+
+  Slice ret(slice->data(), len);
+  slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0));
+  return ret;
+}
+
+inline uint64_t BitStreamGetInt(const std::string* src, size_t offset,
+                                uint32_t bits) {
+  return BitStreamGetInt(src->data(), src->size(), offset, bits);
+}
+
+inline uint64_t BitStreamGetInt(const Slice* src, size_t offset,
+                                uint32_t bits) {
+  return BitStreamGetInt(src->data(), src->size(), offset, bits);
+}
+
 }  // namespace rocksdb
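For reference, a minimal round-trip sketch using the varint helpers inlined above. It is not part of the diff and only calls functions declared in util/coding.h:

  #include <cassert>
  #include <string>
  #include "rocksdb/slice.h"
  #include "util/coding.h"

  void VarintRoundTripSketch() {
    std::string buf;
    rocksdb::PutVarint32(&buf, 300);         // 300 needs two varint bytes
    rocksdb::PutVarint64(&buf, 1ull << 40);  // 2^40 needs six varint bytes
    rocksdb::Slice in(buf);
    uint32_t v32 = 0;
    uint64_t v64 = 0;
    assert(rocksdb::GetVarint32(&in, &v32) && v32 == 300);
    assert(rocksdb::GetVarint64(&in, &v64) && v64 == (1ull << 40));
    assert(in.size() == 0);  // both values consumed from the slice
  }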
util/coding_test.cc
@@ -41,7 +41,7 @@ TEST(Coding, Fixed64) {
   const char* p = s.data();
   for (int power = 0; power <= 63; power++) {
     uint64_t v = static_cast<uint64_t>(1) << power;
-    uint64_t actual;
+    uint64_t actual = 0;
     actual = DecodeFixed64(p);
     ASSERT_EQ(v-1, actual);
     p += sizeof(uint64_t);
@@ -90,7 +90,7 @@ TEST(Coding, Varint32) {
   const char* limit = p + s.size();
   for (uint32_t i = 0; i < (32 * 32); i++) {
     uint32_t expected = (i / 32) << (i % 32);
-    uint32_t actual;
+    uint32_t actual = 0;
     const char* start = p;
     p = GetVarint32Ptr(p, limit, &actual);
     ASSERT_TRUE(p != nullptr);
@@ -125,7 +125,7 @@ TEST(Coding, Varint64) {
   const char* limit = p + s.size();
   for (unsigned int i = 0; i < values.size(); i++) {
     ASSERT_TRUE(p < limit);
-    uint64_t actual;
+    uint64_t actual = 0;
     const char* start = p;
     p = GetVarint64Ptr(p, limit, &actual);
     ASSERT_TRUE(p != nullptr);
util/dynamic_bloom.cc (new file, 36 lines)

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include "dynamic_bloom.h"

#include "rocksdb/slice.h"
#include "util/hash.h"

namespace rocksdb {

namespace {
static uint32_t BloomHash(const Slice& key) {
  return Hash(key.data(), key.size(), 0xbc9f1d34);
}
}

DynamicBloom::DynamicBloom(uint32_t total_bits,
                           uint32_t (*hash_func)(const Slice& key),
                           uint32_t num_probes)
    : hash_func_(hash_func),
      kTotalBits((total_bits + 7) / 8 * 8),
      kNumProbes(num_probes) {
  assert(hash_func_);
  assert(kNumProbes > 0);
  assert(kTotalBits > 0);
  data_.reset(new unsigned char[kTotalBits / 8]());
}

DynamicBloom::DynamicBloom(uint32_t total_bits,
                           uint32_t num_probes)
    : DynamicBloom(total_bits, &BloomHash, num_probes) {
}

}  // rocksdb
util/dynamic_bloom.h (new file, 72 lines)

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#pragma once

#include <atomic>
#include <memory>

namespace rocksdb {

class Slice;

class DynamicBloom {
 public:
  // total_bits: fixed total bits for the bloom
  // hash_func:  customized hash function
  // num_probes: number of hash probes for a single key
  DynamicBloom(uint32_t total_bits,
               uint32_t (*hash_func)(const Slice& key),
               uint32_t num_probes = 6);

  explicit DynamicBloom(uint32_t total_bits, uint32_t num_probes = 6);

  // Assuming single threaded access to this function.
  void Add(const Slice& key);

  // Assuming single threaded access to this function.
  void AddHash(uint32_t hash);

  // Multithreaded access to this function is OK
  bool MayContain(const Slice& key);

  // Multithreaded access to this function is OK
  bool MayContainHash(uint32_t hash);

 private:
  uint32_t (*hash_func_)(const Slice& key);
  const uint32_t kTotalBits;
  const uint32_t kNumProbes;
  std::unique_ptr<unsigned char[]> data_;
};

inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }

inline bool DynamicBloom::MayContain(const Slice& key) {
  return (MayContainHash(hash_func_(key)));
}

inline bool DynamicBloom::MayContainHash(uint32_t h) {
  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
  for (uint32_t i = 0; i < kNumProbes; i++) {
    const uint32_t bitpos = h % kTotalBits;
    if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
      return false;
    }
    h += delta;
  }
  return true;
}

inline void DynamicBloom::AddHash(uint32_t h) {
  const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
  for (uint32_t i = 0; i < kNumProbes; i++) {
    const uint32_t bitpos = h % kTotalBits;
    data_[bitpos / 8] |= (1 << (bitpos % 8));
    h += delta;
  }
}

}  // rocksdb
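A minimal usage sketch for the new DynamicBloom, using only the names from the header above; the test file below exercises the same calls:

  #include "rocksdb/slice.h"
  #include "util/dynamic_bloom.h"

  void DynamicBloomSketch() {
    // Roughly 10 bits per key for an expected 1000 keys, with the default 6 probes.
    rocksdb::DynamicBloom bloom(10 * 1000 /* total_bits */, 6 /* num_probes */);
    bloom.Add(rocksdb::Slice("key1"));
    bool maybe = bloom.MayContain(rocksdb::Slice("key1"));    // always true for added keys
    bool other = bloom.MayContain(rocksdb::Slice("absent"));  // false with high probability
    (void)maybe;
    (void)other;
  }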
util/dynamic_bloom_test.cc (new file, 113 lines)

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#include <gflags/gflags.h>

#include "dynamic_bloom.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"

DEFINE_int32(bits_per_key, 10, "");
DEFINE_int32(num_probes, 6, "");

namespace rocksdb {

static Slice Key(int i, char* buffer) {
  memcpy(buffer, &i, sizeof(i));
  return Slice(buffer, sizeof(i));
}

class DynamicBloomTest {
};

TEST(DynamicBloomTest, EmptyFilter) {
  DynamicBloom bloom(100, 2);
  ASSERT_TRUE(! bloom.MayContain("hello"));
  ASSERT_TRUE(! bloom.MayContain("world"));
}

TEST(DynamicBloomTest, Small) {
  DynamicBloom bloom(100, 2);
  bloom.Add("hello");
  bloom.Add("world");
  ASSERT_TRUE(bloom.MayContain("hello"));
  ASSERT_TRUE(bloom.MayContain("world"));
  ASSERT_TRUE(! bloom.MayContain("x"));
  ASSERT_TRUE(! bloom.MayContain("foo"));
}

static int NextLength(int length) {
  if (length < 10) {
    length += 1;
  } else if (length < 100) {
    length += 10;
  } else if (length < 1000) {
    length += 100;
  } else {
    length += 1000;
  }
  return length;
}

TEST(DynamicBloomTest, VaryingLengths) {
  char buffer[sizeof(int)];

  // Count number of filters that significantly exceed the false positive rate
  int mediocre_filters = 0;
  int good_filters = 0;

  fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
          FLAGS_bits_per_key, FLAGS_num_probes);

  for (int length = 1; length <= 10000; length = NextLength(length)) {
    uint32_t bloom_bits = std::max(length * FLAGS_bits_per_key, 64);
    DynamicBloom bloom(bloom_bits, FLAGS_num_probes);
    for (int i = 0; i < length; i++) {
      bloom.Add(Key(i, buffer));
      ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
    }

    // All added keys must match
    for (int i = 0; i < length; i++) {
      ASSERT_TRUE(bloom.MayContain(Key(i, buffer)))
        << "Length " << length << "; key " << i;
    }

    // Check false positive rate

    int result = 0;
    for (int i = 0; i < 10000; i++) {
      if (bloom.MayContain(Key(i + 1000000000, buffer))) {
        result++;
      }
    }
    double rate = result / 10000.0;

    fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; \n",
            rate*100.0, length);

    //ASSERT_LE(rate, 0.02);  // Must not be over 2%
    if (rate > 0.0125)
      mediocre_filters++;  // Allowed, but not too often
    else
      good_filters++;
  }

  fprintf(stderr, "Filters: %d good, %d mediocre\n",
          good_filters, mediocre_filters);

  ASSERT_LE(mediocre_filters, good_filters/5);
}

// Different bits-per-byte

}  // namespace rocksdb

int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);

  return rocksdb::test::RunAllTests();
}
util/env_posix.cc
@@ -306,7 +306,13 @@ class PosixMmapReadableFile: public RandomAccessFile {
     assert(options.use_mmap_reads);
     assert(options.use_os_buffer);
   }
-  virtual ~PosixMmapReadableFile() { munmap(mmapped_region_, length_); }
+  virtual ~PosixMmapReadableFile() {
+    int ret = munmap(mmapped_region_, length_);
+    if (ret != 0) {
+      fprintf(stdout, "failed to munmap %p length %zu \n",
+              mmapped_region_, length_);
+    }
+  }

   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {
util/hash_linklist_rep.cc (new file, 470 lines)

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//

#include "util/hash_linklist_rep.h"

#include "rocksdb/memtablerep.h"
#include "util/arena.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "port/port.h"
#include "port/atomic_pointer.h"
#include "util/murmurhash.h"
#include "db/memtable.h"
#include "db/skiplist.h"

namespace rocksdb {
namespace {

typedef const char* Key;

struct Node {
  explicit Node(const Key& k) :
    key(k) {
  }

  Key const key;

  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next() {
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return reinterpret_cast<Node*>(next_.Acquire_Load());
  }
  void SetNext(Node* x) {
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_.Release_Store(x);
  }

  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next() {
    return reinterpret_cast<Node*>(next_.NoBarrier_Load());
  }
  void NoBarrier_SetNext(Node* x) {
    next_.NoBarrier_Store(x);
  }

 private:
  port::AtomicPointer next_;
};

class HashLinkListRep : public MemTableRep {
 public:
  HashLinkListRep(MemTableRep::KeyComparator& compare, Arena* arena,
                  const SliceTransform* transform, size_t bucket_size);

  virtual void Insert(const char* key) override;

  virtual bool Contains(const char* key) const override;

  virtual size_t ApproximateMemoryUsage() override;

  virtual ~HashLinkListRep();

  virtual MemTableRep::Iterator* GetIterator() override;

  virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;

  virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix)
      override;

  virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;

 private:
  friend class DynamicIterator;
  typedef SkipList<const char*, MemTableRep::KeyComparator&> FullList;

  size_t bucket_size_;

  // Maps slices (which are transformed user keys) to buckets of keys sharing
  // the same transform.
  port::AtomicPointer* buckets_;

  // The user-supplied transform whose domain is the user keys.
  const SliceTransform* transform_;

  MemTableRep::KeyComparator& compare_;
  // immutable after construction
  Arena* const arena_;

  bool BucketContains(Node* head, const Slice& key) const;

  Slice GetPrefix(const Slice& internal_key) const {
    return transform_->Transform(ExtractUserKey(internal_key));
  }

  size_t GetHash(const Slice& slice) const {
    return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_;
  }

  Node* GetBucket(size_t i) const {
    return static_cast<Node*>(buckets_[i].Acquire_Load());
  }

  Node* GetBucket(const Slice& slice) const {
    return GetBucket(GetHash(slice));
  }

  Node* NewNode(const Key& key) {
    char* mem = arena_->AllocateAligned(sizeof(Node));
    return new (mem) Node(key);
  }

  bool Equal(const Slice& a, const Key& b) const {
    return (compare_(b, a) == 0);
  }

  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
  }

  bool KeyIsAfterNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) < 0);
  }

  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;

  class FullListIterator : public MemTableRep::Iterator {
   public:
    explicit FullListIterator(FullList* list)
      : iter_(list), full_list_(list) {}

    virtual ~FullListIterator() {
    }

    // Returns true iff the iterator is positioned at a valid node.
    virtual bool Valid() const {
      return iter_.Valid();
    }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    virtual const char* key() const {
      assert(Valid());
      return iter_.key();
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    virtual void Next() {
      assert(Valid());
      iter_.Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    virtual void Prev() {
      assert(Valid());
      iter_.Prev();
    }

    // Advance to the first entry with a key >= target
    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
      const char* encoded_key =
          (memtable_key != nullptr) ?
              memtable_key : EncodeKey(&tmp_, internal_key);
      iter_.Seek(encoded_key);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToFirst() {
      iter_.SeekToFirst();
    }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToLast() {
      iter_.SeekToLast();
    }
   private:
    FullList::Iterator iter_;
    // To destruct with the iterator.
    std::unique_ptr<FullList> full_list_;
    std::string tmp_;       // For passing to EncodeKey
  };

  class Iterator : public MemTableRep::Iterator {
   public:
    explicit Iterator(const HashLinkListRep* const hash_link_list_rep,
                      Node* head) :
        hash_link_list_rep_(hash_link_list_rep), head_(head), node_(nullptr) {
    }

    virtual ~Iterator() {
    }

    // Returns true iff the iterator is positioned at a valid node.
    virtual bool Valid() const {
      return node_ != nullptr;
    }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    virtual const char* key() const {
      assert(Valid());
      return node_->key;
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    virtual void Next() {
      assert(Valid());
      node_ = node_->Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    virtual void Prev() {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

    // Advance to the first entry with a key >= target
    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
      node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_,
                                                              internal_key);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToFirst() {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToLast() {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

   protected:
    void Reset(Node* head) {
      head_ = head;
      node_ = nullptr;
    }
   private:
    friend class HashLinkListRep;
    const HashLinkListRep* const hash_link_list_rep_;
    Node* head_;
    Node* node_;
    std::string tmp_;       // For passing to EncodeKey

    virtual void SeekToHead() {
      node_ = head_;
    }
  };

  class DynamicIterator : public HashLinkListRep::Iterator {
   public:
    explicit DynamicIterator(HashLinkListRep& memtable_rep)
      : HashLinkListRep::Iterator(&memtable_rep, nullptr),
        memtable_rep_(memtable_rep) {}

    // Advance to the first entry with a key >= target
    virtual void Seek(const Slice& k, const char* memtable_key) {
      auto transformed = memtable_rep_.GetPrefix(k);
      Reset(memtable_rep_.GetBucket(transformed));
      HashLinkListRep::Iterator::Seek(k, memtable_key);
    }

   private:
    // the underlying memtable
    const HashLinkListRep& memtable_rep_;
  };

  class EmptyIterator : public MemTableRep::Iterator {
    // This is used when there wasn't a bucket. It is cheaper than
    // instantiating an empty bucket over which to iterate.
   public:
    EmptyIterator() { }
    virtual bool Valid() const {
      return false;
    }
    virtual const char* key() const {
      assert(false);
      return nullptr;
    }
    virtual void Next() { }
    virtual void Prev() { }
    virtual void Seek(const Slice& user_key, const char* memtable_key) { }
    virtual void SeekToFirst() { }
    virtual void SeekToLast() { }
   private:
  };
};

HashLinkListRep::HashLinkListRep(MemTableRep::KeyComparator& compare,
                                 Arena* arena, const SliceTransform* transform,
                                 size_t bucket_size)
  : bucket_size_(bucket_size),
    transform_(transform),
    compare_(compare),
    arena_(arena) {
  char* mem = arena_->AllocateAligned(
      sizeof(port::AtomicPointer) * bucket_size);

  buckets_ = new (mem) port::AtomicPointer[bucket_size];

  for (size_t i = 0; i < bucket_size_; ++i) {
    buckets_[i].NoBarrier_Store(nullptr);
  }
}

HashLinkListRep::~HashLinkListRep() {
}

void HashLinkListRep::Insert(const char* key) {
  assert(!Contains(key));
  Slice internal_key = GetLengthPrefixedSlice(key);
  auto transformed = GetPrefix(internal_key);
  auto& bucket = buckets_[GetHash(transformed)];
  Node* head = static_cast<Node*>(bucket.Acquire_Load());

  if (!head) {
    Node* x = NewNode(key);
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(nullptr);
    bucket.Release_Store(static_cast<void*>(x));
    return;
  }

  Node* cur = head;
  Node* prev = nullptr;
  while (true) {
    if (cur == nullptr) {
      break;
    }
    Node* next = cur->Next();
    // Make sure the lists are sorted.
    // If x points to head_ or next points nullptr, it is trivially satisfied.
    assert((cur == head) || (next == nullptr) ||
           KeyIsAfterNode(next->key, cur));
    if (KeyIsAfterNode(internal_key, cur)) {
      // Keep searching in this list
      prev = cur;
      cur = next;
    } else {
      break;
    }
  }

  // Our data structure does not allow duplicate insertion
  assert(cur == nullptr || !Equal(key, cur->key));

  Node* x = NewNode(key);

  // NoBarrier_SetNext() suffices since we will add a barrier when
  // we publish a pointer to "x" in prev[i].
  x->NoBarrier_SetNext(cur);

  if (prev) {
    prev->SetNext(x);
  } else {
    bucket.Release_Store(static_cast<void*>(x));
  }
}

bool HashLinkListRep::Contains(const char* key) const {
  Slice internal_key = GetLengthPrefixedSlice(key);

  auto transformed = GetPrefix(internal_key);
  auto bucket = GetBucket(transformed);
  if (bucket == nullptr) {
    return false;
  }
  return BucketContains(bucket, internal_key);
}

size_t HashLinkListRep::ApproximateMemoryUsage() {
  // Memory is always allocated from the arena.
  return 0;
}

MemTableRep::Iterator* HashLinkListRep::GetIterator() {
  auto list = new FullList(compare_, arena_);
  for (size_t i = 0; i < bucket_size_; ++i) {
    auto bucket = GetBucket(i);
    if (bucket != nullptr) {
      Iterator itr(this, bucket);
      for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
        list->Insert(itr.key());
      }
    }
  }
  return new FullListIterator(list);
}

MemTableRep::Iterator* HashLinkListRep::GetPrefixIterator(
    const Slice& prefix) {
  auto bucket = GetBucket(prefix);
  if (bucket == nullptr) {
    return new EmptyIterator();
  }
  return new Iterator(this, bucket);
}

MemTableRep::Iterator* HashLinkListRep::GetIterator(const Slice& slice) {
  return GetPrefixIterator(transform_->Transform(slice));
}

MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator() {
  return new DynamicIterator(*this);
}

bool HashLinkListRep::BucketContains(Node* head, const Slice& user_key) const {
  Node* x = FindGreaterOrEqualInBucket(head, user_key);
  return (x != nullptr && Equal(user_key, x->key));
}

Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
                                                  const Slice& key) const {
  Node* x = head;
  while (true) {
    if (x == nullptr) {
      return x;
    }
    Node* next = x->Next();
    // Make sure the lists are sorted.
    // If x points to head_ or next points nullptr, it is trivially satisfied.
    assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, x)) {
      // Keep searching in this list
      x = next;
    } else {
      break;
    }
  }
  return x;
}

} // anon namespace

MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
    MemTableRep::KeyComparator& compare, Arena* arena) {
  return new HashLinkListRep(compare, arena, transform_, bucket_count_);
}

MemTableRepFactory* NewHashLinkListRepFactory(
    const SliceTransform* transform, size_t bucket_count) {
  return new HashLinkListRepFactory(transform, bucket_count);
}

} // namespace rocksdb
util/hash_linklist_rep.h (new file, 39 lines)

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include "rocksdb/slice_transform.h"
#include "rocksdb/memtablerep.h"

namespace rocksdb {

class HashLinkListRepFactory : public MemTableRepFactory {
 public:
  explicit HashLinkListRepFactory(
      const SliceTransform* transform,
      size_t bucket_count)
    : transform_(transform),
      bucket_count_(bucket_count) { }

  virtual ~HashLinkListRepFactory() { delete transform_; }

  virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare,
                                         Arena* arena) override;

  virtual const char* Name() const override {
    return "HashLinkListRepFactory";
  }

  const SliceTransform* GetTransform() { return transform_; }

 private:
  const SliceTransform* transform_;
  const size_t bucket_count_;
};

}
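A sketch of plugging the new hash-linked-list memtable into Options. It assumes NewHashLinkListRepFactory is exposed through rocksdb/memtablerep.h like the other memtable factories, and uses the stock NewFixedPrefixTransform with an arbitrary prefix length of 4:

  #include "rocksdb/memtablerep.h"
  #include "rocksdb/options.h"
  #include "rocksdb/slice_transform.h"

  void UseHashLinkListMemtable(rocksdb::Options* options) {
    // One sorted singly linked list per bucket; 50000 buckets is an illustrative value.
    options->memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(
        rocksdb::NewFixedPrefixTransform(4), 50000 /* bucket_count */));
  }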
util/hash_skiplist_rep.cc
@@ -7,12 +7,13 @@
 #include "util/hash_skiplist_rep.h"

 #include "rocksdb/memtablerep.h"
-#include "rocksdb/arena.h"
+#include "util/arena.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "port/port.h"
 #include "port/atomic_pointer.h"
 #include "util/murmurhash.h"
+#include "db/memtable.h"
 #include "db/skiplist.h"

 namespace rocksdb {
@@ -21,7 +22,8 @@ namespace {
 class HashSkipListRep : public MemTableRep {
  public:
   HashSkipListRep(MemTableRep::KeyComparator& compare, Arena* arena,
-                  const SliceTransform* transform, size_t bucket_size);
+                  const SliceTransform* transform, size_t bucket_size,
+                  int32_t skiplist_height, int32_t skiplist_branching_factor);

   virtual void Insert(const char* key) override;

@@ -46,6 +48,9 @@ class HashSkipListRep : public MemTableRep {

   size_t bucket_size_;

+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
+
   // Maps slices (which are transformed user keys) to buckets of keys sharing
   // the same transform.
   port::AtomicPointer* buckets_;
@@ -112,9 +117,12 @@ class HashSkipListRep : public MemTableRep {
     }

     // Advance to the first entry with a key >= target
-    virtual void Seek(const char* target) {
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) {
       if (list_ != nullptr) {
-        iter_.Seek(target);
+        const char* encoded_key =
+            (memtable_key != nullptr) ?
+                memtable_key : EncodeKey(&tmp_, internal_key);
+        iter_.Seek(encoded_key);
       }
     }

@@ -151,6 +159,7 @@ class HashSkipListRep : public MemTableRep {
     // here we track if we own list_. If we own it, we are also
     // responsible for it's cleaning. This is a poor man's shared_ptr
     bool own_list_;
+    std::string tmp_;       // For passing to EncodeKey
   };

   class DynamicIterator : public HashSkipListRep::Iterator {
@@ -160,11 +169,10 @@ class HashSkipListRep : public MemTableRep {
       memtable_rep_(memtable_rep) {}

     // Advance to the first entry with a key >= target
-    virtual void Seek(const char* target) {
-      auto transformed = memtable_rep_.transform_->Transform(
-        memtable_rep_.UserKey(target));
+    virtual void Seek(const Slice& k, const char* memtable_key) {
+      auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k));
       Reset(memtable_rep_.GetBucket(transformed));
-      HashSkipListRep::Iterator::Seek(target);
+      HashSkipListRep::Iterator::Seek(k, memtable_key);
     }

     // Position at the first entry in collection.
@@ -201,7 +209,8 @@ class HashSkipListRep : public MemTableRep {
     }
     virtual void Next() { }
     virtual void Prev() { }
-    virtual void Seek(const char* target) { }
+    virtual void Seek(const Slice& internal_key,
+                      const char* memtable_key) { }
     virtual void SeekToFirst() { }
     virtual void SeekToLast() { }
    private:
@@ -210,8 +219,11 @@ class HashSkipListRep : public MemTableRep {

 HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
                                  Arena* arena, const SliceTransform* transform,
-                                 size_t bucket_size)
+                                 size_t bucket_size, int32_t skiplist_height,
+                                 int32_t skiplist_branching_factor)
   : bucket_size_(bucket_size),
+    skiplist_height_(skiplist_height),
+    skiplist_branching_factor_(skiplist_branching_factor),
     transform_(transform),
     compare_(compare),
     arena_(arena) {
@@ -232,7 +244,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
   auto bucket = GetBucket(hash);
   if (bucket == nullptr) {
     auto addr = arena_->AllocateAligned(sizeof(Bucket));
-    bucket = new (addr) Bucket(compare_, arena_);
+    bucket = new (addr) Bucket(compare_, arena_, skiplist_height_,
+                               skiplist_branching_factor_);
     buckets_[hash].Release_Store(static_cast<void*>(bucket));
   }
   return bucket;
@@ -292,12 +305,15 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {

 MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
     MemTableRep::KeyComparator& compare, Arena* arena) {
-  return new HashSkipListRep(compare, arena, transform_, bucket_count_);
+  return new HashSkipListRep(compare, arena, transform_, bucket_count_,
+                             skiplist_height_, skiplist_branching_factor_);
 }

 MemTableRepFactory* NewHashSkipListRepFactory(
-    const SliceTransform* transform, size_t bucket_count) {
-  return new HashSkipListRepFactory(transform, bucket_count);
+    const SliceTransform* transform, size_t bucket_count,
+    int32_t skiplist_height, int32_t skiplist_branching_factor) {
+  return new HashSkipListRepFactory(transform, bucket_count,
+                                    skiplist_height, skiplist_branching_factor);
 }

 } // namespace rocksdb
util/hash_skiplist_rep.h
@@ -14,10 +14,15 @@ namespace rocksdb {

 class HashSkipListRepFactory : public MemTableRepFactory {
  public:
-  explicit HashSkipListRepFactory(const SliceTransform* transform,
-                                  size_t bucket_count = 1000000)
-    : transform_(transform),
-      bucket_count_(bucket_count) { }
+  explicit HashSkipListRepFactory(
+      const SliceTransform* transform,
+      size_t bucket_count,
+      int32_t skiplist_height,
+      int32_t skiplist_branching_factor)
+    : transform_(transform),
+      bucket_count_(bucket_count),
+      skiplist_height_(skiplist_height),
+      skiplist_branching_factor_(skiplist_branching_factor) { }

   virtual ~HashSkipListRepFactory() { delete transform_; }

@@ -33,6 +38,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
  private:
   const SliceTransform* transform_;
   const size_t bucket_count_;
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
 };

 }
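A sketch of the extended skip-list factory call with the two new tuning knobs. The values are illustrative, and the free function declaration is assumed to live in rocksdb/memtablerep.h alongside the other factories:

  #include "rocksdb/memtablerep.h"
  #include "rocksdb/options.h"
  #include "rocksdb/slice_transform.h"

  void UseHashSkipListMemtable(rocksdb::Options* options) {
    options->memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
        rocksdb::NewFixedPrefixTransform(4),
        1000000 /* bucket_count */,
        4       /* skiplist_height */,
        4       /* skiplist_branching_factor */));
  }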
util/options.cc
@@ -16,10 +16,11 @@
 #include "rocksdb/comparator.h"
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
-#include "rocksdb/merge_operator.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
 #include "table/block_based_table_factory.h"

@@ -73,6 +74,9 @@ ColumnFamilyOptions::ColumnFamilyOptions()
           std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
       inplace_update_support(false),
       inplace_update_num_locks(10000),
+      inplace_callback(nullptr),
+      memtable_prefix_bloom_bits(0),
+      memtable_prefix_bloom_probes(6),
       max_successive_merges(0) {
   assert(memtable_factory.get() != nullptr);
 }
@@ -131,6 +135,9 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
       table_properties_collectors(options.table_properties_collectors),
       inplace_update_support(options.inplace_update_support),
       inplace_update_num_locks(options.inplace_update_num_locks),
+      inplace_callback(options.inplace_callback),
+      memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
+      memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
       max_successive_merges(options.max_successive_merges) {
   assert(memtable_factory.get() != nullptr);
 }
@@ -396,6 +403,11 @@ Options::Dump(Logger* log) const
         inplace_update_support);
     Log(log, " Options.inplace_update_num_locks: %zd",
         inplace_update_num_locks);
+    // TODO: easier config for bloom (maybe based on avg key/value size)
+    Log(log, " Options.memtable_prefix_bloom_bits: %d",
+        memtable_prefix_bloom_bits);
+    Log(log, " Options.memtable_prefix_bloom_probes: %d",
+        memtable_prefix_bloom_probes);
     Log(log, " Options.max_successive_merges: %zd",
         max_successive_merges);
 }   // Options::Dump
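A sketch of enabling the new memtable prefix bloom fields added above. The numbers are illustrative, it assumes the fields are reachable through Options as in the options.cc change, and a prefix_extractor must also be configured for the bloom to be consulted:

  #include "rocksdb/options.h"

  void EnableMemtablePrefixBloom(rocksdb::Options* options) {
    // 0 (the default) leaves the memtable prefix bloom disabled.
    options->memtable_prefix_bloom_bits = 100000000;  // total bits backing the DynamicBloom
    options->memtable_prefix_bloom_probes = 6;        // hash probes per prefix
  }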
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user