Compare commits
41 Commits
79f08d7ffa
da11a59034
13a03a5452
2e7a386229
92ea271a6a
e3a1707d05
f4790bdd1b
de91718e99
6a385e90cd
aadbf86b6c
00191c84e6
02f6ebb7d4
ccaadd8705
7629bf17fe
9f0a64c4f1
94a71b686e
bcd86d9074
96a2453d7a
32ad0dcafe
4f37eb4db2
446a152aa6
947c5fd441
f94fce8623
1a2781d48c
2e9a9f04d7
e40bbc57db
82f1c1418b
39af4e9565
b7f2164a6f
529efcc5b2
d6bb43202e
ee95900680
e5451b30db
877f8b43df
05769ea7fb
f103aadad9
7818d19169
6f8323a009
da478f3eae
d36eda014f
d24dd13024

@@ -321,6 +321,7 @@ if(NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
endif()

if (NOT PORTABLE OR FORCE_SSE42)
CHECK_CXX_SOURCE_COMPILES("
#include <cstdint>
#include <nmmintrin.h>
@@ -339,6 +340,7 @@ if(HAVE_SSE42)
elseif(FORCE_SSE42)
message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
endif()
endif()

# Check if -latomic is required or not
if (NOT MSVC)

HISTORY.md (31 changed lines)

@@ -1,4 +1,35 @@
# Rocksdb Change Log
## 6.29.5 (03/29/2022)
### Bug Fixes
* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled.
* Fixed a race condition when mmaping a WritableFile on POSIX.
* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail.
* Fixed a heap use-after-free race with DropColumnFamily.
* Fixed a bug that `rocksdb.read.block.compaction.micros` cannot track compaction stats (#9722).

## 6.29.4 (03/22/2022)
### Bug Fixes
* Fixed a bug caused by race among flush, incoming writes and taking snapshots. Queries to snapshots created with these race condition can return incorrect result, e.g. resurfacing deleted data.
* Fixed a bug that DisableManualCompaction may assert when disable an unscheduled manual compaction.
* Fixed a bug that `Iterator::Refresh()` reads stale keys after DeleteRange() performed.
* Fixed a race condition when disable and re-enable manual compaction.
* Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread.
* Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496)
* Fixed a read-after-free bug in `DB::GetMergeOperands()`.
* Fixed NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, NUM_DATA_BLOCKS_READ_PER_LEVEL, and NUM_SST_READ_PER_LEVEL stats to be reported once per MultiGet batch per level.

## 6.29.3 (02/17/2022)
### Bug Fixes
* Fix a data loss bug for 2PC write-committed transaction caused by concurrent transaction commit and memtable switch (#9571).

## 6.29.2 (02/15/2022)
### Performance Improvements
* DisableManualCompaction() doesn't have to wait scheduled manual compaction to be executed in thread-pool to cancel the job.

## 6.29.1 (01/31/2022)
### Bug Fixes
* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)

## 6.29.0 (01/21/2022)
Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info.
### Public API change
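The 6.29.1 entry above concerns the memtable Bloom filter configuration (`memtable_prefix_bloom_size_ratio` > 0 combined with `memtable_whole_key_filtering`). As a hedged illustration only, here is a small self-contained sketch of that configuration through the public API; the DB path and keys are invented:

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // The combination the 6.29.1 fix is about: a memtable Bloom filter that also
  // holds whole-key entries.
  options.memtable_prefix_bloom_size_ratio = 0.015;
  options.memtable_whole_key_filtering = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_memtable_bloom_example", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key1", "value1");
  assert(s.ok());
  // A DeleteRange() covering "key1" followed by a batched MultiGet is the
  // scenario the release note describes.
  delete db;
  return 0;
}
```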
Makefile (39 changed lines)

@@ -324,8 +324,8 @@ ifneq ($(MACHINE), arm64)
# linking with jemalloc (as it won't be arm64-compatible) and remove some other options
# set during platform detection
DISABLE_JEMALLOC=1
PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42, $(PLATFORM_CFLAGS))
PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42, $(PLATFORM_CXXFLAGS))
PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS))
PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS))
endif
endif
endif
@@ -2088,7 +2088,7 @@ SHA256_CMD = sha256sum

ZLIB_VER ?= 1.2.11
ZLIB_SHA256 ?= c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1
ZLIB_DOWNLOAD_BASE ?= http://zlib.net
ZLIB_DOWNLOAD_BASE ?= https://zlib.net/fossils
BZIP2_VER ?= 1.0.8
BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269
BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2
@@ -2106,7 +2106,9 @@ CURL_SSL_OPTS ?= --tlsv1
ifeq ($(PLATFORM), OS_MACOSX)
ifeq (,$(findstring librocksdbjni-osx,$(ROCKSDBJNILIB)))
ifeq ($(MACHINE),arm64)
ROCKSDBJNILIB = librocksdbjni-osx-aarch64.jnilib
ROCKSDBJNILIB = librocksdbjni-osx-arm64.jnilib
else ifeq ($(MACHINE),x86_64)
ROCKSDBJNILIB = librocksdbjni-osx-x86_64.jnilib
else
ROCKSDBJNILIB = librocksdbjni-osx.jnilib
endif
@@ -2237,15 +2239,20 @@ endif
$(MAKE) rocksdbjavastatic_deps
$(MAKE) rocksdbjavastatic_libobjects
$(MAKE) rocksdbjavastatic_javalib
$(MAKE) rocksdbjavastatic_jar
$(MAKE) rocksdbjava_jar

rocksdbjavastaticosx: rocksdbjavastaticosx_archs
mv java/target/librocksdbjni-osx-x86_64.jnilib java/target/librocksdbjni-osx.jnilib
mv java/target/librocksdbjni-osx-arm64.jnilib java/target/librocksdbjni-osx-aarch64.jnilib
cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md
cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib
cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1

rocksdbjavastaticosx_ub: rocksdbjavastaticosx_archs
lipo -create -output ./java/target/$(ROCKSDBJNILIB) java/target/librocksdbjni-osx-x86_64.jnilib java/target/librocksdbjni-osx-arm64.jnilib
$(MAKE) rocksdbjavastatic_jar
cd java/target; lipo -create -output librocksdbjni-osx.jnilib librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib
cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md
cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx.jnilib
cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1

rocksdbjavastaticosx_archs:
$(MAKE) rocksdbjavastaticosx_arch_x86_64
@@ -2279,28 +2286,32 @@ rocksdbjavastatic_javalib:
strip $(STRIPFLAGS) $(ROCKSDBJNILIB); \
fi

rocksdbjavastatic_jar:
rocksdbjava_jar:
cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md
cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) *
cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org
openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1

rocksdbjava_javadocs_jar:
cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) *
openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1

rocksdbjava_sources_jar:
cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org
openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1

rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS)

rocksdbjavastatic_libobjects: $(LIB_OBJECTS)

rocksdbjavastaticrelease: rocksdbjavastaticosx
rocksdbjavastaticrelease: rocksdbjavastaticosx rocksdbjava_javadocs_jar rocksdbjava_sources_jar
cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl
cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class
openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1

rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl
rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl rocksdbjava_javadocs_jar rocksdbjava_sources_jar
cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class

@@ -58,6 +58,7 @@ Status ArenaWrappedDBIter::Refresh() {
uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
while (true) {
if (sv_number_ != cur_sv_number) {
Env* env = db_iter_->env();
db_iter_->~DBIter();
@@ -79,9 +80,33 @@ Status ArenaWrappedDBIter::Refresh() {
read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(),
latest_seq, /* allow_unprepared_value */ true);
SetIterUnderDBIter(internal_iter);
break;
} else {
db_iter_->set_sequence(db_impl_->GetLatestSequenceNumber());
SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
// Refresh range-tombstones in MemTable
if (!read_options_.ignore_range_deletions) {
SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
ReadRangeDelAggregator* range_del_agg =
db_iter_->GetRangeDelAggregator();
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter;
range_del_iter.reset(
sv->mem->NewRangeTombstoneIterator(read_options_, latest_seq));
range_del_agg->AddTombstones(std::move(range_del_iter));
cfd_->ReturnThreadLocalSuperVersion(sv);
}
// Refresh latest sequence number
db_iter_->set_sequence(latest_seq);
db_iter_->set_valid(false);
// Check again if the latest super version number is changed
uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
if (latest_sv_number != cur_sv_number) {
// If the super version number is changed after refreshing,
// fallback to Re-Init the InternalIterator
cur_sv_number = latest_sv_number;
continue;
}
break;
}
}
return Status::OK();
}
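The Refresh() changes above correspond to the 6.29.4 note about `Iterator::Refresh()` reading stale keys after DeleteRange(). A hedged, self-contained sketch of the user-visible API involved; the DB path and keys are invented:

```cpp
#include <cassert>

#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_refresh_example", &db);
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "k1", "v1");
  assert(s.ok());

  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());

  // Cover "k1" with a range tombstone, then refresh the existing iterator.
  s = db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(), "k0",
                      "k9");
  assert(s.ok());
  s = it->Refresh();
  assert(s.ok());

  // With the fix, the refreshed iterator should not surface the deleted key.
  it->SeekToFirst();
  assert(!it->Valid());

  delete it;
  delete db;
  return 0;
}
```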
db/c_test.c (42 changed lines)

@@ -7,12 +7,13 @@

#ifndef ROCKSDB_LITE // Lite does not support C API

#include "rocksdb/c.h"

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include "rocksdb/c.h"
#ifndef OS_WIN
#include <unistd.h>
#endif
@@ -89,10 +90,8 @@ static void CheckEqual(const char* expected, const char* v, size_t n) {
// ok
return;
} else {
fprintf(stderr, "%s: expected '%s', got '%s'\n",
phase,
(expected ? expected : "(null)"),
(v ? v : "(null"));
fprintf(stderr, "%s: expected '%s', got '%s'\n", phase,
(expected ? expected : "(null)"), (v ? v : "(null)"));
abort();
}
}
@@ -1019,7 +1018,36 @@ int main(int argc, char** argv) {
CheckGet(db, roptions, "foo", NULL);
rocksdb_release_snapshot(db, snap);
}

StartPhase("snapshot_with_memtable_inplace_update");
{
rocksdb_close(db);
const rocksdb_snapshot_t* snap = NULL;
const char* s_key = "foo_snap";
const char* value1 = "hello_s1";
const char* value2 = "hello_s2";
rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
rocksdb_options_set_inplace_update_support(options, 1);
rocksdb_options_set_error_if_exists(options, 0);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
rocksdb_put(db, woptions, s_key, 8, value1, 8, &err);
snap = rocksdb_create_snapshot(db);
assert(snap != NULL);
rocksdb_put(db, woptions, s_key, 8, value2, 8, &err);
CheckNoError(err);
rocksdb_readoptions_set_snapshot(roptions, snap);
CheckGet(db, roptions, "foo", NULL);
// snapshot syntax is invalid, because of inplace update supported is set
CheckGet(db, roptions, s_key, value2);
// restore the data and options
rocksdb_delete(db, woptions, s_key, 8, &err);
CheckGet(db, roptions, s_key, NULL);
rocksdb_release_snapshot(db, snap);
rocksdb_readoptions_set_snapshot(roptions, NULL);
rocksdb_options_set_inplace_update_support(options, 0);
rocksdb_options_set_allow_concurrent_memtable_write(options, 1);
rocksdb_options_set_error_if_exists(options, 1);
}
StartPhase("repair");
{
// If we do not compact here, then the lazy deletion of

@@ -1562,20 +1562,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
return new_cfd;
}

// REQUIRES: DB mutex held
void ColumnFamilySet::FreeDeadColumnFamilies() {
autovector<ColumnFamilyData*> to_delete;
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
to_delete.push_back(cfd);
}
}
for (auto cfd : to_delete) {
// this is very rare, so it's not a problem that we do it under a mutex
delete cfd;
}
}

// under a DB mutex AND from a write thread
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
auto cfd_iter = column_family_data_.find(cfd->GetID());

@@ -519,9 +519,10 @@ class ColumnFamilyData {
ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }

static const uint32_t kDummyColumnFamilyDataId;

private:
friend class ColumnFamilySet;
static const uint32_t kDummyColumnFamilyDataId;
ColumnFamilyData(uint32_t id, const std::string& name,
Version* dummy_versions, Cache* table_cache,
WriteBufferManager* write_buffer_manager,
@@ -627,10 +628,8 @@ class ColumnFamilyData {
// held and it needs to be executed from the write thread. SetDropped() also
// guarantees that it will be called only from single-threaded LogAndApply(),
// but this condition is not that important.
// * Iteration -- hold DB mutex, but you can release it in the body of
// iteration. If you release DB mutex in body, reference the column
// family before the mutex and unreference after you unlock, since the column
// family might get dropped when the DB mutex is released
// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
// body of the iteration, wrap in a RefedColumnFamilySet.
// * GetDefault() -- thread safe
// * GetColumnFamily() -- either inside of DB mutex or from a write thread
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
@@ -642,17 +641,12 @@ class ColumnFamilySet {
public:
explicit iterator(ColumnFamilyData* cfd)
: current_(cfd) {}
// NOTE: minimum operators for for-loop iteration
iterator& operator++() {
// dropped column families might still be included in this iteration
// (we're only removing them when client drops the last reference to the
// column family).
// dummy is never dead, so this will never be infinite
do {
current_ = current_->next_;
} while (current_->refs_.load(std::memory_order_relaxed) == 0);
return *this;
}
bool operator!=(const iterator& other) {
bool operator!=(const iterator& other) const {
return this->current_ != other.current_;
}
ColumnFamilyData* operator*() { return current_; }
@@ -691,10 +685,6 @@ class ColumnFamilySet {
iterator begin() { return iterator(dummy_cfd_->next_); }
iterator end() { return iterator(dummy_cfd_); }

// REQUIRES: DB mutex held
// Don't call while iterating over ColumnFamilySet
void FreeDeadColumnFamilies();

Cache* get_table_cache() { return table_cache_; }

WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
@@ -737,6 +727,55 @@ class ColumnFamilySet {
std::string db_session_id_;
};

// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
// iteration over the iterator, because the cfd is Refed and Unrefed during
// each iteration to prevent concurrent CF drop from destroying it (until
// Unref).
class RefedColumnFamilySet {
public:
explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}

class iterator {
public:
explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
MaybeRef(*wrapped_);
}
~iterator() { MaybeUnref(*wrapped_); }
inline void MaybeRef(ColumnFamilyData* cfd) {
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
cfd->Ref();
}
}
inline void MaybeUnref(ColumnFamilyData* cfd) {
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
cfd->UnrefAndTryDelete();
}
}
// NOTE: minimum operators for for-loop iteration
inline iterator& operator++() {
ColumnFamilyData* old = *wrapped_;
++wrapped_;
// Can only unref & potentially free cfd after accessing its next_
MaybeUnref(old);
MaybeRef(*wrapped_);
return *this;
}
inline bool operator!=(const iterator& other) const {
return this->wrapped_ != other.wrapped_;
}
inline ColumnFamilyData* operator*() { return *wrapped_; }

private:
ColumnFamilySet::iterator wrapped_;
};

iterator begin() { return iterator(wrapped_->begin()); }
iterator end() { return iterator(wrapped_->end()); }

private:
ColumnFamilySet* wrapped_;
};
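To make the intent of RefedColumnFamilySet concrete, here is a small self-contained toy of the same pattern, with all names invented (this is not RocksDB code): pin the current element before releasing the lock, do the slow work unlocked, then re-acquire the lock and drop the pin.

```cpp
#include <cassert>
#include <memory>
#include <mutex>
#include <vector>

struct Family {
  explicit Family(int id_in) : id(id_in) {}
  int id;
};

int main() {
  std::mutex db_mutex;
  std::vector<std::shared_ptr<Family>> families = {
      std::make_shared<Family>(0), std::make_shared<Family>(1)};

  std::unique_lock<std::mutex> lock(db_mutex);
  for (const auto& slot : families) {
    std::shared_ptr<Family> pinned = slot;  // analogue of cfd->Ref()
    lock.unlock();
    assert(pinned->id >= 0);  // slow, mutex-free work happens here
    lock.lock();
    // `pinned` is released at the end of this iteration, the analogue of
    // cfd->UnrefAndTryDelete().
  }
  return 0;
}
```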

// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
// memtables of different column families (specified by ID in the write batch)
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {

@@ -1228,7 +1228,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
}
#endif // !ROCKSDB_LITE

uint64_t prev_cpu_micros = db_options_.clock->CPUNanos() / 1000;
uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();

ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();

@@ -1572,7 +1572,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
}

sub_compact->compaction_job_stats.cpu_micros =
db_options_.clock->CPUNanos() / 1000 - prev_cpu_micros;
db_options_.clock->CPUMicros() - prev_cpu_micros;

if (measure_io_stats_) {
sub_compact->compaction_job_stats.file_write_nanos +=

@@ -1938,8 +1938,9 @@ TEST_F(DBBasicTest, MultiGetStats) {
int total_keys = 2000;
std::vector<std::string> keys_str(total_keys);
std::vector<Slice> keys(total_keys);
std::vector<PinnableSlice> values(total_keys);
std::vector<Status> s(total_keys);
static size_t kMultiGetBatchSize = 100;
std::vector<PinnableSlice> values(kMultiGetBatchSize);
std::vector<Status> s(kMultiGetBatchSize);
ReadOptions read_opts;

Random rnd(309);
@@ -1976,15 +1977,16 @@ TEST_F(DBBasicTest, MultiGetStats) {
}
}
ASSERT_OK(Flush(1));
MoveFilesToLevel(1, 1);
Close();

ReopenWithColumnFamilies({"default", "pikachu"}, options);
ASSERT_OK(options.statistics->Reset());

db_->MultiGet(read_opts, handles_[1], total_keys, keys.data(), values.data(),
s.data(), false);
db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250],
values.data(), s.data(), false);

ASSERT_EQ(values.size(), total_keys);
ASSERT_EQ(values.size(), kMultiGetBatchSize);
HistogramData hist_data_blocks;
HistogramData hist_index_and_filter_blocks;
HistogramData hist_sst;
@@ -1996,16 +1998,16 @@ TEST_F(DBBasicTest, MultiGetStats) {
options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst);

// Maximum number of blocks read from a file system in a level.
ASSERT_GT(hist_data_blocks.max, 0);
ASSERT_EQ(hist_data_blocks.max, 32);
ASSERT_GT(hist_index_and_filter_blocks.max, 0);
// Maximum number of sst files read from file system in a level.
ASSERT_GT(hist_sst.max, 0);
ASSERT_EQ(hist_sst.max, 2);

// Minimun number of blocks read in a level.
ASSERT_EQ(hist_data_blocks.min, 3);
ASSERT_EQ(hist_data_blocks.min, 4);
ASSERT_GT(hist_index_and_filter_blocks.min, 0);
// Minimun number of sst files read in a level.
ASSERT_GT(hist_sst.max, 0);
ASSERT_EQ(hist_sst.min, 1);
}

// Test class for batched MultiGet with prefix extractor

@@ -1502,6 +1502,63 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
}

TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
Options options = CurrentOptions();
options.memtable_prefix_bloom_size_ratio = 0.015;
options.memtable_whole_key_filtering = true;
Reopen(options);
std::string key1("AA");
std::string key2("BB");
std::string key3("CC");
std::string key4("DD");
std::string key_not("EE");
std::string value1("Value1");
std::string value2("Value2");
std::string value3("Value3");
std::string value4("Value4");

ASSERT_OK(Put(key1, value1, WriteOptions()));
ASSERT_OK(Put(key2, value2, WriteOptions()));
ASSERT_OK(Flush());
ASSERT_OK(Put(key3, value3, WriteOptions()));
const Snapshot* snapshot = db_->GetSnapshot();
ASSERT_OK(Put(key4, value4, WriteOptions()));

// Delete key2 and key3
ASSERT_OK(
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));

// Read without snapshot
auto results = MultiGet({key_not, key1, key2, key3, key4});
ASSERT_EQ(results[0], "NOT_FOUND");
ASSERT_EQ(results[1], value1);
ASSERT_EQ(results[2], "NOT_FOUND");
ASSERT_EQ(results[3], "NOT_FOUND");
ASSERT_EQ(results[4], value4);

// Also check Get
ASSERT_EQ(Get(key1), value1);
ASSERT_EQ(Get(key2), "NOT_FOUND");
ASSERT_EQ(Get(key3), "NOT_FOUND");
ASSERT_EQ(Get(key4), value4);

// Read with snapshot
results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
ASSERT_EQ(results[0], "NOT_FOUND");
ASSERT_EQ(results[1], value1);
ASSERT_EQ(results[2], value2);
ASSERT_EQ(results[3], value3);
ASSERT_EQ(results[4], "NOT_FOUND");

// Also check Get
ASSERT_EQ(Get(key1, snapshot), value1);
ASSERT_EQ(Get(key2, snapshot), value2);
ASSERT_EQ(Get(key3, snapshot), value3);
ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");

db_->ReleaseSnapshot(snapshot);
}

TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
constexpr size_t kPrefixSize = 8;
const std::string kKey = "key";

@@ -6881,6 +6881,319 @@ TEST_F(DBCompactionTest, FIFOWarm) {
Destroy(options);
}

TEST_F(DBCompactionTest, DisableMultiManualCompaction) {
const int kNumL0Files = 10;

Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = kNumL0Files;
Reopen(options);

// Generate 2 levels of file to make sure the manual compaction is not skipped
for (int i = 0; i < 10; i++) {
ASSERT_OK(Put(Key(i), "value"));
if (i % 2) {
ASSERT_OK(Flush());
}
}
MoveFilesToLevel(2);

for (int i = 0; i < 10; i++) {
ASSERT_OK(Put(Key(i), "value"));
if (i % 2) {
ASSERT_OK(Flush());
}
}
MoveFilesToLevel(1);

// Block compaction queue
test::SleepingBackgroundTask sleeping_task_low;
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
Env::Priority::LOW);

port::Thread compact_thread1([&]() {
CompactRangeOptions cro;
cro.exclusive_manual_compaction = false;
std::string begin_str = Key(0);
std::string end_str = Key(3);
Slice b = begin_str;
Slice e = end_str;
auto s = db_->CompactRange(cro, &b, &e);
ASSERT_TRUE(s.IsIncomplete());
});

port::Thread compact_thread2([&]() {
CompactRangeOptions cro;
cro.exclusive_manual_compaction = false;
std::string begin_str = Key(4);
std::string end_str = Key(7);
Slice b = begin_str;
Slice e = end_str;
auto s = db_->CompactRange(cro, &b, &e);
ASSERT_TRUE(s.IsIncomplete());
});

// Disable manual compaction should cancel both manual compactions and both
// compaction should return incomplete.
db_->DisableManualCompaction();

compact_thread1.join();
compact_thread2.join();

sleeping_task_low.WakeUp();
sleeping_task_low.WaitUntilDone();
ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
}

TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
const int kNumL0Files = 4;

Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = kNumL0Files;
Reopen(options);

// generate files, but avoid trigger auto compaction
for (int i = 0; i < kNumL0Files / 2; i++) {
ASSERT_OK(Put(Key(1), "value1"));
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
}

// make sure the manual compaction background is started but not yet set the
// status to in_progress, then cancel the manual compaction, which should not
// result in segfault
SyncPoint::GetInstance()->LoadDependency(
{{"DBImpl::BGWorkCompaction",
"DBCompactionTest::DisableJustStartedManualCompaction:"
"PreDisableManualCompaction"},
{"DBImpl::RunManualCompaction:Unscheduled",
"BackgroundCallCompaction:0"}});
SyncPoint::GetInstance()->EnableProcessing();

port::Thread compact_thread([&]() {
CompactRangeOptions cro;
cro.exclusive_manual_compaction = true;
auto s = db_->CompactRange(cro, nullptr, nullptr);
ASSERT_TRUE(s.IsIncomplete());
});
TEST_SYNC_POINT(
"DBCompactionTest::DisableJustStartedManualCompaction:"
"PreDisableManualCompaction");
db_->DisableManualCompaction();

compact_thread.join();
}

TEST_F(DBCompactionTest, DisableInProgressManualCompaction) {
const int kNumL0Files = 4;

Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = kNumL0Files;
Reopen(options);

SyncPoint::GetInstance()->LoadDependency(
{{"DBImpl::BackgroundCompaction:InProgress",
"DBCompactionTest::DisableInProgressManualCompaction:"
"PreDisableManualCompaction"},
{"DBImpl::RunManualCompaction:Unscheduled",
"CompactionJob::Run():Start"}});
SyncPoint::GetInstance()->EnableProcessing();

// generate files, but avoid trigger auto compaction
for (int i = 0; i < kNumL0Files / 2; i++) {
ASSERT_OK(Put(Key(1), "value1"));
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
}

port::Thread compact_thread([&]() {
CompactRangeOptions cro;
cro.exclusive_manual_compaction = true;
auto s = db_->CompactRange(cro, nullptr, nullptr);
ASSERT_TRUE(s.IsIncomplete());
});

TEST_SYNC_POINT(
"DBCompactionTest::DisableInProgressManualCompaction:"
"PreDisableManualCompaction");
db_->DisableManualCompaction();

compact_thread.join();
}

TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {
const int kNumL0Files = 4;

SyncPoint::GetInstance()->LoadDependency(
{{"DBImpl::RunManualCompaction:Scheduled",
"DBCompactionTest::DisableManualCompactionThreadQueueFull:"
"PreDisableManualCompaction"}});
SyncPoint::GetInstance()->EnableProcessing();

Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = kNumL0Files;
Reopen(options);

// Block compaction queue
test::SleepingBackgroundTask sleeping_task_low;
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
Env::Priority::LOW);

// generate files, but avoid trigger auto compaction
for (int i = 0; i < kNumL0Files / 2; i++) {
ASSERT_OK(Put(Key(1), "value1"));
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
}

port::Thread compact_thread([&]() {
CompactRangeOptions cro;
cro.exclusive_manual_compaction = true;
auto s = db_->CompactRange(cro, nullptr, nullptr);
ASSERT_TRUE(s.IsIncomplete());
});

TEST_SYNC_POINT(
"DBCompactionTest::DisableManualCompactionThreadQueueFull:"
"PreDisableManualCompaction");

// Generate more files to trigger auto compaction which is scheduled after
// manual compaction. Has to generate 4 more files because existing files are
// pending compaction
for (int i = 0; i < kNumL0Files; i++) {
ASSERT_OK(Put(Key(1), "value1"));
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
}
ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));

db_->DisableManualCompaction();

// CompactRange should return before the compaction has the chance to run
compact_thread.join();

sleeping_task_low.WakeUp();
sleeping_task_low.WaitUntilDone();
ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
ASSERT_EQ("0,1", FilesPerLevel(0));
}

TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) {
const int kNumL0Files = 4;

SyncPoint::GetInstance()->LoadDependency(
{{"DBImpl::RunManualCompaction:Scheduled",
"DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
"PreDisableManualCompaction"}});
SyncPoint::GetInstance()->EnableProcessing();

Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = kNumL0Files;
Reopen(options);

// Block compaction queue
test::SleepingBackgroundTask sleeping_task_low;
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
Env::Priority::LOW);

// generate files, but avoid trigger auto compaction
for (int i = 0; i < kNumL0Files / 2; i++) {
ASSERT_OK(Put(Key(1), "value1"));
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
}

port::Thread compact_thread([&]() {
CompactRangeOptions cro;
cro.exclusive_manual_compaction = true;
auto s = db_->CompactRange(cro, nullptr, nullptr);
ASSERT_TRUE(s.IsIncomplete());
});

TEST_SYNC_POINT(
"DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
"PreDisableManualCompaction");

// Generate more files to trigger auto compaction which is scheduled after
// manual compaction. Has to generate 4 more files because existing files are
// pending compaction
for (int i = 0; i < kNumL0Files; i++) {
ASSERT_OK(Put(Key(1), "value1"));
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
}
ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));

db_->DisableManualCompaction();

// CompactRange should return before the compaction has the chance to run
compact_thread.join();

// Try close DB while manual compaction is canceled but still in the queue.
// And an auto-triggered compaction is also in the queue.
auto s = db_->Close();
ASSERT_OK(s);

sleeping_task_low.WakeUp();
sleeping_task_low.WaitUntilDone();
}

TEST_F(DBCompactionTest, DBCloseWithManualCompaction) {
const int kNumL0Files = 4;

SyncPoint::GetInstance()->LoadDependency(
{{"DBImpl::RunManualCompaction:Scheduled",
"DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
"PreDisableManualCompaction"}});
SyncPoint::GetInstance()->EnableProcessing();

Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = kNumL0Files;
Reopen(options);

// Block compaction queue
test::SleepingBackgroundTask sleeping_task_low;
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
Env::Priority::LOW);

// generate files, but avoid trigger auto compaction
for (int i = 0; i < kNumL0Files / 2; i++) {
ASSERT_OK(Put(Key(1), "value1"));
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
}

port::Thread compact_thread([&]() {
CompactRangeOptions cro;
cro.exclusive_manual_compaction = true;
auto s = db_->CompactRange(cro, nullptr, nullptr);
ASSERT_TRUE(s.IsIncomplete());
});

TEST_SYNC_POINT(
"DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
"PreDisableManualCompaction");

// Generate more files to trigger auto compaction which is scheduled after
// manual compaction. Has to generate 4 more files because existing files are
// pending compaction
for (int i = 0; i < kNumL0Files; i++) {
ASSERT_OK(Put(Key(1), "value1"));
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
}
ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));

// Close DB with manual compaction and auto triggered compaction in the queue.
auto s = db_->Close();
ASSERT_OK(s);

// manual compaction thread should return with Incomplete().
compact_thread.join();

sleeping_task_low.WakeUp();
sleeping_task_low.WaitUntilDone();
}

TEST_F(DBCompactionTest,
DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) {
// When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait

@@ -45,17 +45,15 @@ Status DBImpl::FlushForGetLiveFiles() {
}
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
cfd->Ref();
mutex_.Unlock();
status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
mutex_.Lock();
cfd->UnrefAndTryDelete();
if (!status.ok() && !status.IsColumnFamilyDropped()) {
break;
} else if (status.IsColumnFamilyDropped()) {
@@ -63,7 +61,6 @@ Status DBImpl::FlushForGetLiveFiles() {
}
}
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
return status;
}

@@ -676,6 +676,7 @@ class TestFlushListener : public EventListener {
~TestFlushListener() override {
prev_fc_info_.status.PermitUncheckedError(); // Ignore the status
}

void OnTableFileCreated(const TableFileCreationInfo& info) override {
// remember the info for later checking the FlushJobInfo.
prev_fc_info_ = info;
@@ -1999,6 +2000,61 @@ TEST_P(DBFlushTestBlobError, FlushError) {
}

#ifndef ROCKSDB_LITE
TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {
class SimpleTestFlushListener : public EventListener {
public:
explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {}
~SimpleTestFlushListener() override {}

void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
ASSERT_EQ(static_cast<uint32_t>(0), info.cf_id);

ASSERT_OK(db->Delete(WriteOptions(), "foo"));
snapshot_ = db->GetSnapshot();
ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));

auto* dbimpl = static_cast_with_check<DBImpl>(db);
assert(dbimpl);

ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
assert(cfhi);
ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd()));
}

DBFlushTest* test_ = nullptr;
const Snapshot* snapshot_ = nullptr;
};

Options options = CurrentOptions();
options.create_if_missing = true;
auto* listener = new SimpleTestFlushListener(this);
options.listeners.emplace_back(listener);
DestroyAndReopen(options);

ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));

ManagedSnapshot snapshot_guard(db_);

ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
ASSERT_OK(db_->Flush(FlushOptions(), default_cf));

const Snapshot* snapshot = listener->snapshot_;
assert(snapshot);

ReadOptions read_opts;
read_opts.snapshot = snapshot;

// Using snapshot should not see "foo".
{
std::string value;
Status s = db_->Get(read_opts, "foo", &value);
ASSERT_TRUE(s.IsNotFound());
}

db_->ReleaseSnapshot(snapshot);
}

TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
Options options = CurrentOptions();
options.create_if_missing = true;

@@ -383,15 +383,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
cfd->Ref();
mutex_.Unlock();
InstrumentedMutexUnlock u(&mutex_);
s = FlushMemTable(cfd, flush_opts, context.flush_reason);
mutex_.Lock();
cfd->UnrefAndTryDelete();
if (!s.ok()) {
break;
}
@@ -406,14 +403,6 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {

JobContext job_context(0);
FindObsoleteFiles(&job_context, true);
if (s.ok()) {
s = error_handler_.ClearBGError();
} else {
// NOTE: this is needed to pass ASSERT_STATUS_CHECKED
// in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
// See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
error_handler_.GetRecoveryError().PermitUncheckedError();
}
mutex_.Unlock();

job_context.manifest_file_number = 1;
@@ -434,11 +423,31 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
immutable_db_options_.info_log,
"DB resume requested but could not enable file deletions [%s]",
s.ToString().c_str());
assert(false);
}
}
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
}

mutex_.Lock();
if (s.ok()) {
// This will notify and unblock threads waiting for error recovery to
// finish. Those previouly waiting threads can now proceed, which may
// include closing the db.
s = error_handler_.ClearBGError();
} else {
// NOTE: this is needed to pass ASSERT_STATUS_CHECKED
// in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
// See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
error_handler_.GetRecoveryError().PermitUncheckedError();
}

if (s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
} else {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]",
s.ToString().c_str());
}

// Check for shutdown again before scheduling further compactions,
// since we released and re-acquired the lock above
if (shutdown_initiated_) {
@@ -491,18 +500,14 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
s.PermitUncheckedError(); //**TODO: What to do on error?
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
cfd->Ref();
mutex_.Unlock();
InstrumentedMutexUnlock u(&mutex_);
Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
s.PermitUncheckedError(); //**TODO: What to do on error?
mutex_.Lock();
cfd->UnrefAndTryDelete();
}
}
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
}

shutting_down_.store(true, std::memory_order_release);
@@ -533,10 +538,19 @@ Status DBImpl::CloseHelper() {
// marker. After this we do a variant of the waiting and unschedule work
// (to consider: moving all the waiting into CancelAllBackgroundWork(true))
CancelAllBackgroundWork(false);

// Cancel manual compaction if there's any
if (HasPendingManualCompaction()) {
DisableManualCompaction();
}
mutex_.Lock();
env_->UnSchedule(this, Env::Priority::BOTTOM);
env_->UnSchedule(this, Env::Priority::LOW);
env_->UnSchedule(this, Env::Priority::HIGH);
// Unschedule all tasks for this DB
for (uint8_t i = 0; i < static_cast<uint8_t>(TaskType::kCount); i++) {
env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM);
env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW);
env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH);
}

Status ret = Status::OK();

// Wait for background work to finish
@@ -956,19 +970,14 @@ void DBImpl::DumpStats() {
TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
{
InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) {
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->initialized()) {
// Release DB mutex for gathering cache entry stats. Pass over all
// column families for this first so that other stats are dumped
// near-atomically.
// Get a ref before unlocking
cfd->Ref();
{
InstrumentedMutexUnlock u(&mutex_);
cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
}
cfd->UnrefAndTryDelete();
}
}

const std::string* property = &DB::Properties::kDBStats;
@@ -1891,11 +1900,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
return s;
}
}
PinnedIteratorsManager pinned_iters_mgr;
if (!done) {
PERF_TIMER_GUARD(get_from_output_files_time);
sv->current->Get(
read_options, lkey, get_impl_options.value, timestamp, &s,
&merge_context, &max_covering_tombstone_seq,
&merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
get_impl_options.get_value ? get_impl_options.value_found : nullptr,
nullptr, nullptr,
get_impl_options.get_value ? get_impl_options.callback : nullptr,
@@ -2072,9 +2082,11 @@ std::vector<Status> DBImpl::MultiGet(
if (!done) {
PinnableSlice pinnable_val;
PERF_TIMER_GUARD(get_from_output_files_time);
super_version->current->Get(
read_options, lkey, &pinnable_val, timestamp, &s, &merge_context,
&max_covering_tombstone_seq, /*value_found=*/nullptr,
PinnedIteratorsManager pinned_iters_mgr;
super_version->current->Get(read_options, lkey, &pinnable_val, timestamp,
&s, &merge_context,
&max_covering_tombstone_seq,
&pinned_iters_mgr, /*value_found=*/nullptr,
/*key_exists=*/nullptr,
/*seq=*/nullptr, read_callback);
value->assign(pinnable_val.data(), pinnable_val.size());
@@ -3148,6 +3160,12 @@ bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
} // namespace

void DBImpl::ReleaseSnapshot(const Snapshot* s) {
if (s == nullptr) {
// DBImpl::GetSnapshot() can return nullptr when snapshot
// not supported by specifying the condition:
// inplace_update_support enabled.
return;
}
const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
{
InstrumentedMutexLock l(&mutex_);
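The nullptr guard added to ReleaseSnapshot() above pairs with the fact that GetSnapshot() can return nullptr when `inplace_update_support` is enabled, the same condition exercised by the new c_test.c phase earlier in this comparison. A hedged, self-contained sketch of that caller-visible behavior; the DB path is invented:

```cpp
#include <cassert>

#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // inplace_update_support disables snapshots and requires turning off
  // concurrent memtable writes.
  options.inplace_update_support = true;
  options.allow_concurrent_memtable_write = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_inplace_example", &db);
  assert(s.ok());

  const rocksdb::Snapshot* snap = db->GetSnapshot();
  assert(snap == nullptr);    // snapshots are not supported in this configuration
  db->ReleaseSnapshot(snap);  // tolerated as a no-op with the guard above

  delete db;
  return 0;
}
```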
@@ -3427,15 +3445,13 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property,
// Needs mutex to protect the list of column families.
InstrumentedMutexLock l(&mutex_);
uint64_t value;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
if (!cfd->initialized()) {
continue;
}
cfd->Ref();
ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
// GetIntPropertyInternal may release db mutex and re-acquire it.
mutex_.AssertHeld();
cfd->UnrefAndTryDelete();
if (ret) {
sum += value;
} else {
@@ -4539,10 +4555,12 @@ Status DBImpl::GetLatestSequenceForKey(
// SST files if cache_only=true?
if (!cache_only) {
// Check tables
PinnedIteratorsManager pinned_iters_mgr;
sv->current->Get(read_options, lkey, /*value=*/nullptr, timestamp, &s,
&merge_context, &max_covering_tombstone_seq,
nullptr /* value_found */, found_record_for_key, seq,
nullptr /*read_callback*/, is_blob_index);
&pinned_iters_mgr, nullptr /* value_found */,
found_record_for_key, seq, nullptr /*read_callback*/,
is_blob_index);

if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
// unexpected error reading SST files
@@ -5024,6 +5042,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
}
}

// TODO: simplify using GetRefedColumnFamilySet?
std::vector<ColumnFamilyData*> cfd_list;
{
InstrumentedMutexLock l(&mutex_);

@@ -1492,19 +1492,31 @@ class DBImpl : public DB {

// Information for a manual compaction
struct ManualCompactionState {
ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
int _output_level, uint32_t _output_path_id,
bool _exclusive, bool _disallow_trivial_move,
std::atomic<bool>* _canceled)
: cfd(_cfd),
input_level(_input_level),
output_level(_output_level),
output_path_id(_output_path_id),
exclusive(_exclusive),
disallow_trivial_move(_disallow_trivial_move),
canceled(_canceled) {}

ColumnFamilyData* cfd;
int input_level;
int output_level;
uint32_t output_path_id;
Status status;
bool done;
bool in_progress; // compaction request being processed?
bool incomplete; // only part of requested range compacted
bool done = false;
bool in_progress = false; // compaction request being processed?
bool incomplete = false; // only part of requested range compacted
bool exclusive; // current behavior of only one manual
bool disallow_trivial_move; // Force actual compaction to run
const InternalKey* begin; // nullptr means beginning of key range
const InternalKey* end; // nullptr means end of key range
InternalKey* manual_end; // how far we are compacting
const InternalKey* begin = nullptr; // nullptr means beginning of key range
const InternalKey* end = nullptr; // nullptr means end of key range
InternalKey* manual_end = nullptr; // how far we are compacting
InternalKey tmp_storage; // Used to keep track of compaction progress
InternalKey tmp_storage1; // Used to keep track of compaction progress
std::atomic<bool>* canceled; // Compaction canceled by the user?
@@ -1711,6 +1723,25 @@ class DBImpl : public DB {
}
}

// TaskType is used to identify tasks in thread-pool, currently only
// differentiate manual compaction, which could be unscheduled from the
// thread-pool.
enum class TaskType : uint8_t {
kDefault = 0,
kManualCompaction = 1,
kCount = 2,
};

// Task tag is used to identity tasks in thread-pool, which is
// dbImpl obj address + type
inline void* GetTaskTag(TaskType type) {
return GetTaskTag(static_cast<uint8_t>(type));
}

inline void* GetTaskTag(uint8_t type) {
return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
}

// REQUIRES: mutex locked and in write thread.
void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
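The GetTaskTag() helpers above derive a per-(DB, task-type) tag from the object's address plus the type value, so the thread pool can later unschedule tasks by tag. A tiny, self-contained illustration of that idea; the struct and names here are invented, not RocksDB code:

```cpp
#include <cstdint>
#include <iostream>

// Invented stand-in for DBImpl; only its address matters for tagging.
struct Db {
  int dummy = 0;
};

// Same idea as DBImpl::GetTaskTag(): offset the object's address by the task
// type so each (object, type) pair yields a distinct opaque tag.
void* TaskTag(Db* db, uint8_t type) {
  return static_cast<uint8_t*>(static_cast<void*>(db)) + type;
}

int main() {
  Db db;
  std::cout << TaskTag(&db, 0) << " " << TaskTag(&db, 1) << "\n";  // two distinct tags
  return 0;
}
```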

@@ -1729,7 +1760,8 @@ class DBImpl : public DB {
WriteBatch** to_be_cached_state);

IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
uint64_t* log_used, uint64_t* log_size);
uint64_t* log_used, uint64_t* log_size,
bool with_db_mutex = false, bool with_log_mutex = false);

IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
log::Writer* log_writer, uint64_t* log_used,
@@ -2055,12 +2087,15 @@ class DBImpl : public DB {
bool persistent_stats_cfd_exists_ = true;

// Without two_write_queues, read and writes to alive_log_files_ are
// protected by mutex_. However since back() is never popped, and push_back()
// is done only from write_thread_, the same thread can access the item
// reffered by back() without mutex_. With two_write_queues_, writes
// protected by mutex_. With two_write_queues_, writes
// are protected by locking both mutex_ and log_write_mutex_, and reads must
// be under either mutex_ or log_write_mutex_.
std::deque<LogFileNumberSize> alive_log_files_;
// Caching the result of `alive_log_files_.back()` so that we do not have to
// call `alive_log_files_.back()` in the write thread (WriteToWAL()) which
// requires locking db mutex if log_mutex_ is not already held in
// two-write-queues mode.
std::deque<LogFileNumberSize>::reverse_iterator alive_log_files_tail_;
// Log files that aren't fully synced, and the current log file.
// Synchronization:
// - push_back() is done from write_thread_ with locked mutex_ and
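The new `alive_log_files_tail_` member caches the position of `alive_log_files_.back()` so the write thread does not need to re-lock just to reach the tail. A self-contained toy of that caching idea using a plain std::deque with invented names; note the cached iterator must be refreshed after every push_back because deque insertion invalidates iterators:

```cpp
#include <deque>
#include <iostream>

int main() {
  std::deque<int> alive_log_files;            // stands in for alive_log_files_
  std::deque<int>::reverse_iterator tail;     // stands in for alive_log_files_tail_

  // Writer side: push a new entry and refresh the cached tail while the
  // "DB mutex" is (conceptually) held.
  alive_log_files.push_back(100);
  tail = alive_log_files.rbegin();

  alive_log_files.push_back(101);
  tail = alive_log_files.rbegin();  // must refresh: push_back invalidated the old iterator

  // Reader on the write thread: use the cached tail instead of calling back()
  // under a lock.
  std::cout << "current WAL: " << *tail << "\n";  // prints 101
  return 0;
}
```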
@@ -2385,11 +2420,10 @@ extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
// will not depend on any WAL file. nullptr means no memtable is being flushed.
// The function is only applicable to 2pc mode.
extern uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
const autovector<MemTable*>& memtables_to_flush);
VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
// For atomic flush.
extern uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
VersionSet* vset,
const autovector<const autovector<MemTable*>*>& memtables_to_flush);

// Fix user-supplied options to be reasonable

@ -170,6 +170,7 @@ Status DBImpl::FlushMemTableToOutputFile(
|
||||
const bool needs_to_sync_closed_wals =
|
||||
logfile_number_ > 0 &&
|
||||
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1;
|
||||
|
||||
// If needs_to_sync_closed_wals is true, we need to record the current
|
||||
// maximum memtable ID of this column family so that a later PickMemtables()
|
||||
// call will not pick memtables whose IDs are higher. This is due to the fact
|
||||
@ -177,9 +178,33 @@ Status DBImpl::FlushMemTableToOutputFile(
// happen for this column family in the meantime. The newly created memtables
// have their data backed by unsynced WALs, thus they cannot be included in
// this flush job.
// Another reason why we must record the current maximum memtable ID of this
// column family: SyncClosedLogs() may release the db mutex, thus it's possible
// for the application to continue to insert into memtables, increasing the
// db's sequence number. The application may take a snapshot, but this
// snapshot is not included in `snapshot_seqs` which will be passed to the
// flush job because `snapshot_seqs` has already been computed before this
// function starts. Recording the max memtable ID ensures that the flush job
// does not flush a memtable without knowing such snapshot(s).
uint64_t max_memtable_id = needs_to_sync_closed_wals
? cfd->imm()->GetLatestMemTableID()
: port::kMaxUint64;

// If needs_to_sync_closed_wals is false, then the flush job will pick ALL
// existing memtables of the column family when PickMemTable() is called
// later. Although we won't call SyncClosedLogs() in this case, we may still
// call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also
// releases and re-acquires the db mutex. In the meantime, the application
// can still insert into the memtables and increase the db's sequence number.
// The application can take a snapshot, hoping that the latest visible state
// to this snapshot is preserved. This is hard to guarantee since the db
// mutex is not held. This newly-created snapshot is not included in
// `snapshot_seqs` and the flush job is unaware of its presence. Consequently,
// the flush job may drop certain keys when generating the L0, causing
// incorrect data to be returned for snapshot reads using this snapshot.
// To address this, we make sure NotifyOnFlushBegin() executes after memtable
// picking so that no new snapshot can be taken between the two functions.

FlushJob flush_job(
dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
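A heavily simplified, hypothetical sketch of the ordering constraint spelled out in the comment above: record the memtable-ID cutoff and pick memtables first, and only then run listener callbacks that may drop the mutex. Names here are illustrative, not DBImpl members.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint64_t> immutable_memtable_ids = {1, 2, 3};
  // 1) Record the cutoff while the db mutex is (conceptually) held.
  uint64_t max_memtable_id = 3;
  // 2) Pick memtables up to the cutoff first ...
  std::vector<uint64_t> picked;
  for (uint64_t id : immutable_memtable_ids) {
    if (id <= max_memtable_id) picked.push_back(id);
  }
  // 3) ... and only then notify listeners, which may release the mutex and
  //     allow new writes/snapshots; the picked set is already fixed.
  std::cout << "NotifyOnFlushBegin over " << picked.size() << " memtables\n";
  return 0;
}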
@ -192,11 +217,6 @@ Status DBImpl::FlushMemTableToOutputFile(
&blob_callback_);
FileMetaData file_meta;

#ifndef ROCKSDB_LITE
// may temporarily unlock and lock the mutex.
NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
#endif  // ROCKSDB_LITE

Status s;
bool need_cancel = false;
IOStatus log_io_s = IOStatus::OK();
@ -221,6 +241,12 @@ Status DBImpl::FlushMemTableToOutputFile(
}
TEST_SYNC_POINT_CALLBACK(
"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job);

#ifndef ROCKSDB_LITE
// may temporarily unlock and lock the mutex.
NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
#endif  // ROCKSDB_LITE

bool switched_to_mempurge = false;
// Within flush_job.Run, rocksdb may call event listener to notify
// file creation and deletion.
@ -1752,21 +1778,16 @@ Status DBImpl::RunManualCompaction(
input_level >= 0);

InternalKey begin_storage, end_storage;
CompactionArg* ca;
CompactionArg* ca = nullptr;

bool scheduled = false;
bool unscheduled = false;
Env::Priority thread_pool_priority = Env::Priority::TOTAL;
bool manual_conflict = false;
ManualCompactionState manual;
manual.cfd = cfd;
manual.input_level = input_level;
manual.output_level = output_level;
manual.output_path_id = compact_range_options.target_path_id;
manual.done = false;
manual.in_progress = false;
manual.incomplete = false;
manual.exclusive = exclusive;
manual.disallow_trivial_move = disallow_trivial_move;
manual.canceled = compact_range_options.canceled;

ManualCompactionState manual(
cfd, input_level, output_level, compact_range_options.target_path_id,
exclusive, disallow_trivial_move, compact_range_options.canceled);
// For universal compaction, we enforce every manual compaction to compact
// all files.
if (begin == nullptr ||
@ -1871,6 +1892,23 @@ Status DBImpl::RunManualCompaction(
assert(!exclusive || !manual_conflict);
// Running either this or some other manual compaction
bg_cv_.Wait();
if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) {
assert(thread_pool_priority != Env::Priority::TOTAL);
// unschedule all manual compactions
auto unscheduled_task_num = env_->UnSchedule(
GetTaskTag(TaskType::kManualCompaction), thread_pool_priority);
if (unscheduled_task_num > 0) {
ROCKS_LOG_INFO(
immutable_db_options_.info_log,
"[%s] Unscheduled %d number of manual compactions from the "
"thread-pool",
cfd->GetName().c_str(), unscheduled_task_num);
// it may unschedule other manual compactions, notify others.
bg_cv_.SignalAll();
}
unscheduled = true;
TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled");
}
if (scheduled && manual.incomplete == true) {
assert(!manual.in_progress);
scheduled = false;
@ -1898,15 +1936,20 @@ Status DBImpl::RunManualCompaction(
bg_bottom_compaction_scheduled_++;
ca->compaction_pri_ = Env::Priority::BOTTOM;
env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca,
Env::Priority::BOTTOM, this,
Env::Priority::BOTTOM,
GetTaskTag(TaskType::kManualCompaction),
&DBImpl::UnscheduleCompactionCallback);
thread_pool_priority = Env::Priority::BOTTOM;
} else {
bg_compaction_scheduled_++;
ca->compaction_pri_ = Env::Priority::LOW;
env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW,
GetTaskTag(TaskType::kManualCompaction),
&DBImpl::UnscheduleCompactionCallback);
thread_pool_priority = Env::Priority::LOW;
}
scheduled = true;
TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
}
}

@ -1914,6 +1957,13 @@ Status DBImpl::RunManualCompaction(
assert(!manual.in_progress);
assert(HasPendingManualCompaction());
RemoveManualCompaction(&manual);
// if the manual job is unscheduled, try to schedule other jobs in case
// there's any unscheduled compaction job which was blocked by exclusive
// manual compaction.
if (manual.status.IsIncomplete() &&
manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
MaybeScheduleFlushOrCompaction();
}
bg_cv_.SignalAll();
return manual.status;
}
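The hunk above schedules manual compactions with a task tag so that DisableManualCompaction() can later pull every pending manual job out of the thread pool at once. A toy, illustrative-only sketch (not the RocksDB Env API) of that "tag tasks so a whole group can be unscheduled" idea:

#include <functional>
#include <iostream>
#include <vector>

struct Task {
  void* tag;
  std::function<void()> fn;
};

class TinyScheduler {
 public:
  void Schedule(void* tag, std::function<void()> fn) {
    queue_.push_back(Task{tag, std::move(fn)});
  }
  // Removes all not-yet-run tasks carrying `tag`; returns how many.
  int UnSchedule(void* tag) {
    int removed = 0;
    for (auto it = queue_.begin(); it != queue_.end();) {
      if (it->tag == tag) {
        it = queue_.erase(it);
        ++removed;
      } else {
        ++it;
      }
    }
    return removed;
  }

 private:
  std::vector<Task> queue_;
};

int main() {
  TinyScheduler sched;
  int manual_tag = 0;  // any stable address works as a group tag
  sched.Schedule(&manual_tag, [] { std::cout << "manual compaction\n"; });
  sched.Schedule(nullptr, [] { std::cout << "automatic compaction\n"; });
  std::cout << "unscheduled " << sched.UnSchedule(&manual_tag) << " task(s)\n";
  return 0;
}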
@ -2641,7 +2691,15 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) {
CompactionArg ca = *(ca_ptr);
delete reinterpret_cast<CompactionArg*>(arg);
if (ca.prepicked_compaction != nullptr) {
// if it's a manual compaction, set status to ManualCompactionPaused
if (ca.prepicked_compaction->manual_compaction_state) {
ca.prepicked_compaction->manual_compaction_state->done = true;
ca.prepicked_compaction->manual_compaction_state->status =
Status::Incomplete(Status::SubCode::kManualCompactionPaused);
}
if (ca.prepicked_compaction->compaction != nullptr) {
ca.prepicked_compaction->compaction->ReleaseCompactionFiles(
Status::Incomplete(Status::SubCode::kManualCompactionPaused));
delete ca.prepicked_compaction->compaction;
}
delete ca.prepicked_compaction;
@ -2880,6 +2938,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
immutable_db_options_.clock->SleepForMicroseconds(1000000);
mutex_.Lock();
} else if (s.IsManualCompactionPaused()) {
assert(prepicked_compaction);
ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
assert(m);
ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
@ -2888,9 +2947,9 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,

ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);

// If compaction failed, we want to delete all temporary files that we might
// have created (they might not be all recorded in job_context in case of a
// failure). Thus, we force full scan in FindObsoleteFiles()
// If compaction failed, we want to delete all temporary files that we
// might have created (they might not be all recorded in job_context in
// case of a failure). Thus, we force full scan in FindObsoleteFiles()
FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
!s.IsManualCompactionPaused() &&
!s.IsColumnFamilyDropped() &&
@ -2917,6 +2976,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,

assert(num_running_compactions_ > 0);
num_running_compactions_--;

if (bg_thread_pri == Env::Priority::LOW) {
bg_compaction_scheduled_--;
} else {
@ -2924,8 +2984,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
bg_bottom_compaction_scheduled_--;
}

versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();

// See if there's more work to be done
MaybeScheduleFlushOrCompaction();

@ -2935,7 +2993,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
// must be done before we potentially signal the DB close process to
// proceed below.
prepicked_compaction->task_token.reset();
;
}

if (made_progress ||
@ -3022,6 +3079,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
manual_compaction->in_progress = true;
}

TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress");

std::unique_ptr<TaskLimiterToken> task_token;

// InternalKey manual_end_storage;
@ -262,8 +262,7 @@ size_t DBImpl::TEST_LogsWithPrepSize() {

uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
autovector<MemTable*> empty_list;
return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr,
empty_list);
return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
}

Status DBImpl::TEST_GetLatestMutableCFOptions(
@ -23,11 +23,7 @@
namespace ROCKSDB_NAMESPACE {

uint64_t DBImpl::MinLogNumberToKeep() {
if (allow_2pc()) {
return versions_->min_log_number_to_keep_2pc();
} else {
return versions_->MinLogNumberWithUnflushedData();
}
return versions_->min_log_number_to_keep();
}

uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
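The hunk above folds the 2PC and non-2PC cases into a single `min_log_number_to_keep`. A rough, stand-alone sketch (made-up structs, not RocksDB types) of the underlying "minimum log number with unflushed data" computation: empty column families are skipped, everyone else contributes their log number, and WALs below the minimum are obsolete.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct FakeCF {
  const char* name;
  uint64_t log_number;  // earliest WAL that may contain unflushed data
  bool has_unflushed_data;
};

int main() {
  std::vector<FakeCF> cfs = {{"default", 12, true},
                             {"cf1", 9, true},
                             {"cf2", 15, false}};  // empty CF is ignored
  uint64_t min_wal = UINT64_MAX;
  for (const auto& cf : cfs) {
    if (cf.has_unflushed_data) {
      min_wal = std::min(min_wal, cf.log_number);
    }
  }
  std::cout << "WALs numbered below " << min_wal << " can be deleted\n";
  return 0;
}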
@ -224,7 +220,6 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
}

// Add log files in wal_dir

if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) {
std::vector<std::string> log_files;
Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files);
@ -234,6 +229,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
log_file, immutable_db_options_.wal_dir);
}
}

// Add info log files in db_log_dir
if (!immutable_db_options_.db_log_dir.empty() &&
immutable_db_options_.db_log_dir != dbname_) {
@ -670,8 +666,7 @@ void DBImpl::DeleteObsoleteFiles() {
}

uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
const autovector<MemTable*>& memtables_to_flush) {
VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
uint64_t min_log = 0;

// we must look through the memtables for two phase transactions
@ -679,7 +674,7 @@ uint64_t FindMinPrepLogReferencedByMemTable(
std::unordered_set<MemTable*> memtables_to_flush_set(
memtables_to_flush.begin(), memtables_to_flush.end());
for (auto loop_cfd : *vset->GetColumnFamilySet()) {
if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) {
if (loop_cfd->IsDropped()) {
continue;
}

@ -701,18 +696,16 @@ uint64_t FindMinPrepLogReferencedByMemTable(
}

uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
VersionSet* vset,
const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
uint64_t min_log = 0;

std::unordered_set<ColumnFamilyData*> cfds_to_flush_set(cfds_to_flush.begin(),
cfds_to_flush.end());
std::unordered_set<MemTable*> memtables_to_flush_set;
for (const autovector<MemTable*>* memtables : memtables_to_flush) {
memtables_to_flush_set.insert(memtables->begin(), memtables->end());
}
for (auto loop_cfd : *vset->GetColumnFamilySet()) {
if (loop_cfd->IsDropped() || cfds_to_flush_set.count(loop_cfd)) {
if (loop_cfd->IsDropped()) {
continue;
}

@ -828,8 +821,8 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
min_log_number_to_keep = min_log_in_prep_heap;
}

uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(
vset, &cfd_to_flush, memtables_to_flush);
uint64_t min_log_refed_by_mem =
FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);

if (min_log_refed_by_mem != 0 &&
min_log_refed_by_mem < min_log_number_to_keep) {
@ -859,8 +852,8 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
min_log_number_to_keep = min_log_in_prep_heap;
}

uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(
vset, cfds_to_flush, memtables_to_flush);
uint64_t min_log_refed_by_mem =
FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);

if (min_log_refed_by_mem != 0 &&
min_log_refed_by_mem < min_log_number_to_keep) {
@ -864,6 +864,11 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
bool flushed = false;
uint64_t corrupted_wal_number = kMaxSequenceNumber;
uint64_t min_wal_number = MinLogNumberToKeep();
if (!allow_2pc()) {
// In non-2pc mode, we skip WALs that do not back unflushed data.
min_wal_number =
std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData());
}
for (auto wal_number : wal_numbers) {
if (wal_number < min_wal_number) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
@ -1268,9 +1273,16 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
}

std::unique_ptr<VersionEdit> wal_deletion;
if (flushed) {
wal_deletion = std::unique_ptr<VersionEdit>(new VersionEdit());
if (immutable_db_options_.track_and_verify_wals_in_manifest) {
wal_deletion.reset(new VersionEdit);
wal_deletion->DeleteWalsBefore(max_wal_number + 1);
}
if (!allow_2pc()) {
// In non-2pc mode, flushing the memtables of the column families
// means we can advance min_log_number_to_keep.
wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1);
}
edit_lists.back().push_back(wal_deletion.get());
}

@ -1349,7 +1361,14 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
// FindObsoleteFiles()
total_log_size_ = 0;
log_empty_ = false;
uint64_t min_wal_with_unflushed_data =
versions_->MinLogNumberWithUnflushedData();
for (auto wal_number : wal_numbers) {
if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
// In non-2pc mode, the WAL files not backing unflushed data are not
// alive, thus should not be added to the alive_log_files_.
continue;
}
// We preallocate space for wals, but then after a crash and restart, that
// preallocated space is not needed anymore. It is likely only the last
// log has such preallocated space, so we only truncate for the last log.
@ -1362,6 +1381,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
total_log_size_ += log.size;
alive_log_files_.push_back(log);
}
alive_log_files_tail_ = alive_log_files_.rbegin();
if (two_write_queues_) {
log_write_mutex_.Unlock();
}
@ -1371,6 +1391,12 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
MemTable* mem, VersionEdit* edit) {
mutex_.AssertHeld();
assert(cfd);
assert(cfd->imm());
// The immutable memtable list must be empty.
assert(std::numeric_limits<uint64_t>::max() ==
cfd->imm()->GetEarliestMemTableID());

const uint64_t start_micros = immutable_db_options_.clock->NowMicros();

FileMetaData meta;
@ -1699,6 +1725,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
}
impl->alive_log_files_.push_back(
DBImpl::LogFileNumberSize(impl->logfile_number_));
impl->alive_log_files_tail_ = impl->alive_log_files_.rbegin();
if (impl->two_write_queues_) {
impl->log_write_mutex_.Unlock();
}
@ -1719,7 +1746,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
WriteOptions write_options;
uint64_t log_used, log_size;
log::Writer* log_writer = impl->logs_.back().writer;
s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size);
s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
/*with_db_mutex==*/true);
if (s.ok()) {
// Need to fsync, otherwise it might get lost after a power reset.
s = impl->FlushWAL(false);
@ -58,9 +58,10 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options,
RecordTick(stats_, MEMTABLE_HIT);
} else {
PERF_TIMER_GUARD(get_from_output_files_time);
PinnedIteratorsManager pinned_iters_mgr;
super_version->current->Get(read_options, lkey, pinnable_val,
/*timestamp=*/nullptr, &s, &merge_context,
&max_covering_tombstone_seq);
&max_covering_tombstone_seq, &pinned_iters_mgr);
RecordTick(stats_, MEMTABLE_MISS);
}
RecordTick(stats_, NUMBER_KEYS_READ);
@ -377,9 +377,10 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
}
if (!done) {
PERF_TIMER_GUARD(get_from_output_files_time);
PinnedIteratorsManager pinned_iters_mgr;
super_version->current->Get(read_options, lkey, pinnable_val,
/*timestamp=*/nullptr, &s, &merge_context,
&max_covering_tombstone_seq);
&max_covering_tombstone_seq, &pinned_iters_mgr);
RecordTick(stats_, MEMTABLE_MISS);
}
{
@ -1085,8 +1085,18 @@ WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
// write thread. Otherwise this must be called holding log_write_mutex_.
IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
log::Writer* log_writer, uint64_t* log_used,
uint64_t* log_size) {
uint64_t* log_size,
bool with_db_mutex, bool with_log_mutex) {
assert(log_size != nullptr);

// Assert mutex explicitly.
if (with_db_mutex) {
mutex_.AssertHeld();
} else if (two_write_queues_) {
log_write_mutex_.AssertHeld();
assert(with_log_mutex);
}

Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
*log_size = log_entry.size();
// When two_write_queues_ is set, WriteToWAL has to be protected from concurrent calls
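The new assertions above check that the caller holds the right lock for the current mode. A toy sketch of the "assert the right lock" idea using an instrumented wrapper around std::mutex (RocksDB uses its own port::Mutex; everything here is illustrative):

#include <cassert>
#include <mutex>
#include <thread>

class AssertableMutex {
 public:
  void Lock() {
    mu_.lock();
    holder_ = std::this_thread::get_id();
  }
  void Unlock() {
    holder_ = std::thread::id();
    mu_.unlock();
  }
  void AssertHeld() const { assert(holder_ == std::this_thread::get_id()); }

 private:
  std::mutex mu_;
  std::thread::id holder_;
};

void WriteToWalSketch(bool with_db_mutex, bool two_write_queues,
                      AssertableMutex& db_mutex, AssertableMutex& log_mutex) {
  // Mirrors the assertions added above: exactly one of the two mutexes is
  // expected to protect the alive-log bookkeeping.
  if (with_db_mutex) {
    db_mutex.AssertHeld();
  } else if (two_write_queues) {
    log_mutex.AssertHeld();
  }
  // ... append to the WAL and update the cached tail here ...
}

int main() {
  AssertableMutex db_mutex, log_mutex;
  db_mutex.Lock();
  WriteToWalSketch(/*with_db_mutex=*/true, /*two_write_queues=*/false,
                   db_mutex, log_mutex);
  db_mutex.Unlock();
  return 0;
}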
@ -1109,9 +1119,12 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
*log_used = logfile_number_;
}
total_log_size_ += log_entry.size();
// TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here
// since alive_log_files_ might be modified concurrently
alive_log_files_.back().AddSize(log_entry.size());
if (with_db_mutex || with_log_mutex) {
assert(alive_log_files_tail_ == alive_log_files_.rbegin());
assert(alive_log_files_tail_ != alive_log_files_.rend());
}
LogFileNumberSize& last_alive_log = *alive_log_files_tail_;
last_alive_log.AddSize(*log_size);
log_empty_ = false;
return io_s;
}
@ -1121,6 +1134,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
bool need_log_sync, bool need_log_dir_sync,
SequenceNumber sequence) {
IOStatus io_s;
assert(!two_write_queues_);
assert(!write_group.leader->disable_wal);
// Same holds for all in the batch group
size_t write_with_wal = 0;
@ -1208,6 +1222,7 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
SequenceNumber* last_sequence, size_t seq_inc) {
IOStatus io_s;

assert(two_write_queues_ || immutable_db_options_.unordered_write);
assert(!write_group.leader->disable_wal);
// Same holds for all in the batch group
WriteBatch tmp_batch;
@ -1232,7 +1247,8 @@ IOStatus DBImpl::ConcurrentWriteToWAL(

log::Writer* log_writer = logs_.back().writer;
uint64_t log_size;
io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
/*with_db_mutex=*/false, /*with_log_mutex=*/true);
if (to_be_cached_state) {
cached_recoverable_state_ = *to_be_cached_state;
cached_recoverable_state_empty_ = false;
@ -1886,6 +1902,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
log_dir_synced_ = false;
logs_.emplace_back(logfile_number_, new_log);
alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
alive_log_files_tail_ = alive_log_files_.rbegin();
}
log_write_mutex_.Unlock();
}
@ -169,6 +169,36 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) {
ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
} while (ChangeCompactOptions());
}

TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) {
do {
Options options = CurrentOptions();
options.create_if_missing = true;
options.inplace_update_support = true;
options.env = env_;
options.write_buffer_size = 100000;
options.allow_concurrent_memtable_write = false;
Reopen(options);
CreateAndReopenWithCF({"pikachu"}, options);

// Update key with values of smaller size, and
// run GetSnapshot and ReleaseSnapshot
int numValues = 2;
for (int i = numValues; i > 0; i--) {
const Snapshot* s = db_->GetSnapshot();
ASSERT_EQ(nullptr, s);
std::string value = DummyString(i, 'a');
ASSERT_OK(Put(1, "key", value));
ASSERT_EQ(value, Get(1, "key"));
// release s (nullptr)
db_->ReleaseSnapshot(s);
}

// Only 1 instance for that key.
validateNumberOfEntries(1, 1);
} while (ChangeCompactOptions());
}

} // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
@ -47,6 +47,45 @@ class DBMergeOperandTest : public DBTestBase {
: DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {}
};

TEST_F(DBMergeOperandTest, MergeOperandReadAfterFreeBug) {
// There was a bug of reading merge operands after they are mistakenly freed
// in DB::GetMergeOperands, which is surfaced when the cache is full.
// See PR#9507 for more.
Options options;
options.create_if_missing = true;
options.merge_operator = MergeOperators::CreateStringAppendOperator();
options.env = env_;
BlockBasedTableOptions table_options;

// Small cache to simulate cache full
table_options.block_cache = NewLRUCache(1);
options.table_factory.reset(NewBlockBasedTableFactory(table_options));

Reopen(options);
int num_records = 4;
int number_of_operands = 0;
std::vector<PinnableSlice> values(num_records);
GetMergeOperandsOptions merge_operands_info;
merge_operands_info.expected_max_number_of_operands = num_records;

ASSERT_OK(Merge("k1", "v1"));
ASSERT_OK(Flush());
ASSERT_OK(Merge("k1", "v2"));
ASSERT_OK(Flush());
ASSERT_OK(Merge("k1", "v3"));
ASSERT_OK(Flush());
ASSERT_OK(Merge("k1", "v4"));

ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
"k1", values.data(), &merge_operands_info,
&number_of_operands));
ASSERT_EQ(number_of_operands, 4);
ASSERT_EQ(values[0].ToString(), "v1");
ASSERT_EQ(values[1].ToString(), "v2");
ASSERT_EQ(values[2].ToString(), "v3");
ASSERT_EQ(values[3].ToString(), "v4");
}

TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
Options options;
options.create_if_missing = true;
@ -1724,6 +1724,34 @@ TEST_F(DBRangeDelTest, OverlappedKeys) {
ASSERT_EQ(0, NumTableFilesAtLevel(1));
}

TEST_F(DBRangeDelTest, IteratorRefresh) {
// Refreshing an iterator after a range tombstone is added should cause the
// deleted range of keys to disappear.
for (bool sv_changed : {false, true}) {
ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));

auto* iter = db_->NewIterator(ReadOptions());
ASSERT_OK(iter->status());

ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
"key2", "key3"));

if (sv_changed) {
ASSERT_OK(db_->Flush(FlushOptions()));
}

ASSERT_OK(iter->Refresh());
ASSERT_OK(iter->status());
iter->SeekToFirst();
ASSERT_EQ("key1", iter->key());
iter->Next();
ASSERT_FALSE(iter->Valid());

delete iter;
}
}

#endif // ROCKSDB_LITE

} // namespace ROCKSDB_NAMESPACE
@ -1481,6 +1481,93 @@ TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
}

TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
Options options = CurrentOptions();
options.env = env_;
options.track_and_verify_wals_in_manifest = true;
// The following makes sure there are two bg flush threads.
options.max_background_jobs = 8;

const std::string cf1_name("cf1");
CreateAndReopenWithCF({cf1_name}, options);
assert(handles_.size() == 2);

{
dbfull()->TEST_LockMutex();
ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes);
dbfull()->TEST_UnlockMutex();
}

ASSERT_OK(dbfull()->PauseBackgroundWork());

ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));

ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[1]));

ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[0]));

bool called = false;
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
// This callback will be called when the first bg flush thread reaches the
// point before entering the MANIFEST write queue after flushing the SST
// file.
// The purpose of the sync points here is to ensure both bg flush threads
// finish computing `min_wal_number_to_keep` before any of them updates the
// `log_number` for the column family that's being flushed.
SyncPoint::GetInstance()->SetCallBack(
"MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep",
[&](void* /*arg*/) {
dbfull()->mutex()->AssertHeld();
if (!called) {
// We are the first bg flush thread in the MANIFEST write queue.
// We set up the dependency between sync points for two threads that
// will be executing the same code.
// For the interleaving of events, see
// https://github.com/facebook/rocksdb/pull/9715.
// bg flush thread1 will release the db mutex while in the MANIFEST
// write queue. In the meantime, bg flush thread2 locks db mutex and
// computes the min_wal_number_to_keep (before thread1 writes to
// MANIFEST thus before cf1->log_number is updated). Bg thread2 joins
// the MANIFEST write queue afterwards and bg flush thread1 proceeds
// with writing to MANIFEST.
called = true;
SyncPoint::GetInstance()->LoadDependency({
{"VersionSet::LogAndApply:WriteManifestStart",
"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"},
{"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2",
"VersionSet::LogAndApply:WriteManifest"},
});
} else {
// The other bg flush thread has already been in the MANIFEST write
// queue, and we are after.
TEST_SYNC_POINT(
"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2");
}
});
SyncPoint::GetInstance()->EnableProcessing();

ASSERT_OK(dbfull()->ContinueBackgroundWork());

ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));

ASSERT_TRUE(called);

Close();

SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();

DB* db1 = nullptr;
Status s = DB::OpenForReadOnly(options, dbname_, &db1);
ASSERT_OK(s);
assert(db1);
delete db1;
}

// Test scope:
// - We expect to open data store under all circumstances
// - We expect only data up to the point where the first error was encountered
@ -95,8 +95,9 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
jwriter << "cf_name" << cf_name << "job" << job_id << "event"
<< "table_file_creation"
<< "file_number" << fd.GetNumber() << "file_size"
<< fd.GetFileSize() << "file_checksum" << file_checksum
<< "file_checksum_func_name" << file_checksum_func_name;
<< fd.GetFileSize() << "file_checksum"
<< Slice(file_checksum).ToString(true) << "file_checksum_func_name"
<< file_checksum_func_name;

// table_properties
{
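The hunk above switches the logged checksum to a hex encoding. A minimal, stand-alone illustration of why: raw checksum bytes are binary and would corrupt the JSON event log, so they are printed as hex instead (the encoding here is a simple sketch, not RocksDB's Slice::ToString).

#include <iostream>
#include <string>

std::string ToHex(const std::string& s) {
  static const char* kDigits = "0123456789ABCDEF";
  std::string out;
  for (unsigned char c : s) {
    out.push_back(kDigits[c >> 4]);
    out.push_back(kDigits[c & 0xF]);
  }
  return out;
}

int main() {
  std::string file_checksum("\x01\xfe\x7f\x00", 4);  // arbitrary binary bytes
  std::cout << "\"file_checksum\": \"" << ToHex(file_checksum) << "\"\n";
  return 0;
}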
@ -773,9 +773,10 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
const std::vector<FileMetaData*>& level_files =
vstorage->LevelFiles(lvl);
const SequenceNumber level_largest_seqno =
(*max_element(level_files.begin(), level_files.end(),
(*std::max_element(level_files.begin(), level_files.end(),
[](FileMetaData* f1, FileMetaData* f2) {
return f1->fd.largest_seqno < f2->fd.largest_seqno;
return f1->fd.largest_seqno <
f2->fd.largest_seqno;
}))
->fd.largest_seqno;
// should only assign seqno to current level's largest seqno when
@ -351,7 +351,7 @@ Status FlushJob::MemPurge() {

// Measure purging time.
const uint64_t start_micros = clock_->NowMicros();
const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000;
const uint64_t start_cpu_micros = clock_->CPUMicros();

MemTable* new_mem = nullptr;
// For performance/log investigation purposes:
@ -603,7 +603,7 @@ Status FlushJob::MemPurge() {
TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful");
}
const uint64_t micros = clock_->NowMicros() - start_micros;
const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros;
const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Mempurge lasted %" PRIu64
" microseconds, and %" PRIu64
@ -789,7 +789,7 @@ Status FlushJob::WriteLevel0Table() {
ThreadStatus::STAGE_FLUSH_WRITE_L0);
db_mutex_->AssertHeld();
const uint64_t start_micros = clock_->NowMicros();
const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000;
const uint64_t start_cpu_micros = clock_->CPUMicros();
Status s;

std::vector<BlobFileAddition> blob_file_additions;
@ -976,7 +976,7 @@ Status FlushJob::WriteLevel0Table() {
// Note that here we treat flush as level 0 compaction in internal stats
InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
const uint64_t micros = clock_->NowMicros() - start_micros;
const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros;
const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
stats.micros = micros;
stats.cpu_micros = cpu_micros;
@ -33,6 +33,7 @@
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/types.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/internal_iterator.h"
#include "table/iterator_wrapper.h"
@ -447,11 +448,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
is_range_del_table_empty_.load(std::memory_order_relaxed)) {
return nullptr;
}
return NewRangeTombstoneIteratorInternal(read_options, read_seq);
}

FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
const ReadOptions& read_options, SequenceNumber read_seq) {
auto* unfragmented_iter = new MemTableIterator(
*this, read_options, nullptr /* arena */, true /* use_range_del_table */);
if (unfragmented_iter == nullptr) {
return nullptr;
}
auto fragmented_tombstone_list =
std::make_shared<FragmentedRangeTombstoneList>(
std::unique_ptr<InternalIterator>(unfragmented_iter),
@ -960,53 +963,58 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
}
PERF_TIMER_GUARD(get_from_memtable_time);

// For now, memtable Bloom filter is effectively disabled if there are any
// range tombstones. This is the simplest way to ensure range tombstones are
// handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
bool no_range_del = read_options.ignore_range_deletions ||
is_range_del_table_empty_.load(std::memory_order_relaxed);
MultiGetRange temp_range(*range, range->begin(), range->end());
if (bloom_filter_) {
std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
if (bloom_filter_ && no_range_del) {
bool whole_key =
!prefix_extractor_ || moptions_.memtable_whole_key_filtering;
std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
int num_keys = 0;
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
if (!prefix_extractor_) {
keys[num_keys++] = &iter->ukey_without_ts;
if (whole_key) {
bloom_keys[num_keys] = iter->ukey_without_ts;
range_indexes[num_keys++] = iter.index();
} else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
prefixes.emplace_back(
prefix_extractor_->Transform(iter->ukey_without_ts));
keys[num_keys++] = &prefixes.back();
}
}
bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]);
int idx = 0;
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
if (prefix_extractor_ &&
!prefix_extractor_->InDomain(iter->ukey_without_ts)) {
bloom_keys[num_keys] =
prefix_extractor_->Transform(iter->ukey_without_ts);
range_indexes[num_keys++] = iter.index();
} else {
// TODO: consider not counting these as Bloom hits to more closely
// match bloom_sst_hit_count
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
continue;
}
if (!may_match[idx]) {
temp_range.SkipKey(iter);
}
bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
for (int i = 0; i < num_keys; ++i) {
if (!may_match[i]) {
temp_range.SkipIndex(range_indexes[i]);
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
} else {
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
}
idx++;
}
}
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
SequenceNumber seq = kMaxSequenceNumber;
bool found_final_value{false};
bool merge_in_progress = iter->s->IsMergeInProgress();
if (!no_range_del) {
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
NewRangeTombstoneIterator(
NewRangeTombstoneIteratorInternal(
read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
if (range_del_iter != nullptr) {
iter->max_covering_tombstone_seq = std::max(
iter->max_covering_tombstone_seq,
range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
}
SequenceNumber dummy_seq;
GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
callback, &iter->is_blob_index, iter->value->GetSelf(),
iter->timestamp, iter->s, &(iter->merge_context), &seq,
iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq,
&found_final_value, &merge_in_progress);

if (!found_final_value && merge_in_progress) {
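The MultiGet hunk above checks the memtable Bloom filter in one batched call and remembers which batch positions each filter key came from. A simplified, stand-alone version of that bookkeeping (a toy set stands in for the real Bloom filter; all names are illustrative):

#include <array>
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  const std::vector<std::string> batch = {"a", "b", "c", "d"};
  const std::unordered_set<std::string> toy_filter = {"a", "c"};

  std::array<std::string, 8> filter_keys;
  std::array<size_t, 8> batch_indexes;
  std::array<bool, 8> may_match{};
  int num_keys = 0;

  // Collect candidate keys plus their positions in the batch.
  for (size_t i = 0; i < batch.size(); ++i) {
    filter_keys[num_keys] = batch[i];
    batch_indexes[num_keys++] = i;
  }
  // One batched filter probe for all collected keys.
  for (int i = 0; i < num_keys; ++i) {
    may_match[i] = toy_filter.count(filter_keys[i]) > 0;
  }
  // Skip the batch positions the filter ruled out.
  for (int i = 0; i < num_keys; ++i) {
    if (!may_match[i]) {
      std::cout << "skip batch index " << batch_indexes[i] << "\n";
    }
  }
  return 0;
}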
@ -600,6 +600,10 @@ class MemTable {
std::string* value, std::string* timestamp, Status* s,
MergeContext* merge_context, SequenceNumber* seq,
bool* found_final_value, bool* merge_in_progress);

// Always returns non-null and assumes certain pre-checks are done
FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
const ReadOptions& read_options, SequenceNumber read_seq);
};

extern const char* EncodeKey(std::string* scratch, const Slice& target);
@ -494,8 +494,8 @@ Status MemTableList::TryInstallMemtableFlushResults(
// TODO(myabandeh): Not sure how batch_count could be 0 here.
if (batch_count > 0) {
uint64_t min_wal_number_to_keep = 0;
if (vset->db_options()->allow_2pc) {
assert(edit_list.size() > 0);
if (vset->db_options()->allow_2pc) {
// Note that if mempurge is successful, the edit_list will
// not be applicable (contains info of new min_log number to keep,
// and level 0 file path of SST file created during normal flush,
@ -506,21 +506,24 @@ Status MemTableList::TryInstallMemtableFlushResults(

// We piggyback the information of earliest log file to keep in the
// manifest entry for the last file flushed.
edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep);
}

std::unique_ptr<VersionEdit> wal_deletion;
if (vset->db_options()->track_and_verify_wals_in_manifest) {
if (!vset->db_options()->allow_2pc) {
} else {
min_wal_number_to_keep =
PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list);
}
edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep);

std::unique_ptr<VersionEdit> wal_deletion;
if (vset->db_options()->track_and_verify_wals_in_manifest) {
if (min_wal_number_to_keep >
vset->GetWalSet().GetMinWalNumberToKeep()) {
wal_deletion.reset(new VersionEdit);
wal_deletion->DeleteWalsBefore(min_wal_number_to_keep);
edit_list.push_back(wal_deletion.get());
}
TEST_SYNC_POINT_CALLBACK(
"MemTableList::TryInstallMemtableFlushResults:"
"AfterComputeMinWalToKeep",
nullptr);
}

const auto manifest_write_cb = [this, cfd, batch_count, log_buffer,
@ -805,15 +808,14 @@ Status InstallMemtableAtomicFlushResults(
if (vset->db_options()->allow_2pc) {
min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
vset, cfds, edit_lists, mems_list, prep_tracker);
edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep);
}

std::unique_ptr<VersionEdit> wal_deletion;
if (vset->db_options()->track_and_verify_wals_in_manifest) {
if (!vset->db_options()->allow_2pc) {
} else {
min_wal_number_to_keep =
PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists);
}
edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep);

std::unique_ptr<VersionEdit> wal_deletion;
if (vset->db_options()->track_and_verify_wals_in_manifest) {
if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
wal_deletion.reset(new VersionEdit);
wal_deletion->DeleteWalsBefore(min_wal_number_to_keep);
@ -114,16 +114,19 @@ Status TableCache::GetTableReader(
if (s.ok()) {
s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
}
if (s.ok()) {
RecordTick(ioptions_.stats, NO_FILE_OPENS);
if (s.IsPathNotFound()) {
} else if (s.IsPathNotFound()) {
fname = Rocks2LevelTableFileName(fname);
s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
if (s.ok()) {
s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
nullptr);
}
if (s.ok()) {
RecordTick(ioptions_.stats, NO_FILE_OPENS);
}
}

if (s.ok()) {
if (!sequential_mode && ioptions_.advise_random_on_open) {
@ -394,7 +394,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
if (s->ok()) {
version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily(
version_edit_params_.max_column_family_);
version_set_->MarkMinLogNumberToKeep2PC(
version_set_->MarkMinLogNumberToKeep(
version_edit_params_.min_log_number_to_keep_);
version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_);
version_set_->MarkFileNumberUsed(version_edit_params_.log_number_);
@ -970,12 +970,11 @@ void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
fprintf(stdout,
"next_file_number %" PRIu64 " last_sequence %" PRIu64
" prev_log_number %" PRIu64 " max_column_family %" PRIu32
" min_log_number_to_keep "
"%" PRIu64 "\n",
" min_log_number_to_keep %" PRIu64 "\n",
version_set_->current_next_file_number(),
version_set_->LastSequence(), version_set_->prev_log_number(),
version_set_->column_family_set_->GetMaxColumnFamily(),
version_set_->min_log_number_to_keep_2pc());
version_set_->min_log_number_to_keep());
}

} // namespace ROCKSDB_NAMESPACE
@ -1965,7 +1965,8 @@ void Version::MultiGetBlob(
void Version::Get(const ReadOptions& read_options, const LookupKey& k,
PinnableSlice* value, std::string* timestamp, Status* status,
MergeContext* merge_context,
SequenceNumber* max_covering_tombstone_seq, bool* value_found,
SequenceNumber* max_covering_tombstone_seq,
PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
bool* is_blob, bool do_merge) {
Slice ikey = k.internal_key();
@ -1978,7 +1979,6 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
*key_exists = true;
}

PinnedIteratorsManager pinned_iters_mgr;
uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
if (vset_ && vset_->block_cache_tracer_ &&
vset_->block_cache_tracer_->is_tracing_enabled()) {
@ -1992,17 +1992,18 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
BlobFetcher blob_fetcher(this, read_options);

assert(pinned_iters_mgr);
GetContext get_context(
user_comparator(), merge_operator_, info_log_, db_statistics_,
status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found,
merge_context, do_merge, max_covering_tombstone_seq, clock_, seq,
merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob_to_use,
merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
tracing_get_id, &blob_fetcher);

// Pin blocks that we read to hold merge operands
if (merge_operator_) {
pinned_iters_mgr.StartPinning();
pinned_iters_mgr->StartPinning();
}

FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
@ -2188,12 +2189,31 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
// blob_file => [[blob_idx, it], ...]
std::unordered_map<uint64_t, BlobReadRequests> blob_rqs;
int level = -1;

while (f != nullptr) {
MultiGetRange file_range = fp.CurrentFileRange();
bool timer_enabled =
GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
get_perf_context()->per_level_perf_context_enabled;

// Report MultiGet stats per level.
if (level >= 0 && level != (int)fp.GetHitFileLevel()) {
// Dump the stats if the search has moved to the next level and
// reset for next level.
RecordInHistogram(db_statistics_,
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
num_index_read + num_filter_read);
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
num_data_read);
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
num_filter_read = 0;
num_index_read = 0;
num_data_read = 0;
num_sst_read = 0;
level = fp.GetHitFileLevel();
}

StopWatchNano timer(clock_, timer_enabled /* auto_start */);
s = table_cache_->MultiGet(
read_options, *internal_comparator(), *f->file_metadata, &file_range,
@ -2238,6 +2258,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
num_filter_read += get_context.get_context_stats_.num_filter_read;
num_data_read += get_context.get_context_stats_.num_data_read;
num_sst_read += get_context.get_context_stats_.num_sst_read;
// Reset these stats since they're specific to a level
get_context.get_context_stats_.num_index_read = 0;
get_context.get_context_stats_.num_filter_read = 0;
get_context.get_context_stats_.num_data_read = 0;
get_context.get_context_stats_.num_sst_read = 0;

// report the counters before returning
if (get_context.State() != GetContext::kNotFound &&
@ -2314,22 +2339,6 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
}
}

// Report MultiGet stats per level.
if (fp.IsHitFileLastInLevel()) {
// Dump the stats if this is the last file of this level and reset for
// next level.
RecordInHistogram(db_statistics_,
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
num_index_read + num_filter_read);
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
num_data_read);
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
num_filter_read = 0;
num_index_read = 0;
num_data_read = 0;
num_sst_read = 0;
}

RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
if (!s.ok() || file_picker_range.empty()) {
break;
@ -2337,6 +2346,13 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
f = fp.GetNextFile();
}

// Dump stats for most recent level
RecordInHistogram(db_statistics_, NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
num_index_read + num_filter_read);
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
num_data_read);
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);

if (s.ok() && !blob_rqs.empty()) {
MultiGetBlob(read_options, keys_with_blobs_range, blob_rqs);
}
@ -4097,7 +4113,7 @@ void VersionSet::Reset() {
}
db_id_.clear();
next_file_number_.store(2);
min_log_number_to_keep_2pc_.store(0);
min_log_number_to_keep_.store(0);
manifest_file_number_ = 0;
options_file_number_ = 0;
pending_manifest_file_number_ = 0;
@ -4564,8 +4580,7 @@ Status VersionSet::ProcessManifestWrites(
}

if (last_min_log_number_to_keep != 0) {
// Should only be set in 2PC mode.
MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep);
MarkMinLogNumberToKeep(last_min_log_number_to_keep);
}

for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
@ -4919,7 +4934,7 @@ Status VersionSet::Recover(
",min_log_number_to_keep is %" PRIu64 "\n",
manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
last_sequence_.load(), log_number, prev_log_number_,
column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc());
column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());

for (auto cfd : *column_family_set_) {
if (cfd->IsDropped()) {
@ -5324,9 +5339,9 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
}
// Called only either from ::LogAndApply which is protected by mutex or during
// recovery which is single-threaded.
void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) {
if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) {
min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed);
void VersionSet::MarkMinLogNumberToKeep(uint64_t number) {
if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
min_log_number_to_keep_.store(number, std::memory_order_relaxed);
}
}

@ -5448,7 +5463,7 @@ Status VersionSet::WriteCurrentStateToManifest(
// min_log_number_to_keep is for the whole db, not for a specific column
// family, so it does not need to be set for every column family; it just
// needs to be set once. Since the default CF can never be dropped, we set
// the min_log on the default CF here.
uint64_t min_log = min_log_number_to_keep_2pc();
uint64_t min_log = min_log_number_to_keep();
if (min_log != 0) {
edit.SetMinLogNumberToKeep(min_log);
}
@ -708,9 +708,11 @@ class Version {
// If the key has any merge operands then store them in
// merge_context.operands_list and don't merge the operands
// REQUIRES: lock is not held
// REQUIRES: pinned_iters_mgr != nullptr
void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
std::string* timestamp, Status* status, MergeContext* merge_context,
SequenceNumber* max_covering_tombstone_seq,
PinnedIteratorsManager* pinned_iters_mgr,
bool* value_found = nullptr, bool* key_exists = nullptr,
SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
bool* is_blob = nullptr, bool do_merge = true);
@ -1099,8 +1101,8 @@ class VersionSet {

uint64_t current_next_file_number() const { return next_file_number_.load(); }

uint64_t min_log_number_to_keep_2pc() const {
return min_log_number_to_keep_2pc_.load();
uint64_t min_log_number_to_keep() const {
return min_log_number_to_keep_.load();
}

// Allocate and return a new file number
@ -1158,7 +1160,7 @@ class VersionSet {
// Mark the specified log number as deleted
// REQUIRED: this is only called during single-threaded recovery or repair, or
// from ::LogAndApply where the global mutex is held.
void MarkMinLogNumberToKeep2PC(uint64_t number);
void MarkMinLogNumberToKeep(uint64_t number);

// Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file.
@ -1167,10 +1169,12 @@ class VersionSet {
// Returns the minimum log number which still has data not flushed to any SST
// file.
// In non-2PC mode, all the log numbers smaller than this number can be safely
// deleted.
// deleted, although we still use `min_log_number_to_keep_` to determine when
// to delete a WAL file.
uint64_t MinLogNumberWithUnflushedData() const {
return PreComputeMinLogNumberWithUnflushedData(nullptr);
}

// Returns the minimum log number which still has data not flushed to any SST
// file.
// Empty column families' log number is considered to be
@ -1268,6 +1272,10 @@ class VersionSet {
uint64_t min_pending_output);

ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
RefedColumnFamilySet GetRefedColumnFamilySet() {
return RefedColumnFamilySet(GetColumnFamilySet());
}

const FileOptions& file_options() { return file_options_; }
void ChangeFileOptions(const MutableDBOptions& new_options) {
file_options_.writable_file_max_buffer_size =
@ -1370,9 +1378,8 @@ class VersionSet {
const ImmutableDBOptions* const db_options_;
std::atomic<uint64_t> next_file_number_;
// Any WAL number smaller than this should be ignored during recovery,
// and is qualified for being deleted in 2PC mode. In non-2PC mode, this
// number is ignored.
std::atomic<uint64_t> min_log_number_to_keep_2pc_ = {0};
// and is qualified for being deleted.
std::atomic<uint64_t> min_log_number_to_keep_ = {0};
uint64_t manifest_file_number_;
uint64_t options_file_number_;
uint64_t options_file_size_;
@ -3204,6 +3204,7 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
}

TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
db_options_.allow_2pc = true;
NewDB();

SstInfo sst(100, kDefaultColumnFamilyName, "a");
@ -3215,12 +3216,12 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
edit.AddFile(0, file_metas[0]);
edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC);
ASSERT_OK(LogAndApplyToDefaultCF(edit));
ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC);
ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);

for (int i = 0; i < 3; i++) {
CreateNewManifest();
ReopenDB();
ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC);
ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
}
}
2 env/env_posix.cc vendored
@ -166,7 +166,7 @@ class PosixClock : public SystemClock {
defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12))
struct timespec ts;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
return static_cast<uint64_t>(ts.tv_sec) * 1000000000;
return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000;
#endif
return 0;
}
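The PosixClock fix above folds both the seconds and the nanoseconds of the thread CPU time into microseconds instead of dropping tv_nsec. A quick POSIX-only check of the unit conversion (e.g. tv_sec = 2, tv_nsec = 345678901 yields 2345678 microseconds):

#include <cstdint>
#include <cstdio>
#include <ctime>

int main() {
  struct timespec ts;
  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
    return 1;
  }
  uint64_t micros =
      (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000;
  std::printf("thread CPU time: %llu us\n",
              static_cast<unsigned long long>(micros));
  return 0;
}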
51 env/fs_posix.cc vendored
@ -325,14 +325,7 @@ class PosixFileSystem : public FileSystem {
SetFD_CLOEXEC(fd, &options);

if (options.use_mmap_writes) {
if (!checkedDiskForMmap_) {
// this will be executed once in the program's lifetime.
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
if (!SupportsFastAllocate(fname)) {
forceMmapOff_ = true;
}
checkedDiskForMmap_ = true;
}
MaybeForceDisableMmap(fd);
}
if (options.use_mmap_writes && !forceMmapOff_) {
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
@ -431,14 +424,7 @@ class PosixFileSystem : public FileSystem {
}

if (options.use_mmap_writes) {
if (!checkedDiskForMmap_) {
// this will be executed once in the program's lifetime.
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
if (!SupportsFastAllocate(fname)) {
forceMmapOff_ = true;
}
checkedDiskForMmap_ = true;
}
MaybeForceDisableMmap(fd);
}
if (options.use_mmap_writes && !forceMmapOff_) {
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
@ -753,8 +739,10 @@ class PosixFileSystem : public FileSystem {
|
||||
const IOOptions& /*opts*/,
|
||||
IODebugContext* /*dbg*/) override {
|
||||
if (link(src.c_str(), target.c_str()) != 0) {
|
||||
if (errno == EXDEV) {
|
||||
return IOStatus::NotSupported("No cross FS links allowed");
|
||||
if (errno == EXDEV || errno == ENOTSUP) {
|
||||
return IOStatus::NotSupported(errno == EXDEV
|
||||
? "No cross FS links allowed"
|
||||
: "Links not supported by FS");
|
||||
}
|
||||
return IOError("while link file to " + target, src, errno);
|
||||
}
|
||||
@ -997,8 +985,7 @@ class PosixFileSystem : public FileSystem {
|
||||
}
|
||||
#endif
|
||||
private:
|
||||
bool checkedDiskForMmap_;
|
||||
bool forceMmapOff_; // do we override Env options?
|
||||
bool forceMmapOff_ = false; // do we override Env options?
|
||||
|
||||
// Returns true iff the named directory exists and is a directory.
|
||||
virtual bool DirExists(const std::string& dname) {
|
||||
@ -1009,10 +996,10 @@ class PosixFileSystem : public FileSystem {
|
||||
return false; // stat() failed return false
|
||||
}
|
||||
|
||||
bool SupportsFastAllocate(const std::string& path) {
|
||||
bool SupportsFastAllocate(int fd) {
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
struct statfs s;
|
||||
if (statfs(path.c_str(), &s)) {
|
||||
if (fstatfs(fd, &s)) {
|
||||
return false;
|
||||
}
|
||||
switch (s.f_type) {
|
||||
@ -1026,11 +1013,26 @@ class PosixFileSystem : public FileSystem {
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
(void)path;
|
||||
(void)fd;
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
void MaybeForceDisableMmap(int fd) {
|
||||
static std::once_flag s_check_disk_for_mmap_once;
|
||||
assert(this == FileSystem::Default().get());
|
||||
std::call_once(
|
||||
s_check_disk_for_mmap_once,
|
||||
[this](int fdesc) {
|
||||
// this will be executed once in the program's lifetime.
|
||||
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
|
||||
if (!SupportsFastAllocate(fdesc)) {
|
||||
forceMmapOff_ = true;
|
||||
}
|
||||
},
|
||||
fd);
|
||||
}
|
||||
|
||||
#ifdef ROCKSDB_IOURING_PRESENT
|
||||
bool IsIOUringEnabled() {
|
||||
if (RocksDbIOUringEnable && RocksDbIOUringEnable()) {
|
||||
@ -1094,8 +1096,7 @@ size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded(
|
||||
}
|
||||
|
||||
PosixFileSystem::PosixFileSystem()
|
||||
: checkedDiskForMmap_(false),
|
||||
forceMmapOff_(false),
|
||||
: forceMmapOff_(false),
|
||||
page_size_(getpagesize()),
|
||||
allow_non_owner_access_(true) {
|
||||
#if defined(ROCKSDB_IOURING_PRESENT)
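The fs_posix.cc changes above replace the racy checkedDiskForMmap_ flag with MaybeForceDisableMmap(), which runs the filesystem probe under a function-local std::once_flag and probes the already-open fd with fstatfs() instead of re-resolving the path. A hedged sketch of the same call_once pattern, with stand-in names (ProbeSupportsFastAllocate and g_force_mmap_off are illustrative, not RocksDB's):

#include <atomic>
#include <mutex>
#include <sys/vfs.h>  // fstatfs on Linux

std::atomic<bool> g_force_mmap_off{false};

// Hypothetical probe: here it only checks that fstatfs() succeeds; the real
// code additionally inspects s.f_type for ext4/xfs/tmpfs.
static bool ProbeSupportsFastAllocate(int fd) {
  struct statfs s;
  return fstatfs(fd, &s) == 0;
}

void MaybeForceDisableMmap(int fd) {
  // std::call_once guarantees the probe runs exactly once per process even
  // when many threads open writable files concurrently; later callers wait
  // for the first probe and then observe its published result.
  static std::once_flag once;
  std::call_once(once, [fd] {
    if (!ProbeSupportsFastAllocate(fd)) {
      g_force_mmap_off.store(true, std::memory_order_relaxed);
    }
  });
}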
@ -366,10 +366,16 @@ struct AdvancedColumnFamilyOptions {
Slice delta_value,
std::string* merged_value) = nullptr;

// if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
// create prefix bloom for memtable with the size of
// Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
// Bloom filter in memtable to optimize many queries that must go beyond
// the memtable. The size in bytes of the filter is
// write_buffer_size * memtable_prefix_bloom_size_ratio.
// If it is larger than 0.25, it is sanitized to 0.25.
// * If prefix_extractor is set, the filter includes prefixes.
// * If memtable_whole_key_filtering, the filter includes whole keys.
// * If both, the filter includes both.
// * If neither, the feature is disabled.
//
// If this value is larger than 0.25, it is sanitized to 0.25.
//
// Default: 0 (disable)
//
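The rewritten option comment above spells out that the memtable Bloom filter can cover prefixes, whole keys, or both, and that its size is write_buffer_size * memtable_prefix_bloom_size_ratio (capped at 0.25). A small configuration sketch using the public RocksDB options, with illustrative values:

#include <rocksdb/options.h>
#include <rocksdb/slice_transform.h>

rocksdb::Options MakeOptionsWithMemtableBloom() {
  rocksdb::Options options;
  // Filter sized at write_buffer_size * ratio; values above 0.25 are
  // sanitized down to 0.25.
  options.memtable_prefix_bloom_size_ratio = 0.02;
  // With a prefix extractor the filter indexes 4-byte prefixes...
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  // ...and with this flag it also indexes whole keys.
  options.memtable_whole_key_filtering = true;
  return options;
}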
@ -744,7 +744,7 @@ class DB {
// snapshot is no longer needed.
//
// nullptr will be returned if the DB fails to take a snapshot or does
// not support snapshot.
// not support snapshot (eg: inplace_update_support enabled).
virtual const Snapshot* GetSnapshot() = 0;

// Release a previously acquired snapshot. The caller must not

@ -171,7 +171,7 @@ class Env : public Customizable {
Env(const Env&) = delete;
void operator=(const Env&) = delete;

virtual ~Env();
~Env() override;

static const char* Type() { return "Environment"; }

@ -570,10 +570,10 @@ class ObjectRegistry {
}
}
}
if (parent_ != nullptr) {
return parent_->FindFactory<T>(name);
} else {
if (parent_ == nullptr) {
return nullptr;
} else {
return parent_->FindFactory<T>(name);
}
}

@ -11,7 +11,7 @@
#define ROCKSDB_MAJOR 6
#define ROCKSDB_MINOR 29
#define ROCKSDB_PATCH 0
#define ROCKSDB_PATCH 5

// Do not use these. We made the mistake of declaring macros starting with
// double underscore. Now we have to live with our choice. We'll deprecate these

@ -4282,9 +4282,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setSstPartitionerFactory(
JNIEnv*, jobject, jlong jhandle, jlong factory_handle) {
auto* options =
reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
auto* factory = reinterpret_cast<ROCKSDB_NAMESPACE::SstPartitionerFactory*>(
auto factory = reinterpret_cast<
std::shared_ptr<ROCKSDB_NAMESPACE::SstPartitionerFactory>*>(
factory_handle);
options->sst_partitioner_factory.reset(factory);
options->sst_partitioner_factory = *factory;
}
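The JNI fix above reinterprets the incoming jlong as a pointer to a std::shared_ptr<SstPartitionerFactory> and copies it, so the native options share ownership with the Java-side handle instead of reset()-ing a pointer the options object does not own. A simplified sketch of the two patterns with hypothetical stand-in types:

#include <cstdint>
#include <memory>

struct Factory {};                              // stand-in for SstPartitionerFactory
struct Options { std::shared_ptr<Factory> factory; };

// The handle passed across JNI is really the address of a shared_ptr that the
// Java object continues to own.
void SetFactory(Options* options, int64_t handle) {
  auto* factory_ptr = reinterpret_cast<std::shared_ptr<Factory>*>(handle);
  // Copy-assign to share ownership with the Java-side handle. The buggy
  // variant cast the handle to a raw Factory* and reset() the option with it,
  // taking ownership of memory it did not own.
  options->factory = *factory_ptr;
}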
/*

@ -18,7 +18,11 @@ public class NativeLibraryLoader {

private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb");
private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb");
private static final /* @Nullable */ String fallbackJniLibraryName =
Environment.getFallbackJniLibraryName("rocksdb");
private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb");
private static final /* @Nullable */ String fallbackJniLibraryFileName =
Environment.getFallbackJniLibraryFileName("rocksdb");
private static final String tempFilePrefix = "librocksdbjni";
private static final String tempFileSuffix = Environment.getJniLibraryExtension();

@ -49,15 +53,34 @@ public class NativeLibraryLoader {
*/
public synchronized void loadLibrary(final String tmpDir) throws IOException {
try {
// try dynamic library
System.loadLibrary(sharedLibraryName);
} catch(final UnsatisfiedLinkError ule1) {
return;
} catch (final UnsatisfiedLinkError ule) {
// ignore - try from static library
}

try {
// try static library
System.loadLibrary(jniLibraryName);
} catch(final UnsatisfiedLinkError ule2) {
return;
} catch (final UnsatisfiedLinkError ule) {
// ignore - then try static library fallback or from jar
}

if (fallbackJniLibraryName != null) {
try {
// try static library fallback
System.loadLibrary(fallbackJniLibraryName);
return;
} catch (final UnsatisfiedLinkError ule) {
// ignore - then try from jar
}
}

// try jar
loadLibraryFromJar(tmpDir);
}
}
}

/**
* Attempts to extract the native RocksDB library

@ -83,6 +106,27 @@ public class NativeLibraryLoader {

File loadLibraryFromJarToTemp(final String tmpDir)
throws IOException {
InputStream is = null;
try {
// attempt to look up the static library in the jar file
String libraryFileName = jniLibraryFileName;
is = getClass().getClassLoader().getResourceAsStream(libraryFileName);

if (is == null) {
// is there a fallback we can try
if (fallbackJniLibraryFileName == null) {
throw new RuntimeException(libraryFileName + " was not found inside JAR.");
}

// attempt to look up the fallback static library in the jar file
libraryFileName = fallbackJniLibraryFileName;
is = getClass().getClassLoader().getResourceAsStream(libraryFileName);
if (is == null) {
throw new RuntimeException(libraryFileName + " was not found inside JAR.");
}
}

// create a temporary file to copy the library to
final File temp;
if (tmpDir == null || tmpDir.isEmpty()) {
temp = File.createTempFile(tempFilePrefix, tempFileSuffix);

@ -92,34 +136,32 @@ public class NativeLibraryLoader {
throw new RuntimeException(
"Directory: " + parentDir.getAbsolutePath() + " does not exist!");
}
temp = new File(parentDir, jniLibraryFileName);
temp = new File(parentDir, libraryFileName);
if (temp.exists() && !temp.delete()) {
throw new RuntimeException("File: " + temp.getAbsolutePath()
+ " already exists and cannot be removed.");
throw new RuntimeException(
"File: " + temp.getAbsolutePath() + " already exists and cannot be removed.");
}
if (!temp.createNewFile()) {
throw new RuntimeException("File: " + temp.getAbsolutePath()
+ " could not be created.");
throw new RuntimeException("File: " + temp.getAbsolutePath() + " could not be created.");
}
}

if (!temp.exists()) {
throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist.");
} else {
temp.deleteOnExit();
}

// attempt to copy the library from the Jar file to the temp destination
try (final InputStream is = getClass().getClassLoader().
getResourceAsStream(jniLibraryFileName)) {
if (is == null) {
throw new RuntimeException(jniLibraryFileName + " was not found inside JAR.");
} else {
// copy the library from the Jar file to the temp destination
Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING);
}
}

// return the temporary library file
return temp;

} finally {
if (is != null) {
is.close();
}
}
}

/**
@ -110,8 +110,14 @@ public class Environment {
return String.format("%sjni-linux%s%s", name, arch, getLibcPostfix());
}
} else if (isMac()) {
if (is64Bit()) {
final String arch;
if (isAarch64()) {
return String.format("%sjni-osx-%s", name, ARCH);
arch = "arm64";
} else {
arch = "x86_64";
}
return String.format("%sjni-osx-%s", name, arch);
} else {
return String.format("%sjni-osx", name);
}

@ -131,10 +137,25 @@ public class Environment {
throw new UnsupportedOperationException(String.format("Cannot determine JNI library name for ARCH='%s' OS='%s' name='%s'", ARCH, OS, name));
}

public static /*@Nullable*/ String getFallbackJniLibraryName(final String name) {
if (isMac() && is64Bit()) {
return String.format("%sjni-osx", name);
}
return null;
}

public static String getJniLibraryFileName(final String name) {
return appendLibOsSuffix("lib" + getJniLibraryName(name), false);
}

public static /*@Nullable*/ String getFallbackJniLibraryFileName(final String name) {
final String fallbackJniLibraryName = getFallbackJniLibraryName(name);
if (fallbackJniLibraryName == null) {
return null;
}
return appendLibOsSuffix("lib" + fallbackJniLibraryName, false);
}

private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) {
if (isUnix() || isAix() || isSolaris() || isFreeBSD() || isOpenBSD()) {
return libraryFileName + ".so";
@ -5,6 +5,7 @@

package org.rocksdb;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.assertj.core.api.Assertions.assertThat;

import java.util.List;

@ -21,7 +22,7 @@ public class SstPartitionerTest {
@Rule public TemporaryFolder dbFolder = new TemporaryFolder();

@Test
public void sstFixedPrefix() throws InterruptedException, RocksDBException {
public void sstFixedPrefix() throws RocksDBException {
try (SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4);
final Options opt =
new Options().setCreateIfMissing(true).setSstPartitionerFactory(factory);

@ -31,7 +32,8 @@ public class SstPartitionerTest {
db.put("bbbb1".getBytes(), "B".getBytes());
db.flush(new FlushOptions());

db.put("aaaa1".getBytes(), "A2".getBytes());
db.put("aaaa0".getBytes(), "A2".getBytes());
db.put("aaaa2".getBytes(), "A2".getBytes());
db.flush(new FlushOptions());

db.compactRange();

@ -40,4 +42,31 @@ public class SstPartitionerTest {
assertThat(metadata.size()).isEqualTo(2);
}
}

@Test
public void sstFixedPrefixFamily() throws RocksDBException {
final byte[] cfName = "new_cf".getBytes(UTF_8);
final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName,
new ColumnFamilyOptions().setSstPartitionerFactory(
new SstPartitionerFixedPrefixFactory(4)));

try (final Options opt = new Options().setCreateIfMissing(true);
final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor);

// writing (long)100 under key
db.put(columnFamilyHandle, "aaaa1".getBytes(), "A".getBytes());
db.put(columnFamilyHandle, "bbbb1".getBytes(), "B".getBytes());
db.flush(new FlushOptions(), columnFamilyHandle);

db.put(columnFamilyHandle, "aaaa0".getBytes(), "A2".getBytes());
db.put(columnFamilyHandle, "aaaa2".getBytes(), "A2".getBytes());
db.flush(new FlushOptions(), columnFamilyHandle);

db.compactRange(columnFamilyHandle);

List<LiveFileMetaData> metadata = db.getLiveFilesMetaData();
assertThat(metadata.size()).isEqualTo(2);
}
}
}
@ -9,7 +9,6 @@ import org.junit.BeforeClass;
import org.junit.Test;

import java.lang.reflect.Field;
import java.lang.reflect.Modifier;

import static org.assertj.core.api.Assertions.assertThat;

@ -37,18 +36,21 @@ public class EnvironmentTest {
isEqualTo(".jnilib");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-osx.jnilib");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.dylib");
}

@Test
public void mac64() {
setEnvironmentClassFields("mac", "x86-64");
public void mac64_x86_64() {
setEnvironmentClassFields("mac", "x86_64");
assertThat(Environment.isWindows()).isFalse();
assertThat(Environment.getJniLibraryExtension()).
isEqualTo(".jnilib");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-osx.jnilib");
assertThat(Environment.getJniLibraryFileName("rocksdb"))
.isEqualTo("librocksdbjni-osx-x86_64.jnilib");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb"))
.isEqualTo("librocksdbjni-osx.jnilib");
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.dylib");
}

@ -59,7 +61,9 @@ public class EnvironmentTest {
assertThat(Environment.isWindows()).isFalse();
assertThat(Environment.getJniLibraryExtension()).isEqualTo(".jnilib");
assertThat(Environment.getJniLibraryFileName("rocksdb"))
.isEqualTo("librocksdbjni-osx-aarch64.jnilib");
.isEqualTo("librocksdbjni-osx-arm64.jnilib");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb"))
.isEqualTo("librocksdbjni-osx.jnilib");
assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.dylib");
}

@ -73,6 +77,7 @@ public class EnvironmentTest {
isEqualTo(".so");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-linux32.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.so");
// Linux musl-libc (Alpine)

@ -103,7 +108,8 @@ public class EnvironmentTest {
assertThat(Environment.isWindows()).isFalse();
assertThat(Environment.getJniLibraryExtension()).
isEqualTo(".so");
Environment.getJniLibraryFileName("rocksdb");
assertThat(Environment.getJniLibraryFileName("rocksdb")).isEqualTo("blah");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
}

@Test

@ -115,6 +121,7 @@ public class EnvironmentTest {
isEqualTo(".so");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-linux64.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.so");
// Linux musl-libc (Alpine)

@ -124,6 +131,7 @@ public class EnvironmentTest {
isEqualTo(".so");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-linux64-musl.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.so");
// UNIX

@ -134,6 +142,7 @@ public class EnvironmentTest {
isEqualTo(".so");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-linux64.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.so");
// AIX

@ -143,6 +152,7 @@ public class EnvironmentTest {
isEqualTo(".so");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-aix64.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.so");
}

@ -161,6 +171,7 @@ public class EnvironmentTest {
isEqualTo(".dll");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-win64.dll");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.dll");
}

@ -177,6 +188,7 @@ public class EnvironmentTest {
assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le");
assertThat(Environment.getJniLibraryFileName("rocksdb"))
.isEqualTo("librocksdbjni-linux-ppc64le.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
// Linux musl-libc (Alpine)
setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true);

@ -189,6 +201,7 @@ public class EnvironmentTest {
assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le-musl");
assertThat(Environment.getJniLibraryFileName("rocksdb"))
.isEqualTo("librocksdbjni-linux-ppc64le-musl.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
}

@ -205,6 +218,7 @@ public class EnvironmentTest {
assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64");
assertThat(Environment.getJniLibraryFileName("rocksdb"))
.isEqualTo("librocksdbjni-linux-aarch64.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
// Linux musl-libc (Alpine)
setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true);

@ -217,6 +231,7 @@ public class EnvironmentTest {
assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64-musl");
assertThat(Environment.getJniLibraryFileName("rocksdb"))
.isEqualTo("librocksdbjni-linux-aarch64-musl.so");
assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
}
@ -9,6 +9,8 @@

#if defined(OS_WIN)

#include "port/win/env_win.h"

#include <direct.h> // _rmdir, _mkdir, _getcwd
#include <errno.h>
#include <io.h> // _access

@ -17,6 +19,7 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <windows.h>
#include <winioctl.h>

#include <algorithm>
#include <ctime>

@ -27,7 +30,6 @@
#include "monitoring/thread_status_util.h"
#include "port/port.h"
#include "port/port_dirent.h"
#include "port/win/env_win.h"
#include "port/win/io_win.h"
#include "port/win/win_logger.h"
#include "rocksdb/env.h"
@ -1436,9 +1436,10 @@ template <typename TBlocklike>
Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
const bool wait, CachableEntry<TBlocklike>* block_entry,
BlockType block_type, GetContext* get_context,
BlockCacheLookupContext* lookup_context, BlockContents* contents) const {
const bool wait, const bool for_compaction,
CachableEntry<TBlocklike>* block_entry, BlockType block_type,
GetContext* get_context, BlockCacheLookupContext* lookup_context,
BlockContents* contents) const {
assert(block_entry != nullptr);
const bool no_io = (ro.read_tier == kBlockCacheTier);
Cache* block_cache = rep_->table_options.block_cache.get();

@ -1491,7 +1492,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
CompressionType raw_block_comp_type;
BlockContents raw_block_contents;
if (!contents) {
StopWatch sw(rep_->ioptions.clock, statistics, READ_BLOCK_GET_MICROS);
Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS
: READ_BLOCK_GET_MICROS;
StopWatch sw(rep_->ioptions.clock, statistics, histogram);
BlockFetcher block_fetcher(
rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
&raw_block_contents, rep_->ioptions, do_uncompress,

@ -1849,8 +1852,9 @@ void BlockBasedTable::RetrieveMultipleBlocks(
// avoid looking up the block cache
s = MaybeReadBlockAndLoadToCache(
nullptr, options, handle, uncompression_dict, /*wait=*/true,
block_entry, BlockType::kData, mget_iter->get_context,
&lookup_data_block_context, &raw_block_contents);
/*for_compaction=*/false, block_entry, BlockType::kData,
mget_iter->get_context, &lookup_data_block_context,
&raw_block_contents);

// block_entry value could be null if no block cache is present, i.e
// BlockBasedTableOptions::no_block_cache is true and no compressed

@ -1904,7 +1908,7 @@ Status BlockBasedTable::RetrieveBlock(
if (use_cache) {
s = MaybeReadBlockAndLoadToCache(
prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache,
block_entry, block_type, get_context, lookup_context,
for_compaction, block_entry, block_type, get_context, lookup_context,
/*contents=*/nullptr);

if (!s.ok()) {

@ -1933,8 +1937,9 @@ Status BlockBasedTable::RetrieveBlock(
std::unique_ptr<TBlocklike> block;

{
StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats,
READ_BLOCK_GET_MICROS);
Histograms histogram =
for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS;
StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram);
s = ReadBlockFromFile(
rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
rep_->ioptions, do_uncompress, maybe_compressed, block_type,

@ -3161,6 +3166,7 @@ Status BlockBasedTable::CreateIndexReader(
uint64_t BlockBasedTable::ApproximateDataOffsetOf(
const InternalIteratorBase<IndexValue>& index_iter,
uint64_t data_size) const {
assert(index_iter.status().ok());
if (index_iter.Valid()) {
BlockHandle handle = index_iter.value().handle;
return handle.offset();

@ -3203,8 +3209,16 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
}

index_iter->Seek(key);
uint64_t offset;
if (index_iter->status().ok()) {
offset = ApproximateDataOffsetOf(*index_iter, data_size);
} else {
// Split in half to avoid skewing one way or another,
// since we don't know whether we're operating on lower bound or
// upper bound.
return rep_->file_size / 2;
}

uint64_t offset = ApproximateDataOffsetOf(*index_iter, data_size);
// Pro-rate file metadata (incl filters) size-proportionally across data
// blocks.
double size_ratio =

@ -3220,7 +3234,9 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
uint64_t data_size = GetApproximateDataSize();
if (UNLIKELY(data_size == 0)) {
// Hmm. Assume whole file is involved, since we have lower and upper
// bound.
// bound. This likely skews the estimate if we consider that this function
// is typically called with `[start, end]` fully contained in the file's
// key-range.
return rep_->file_size;
}

@ -3238,9 +3254,24 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
}

index_iter->Seek(start);
uint64_t start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
uint64_t start_offset;
if (index_iter->status().ok()) {
start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
} else {
// Assume file is involved from the start. This likely skews the estimate
// but is consistent with the above error handling.
start_offset = 0;
}

index_iter->Seek(end);
uint64_t end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
uint64_t end_offset;
if (index_iter->status().ok()) {
end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
} else {
// Assume file is involved until the end. This likely skews the estimate
// but is consistent with the above error handling.
end_offset = data_size;
}

assert(end_offset >= start_offset);
// Pro-rate file metadata (incl filters) size-proportionally across data
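Both estimators above now tolerate an errored index iterator and, in the normal path, pro-rate non-data bytes (filters, index, other metadata) across the data blocks. A worked sketch of that pro-rating arithmetic, under the assumption that metadata is attributed proportionally to the data offset (the helper name is illustrative):

#include <cassert>
#include <cstdint>

// Scale an offset within the data-block area up to an offset within the whole
// file, so metadata is charged proportionally to the covered data.
uint64_t ProRateToFileOffset(uint64_t data_offset, uint64_t data_size,
                             uint64_t file_size) {
  assert(data_size > 0 && data_offset <= data_size);
  const double size_ratio =
      static_cast<double>(data_offset) / static_cast<double>(data_size);
  return static_cast<uint64_t>(size_ratio * static_cast<double>(file_size));
}

// Example: 40 MiB into 80 MiB of data blocks within a 100 MiB file gives an
// estimate of 50 MiB, i.e. half of the metadata is charged to this range.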
@ -343,9 +343,10 @@ class BlockBasedTable : public TableReader {
Status MaybeReadBlockAndLoadToCache(
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
const bool wait, CachableEntry<TBlocklike>* block_entry,
BlockType block_type, GetContext* get_context,
BlockCacheLookupContext* lookup_context, BlockContents* contents) const;
const bool wait, const bool for_compaction,
CachableEntry<TBlocklike>* block_entry, BlockType block_type,
GetContext* get_context, BlockCacheLookupContext* lookup_context,
BlockContents* contents) const;

// Similar to the above, with one crucial difference: it will retrieve the
// block from the file even if there are no caches configured (assuming the

@ -501,8 +501,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
// filter blocks
s = table()->MaybeReadBlockAndLoadToCache(
prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
/* wait */ true, &block, BlockType::kFilter, nullptr /* get_context */,
&lookup_context, nullptr /* contents */);
/* wait */ true, /* for_compaction */ false, &block, BlockType::kFilter,
nullptr /* get_context */, &lookup_context, nullptr /* contents */);
if (!s.ok()) {
return s;
}

@ -179,8 +179,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
// filter blocks
Status s = table()->MaybeReadBlockAndLoadToCache(
prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
/*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr,
&lookup_context, /*contents=*/nullptr);
/*wait=*/true, /*for_compaction=*/false, &block, BlockType::kIndex,
/*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr);

if (!s.ok()) {
return s;

@ -235,9 +235,9 @@ class MultiGetContext {

bool empty() const { return RemainingMask() == 0; }

void SkipKey(const Iterator& iter) {
skip_mask_ |= uint64_t{1} << iter.index_;
}
void SkipIndex(size_t index) { skip_mask_ |= uint64_t{1} << index; }

void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }

bool IsKeySkipped(const Iterator& iter) const {
return skip_mask_ & (uint64_t{1} << iter.index_);
@ -87,6 +87,7 @@ default_params = {
"partition_filters": lambda: random.randint(0, 1),
"partition_pinning": lambda: random.randint(0, 3),
"pause_background_one_in": 1000000,
"prefix_size" : lambda: random.choice([-1, 1, 5, 7, 8]),
"prefixpercent": 5,
"progress_reports": 0,
"readpercent": 45,

@ -155,6 +156,8 @@ default_params = {
"user_timestamp_size": 0,
"secondary_cache_fault_one_in" : lambda: random.choice([0, 0, 32]),
"prepopulate_block_cache" : lambda: random.choice([0, 1]),
"memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
"memtable_whole_key_filtering": lambda: random.randint(0, 1),
}

_TEST_DIR_ENV_VAR = 'TEST_TMPDIR'

@ -240,9 +243,6 @@ simple_default_params = {
"max_background_compactions": 1,
"max_bytes_for_level_base": 67108864,
"memtablerep": "skip_list",
"prefixpercent": 0,
"readpercent": 50,
"prefix_size" : -1,
"target_file_size_base": 16777216,
"target_file_size_multiplier": 1,
"test_batches_snapshots": 0,

@ -399,8 +399,15 @@ def finalize_and_sanitize(src_params):
dest_params["readpercent"] += dest_params.get("iterpercent", 10)
dest_params["iterpercent"] = 0
dest_params["test_batches_snapshots"] = 0
if dest_params.get("prefix_size") == -1:
dest_params["readpercent"] += dest_params.get("prefixpercent", 20)
dest_params["prefixpercent"] = 0
dest_params["test_batches_snapshots"] = 0
if dest_params.get("test_batches_snapshots") == 0:
dest_params["batch_protection_bytes_per_key"] = 0
if (dest_params.get("prefix_size") == -1 and
dest_params.get("memtable_whole_key_filtering") == 0):
dest_params["memtable_prefix_bloom_size_ratio"] = 0
return dest_params

def gen_cmd_params(args):
@ -724,9 +724,6 @@ inline bool Zlib_Compress(const CompressionInfo& info,
output_header_len = compression::PutDecompressedSizeInfo(
output, static_cast<uint32_t>(length));
}
// Resize output to be the plain data length.
// This may not be big enough if the compression actually expands data.
output->resize(output_header_len + length);

// The memLevel parameter specifies how much memory should be allocated for
// the internal compression state.

@ -760,12 +757,17 @@ inline bool Zlib_Compress(const CompressionInfo& info,
}
}

// Get an upper bound on the compressed size.
size_t upper_bound =
deflateBound(&_stream, static_cast<unsigned long>(length));
output->resize(output_header_len + upper_bound);

// Compress the input, and put compressed data in output.
_stream.next_in = (Bytef*)input;
_stream.avail_in = static_cast<unsigned int>(length);

// Initialize the output size.
_stream.avail_out = static_cast<unsigned int>(length);
_stream.avail_out = static_cast<unsigned int>(upper_bound);
_stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);

bool compressed = false;
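The Zlib_Compress change above sizes the output buffer with deflateBound() after the stream is initialized, so a single deflate(..., Z_FINISH) pass always has room even when "compression" expands the data. A minimal sketch of the same sizing pattern against plain zlib (simplified error handling, no RocksDB header bytes):

#include <zlib.h>
#include <string>

// Compress `input` into `*output`, reserving deflateBound() bytes up front so
// deflate(..., Z_FINISH) cannot run out of space even if the data expands.
bool ZlibCompressOnce(const std::string& input, std::string* output) {
  z_stream stream{};  // zalloc/zfree/opaque zero-initialized (Z_NULL)
  if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK) {
    return false;
  }

  const size_t upper_bound = deflateBound(&stream, input.size());
  output->resize(upper_bound);

  stream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(input.data()));
  stream.avail_in = static_cast<uInt>(input.size());
  stream.next_out = reinterpret_cast<Bytef*>(&(*output)[0]);
  stream.avail_out = static_cast<uInt>(upper_bound);

  const int rc = deflate(&stream, Z_FINISH);
  deflateEnd(&stream);
  if (rc != Z_STREAM_END) {
    return false;
  }
  output->resize(upper_bound - stream.avail_out);  // trim to the actual size
  return true;
}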
@ -65,7 +65,7 @@ class DynamicBloom {
// Multithreaded access to this function is OK
bool MayContain(const Slice& key) const;

void MayContain(int num_keys, Slice** keys, bool* may_match) const;
void MayContain(int num_keys, Slice* keys, bool* may_match) const;

// Multithreaded access to this function is OK
bool MayContainHash(uint32_t hash) const;

@ -120,12 +120,12 @@ inline bool DynamicBloom::MayContain(const Slice& key) const {
return (MayContainHash(BloomHash(key)));
}

inline void DynamicBloom::MayContain(int num_keys, Slice** keys,
inline void DynamicBloom::MayContain(int num_keys, Slice* keys,
bool* may_match) const {
std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
for (int i = 0; i < num_keys; ++i) {
hashes[i] = BloomHash(*keys[i]);
hashes[i] = BloomHash(keys[i]);
size_t a = FastRange32(kLen, hashes[i]);
PREFETCH(data_ + a, 0, 3);
byte_offsets[i] = a;
22 util/math.h

@ -92,18 +92,25 @@ inline int CountTrailingZeroBits(T v) {
#endif
}

#if defined(_MSC_VER) && !defined(_M_X64)
// Not all MSVC compile settings will use `BitsSetToOneFallback()`. We include
// the following code at coarse granularity for simpler macros. It's important
// to exclude at least so our non-MSVC unit test coverage tool doesn't see it.
#ifdef _MSC_VER

namespace detail {

template <typename T>
int BitsSetToOneFallback(T v) {
const int kBits = static_cast<int>(sizeof(T)) * 8;
static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits");
// we static_cast these bit patterns in order to truncate them to the correct
// size
// size. Warning C4309 dislikes this technique, so disable it here.
#pragma warning(disable : 4309)
v = static_cast<T>(v - ((v >> 1) & static_cast<T>(0x5555555555555555ull)));
v = static_cast<T>((v & static_cast<T>(0x3333333333333333ull)) +
((v >> 2) & static_cast<T>(0x3333333333333333ull)));
v = static_cast<T>((v + (v >> 4)) & static_cast<T>(0x0F0F0F0F0F0F0F0Full));
#pragma warning(default : 4309)
for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) {
v += static_cast<T>(v >> shift_bits);
}

@ -113,7 +120,8 @@ int BitsSetToOneFallback(T v) {
}

} // namespace detail
#endif

#endif // _MSC_VER

// Number of bits set to 1. Also known as "population count".
template <typename T>

@ -126,21 +134,21 @@ inline int BitsSetToOne(T v) {
constexpr auto mm = 8 * sizeof(uint32_t) - 1;
// The bit mask is to neutralize sign extension on small signed types
constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1;
#if defined(_M_X64) || defined(_M_IX86)
#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
return static_cast<int>(__popcnt(static_cast<uint32_t>(v) & m));
#else
return static_cast<int>(detail::BitsSetToOneFallback(v) & m);
#endif
} else if (sizeof(T) == sizeof(uint32_t)) {
#if defined(_M_X64) || defined(_M_IX86)
#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
return static_cast<int>(__popcnt(static_cast<uint32_t>(v)));
#else
return detail::BitsSetToOneFallback(static_cast<uint32_t>(v));
#endif
} else {
#ifdef _M_X64
#if defined(HAVE_SSE42) && defined(_M_X64)
return static_cast<int>(__popcnt64(static_cast<uint64_t>(v)));
#elif defined(_M_IX86)
#elif defined(HAVE_SSE42) && defined(_M_IX86)
return static_cast<int>(
__popcnt(static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32) +
__popcnt(static_cast<uint32_t>(v))));
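The math.h hunks above gate the MSVC __popcnt/__popcnt64 intrinsics behind HAVE_SSE42 and keep the bit-twiddling fallback available to every MSVC target. As a hedged illustration of why such a fallback is trustworthy, here is a variant of the same SWAR reduction checked against a naive loop (the function names are illustrative, not RocksDB's):

#include <cassert>
#include <cstdint>

// SWAR population count for 64-bit values (same family of bit tricks as the
// fallback above, with a multiply to sum the per-byte counts).
int PopcountSwar64(uint64_t v) {
  v = v - ((v >> 1) & 0x5555555555555555ull);
  v = (v & 0x3333333333333333ull) + ((v >> 2) & 0x3333333333333333ull);
  v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0Full;
  return static_cast<int>((v * 0x0101010101010101ull) >> 56);
}

// Reference implementation: count bits one at a time.
int PopcountNaive64(uint64_t v) {
  int n = 0;
  for (; v != 0; v >>= 1) {
    n += static_cast<int>(v & 1);
  }
  return n;
}

int main() {
  const uint64_t samples[] = {0ull, 1ull, 0xFFull, 0x8000000000000000ull,
                              0xDEADBEEFCAFEF00Dull};
  for (uint64_t x : samples) {
    assert(PopcountSwar64(x) == PopcountNaive64(x));
  }
  return 0;
}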
@ -148,6 +148,78 @@ TEST_P(TransactionTest, SuccessTest) {
delete txn;
}

TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) {
const TxnDBWritePolicy write_policy = std::get<2>(GetParam());

if (write_policy != TxnDBWritePolicy::WRITE_COMMITTED) {
ROCKSDB_GTEST_BYPASS("Test applies to write-committed only");
return;
}

ASSERT_OK(db->Put(WriteOptions(), "key0", "value"));

TransactionOptions txn_opts;
txn_opts.use_only_the_last_commit_time_batch_for_recovery = true;
Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opts);
assert(txn);

SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->SetCallBack(
"FlushJob::WriteLevel0Table", [&](void* arg) {
// db mutex not held.
auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
assert(mems);
ASSERT_EQ(1, mems->size());
auto* ctwb = txn->GetCommitTimeWriteBatch();
ASSERT_OK(ctwb->Put("gtid", "123"));
ASSERT_OK(txn->Commit());
delete txn;
});
SyncPoint::GetInstance()->EnableProcessing();

ASSERT_OK(txn->Put("key1", "value"));
ASSERT_OK(txn->SetName("txn1"));

ASSERT_OK(txn->Prepare());

auto dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
ASSERT_OK(dbimpl->TEST_SwitchMemtable(nullptr));
ASSERT_OK(dbimpl->TEST_FlushMemTable(
/*wait=*/false, /*allow_write_stall=*/true, /*cfh=*/nullptr));

ASSERT_OK(dbimpl->TEST_WaitForFlushMemTable());

{
std::string value;
ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
ASSERT_EQ("value", value);
}

delete db;
db = nullptr;
Status s;
if (use_stackable_db_ == false) {
s = TransactionDB::Open(options, txn_db_options, dbname, &db);
} else {
s = OpenWithStackableDB();
}
ASSERT_OK(s);
assert(db);

{
std::string value;
ASSERT_OK(db->Get(ReadOptions(), "gtid", &value));
ASSERT_EQ("123", value);

ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
ASSERT_EQ("value", value);
}

SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}

// The test clarifies the contract of do_validate and assume_tracked
// in GetForUpdate and Put/Merge/Delete
TEST_P(TransactionTest, AssumeExclusiveTracked) {