WritePrepared Txn: Disable GC during recovery
Summary: Disables garbage collection (GC) during recovery of a WritePrepared transaction DB, so that key values that are not yet committed are not garbage-collected. Closes https://github.com/facebook/rocksdb/pull/2980 Differential Revision: D6000191 Pulled By: maysamyabandeh fbshipit-source-id: fc4d522c643d24ebf043f811fe4ecd0dd0294675
This commit is contained in:
parent
7891af8b53
commit
7e38238981
@ -182,8 +182,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
|
||||
unable_to_flush_oldest_log_(false),
|
||||
env_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
|
||||
env_options_for_compaction_(env_->OptimizeForCompactionTableWrite(
|
||||
env_options_,
|
||||
immutable_db_options_)),
|
||||
env_options_, immutable_db_options_)),
|
||||
num_running_ingest_file_(0),
|
||||
#ifndef ROCKSDB_LITE
|
||||
wal_manager_(immutable_db_options_, env_options_),
|
||||
@ -195,7 +194,9 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
|
||||
opened_successfully_(false),
|
||||
concurrent_prepare_(options.concurrent_prepare),
|
||||
manual_wal_flush_(options.manual_wal_flush),
|
||||
seq_per_batch_(options.seq_per_batch) {
|
||||
seq_per_batch_(options.seq_per_batch),
|
||||
// TODO(myabandeh): revise this when we change options.seq_per_batch
|
||||
use_custom_gc_(options.seq_per_batch) {
|
||||
env_->GetAbsolutePath(dbname, &db_absolute_path_);
|
||||
|
||||
// Reserve ten files or so for other uses and give the rest to TableCache.
|
||||
|
@ -1300,6 +1300,7 @@ class DBImpl : public DB {
|
||||
const bool concurrent_prepare_;
|
||||
const bool manual_wal_flush_;
|
||||
const bool seq_per_batch_;
|
||||
const bool use_custom_gc_;
|
||||
};
|
||||
|
||||
extern Options SanitizeOptions(const std::string& db,
|
||||
|
@ -86,10 +86,14 @@ Status DBImpl::FlushMemTableToOutputFile(
|
||||
std::vector<SequenceNumber> snapshot_seqs =
|
||||
snapshots_.GetAll(&earliest_write_conflict_snapshot);
|
||||
|
||||
auto snapshot_checker = snapshot_checker_.get();
|
||||
if (use_custom_gc_ && snapshot_checker == nullptr) {
|
||||
snapshot_checker = DisableGCSnapshotChecker::Instance();
|
||||
}
|
||||
FlushJob flush_job(
|
||||
dbname_, cfd, immutable_db_options_, mutable_cf_options,
|
||||
env_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
|
||||
snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker_.get(),
|
||||
snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
|
||||
job_context, log_buffer, directories_.GetDbDir(),
|
||||
directories_.GetDataDir(0U),
|
||||
GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
|
||||
@ -531,13 +535,17 @@ Status DBImpl::CompactFilesImpl(
|
||||
auto pending_outputs_inserted_elem =
|
||||
CaptureCurrentFileNumberInPendingOutputs();
|
||||
|
||||
auto snapshot_checker = snapshot_checker_.get();
|
||||
if (use_custom_gc_ && snapshot_checker == nullptr) {
|
||||
snapshot_checker = DisableGCSnapshotChecker::Instance();
|
||||
}
|
||||
assert(is_snapshot_supported_ || snapshots_.empty());
|
||||
CompactionJob compaction_job(
|
||||
job_context->job_id, c.get(), immutable_db_options_,
|
||||
env_options_for_compaction_, versions_.get(), &shutting_down_, log_buffer,
|
||||
directories_.GetDbDir(), directories_.GetDataDir(c->output_path_id()),
|
||||
stats_, &mutex_, &bg_error_, snapshot_seqs,
|
||||
earliest_write_conflict_snapshot, snapshot_checker_.get(), table_cache_,
|
||||
earliest_write_conflict_snapshot, snapshot_checker, table_cache_,
|
||||
&event_logger_, c->mutable_cf_options()->paranoid_file_checks,
|
||||
c->mutable_cf_options()->report_bg_io_stats, dbname_,
|
||||
nullptr); // Here we pass a nullptr for CompactionJobStats because
|
||||
@ -1678,6 +1686,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
|
||||
std::vector<SequenceNumber> snapshot_seqs =
|
||||
snapshots_.GetAll(&earliest_write_conflict_snapshot);
|
||||
|
||||
auto snapshot_checker = snapshot_checker_.get();
|
||||
if (use_custom_gc_ && snapshot_checker == nullptr) {
|
||||
snapshot_checker = DisableGCSnapshotChecker::Instance();
|
||||
}
|
||||
assert(is_snapshot_supported_ || snapshots_.empty());
|
||||
CompactionJob compaction_job(
|
||||
job_context->job_id, c.get(), immutable_db_options_,
|
||||
@ -1685,7 +1697,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
|
||||
log_buffer, directories_.GetDbDir(),
|
||||
directories_.GetDataDir(c->output_path_id()), stats_, &mutex_,
|
||||
&bg_error_, snapshot_seqs, earliest_write_conflict_snapshot,
|
||||
snapshot_checker_.get(), table_cache_, &event_logger_,
|
||||
snapshot_checker, table_cache_, &event_logger_,
|
||||
c->mutable_cf_options()->paranoid_file_checks,
|
||||
c->mutable_cf_options()->report_bg_io_stats, dbname_,
|
||||
&compaction_job_stats);
|
||||
|
@ -883,10 +883,10 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
||||
SequenceNumber earliest_write_conflict_snapshot;
|
||||
std::vector<SequenceNumber> snapshot_seqs =
|
||||
snapshots_.GetAll(&earliest_write_conflict_snapshot);
|
||||
// Only TransactionDB passes snapshot_checker and it creates it after db
|
||||
// open. Just pass nullptr here.
|
||||
SnapshotChecker* snapshot_checker = nullptr;
|
||||
|
||||
auto snapshot_checker = snapshot_checker_.get();
|
||||
if (use_custom_gc_ && snapshot_checker == nullptr) {
|
||||
snapshot_checker = DisableGCSnapshotChecker::Instance();
|
||||
}
|
||||
s = BuildTable(
|
||||
dbname_, env_, *cfd->ioptions(), mutable_cf_options,
|
||||
env_options_for_compaction_, cfd->table_cache(), iter.get(),
|
||||
|
@ -400,11 +400,7 @@ class Repairer {
|
||||
int64_t _current_time = 0;
|
||||
status = env_->GetCurrentTime(&_current_time); // ignore error
|
||||
const uint64_t current_time = static_cast<uint64_t>(_current_time);
|
||||
// Only TransactionDB make use of snapshot_checker and repair doesn't
|
||||
// currently support TransactionDB with uncommitted prepared keys in WAL.
|
||||
// TODO(yiwu) Support repairing TransactionDB.
|
||||
SnapshotChecker* snapshot_checker = nullptr;
|
||||
|
||||
SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
|
||||
status = BuildTable(
|
||||
dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
|
||||
env_options_, table_cache_, iter.get(),
|
||||
|
@ -8,16 +8,40 @@
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// Callback interface that controls GC of duplicate keys in flush/compaction.
// It has a single pure virtual query, IsInSnapshot(); the concrete classes
// declared after it in this header (DisableGCSnapshotChecker and
// WritePreparedSnapshotChecker) supply the actual policy.
|
||||
class SnapshotChecker {
|
||||
public:
|
||||
// Virtual so that a derived checker can be destroyed through a
// SnapshotChecker pointer.
virtual ~SnapshotChecker() {}
|
||||
// Query over a pair of sequence numbers. Per the comments on the
// implementations in this header: WritePreparedSnapshotChecker uses it to
// check whether a key is visible by a snapshot, and an implementation that
// always returns false prevents all values from being GCed.
// NOTE(review): presumably `sequence` is the key's sequence number and
// `snapshot_sequence` the snapshot's -- confirm against the callers.
virtual bool IsInSnapshot(SequenceNumber sequence,
|
||||
SequenceNumber snapshot_sequence) const = 0;
|
||||
};
|
||||
|
||||
// SnapshotChecker whose IsInSnapshot() unconditionally returns false, which
// (per the comment inside it) prevents all values from being GCed.
// Callers obtain the shared object through Instance(); the constructor is
// protected, so code outside this class hierarchy cannot create one directly.
class DisableGCSnapshotChecker : public SnapshotChecker {
|
||||
public:
|
||||
virtual ~DisableGCSnapshotChecker() {}
|
||||
// Ignores both arguments and always answers false.
// NOTE(review): this has the same signature as the pure virtual
// SnapshotChecker::IsInSnapshot but is not marked `override` -- consider
// adding it so a signature drift is caught at compile time.
virtual bool IsInSnapshot(SequenceNumber sequence,
|
||||
SequenceNumber snapshot_sequence) const {
|
||||
// By returning false, we prevent all the values from being GCed
|
||||
return false;
|
||||
}
|
||||
// Returns the address of the class-wide static object `instance_`; the
// caller does not own the returned pointer.
static DisableGCSnapshotChecker* Instance() { return &instance_; }
|
||||
|
||||
protected:
|
||||
// The one shared object handed out by Instance(). This is only a
// declaration; the definition must live in a source file.
static DisableGCSnapshotChecker instance_;
|
||||
// Protected so that construction is limited to this class and subclasses.
explicit DisableGCSnapshotChecker() {}
|
||||
};
|
||||
|
||||
class WritePreparedTxnDB;
|
||||
|
||||
// Callback class created by WritePreparedTxnDB to check if a key
|
||||
// is visible by a snapshot.
|
||||
class SnapshotChecker {
|
||||
class WritePreparedSnapshotChecker : public SnapshotChecker {
|
||||
public:
|
||||
explicit SnapshotChecker(WritePreparedTxnDB* txn_db);
|
||||
explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db);
|
||||
virtual ~WritePreparedSnapshotChecker() {}
|
||||
|
||||
bool IsInSnapshot(SequenceNumber sequence,
|
||||
SequenceNumber snapshot_sequence) const;
|
||||
virtual bool IsInSnapshot(SequenceNumber sequence,
|
||||
SequenceNumber snapshot_sequence) const override;
|
||||
|
||||
private:
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
@ -148,7 +148,7 @@ Status WritePreparedTxnDB::Initialize(
|
||||
SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
|
||||
AdvanceMaxEvictedSeq(prev_max, last_seq);
|
||||
|
||||
db_impl_->SetSnapshotChecker(new SnapshotChecker(this));
|
||||
db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));
|
||||
|
||||
auto s = PessimisticTransactionDB::Initialize(compaction_enabled_cf_indices,
|
||||
handles);
|
||||
|
@ -14,10 +14,11 @@
|
||||
namespace rocksdb {
|
||||
|
||||
#ifdef ROCKSDB_LITE
|
||||
SnapshotChecker::SnapshotChecker(WritePreparedTxnDB* txn_db) {}
|
||||
WritePreparedSnapshotChecker::WritePreparedSnapshotChecker(
|
||||
WritePreparedTxnDB* txn_db) {}
|
||||
|
||||
bool SnapshotChecker::IsInSnapshot(SequenceNumber sequence,
|
||||
SequenceNumber snapshot_sequence) const {
|
||||
bool WritePreparedSnapshotChecker::IsInSnapshot(
|
||||
SequenceNumber sequence, SequenceNumber snapshot_sequence) const {
|
||||
// Should never be called in LITE mode.
|
||||
assert(false);
|
||||
return true;
|
||||
@ -25,13 +26,16 @@ bool SnapshotChecker::IsInSnapshot(SequenceNumber sequence,
|
||||
|
||||
#else
|
||||
|
||||
SnapshotChecker::SnapshotChecker(WritePreparedTxnDB* txn_db)
|
||||
WritePreparedSnapshotChecker::WritePreparedSnapshotChecker(
|
||||
WritePreparedTxnDB* txn_db)
|
||||
: txn_db_(txn_db){};
|
||||
|
||||
bool SnapshotChecker::IsInSnapshot(SequenceNumber sequence,
|
||||
SequenceNumber snapshot_sequence) const {
|
||||
bool WritePreparedSnapshotChecker::IsInSnapshot(
|
||||
SequenceNumber sequence, SequenceNumber snapshot_sequence) const {
|
||||
return txn_db_->IsInSnapshot(sequence, snapshot_sequence);
|
||||
}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
DisableGCSnapshotChecker DisableGCSnapshotChecker::instance_;
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -1349,6 +1349,28 @@ TEST_P(WritePreparedTransactionTest, DuplicateKeyTest) {
|
||||
}
|
||||
}
|
||||
|
||||
// Writes 1024 successive values to the single key "foo", then reopens the DB
// with a much smaller write buffer and checks that VerifyInternalKeys() still
// accepts the same list of key versions as it did before the reopen, i.e.
// that no version was dropped along the way.
TEST_P(WritePreparedTransactionTest, DisableGCDuringRecoveryTest) {
|
||||
// Use large buffer to avoid memtable flush after 1024 insertions
|
||||
options.write_buffer_size = 1024 * 1024;
|
||||
ReOpen();
|
||||
// Expected key versions, one entry per Put below.
std::vector<KeyVersion> versions;
|
||||
for (uint64_t i = 1; i <= 1024; i++) {
|
||||
std::string v = "bar" + ToString(i);
|
||||
ASSERT_OK(db->Put(WriteOptions(), "foo", v));
|
||||
// The value just written must be the one visible for "foo".
VerifyKeys({{"foo", v}});
|
||||
// NOTE(review): the third field is set to the loop counter `i`;
// presumably it is the expected sequence number of this Put -- confirm
// against the KeyVersion definition.
KeyVersion kv = {"foo", v, i, kTypeValue};
|
||||
versions.emplace_back(kv);
|
||||
}
|
||||
// Entries were appended in ascending `i`; flip them so the largest `i`
// comes first. Presumably VerifyInternalKeys expects newest-first order --
// TODO confirm.
std::reverse(std::begin(versions), std::end(versions));
|
||||
// Baseline check before the reopen.
VerifyInternalKeys(versions);
|
||||
// NOTE(review): reinterpret_cast from the GetRootDB() result to DBImpl*;
// a static_cast would be the usual choice if DBImpl derives from the
// returned type -- confirm.
DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
// NOTE(review): the result of FlushWAL is discarded here; if it returns a
// status, consider wrapping the call in ASSERT_OK -- confirm.
db_impl->FlushWAL(true);
|
||||
// Use small buffer to ensure memtable flush during recovery
|
||||
options.write_buffer_size = 1024;
|
||||
ReOpenNoDelete();
|
||||
// Same expectation as before the reopen: every version must still be there.
VerifyInternalKeys(versions);
|
||||
}
|
||||
|
||||
TEST_P(WritePreparedTransactionTest, SequenceNumberZeroTest) {
|
||||
ASSERT_OK(db->Put(WriteOptions(), "foo", "bar"));
|
||||
VerifyKeys({{"foo", "bar"}});
|
||||
|
Loading…
Reference in New Issue
Block a user