diff --git a/db/builder.cc b/db/builder.cc index bd695f1dd..68b9fe804 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -96,7 +96,8 @@ Status BuildTable( snapshots.empty() ? 0 : snapshots.back()); CompactionIterator c_iter(iter, internal_comparator.user_comparator(), - &merge, kMaxSequenceNumber, &snapshots, env, + &merge, kMaxSequenceNumber, &snapshots, + kMaxSequenceNumber, env, true /* internal key corruption is not ok */); c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index 8d0dc7f62..9cdc4feb6 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -13,12 +13,14 @@ namespace rocksdb { CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, - Env* env, bool expect_valid_internal_key, Compaction* compaction, + SequenceNumber earliest_write_conflict_snapshot, Env* env, + bool expect_valid_internal_key, Compaction* compaction, const CompactionFilter* compaction_filter, LogBuffer* log_buffer) : input_(input), cmp_(cmp), merge_helper_(merge_helper), snapshots_(snapshots), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), env_(env), expect_valid_internal_key_(expect_valid_internal_key), compaction_(compaction), @@ -200,6 +202,11 @@ void CompactionIterator::NextFromInput() { ParsedInternalKey next_ikey; input_->Next(); + if (earliest_write_conflict_snapshot_) { + // TODO(agiardullo): to be used in D50295 + // adding this if statement to keep CLANG happy in the meantime + } + // Check whether the current key is valid, not corrupt and the same // as the single delete. if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h index fdb8a7bf5..3be9112bc 100644 --- a/db/compaction_iterator.h +++ b/db/compaction_iterator.h @@ -39,7 +39,8 @@ class CompactionIterator { public: CompactionIterator(InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, Env* env, + std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, Env* env, bool expect_valid_internal_key, Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, @@ -88,6 +89,7 @@ class CompactionIterator { const Comparator* cmp_; MergeHelper* merge_helper_; const std::vector* snapshots_; + const SequenceNumber earliest_write_conflict_snapshot_; Env* env_; bool expect_valid_internal_key_; Compaction* compaction_; diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index 1148c2ac7..a59f56771 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -20,9 +20,9 @@ class CompactionIteratorTest : public testing::Test { nullptr, 0U, false, 0)); iter_.reset(new test::VectorIterator(ks, vs)); iter_->SeekToFirst(); - c_iter_.reset(new CompactionIterator(iter_.get(), cmp_, merge_helper_.get(), - last_sequence, &snapshots_, - Env::Default(), false)); + c_iter_.reset(new CompactionIterator( + iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, + kMaxSequenceNumber, Env::Default(), false)); } const Comparator* cmp_; diff --git a/db/compaction_job.cc b/db/compaction_job.cc index ea052b84f..4806eff7e 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -212,6 +212,7 @@ CompactionJob::CompactionJob( std::atomic* shutting_down, LogBuffer* log_buffer, Directory* db_directory, Directory* output_directory, Statistics* stats, std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats) @@ -230,6 +231,7 @@ CompactionJob::CompactionJob( output_directory_(output_directory), stats_(stats), existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), table_cache_(std::move(table_cache)), event_logger_(event_logger), paranoid_file_checks_(paranoid_file_checks), @@ -638,8 +640,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { Status status; sub_compact->c_iter.reset(new CompactionIterator( input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), - &existing_snapshots_, env_, false, sub_compact->compaction, - compaction_filter)); + &existing_snapshots_, earliest_write_conflict_snapshot_, env_, false, + sub_compact->compaction, compaction_filter)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); const auto& c_iter_stats = c_iter->iter_stats(); diff --git a/db/compaction_job.h b/db/compaction_job.h index ab71519f4..cfb57ce2d 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -58,6 +58,7 @@ class CompactionJob { Directory* db_directory, Directory* output_directory, Statistics* stats, std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, @@ -134,6 +135,12 @@ class CompactionJob { // entirely within s1 and s2, then the earlier version of k1 can be safely // deleted because that version is not visible in any snapshot. std::vector existing_snapshots_; + + // This is the earliest snapshot that could be used for write-conflict + // checking by a transaction. For any user-key newer than this snapshot, we + // should make sure not to remove evidence that a write occured. + SequenceNumber earliest_write_conflict_snapshot_; + std::shared_ptr table_cache_; EventLogger* event_logger_; diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index d4ae5f2f3..57c5a4aec 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -243,11 +243,11 @@ class CompactionJobTest : public testing::Test { LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); mutex_.Lock(); EventLogger event_logger(db_options_.info_log.get()); - CompactionJob compaction_job(0, &compaction, db_options_, env_options_, - versions_.get(), &shutting_down_, &log_buffer, - nullptr, nullptr, nullptr, snapshots, - table_cache_, &event_logger, false, false, - dbname_, &compaction_job_stats_); + CompactionJob compaction_job( + 0, &compaction, db_options_, env_options_, versions_.get(), + &shutting_down_, &log_buffer, nullptr, nullptr, nullptr, snapshots, + kMaxSequenceNumber, table_cache_, &event_logger, false, false, dbname_, + &compaction_job_stats_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); diff --git a/db/db_impl.cc b/db/db_impl.cc index 215e7d941..a6e2deba2 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1779,12 +1779,16 @@ Status DBImpl::CompactFilesImpl( // deletion compaction currently not allowed in CompactFiles. assert(!c->deletion_compaction()); + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), db_options_, env_options_, versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, snapshots_.GetAll(), - table_cache_, &event_logger_, + directories_.GetDataDir(c->output_path_id()), stats_, snapshot_seqs, + earliest_write_conflict_snapshot, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->compaction_measure_io_stats, dbname_, nullptr); // Here we pass a nullptr for CompactionJobStats because @@ -2868,12 +2872,17 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, int output_level __attribute__((unused)) = c->output_level(); TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", &output_level); + + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), db_options_, env_options_, versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, - snapshots_.GetAll(), table_cache_, &event_logger_, + directories_.GetDataDir(c->output_path_id()), stats_, snapshot_seqs, + earliest_write_conflict_snapshot, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->compaction_measure_io_stats, dbname_, &compaction_job_stats); @@ -3784,7 +3793,13 @@ Status DBImpl::NewIterators( return Status::OK(); } -const Snapshot* DBImpl::GetSnapshot() { +const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } + +const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { + return GetSnapshotImpl(true); +} + +const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) { int64_t unix_time = 0; env_->GetCurrentTime(&unix_time); // Ignore error SnapshotImpl* s = new SnapshotImpl; @@ -3795,7 +3810,8 @@ const Snapshot* DBImpl::GetSnapshot() { delete s; return nullptr; } - return snapshots_.New(s, versions_->LastSequence(), unix_time); + return snapshots_.New(s, versions_->LastSequence(), unix_time, + is_write_conflict_boundary); } void DBImpl::ReleaseSnapshot(const Snapshot* s) { diff --git a/db/db_impl.h b/db/db_impl.h index 1ef96be14..6c13ff5dd 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -243,6 +243,12 @@ class DBImpl : public DB { #endif // ROCKSDB_LITE + // Similar to GetSnapshot(), but also lets the db know that this snapshot + // will be used for transaction write-conflict checking. The DB can then + // make sure not to compact any keys that would prevent a write-conflict from + // being detected. + const Snapshot* GetSnapshotForWriteConflictBoundary(); + // checks if all live files exist on file system and that their file sizes // match to our in-memory records virtual Status CheckConsistency(); @@ -560,6 +566,8 @@ class DBImpl : public DB { // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); + const Snapshot* GetSnapshotImpl(bool is_write_conflict_boundary); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; diff --git a/db/snapshot_impl.cc b/db/snapshot_impl.cc index 1546d68f6..d901b61d2 100644 --- a/db/snapshot_impl.cc +++ b/db/snapshot_impl.cc @@ -12,6 +12,9 @@ namespace rocksdb { ManagedSnapshot::ManagedSnapshot(DB* db) : db_(db), snapshot_(db->GetSnapshot()) {} +ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot) + : db_(db), snapshot_(_snapshot) {} + ManagedSnapshot::~ManagedSnapshot() { if (snapshot_) { db_->ReleaseSnapshot(snapshot_); diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index b4d58fdf0..4fa0bb9d5 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -34,6 +34,9 @@ class SnapshotImpl : public Snapshot { SnapshotList* list_; // just for sanity checks int64_t unix_time_; + + // Will this snapshot be used by a Transaction to do write-conflict checking? + bool is_write_conflict_boundary_; }; class SnapshotList { @@ -50,9 +53,10 @@ class SnapshotList { SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } const SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, - uint64_t unix_time) { + uint64_t unix_time, bool is_write_conflict_boundary) { s->number_ = seq; s->unix_time_ = unix_time; + s->is_write_conflict_boundary_ = is_write_conflict_boundary; s->list_ = this; s->next_ = &list_; s->prev_ = list_.prev_; @@ -71,14 +75,29 @@ class SnapshotList { } // retrieve all snapshot numbers. They are sorted in ascending order. - std::vector GetAll() { + std::vector GetAll( + SequenceNumber* oldest_write_conflict_snapshot = nullptr) { std::vector ret; + + if (oldest_write_conflict_snapshot != nullptr) { + *oldest_write_conflict_snapshot = kMaxSequenceNumber; + } + if (empty()) { return ret; } SnapshotImpl* s = &list_; while (s->next_ != &list_) { ret.push_back(s->next_->number_); + + if (oldest_write_conflict_snapshot != nullptr && + *oldest_write_conflict_snapshot != kMaxSequenceNumber && + s->next_->is_write_conflict_boundary_) { + // If this is the first write-conflict boundary snapshot in the list, + // it is the oldest + *oldest_write_conflict_snapshot = s->next_->number_; + } + s = s->next_; } return ret; diff --git a/include/rocksdb/snapshot.h b/include/rocksdb/snapshot.h index aad675b4b..95822d297 100644 --- a/include/rocksdb/snapshot.h +++ b/include/rocksdb/snapshot.h @@ -33,6 +33,9 @@ class ManagedSnapshot { public: explicit ManagedSnapshot(DB* db); + // Instead of creating a snapshot, take ownership of the input snapshot. + ManagedSnapshot(DB* db, const Snapshot* _snapshot); + ~ManagedSnapshot(); const Snapshot* snapshot(); diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index d31ac6a26..5f3e97e9b 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -7,6 +7,7 @@ #include "utilities/transactions/transaction_base.h" +#include "db/db_impl.h" #include "db/column_family.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -35,7 +36,11 @@ void TransactionBaseImpl::Clear() { } void TransactionBaseImpl::SetSnapshot() { - snapshot_.reset(new ManagedSnapshot(db_)); + assert(dynamic_cast(db_) != nullptr); + auto db_impl = reinterpret_cast(db_); + + const Snapshot* snapshot = db_impl->GetSnapshotForWriteConflictBoundary(); + snapshot_.reset(new ManagedSnapshot(db_, snapshot)); snapshot_needed_ = false; snapshot_notifier_ = nullptr; }