Fix a CompactionIterator/CompactionFilter bug involving Delete (#9929)

Summary:
When a compaction filter determines that a key should be removed, the compaction iterator updates the internal key's type
to `Delete`. If this internal key is preserved by the current compaction but later seen by another compaction
together with a `SingleDelete` for the same user key, the compaction iterator returns `Corruption`, because
`SingleDelete` must never be mixed with `Delete` on the same key.

To fix the issue, the compaction filter needs to return more information than the bare intention of removing
a key. Therefore, we add a new value, `kRemoveWithSingleDelete`, to `CompactionFilter::Decision`. On seeing
`kRemoveWithSingleDelete`, the compaction iterator updates the op type of the internal key to `kTypeSingleDeletion`.
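
As an illustration only, here is a minimal sketch of a user filter taking advantage of the new decision; the class name and the `sd_` key prefix are hypothetical, and the shape mirrors the `DeleteFilterV2` test filter added further down in this PR:

```cpp
#include <string>

#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Hypothetical filter: keys under the "sd_" prefix are written at most once
// and removed with SingleDelete, so the filter must also drop them with
// kRemoveWithSingleDelete; all other keys can safely be dropped with kRemove.
class SingleDeleteAwareFilter : public rocksdb::CompactionFilter {
 public:
  Decision FilterV2(int /*level*/, const rocksdb::Slice& key,
                    ValueType /*value_type*/,
                    const rocksdb::Slice& /*existing_value*/,
                    std::string* /*new_value*/,
                    std::string* /*skip_until*/) const override {
    if (key.starts_with("sd_")) {
      return Decision::kRemoveWithSingleDelete;
    }
    return Decision::kRemove;
  }
  const char* Name() const override { return "SingleDeleteAwareFilter"; }
};
```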

In addition, I updated db_stress_shared_state.[cc|h] so that `no_overwrite_ids_` becomes `const`: an immutable
member is easier to reason about when accessed from multiple threads. The `SharedState` carrying this information
is passed to `PrepareTxnDbOptions()` when called from `Open()`, so that we can set up the rollback deletion type
callback for transactions.
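
As a hedged sketch of that wiring (the `sd_` prefix check is a stand-in predicate; the stress test's real predicate is built on `SharedState::AllowsOverwrite()`, as the `NonBatchedOpsStressTest` diff below shows):

```cpp
#include "rocksdb/utilities/transaction_db.h"

// Sketch: tell a write-prepared TransactionDB which deletion type to use
// when rolling back a key written during the prepare phase. Returning true
// means "cancel the write with SingleDelete"; false means "use Delete".
rocksdb::TransactionDBOptions MakeTxnDbOptions() {
  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.rollback_deletion_type_callback =
      [](rocksdb::TransactionDB* /*db*/, rocksdb::ColumnFamilyHandle* /*cf*/,
         const rocksdb::Slice& key) {
        // Hypothetical convention matching the sketch above: "sd_" keys are
        // written at most once, so they must be rolled back via SingleDelete.
        return key.starts_with("sd_");
      };
  return txn_db_options;
}
```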

Finally, disable the compaction filter for multiops_txn, because the key removal logic of `DbStressCompactionFilter` does
not quite work with `MultiOpsTxnsStressTest`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9929

Test Plan:
make check
make crash_test
make crash_test_with_txn

Reviewed By: anand1976

Differential Revision: D36069678

Pulled By: riversand963

fbshipit-source-id: cedd2f1ba958af59ad3916f1ba6f424307955f92
Yanqin Jin 2022-05-02 13:25:45 -07:00 committed by Facebook GitHub Bot
parent 37f490834d
commit 06394ff4e7
14 changed files with 148 additions and 47 deletions

View File

@@ -4,6 +4,7 @@
 * Fixed a bug where manual flush would block forever even though flush options had wait=false.
 * Fixed a bug where RocksDB could corrupt DBs with `avoid_flush_during_recovery == true` by removing valid WALs, leading to `Status::Corruption` with message like "SST file is ahead of WALs" when attempting to reopen.
 * Fixed a bug in async_io path where incorrect length of data is read by FilePrefetchBuffer if data is consumed from two populated buffers and request for more data is sent.
+* Fixed a CompactionFilter bug. Compaction filter used to use `Delete` to remove keys, even if the keys should be removed with `SingleDelete`. Mixing `Delete` and `SingleDelete` may cause undefined behavior.
 ### New Features
 * DB::GetLiveFilesStorageInfo is ready for production use.
@@ -12,6 +13,7 @@
 ### Public API changes
 * Add rollback_deletion_type_callback to TransactionDBOptions so that write-prepared transactions know whether to issue a Delete or SingleDelete to cancel a previous key written during prior prepare phase. The PR aims to prevent mixing SingleDeletes and Deletes for the same key that can lead to undefined behaviors for write-prepared transactions.
 * EXPERIMENTAL: Add new API AbortIO in file_system to abort the read requests submitted asynchronously.
+* CompactionFilter::Decision has a new value: kRemoveWithSingleDelete. If CompactionFilter returns this decision, then CompactionIterator will use `SingleDelete` to mark a key as removed.
 ### Bug Fixes
 * RocksDB calls FileSystem::Poll API during FilePrefetchBuffer destruction which impacts performance as it waits for read requets completion which is not needed anymore. Calling FileSystem::AbortIO to abort those requests instead fixes that performance issue.

View File

@@ -307,6 +307,14 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
       // no value associated with delete
       value_.clear();
       iter_stats_.num_record_drop_user++;
+    } else if (filter == CompactionFilter::Decision::kRemoveWithSingleDelete) {
+      // convert the current key to a single delete; key_ is pointing into
+      // current_key_ at this point, so updating current_key_ updates key()
+      ikey_.type = kTypeSingleDeletion;
+      current_key_.UpdateInternalKey(ikey_.sequence, kTypeSingleDeletion);
+      // no value associated with single delete
+      value_.clear();
+      iter_stats_.num_record_drop_user++;
     } else if (filter == CompactionFilter::Decision::kChangeValue) {
       if (ikey_.type == kTypeBlobIndex) {
         // value transfer from blob file to inlined data

View File

@@ -968,6 +968,71 @@ TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) {
   ASSERT_TRUE(TryReopen(options).IsNotSupported());
 }
 
+TEST_F(DBTestCompactionFilter, DropKeyWithSingleDelete) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v0"));
+  ASSERT_OK(Put("b", "v0"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  ASSERT_OK(SingleDelete("b"));
+  ASSERT_OK(Flush());
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = options.num_levels - 1;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+  Close();
+
+  class DeleteFilterV2 : public CompactionFilter {
+   public:
+    Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+                      const Slice& /*existing_value*/,
+                      std::string* /*new_value*/,
+                      std::string* /*skip_until*/) const override {
+      if (key.starts_with("b")) {
+        return Decision::kRemoveWithSingleDelete;
+      }
+      return Decision::kRemove;
+    }
+
+    const char* Name() const override { return "DeleteFilterV2"; }
+  } delete_filter_v2;
+
+  options.compaction_filter = &delete_filter_v2;
+  options.level0_file_num_compaction_trigger = 2;
+  Reopen(options);
+
+  ASSERT_OK(Put("b", "v1"));
+  ASSERT_OK(Put("x", "v1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("r", "v1"));
+  ASSERT_OK(Put("z", "v1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  Close();
+
+  options.compaction_filter = nullptr;
+  Reopen(options);
+  ASSERT_OK(SingleDelete("b"));
+  ASSERT_OK(Flush());
+
+  {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+}
+
 } // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

View File

@@ -46,11 +46,13 @@ class DbStressCompactionFilter : public CompactionFilter {
     // Reaching here means we acquired the lock.
 
     bool key_exists = state_->Exists(cf_id_, key_num);
+    const bool allow_overwrite = state_->AllowsOverwrite(key_num);
 
     key_mutex->Unlock();
 
     if (!key_exists) {
-      return Decision::kRemove;
+      return allow_overwrite ? Decision::kRemove
+                             : Decision::kRemoveWithSingleDelete;
     }
     return Decision::kKeep;
   }

View File

@@ -58,8 +58,8 @@ void ThreadBody(void* v) {
 
 bool RunStressTest(StressTest* stress) {
   SystemClock* clock = db_stress_env->GetSystemClock().get();
-  stress->InitDb();
   SharedState shared(db_stress_env, stress);
+  stress->InitDb(&shared);
   stress->FinishInitDb(&shared);
 
 #ifndef NDEBUG

View File

@@ -12,8 +12,6 @@
 #include "db_stress_tool/db_stress_shared_state.h"
 
 namespace ROCKSDB_NAMESPACE {
-const uint32_t SharedState::UNKNOWN_SENTINEL = 0xfffffffe;
-const uint32_t SharedState::DELETION_SENTINEL = 0xffffffff;
 #if defined(ROCKSDB_SUPPORT_THREAD_LOCAL)
 #if defined(OS_SOLARIS)
 __thread bool SharedState::ignore_read_error;

View File

@@ -45,9 +45,9 @@ class SharedState {
  public:
   // indicates a key may have any value (or not be present) as an operation on
   // it is incomplete.
-  static const uint32_t UNKNOWN_SENTINEL;
+  static constexpr uint32_t UNKNOWN_SENTINEL = 0xfffffffe;
   // indicates a key should definitely be deleted
-  static const uint32_t DELETION_SENTINEL;
+  static constexpr uint32_t DELETION_SENTINEL = 0xffffffff;
 
   // Errors when reading filter blocks are ignored, so we use a thread
   // local variable updated via sync points to keep track of errors injected
@@ -81,36 +81,9 @@ class SharedState {
         stress_test_(stress_test),
         verification_failure_(false),
         should_stop_test_(false),
-        no_overwrite_ids_(FLAGS_column_families),
+        no_overwrite_ids_(GenerateNoOverwriteIds()),
         expected_state_manager_(nullptr),
         printing_verification_results_(false) {
-    // Pick random keys in each column family that will not experience
-    // overwrite
-    fprintf(stdout, "Choosing random keys with no overwrite\n");
-    Random64 rnd(seed_);
-    // Start with the identity permutation. Subsequent iterations of
-    // for loop below will start with perm of previous for loop
-    int64_t* permutation = new int64_t[max_key_];
-    for (int64_t i = 0; i < max_key_; i++) {
-      permutation[i] = i;
-    }
-    // Now do the Knuth shuffle
-    int64_t num_no_overwrite_keys = (max_key_ * FLAGS_nooverwritepercent) / 100;
-    // Only need to figure out first num_no_overwrite_keys of permutation
-    no_overwrite_ids_.reserve(num_no_overwrite_keys);
-    for (int64_t i = 0; i < num_no_overwrite_keys; i++) {
-      int64_t rand_index = i + rnd.Next() % (max_key_ - i);
-      // Swap i and rand_index;
-      int64_t temp = permutation[i];
-      permutation[i] = permutation[rand_index];
-      permutation[rand_index] = temp;
-      // Fill no_overwrite_ids_ with the first num_no_overwrite_keys of
-      // permutation
-      no_overwrite_ids_.insert(permutation[i]);
-    }
-    delete[] permutation;
     Status status;
     // TODO: We should introduce a way to explicitly disable verification
     // during shutdown. When that is disabled and FLAGS_expected_values_dir
@@ -293,7 +266,7 @@ class SharedState {
                  pending);
   }
 
-  bool AllowsOverwrite(int64_t key) {
+  bool AllowsOverwrite(int64_t key) const {
     return no_overwrite_ids_.find(key) == no_overwrite_ids_.end();
   }
@@ -335,6 +308,36 @@ class SharedState {
     ignore_read_error = true;
   }
 
+  // Pick random keys in each column family that will not experience overwrite.
+  std::unordered_set<int64_t> GenerateNoOverwriteIds() const {
+    fprintf(stdout, "Choosing random keys with no overwrite\n");
+    // Start with the identity permutation. Subsequent iterations of
+    // for loop below will start with perm of previous for loop
+    std::vector<int64_t> permutation(max_key_);
+    for (int64_t i = 0; i < max_key_; ++i) {
+      permutation[i] = i;
+    }
+    // Now do the Knuth shuffle
+    const int64_t num_no_overwrite_keys =
+        (max_key_ * FLAGS_nooverwritepercent) / 100;
+    // Only need to figure out first num_no_overwrite_keys of permutation
+    std::unordered_set<int64_t> ret;
+    ret.reserve(num_no_overwrite_keys);
+    Random64 rnd(seed_);
+    for (int64_t i = 0; i < num_no_overwrite_keys; i++) {
+      assert(i < max_key_);
+      int64_t rand_index = i + rnd.Next() % (max_key_ - i);
+      // Swap i and rand_index;
+      int64_t temp = permutation[i];
+      permutation[i] = permutation[rand_index];
+      permutation[rand_index] = temp;
+      // Fill no_overwrite_ids_ with the first num_no_overwrite_keys of
+      // permutation
+      ret.insert(permutation[i]);
+    }
+    return ret;
+  }
+
   port::Mutex mu_;
   port::CondVar cv_;
   const uint32_t seed_;
@@ -355,7 +358,7 @@ class SharedState {
   std::atomic<bool> should_stop_test_;
 
   // Keys that should not be overwritten
-  std::unordered_set<size_t> no_overwrite_ids_;
+  const std::unordered_set<int64_t> no_overwrite_ids_;
 
   std::unique_ptr<ExpectedStateManager> expected_state_manager_;
   // Cannot store `port::Mutex` directly in vector since it is not copyable

View File

@@ -281,12 +281,12 @@ bool StressTest::BuildOptionsTable() {
   return true;
 }
 
-void StressTest::InitDb() {
+void StressTest::InitDb(SharedState* shared) {
   uint64_t now = clock_->NowMicros();
   fprintf(stdout, "%s Initializing db_stress\n",
           clock_->TimeToString(now / 1000000).c_str());
   PrintEnv();
-  Open();
+  Open(shared);
   BuildOptionsTable();
 }
@@ -568,7 +568,7 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
     fprintf(stdout, "%s Reopening database in read-only\n",
             clock_->TimeToString(now / 1000000).c_str());
     // Reopen as read-only, can ignore all options related to updates
-    Open();
+    Open(shared);
   } else {
     fprintf(stderr, "Failed to preload db");
     exit(1);
@@ -2302,10 +2302,12 @@ void StressTest::PrintEnv() const {
   fprintf(stdout, "------------------------------------------------\n");
 }
 
-void StressTest::Open() {
+void StressTest::Open(SharedState* shared) {
   assert(db_ == nullptr);
 #ifndef ROCKSDB_LITE
   assert(txn_db_ == nullptr);
+#else
+  (void)shared;
 #endif
   if (FLAGS_options_file.empty()) {
     BlockBasedTableOptions block_based_options;
@@ -2752,7 +2754,7 @@ void StressTest::Open() {
           static_cast<size_t>(FLAGS_wp_snapshot_cache_bits);
       txn_db_options.wp_commit_cache_bits =
           static_cast<size_t>(FLAGS_wp_commit_cache_bits);
-      PrepareTxnDbOptions(txn_db_options);
+      PrepareTxnDbOptions(shared, txn_db_options);
       s = TransactionDB::Open(options_, txn_db_options, FLAGS_db,
                               cf_descriptors, &column_families_, &txn_db_);
       if (!s.ok()) {
@@ -2912,7 +2914,7 @@ void StressTest::Reopen(ThreadState* thread) {
   auto now = clock_->NowMicros();
   fprintf(stdout, "%s Reopening database for the %dth time\n",
           clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_);
-  Open();
+  Open(thread->shared);
 
   if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) {
     Status s = thread->shared->SaveAtAndAfter(db_);
View File

@@ -30,7 +30,7 @@ class StressTest {
   bool BuildOptionsTable();
 
-  void InitDb();
+  void InitDb(SharedState*);
   // The initialization work is split into two parts to avoid a circular
   // dependency with `SharedState`.
   virtual void FinishInitDb(SharedState*);
@@ -219,7 +219,7 @@ class StressTest {
   void PrintEnv() const;
-  void Open();
+  void Open(SharedState* shared);
   void Reopen(ThreadState* thread);
@@ -228,7 +228,8 @@ class StressTest {
   virtual void RegisterAdditionalListeners() {}
 
 #ifndef ROCKSDB_LITE
-  virtual void PrepareTxnDbOptions(TransactionDBOptions& /*txn_db_opts*/) {}
+  virtual void PrepareTxnDbOptions(SharedState* /*shared*/,
+                                   TransactionDBOptions& /*txn_db_opts*/) {}
 #endif
 
   std::shared_ptr<Cache> cache_;

View File

@@ -533,7 +533,7 @@ void MultiOpsTxnsStressTest::RegisterAdditionalListeners() {
 
 #ifndef ROCKSDB_LITE
 void MultiOpsTxnsStressTest::PrepareTxnDbOptions(
-    TransactionDBOptions& txn_db_opts) {
+    SharedState* /*shared*/, TransactionDBOptions& txn_db_opts) {
   // MultiOpsTxnStressTest uses SingleDelete to delete secondary keys, thus we
   // register this callback to let TxnDb know that when rolling back
   // a transaction, use only SingleDelete to cancel prior Put from the same

View File

@@ -265,7 +265,8 @@ class MultiOpsTxnsStressTest : public StressTest {
   void RegisterAdditionalListeners() override;
 
 #ifndef ROCKSDB_LITE
-  void PrepareTxnDbOptions(TransactionDBOptions& txn_db_opts) override;
+  void PrepareTxnDbOptions(SharedState* /*shared*/,
+                           TransactionDBOptions& txn_db_opts) override;
 #endif // !ROCKSDB_LITE
 
   Status PrimaryKeyUpdateTxn(ThreadState* thread, uint32_t old_a,

View File

@@ -12,6 +12,7 @@
 #ifndef NDEBUG
 #include "utilities/fault_injection_fs.h"
 #endif // NDEBUG
+#include "rocksdb/utilities/transaction_db.h"
 
 namespace ROCKSDB_NAMESPACE {
 class NonBatchedOpsStressTest : public StressTest {
@@ -930,6 +931,21 @@ class NonBatchedOpsStressTest : public StressTest {
     }
     return true;
   }
+
+#ifndef ROCKSDB_LITE
+  void PrepareTxnDbOptions(SharedState* shared,
+                           TransactionDBOptions& txn_db_opts) override {
+    txn_db_opts.rollback_deletion_type_callback =
+        [shared](TransactionDB*, ColumnFamilyHandle*, const Slice& key) {
+          assert(shared);
+          uint64_t key_num = 0;
+          bool ok = GetIntVal(key.ToString(), &key_num);
+          assert(ok);
+          (void)ok;
+          return !shared->AllowsOverwrite(key_num);
+        };
+  }
+#endif // ROCKSDB_LITE
 };
 
 StressTest* CreateNonBatchedOpsStressTest() {

View File

@@ -39,6 +39,7 @@ class CompactionFilter : public Customizable {
   enum class Decision {
     kKeep,
     kRemove,
+    kRemoveWithSingleDelete,
     kChangeValue,
     kRemoveAndSkipUntil,
     kChangeBlobIndex, // used internally by BlobDB.

View File

@@ -384,6 +384,8 @@ multiops_txn_default_params = {
     "flush_one_in": 1000,
     "key_spaces_path": setup_multiops_txn_key_spaces_file(),
     "rollback_one_in": 4,
+    # Re-enable once we have a compaction filter for MultiOpsTxnStressTest
+    "enable_compaction_filter": 0,
 }
 
 multiops_wc_txn_params = {