Fix seqno in ingested file boundary key metadata (#8209)

Summary:
Fixes https://github.com/facebook/rocksdb/issues/6245.

Adapted from https://github.com/facebook/rocksdb/issues/8201 and https://github.com/facebook/rocksdb/issues/8205.

Previously we were writing the ingested file's smallest/largest internal keys
with sequence number zero, or `kMaxSequenceNumber` in case of range
tombstone. The former (sequence number zero) is incorrect and can lead
to files being incorrectly ordered. The fix in this PR is to overwrite
boundary keys that have sequence number zero with the ingested file's assigned
sequence number.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8209

Test Plan: repro unit test

Reviewed By: riversand963

Differential Revision: D27885678

Pulled By: ajkr

fbshipit-source-id: 4a9f2c6efdfff81c3a9923e915ea88b250ee7b6a
This commit is contained in:
Andrew Kryczka 2021-04-20 13:59:24 -07:00 committed by Facebook GitHub Bot
parent 1b99947e99
commit 905dd17b35
3 changed files with 62 additions and 0 deletions

View File

@ -2,6 +2,7 @@
## Unreleased
### Bug Fixes
* Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but client returns error. The bug can cause CURRENT file to point to non-existing MANIFEST file, thus DB cannot be opened.
* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results.
* Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`.
## 6.20.0 (04/16/2021)

View File

@ -1542,6 +1542,44 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
ASSERT_EQ(2, NumTableFilesAtLevel(0));
}
TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) {
// Repro https://github.com/facebook/rocksdb/issues/6245.
// Flush three files to L0. Ingest one more file to trigger L0->L1 compaction
// via trivial move. The bug happened when L1 files were incorrectly sorted
// resulting in an old value for "k" returned by `Get()`.
Options options = CurrentOptions();
ASSERT_OK(Put("k", "a"));
Flush();
ASSERT_OK(Put("k", "a"));
Flush();
ASSERT_OK(Put("k", "a"));
Flush();
SstFileWriter sst_file_writer(EnvOptions(), options);
// Current file size should be 0 after sst_file_writer init and before open a
// file.
ASSERT_EQ(sst_file_writer.FileSize(), 0);
std::string file1 = sst_files_dir_ + "file1.sst";
ASSERT_OK(sst_file_writer.Open(file1));
ASSERT_OK(sst_file_writer.Put("k", "b"));
ExternalSstFileInfo file1_info;
Status s = sst_file_writer.Finish(&file1_info);
ASSERT_OK(s) << s.ToString();
// Current file size should be non-zero after success write.
ASSERT_GT(sst_file_writer.FileSize(), 0);
IngestExternalFileOptions ifo;
s = db_->IngestExternalFile({file1}, ifo);
ASSERT_OK(s);
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(Get("k"), "b");
}
INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
testing::Values(std::make_tuple(true, true),
std::make_tuple(true, false),

View File

@ -367,9 +367,32 @@ Status ExternalSstFileIngestionJob::Run() {
super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
last_seqno, &f, &assigned_seqno);
}
// Modify the smallest/largest internal key to include the sequence number
// that we just learned. Only overwrite sequence number zero. There could
// be a nonzero sequence number already to indicate a range tombstone's
// exclusive endpoint.
ParsedInternalKey smallest_parsed, largest_parsed;
if (status.ok()) {
status = ParseInternalKey(*f.smallest_internal_key.rep(),
&smallest_parsed, false /* log_err_key */);
}
if (status.ok()) {
status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed,
false /* log_err_key */);
}
if (!status.ok()) {
return status;
}
if (smallest_parsed.sequence == 0) {
UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno,
smallest_parsed.type);
}
if (largest_parsed.sequence == 0) {
UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno,
largest_parsed.type);
}
status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
&assigned_seqno);