Fix bug of Checkpoint loses recent transactions with 2PC
Summary: If 2PC is enabled, checkpoint may not copy previous log files that contain uncommitted prepare records. In this diff we keep those files. Closes https://github.com/facebook/rocksdb/pull/1724 Differential Revision: D4368319 Pulled By: siying fbshipit-source-id: cc2c746
This commit is contained in:
parent
a00f9bc498
commit
beb5daeeac
@ -746,6 +746,34 @@ void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t DBImpl::MinLogNumberToKeep() {
|
||||||
|
uint64_t log_number = versions_->MinLogNumber();
|
||||||
|
|
||||||
|
if (allow_2pc()) {
|
||||||
|
// if are 2pc we must consider logs containing prepared
|
||||||
|
// sections of outstanding transactions.
|
||||||
|
//
|
||||||
|
// We must check min logs with outstanding prep before we check
|
||||||
|
// logs referneces by memtables because a log referenced by the
|
||||||
|
// first data structure could transition to the second under us.
|
||||||
|
//
|
||||||
|
// TODO(horuff): iterating over all column families under db mutex.
|
||||||
|
// should find more optimial solution
|
||||||
|
auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep();
|
||||||
|
|
||||||
|
if (min_log_in_prep_heap != 0 && min_log_in_prep_heap < log_number) {
|
||||||
|
log_number = min_log_in_prep_heap;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable();
|
||||||
|
|
||||||
|
if (min_log_refed_by_mem != 0 && min_log_refed_by_mem < log_number) {
|
||||||
|
log_number = min_log_refed_by_mem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return log_number;
|
||||||
|
}
|
||||||
|
|
||||||
// * Returns the list of live files in 'sst_live'
|
// * Returns the list of live files in 'sst_live'
|
||||||
// If it's doing full scan:
|
// If it's doing full scan:
|
||||||
// * Returns the list of all files in the filesystem in
|
// * Returns the list of all files in the filesystem in
|
||||||
@ -804,32 +832,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|||||||
job_context->manifest_file_number = versions_->manifest_file_number();
|
job_context->manifest_file_number = versions_->manifest_file_number();
|
||||||
job_context->pending_manifest_file_number =
|
job_context->pending_manifest_file_number =
|
||||||
versions_->pending_manifest_file_number();
|
versions_->pending_manifest_file_number();
|
||||||
job_context->log_number = versions_->MinLogNumber();
|
job_context->log_number = MinLogNumberToKeep();
|
||||||
|
|
||||||
if (allow_2pc()) {
|
|
||||||
// if are 2pc we must consider logs containing prepared
|
|
||||||
// sections of outstanding transactions.
|
|
||||||
//
|
|
||||||
// We must check min logs with outstanding prep before we check
|
|
||||||
// logs referneces by memtables because a log referenced by the
|
|
||||||
// first data structure could transition to the second under us.
|
|
||||||
//
|
|
||||||
// TODO(horuff): iterating over all column families under db mutex.
|
|
||||||
// should find more optimial solution
|
|
||||||
auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep();
|
|
||||||
|
|
||||||
if (min_log_in_prep_heap != 0 &&
|
|
||||||
min_log_in_prep_heap < job_context->log_number) {
|
|
||||||
job_context->log_number = min_log_in_prep_heap;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable();
|
|
||||||
|
|
||||||
if (min_log_refed_by_mem != 0 &&
|
|
||||||
min_log_refed_by_mem < job_context->log_number) {
|
|
||||||
job_context->log_number = min_log_refed_by_mem;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
job_context->prev_log_number = versions_->prev_log_number();
|
job_context->prev_log_number = versions_->prev_log_number();
|
||||||
|
|
||||||
|
@ -390,6 +390,8 @@ class DBImpl : public DB {
|
|||||||
// schedule a purge
|
// schedule a purge
|
||||||
void ScheduleBgLogWriterClose(JobContext* job_context);
|
void ScheduleBgLogWriterClose(JobContext* job_context);
|
||||||
|
|
||||||
|
uint64_t MinLogNumberToKeep();
|
||||||
|
|
||||||
// Returns the list of live files in 'live' and the list
|
// Returns the list of live files in 'live' and the list
|
||||||
// of all files in the filesystem in 'candidate_files'.
|
// of all files in the filesystem in 'candidate_files'.
|
||||||
// If force == false and the last call was less than
|
// If force == false and the last call was less than
|
||||||
|
@ -221,6 +221,7 @@ static const std::string num_live_versions = "num-live-versions";
|
|||||||
static const std::string current_version_number =
|
static const std::string current_version_number =
|
||||||
"current-super-version-number";
|
"current-super-version-number";
|
||||||
static const std::string estimate_live_data_size = "estimate-live-data-size";
|
static const std::string estimate_live_data_size = "estimate-live-data-size";
|
||||||
|
static const std::string min_log_number_to_keep = "min-log-number-to-keep";
|
||||||
static const std::string base_level = "base-level";
|
static const std::string base_level = "base-level";
|
||||||
static const std::string total_sst_files_size = "total-sst-files-size";
|
static const std::string total_sst_files_size = "total-sst-files-size";
|
||||||
static const std::string estimate_pending_comp_bytes =
|
static const std::string estimate_pending_comp_bytes =
|
||||||
@ -285,6 +286,8 @@ const std::string DB::Properties::kCurrentSuperVersionNumber =
|
|||||||
rocksdb_prefix + current_version_number;
|
rocksdb_prefix + current_version_number;
|
||||||
const std::string DB::Properties::kEstimateLiveDataSize =
|
const std::string DB::Properties::kEstimateLiveDataSize =
|
||||||
rocksdb_prefix + estimate_live_data_size;
|
rocksdb_prefix + estimate_live_data_size;
|
||||||
|
const std::string DB::Properties::kMinLogNumberToKeep =
|
||||||
|
rocksdb_prefix + min_log_number_to_keep;
|
||||||
const std::string DB::Properties::kTotalSstFilesSize =
|
const std::string DB::Properties::kTotalSstFilesSize =
|
||||||
rocksdb_prefix + total_sst_files_size;
|
rocksdb_prefix + total_sst_files_size;
|
||||||
const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level;
|
const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level;
|
||||||
@ -368,6 +371,8 @@ const std::unordered_map<std::string, DBPropertyInfo>
|
|||||||
nullptr}},
|
nullptr}},
|
||||||
{DB::Properties::kEstimateLiveDataSize,
|
{DB::Properties::kEstimateLiveDataSize,
|
||||||
{true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr}},
|
{true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr}},
|
||||||
|
{DB::Properties::kMinLogNumberToKeep,
|
||||||
|
{false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr}},
|
||||||
{DB::Properties::kBaseLevel,
|
{DB::Properties::kBaseLevel,
|
||||||
{false, nullptr, &InternalStats::HandleBaseLevel, nullptr}},
|
{false, nullptr, &InternalStats::HandleBaseLevel, nullptr}},
|
||||||
{DB::Properties::kTotalSstFilesSize,
|
{DB::Properties::kTotalSstFilesSize,
|
||||||
@ -705,6 +710,12 @@ bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
|
||||||
|
Version* version) {
|
||||||
|
*value = db->MinLogNumberToKeep();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void InternalStats::DumpDBStats(std::string* value) {
|
void InternalStats::DumpDBStats(std::string* value) {
|
||||||
char buf[1000];
|
char buf[1000];
|
||||||
// DB-level stats, only available from default column family
|
// DB-level stats, only available from default column family
|
||||||
|
@ -401,6 +401,7 @@ class InternalStats {
|
|||||||
Version* version);
|
Version* version);
|
||||||
bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
|
bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
|
||||||
Version* version);
|
Version* version);
|
||||||
|
bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
|
||||||
|
|
||||||
// Total number of background errors encountered. Every time a flush task
|
// Total number of background errors encountered. Every time a flush task
|
||||||
// or compaction task fails, this counter is incremented. The failure can
|
// or compaction task fails, this counter is incremented. The failure can
|
||||||
|
@ -500,6 +500,10 @@ class DB {
|
|||||||
// live data in bytes.
|
// live data in bytes.
|
||||||
static const std::string kEstimateLiveDataSize;
|
static const std::string kEstimateLiveDataSize;
|
||||||
|
|
||||||
|
// "rocksdb.min-log-number-to-keep" - return the minmum log number of the
|
||||||
|
// log files that should be kept.
|
||||||
|
static const std::string kMinLogNumberToKeep;
|
||||||
|
|
||||||
// "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
|
// "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
|
||||||
// files.
|
// files.
|
||||||
// WARNING: may slow down online queries if there are too many files.
|
// WARNING: may slow down online queries if there are too many files.
|
||||||
@ -565,6 +569,7 @@ class DB {
|
|||||||
// "rocksdb.num-live-versions"
|
// "rocksdb.num-live-versions"
|
||||||
// "rocksdb.current-super-version-number"
|
// "rocksdb.current-super-version-number"
|
||||||
// "rocksdb.estimate-live-data-size"
|
// "rocksdb.estimate-live-data-size"
|
||||||
|
// "rocksdb.min-log-number-to-keep"
|
||||||
// "rocksdb.total-sst-files-size"
|
// "rocksdb.total-sst-files-size"
|
||||||
// "rocksdb.base-level"
|
// "rocksdb.base-level"
|
||||||
// "rocksdb.estimate-pending-compaction-bytes"
|
// "rocksdb.estimate-pending-compaction-bytes"
|
||||||
|
@ -62,6 +62,8 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
|
|||||||
Status s;
|
Status s;
|
||||||
std::vector<std::string> live_files;
|
std::vector<std::string> live_files;
|
||||||
uint64_t manifest_file_size = 0;
|
uint64_t manifest_file_size = 0;
|
||||||
|
bool allow_2pc = db_->GetDBOptions().allow_2pc;
|
||||||
|
uint64_t min_log_num = port::kMaxUint64;
|
||||||
uint64_t sequence_number = db_->GetLatestSequenceNumber();
|
uint64_t sequence_number = db_->GetLatestSequenceNumber();
|
||||||
bool same_fs = true;
|
bool same_fs = true;
|
||||||
VectorLogPtr live_wal_files;
|
VectorLogPtr live_wal_files;
|
||||||
@ -78,6 +80,35 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
|
|||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
// this will return live_files prefixed with "/"
|
// this will return live_files prefixed with "/"
|
||||||
s = db_->GetLiveFiles(live_files, &manifest_file_size);
|
s = db_->GetLiveFiles(live_files, &manifest_file_size);
|
||||||
|
|
||||||
|
if (s.ok() && allow_2pc) {
|
||||||
|
// If 2PC is enabled, we need to get minimum log number after the flush.
|
||||||
|
// Need to refetch the live files to recapture the snapshot.
|
||||||
|
if (!db_->GetIntProperty(DB::Properties::kMinLogNumberToKeep,
|
||||||
|
&min_log_num)) {
|
||||||
|
db_->EnableFileDeletions(false);
|
||||||
|
return Status::InvalidArgument(
|
||||||
|
"2PC enabled but cannot fine the min log number to keep.");
|
||||||
|
}
|
||||||
|
// We need to refetch live files with flush to handle this case:
|
||||||
|
// A previous 000001.log contains the prepare record of transaction tnx1.
|
||||||
|
// The current log file is 000002.log, and sequence_number points to this
|
||||||
|
// file.
|
||||||
|
// After calling GetLiveFiles(), 000003.log is created.
|
||||||
|
// Then tnx1 is committed. The commit record is written to 000003.log.
|
||||||
|
// Now we fetch min_log_num, which will be 3.
|
||||||
|
// Then only 000002.log and 000003.log will be copied, and 000001.log will
|
||||||
|
// be skipped. 000003.log contains commit message of tnx1, but we don't
|
||||||
|
// have respective prepare record for it.
|
||||||
|
// In order to avoid this situation, we need to force flush to make sure
|
||||||
|
// all transactions commited before getting min_log_num will be flushed
|
||||||
|
// to SST files.
|
||||||
|
// We cannot get min_log_num before calling the GetLiveFiles() for the
|
||||||
|
// first time, because if we do that, all the logs files will be included,
|
||||||
|
// far more than needed.
|
||||||
|
s = db_->GetLiveFiles(live_files, &manifest_file_size, /* flush */ true);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
|
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
|
||||||
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
|
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
|
||||||
}
|
}
|
||||||
@ -156,7 +187,8 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
|
|||||||
// that has changes after the last flush.
|
// that has changes after the last flush.
|
||||||
for (size_t i = 0; s.ok() && i < wal_size; ++i) {
|
for (size_t i = 0; s.ok() && i < wal_size; ++i) {
|
||||||
if ((live_wal_files[i]->Type() == kAliveLogFile) &&
|
if ((live_wal_files[i]->Type() == kAliveLogFile) &&
|
||||||
(live_wal_files[i]->StartSequence() >= sequence_number)) {
|
(live_wal_files[i]->StartSequence() >= sequence_number ||
|
||||||
|
live_wal_files[i]->LogNumber() >= min_log_num)) {
|
||||||
if (i + 1 == wal_size) {
|
if (i + 1 == wal_size) {
|
||||||
Log(db_->GetOptions().info_log, "Copying %s",
|
Log(db_->GetOptions().info_log, "Copying %s",
|
||||||
live_wal_files[i]->PathName().c_str());
|
live_wal_files[i]->PathName().c_str());
|
||||||
|
@ -21,6 +21,7 @@
|
|||||||
#include "rocksdb/db.h"
|
#include "rocksdb/db.h"
|
||||||
#include "rocksdb/env.h"
|
#include "rocksdb/env.h"
|
||||||
#include "rocksdb/utilities/checkpoint.h"
|
#include "rocksdb/utilities/checkpoint.h"
|
||||||
|
#include "rocksdb/utilities/transaction_db.h"
|
||||||
#include "util/sync_point.h"
|
#include "util/sync_point.h"
|
||||||
#include "util/testharness.h"
|
#include "util/testharness.h"
|
||||||
#include "util/xfunc.h"
|
#include "util/xfunc.h"
|
||||||
@ -390,6 +391,120 @@ TEST_F(DBTest, CurrentFileModifiedWhileCheckpointing) {
|
|||||||
snapshotDB = nullptr;
|
snapshotDB = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(DBTest, CurrentFileModifiedWhileCheckpointing2PC) {
|
||||||
|
const std::string kSnapshotName = test::TmpDir(env_) + "/snapshot";
|
||||||
|
const std::string dbname = test::TmpDir() + "/transaction_testdb";
|
||||||
|
ASSERT_OK(DestroyDB(kSnapshotName, CurrentOptions()));
|
||||||
|
ASSERT_OK(DestroyDB(dbname, CurrentOptions()));
|
||||||
|
env_->DeleteDir(kSnapshotName);
|
||||||
|
env_->DeleteDir(dbname);
|
||||||
|
Close();
|
||||||
|
|
||||||
|
Options options = CurrentOptions();
|
||||||
|
// allow_2pc is implicitly set with tx prepare
|
||||||
|
// options.allow_2pc = true;
|
||||||
|
TransactionDBOptions txn_db_options;
|
||||||
|
TransactionDB* txdb;
|
||||||
|
Status s = TransactionDB::Open(options, txn_db_options, dbname, &txdb);
|
||||||
|
assert(s.ok());
|
||||||
|
ColumnFamilyHandle* cfa;
|
||||||
|
ColumnFamilyHandle* cfb;
|
||||||
|
ColumnFamilyOptions cf_options;
|
||||||
|
ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFA", &cfa));
|
||||||
|
|
||||||
|
WriteOptions write_options;
|
||||||
|
// Insert something into CFB so lots of log files will be kept
|
||||||
|
// before creating the checkpoint.
|
||||||
|
ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFB", &cfb));
|
||||||
|
ASSERT_OK(txdb->Put(write_options, cfb, "", ""));
|
||||||
|
|
||||||
|
ReadOptions read_options;
|
||||||
|
std::string value;
|
||||||
|
TransactionOptions txn_options;
|
||||||
|
Transaction* txn = txdb->BeginTransaction(write_options, txn_options);
|
||||||
|
s = txn->SetName("xid");
|
||||||
|
ASSERT_OK(s);
|
||||||
|
ASSERT_EQ(txdb->GetTransactionByName("xid"), txn);
|
||||||
|
|
||||||
|
s = txn->Put(Slice("foo"), Slice("bar"));
|
||||||
|
s = txn->Put(cfa, Slice("foocfa"), Slice("barcfa"));
|
||||||
|
ASSERT_OK(s);
|
||||||
|
// Writing prepare into middle of first WAL, then flush WALs many times
|
||||||
|
for (int i = 1; i <= 100000; i++) {
|
||||||
|
Transaction* tx = txdb->BeginTransaction(write_options, txn_options);
|
||||||
|
ASSERT_OK(tx->SetName("x"));
|
||||||
|
ASSERT_OK(tx->Put(Slice(std::to_string(i)), Slice("val")));
|
||||||
|
ASSERT_OK(tx->Put(cfa, Slice("aaa"), Slice("111")));
|
||||||
|
ASSERT_OK(tx->Prepare());
|
||||||
|
ASSERT_OK(tx->Commit());
|
||||||
|
if (i % 10000 == 0) {
|
||||||
|
txdb->Flush(FlushOptions());
|
||||||
|
}
|
||||||
|
if (i == 88888) {
|
||||||
|
ASSERT_OK(txn->Prepare());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rocksdb::SyncPoint::GetInstance()->LoadDependency(
|
||||||
|
{{"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
|
||||||
|
"DBTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit"},
|
||||||
|
{"DBTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit",
|
||||||
|
"CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
|
||||||
|
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
|
||||||
|
std::thread t([&]() {
|
||||||
|
Checkpoint* checkpoint;
|
||||||
|
ASSERT_OK(Checkpoint::Create(txdb, &checkpoint));
|
||||||
|
ASSERT_OK(checkpoint->CreateCheckpoint(kSnapshotName));
|
||||||
|
delete checkpoint;
|
||||||
|
});
|
||||||
|
TEST_SYNC_POINT("DBTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit");
|
||||||
|
ASSERT_OK(txn->Commit());
|
||||||
|
TEST_SYNC_POINT(
|
||||||
|
"DBTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit");
|
||||||
|
t.join();
|
||||||
|
|
||||||
|
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
|
||||||
|
|
||||||
|
// No more than two logs files should exist.
|
||||||
|
std::vector<std::string> files;
|
||||||
|
env_->GetChildren(kSnapshotName, &files);
|
||||||
|
int num_log_files = 0;
|
||||||
|
for (auto& file : files) {
|
||||||
|
uint64_t num;
|
||||||
|
FileType type;
|
||||||
|
WalFileType log_type;
|
||||||
|
if (ParseFileName(file, &num, &type, &log_type) && type == kLogFile) {
|
||||||
|
num_log_files++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// One flush after preapare + one outstanding file before checkpoint + one log
|
||||||
|
// file generated after checkpoint.
|
||||||
|
ASSERT_LE(num_log_files, 3);
|
||||||
|
|
||||||
|
TransactionDB* snapshotDB;
|
||||||
|
std::vector<ColumnFamilyDescriptor> column_families;
|
||||||
|
column_families.push_back(
|
||||||
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
|
||||||
|
column_families.push_back(
|
||||||
|
ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
|
||||||
|
column_families.push_back(
|
||||||
|
ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
|
||||||
|
std::vector<rocksdb::ColumnFamilyHandle*> cf_handles;
|
||||||
|
ASSERT_OK(TransactionDB::Open(options, txn_db_options, kSnapshotName,
|
||||||
|
column_families, &cf_handles, &snapshotDB));
|
||||||
|
ASSERT_OK(snapshotDB->Get(read_options, "foo", &value));
|
||||||
|
ASSERT_EQ(value, "bar");
|
||||||
|
ASSERT_OK(snapshotDB->Get(read_options, cf_handles[1], "foocfa", &value));
|
||||||
|
ASSERT_EQ(value, "barcfa");
|
||||||
|
|
||||||
|
delete cfa;
|
||||||
|
delete cfb;
|
||||||
|
delete cf_handles[0];
|
||||||
|
delete cf_handles[1];
|
||||||
|
delete cf_handles[2];
|
||||||
|
delete snapshotDB;
|
||||||
|
snapshotDB = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user