Fix bug of Checkpoint loses recent transactions with 2PC

Summary:
If 2PC is enabled, checkpoint may not copy previous log files that contain uncommitted prepare records. In this diff we keep those files.
Closes https://github.com/facebook/rocksdb/pull/1724

Differential Revision: D4368319

Pulled By: siying

fbshipit-source-id: cc2c746
This commit is contained in:
Siying Dong 2016-12-28 11:53:29 -08:00 committed by Islam AbdelRahman
parent a00f9bc498
commit beb5daeeac
7 changed files with 196 additions and 27 deletions

View File

@ -746,6 +746,34 @@ void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
}
}
uint64_t DBImpl::MinLogNumberToKeep() {
uint64_t log_number = versions_->MinLogNumber();
if (allow_2pc()) {
// if are 2pc we must consider logs containing prepared
// sections of outstanding transactions.
//
// We must check min logs with outstanding prep before we check
// logs referneces by memtables because a log referenced by the
// first data structure could transition to the second under us.
//
// TODO(horuff): iterating over all column families under db mutex.
// should find more optimial solution
auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep();
if (min_log_in_prep_heap != 0 && min_log_in_prep_heap < log_number) {
log_number = min_log_in_prep_heap;
}
auto min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable();
if (min_log_refed_by_mem != 0 && min_log_refed_by_mem < log_number) {
log_number = min_log_refed_by_mem;
}
}
return log_number;
}
// * Returns the list of live files in 'sst_live'
// If it's doing full scan:
// * Returns the list of all files in the filesystem in
@ -804,32 +832,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
job_context->manifest_file_number = versions_->manifest_file_number();
job_context->pending_manifest_file_number =
versions_->pending_manifest_file_number();
job_context->log_number = versions_->MinLogNumber();
if (allow_2pc()) {
// if are 2pc we must consider logs containing prepared
// sections of outstanding transactions.
//
// We must check min logs with outstanding prep before we check
// logs referneces by memtables because a log referenced by the
// first data structure could transition to the second under us.
//
// TODO(horuff): iterating over all column families under db mutex.
// should find more optimial solution
auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep();
if (min_log_in_prep_heap != 0 &&
min_log_in_prep_heap < job_context->log_number) {
job_context->log_number = min_log_in_prep_heap;
}
auto min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable();
if (min_log_refed_by_mem != 0 &&
min_log_refed_by_mem < job_context->log_number) {
job_context->log_number = min_log_refed_by_mem;
}
}
job_context->log_number = MinLogNumberToKeep();
job_context->prev_log_number = versions_->prev_log_number();

View File

@ -390,6 +390,8 @@ class DBImpl : public DB {
// schedule a purge
void ScheduleBgLogWriterClose(JobContext* job_context);
uint64_t MinLogNumberToKeep();
// Returns the list of live files in 'live' and the list
// of all files in the filesystem in 'candidate_files'.
// If force == false and the last call was less than

View File

@ -221,6 +221,7 @@ static const std::string num_live_versions = "num-live-versions";
static const std::string current_version_number =
"current-super-version-number";
static const std::string estimate_live_data_size = "estimate-live-data-size";
static const std::string min_log_number_to_keep = "min-log-number-to-keep";
static const std::string base_level = "base-level";
static const std::string total_sst_files_size = "total-sst-files-size";
static const std::string estimate_pending_comp_bytes =
@ -285,6 +286,8 @@ const std::string DB::Properties::kCurrentSuperVersionNumber =
rocksdb_prefix + current_version_number;
const std::string DB::Properties::kEstimateLiveDataSize =
rocksdb_prefix + estimate_live_data_size;
const std::string DB::Properties::kMinLogNumberToKeep =
rocksdb_prefix + min_log_number_to_keep;
const std::string DB::Properties::kTotalSstFilesSize =
rocksdb_prefix + total_sst_files_size;
const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level;
@ -368,6 +371,8 @@ const std::unordered_map<std::string, DBPropertyInfo>
nullptr}},
{DB::Properties::kEstimateLiveDataSize,
{true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr}},
{DB::Properties::kMinLogNumberToKeep,
{false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr}},
{DB::Properties::kBaseLevel,
{false, nullptr, &InternalStats::HandleBaseLevel, nullptr}},
{DB::Properties::kTotalSstFilesSize,
@ -705,6 +710,12 @@ bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
return true;
}
bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
Version* version) {
*value = db->MinLogNumberToKeep();
return true;
}
void InternalStats::DumpDBStats(std::string* value) {
char buf[1000];
// DB-level stats, only available from default column family

View File

@ -401,6 +401,7 @@ class InternalStats {
Version* version);
bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
Version* version);
bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
// Total number of background errors encountered. Every time a flush task
// or compaction task fails, this counter is incremented. The failure can

View File

@ -500,6 +500,10 @@ class DB {
// live data in bytes.
static const std::string kEstimateLiveDataSize;
// "rocksdb.min-log-number-to-keep" - return the minmum log number of the
// log files that should be kept.
static const std::string kMinLogNumberToKeep;
// "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
// files.
// WARNING: may slow down online queries if there are too many files.
@ -565,6 +569,7 @@ class DB {
// "rocksdb.num-live-versions"
// "rocksdb.current-super-version-number"
// "rocksdb.estimate-live-data-size"
// "rocksdb.min-log-number-to-keep"
// "rocksdb.total-sst-files-size"
// "rocksdb.base-level"
// "rocksdb.estimate-pending-compaction-bytes"

View File

@ -62,6 +62,8 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
Status s;
std::vector<std::string> live_files;
uint64_t manifest_file_size = 0;
bool allow_2pc = db_->GetDBOptions().allow_2pc;
uint64_t min_log_num = port::kMaxUint64;
uint64_t sequence_number = db_->GetLatestSequenceNumber();
bool same_fs = true;
VectorLogPtr live_wal_files;
@ -78,6 +80,35 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
if (s.ok()) {
// this will return live_files prefixed with "/"
s = db_->GetLiveFiles(live_files, &manifest_file_size);
if (s.ok() && allow_2pc) {
// If 2PC is enabled, we need to get minimum log number after the flush.
// Need to refetch the live files to recapture the snapshot.
if (!db_->GetIntProperty(DB::Properties::kMinLogNumberToKeep,
&min_log_num)) {
db_->EnableFileDeletions(false);
return Status::InvalidArgument(
"2PC enabled but cannot fine the min log number to keep.");
}
// We need to refetch live files with flush to handle this case:
// A previous 000001.log contains the prepare record of transaction tnx1.
// The current log file is 000002.log, and sequence_number points to this
// file.
// After calling GetLiveFiles(), 000003.log is created.
// Then tnx1 is committed. The commit record is written to 000003.log.
// Now we fetch min_log_num, which will be 3.
// Then only 000002.log and 000003.log will be copied, and 000001.log will
// be skipped. 000003.log contains commit message of tnx1, but we don't
// have respective prepare record for it.
// In order to avoid this situation, we need to force flush to make sure
// all transactions commited before getting min_log_num will be flushed
// to SST files.
// We cannot get min_log_num before calling the GetLiveFiles() for the
// first time, because if we do that, all the logs files will be included,
// far more than needed.
s = db_->GetLiveFiles(live_files, &manifest_file_size, /* flush */ true);
}
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
}
@ -156,7 +187,8 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
// that has changes after the last flush.
for (size_t i = 0; s.ok() && i < wal_size; ++i) {
if ((live_wal_files[i]->Type() == kAliveLogFile) &&
(live_wal_files[i]->StartSequence() >= sequence_number)) {
(live_wal_files[i]->StartSequence() >= sequence_number ||
live_wal_files[i]->LogNumber() >= min_log_num)) {
if (i + 1 == wal_size) {
Log(db_->GetOptions().info_log, "Copying %s",
live_wal_files[i]->PathName().c_str());

View File

@ -21,6 +21,7 @@
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/utilities/checkpoint.h"
#include "rocksdb/utilities/transaction_db.h"
#include "util/sync_point.h"
#include "util/testharness.h"
#include "util/xfunc.h"
@ -390,6 +391,120 @@ TEST_F(DBTest, CurrentFileModifiedWhileCheckpointing) {
snapshotDB = nullptr;
}
TEST_F(DBTest, CurrentFileModifiedWhileCheckpointing2PC) {
const std::string kSnapshotName = test::TmpDir(env_) + "/snapshot";
const std::string dbname = test::TmpDir() + "/transaction_testdb";
ASSERT_OK(DestroyDB(kSnapshotName, CurrentOptions()));
ASSERT_OK(DestroyDB(dbname, CurrentOptions()));
env_->DeleteDir(kSnapshotName);
env_->DeleteDir(dbname);
Close();
Options options = CurrentOptions();
// allow_2pc is implicitly set with tx prepare
// options.allow_2pc = true;
TransactionDBOptions txn_db_options;
TransactionDB* txdb;
Status s = TransactionDB::Open(options, txn_db_options, dbname, &txdb);
assert(s.ok());
ColumnFamilyHandle* cfa;
ColumnFamilyHandle* cfb;
ColumnFamilyOptions cf_options;
ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFA", &cfa));
WriteOptions write_options;
// Insert something into CFB so lots of log files will be kept
// before creating the checkpoint.
ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFB", &cfb));
ASSERT_OK(txdb->Put(write_options, cfb, "", ""));
ReadOptions read_options;
std::string value;
TransactionOptions txn_options;
Transaction* txn = txdb->BeginTransaction(write_options, txn_options);
s = txn->SetName("xid");
ASSERT_OK(s);
ASSERT_EQ(txdb->GetTransactionByName("xid"), txn);
s = txn->Put(Slice("foo"), Slice("bar"));
s = txn->Put(cfa, Slice("foocfa"), Slice("barcfa"));
ASSERT_OK(s);
// Writing prepare into middle of first WAL, then flush WALs many times
for (int i = 1; i <= 100000; i++) {
Transaction* tx = txdb->BeginTransaction(write_options, txn_options);
ASSERT_OK(tx->SetName("x"));
ASSERT_OK(tx->Put(Slice(std::to_string(i)), Slice("val")));
ASSERT_OK(tx->Put(cfa, Slice("aaa"), Slice("111")));
ASSERT_OK(tx->Prepare());
ASSERT_OK(tx->Commit());
if (i % 10000 == 0) {
txdb->Flush(FlushOptions());
}
if (i == 88888) {
ASSERT_OK(txn->Prepare());
}
}
rocksdb::SyncPoint::GetInstance()->LoadDependency(
{{"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
"DBTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit"},
{"DBTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit",
"CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
std::thread t([&]() {
Checkpoint* checkpoint;
ASSERT_OK(Checkpoint::Create(txdb, &checkpoint));
ASSERT_OK(checkpoint->CreateCheckpoint(kSnapshotName));
delete checkpoint;
});
TEST_SYNC_POINT("DBTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit");
ASSERT_OK(txn->Commit());
TEST_SYNC_POINT(
"DBTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit");
t.join();
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
// No more than two logs files should exist.
std::vector<std::string> files;
env_->GetChildren(kSnapshotName, &files);
int num_log_files = 0;
for (auto& file : files) {
uint64_t num;
FileType type;
WalFileType log_type;
if (ParseFileName(file, &num, &type, &log_type) && type == kLogFile) {
num_log_files++;
}
}
// One flush after preapare + one outstanding file before checkpoint + one log
// file generated after checkpoint.
ASSERT_LE(num_log_files, 3);
TransactionDB* snapshotDB;
std::vector<ColumnFamilyDescriptor> column_families;
column_families.push_back(
ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
column_families.push_back(
ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
column_families.push_back(
ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
std::vector<rocksdb::ColumnFamilyHandle*> cf_handles;
ASSERT_OK(TransactionDB::Open(options, txn_db_options, kSnapshotName,
column_families, &cf_handles, &snapshotDB));
ASSERT_OK(snapshotDB->Get(read_options, "foo", &value));
ASSERT_EQ(value, "bar");
ASSERT_OK(snapshotDB->Get(read_options, cf_handles[1], "foocfa", &value));
ASSERT_EQ(value, "barcfa");
delete cfa;
delete cfb;
delete cf_handles[0];
delete cf_handles[1];
delete cf_handles[2];
delete snapshotDB;
snapshotDB = nullptr;
}
} // namespace rocksdb
int main(int argc, char** argv) {