Remove corrupted WAL files in kPointRecoveryMode with avoid_flush_duing_recovery set true (#9634)
Summary: 1) In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't flush the data from WAL to L0 for all column families if possible. As a result, not all column families can increase their log_numbers, and min_log_number_to_keep won't change. 2) For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change. If we persist a new MANIFEST with advanced log_numbers for some column families, then during a second crash after persisting the MANIFEST, RocksDB will see some column families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail. As a solution, 1. the corrupted WALs whose numbers are larger than the corrupted wal and smaller than the new WAL will be moved to archive folder. 2. Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful Pull Request resolved: https://github.com/facebook/rocksdb/pull/9634 Test Plan: 1. Added new unit tests 2. make crast_test -j Reviewed By: riversand963 Differential Revision: D34463666 Pulled By: akankshamahajan15 fbshipit-source-id: e233d3af0ed4e2028ca0cf051e5a334a0fdc9d19
This commit is contained in:
parent
63e68a4e77
commit
ae82d91492
@ -9,6 +9,8 @@
|
|||||||
* Fixed a bug affecting `track_and_verify_wals_in_manifest`. Without the fix, application may see "open error: Corruption: Missing WAL with log number" while trying to open the db. The corruption is a false alarm but prevents DB open (#9766).
|
* Fixed a bug affecting `track_and_verify_wals_in_manifest`. Without the fix, application may see "open error: Corruption: Missing WAL with log number" while trying to open the db. The corruption is a false alarm but prevents DB open (#9766).
|
||||||
* Fix segfault in FilePrefetchBuffer with async_io as it doesn't wait for pending jobs to complete on destruction.
|
* Fix segfault in FilePrefetchBuffer with async_io as it doesn't wait for pending jobs to complete on destruction.
|
||||||
* Fix ERROR_HANDLER_AUTORESUME_RETRY_COUNT stat whose value was set wrong in portal.h
|
* Fix ERROR_HANDLER_AUTORESUME_RETRY_COUNT stat whose value was set wrong in portal.h
|
||||||
|
* Fixed a bug for non-TransactionDB with avoid_flush_during_recovery = true and TransactionDB where in case of crash, min_log_number_to_keep may not change on recovery and persisting a new MANIFEST with advanced log_numbers for some column families, results in "column family inconsistency" error on second recovery. As a solution the corrupted WALs whose numbers are larger than the corrupted wal and smaller than the new WAL will be moved to archive folder.
|
||||||
|
* Fixed a bug in RocksDB DB::Open() which may creates and writes to two new MANIFEST files even before recovery succeeds. Now writes to MANIFEST are persisted only after recovery is successful.
|
||||||
|
|
||||||
### New Features
|
### New Features
|
||||||
* For db_bench when --seed=0 or --seed is not set then it uses the current time as the seed value. Previously it used the value 1000.
|
* For db_bench when --seed=0 or --seed is not set then it uses the current time as the seed value. Previously it used the value 1000.
|
||||||
|
@ -26,6 +26,7 @@
|
|||||||
#include "rocksdb/db.h"
|
#include "rocksdb/db.h"
|
||||||
#include "rocksdb/env.h"
|
#include "rocksdb/env.h"
|
||||||
#include "rocksdb/table.h"
|
#include "rocksdb/table.h"
|
||||||
|
#include "rocksdb/utilities/transaction_db.h"
|
||||||
#include "rocksdb/write_batch.h"
|
#include "rocksdb/write_batch.h"
|
||||||
#include "table/block_based/block_based_table_builder.h"
|
#include "table/block_based/block_based_table_builder.h"
|
||||||
#include "table/meta_blocks.h"
|
#include "table/meta_blocks.h"
|
||||||
@ -275,6 +276,42 @@ class CorruptionTest : public testing::Test {
|
|||||||
}
|
}
|
||||||
return Slice(*storage);
|
return Slice(*storage);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GetSortedWalFiles(std::vector<uint64_t>& file_nums) {
|
||||||
|
std::vector<std::string> tmp_files;
|
||||||
|
ASSERT_OK(env_->GetChildren(dbname_, &tmp_files));
|
||||||
|
FileType type = kWalFile;
|
||||||
|
for (const auto& file : tmp_files) {
|
||||||
|
uint64_t number = 0;
|
||||||
|
if (ParseFileName(file, &number, &type) && type == kWalFile) {
|
||||||
|
file_nums.push_back(number);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::sort(file_nums.begin(), file_nums.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
void CorruptFileWithTruncation(FileType file, uint64_t number,
|
||||||
|
uint64_t bytes_to_truncate = 0) {
|
||||||
|
std::string path;
|
||||||
|
switch (file) {
|
||||||
|
case FileType::kWalFile:
|
||||||
|
path = LogFileName(dbname_, number);
|
||||||
|
break;
|
||||||
|
// TODO: Add other file types as this method is being used for those file
|
||||||
|
// types.
|
||||||
|
default:
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
uint64_t old_size = 0;
|
||||||
|
ASSERT_OK(env_->GetFileSize(path, &old_size));
|
||||||
|
assert(old_size > bytes_to_truncate);
|
||||||
|
uint64_t new_size = old_size - bytes_to_truncate;
|
||||||
|
// If bytes_to_truncate == 0, it will do full truncation.
|
||||||
|
if (bytes_to_truncate == 0) {
|
||||||
|
new_size = old_size;
|
||||||
|
}
|
||||||
|
ASSERT_OK(test::TruncateFile(env_, path, new_size));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
TEST_F(CorruptionTest, Recovery) {
|
TEST_F(CorruptionTest, Recovery) {
|
||||||
@ -912,6 +949,313 @@ TEST_F(CorruptionTest, VerifyWholeTableChecksum) {
|
|||||||
ASSERT_EQ(1, count);
|
ASSERT_EQ(1, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class CrashDuringRecoveryWithCorruptionTest
|
||||||
|
: public CorruptionTest,
|
||||||
|
public testing::WithParamInterface<std::tuple<bool, bool>> {
|
||||||
|
public:
|
||||||
|
explicit CrashDuringRecoveryWithCorruptionTest()
|
||||||
|
: CorruptionTest(),
|
||||||
|
avoid_flush_during_recovery_(std::get<0>(GetParam())),
|
||||||
|
track_and_verify_wals_in_manifest_(std::get<1>(GetParam())) {}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
const bool avoid_flush_during_recovery_;
|
||||||
|
const bool track_and_verify_wals_in_manifest_;
|
||||||
|
};
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest,
|
||||||
|
::testing::Values(std::make_tuple(true, false),
|
||||||
|
std::make_tuple(false, false),
|
||||||
|
std::make_tuple(true, true),
|
||||||
|
std::make_tuple(false, true)));
|
||||||
|
|
||||||
|
// In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB
|
||||||
|
// won't flush the data from WAL to L0 for all column families if possible. As a
|
||||||
|
// result, not all column families can increase their log_numbers, and
|
||||||
|
// min_log_number_to_keep won't change.
|
||||||
|
// It may prematurely persist a new MANIFEST even before we can declare the DB
|
||||||
|
// is in consistent state after recovery (this is when the new WAL is synced)
|
||||||
|
// and advances log_numbers for some column families.
|
||||||
|
//
|
||||||
|
// If there is power failure before we sync the new WAL, we will end up in
|
||||||
|
// a situation in which after persisting the MANIFEST, RocksDB will see some
|
||||||
|
// column families' log_numbers larger than the corrupted wal, and
|
||||||
|
// "Column family inconsistency: SST file contains data beyond the point of
|
||||||
|
// corruption" error will be hit, causing recovery to fail.
|
||||||
|
//
|
||||||
|
// After adding the fix, corrupted WALs whose numbers are larger than the
|
||||||
|
// corrupted wal and smaller than the new WAL are moved to a separate folder.
|
||||||
|
// Only after new WAL is synced, RocksDB persist a new MANIFEST with column
|
||||||
|
// families to ensure RocksDB is in consistent state.
|
||||||
|
// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
|
||||||
|
// synced immediately afterwards. The sequence number of the sentinel
|
||||||
|
// WriteBatch will be the next sequence number immediately after the largest
|
||||||
|
// sequence number recovered from previous WALs and MANIFEST because of which DB
|
||||||
|
// will be in consistent state.
|
||||||
|
TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
|
||||||
|
CloseDb();
|
||||||
|
Options options;
|
||||||
|
options.track_and_verify_wals_in_manifest =
|
||||||
|
track_and_verify_wals_in_manifest_;
|
||||||
|
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
|
||||||
|
options.avoid_flush_during_recovery = false;
|
||||||
|
options.env = env_;
|
||||||
|
ASSERT_OK(DestroyDB(dbname_, options));
|
||||||
|
options.create_if_missing = true;
|
||||||
|
options.max_write_buffer_number = 3;
|
||||||
|
|
||||||
|
Reopen(&options);
|
||||||
|
Status s;
|
||||||
|
const std::string test_cf_name = "test_cf";
|
||||||
|
ColumnFamilyHandle* cfh = nullptr;
|
||||||
|
s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
delete cfh;
|
||||||
|
CloseDb();
|
||||||
|
|
||||||
|
std::vector<ColumnFamilyDescriptor> cf_descs;
|
||||||
|
cf_descs.emplace_back(kDefaultColumnFamilyName, options);
|
||||||
|
cf_descs.emplace_back(test_cf_name, options);
|
||||||
|
std::vector<ColumnFamilyHandle*> handles;
|
||||||
|
|
||||||
|
// 1. Open and populate the DB. Write and flush default_cf several times to
|
||||||
|
// advance wal number so that some column families have advanced log_number
|
||||||
|
// while other don't.
|
||||||
|
{
|
||||||
|
ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
|
||||||
|
auto* dbimpl = static_cast_with_check<DBImpl>(db_);
|
||||||
|
assert(dbimpl);
|
||||||
|
|
||||||
|
// Write one key to test_cf.
|
||||||
|
ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
|
||||||
|
// Write to default_cf and flush this cf several times to advance wal
|
||||||
|
// number.
|
||||||
|
for (int i = 0; i < 2; ++i) {
|
||||||
|
ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value"));
|
||||||
|
ASSERT_OK(dbimpl->TEST_SwitchMemtable());
|
||||||
|
}
|
||||||
|
ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));
|
||||||
|
|
||||||
|
for (auto* h : handles) {
|
||||||
|
delete h;
|
||||||
|
}
|
||||||
|
handles.clear();
|
||||||
|
CloseDb();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Corrupt second last wal file to emulate power reset which caused the DB
|
||||||
|
// to lose the un-synced WAL.
|
||||||
|
{
|
||||||
|
std::vector<uint64_t> file_nums;
|
||||||
|
GetSortedWalFiles(file_nums);
|
||||||
|
size_t size = file_nums.size();
|
||||||
|
uint64_t log_num = file_nums[size - 2];
|
||||||
|
CorruptFileWithTruncation(FileType::kWalFile, log_num,
|
||||||
|
/*bytes_to_truncate=*/8);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. After first crash reopen the DB which contains corrupted WAL. Default
|
||||||
|
// family has higher log number than corrupted wal number.
|
||||||
|
//
|
||||||
|
// Case1: If avoid_flush_during_recovery = true, RocksDB won't flush the data
|
||||||
|
// from WAL to L0 for all column families (test_cf_name in this case). As a
|
||||||
|
// result, not all column families can increase their log_numbers, and
|
||||||
|
// min_log_number_to_keep won't change.
|
||||||
|
//
|
||||||
|
// Case2: If avoid_flush_during_recovery = false, all column families have
|
||||||
|
// flushed their data from WAL to L0 during recovery, and none of them will
|
||||||
|
// ever need to read the WALs again.
|
||||||
|
{
|
||||||
|
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
|
||||||
|
s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
for (auto* h : handles) {
|
||||||
|
delete h;
|
||||||
|
}
|
||||||
|
handles.clear();
|
||||||
|
CloseDb();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Corrupt max_wal_num to emulate second power reset which caused the
|
||||||
|
// DB to again lose the un-synced WAL.
|
||||||
|
{
|
||||||
|
std::vector<uint64_t> file_nums;
|
||||||
|
GetSortedWalFiles(file_nums);
|
||||||
|
size_t size = file_nums.size();
|
||||||
|
uint64_t log_num = file_nums[size - 1];
|
||||||
|
CorruptFileWithTruncation(FileType::kWalFile, log_num);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5. After second crash reopen the db with second corruption. Default family
|
||||||
|
// has higher log number than corrupted wal number.
|
||||||
|
//
|
||||||
|
// Case1: If avoid_flush_during_recovery = true, we persist a new
|
||||||
|
// MANIFEST with advanced log_numbers for some column families only after
|
||||||
|
// syncing the WAL. So during second crash, RocksDB will skip the corrupted
|
||||||
|
// WAL files as they have been moved to different folder. Since newly synced
|
||||||
|
// WAL file's sequence number (sentinel WriteBatch) will be the next
|
||||||
|
// sequence number immediately after the largest sequence number recovered
|
||||||
|
// from previous WALs and MANIFEST, db will be in consistent state and opens
|
||||||
|
// successfully.
|
||||||
|
//
|
||||||
|
// Case2: If avoid_flush_during_recovery = false, the corrupted WAL is below
|
||||||
|
// this number. So during a second crash after persisting the new MANIFEST,
|
||||||
|
// RocksDB will skip the corrupted WAL(s) because they are all below this
|
||||||
|
// bound. Therefore, we won't hit the "column family inconsistency" error
|
||||||
|
// message.
|
||||||
|
{
|
||||||
|
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
|
||||||
|
ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
|
||||||
|
for (auto* h : handles) {
|
||||||
|
delete h;
|
||||||
|
}
|
||||||
|
handles.clear();
|
||||||
|
CloseDb();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// In case of TransactionDB, it enables two-phase-commit. The prepare section of
|
||||||
|
// an uncommitted transaction always need to be kept. Even if we perform flush
|
||||||
|
// during recovery, we may still need to hold an old WAL. The
|
||||||
|
// min_log_number_to_keep won't change, and "Column family inconsistency: SST
|
||||||
|
// file contains data beyond the point of corruption" error will be hit, causing
|
||||||
|
// recovery to fail.
|
||||||
|
//
|
||||||
|
// After adding the fix, corrupted WALs whose numbers are larger than the
|
||||||
|
// corrupted wal and smaller than the new WAL are moved to a separate folder.
|
||||||
|
// Only after new WAL is synced, RocksDB persist a new MANIFEST with column
|
||||||
|
// families to ensure RocksDB is in consistent state.
|
||||||
|
// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
|
||||||
|
// synced immediately afterwards. The sequence number of the sentinel
|
||||||
|
// WriteBatch will be the next sequence number immediately after the largest
|
||||||
|
// sequence number recovered from previous WALs and MANIFEST because of which DB
|
||||||
|
// will be in consistent state.
|
||||||
|
TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
|
||||||
|
CloseDb();
|
||||||
|
Options options;
|
||||||
|
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
|
||||||
|
options.track_and_verify_wals_in_manifest =
|
||||||
|
track_and_verify_wals_in_manifest_;
|
||||||
|
options.avoid_flush_during_recovery = false;
|
||||||
|
options.env = env_;
|
||||||
|
ASSERT_OK(DestroyDB(dbname_, options));
|
||||||
|
options.create_if_missing = true;
|
||||||
|
options.max_write_buffer_number = 3;
|
||||||
|
Reopen(&options);
|
||||||
|
|
||||||
|
// Create cf test_cf_name.
|
||||||
|
ColumnFamilyHandle* cfh = nullptr;
|
||||||
|
const std::string test_cf_name = "test_cf";
|
||||||
|
Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
|
||||||
|
ASSERT_OK(s);
|
||||||
|
delete cfh;
|
||||||
|
CloseDb();
|
||||||
|
|
||||||
|
std::vector<ColumnFamilyDescriptor> cf_descs;
|
||||||
|
cf_descs.emplace_back(kDefaultColumnFamilyName, options);
|
||||||
|
cf_descs.emplace_back(test_cf_name, options);
|
||||||
|
std::vector<ColumnFamilyHandle*> handles;
|
||||||
|
|
||||||
|
TransactionDB* txn_db = nullptr;
|
||||||
|
TransactionDBOptions txn_db_opts;
|
||||||
|
|
||||||
|
// 1. Open and populate the DB. Write and flush default_cf several times to
|
||||||
|
// advance wal number so that some column families have advanced log_number
|
||||||
|
// while other don't.
|
||||||
|
{
|
||||||
|
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
|
||||||
|
ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
|
||||||
|
&handles, &txn_db));
|
||||||
|
|
||||||
|
auto* txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
|
||||||
|
// Put cf1
|
||||||
|
ASSERT_OK(txn->Put(handles[1], "foo", "value"));
|
||||||
|
ASSERT_OK(txn->SetName("txn0"));
|
||||||
|
ASSERT_OK(txn->Prepare());
|
||||||
|
delete txn;
|
||||||
|
txn = nullptr;
|
||||||
|
|
||||||
|
auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB());
|
||||||
|
assert(dbimpl);
|
||||||
|
|
||||||
|
// Put and flush cf0
|
||||||
|
for (int i = 0; i < 2; ++i) {
|
||||||
|
ASSERT_OK(txn_db->Put(WriteOptions(), "dontcare", "value"));
|
||||||
|
ASSERT_OK(dbimpl->TEST_SwitchMemtable());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Put cf1
|
||||||
|
txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
|
||||||
|
ASSERT_OK(txn->Put(handles[1], "foo1", "value"));
|
||||||
|
ASSERT_OK(txn->Commit());
|
||||||
|
|
||||||
|
delete txn;
|
||||||
|
txn = nullptr;
|
||||||
|
for (auto* h : handles) {
|
||||||
|
delete h;
|
||||||
|
}
|
||||||
|
handles.clear();
|
||||||
|
delete txn_db;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Corrupt second last wal to emulate power reset which caused the DB to
|
||||||
|
// lose the un-synced WAL.
|
||||||
|
{
|
||||||
|
std::vector<uint64_t> file_nums;
|
||||||
|
GetSortedWalFiles(file_nums);
|
||||||
|
size_t size = file_nums.size();
|
||||||
|
uint64_t log_num = file_nums[size - 2];
|
||||||
|
CorruptFileWithTruncation(FileType::kWalFile, log_num,
|
||||||
|
/*bytes_to_truncate=*/8);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. After first crash reopen the DB which contains corrupted WAL. Default
|
||||||
|
// family has higher log number than corrupted wal number. There may be old
|
||||||
|
// WAL files that it must not delete because they can contain data of
|
||||||
|
// uncommitted transactions. As a result, min_log_number_to_keep won't change.
|
||||||
|
{
|
||||||
|
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
|
||||||
|
ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
|
||||||
|
&handles, &txn_db));
|
||||||
|
|
||||||
|
for (auto* h : handles) {
|
||||||
|
delete h;
|
||||||
|
}
|
||||||
|
handles.clear();
|
||||||
|
delete txn_db;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Corrupt max_wal_num to emulate second power reset which caused the
|
||||||
|
// DB to again lose the un-synced WAL.
|
||||||
|
{
|
||||||
|
std::vector<uint64_t> file_nums;
|
||||||
|
GetSortedWalFiles(file_nums);
|
||||||
|
size_t size = file_nums.size();
|
||||||
|
uint64_t log_num = file_nums[size - 1];
|
||||||
|
CorruptFileWithTruncation(FileType::kWalFile, log_num);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5. After second crash reopen the db with second corruption. Default family
|
||||||
|
// has higher log number than corrupted wal number.
|
||||||
|
// We persist a new MANIFEST with advanced log_numbers for some column
|
||||||
|
// families only after syncing the WAL. So during second crash, RocksDB will
|
||||||
|
// skip the corrupted WAL files as they have been moved to different folder.
|
||||||
|
// Since newly synced WAL file's sequence number (sentinel WriteBatch) will be
|
||||||
|
// the next sequence number immediately after the largest sequence number
|
||||||
|
// recovered from previous WALs and MANIFEST, db will be in consistent state
|
||||||
|
// and opens successfully.
|
||||||
|
{
|
||||||
|
options.avoid_flush_during_recovery = false;
|
||||||
|
|
||||||
|
ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
|
||||||
|
&handles, &txn_db));
|
||||||
|
for (auto* h : handles) {
|
||||||
|
delete h;
|
||||||
|
}
|
||||||
|
delete txn_db;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace ROCKSDB_NAMESPACE
|
} // namespace ROCKSDB_NAMESPACE
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
@ -175,7 +175,10 @@ TEST_F(DBBasicTest, ReadOnlyDB) {
|
|||||||
ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
|
ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) {
|
// TODO akanksha: Update the test to check that combination
|
||||||
|
// does not actually write to FS (use open read-only with
|
||||||
|
// CompositeEnvWrapper+ReadOnlyFileSystem).
|
||||||
|
TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
|
||||||
ASSERT_OK(Put("foo", "v1"));
|
ASSERT_OK(Put("foo", "v1"));
|
||||||
ASSERT_OK(Put("bar", "v2"));
|
ASSERT_OK(Put("bar", "v2"));
|
||||||
ASSERT_OK(Put("foo", "v3"));
|
ASSERT_OK(Put("foo", "v3"));
|
||||||
|
@ -1240,6 +1240,43 @@ class DBImpl : public DB {
|
|||||||
|
|
||||||
std::atomic<bool> shutting_down_;
|
std::atomic<bool> shutting_down_;
|
||||||
|
|
||||||
|
// RecoveryContext struct stores the context about version edits along
|
||||||
|
// with corresponding column_family_data and column_family_options.
|
||||||
|
class RecoveryContext {
|
||||||
|
public:
|
||||||
|
~RecoveryContext() {
|
||||||
|
for (auto& edit_list : edit_lists_) {
|
||||||
|
for (auto* edit : edit_list) {
|
||||||
|
delete edit;
|
||||||
|
}
|
||||||
|
edit_list.clear();
|
||||||
|
}
|
||||||
|
cfds_.clear();
|
||||||
|
mutable_cf_opts_.clear();
|
||||||
|
edit_lists_.clear();
|
||||||
|
files_to_delete_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
|
||||||
|
if (map_.find(cfd->GetID()) == map_.end()) {
|
||||||
|
uint32_t size = static_cast<uint32_t>(map_.size());
|
||||||
|
map_.emplace(cfd->GetID(), size);
|
||||||
|
cfds_.emplace_back(cfd);
|
||||||
|
mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
|
||||||
|
edit_lists_.emplace_back(autovector<VersionEdit*>());
|
||||||
|
}
|
||||||
|
uint32_t i = map_[cfd->GetID()];
|
||||||
|
edit_lists_[i].emplace_back(new VersionEdit(edit));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unordered_map<uint32_t, uint32_t> map_; // cf_id to index;
|
||||||
|
autovector<ColumnFamilyData*> cfds_;
|
||||||
|
autovector<const MutableCFOptions*> mutable_cf_opts_;
|
||||||
|
autovector<autovector<VersionEdit*>> edit_lists_;
|
||||||
|
// files_to_delete_ contains sst files
|
||||||
|
std::set<std::string> files_to_delete_;
|
||||||
|
};
|
||||||
|
|
||||||
// Except in DB::Open(), WriteOptionsFile can only be called when:
|
// Except in DB::Open(), WriteOptionsFile can only be called when:
|
||||||
// Persist options to options file.
|
// Persist options to options file.
|
||||||
// If need_mutex_lock = false, the method will lock DB mutex.
|
// If need_mutex_lock = false, the method will lock DB mutex.
|
||||||
@ -1356,16 +1393,19 @@ class DBImpl : public DB {
|
|||||||
// be made to the descriptor are added to *edit.
|
// be made to the descriptor are added to *edit.
|
||||||
// recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
|
// recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
|
||||||
// skipped.
|
// skipped.
|
||||||
|
// recovery_ctx stores the context about version edits and all those
|
||||||
|
// edits are persisted to new Manifest after successfully syncing the new WAL.
|
||||||
virtual Status Recover(
|
virtual Status Recover(
|
||||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||||
bool read_only = false, bool error_if_wal_file_exists = false,
|
bool read_only = false, bool error_if_wal_file_exists = false,
|
||||||
bool error_if_data_exists_in_wals = false,
|
bool error_if_data_exists_in_wals = false,
|
||||||
uint64_t* recovered_seq = nullptr);
|
uint64_t* recovered_seq = nullptr,
|
||||||
|
RecoveryContext* recovery_ctx = nullptr);
|
||||||
|
|
||||||
virtual bool OwnTablesAndLogs() const { return true; }
|
virtual bool OwnTablesAndLogs() const { return true; }
|
||||||
|
|
||||||
// Set DB identity file, and write DB ID to manifest if necessary.
|
// Set DB identity file, and write DB ID to manifest if necessary.
|
||||||
Status SetDBId(bool read_only);
|
Status SetDBId(bool read_only, RecoveryContext* recovery_ctx);
|
||||||
|
|
||||||
// REQUIRES: db mutex held when calling this function, but the db mutex can
|
// REQUIRES: db mutex held when calling this function, but the db mutex can
|
||||||
// be released and re-acquired. Db mutex will be held when the function
|
// be released and re-acquired. Db mutex will be held when the function
|
||||||
@ -1374,12 +1414,15 @@ class DBImpl : public DB {
|
|||||||
// not referenced in the MANIFEST (e.g.
|
// not referenced in the MANIFEST (e.g.
|
||||||
// 1. It's best effort recovery;
|
// 1. It's best effort recovery;
|
||||||
// 2. The VersionEdits referencing the SST files are appended to
|
// 2. The VersionEdits referencing the SST files are appended to
|
||||||
// MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are
|
// RecoveryContext, DB crashes when syncing the MANIFEST, the VersionEdits are
|
||||||
// still not synced to MANIFEST during recovery.)
|
// still not synced to MANIFEST during recovery.)
|
||||||
// We delete these SST files. In the
|
// It stores the SST files to be deleted in RecoveryContext. In the
|
||||||
// meantime, we find out the largest file number present in the paths, and
|
// meantime, we find out the largest file number present in the paths, and
|
||||||
// bump up the version set's next_file_number_ to be 1 + largest_file_number.
|
// bump up the version set's next_file_number_ to be 1 + largest_file_number.
|
||||||
Status DeleteUnreferencedSstFiles();
|
// recovery_ctx stores the context about version edits and files to be
|
||||||
|
// deleted. All those edits are persisted to new Manifest after successfully
|
||||||
|
// syncing the new WAL.
|
||||||
|
Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
|
||||||
|
|
||||||
// SetDbSessionId() should be called in the constuctor DBImpl()
|
// SetDbSessionId() should be called in the constuctor DBImpl()
|
||||||
// to ensure that db_session_id_ gets updated every time the DB is opened
|
// to ensure that db_session_id_ gets updated every time the DB is opened
|
||||||
@ -1389,6 +1432,11 @@ class DBImpl : public DB {
|
|||||||
Status FailIfTsSizesMismatch(const ColumnFamilyHandle* column_family,
|
Status FailIfTsSizesMismatch(const ColumnFamilyHandle* column_family,
|
||||||
const Slice& ts) const;
|
const Slice& ts) const;
|
||||||
|
|
||||||
|
// recovery_ctx stores the context about version edits and
|
||||||
|
// LogAndApplyForRecovery persist all those edits to new Manifest after
|
||||||
|
// successfully syncing new WAL.
|
||||||
|
Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class DB;
|
friend class DB;
|
||||||
friend class ErrorHandler;
|
friend class ErrorHandler;
|
||||||
@ -1643,9 +1691,10 @@ class DBImpl : public DB {
|
|||||||
|
|
||||||
// REQUIRES: log_numbers are sorted in ascending order
|
// REQUIRES: log_numbers are sorted in ascending order
|
||||||
// corrupted_log_found is set to true if we recover from a corrupted log file.
|
// corrupted_log_found is set to true if we recover from a corrupted log file.
|
||||||
Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
Status RecoverLogFiles(std::vector<uint64_t>& log_numbers,
|
||||||
SequenceNumber* next_sequence, bool read_only,
|
SequenceNumber* next_sequence, bool read_only,
|
||||||
bool* corrupted_log_found);
|
bool* corrupted_log_found,
|
||||||
|
RecoveryContext* recovery_ctx);
|
||||||
|
|
||||||
// The following two methods are used to flush a memtable to
|
// The following two methods are used to flush a memtable to
|
||||||
// storage. The first one is used at database RecoveryTime (when the
|
// storage. The first one is used at database RecoveryTime (when the
|
||||||
@ -1655,6 +1704,12 @@ class DBImpl : public DB {
|
|||||||
Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
||||||
MemTable* mem, VersionEdit* edit);
|
MemTable* mem, VersionEdit* edit);
|
||||||
|
|
||||||
|
// Move all the WAL files starting from corrupted WAL found to
|
||||||
|
// max_wal_number to avoid column family inconsistency error on recovery. It
|
||||||
|
// also removes the deleted file from the vector wal_numbers.
|
||||||
|
void MoveCorruptedWalFiles(std::vector<uint64_t>& wal_numbers,
|
||||||
|
uint64_t corrupted_wal_number);
|
||||||
|
|
||||||
// Get the size of a log file and, if truncate is true, truncate the
|
// Get the size of a log file and, if truncate is true, truncate the
|
||||||
// log file to its actual size, thereby freeing preallocated space.
|
// log file to its actual size, thereby freeing preallocated space.
|
||||||
// Return success even if truncate fails
|
// Return success even if truncate fails
|
||||||
|
@ -863,7 +863,7 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
|
|||||||
return min_log_number_to_keep;
|
return min_log_number_to_keep;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status DBImpl::SetDBId(bool read_only) {
|
Status DBImpl::SetDBId(bool read_only, RecoveryContext* recovery_ctx) {
|
||||||
Status s;
|
Status s;
|
||||||
// Happens when immutable_db_options_.write_dbid_to_manifest is set to true
|
// Happens when immutable_db_options_.write_dbid_to_manifest is set to true
|
||||||
// the very first time.
|
// the very first time.
|
||||||
@ -890,14 +890,14 @@ Status DBImpl::SetDBId(bool read_only) {
|
|||||||
}
|
}
|
||||||
s = GetDbIdentityFromIdentityFile(&db_id_);
|
s = GetDbIdentityFromIdentityFile(&db_id_);
|
||||||
if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
|
if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
|
||||||
|
assert(!read_only);
|
||||||
|
assert(recovery_ctx != nullptr);
|
||||||
|
assert(versions_->GetColumnFamilySet() != nullptr);
|
||||||
VersionEdit edit;
|
VersionEdit edit;
|
||||||
edit.SetDBId(db_id_);
|
edit.SetDBId(db_id_);
|
||||||
Options options;
|
|
||||||
MutableCFOptions mutable_cf_options(options);
|
|
||||||
versions_->db_id_ = db_id_;
|
versions_->db_id_ = db_id_;
|
||||||
s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
|
recovery_ctx->UpdateVersionEdits(
|
||||||
mutable_cf_options, &edit, &mutex_, nullptr,
|
versions_->GetColumnFamilySet()->GetDefault(), edit);
|
||||||
/* new_descriptor_log */ false);
|
|
||||||
}
|
}
|
||||||
} else if (!read_only) {
|
} else if (!read_only) {
|
||||||
s = SetIdentityFile(env_, dbname_, db_id_);
|
s = SetIdentityFile(env_, dbname_, db_id_);
|
||||||
@ -905,7 +905,7 @@ Status DBImpl::SetDBId(bool read_only) {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status DBImpl::DeleteUnreferencedSstFiles() {
|
Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) {
|
||||||
mutex_.AssertHeld();
|
mutex_.AssertHeld();
|
||||||
std::vector<std::string> paths;
|
std::vector<std::string> paths;
|
||||||
paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
|
paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
|
||||||
@ -925,7 +925,6 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
|
|||||||
|
|
||||||
uint64_t next_file_number = versions_->current_next_file_number();
|
uint64_t next_file_number = versions_->current_next_file_number();
|
||||||
uint64_t largest_file_number = next_file_number;
|
uint64_t largest_file_number = next_file_number;
|
||||||
std::set<std::string> files_to_delete;
|
|
||||||
Status s;
|
Status s;
|
||||||
for (const auto& path : paths) {
|
for (const auto& path : paths) {
|
||||||
std::vector<std::string> files;
|
std::vector<std::string> files;
|
||||||
@ -943,8 +942,9 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
|
|||||||
const std::string normalized_fpath = path + fname;
|
const std::string normalized_fpath = path + fname;
|
||||||
largest_file_number = std::max(largest_file_number, number);
|
largest_file_number = std::max(largest_file_number, number);
|
||||||
if (type == kTableFile && number >= next_file_number &&
|
if (type == kTableFile && number >= next_file_number &&
|
||||||
files_to_delete.find(normalized_fpath) == files_to_delete.end()) {
|
recovery_ctx->files_to_delete_.find(normalized_fpath) ==
|
||||||
files_to_delete.insert(normalized_fpath);
|
recovery_ctx->files_to_delete_.end()) {
|
||||||
|
recovery_ctx->files_to_delete_.insert(normalized_fpath);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -961,21 +961,7 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
|
|||||||
assert(versions_->GetColumnFamilySet());
|
assert(versions_->GetColumnFamilySet());
|
||||||
ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
|
ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
|
||||||
assert(default_cfd);
|
assert(default_cfd);
|
||||||
s = versions_->LogAndApply(
|
recovery_ctx->UpdateVersionEdits(default_cfd, edit);
|
||||||
default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
|
|
||||||
directories_.GetDbDir(), /*new_descriptor_log*/ false);
|
|
||||||
if (!s.ok()) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
mutex_.Unlock();
|
|
||||||
for (const auto& fname : files_to_delete) {
|
|
||||||
s = env_->DeleteFile(fname);
|
|
||||||
if (!s.ok()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mutex_.Lock();
|
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -399,7 +399,7 @@ IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
|
|||||||
Status DBImpl::Recover(
|
Status DBImpl::Recover(
|
||||||
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
|
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
|
||||||
bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
|
bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
|
||||||
uint64_t* recovered_seq) {
|
uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
|
||||||
mutex_.AssertHeld();
|
mutex_.AssertHeld();
|
||||||
|
|
||||||
bool is_new_db = false;
|
bool is_new_db = false;
|
||||||
@ -518,9 +518,10 @@ Status DBImpl::Recover(
|
|||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
s = SetDBId(read_only);
|
|
||||||
|
s = SetDBId(read_only, recovery_ctx);
|
||||||
if (s.ok() && !read_only) {
|
if (s.ok() && !read_only) {
|
||||||
s = DeleteUnreferencedSstFiles();
|
s = DeleteUnreferencedSstFiles(recovery_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (immutable_db_options_.paranoid_checks && s.ok()) {
|
if (immutable_db_options_.paranoid_checks && s.ok()) {
|
||||||
@ -535,10 +536,6 @@ Status DBImpl::Recover(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// DB mutex is already held
|
|
||||||
if (s.ok() && immutable_db_options_.persist_stats_to_disk) {
|
|
||||||
s = InitPersistStatsColumnFamily();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string> files_in_wal_dir;
|
std::vector<std::string> files_in_wal_dir;
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
@ -608,7 +605,10 @@ Status DBImpl::Recover(
|
|||||||
WalNumber max_wal_number =
|
WalNumber max_wal_number =
|
||||||
versions_->GetWalSet().GetWals().rbegin()->first;
|
versions_->GetWalSet().GetWals().rbegin()->first;
|
||||||
edit.DeleteWalsBefore(max_wal_number + 1);
|
edit.DeleteWalsBefore(max_wal_number + 1);
|
||||||
s = versions_->LogAndApplyToDefaultColumnFamily(&edit, &mutex_);
|
assert(recovery_ctx != nullptr);
|
||||||
|
assert(versions_->GetColumnFamilySet() != nullptr);
|
||||||
|
recovery_ctx->UpdateVersionEdits(
|
||||||
|
versions_->GetColumnFamilySet()->GetDefault(), edit);
|
||||||
}
|
}
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
@ -644,8 +644,8 @@ Status DBImpl::Recover(
|
|||||||
std::sort(wals.begin(), wals.end());
|
std::sort(wals.begin(), wals.end());
|
||||||
|
|
||||||
bool corrupted_wal_found = false;
|
bool corrupted_wal_found = false;
|
||||||
s = RecoverLogFiles(wals, &next_sequence, read_only,
|
s = RecoverLogFiles(wals, &next_sequence, read_only, &corrupted_wal_found,
|
||||||
&corrupted_wal_found);
|
recovery_ctx);
|
||||||
if (corrupted_wal_found && recovered_seq != nullptr) {
|
if (corrupted_wal_found && recovered_seq != nullptr) {
|
||||||
*recovered_seq = next_sequence;
|
*recovered_seq = next_sequence;
|
||||||
}
|
}
|
||||||
@ -805,10 +805,30 @@ Status DBImpl::InitPersistStatsColumnFamily() {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
|
||||||
|
mutex_.AssertHeld();
|
||||||
|
assert(versions_->descriptor_log_ == nullptr);
|
||||||
|
Status s = versions_->LogAndApply(
|
||||||
|
recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_,
|
||||||
|
recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir());
|
||||||
|
if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) {
|
||||||
|
mutex_.Unlock();
|
||||||
|
for (const auto& fname : recovery_ctx.files_to_delete_) {
|
||||||
|
s = env_->DeleteFile(fname);
|
||||||
|
if (!s.ok()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mutex_.Lock();
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
// REQUIRES: wal_numbers are sorted in ascending order
|
// REQUIRES: wal_numbers are sorted in ascending order
|
||||||
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
Status DBImpl::RecoverLogFiles(std::vector<uint64_t>& wal_numbers,
|
||||||
SequenceNumber* next_sequence, bool read_only,
|
SequenceNumber* next_sequence, bool read_only,
|
||||||
bool* corrupted_wal_found) {
|
bool* corrupted_wal_found,
|
||||||
|
RecoveryContext* recovery_ctx) {
|
||||||
struct LogReporter : public log::Reader::Reporter {
|
struct LogReporter : public log::Reader::Reporter {
|
||||||
Env* env;
|
Env* env;
|
||||||
Logger* info_log;
|
Logger* info_log;
|
||||||
@ -833,6 +853,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|||||||
edit.SetColumnFamily(cfd->GetID());
|
edit.SetColumnFamily(cfd->GetID());
|
||||||
version_edits.insert({cfd->GetID(), edit});
|
version_edits.insert({cfd->GetID(), edit});
|
||||||
}
|
}
|
||||||
|
|
||||||
int job_id = next_job_id_.fetch_add(1);
|
int job_id = next_job_id_.fetch_add(1);
|
||||||
{
|
{
|
||||||
auto stream = event_logger_.Log();
|
auto stream = event_logger_.Log();
|
||||||
@ -1256,6 +1277,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|||||||
edit->SetLogNumber(max_wal_number + 1);
|
edit->SetLogNumber(max_wal_number + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
// we must mark the next log number as used, even though it's
|
// we must mark the next log number as used, even though it's
|
||||||
// not actually used. that is because VersionSet assumes
|
// not actually used. that is because VersionSet assumes
|
||||||
@ -1263,42 +1285,40 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|||||||
// log number
|
// log number
|
||||||
versions_->MarkFileNumberUsed(max_wal_number + 1);
|
versions_->MarkFileNumberUsed(max_wal_number + 1);
|
||||||
|
|
||||||
autovector<ColumnFamilyData*> cfds;
|
if (corrupted_wal_found != nullptr && *corrupted_wal_found == true &&
|
||||||
autovector<const MutableCFOptions*> cf_opts;
|
immutable_db_options_.wal_recovery_mode ==
|
||||||
autovector<autovector<VersionEdit*>> edit_lists;
|
WALRecoveryMode::kPointInTimeRecovery) {
|
||||||
for (auto* cfd : *versions_->GetColumnFamilySet()) {
|
MoveCorruptedWalFiles(wal_numbers, corrupted_wal_number);
|
||||||
cfds.push_back(cfd);
|
}
|
||||||
cf_opts.push_back(cfd->GetLatestMutableCFOptions());
|
|
||||||
auto iter = version_edits.find(cfd->GetID());
|
assert(recovery_ctx != nullptr);
|
||||||
assert(iter != version_edits.end());
|
for (auto* cfd : *versions_->GetColumnFamilySet()) {
|
||||||
edit_lists.push_back({&iter->second});
|
auto iter = version_edits.find(cfd->GetID());
|
||||||
|
assert(iter != version_edits.end());
|
||||||
|
recovery_ctx->UpdateVersionEdits(cfd, iter->second);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<VersionEdit> wal_deletion;
|
|
||||||
if (flushed) {
|
if (flushed) {
|
||||||
wal_deletion = std::make_unique<VersionEdit>();
|
VersionEdit wal_deletion;
|
||||||
if (immutable_db_options_.track_and_verify_wals_in_manifest) {
|
if (immutable_db_options_.track_and_verify_wals_in_manifest) {
|
||||||
wal_deletion->DeleteWalsBefore(max_wal_number + 1);
|
wal_deletion.DeleteWalsBefore(max_wal_number + 1);
|
||||||
}
|
}
|
||||||
if (!allow_2pc()) {
|
if (!allow_2pc()) {
|
||||||
// In non-2pc mode, flushing the memtables of the column families
|
// In non-2pc mode, flushing the memtables of the column families
|
||||||
// means we can advance min_log_number_to_keep.
|
// means we can advance min_log_number_to_keep.
|
||||||
wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1);
|
wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
|
||||||
}
|
}
|
||||||
edit_lists.back().push_back(wal_deletion.get());
|
assert(versions_->GetColumnFamilySet() != nullptr);
|
||||||
|
recovery_ctx->UpdateVersionEdits(
|
||||||
|
versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
|
||||||
}
|
}
|
||||||
|
|
||||||
// write MANIFEST with update
|
|
||||||
status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_,
|
|
||||||
directories_.GetDbDir(),
|
|
||||||
/*new_descriptor_log=*/true);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
if (data_seen && !flushed) {
|
if (data_seen && !flushed) {
|
||||||
status = RestoreAliveLogFiles(wal_numbers);
|
status = RestoreAliveLogFiles(wal_numbers);
|
||||||
} else {
|
} else if (!wal_numbers.empty()) {
|
||||||
// If there's no data in the WAL, or we flushed all the data, still
|
// If there's no data in the WAL, or we flushed all the data, still
|
||||||
// truncate the log file. If the process goes into a crash loop before
|
// truncate the log file. If the process goes into a crash loop before
|
||||||
// the file is deleted, the preallocated space will never get freed.
|
// the file is deleted, the preallocated space will never get freed.
|
||||||
@ -1314,6 +1334,48 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
|
|||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void DBImpl::MoveCorruptedWalFiles(std::vector<uint64_t>& wal_numbers,
|
||||||
|
uint64_t corrupted_wal_number) {
|
||||||
|
size_t num_wals = wal_numbers.size();
|
||||||
|
// Find the first corrupted wal.
|
||||||
|
auto iter = std::lower_bound(wal_numbers.begin(), wal_numbers.end(),
|
||||||
|
corrupted_wal_number);
|
||||||
|
auto corrupt_start_iter = iter;
|
||||||
|
|
||||||
|
// Increment iter to move WAL files from first corrupted_wal_number + 1.
|
||||||
|
iter++;
|
||||||
|
|
||||||
|
std::string archival_path =
|
||||||
|
ArchivalDirectory(immutable_db_options_.GetWalDir());
|
||||||
|
Status create_status = env_->CreateDirIfMissing(archival_path);
|
||||||
|
|
||||||
|
// create_status is only checked when it needs to move the corrupted WAL files
|
||||||
|
// to archive folder.
|
||||||
|
create_status.PermitUncheckedError();
|
||||||
|
|
||||||
|
// Truncate the last WAL to reclaim the pre allocated space before
|
||||||
|
// moving it.
|
||||||
|
GetLogSizeAndMaybeTruncate(wal_numbers.back(), /*truncate=*/true, nullptr)
|
||||||
|
.PermitUncheckedError();
|
||||||
|
|
||||||
|
// Move all the WAL files from corrupted_wal_number + 1 to last WAL
|
||||||
|
// (max_wal_number) to avoid column family inconsistency error to archival
|
||||||
|
// directory. If its unable to create archive dir, it will delete the
|
||||||
|
// corrupted WAL files.
|
||||||
|
// We are moving all but first corrupted WAL file to a different folder.
|
||||||
|
while (iter != wal_numbers.end()) {
|
||||||
|
LogFileNumberSize log(*iter);
|
||||||
|
std::string fname = LogFileName(immutable_db_options_.GetWalDir(), *iter);
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
if (create_status.ok()) {
|
||||||
|
wal_manager_.ArchiveWALFile(fname, *iter);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
iter++;
|
||||||
|
}
|
||||||
|
wal_numbers.erase(corrupt_start_iter + 1, wal_numbers.begin() + num_wals);
|
||||||
|
}
|
||||||
|
|
||||||
Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
|
Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
|
||||||
LogFileNumberSize* log_ptr) {
|
LogFileNumberSize* log_ptr) {
|
||||||
LogFileNumberSize log(wal_number);
|
LogFileNumberSize log(wal_number);
|
||||||
@ -1376,7 +1438,8 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
|
|||||||
// log has such preallocated space, so we only truncate for the last log.
|
// log has such preallocated space, so we only truncate for the last log.
|
||||||
LogFileNumberSize log;
|
LogFileNumberSize log;
|
||||||
s = GetLogSizeAndMaybeTruncate(
|
s = GetLogSizeAndMaybeTruncate(
|
||||||
wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
|
wal_number,
|
||||||
|
/*truncate=*/(wal_number == wal_numbers.back()), &log);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1737,9 +1800,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|||||||
impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
|
impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
|
||||||
|
|
||||||
impl->mutex_.Lock();
|
impl->mutex_.Lock();
|
||||||
|
|
||||||
|
RecoveryContext recovery_ctx;
|
||||||
|
|
||||||
// Handles create_if_missing, error_if_exists
|
// Handles create_if_missing, error_if_exists
|
||||||
uint64_t recovered_seq(kMaxSequenceNumber);
|
uint64_t recovered_seq(kMaxSequenceNumber);
|
||||||
s = impl->Recover(column_families, false, false, false, &recovered_seq);
|
s = impl->Recover(column_families, false, false, false, &recovered_seq,
|
||||||
|
&recovery_ctx);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
uint64_t new_log_number = impl->versions_->NewFileNumber();
|
uint64_t new_log_number = impl->versions_->NewFileNumber();
|
||||||
log::Writer* new_log = nullptr;
|
log::Writer* new_log = nullptr;
|
||||||
@ -1755,6 +1822,55 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|||||||
impl->logs_.emplace_back(new_log_number, new_log);
|
impl->logs_.emplace_back(new_log_number, new_log);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (s.ok()) {
|
||||||
|
if (impl->two_write_queues_) {
|
||||||
|
impl->log_write_mutex_.Lock();
|
||||||
|
}
|
||||||
|
impl->alive_log_files_.push_back(
|
||||||
|
DBImpl::LogFileNumberSize(impl->logfile_number_));
|
||||||
|
impl->alive_log_files_tail_ = impl->alive_log_files_.rbegin();
|
||||||
|
if (impl->two_write_queues_) {
|
||||||
|
impl->log_write_mutex_.Unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (s.ok()) {
|
||||||
|
// In WritePrepared there could be gap in sequence numbers. This breaks
|
||||||
|
// the trick we use in kPointInTimeRecovery which assumes the first seq
|
||||||
|
// in the log right after the corrupted log is one larger than the last
|
||||||
|
// seq we read from the wals. To let this trick keep working, we add a
|
||||||
|
// dummy entry with the expected sequence to the first log right after
|
||||||
|
// recovery. In non-WritePrepared case also the new log after recovery
|
||||||
|
// could be empty, and thus missing the consecutive seq hint to
|
||||||
|
// distinguish middle-log corruption to
|
||||||
|
// corrupted-log-remained-after-recovery. This case also will be
|
||||||
|
// addressed by a dummy write.
|
||||||
|
if (recovered_seq != kMaxSequenceNumber) {
|
||||||
|
WriteBatch empty_batch;
|
||||||
|
WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
|
||||||
|
WriteOptions write_options;
|
||||||
|
uint64_t log_used, log_size;
|
||||||
|
log::Writer* log_writer = impl->logs_.back().writer;
|
||||||
|
s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
|
||||||
|
Env::IO_TOTAL, /*with_db_mutex==*/true);
|
||||||
|
if (s.ok()) {
|
||||||
|
// Need to fsync, otherwise it might get lost after a power reset.
|
||||||
|
s = impl->FlushWAL(false);
|
||||||
|
if (s.ok()) {
|
||||||
|
s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (s.ok()) {
|
||||||
|
s = impl->LogAndApplyForRecovery(recovery_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
|
||||||
|
impl->mutex_.AssertHeld();
|
||||||
|
s = impl->InitPersistStatsColumnFamily();
|
||||||
|
}
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
// set column family handles
|
// set column family handles
|
||||||
for (auto cf : column_families) {
|
for (auto cf : column_families) {
|
||||||
@ -1783,6 +1899,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
SuperVersionContext sv_context(/* create_superversion */ true);
|
SuperVersionContext sv_context(/* create_superversion */ true);
|
||||||
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
||||||
@ -1790,43 +1907,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|||||||
cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
|
cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
|
||||||
}
|
}
|
||||||
sv_context.Clean();
|
sv_context.Clean();
|
||||||
if (impl->two_write_queues_) {
|
|
||||||
impl->log_write_mutex_.Lock();
|
|
||||||
}
|
|
||||||
impl->alive_log_files_.push_back(
|
|
||||||
DBImpl::LogFileNumberSize(impl->logfile_number_));
|
|
||||||
impl->alive_log_files_tail_ = impl->alive_log_files_.rbegin();
|
|
||||||
if (impl->two_write_queues_) {
|
|
||||||
impl->log_write_mutex_.Unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (s.ok()) {
|
|
||||||
// In WritePrepared there could be gap in sequence numbers. This breaks
|
|
||||||
// the trick we use in kPointInTimeRecovery which assumes the first seq in
|
|
||||||
// the log right after the corrupted log is one larger than the last seq
|
|
||||||
// we read from the wals. To let this trick keep working, we add a dummy
|
|
||||||
// entry with the expected sequence to the first log right after recovery.
|
|
||||||
// In non-WritePrepared case also the new log after recovery could be
|
|
||||||
// empty, and thus missing the consecutive seq hint to distinguish
|
|
||||||
// middle-log corruption to corrupted-log-remained-after-recovery. This
|
|
||||||
// case also will be addressed by a dummy write.
|
|
||||||
if (recovered_seq != kMaxSequenceNumber) {
|
|
||||||
WriteBatch empty_batch;
|
|
||||||
WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
|
|
||||||
WriteOptions write_options;
|
|
||||||
uint64_t log_used, log_size;
|
|
||||||
log::Writer* log_writer = impl->logs_.back().writer;
|
|
||||||
s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
|
|
||||||
Env::IO_TOTAL, /*with_db_mutex==*/true);
|
|
||||||
if (s.ok()) {
|
|
||||||
// Need to fsync, otherwise it might get lost after a power reset.
|
|
||||||
s = impl->FlushWAL(false);
|
|
||||||
if (s.ok()) {
|
|
||||||
s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
|
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
|
||||||
// try to read format version
|
// try to read format version
|
||||||
@ -1853,7 +1933,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|||||||
if (cfd->ioptions()->merge_operator != nullptr &&
|
if (cfd->ioptions()->merge_operator != nullptr &&
|
||||||
!cfd->mem()->IsMergeOperatorSupported()) {
|
!cfd->mem()->IsMergeOperatorSupported()) {
|
||||||
s = Status::InvalidArgument(
|
s = Status::InvalidArgument(
|
||||||
"The memtable of column family %s does not support merge operator "
|
"The memtable of column family %s does not support merge "
|
||||||
|
"operator "
|
||||||
"its options.merge_operator is non-null",
|
"its options.merge_operator is non-null",
|
||||||
cfd->GetName().c_str());
|
cfd->GetName().c_str());
|
||||||
}
|
}
|
||||||
|
@ -33,7 +33,8 @@ DBImplSecondary::~DBImplSecondary() {}
|
|||||||
Status DBImplSecondary::Recover(
|
Status DBImplSecondary::Recover(
|
||||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||||
bool /*readonly*/, bool /*error_if_wal_file_exists*/,
|
bool /*readonly*/, bool /*error_if_wal_file_exists*/,
|
||||||
bool /*error_if_data_exists_in_wals*/, uint64_t*) {
|
bool /*error_if_data_exists_in_wals*/, uint64_t*,
|
||||||
|
RecoveryContext* /*recovery_ctx*/) {
|
||||||
mutex_.AssertHeld();
|
mutex_.AssertHeld();
|
||||||
|
|
||||||
JobContext job_context(0);
|
JobContext job_context(0);
|
||||||
|
@ -81,8 +81,8 @@ class DBImplSecondary : public DBImpl {
|
|||||||
// and log_readers_ to facilitate future operations.
|
// and log_readers_ to facilitate future operations.
|
||||||
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
|
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||||
bool read_only, bool error_if_wal_file_exists,
|
bool read_only, bool error_if_wal_file_exists,
|
||||||
bool error_if_data_exists_in_wals,
|
bool error_if_data_exists_in_wals, uint64_t* = nullptr,
|
||||||
uint64_t* = nullptr) override;
|
RecoveryContext* recovery_ctx = nullptr) override;
|
||||||
|
|
||||||
// Implementations of the DB interface
|
// Implementations of the DB interface
|
||||||
using DB::Get;
|
using DB::Get;
|
||||||
|
@ -287,7 +287,6 @@ TEST_F(DBWALTest, Recover) {
|
|||||||
|
|
||||||
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
||||||
ASSERT_EQ("v1", Get(1, "foo"));
|
ASSERT_EQ("v1", Get(1, "foo"));
|
||||||
|
|
||||||
ASSERT_EQ("v1", Get(1, "foo"));
|
ASSERT_EQ("v1", Get(1, "foo"));
|
||||||
ASSERT_EQ("v5", Get(1, "baz"));
|
ASSERT_EQ("v5", Get(1, "baz"));
|
||||||
ASSERT_OK(Put(1, "bar", "v2"));
|
ASSERT_OK(Put(1, "bar", "v2"));
|
||||||
|
@ -1428,7 +1428,7 @@ public class RocksDBTest {
|
|||||||
assertThat(livefiles.manifestFileSize).isEqualTo(59);
|
assertThat(livefiles.manifestFileSize).isEqualTo(59);
|
||||||
assertThat(livefiles.files.size()).isEqualTo(3);
|
assertThat(livefiles.files.size()).isEqualTo(3);
|
||||||
assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT");
|
assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT");
|
||||||
assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000004");
|
assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000005");
|
||||||
assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000007");
|
assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000007");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -604,10 +604,14 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) {
|
|||||||
dbfull()->TEST_WaitForStatsDumpRun(
|
dbfull()->TEST_WaitForStatsDumpRun(
|
||||||
[&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
|
[&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
|
||||||
// writing to all three cf, flush default cf
|
// writing to all three cf, flush default cf
|
||||||
// LogNumbers: default: 14, stats: 4, pikachu: 4
|
// LogNumbers: default: 16, stats: 10, pikachu: 5
|
||||||
|
// Since in recovery process, cfd_stats column is created after WAL is
|
||||||
|
// created, synced and MANIFEST is persisted, its log number which depends on
|
||||||
|
// logfile_number_ will be different. Since "pikachu" is never flushed, thus
|
||||||
|
// its log_number should be the smallest of the three.
|
||||||
ASSERT_OK(Flush());
|
ASSERT_OK(Flush());
|
||||||
ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber());
|
ASSERT_LT(cfd_test->GetLogNumber(), cfd_stats->GetLogNumber());
|
||||||
ASSERT_LT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber());
|
ASSERT_LT(cfd_test->GetLogNumber(), cfd_default->GetLogNumber());
|
||||||
|
|
||||||
ASSERT_OK(Put("foo1", "v1"));
|
ASSERT_OK(Put("foo1", "v1"));
|
||||||
ASSERT_OK(Put("bar1", "v1"));
|
ASSERT_OK(Put("bar1", "v1"));
|
||||||
|
Loading…
Reference in New Issue
Block a user