2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-16 01:03:42 +02:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 23:59:46 +02:00
|
|
|
//
|
2011-03-18 23:37:00 +01:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/version_edit.h"
|
|
|
|
|
2020-03-12 18:58:27 +01:00
|
|
|
#include "db/blob/blob_index.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "db/version_set.h"
|
2019-06-01 02:19:43 +02:00
|
|
|
#include "logging/event_logger.h"
|
2017-04-06 23:49:13 +02:00
|
|
|
#include "rocksdb/slice.h"
|
2019-05-31 02:39:43 +02:00
|
|
|
#include "test_util/sync_point.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
#include "util/coding.h"
|
2017-04-06 23:49:13 +02:00
|
|
|
#include "util/string_util.h"
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2020-02-25 03:38:00 +01:00
|
|
|
|
|
|
|
namespace {
|
2019-09-03 17:50:47 +02:00
|
|
|
|
2020-02-25 03:38:00 +01:00
|
|
|
} // anonymous namespace
|
2015-10-03 02:32:46 +02:00
|
|
|
|
2014-07-02 18:54:20 +02:00
|
|
|
// Combines a file number and a path id into a single 64-bit value.
//
// `number` must not exceed kFileNumberMask (checked by assert only). The
// path id is scaled by (kFileNumberMask + 1) and OR-ed with the file number.
// NOTE(review): this presumably places path_id in the bits above the mask,
// which holds only if kFileNumberMask has the form 2^k - 1 -- confirm against
// its declaration.
uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
  assert(number <= kFileNumberMask);
  const uint64_t scaled_path_id = path_id * (kFileNumberMask + 1);
  return number | scaled_path_id;
}
|
|
|
|
|
2022-04-16 05:25:48 +02:00
|
|
|
// Folds one key/value entry into this file's metadata.
//
// For entries of type kTypeBlobIndex, `value` is decoded as a BlobIndex; a
// decode failure is returned to the caller unchanged. When the decoded index
// is neither inlined nor TTL-based, its file number must not be
// kInvalidBlobFileNumber (otherwise Status::Corruption is returned), and
// oldest_blob_file_number is lowered to it if it is unset or larger.
//
// In all non-error cases: `smallest` is set from `key` only if it is still
// empty, `largest` is always overwritten with `key`, and the smallest/largest
// sequence numbers in `fd` are widened to include `seqno`.
//
// Returns Status::OK() on success. On an error return nothing has been
// modified, because both error exits precede every assignment.
Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
                                      SequenceNumber seqno,
                                      ValueType value_type) {
  if (value_type == kTypeBlobIndex) {
    BlobIndex blob_index;
    const Status decode_status = blob_index.DecodeFrom(value);
    if (!decode_status.ok()) {
      return decode_status;
    }

    // Only indexes that are neither inlined nor TTL-based take part in the
    // oldest-blob-file bookkeeping.
    const bool skip_blob_tracking =
        blob_index.IsInlined() || blob_index.HasTTL();
    if (!skip_blob_tracking) {
      const auto blob_file_number = blob_index.file_number();
      if (blob_file_number == kInvalidBlobFileNumber) {
        return Status::Corruption("Invalid blob file number");
      }

      const bool oldest_unset =
          oldest_blob_file_number == kInvalidBlobFileNumber;
      if (oldest_unset || oldest_blob_file_number > blob_file_number) {
        oldest_blob_file_number = blob_file_number;
      }
    }
  }

  // The first key seen becomes `smallest`; every key replaces `largest`.
  if (smallest.size() == 0) {
    smallest.DecodeFrom(key);
  }
  largest.DecodeFrom(key);

  fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
  fd.largest_seqno = std::max(fd.largest_seqno, seqno);

  return Status::OK();
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
void VersionEdit::Clear() {
|
2020-02-07 22:25:07 +01:00
|
|
|
max_level_ = 0;
|
2019-09-03 17:50:47 +02:00
|
|
|
db_id_.clear();
|
2011-03-18 23:37:00 +01:00
|
|
|
comparator_.clear();
|
|
|
|
log_number_ = 0;
|
2011-04-12 21:38:58 +02:00
|
|
|
prev_log_number_ = 0;
|
2011-03-18 23:37:00 +01:00
|
|
|
next_file_number_ = 0;
|
2014-03-05 21:13:44 +01:00
|
|
|
max_column_family_ = 0;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
min_log_number_to_keep_ = 0;
|
2020-02-07 22:25:07 +01:00
|
|
|
last_sequence_ = 0;
|
2019-09-03 17:50:47 +02:00
|
|
|
has_db_id_ = false;
|
2011-03-18 23:37:00 +01:00
|
|
|
has_comparator_ = false;
|
|
|
|
has_log_number_ = false;
|
2011-04-12 21:38:58 +02:00
|
|
|
has_prev_log_number_ = false;
|
2011-03-18 23:37:00 +01:00
|
|
|
has_next_file_number_ = false;
|
2014-03-05 21:13:44 +01:00
|
|
|
has_max_column_family_ = false;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
has_min_log_number_to_keep_ = false;
|
2020-02-07 22:25:07 +01:00
|
|
|
has_last_sequence_ = false;
|
2011-03-18 23:37:00 +01:00
|
|
|
deleted_files_.clear();
|
|
|
|
new_files_.clear();
|
2020-03-11 01:24:38 +01:00
|
|
|
blob_file_additions_.clear();
|
|
|
|
blob_file_garbages_.clear();
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
wal_additions_.clear();
|
2020-11-07 01:30:44 +01:00
|
|
|
wal_deletion_.Reset();
|
2013-12-12 02:46:26 +01:00
|
|
|
column_family_ = 0;
|
2020-02-07 22:25:07 +01:00
|
|
|
is_column_family_add_ = false;
|
|
|
|
is_column_family_drop_ = false;
|
2013-12-12 02:46:26 +01:00
|
|
|
column_family_name_.clear();
|
2018-08-20 23:54:03 +02:00
|
|
|
is_in_atomic_group_ = false;
|
|
|
|
remaining_entries_ = 0;
|
2020-12-05 23:17:11 +01:00
|
|
|
full_history_ts_low_.clear();
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
2014-10-28 22:27:26 +01:00
|
|
|
bool VersionEdit::EncodeTo(std::string* dst) const {
|
2019-09-03 17:50:47 +02:00
|
|
|
if (has_db_id_) {
|
|
|
|
PutVarint32(dst, kDbId);
|
|
|
|
PutLengthPrefixedSlice(dst, db_id_);
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
if (has_comparator_) {
|
|
|
|
PutVarint32(dst, kComparator);
|
|
|
|
PutLengthPrefixedSlice(dst, comparator_);
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
2016-06-13 18:57:43 +02:00
|
|
|
PutVarint32Varint64(dst, kLogNumber, log_number_);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2011-04-12 21:38:58 +02:00
|
|
|
if (has_prev_log_number_) {
|
2016-06-13 18:57:43 +02:00
|
|
|
PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
|
2011-04-12 21:38:58 +02:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
if (has_next_file_number_) {
|
2016-06-13 18:57:43 +02:00
|
|
|
PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2014-03-05 21:13:44 +01:00
|
|
|
if (has_max_column_family_) {
|
2016-06-13 18:57:43 +02:00
|
|
|
PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
|
2014-03-05 21:13:44 +01:00
|
|
|
}
|
2022-04-01 05:00:52 +02:00
|
|
|
if (has_min_log_number_to_keep_) {
|
|
|
|
PutVarint32Varint64(dst, kMinLogNumberToKeep, min_log_number_to_keep_);
|
|
|
|
}
|
2020-02-07 22:25:07 +01:00
|
|
|
if (has_last_sequence_) {
|
|
|
|
PutVarint32Varint64(dst, kLastSequence, last_sequence_);
|
|
|
|
}
|
2013-12-31 03:33:57 +01:00
|
|
|
for (const auto& deleted : deleted_files_) {
|
2016-06-13 18:57:43 +02:00
|
|
|
PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
|
|
|
|
deleted.second /* file number */);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
bool min_log_num_written = false;
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
const FileMetaData& f = new_files_[i].second;
|
2014-10-29 19:21:51 +01:00
|
|
|
if (!f.smallest.Valid() || !f.largest.Valid()) {
|
|
|
|
return false;
|
|
|
|
}
|
2019-11-23 01:01:21 +01:00
|
|
|
PutVarint32(dst, kNewFile4);
|
2016-06-13 18:57:43 +02:00
|
|
|
PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
|
2014-06-14 00:54:19 +02:00
|
|
|
PutVarint64(dst, f.fd.GetFileSize());
|
2011-03-18 23:37:00 +01:00
|
|
|
PutLengthPrefixedSlice(dst, f.smallest.Encode());
|
|
|
|
PutLengthPrefixedSlice(dst, f.largest.Encode());
|
2018-07-28 01:00:26 +02:00
|
|
|
PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
|
2019-11-23 01:01:21 +01:00
|
|
|
// Customized fields' format:
|
|
|
|
// +-----------------------------+
|
|
|
|
// | 1st field's tag (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | 1st field's size (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | bytes for 1st field |
|
|
|
|
// | (based on size decoded) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | |
|
|
|
|
// | ...... |
|
|
|
|
// | |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | last field's size (varint32)|
|
|
|
|
// +-----------------------------+
|
|
|
|
// | bytes for last field |
|
|
|
|
// | (based on size decoded) |
|
|
|
|
// +-----------------------------+
|
|
|
|
// | terminating tag (varint32) |
|
|
|
|
// +-----------------------------+
|
|
|
|
//
|
|
|
|
// Customized encoding for fields:
|
|
|
|
// tag kPathId: 1 byte as path_id
|
|
|
|
// tag kNeedCompaction:
|
|
|
|
// now only can take one char value 1 indicating need-compaction
|
|
|
|
//
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
|
2019-11-23 01:01:21 +01:00
|
|
|
std::string varint_oldest_ancester_time;
|
|
|
|
PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
|
|
|
|
&varint_oldest_ancester_time);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
|
2019-11-27 06:38:38 +01:00
|
|
|
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
|
2019-11-27 06:38:38 +01:00
|
|
|
std::string varint_file_creation_time;
|
|
|
|
PutVarint64(&varint_file_creation_time, f.file_creation_time);
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
|
|
|
|
&varint_file_creation_time);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
|
|
|
|
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileChecksum);
|
2020-02-11 00:42:46 +01:00
|
|
|
PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
|
|
|
|
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
|
2020-02-11 00:42:46 +01:00
|
|
|
PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
|
|
|
|
|
2021-11-10 19:47:53 +01:00
|
|
|
if (f.max_timestamp != kDisableUserTimestamp) {
|
|
|
|
if (f.min_timestamp.size() != f.max_timestamp.size()) {
|
|
|
|
assert(false);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kMinTimestamp);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(f.min_timestamp));
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kMaxTimestamp);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(f.max_timestamp));
|
|
|
|
}
|
2019-11-23 01:01:21 +01:00
|
|
|
if (f.fd.GetPathId() != 0) {
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kPathId);
|
2019-11-23 01:01:21 +01:00
|
|
|
char p = static_cast<char>(f.fd.GetPathId());
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
2015-10-03 02:32:46 +02:00
|
|
|
}
|
2021-05-18 00:14:34 +02:00
|
|
|
if (f.temperature != Temperature::kUnknown) {
|
|
|
|
PutVarint32(dst, NewFileCustomTag::kTemperature);
|
|
|
|
char p = static_cast<char>(f.temperature);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
|
|
|
}
|
2019-11-23 01:01:21 +01:00
|
|
|
if (f.marked_for_compaction) {
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
|
2019-11-23 01:01:21 +01:00
|
|
|
char p = static_cast<char>(1);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(&p, 1));
|
|
|
|
}
|
|
|
|
if (has_min_log_number_to_keep_ && !min_log_num_written) {
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
|
2019-11-23 01:01:21 +01:00
|
|
|
std::string varint_log_number;
|
|
|
|
PutFixed64(&varint_log_number, min_log_number_to_keep_);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(varint_log_number));
|
|
|
|
min_log_num_written = true;
|
|
|
|
}
|
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
|
2019-11-23 01:01:21 +01:00
|
|
|
std::string oldest_blob_file_number;
|
|
|
|
PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
|
|
|
|
dst);
|
|
|
|
|
2020-02-25 03:38:00 +01:00
|
|
|
PutVarint32(dst, NewFileCustomTag::kTerminate);
|
|
|
|
}
|
|
|
|
|
2020-03-11 01:24:38 +01:00
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
PutVarint32(dst, kBlobFileAddition);
|
|
|
|
blob_file_addition.EncodeTo(dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
|
|
|
PutVarint32(dst, kBlobFileGarbage);
|
|
|
|
blob_file_garbage.EncodeTo(dst);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2013-12-12 02:46:26 +01:00
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
2021-01-20 04:26:05 +01:00
|
|
|
PutVarint32(dst, kWalAddition2);
|
|
|
|
std::string encoded;
|
|
|
|
wal_addition.EncodeTo(&encoded);
|
|
|
|
PutLengthPrefixedSlice(dst, encoded);
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
}
|
|
|
|
|
2020-11-07 01:30:44 +01:00
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
2021-01-20 04:26:05 +01:00
|
|
|
PutVarint32(dst, kWalDeletion2);
|
|
|
|
std::string encoded;
|
|
|
|
wal_deletion_.EncodeTo(&encoded);
|
|
|
|
PutLengthPrefixedSlice(dst, encoded);
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
}
|
|
|
|
|
2013-12-12 02:46:26 +01:00
|
|
|
// 0 is default and does not need to be explicitly written
|
|
|
|
if (column_family_ != 0) {
|
2016-06-13 18:57:43 +02:00
|
|
|
PutVarint32Varint32(dst, kColumnFamily, column_family_);
|
2013-12-12 02:46:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
PutVarint32(dst, kColumnFamilyAdd);
|
|
|
|
PutLengthPrefixedSlice(dst, Slice(column_family_name_));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
PutVarint32(dst, kColumnFamilyDrop);
|
|
|
|
}
|
2018-08-20 23:54:03 +02:00
|
|
|
|
|
|
|
if (is_in_atomic_group_) {
|
|
|
|
PutVarint32(dst, kInAtomicGroup);
|
|
|
|
PutVarint32(dst, remaining_entries_);
|
|
|
|
}
|
2020-12-05 23:17:11 +01:00
|
|
|
|
|
|
|
if (HasFullHistoryTsLow()) {
|
|
|
|
PutVarint32(dst, kFullHistoryTsLow);
|
|
|
|
PutLengthPrefixedSlice(dst, full_history_ts_low_);
|
|
|
|
}
|
2014-10-28 22:27:26 +01:00
|
|
|
return true;
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool GetInternalKey(Slice* input, InternalKey* dst) {
|
|
|
|
Slice str;
|
|
|
|
if (GetLengthPrefixedSlice(input, &str)) {
|
|
|
|
dst->DecodeFrom(str);
|
[fix] SIGSEGV when VersionEdit in MANIFEST is corrupted
Summary:
This was reported by our customers in task #4295529.
Cause:
* MANIFEST file contains a VersionEdit, which contains file entries whose 'smallest' and 'largest' internal keys are empty. String with zero characters. Root cause of corruption was not investigated. We should report corruption when this happens. However, we currently SIGSEGV.
Here's what happens:
* VersionEdit encodes zero-strings happily and stores them in smallest and largest InternalKeys. InternalKey::Encode() does assert when `rep_.empty()`, but we don't assert in production environemnts. Also, we should never assert as a result of DB corruption.
* As part of our ConsistencyCheck, we call GetLiveFilesMetaData()
* GetLiveFilesMetadata() calls `file->largest.user_key().ToString()`
* user_key() function does: 1. assert(size > 8) (ooops, no assert), 2. returns `Slice(internal_key.data(), internal_key.size() - 8)`
* since `internal_key.size()` is unsigned int, this call translates to `Slice(whatever, 1298471928561892576182756)`. Bazinga.
Fix:
* VersionEdit checks if InternalKey is valid in `VersionEdit::GetInternalKey()`. If it's invalid, returns corruption.
Lessons learned:
* Always keep in mind that even if you `assert()`, production code will continue execution even if assert fails.
* Never `assert` based on DB corruption. Assert only if the code should guarantee that assert can't fail.
Test Plan: dumped offending manifest. Before: assert. Now: corruption
Reviewers: dhruba, haobo, sdong
Reviewed By: dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18507
2014-05-08 01:52:12 +02:00
|
|
|
return dst->Valid();
|
2011-03-18 23:37:00 +01:00
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-05 22:08:17 +01:00
|
|
|
// Reads a varint32-encoded level from `input` into `*level` and raises
// max_level_ when the decoded level exceeds it.
// Returns false if the varint cannot be read. The third parameter is unused.
bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
  uint32_t decoded = 0;
  if (!GetVarint32(input, &decoded)) {
    return false;
  }
  *level = decoded;
  // Track the highest level seen so far by this edit.
  if (*level > max_level_) {
    max_level_ = *level;
  }
  return true;
}
|
|
|
|
|
2015-10-03 02:32:46 +02:00
|
|
|
const char* VersionEdit::DecodeNewFile4From(Slice* input) {
|
|
|
|
const char* msg = nullptr;
|
2020-02-07 22:25:07 +01:00
|
|
|
int level = 0;
|
2015-10-03 02:32:46 +02:00
|
|
|
FileMetaData f;
|
2020-02-07 22:25:07 +01:00
|
|
|
uint64_t number = 0;
|
2015-10-03 02:32:46 +02:00
|
|
|
uint32_t path_id = 0;
|
2020-02-07 22:25:07 +01:00
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
2015-10-03 02:32:46 +02:00
|
|
|
if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
|
|
|
|
GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
|
|
|
|
GetInternalKey(input, &f.largest) &&
|
2018-07-28 01:00:26 +02:00
|
|
|
GetVarint64(input, &smallest_seqno) &&
|
|
|
|
GetVarint64(input, &largest_seqno)) {
|
2015-10-03 02:32:46 +02:00
|
|
|
// See comments in VersionEdit::EncodeTo() for format of customized fields
|
|
|
|
while (true) {
|
2020-02-07 22:25:07 +01:00
|
|
|
uint32_t custom_tag = 0;
|
2015-10-03 02:32:46 +02:00
|
|
|
Slice field;
|
|
|
|
if (!GetVarint32(input, &custom_tag)) {
|
|
|
|
return "new-file4 custom field";
|
|
|
|
}
|
|
|
|
if (custom_tag == kTerminate) {
|
2021-11-10 19:47:53 +01:00
|
|
|
if (f.min_timestamp.size() != f.max_timestamp.size()) {
|
|
|
|
assert(false);
|
|
|
|
return "new-file4 custom field timestamp size mismatch error";
|
|
|
|
}
|
2015-10-03 02:32:46 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!GetLengthPrefixedSlice(input, &field)) {
|
2019-01-02 20:15:01 +01:00
|
|
|
return "new-file4 custom field length prefixed slice error";
|
2015-10-03 02:32:46 +02:00
|
|
|
}
|
|
|
|
switch (custom_tag) {
|
|
|
|
case kPathId:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "path_id field wrong size";
|
|
|
|
}
|
|
|
|
path_id = field[0];
|
|
|
|
if (path_id > 3) {
|
|
|
|
return "path_id wrong vaue";
|
|
|
|
}
|
|
|
|
break;
|
2019-11-23 01:01:21 +01:00
|
|
|
case kOldestAncesterTime:
|
|
|
|
if (!GetVarint64(&field, &f.oldest_ancester_time)) {
|
|
|
|
return "invalid oldest ancester time";
|
|
|
|
}
|
|
|
|
break;
|
2019-11-27 06:38:38 +01:00
|
|
|
case kFileCreationTime:
|
|
|
|
if (!GetVarint64(&field, &f.file_creation_time)) {
|
|
|
|
return "invalid file creation time";
|
|
|
|
}
|
|
|
|
break;
|
2020-02-11 00:42:46 +01:00
|
|
|
case kFileChecksum:
|
|
|
|
f.file_checksum = field.ToString();
|
|
|
|
break;
|
|
|
|
case kFileChecksumFuncName:
|
|
|
|
f.file_checksum_func_name = field.ToString();
|
|
|
|
break;
|
2015-10-03 02:32:46 +02:00
|
|
|
case kNeedCompaction:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "need_compaction field wrong size";
|
|
|
|
}
|
|
|
|
f.marked_for_compaction = (field[0] == 1);
|
|
|
|
break;
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
case kMinLogNumberToKeepHack:
|
|
|
|
// This is a hack to encode kMinLogNumberToKeep in a
|
2018-08-10 01:49:45 +02:00
|
|
|
// forward-compatible fashion.
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
if (!GetFixed64(&field, &min_log_number_to_keep_)) {
|
|
|
|
return "deleted log number malformatted";
|
|
|
|
}
|
|
|
|
has_min_log_number_to_keep_ = true;
|
|
|
|
break;
|
2019-10-15 00:19:31 +02:00
|
|
|
case kOldestBlobFileNumber:
|
|
|
|
if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
|
|
|
|
return "invalid oldest blob file number";
|
|
|
|
}
|
|
|
|
break;
|
2021-05-18 00:14:34 +02:00
|
|
|
case kTemperature:
|
|
|
|
if (field.size() != 1) {
|
|
|
|
return "temperature field wrong size";
|
|
|
|
} else {
|
|
|
|
Temperature casted_field = static_cast<Temperature>(field[0]);
|
|
|
|
if (casted_field <= Temperature::kCold) {
|
|
|
|
f.temperature = casted_field;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2021-11-10 19:47:53 +01:00
|
|
|
case kMinTimestamp:
|
|
|
|
f.min_timestamp = field.ToString();
|
|
|
|
break;
|
|
|
|
case kMaxTimestamp:
|
|
|
|
f.max_timestamp = field.ToString();
|
|
|
|
break;
|
2015-10-03 02:32:46 +02:00
|
|
|
default:
|
|
|
|
if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
|
|
|
|
// Should not proceed if cannot understand it
|
|
|
|
return "new-file4 custom field not supported";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return "new-file4 entry";
|
|
|
|
}
|
2018-07-28 01:00:26 +02:00
|
|
|
f.fd =
|
|
|
|
FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
|
2015-10-03 02:32:46 +02:00
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
Status VersionEdit::DecodeFrom(const Slice& src) {
|
|
|
|
Clear();
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
2021-01-20 04:26:05 +01:00
|
|
|
#ifndef NDEBUG
|
|
|
|
bool ignore_ignorable_tags = false;
|
|
|
|
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags",
|
|
|
|
&ignore_ignorable_tags);
|
|
|
|
#endif
|
2011-03-18 23:37:00 +01:00
|
|
|
Slice input = src;
|
2013-03-01 03:04:58 +01:00
|
|
|
const char* msg = nullptr;
|
2020-02-07 22:25:07 +01:00
|
|
|
uint32_t tag = 0;
|
2011-03-18 23:37:00 +01:00
|
|
|
|
|
|
|
// Temporary storage for parsing
|
2020-02-07 22:25:07 +01:00
|
|
|
int level = 0;
|
2011-03-18 23:37:00 +01:00
|
|
|
FileMetaData f;
|
|
|
|
Slice str;
|
|
|
|
InternalKey key;
|
2013-03-01 03:04:58 +01:00
|
|
|
while (msg == nullptr && GetVarint32(&input, &tag)) {
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
2021-01-20 04:26:05 +01:00
|
|
|
#ifndef NDEBUG
|
|
|
|
if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) {
|
|
|
|
tag = kTagSafeIgnoreMask;
|
|
|
|
}
|
|
|
|
#endif
|
2011-03-18 23:37:00 +01:00
|
|
|
switch (tag) {
|
2019-09-03 17:50:47 +02:00
|
|
|
case kDbId:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
db_id_ = str.ToString();
|
|
|
|
has_db_id_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "db id";
|
|
|
|
}
|
|
|
|
break;
|
2011-03-18 23:37:00 +01:00
|
|
|
case kComparator:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
comparator_ = str.ToString();
|
|
|
|
has_comparator_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "comparator name";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kLogNumber:
|
|
|
|
if (GetVarint64(&input, &log_number_)) {
|
|
|
|
has_log_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "log number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2011-04-12 21:38:58 +02:00
|
|
|
case kPrevLogNumber:
|
|
|
|
if (GetVarint64(&input, &prev_log_number_)) {
|
|
|
|
has_prev_log_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "previous log number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
case kNextFileNumber:
|
|
|
|
if (GetVarint64(&input, &next_file_number_)) {
|
|
|
|
has_next_file_number_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "next file number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2014-03-05 21:13:44 +01:00
|
|
|
case kMaxColumnFamily:
|
|
|
|
if (GetVarint32(&input, &max_column_family_)) {
|
|
|
|
has_max_column_family_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "max column family";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using a safely ignorable custom field of the AddFile entry, in order to guarantee that a DB generated by a newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
case kMinLogNumberToKeep:
|
|
|
|
if (GetVarint64(&input, &min_log_number_to_keep_)) {
|
|
|
|
has_min_log_number_to_keep_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "min log number to kee";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2020-02-07 22:25:07 +01:00
|
|
|
case kLastSequence:
|
|
|
|
if (GetVarint64(&input, &last_sequence_)) {
|
|
|
|
has_last_sequence_ = true;
|
|
|
|
} else {
|
|
|
|
msg = "last sequence number";
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
case kCompactPointer:
|
2013-01-24 19:54:26 +01:00
|
|
|
if (GetLevel(&input, &level, &msg) &&
|
2011-03-18 23:37:00 +01:00
|
|
|
GetInternalKey(&input, &key)) {
|
2014-01-16 23:06:53 +01:00
|
|
|
// we don't use compact pointers anymore,
|
|
|
|
// but we should not fail if they are still
|
|
|
|
// in manifest
|
2011-03-18 23:37:00 +01:00
|
|
|
} else {
|
2013-01-24 19:54:26 +01:00
|
|
|
if (!msg) {
|
|
|
|
msg = "compaction pointer";
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2014-10-31 19:59:54 +01:00
|
|
|
case kDeletedFile: {
|
2020-02-07 22:25:07 +01:00
|
|
|
uint64_t number = 0;
|
2014-10-31 19:59:54 +01:00
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
|
2011-03-18 23:37:00 +01:00
|
|
|
deleted_files_.insert(std::make_pair(level, number));
|
|
|
|
} else {
|
2013-01-24 19:54:26 +01:00
|
|
|
if (!msg) {
|
|
|
|
msg = "deleted file";
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
break;
|
2014-10-31 19:59:54 +01:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
|
2014-06-14 00:54:19 +02:00
|
|
|
case kNewFile: {
|
2020-02-07 22:25:07 +01:00
|
|
|
uint64_t number = 0;
|
|
|
|
uint64_t file_size = 0;
|
2014-06-14 00:54:19 +02:00
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint64(&input, &file_size) &&
|
2011-03-18 23:37:00 +01:00
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest)) {
|
2014-07-02 18:54:20 +02:00
|
|
|
f.fd = FileDescriptor(number, 0, file_size);
|
2011-03-18 23:37:00 +01:00
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
2013-01-24 19:54:26 +01:00
|
|
|
if (!msg) {
|
|
|
|
msg = "new-file entry";
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
|
|
|
break;
|
2014-06-14 00:54:19 +02:00
|
|
|
}
|
|
|
|
case kNewFile2: {
|
2020-02-07 22:25:07 +01:00
|
|
|
uint64_t number = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
2014-06-14 00:54:19 +02:00
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint64(&input, &file_size) &&
|
2013-06-14 07:09:08 +02:00
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest) &&
|
2018-07-28 01:00:26 +02:00
|
|
|
GetVarint64(&input, &smallest_seqno) &&
|
|
|
|
GetVarint64(&input, &largest_seqno)) {
|
|
|
|
f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
|
|
|
|
largest_seqno);
|
2014-07-02 18:54:20 +02:00
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "new-file2 entry";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case kNewFile3: {
|
2020-02-07 22:25:07 +01:00
|
|
|
uint64_t number = 0;
|
|
|
|
uint32_t path_id = 0;
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
SequenceNumber smallest_seqno = 0;
|
|
|
|
SequenceNumber largest_seqno = kMaxSequenceNumber;
|
2014-07-02 18:54:20 +02:00
|
|
|
if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
|
|
|
|
GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
|
|
|
|
GetInternalKey(&input, &f.smallest) &&
|
|
|
|
GetInternalKey(&input, &f.largest) &&
|
2018-07-28 01:00:26 +02:00
|
|
|
GetVarint64(&input, &smallest_seqno) &&
|
|
|
|
GetVarint64(&input, &largest_seqno)) {
|
|
|
|
f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
|
|
|
|
largest_seqno);
|
2013-06-14 07:09:08 +02:00
|
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
2014-10-23 19:41:58 +02:00
|
|
|
msg = "new-file3 entry";
|
2013-06-14 07:09:08 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2014-06-14 00:54:19 +02:00
|
|
|
}
|
2013-06-14 07:09:08 +02:00
|
|
|
|
2015-10-03 02:32:46 +02:00
|
|
|
case kNewFile4: {
|
|
|
|
msg = DecodeNewFile4From(&input);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2021-01-21 05:27:19 +01:00
|
|
|
case kBlobFileAddition:
|
|
|
|
case kBlobFileAddition_DEPRECATED: {
|
2020-03-11 01:24:38 +01:00
|
|
|
BlobFileAddition blob_file_addition;
|
|
|
|
const Status s = blob_file_addition.DecodeFrom(&input);
|
2020-02-25 03:38:00 +01:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-09-15 06:10:09 +02:00
|
|
|
AddBlobFile(std::move(blob_file_addition));
|
2020-03-11 01:24:38 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2021-01-21 05:27:19 +01:00
|
|
|
case kBlobFileGarbage:
|
|
|
|
case kBlobFileGarbage_DEPRECATED: {
|
2020-03-11 01:24:38 +01:00
|
|
|
BlobFileGarbage blob_file_garbage;
|
|
|
|
const Status s = blob_file_garbage.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-09-15 06:10:09 +02:00
|
|
|
AddBlobFileGarbage(std::move(blob_file_garbage));
|
2020-02-25 03:38:00 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
case kWalAddition: {
|
|
|
|
WalAddition wal_addition;
|
|
|
|
const Status s = wal_addition.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_additions_.emplace_back(std::move(wal_addition));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
2021-01-20 04:26:05 +01:00
|
|
|
case kWalAddition2: {
|
|
|
|
Slice encoded;
|
|
|
|
if (!GetLengthPrefixedSlice(&input, &encoded)) {
|
|
|
|
msg = "WalAddition not prefixed by length";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
WalAddition wal_addition;
|
|
|
|
const Status s = wal_addition.DecodeFrom(&encoded);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_additions_.emplace_back(std::move(wal_addition));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
case kWalDeletion: {
|
|
|
|
WalDeletion wal_deletion;
|
|
|
|
const Status s = wal_deletion.DecodeFrom(&input);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-11-07 01:30:44 +01:00
|
|
|
wal_deletion_ = std::move(wal_deletion);
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be exactly the minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In the current design, a record is persisted to MANIFEST only when a WAL is created, closed, or deleted/archived, so the number of WAL-related records is bounded by 3x the number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a DB-level concept; they are not bound to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Make it able to ignore WAL related VersionEdits in older versions (#7873)
Summary:
Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST.
If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there are no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade.
Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873
Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries.
Reviewed By: ajkr
Differential Revision: D25935930
Pulled By: cheng-chang
fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb
2021-01-20 04:26:05 +01:00
|
|
|
case kWalDeletion2: {
|
|
|
|
Slice encoded;
|
|
|
|
if (!GetLengthPrefixedSlice(&input, &encoded)) {
|
|
|
|
msg = "WalDeletion not prefixed by length";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
WalDeletion wal_deletion;
|
|
|
|
const Status s = wal_deletion.DecodeFrom(&encoded);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
wal_deletion_ = std::move(wal_deletion);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2013-12-12 02:46:26 +01:00
|
|
|
case kColumnFamily:
|
|
|
|
if (!GetVarint32(&input, &column_family_)) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "set column family id";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kColumnFamilyAdd:
|
|
|
|
if (GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
is_column_family_add_ = true;
|
|
|
|
column_family_name_ = str.ToString();
|
|
|
|
} else {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "column family add";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case kColumnFamilyDrop:
|
|
|
|
is_column_family_drop_ = true;
|
|
|
|
break;
|
|
|
|
|
2018-08-20 23:54:03 +02:00
|
|
|
case kInAtomicGroup:
|
|
|
|
is_in_atomic_group_ = true;
|
|
|
|
if (!GetVarint32(&input, &remaining_entries_)) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "remaining entries";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2020-12-05 23:17:11 +01:00
|
|
|
case kFullHistoryTsLow:
|
|
|
|
if (!GetLengthPrefixedSlice(&input, &str)) {
|
|
|
|
msg = "full_history_ts_low";
|
|
|
|
} else if (str.empty()) {
|
|
|
|
msg = "full_history_ts_low: empty";
|
|
|
|
} else {
|
|
|
|
full_history_ts_low_.assign(str.data(), str.size());
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2011-03-18 23:37:00 +01:00
|
|
|
default:
|
2019-02-08 19:48:08 +01:00
|
|
|
if (tag & kTagSafeIgnoreMask) {
|
|
|
|
// Tag from future which can be safely ignored.
|
|
|
|
// The next field must be the length of the entry.
|
|
|
|
uint32_t field_len;
|
|
|
|
if (!GetVarint32(&input, &field_len) ||
|
|
|
|
static_cast<size_t>(field_len) > input.size()) {
|
|
|
|
if (!msg) {
|
|
|
|
msg = "safely ignoreable tag length error";
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
input.remove_prefix(static_cast<size_t>(field_len));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
msg = "unknown tag";
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-03-01 03:04:58 +01:00
|
|
|
if (msg == nullptr && !input.empty()) {
|
2011-03-18 23:37:00 +01:00
|
|
|
msg = "invalid tag";
|
|
|
|
}
|
|
|
|
|
|
|
|
Status result;
|
2013-03-01 03:04:58 +01:00
|
|
|
if (msg != nullptr) {
|
2011-03-18 23:37:00 +01:00
|
|
|
result = Status::Corruption("VersionEdit", msg);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2013-08-09 00:51:16 +02:00
|
|
|
std::string VersionEdit::DebugString(bool hex_key) const {
|
2011-03-18 23:37:00 +01:00
|
|
|
std::string r;
|
|
|
|
r.append("VersionEdit {");
|
2019-09-03 17:50:47 +02:00
|
|
|
if (has_db_id_) {
|
|
|
|
r.append("\n DB ID: ");
|
|
|
|
r.append(db_id_);
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
if (has_comparator_) {
|
|
|
|
r.append("\n Comparator: ");
|
|
|
|
r.append(comparator_);
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
|
|
|
r.append("\n LogNumber: ");
|
|
|
|
AppendNumberTo(&r, log_number_);
|
|
|
|
}
|
2011-04-12 21:38:58 +02:00
|
|
|
if (has_prev_log_number_) {
|
|
|
|
r.append("\n PrevLogNumber: ");
|
|
|
|
AppendNumberTo(&r, prev_log_number_);
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
if (has_next_file_number_) {
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
r.append("\n NextFileNumber: ");
|
2011-03-18 23:37:00 +01:00
|
|
|
AppendNumberTo(&r, next_file_number_);
|
|
|
|
}
|
2020-02-07 22:25:07 +01:00
|
|
|
if (has_max_column_family_) {
|
|
|
|
r.append("\n MaxColumnFamily: ");
|
|
|
|
AppendNumberTo(&r, max_column_family_);
|
|
|
|
}
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-04 00:35:11 +02:00
|
|
|
if (has_min_log_number_to_keep_) {
|
|
|
|
r.append("\n MinLogNumberToKeep: ");
|
|
|
|
AppendNumberTo(&r, min_log_number_to_keep_);
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
if (has_last_sequence_) {
|
|
|
|
r.append("\n LastSeq: ");
|
|
|
|
AppendNumberTo(&r, last_sequence_);
|
|
|
|
}
|
2020-02-07 22:25:07 +01:00
|
|
|
for (const auto& deleted_file : deleted_files_) {
|
2011-03-18 23:37:00 +01:00
|
|
|
r.append("\n DeleteFile: ");
|
2020-02-07 22:25:07 +01:00
|
|
|
AppendNumberTo(&r, deleted_file.first);
|
2011-03-18 23:37:00 +01:00
|
|
|
r.append(" ");
|
2020-02-07 22:25:07 +01:00
|
|
|
AppendNumberTo(&r, deleted_file.second);
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2011-04-21 00:48:11 +02:00
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
2011-03-18 23:37:00 +01:00
|
|
|
const FileMetaData& f = new_files_[i].second;
|
|
|
|
r.append("\n AddFile: ");
|
|
|
|
AppendNumberTo(&r, new_files_[i].first);
|
|
|
|
r.append(" ");
|
2014-06-14 00:54:19 +02:00
|
|
|
AppendNumberTo(&r, f.fd.GetNumber());
|
2011-03-18 23:37:00 +01:00
|
|
|
r.append(" ");
|
2014-06-14 00:54:19 +02:00
|
|
|
AppendNumberTo(&r, f.fd.GetFileSize());
|
2011-10-06 01:30:28 +02:00
|
|
|
r.append(" ");
|
2013-08-09 00:51:16 +02:00
|
|
|
r.append(f.smallest.DebugString(hex_key));
|
2011-10-06 01:30:28 +02:00
|
|
|
r.append(" .. ");
|
2013-08-09 00:51:16 +02:00
|
|
|
r.append(f.largest.DebugString(hex_key));
|
2019-10-15 00:19:31 +02:00
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
r.append(" blob_file:");
|
|
|
|
AppendNumberTo(&r, f.oldest_blob_file_number);
|
|
|
|
}
|
2021-11-22 18:29:30 +01:00
|
|
|
if (f.min_timestamp != kDisableUserTimestamp) {
|
|
|
|
assert(f.max_timestamp != kDisableUserTimestamp);
|
|
|
|
r.append(" min_timestamp:");
|
|
|
|
r.append(Slice(f.min_timestamp).ToString(true));
|
|
|
|
r.append(" max_timestamp:");
|
|
|
|
r.append(Slice(f.max_timestamp).ToString(true));
|
|
|
|
}
|
2019-11-23 01:01:21 +01:00
|
|
|
r.append(" oldest_ancester_time:");
|
|
|
|
AppendNumberTo(&r, f.oldest_ancester_time);
|
2019-11-27 06:38:38 +01:00
|
|
|
r.append(" file_creation_time:");
|
|
|
|
AppendNumberTo(&r, f.file_creation_time);
|
2020-02-11 00:42:46 +01:00
|
|
|
r.append(" file_checksum:");
|
2021-11-22 18:29:30 +01:00
|
|
|
r.append(Slice(f.file_checksum).ToString(true));
|
2020-02-11 00:42:46 +01:00
|
|
|
r.append(" file_checksum_func_name: ");
|
|
|
|
r.append(f.file_checksum_func_name);
|
2021-05-18 00:14:34 +02:00
|
|
|
if (f.temperature != Temperature::kUnknown) {
|
|
|
|
r.append(" temperature: ");
|
|
|
|
// Maybe change to human readable format whenthe feature becomes
|
|
|
|
// permanent
|
2022-05-06 22:03:58 +02:00
|
|
|
r.append(std::to_string(static_cast<int>(f.temperature)));
|
2021-05-18 00:14:34 +02:00
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
}
|
2020-02-25 03:38:00 +01:00
|
|
|
|
2020-03-11 01:24:38 +01:00
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
r.append("\n BlobFileAddition: ");
|
|
|
|
r.append(blob_file_addition.DebugString());
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
|
|
|
r.append("\n BlobFileGarbage: ");
|
|
|
|
r.append(blob_file_garbage.DebugString());
|
2020-02-25 03:38:00 +01:00
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
|
|
|
r.append("\n WalAddition: ");
|
|
|
|
r.append(wal_addition.DebugString());
|
|
|
|
}
|
|
|
|
|
2020-11-07 01:30:44 +01:00
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
r.append("\n WalDeletion: ");
|
2020-11-07 01:30:44 +01:00
|
|
|
r.append(wal_deletion_.DebugString());
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be the exactly minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In current design, a record is persisted to MANIFEST only when WAL is created, closed, or deleted/archived, so the number of WAL related records are bounded to 3x number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a concept of DB, they are not bounded to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
}
|
|
|
|
|
2013-12-12 02:46:26 +01:00
|
|
|
r.append("\n ColumnFamily: ");
|
|
|
|
AppendNumberTo(&r, column_family_);
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
r.append("\n ColumnFamilyAdd: ");
|
|
|
|
r.append(column_family_name_);
|
|
|
|
}
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
r.append("\n ColumnFamilyDrop");
|
|
|
|
}
|
2018-08-20 23:54:03 +02:00
|
|
|
if (is_in_atomic_group_) {
|
2018-12-14 00:10:16 +01:00
|
|
|
r.append("\n AtomicGroup: ");
|
2018-08-20 23:54:03 +02:00
|
|
|
AppendNumberTo(&r, remaining_entries_);
|
|
|
|
r.append(" entries remains");
|
|
|
|
}
|
2020-12-05 23:17:11 +01:00
|
|
|
if (HasFullHistoryTsLow()) {
|
|
|
|
r.append("\n FullHistoryTsLow: ");
|
|
|
|
r.append(Slice(full_history_ts_low_).ToString(hex_key));
|
|
|
|
}
|
2011-03-18 23:37:00 +01:00
|
|
|
r.append("\n}\n");
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage:**
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
|
|
|
|
JSONWriter jw;
|
|
|
|
jw << "EditNumber" << edit_num;
|
|
|
|
|
2019-09-03 17:50:47 +02:00
|
|
|
if (has_db_id_) {
|
|
|
|
jw << "DB ID" << db_id_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
if (has_comparator_) {
|
|
|
|
jw << "Comparator" << comparator_;
|
|
|
|
}
|
|
|
|
if (has_log_number_) {
|
|
|
|
jw << "LogNumber" << log_number_;
|
|
|
|
}
|
|
|
|
if (has_prev_log_number_) {
|
|
|
|
jw << "PrevLogNumber" << prev_log_number_;
|
|
|
|
}
|
|
|
|
if (has_next_file_number_) {
|
|
|
|
jw << "NextFileNumber" << next_file_number_;
|
|
|
|
}
|
2020-02-07 22:25:07 +01:00
|
|
|
if (has_max_column_family_) {
|
|
|
|
jw << "MaxColumnFamily" << max_column_family_;
|
|
|
|
}
|
|
|
|
if (has_min_log_number_to_keep_) {
|
|
|
|
jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage: **
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
if (has_last_sequence_) {
|
|
|
|
jw << "LastSeq" << last_sequence_;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!deleted_files_.empty()) {
|
|
|
|
jw << "DeletedFiles";
|
|
|
|
jw.StartArray();
|
|
|
|
|
2020-02-07 22:25:07 +01:00
|
|
|
for (const auto& deleted_file : deleted_files_) {
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage:**
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
jw.StartArrayedObject();
|
2020-02-07 22:25:07 +01:00
|
|
|
jw << "Level" << deleted_file.first;
|
|
|
|
jw << "FileNumber" << deleted_file.second;
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage:**
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!new_files_.empty()) {
|
|
|
|
jw << "AddedFiles";
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (size_t i = 0; i < new_files_.size(); i++) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << "Level" << new_files_[i].first;
|
|
|
|
const FileMetaData& f = new_files_[i].second;
|
|
|
|
jw << "FileNumber" << f.fd.GetNumber();
|
|
|
|
jw << "FileSize" << f.fd.GetFileSize();
|
|
|
|
jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
|
|
|
|
jw << "LargestIKey" << f.largest.DebugString(hex_key);
|
2021-11-10 19:47:53 +01:00
|
|
|
if (f.min_timestamp != kDisableUserTimestamp) {
|
|
|
|
assert(f.max_timestamp != kDisableUserTimestamp);
|
|
|
|
jw << "MinTimestamp" << Slice(f.min_timestamp).ToString(true);
|
|
|
|
jw << "MaxTimestamp" << Slice(f.max_timestamp).ToString(true);
|
|
|
|
}
|
2021-11-22 18:29:30 +01:00
|
|
|
jw << "OldestAncesterTime" << f.oldest_ancester_time;
|
|
|
|
jw << "FileCreationTime" << f.file_creation_time;
|
|
|
|
jw << "FileChecksum" << Slice(f.file_checksum).ToString(true);
|
|
|
|
jw << "FileChecksumFuncName" << f.file_checksum_func_name;
|
|
|
|
if (f.temperature != Temperature::kUnknown) {
|
2022-05-06 22:03:58 +02:00
|
|
|
jw << "temperature" << std::to_string(static_cast<int>(f.temperature));
|
2021-11-22 18:29:30 +01:00
|
|
|
}
|
2019-10-15 00:19:31 +02:00
|
|
|
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
jw << "OldestBlobFile" << f.oldest_blob_file_number;
|
|
|
|
}
|
2021-05-18 00:14:34 +02:00
|
|
|
if (f.temperature != Temperature::kUnknown) {
|
|
|
|
// Maybe change to human readable format when the feature becomes
|
|
|
|
// permanent
|
|
|
|
jw << "Temperature" << static_cast<int>(f.temperature);
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage:**
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
2020-03-11 01:24:38 +01:00
|
|
|
if (!blob_file_additions_.empty()) {
|
|
|
|
jw << "BlobFileAdditions";
|
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& blob_file_addition : blob_file_additions_) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << blob_file_addition;
|
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!blob_file_garbages_.empty()) {
|
|
|
|
jw << "BlobFileGarbages";
|
2020-02-25 03:38:00 +01:00
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
2020-03-11 01:24:38 +01:00
|
|
|
for (const auto& blob_file_garbage : blob_file_garbages_) {
|
2020-02-25 03:38:00 +01:00
|
|
|
jw.StartArrayedObject();
|
2020-03-11 01:24:38 +01:00
|
|
|
jw << blob_file_garbage;
|
2020-02-25 03:38:00 +01:00
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be exactly the minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In the current design, a record is persisted to MANIFEST only when a WAL is created, closed, or deleted/archived, so the number of WAL-related records is bounded by 3x the number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a DB-level concept; they are not bound to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
if (!wal_additions_.empty()) {
|
|
|
|
jw << "WalAdditions";
|
|
|
|
|
|
|
|
jw.StartArray();
|
|
|
|
|
|
|
|
for (const auto& wal_addition : wal_additions_) {
|
|
|
|
jw.StartArrayedObject();
|
|
|
|
jw << wal_addition;
|
|
|
|
jw.EndArrayedObject();
|
|
|
|
}
|
|
|
|
|
|
|
|
jw.EndArray();
|
|
|
|
}
|
|
|
|
|
2020-11-07 01:30:44 +01:00
|
|
|
if (!wal_deletion_.IsEmpty()) {
|
|
|
|
jw << "WalDeletion";
|
|
|
|
jw.StartObject();
|
|
|
|
jw << wal_deletion_;
|
|
|
|
jw.EndObject();
|
Define WAL related classes to be used in VersionEdit and VersionSet (#7164)
Summary:
`WalAddition`, `WalDeletion` are defined in `wal_version.h` and used in `VersionEdit`.
`WalAddition` is used to represent events of creating a new WAL (no size, just log number), or closing a WAL (with size).
`WalDeletion` is used to represent events of deleting or archiving a WAL, it means the WAL is no longer alive (won't be replayed during recovery).
`WalSet` is the set of alive WALs kept in `VersionSet`.
1. Why use `WalDeletion` instead of relying on `MinLogNumber` to identify outdated WALs
On recovery, we can compute `MinLogNumber()` based on the log numbers kept in MANIFEST, any log with number < MinLogNumber can be ignored. So it seems that we don't need to persist `WalDeletion` to MANIFEST, since we can ignore the WALs based on MinLogNumber.
But the `MinLogNumber()` is actually a lower bound, it does not exactly mean that logs starting from MinLogNumber must exist. This is because in a corner case, when a column family is empty and never flushed, its log number is set to the largest log number, but not persisted in MANIFEST. So let's say there are 2 column families, when creating the DB, the first WAL has log number 1, so it's persisted to MANIFEST for both column families. Then CF 0 is empty and never flushed, CF 1 is updated and flushed, so a new WAL with log number 2 is created and persisted to MANIFEST for CF 1. But CF 0's log number in MANIFEST is still 1. So on recovery, MinLogNumber is 1, but since log 1 only contains data for CF 1, and CF 1 is flushed, log 1 might have already been deleted from disk.
We can make `MinLogNumber()` be exactly the minimum log number that must exist, by persisting the most recent log number for empty column families that are not flushed. But if there are N such column families, then every time a new WAL is created, we need to add N records to MANIFEST.
In the current design, a record is persisted to MANIFEST only when a WAL is created, closed, or deleted/archived, so the number of WAL-related records is bounded by 3x the number of WALs.
2. Why keep `WalSet` in `VersionSet` instead of applying the `VersionEdit`s to `VersionStorageInfo`
`VersionEdit`s are originally designed to track the addition and deletion of SST files. The SST files are related to column families, each column family has a list of `Version`s, and each `Version` keeps the set of active SST files in `VersionStorageInfo`.
But WALs are a DB-level concept; they are not bound to specific column families. So logically it does not make sense to store WALs in a column family's `Version`s.
Also, `Version`'s purpose is to keep reference to SST / blob files, so that they are not deleted until there is no version referencing them. But a WAL is deleted regardless of version references.
So we keep the WALs in `VersionSet` for the purpose of writing out the DB state's snapshot when creating new MANIFESTs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7164
Test Plan:
make version_edit_test && ./version_edit_test
make wal_edit_test && ./wal_edit_test
Reviewed By: ltamasi
Differential Revision: D22677936
Pulled By: cheng-chang
fbshipit-source-id: 5a3b6890140e572ffd79eb37e6e4c3c32361a859
2020-08-06 01:32:26 +02:00
|
|
|
}
|
|
|
|
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage:**
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
jw << "ColumnFamily" << column_family_;
|
|
|
|
|
|
|
|
if (is_column_family_add_) {
|
|
|
|
jw << "ColumnFamilyAdd" << column_family_name_;
|
|
|
|
}
|
|
|
|
if (is_column_family_drop_) {
|
|
|
|
jw << "ColumnFamilyDrop" << column_family_name_;
|
|
|
|
}
|
2018-08-20 23:54:03 +02:00
|
|
|
if (is_in_atomic_group_) {
|
|
|
|
jw << "AtomicGroup" << remaining_entries_;
|
|
|
|
}
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage:**
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
|
2020-12-05 23:17:11 +01:00
|
|
|
if (HasFullHistoryTsLow()) {
|
|
|
|
jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key);
|
|
|
|
}
|
|
|
|
|
Added JSON manifest dump option to ldb command
Summary:
Added a new flag --json to the ldb manifest_dump command
that prints out the version edits as JSON objects for easier
reading and parsing of information.
Test Plan:
**Sample usage:**
```
./ldb manifest_dump --json --path=path/to/manifest/file
```
**Sample output:**
```
{"EditNumber": 0, "Comparator": "leveldb.BytewiseComparator", "ColumnFamily": 0}
{"EditNumber": 1, "LogNumber": 0, "ColumnFamily": 0}
{"EditNumber": 2, "LogNumber": 4, "PrevLogNumber": 0, "NextFileNumber": 7, "LastSeq": 35356, "AddedFiles": [{"Level": 0, "FileNumber": 5, "FileSize": 1949284, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
{"EditNumber": 13, "PrevLogNumber": 0, "NextFileNumber": 36, "LastSeq": 290994, "DeletedFiles": [{"Level": 0, "FileNumber": 17}, {"Level": 0, "FileNumber": 20}, {"Level": 0, "FileNumber": 22}, {"Level": 0, "FileNumber": 24}, {"Level": 1, "FileNumber": 13}, {"Level": 1, "FileNumber": 14}, {"Level": 1, "FileNumber": 15}, {"Level": 1, "FileNumber": 18}], "AddedFiles": [{"Level": 1, "FileNumber": 25, "FileSize": 2114340, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 26, "FileSize": 2115213, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 27, "FileSize": 2114807, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 30, "FileSize": 2115271, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 31, "FileSize": 2115165, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 32, "FileSize": 2114683, "SmallestIKey": "'", "LargestIKey": "'"}, {"Level": 1, "FileNumber": 35, "FileSize": 1757512, "SmallestIKey": "'", "LargestIKey": "'"}], "ColumnFamily": 0}
...
```
Reviewers: sdong, anthony, yhchiang, igor
Reviewed By: igor
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D41727
2015-07-17 19:07:40 +02:00
|
|
|
jw.EndObject();
|
|
|
|
|
|
|
|
return jw.Get();
|
|
|
|
}
|
|
|
|
|
2020-02-20 21:07:53 +01:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|