Detect column family from properties [CF + RepairDB part 2/3]

Summary:
This diff uses the CF ID and CF name properties in the SST file
to associate recovered data with the proper column family. Depends on D59775.

- In ScanTable(), create column families in VersionSet each time a new one is discovered (via reading SST file properties)
- In ConvertLogToTable(), dump an SST file for every column family with data in the WAL
- In AddTables(), create one VersionEdit per column family that adds all of that CF's tables

Test Plan:
  $ ./repair_test

Reviewers: yhchiang, IslamAbdelRahman, sdong

Reviewed By: sdong

Subscribers: andrewkr, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D59781
This commit is contained in:
Andrew Kryczka 2016-06-24 13:12:13 -07:00
parent 3fc713ed92
commit 56ac686292
3 changed files with 155 additions and 55 deletions

View File

@ -8,6 +8,7 @@
### New Features
* Add avoid_flush_during_recovery option.
* Add a read option background_purge_on_iterator_cleanup to avoid deleting files in foreground when destroying iterators. Instead, a job is scheduled in high priority queue and would be executed in a separate background thread.
* RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). For data written by 4.6 or earlier, RepairDB associates it with the default column family.
## 4.9.0 (6/9/2016)
### Public API changes

View File

@ -154,6 +154,14 @@ class Repairer {
status = vset_.Recover({{kDefaultColumnFamilyName, cf_options_}}, false); status = vset_.Recover({{kDefaultColumnFamilyName, cf_options_}}, false);
} }
if (status.ok()) { if (status.ok()) {
// Need to scan existing SST files first so the column families are
// created before we process WAL files
ExtractMetaData();
// ExtractMetaData() uses table_fds_ to know which SST files' metadata to
// extract -- we need to clear it here since metadata for existing SST
// files has been extracted already
table_fds_.clear();
ConvertLogFilesToTables(); ConvertLogFilesToTables();
ExtractMetaData(); ExtractMetaData();
status = AddTables(); status = AddTables();
@ -177,6 +185,8 @@ class Repairer {
private: private:
struct TableInfo { struct TableInfo {
FileMetaData meta; FileMetaData meta;
uint32_t column_family_id;
std::string column_family_name;
SequenceNumber min_sequence; SequenceNumber min_sequence;
SequenceNumber max_sequence; SequenceNumber max_sequence;
}; };
@ -294,16 +304,17 @@ class Repairer {
log::Reader reader(options_.info_log, std::move(lfile_reader), &reporter, log::Reader reader(options_.info_log, std::move(lfile_reader), &reporter,
true /*enable checksum*/, 0 /*initial_offset*/, log); true /*enable checksum*/, 0 /*initial_offset*/, log);
// Initialize per-column family memtables
for (auto* cfd : *vset_.GetColumnFamilySet()) {
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
kMaxSequenceNumber);
}
auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
// Read all the records and add to a memtable // Read all the records and add to a memtable
std::string scratch; std::string scratch;
Slice record; Slice record;
WriteBatch batch; WriteBatch batch;
WriteBuffer wb(options_.db_write_buffer_size);
MemTable* mem =
new MemTable(icmp_, ioptions_, MutableCFOptions(options_, ioptions_),
&wb, kMaxSequenceNumber);
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem);
mem->Ref();
int counter = 0; int counter = 0;
while (reader.ReadRecord(&record, &scratch)) { while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < WriteBatchInternal::kHeader) { if (record.size() < WriteBatchInternal::kHeader) {
@ -312,7 +323,7 @@ class Repairer {
continue; continue;
} }
WriteBatchInternal::SetContents(&batch, record); WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, cf_mems_default, nullptr); status = WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr);
if (status.ok()) { if (status.ok()) {
counter += WriteBatchInternal::Count(&batch); counter += WriteBatchInternal::Count(&batch);
} else { } else {
@ -323,36 +334,40 @@ class Repairer {
} }
} }
// Do not record a version edit for this conversion to a Table // Dump a table for each column family with entries in this log file.
// since ExtractMetaData() will also generate edits. for (auto* cfd : *vset_.GetColumnFamilySet()) {
FileMetaData meta; // Do not record a version edit for this conversion to a Table
meta.fd = FileDescriptor(next_file_number_++, 0, 0); // since ExtractMetaData() will also generate edits.
{ MemTable* mem = cfd->mem();
if (mem->IsEmpty()) {
continue;
}
FileMetaData meta;
meta.fd = FileDescriptor(next_file_number_++, 0, 0);
ReadOptions ro; ReadOptions ro;
ro.total_order_seek = true; ro.total_order_seek = true;
Arena arena; Arena arena;
ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
MutableCFOptions mutable_cf_options(options_, ioptions_);
status = BuildTable( status = BuildTable(
dbname_, env_, ioptions_, mutable_cf_options, env_options_, dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
table_cache_, iter.get(), &meta, icmp_, env_options_, table_cache_, iter.get(), &meta,
&int_tbl_prop_collector_factories_, cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, cfd->GetID(), cfd->GetName(), {}, kMaxSequenceNumber, kNoCompression,
std::string() /* column_family_name */, {}, kMaxSequenceNumber, CompressionOptions(), false, nullptr /* internal_stats */,
kNoCompression, CompressionOptions(), false, TableFileCreationReason::kRecovery);
nullptr /* internal_stats */, TableFileCreationReason::kRecovery); Log(InfoLogLevel::INFO_LEVEL, options_.info_log,
} "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log,
delete mem->Unref(); counter, meta.fd.GetNumber(), status.ToString().c_str());
delete cf_mems_default; if (status.ok()) {
mem = nullptr; if (meta.fd.GetFileSize() > 0) {
if (status.ok()) { table_fds_.push_back(meta.fd);
if (meta.fd.GetFileSize() > 0) { }
table_fds_.push_back(meta.fd); } else {
break;
} }
} }
Log(InfoLogLevel::INFO_LEVEL, options_.info_log, delete cf_mems;
"Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
log, counter, meta.fd.GetNumber(), status.ToString().c_str());
return status; return status;
} }
@ -385,9 +400,45 @@ class Repairer {
Status status = env_->GetFileSize(fname, &file_size); Status status = env_->GetFileSize(fname, &file_size);
t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(), t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
file_size); file_size);
std::shared_ptr<const TableProperties> props;
if (status.ok()) {
status = table_cache_->GetTableProperties(env_options_, icmp_, t->meta.fd,
&props);
}
if (status.ok()) {
t->column_family_id = static_cast<uint32_t>(props->column_family_id);
if (t->column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
"Table #%" PRIu64
": column family unknown (probably due to legacy format); "
"adding to default column family id 0.",
t->meta.fd.GetNumber());
t->column_family_id = 0;
}
if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
nullptr) {
status =
AddColumnFamily(props->column_family_name, t->column_family_id);
}
}
ColumnFamilyData* cfd;
if (status.ok()) {
cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
if (cfd->GetName() != props->column_family_name) {
Log(InfoLogLevel::ERROR_LEVEL, options_.info_log,
"Table #%" PRIu64
": inconsistent column family name '%s'; expected '%s' for column "
"family id %" PRIu32 ".",
t->meta.fd.GetNumber(), props->column_family_name.c_str(),
cfd->GetName().c_str(), t->column_family_id);
status = Status::Corruption(dbname_, "inconsistent column family name");
}
}
if (status.ok()) { if (status.ok()) {
InternalIterator* iter = table_cache_->NewIterator( InternalIterator* iter = table_cache_->NewIterator(
ReadOptions(), env_options_, icmp_, t->meta.fd); ReadOptions(), env_options_, cfd->internal_comparator(), t->meta.fd);
bool empty = true; bool empty = true;
ParsedInternalKey parsed; ParsedInternalKey parsed;
t->min_sequence = 0; t->min_sequence = 0;
@ -395,9 +446,9 @@ class Repairer {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key(); Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) { if (!ParseInternalKey(key, &parsed)) {
Log(InfoLogLevel::ERROR_LEVEL, Log(InfoLogLevel::ERROR_LEVEL, options_.info_log,
options_.info_log, "Table #%" PRIu64 ": unparsable key %s", "Table #%" PRIu64 ": unparsable key %s", t->meta.fd.GetNumber(),
t->meta.fd.GetNumber(), EscapeString(key).c_str()); EscapeString(key).c_str());
continue; continue;
} }
@ -418,42 +469,51 @@ class Repairer {
status = iter->status(); status = iter->status();
} }
delete iter; delete iter;
Log(InfoLogLevel::INFO_LEVEL, options_.info_log,
"Table #%" PRIu64 ": %d entries %s", t->meta.fd.GetNumber(), counter,
status.ToString().c_str());
} }
Log(InfoLogLevel::INFO_LEVEL,
options_.info_log, "Table #%" PRIu64 ": %d entries %s",
t->meta.fd.GetNumber(), counter, status.ToString().c_str());
return status; return status;
} }
Status AddTables() { Status AddTables() {
std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
SequenceNumber max_sequence = 0; SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) { for (size_t i = 0; i < tables_.size(); i++) {
cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
if (max_sequence < tables_[i].max_sequence) { if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence; max_sequence = tables_[i].max_sequence;
} }
} }
vset_.SetLastSequence(max_sequence); vset_.SetLastSequence(max_sequence);
auto* cfd = vset_.GetColumnFamilySet()->GetDefault(); for (const auto& cf_id_and_tables : cf_id_to_tables) {
VersionEdit edit; auto* cfd =
edit.SetComparatorName(cfd->user_comparator()->Name()); vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
edit.SetLogNumber(0); VersionEdit edit;
edit.SetNextFile(next_file_number_); edit.SetComparatorName(cfd->user_comparator()->Name());
edit.SetColumnFamily(cfd->GetID()); edit.SetLogNumber(0);
edit.SetNextFile(next_file_number_);
edit.SetColumnFamily(cfd->GetID());
// TODO(opt): separate out into multiple levels // TODO(opt): separate out into multiple levels
for (const auto& table : tables_) { for (const auto* table : cf_id_and_tables.second) {
edit.AddFile(0, table.meta.fd.GetNumber(), table.meta.fd.GetPathId(), edit.AddFile(0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
table.meta.fd.GetFileSize(), table.meta.smallest, table->meta.fd.GetFileSize(), table->meta.smallest,
table.meta.largest, table.min_sequence, table.max_sequence, table->meta.largest, table->min_sequence,
table.meta.marked_for_compaction); table->max_sequence, table->meta.marked_for_compaction);
}
mutex_.Lock();
Status status = vset_.LogAndApply(
cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
nullptr /* db_directory */, false /* new_descriptor_log */);
mutex_.Unlock();
if (!status.ok()) {
return status;
}
} }
mutex_.Lock(); return Status::OK();
Status status = vset_.LogAndApply(
cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
nullptr /* db_directory */, false /* new_descriptor_log */);
mutex_.Unlock();
return status;
} }
void ArchiveFile(const std::string& fname) { void ArchiveFile(const std::string& fname) {

View File

@ -12,6 +12,7 @@
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/transaction_log.h" #include "rocksdb/transaction_log.h"
#include "util/file_util.h" #include "util/file_util.h"
#include "util/string_util.h"
namespace rocksdb { namespace rocksdb {
@ -169,6 +170,44 @@ TEST_F(RepairTest, UnflushedSst) {
ASSERT_EQ(Get("key"), "val"); ASSERT_EQ(Get("key"), "val");
} }
// Verifies that RepairDB associates recovered data with the column family it
// was originally written to.
//
// Setup: kEntriesPerCf keys are written to each of kNumCfs column families
// (the default one plus "pikachu1" and "pikachu2"). Every write is followed by
// a flush except the very last one, which is deliberately left unflushed so
// that the WAL path is exercised as well as the SST path. The manifest is then
// deleted, RepairDB is run, and every key must be readable from the column
// family it was written to.
TEST_F(RepairTest, RepairMultipleColumnFamilies) {
  const int kNumCfs = 3;
  const int kEntriesPerCf = 2;
  DestroyAndReopen(CurrentOptions());
  CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions());
  for (int i = 0; i < kNumCfs; ++i) {
    for (int j = 0; j < kEntriesPerCf; ++j) {
      // Check the write status: a failed Put during setup must abort the test
      // here instead of surfacing later as a misleading post-repair mismatch.
      ASSERT_OK(Put(i, "key" + ToString(j), "val" + ToString(j)));
      if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) {
        // Leave one unflushed so we can verify WAL entries are properly
        // associated with column families.
        continue;
      }
      // Same reasoning as for Put: do not silently ignore a failed flush.
      ASSERT_OK(Flush(i));
    }
  }

  // Need to get path before Close() deletes db_, but delete it after Close() to
  // ensure Close() doesn't re-create the manifest.
  std::string manifest_path =
      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
  Close();
  ASSERT_OK(env_->FileExists(manifest_path));
  ASSERT_OK(env_->DeleteFile(manifest_path));

  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));

  ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"},
                           CurrentOptions());
  for (int i = 0; i < kNumCfs; ++i) {
    for (int j = 0; j < kEntriesPerCf; ++j) {
      ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
    }
  }
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {