Add list live files metadata (#8446)

Summary:
Add a new `ldb` command, `list_live_files_metadata`, that dumps the names of the live SST files together with their column families and LSM levels. The output shows all active SST file names, grouped first by column family and then by level. Within each level, the SST files are sorted alphabetically.

Typically, the output looks like this:
```
./ldb --db=/tmp/test_db list_live_files_metadata
Live SST Files:
===== Column Family: default =====
---------- level 0 ----------
/tmp/test_db/000069.sst
---------- level 1 ----------
/tmp/test_db/000064.sst
/tmp/test_db/000065.sst
/tmp/test_db/000066.sst
/tmp/test_db/000071.sst
---------- level 2 ----------
/tmp/test_db/000038.sst
/tmp/test_db/000039.sst
/tmp/test_db/000052.sst
/tmp/test_db/000067.sst
/tmp/test_db/000070.sst
------------------------------
```

Second, a flag, `--sort_by_filename`, was added to change the layout of the output. When this flag is passed to the command, the output shows all active SST files sorted by name, each followed by its LSM level and column family. With the same example, the following command would return:
```
./ldb --db=/tmp/test_db list_live_files_metadata --sort_by_filename
Live SST Files:
/tmp/test_db/000038.sst : level 2, column family 'default'
/tmp/test_db/000039.sst : level 2, column family 'default'
/tmp/test_db/000052.sst : level 2, column family 'default'
/tmp/test_db/000064.sst : level 1, column family 'default'
/tmp/test_db/000065.sst : level 1, column family 'default'
/tmp/test_db/000066.sst : level 1, column family 'default'
/tmp/test_db/000067.sst : level 2, column family 'default'
/tmp/test_db/000069.sst : level 0, column family 'default'
/tmp/test_db/000070.sst : level 2, column family 'default'
/tmp/test_db/000071.sst : level 1, column family 'default'
------------------------------
```

Thus, the user can either request to show the files by levels, or sorted by filenames.
This PR includes a simple Python unit test that makes sure the file names and levels printed by this new command match those reported by an existing command, `dump_live_files`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8446

Reviewed By: akankshamahajan15

Differential Revision: D29320080

Pulled By: bjlemaire

fbshipit-source-id: 01fb7b5637c59010d74c80730a28d815994e7009
This commit is contained in:
Baptiste Lemaire 2021-06-22 19:06:44 -07:00 committed by Facebook GitHub Bot
parent 3ab0eae860
commit 3f20925dc4
5 changed files with 226 additions and 0 deletions

View File

@ -6,6 +6,9 @@
### Bug Fixes
* Blob file checksums are now printed in hexadecimal format when using the `manifest_dump` `ldb` command.
### New Features
* ldb has a new feature, `list_live_files_metadata`, that shows the live SST files, as well as their LSM storage level and the column family they belong to.
## 6.22.0 (2021-06-18)
### Behavior Changes
* Added two additional tickers, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH. These stats can be used to estimate the ratio of "garbage" (outdated) bytes in the memtable that are discarded at flush time.

View File

@ -247,6 +247,10 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) {
return new DBFileDumperCommand(parsed_params.cmd_params,
parsed_params.option_map,
parsed_params.flags);
} else if (parsed_params.cmd == DBLiveFilesMetadataDumperCommand::Name()) {
return new DBLiveFilesMetadataDumperCommand(parsed_params.cmd_params,
parsed_params.option_map,
parsed_params.flags);
} else if (parsed_params.cmd == InternalDumpCommand::Name()) {
return new InternalDumpCommand(parsed_params.cmd_params,
parsed_params.option_map,
@ -3396,6 +3400,118 @@ void DBFileDumperCommand::DoCommand() {
}
}
// Name of the command-line flag (shown in the help text as
// `--sort_by_filename`) that makes DoCommand() print a flat list of files
// sorted by name instead of grouping them by column family and level.
const std::string DBLiveFilesMetadataDumperCommand::ARG_SORT_BY_FILENAME =
"sort_by_filename";
// Builds the `list_live_files_metadata` command.
//
// Positional parameters are ignored. ARG_SORT_BY_FILENAME is the only option
// name registered through BuildCmdLineOptions(); its presence in `flags` is
// remembered in sort_by_filename_ for DoCommand().
DBLiveFilesMetadataDumperCommand::DBLiveFilesMetadataDumperCommand(
    const std::vector<std::string>& /*params*/,
    const std::map<std::string, std::string>& options,
    const std::vector<std::string>& flags)
    : LDBCommand(options, flags, true,
                 BuildCmdLineOptions({ARG_SORT_BY_FILENAME})),
      sort_by_filename_(IsFlagPresent(flags, ARG_SORT_BY_FILENAME)) {}
// Appends this command's usage line to `ret`: the command name followed by
// the optional `--sort_by_filename` flag and a trailing newline.
void DBLiveFilesMetadataDumperCommand::Help(std::string& ret) {
  ret += " ";
  ret += DBLiveFilesMetadataDumperCommand::Name();
  ret += " [--";
  ret += ARG_SORT_BY_FILENAME;
  ret += "] ";
  ret += "\n";
}
void DBLiveFilesMetadataDumperCommand::DoCommand() {
if (!db_) {
assert(GetExecuteState().IsFailed());
return;
}
Status s;
std::cout << "Live SST Files:" << std::endl;
std::vector<LiveFileMetaData> metadata;
db_->GetLiveFilesMetaData(&metadata);
if (sort_by_filename_) {
// Sort metadata vector by filename.
std::sort(metadata.begin(), metadata.end(),
[](const LiveFileMetaData& a, const LiveFileMetaData& b) -> bool {
std::string aName = a.db_path + a.name;
std::string bName = b.db_path + b.name;
return (aName.compare(bName) < 0);
});
for (auto& fileMetadata : metadata) {
// The fileMetada.name alwasy starts with "/",
// however fileMetada.db_path is the string provided by
// the user as an input. Therefore we check if we can
// concantenate the two string sdirectly or if we need to
// drop a possible extra "/" at the end of fileMetadata.db_path.
std::string filename = fileMetadata.db_path + "/" + fileMetadata.name;
// Drops any repeating '/' character that could happen during
// concatenation of db path and file name.
filename = NormalizePath(filename);
std::string cf = fileMetadata.column_family_name;
int level = fileMetadata.level;
std::cout << filename << " : level " << level << ", column family '" << cf
<< "'" << std::endl;
}
} else {
std::map<std::string, std::map<int, std::vector<std::string>>>
filesPerLevelPerCf;
// Collect live files metadata.
// Store filenames into a 2D map, that will automatically
// sort by column family (first key) and by level (second key).
for (auto& fileMetadata : metadata) {
std::string cf = fileMetadata.column_family_name;
int level = fileMetadata.level;
if (filesPerLevelPerCf.find(cf) == filesPerLevelPerCf.end()) {
filesPerLevelPerCf.emplace(cf,
std::map<int, std::vector<std::string>>());
}
if (filesPerLevelPerCf[cf].find(level) == filesPerLevelPerCf[cf].end()) {
filesPerLevelPerCf[cf].emplace(level, std::vector<std::string>());
}
// The fileMetada.name alwasy starts with "/",
// however fileMetada.db_path is the string provided by
// the user as an input. Therefore we check if we can
// concantenate the two string sdirectly or if we need to
// drop a possible extra "/" at the end of fileMetadata.db_path.
std::string filename = fileMetadata.db_path + "/" + fileMetadata.name;
// Drops any repeating '/' character that could happen during
// concatenation of db path and file name.
filename = NormalizePath(filename);
filesPerLevelPerCf[cf][level].push_back(filename);
}
// For each column family,
// iterate through the levels and print out the live SST file names.
for (auto it = filesPerLevelPerCf.begin(); it != filesPerLevelPerCf.end();
it++) {
// it->first: Column Family name (string)
// it->second: map[level]={SST files...}.
std::cout << "===== Column Family: " << it->first
<< " =====" << std::endl;
// For simplicity, create reference to the inner map (level={live SST
// files}).
std::map<int, std::vector<std::string>>& filesPerLevel = it->second;
int maxLevel = filesPerLevel.rbegin()->first;
// Even if the first few levels are empty, they are printed out.
for (int level = 0; level <= maxLevel; level++) {
std::cout << "---------- level " << level << " ----------" << std::endl;
if (filesPerLevel.find(level) != filesPerLevel.end()) {
std::vector<std::string>& fileList = filesPerLevel[level];
// Locally sort by filename for better information display.
std::sort(fileList.begin(), fileList.end());
for (const std::string& filename : fileList) {
std::cout << filename << std::endl;
}
}
} // End of for-loop over levels.
} // End of for-loop over filesPerLevelPerCf.
} // End of else ("not sort_by_filename").
std::cout << "------------------------------" << std::endl;
}
void WriteExternalSstFilesCommand::Help(std::string& ret) {
ret.append(" ");
ret.append(WriteExternalSstFilesCommand::Name());

View File

@ -46,6 +46,25 @@ class DBFileDumperCommand : public LDBCommand {
virtual void DoCommand() override;
};
class DBLiveFilesMetadataDumperCommand : public LDBCommand {
public:
static std::string Name() { return "list_live_files_metadata"; }
DBLiveFilesMetadataDumperCommand(
const std::vector<std::string>& params,
const std::map<std::string, std::string>& options,
const std::vector<std::string>& flags);
static void Help(std::string& ret);
virtual void DoCommand() override;
private:
bool sort_by_filename_;
static const std::string ARG_SORT_BY_FILENAME;
};
class DBDumperCommand : public LDBCommand {
public:
static std::string Name() { return "dump"; }

View File

@ -452,6 +452,93 @@ class LDBTestCase(unittest.TestCase):
filenumber = re.findall(r"(?<=MANIFEST-)\d+", manifestFilename)[0]
self.assertEqual(manifestFilename, dbPath+"MANIFEST-"+filenumber)
def listLiveFilesMetadata(self, params, dumpFile):
    """Runs `ldb list_live_files_metadata` and redirects stdout to dumpFile.

    params is inserted verbatim into the command line. Returns True when
    run_err_null reports an exit status of 0 for the command.
    """
    command = "./ldb list_live_files_metadata %s > %s" % (params, dumpFile)
    return run_err_null(command) == 0
def testListLiveFilesMetadata(self):
    # Cross-checks `list_live_files_metadata --sort_by_filename` against
    # `dump_live_files`:
    #   1. with a single column family, the first SST file name and level
    #      reported by both commands must be equal;
    #   2. with several column families, the {sst file -> [level, cf]} maps
    #      parsed from both outputs must be equal.
    print("Running testListLiveFilesMetadata...")

    dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
    self.assertRunOK("put x1 y1 --create_if_missing", "OK")
    self.assertRunOK("put x2 y2", "OK")

    # Compare the SST filename and the level of list_live_files_metadata
    # with the data collected from dump_live_files.
    dumpFilePath1 = os.path.join(self.TMP_DIR, "dump1")
    self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath1))

    dumpFilePath2 = os.path.join(self.TMP_DIR, "dump2")
    self.assertTrue(self.listLiveFilesMetadata(
        "--sort_by_filename --db=%s" % dbPath, dumpFilePath2))

    # Collect SST filename and level from dump_live_files.
    with open(dumpFilePath1, "r") as tmp:
        data = tmp.read()
        filename1 = re.findall(r".*\d+\.sst", data)[0]
        level1 = re.findall(r"level:\d+", data)[0].split(':')[1]

    # Collect SST filename and level from list_live_files_metadata.
    with open(dumpFilePath2, "r") as tmp:
        data = tmp.read()
        filename2 = re.findall(r".*\d+\.sst", data)[0]
        level2 = re.findall(r"level \d+", data)[0].split(' ')[1]

    # Assert equality between filenames and levels.
    self.assertEqual(filename1, filename2)
    self.assertEqual(level1, level2)

    # Create multiple column families and compare the output
    # of list_live_files_metadata with dump_live_files once again.
    # Create new CFs, and insert data:
    self.assertRunOK("create_column_family mycol1", "OK")
    self.assertRunOK("put --column_family=mycol1 v1 v2", "OK")
    self.assertRunOK("create_column_family mycol2", "OK")
    self.assertRunOK("put --column_family=mycol2 h1 h2", "OK")
    self.assertRunOK("put --column_family=mycol2 h3 h4", "OK")

    # Call dump_live_files and list_live_files_metadata
    # and redirect the output to files to compare them later.
    dumpFilePath3 = os.path.join(self.TMP_DIR, "dump3")
    self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath3))

    dumpFilePath4 = os.path.join(self.TMP_DIR, "dump4")
    self.assertTrue(self.listLiveFilesMetadata(
        "--sort_by_filename --db=%s" % dbPath, dumpFilePath4))

    # dump_live_files:
    # parse the output and create a map:
    # [key: sstFilename]->[value:[LSM level, Column Family Name]]
    referenceMap = {}
    with open(dumpFilePath3, "r") as tmp:
        data = tmp.read()
        # Note: the following regexes are contingent on what
        # dump_live_files outputs. The '.' before "sst" is escaped so that
        # only a literal ".sst" extension matches.
        namesAndLevels = re.findall(r"\d+\.sst level:\d+", data)
        cfs = re.findall(r"(?<=column family name=)\w+", data)
        # re.findall returns matches in order of appearance.
        # Therefore namesAndLevels[i] is paired with cfs[i].
        for count, nameAndLevel in enumerate(namesAndLevels):
            sstFilename = re.findall(r"\d+\.sst", nameAndLevel)[0]
            sstLevel = re.findall(r"(?<=level:)\d+", nameAndLevel)[0]
            cf = cfs[count]
            referenceMap[sstFilename] = [sstLevel, cf]

    # list_live_files_metadata:
    # parse the output and create a map:
    # [key: sstFilename]->[value:[LSM level, Column Family Name]]
    testMap = {}
    with open(dumpFilePath4, "r") as tmp:
        data = tmp.read()
        # Since for each SST file, all the information is contained
        # on one line, the parsing is easy to perform and relies on
        # the appearance of an "00xxx.sst" pattern.
        sstLines = re.findall(r".*\d+\.sst.*", data)
        for line in sstLines:
            sstFilename = re.findall(r"\d+\.sst", line)[0]
            sstLevel = re.findall(r"(?<=level )\d+", line)[0]
            cf = re.findall(r"(?<=column family \')\w+(?=\')", line)[0]
            testMap[sstFilename] = [sstLevel, cf]

    # Compare the map obtained from dump_live_files and the map
    # obtained from list_live_files_metadata. Everything should match.
    self.assertEqual(referenceMap, testMap)
def getManifests(self, directory):
    """Returns the paths in `directory` whose names start with "MANIFEST-"."""
    manifestPattern = directory + "/MANIFEST-*"
    return glob.glob(manifestPattern)

View File

@ -94,6 +94,7 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
DropColumnFamilyCommand::Help(ret);
DBFileDumperCommand::Help(ret);
InternalDumpCommand::Help(ret);
DBLiveFilesMetadataDumperCommand::Help(ret);
RepairCommand::Help(ret);
BackupCommand::Help(ret);
RestoreCommand::Help(ret);