From 3f20925dc49f3a0e818865fee1cd283584e602b7 Mon Sep 17 00:00:00 2001 From: Baptiste Lemaire Date: Tue, 22 Jun 2021 19:06:44 -0700 Subject: [PATCH] Add list live files metadata (#8446) Summary: Add a new command to ldb, `list_live_files_metadata`, that dumps live file names, column families, and levels. The output shows all active SST file names, sorted first by column family and then by level. For each level the SST files are sorted alphabetically. Typically, the output looks like this: ``` ./ldb --db=/tmp/test_db list_live_files_metadata Live SST Files: ===== Column Family: default ===== ---------- level 0 ---------- /tmp/test_db/000069.sst ---------- level 1 ---------- /tmp/test_db/000064.sst /tmp/test_db/000065.sst /tmp/test_db/000066.sst /tmp/test_db/000071.sst ---------- level 2 ---------- /tmp/test_db/000038.sst /tmp/test_db/000039.sst /tmp/test_db/000052.sst /tmp/test_db/000067.sst /tmp/test_db/000070.sst ------------------------------ ``` Second, a flag, `--sort_by_filename`, was added to change the layout of the output. When this flag is passed to the command, the output shows all active SST files sorted by name, each followed by its LSM level and column family. 
With the same example, the following command would return: ``` ./ldb --db=/tmp/test_db list_live_files_metadata --sort_by_filename Live SST Files: /tmp/test_db/000038.sst : level 2, column family 'default' /tmp/test_db/000039.sst : level 2, column family 'default' /tmp/test_db/000052.sst : level 2, column family 'default' /tmp/test_db/000064.sst : level 1, column family 'default' /tmp/test_db/000065.sst : level 1, column family 'default' /tmp/test_db/000066.sst : level 1, column family 'default' /tmp/test_db/000067.sst : level 2, column family 'default' /tmp/test_db/000069.sst : level 0, column family 'default' /tmp/test_db/000070.sst : level 2, column family 'default' /tmp/test_db/000071.sst : level 1, column family 'default' ------------------------------ ``` Thus, the user can either request to show the files by levels, or sorted by filenames. This PR includes a simple Python unit test that makes sure the file names and levels printed out by this new feature match those reported by an existing feature, `dump_live_files`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8446 Reviewed By: akankshamahajan15 Differential Revision: D29320080 Pulled By: bjlemaire fbshipit-source-id: 01fb7b5637c59010d74c80730a28d815994e7009 --- HISTORY.md | 3 ++ tools/ldb_cmd.cc | 116 +++++++++++++++++++++++++++++++++++++++++++ tools/ldb_cmd_impl.h | 19 +++++++ tools/ldb_test.py | 87 ++++++++++++++++++++++++++++++++ tools/ldb_tool.cc | 1 + 5 files changed, 226 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 2f9eb04fe..3ed1bb77b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,9 @@ ### Bug Fixes * Blob file checksums are now printed in hexadecimal format when using the `manifest_dump` `ldb` command. +### New Features +* ldb has a new feature, `list_live_files_metadata`, that shows the live SST files, as well as their LSM storage level and the column family they belong to. 
+ ## 6.22.0 (2021-06-18) ### Behavior Changes * Added two additional tickers, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH. These stats can be used to estimate the ratio of "garbage" (outdated) bytes in the memtable that are discarded at flush time. diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index ec4e0d0f6..471dcbb5a 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -247,6 +247,10 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) { return new DBFileDumperCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); + } else if (parsed_params.cmd == DBLiveFilesMetadataDumperCommand::Name()) { + return new DBLiveFilesMetadataDumperCommand(parsed_params.cmd_params, + parsed_params.option_map, + parsed_params.flags); } else if (parsed_params.cmd == InternalDumpCommand::Name()) { return new InternalDumpCommand(parsed_params.cmd_params, parsed_params.option_map, @@ -3396,6 +3400,118 @@ void DBFileDumperCommand::DoCommand() { } } +const std::string DBLiveFilesMetadataDumperCommand::ARG_SORT_BY_FILENAME = + "sort_by_filename"; + +DBLiveFilesMetadataDumperCommand::DBLiveFilesMetadataDumperCommand( + const std::vector<std::string>& /*params*/, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags) + : LDBCommand(options, flags, true, + BuildCmdLineOptions({ARG_SORT_BY_FILENAME})) { + sort_by_filename_ = IsFlagPresent(flags, ARG_SORT_BY_FILENAME); +} + +void DBLiveFilesMetadataDumperCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DBLiveFilesMetadataDumperCommand::Name()); + ret.append(" [--" + ARG_SORT_BY_FILENAME + "] "); + ret.append("\n"); +} + +void DBLiveFilesMetadataDumperCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status s; + + std::cout << "Live SST Files:" << std::endl; + std::vector<LiveFileMetaData> metadata; + db_->GetLiveFilesMetaData(&metadata); + if (sort_by_filename_) { + // Sort metadata vector by filename. 
+ std::sort(metadata.begin(), metadata.end(), + [](const LiveFileMetaData& a, const LiveFileMetaData& b) -> bool { + std::string aName = a.db_path + a.name; + std::string bName = b.db_path + b.name; + return (aName.compare(bName) < 0); + }); + for (auto& fileMetadata : metadata) { + // The fileMetadata.name always starts with "/", + // however fileMetadata.db_path is the string provided by + // the user as an input. Therefore we check if we can + // concatenate the two strings directly or if we need to + // drop a possible extra "/" at the end of fileMetadata.db_path. + std::string filename = fileMetadata.db_path + "/" + fileMetadata.name; + // Drops any repeating '/' character that could happen during + // concatenation of db path and file name. + filename = NormalizePath(filename); + std::string cf = fileMetadata.column_family_name; + int level = fileMetadata.level; + std::cout << filename << " : level " << level << ", column family '" << cf + << "'" << std::endl; + } + } else { + std::map<std::string, std::map<int, std::vector<std::string>>> + filesPerLevelPerCf; + // Collect live files metadata. + // Store filenames into a 2D map, that will automatically + // sort by column family (first key) and by level (second key). + for (auto& fileMetadata : metadata) { + std::string cf = fileMetadata.column_family_name; + int level = fileMetadata.level; + if (filesPerLevelPerCf.find(cf) == filesPerLevelPerCf.end()) { + filesPerLevelPerCf.emplace(cf, + std::map<int, std::vector<std::string>>()); + } + if (filesPerLevelPerCf[cf].find(level) == filesPerLevelPerCf[cf].end()) { + filesPerLevelPerCf[cf].emplace(level, std::vector<std::string>()); + } + + // The fileMetadata.name always starts with "/", + // however fileMetadata.db_path is the string provided by + // the user as an input. Therefore we check if we can + // concatenate the two strings directly or if we need to + // drop a possible extra "/" at the end of fileMetadata.db_path. 
+ std::string filename = fileMetadata.db_path + "/" + fileMetadata.name; + // Drops any repeating '/' character that could happen during + // concatenation of db path and file name. + filename = NormalizePath(filename); + filesPerLevelPerCf[cf][level].push_back(filename); + } + // For each column family, + // iterate through the levels and print out the live SST file names. + for (auto it = filesPerLevelPerCf.begin(); it != filesPerLevelPerCf.end(); + it++) { + // it->first: Column Family name (string) + // it->second: map[level]={SST files...}. + std::cout << "===== Column Family: " << it->first + << " =====" << std::endl; + + // For simplicity, create reference to the inner map (level={live SST + // files}). + std::map<int, std::vector<std::string>>& filesPerLevel = it->second; + int maxLevel = filesPerLevel.rbegin()->first; + + // Even if the first few levels are empty, they are printed out. + for (int level = 0; level <= maxLevel; level++) { + std::cout << "---------- level " << level << " ----------" << std::endl; + if (filesPerLevel.find(level) != filesPerLevel.end()) { + std::vector<std::string>& fileList = filesPerLevel[level]; + + // Locally sort by filename for better information display. + std::sort(fileList.begin(), fileList.end()); + for (const std::string& filename : fileList) { + std::cout << filename << std::endl; + } + } + } // End of for-loop over levels. + } // End of for-loop over filesPerLevelPerCf. + } // End of else ("not sort_by_filename"). 
+ std::cout << "------------------------------" << std::endl; +} + void WriteExternalSstFilesCommand::Help(std::string& ret) { ret.append(" "); ret.append(WriteExternalSstFilesCommand::Name()); diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h index f5f7eff2e..9944dc11e 100644 --- a/tools/ldb_cmd_impl.h +++ b/tools/ldb_cmd_impl.h @@ -46,6 +46,25 @@ class DBFileDumperCommand : public LDBCommand { virtual void DoCommand() override; }; +class DBLiveFilesMetadataDumperCommand : public LDBCommand { + public: + static std::string Name() { return "list_live_files_metadata"; } + + DBLiveFilesMetadataDumperCommand( + const std::vector<std::string>& params, + const std::map<std::string, std::string>& options, + const std::vector<std::string>& flags); + + static void Help(std::string& ret); + + virtual void DoCommand() override; + + private: + bool sort_by_filename_; + + static const std::string ARG_SORT_BY_FILENAME; +}; + class DBDumperCommand : public LDBCommand { public: static std::string Name() { return "dump"; } diff --git a/tools/ldb_test.py b/tools/ldb_test.py index 699317b95..c94d9efaf 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -452,6 +452,93 @@ class LDBTestCase(unittest.TestCase): filenumber = re.findall(r"(?<=MANIFEST-)\d+", manifestFilename)[0] self.assertEqual(manifestFilename, dbPath+"MANIFEST-"+filenumber) + def listLiveFilesMetadata(self, params, dumpFile): + return 0 == run_err_null("./ldb list_live_files_metadata %s > %s" % ( + params, dumpFile)) + + def testListLiveFilesMetadata(self): + print("Running testListLiveFilesMetadata...") + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put x1 y1 --create_if_missing", "OK") + self.assertRunOK("put x2 y2", "OK") + + # Compare the SST filename and the level of list_live_files_metadata + # with the data collected from dump_live_files. 
+ dumpFilePath1 = os.path.join(self.TMP_DIR, "dump1") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath1)) + dumpFilePath2 = os.path.join(self.TMP_DIR, "dump2") + self.assertTrue(self.listLiveFilesMetadata("--sort_by_filename --db=%s" % dbPath, dumpFilePath2)) + + # Collect SST filename and level from dump_live_files + with open(dumpFilePath1, "r") as tmp: + data = tmp.read() + filename1 = re.findall(r".*\d+\.sst",data)[0] + level1 = re.findall(r"level:\d+",data)[0].split(':')[1] + + # Collect SST filename and level from list_live_files_metadata + with open(dumpFilePath2, "r") as tmp: + data = tmp.read() + filename2 = re.findall(r".*\d+\.sst",data)[0] + level2 = re.findall(r"level \d+",data)[0].split(' ')[1] + + # Assert equality between filenames and levels. + self.assertEqual(filename1,filename2) + self.assertEqual(level1,level2) + + # Create multiple column families and compare the output + # of list_live_files_metadata with dump_live_files once again. + # Create new CF, and insert data: + self.assertRunOK("create_column_family mycol1", "OK") + self.assertRunOK("put --column_family=mycol1 v1 v2", "OK") + self.assertRunOK("create_column_family mycol2", "OK") + self.assertRunOK("put --column_family=mycol2 h1 h2", "OK") + self.assertRunOK("put --column_family=mycol2 h3 h4", "OK") + + # Call dump_live_files and list_live_files_metadata + # and pipe the output to compare them later. + dumpFilePath3 = os.path.join(self.TMP_DIR, "dump3") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath3)) + dumpFilePath4 = os.path.join(self.TMP_DIR, "dump4") + self.assertTrue(self.listLiveFilesMetadata("--sort_by_filename --db=%s" % dbPath, dumpFilePath4)) + + # dump_live_files: + # parse the output and create a map: + # [key: sstFilename]->[value:[LSM level, Column Family Name]] + referenceMap = {} + with open(dumpFilePath3, "r") as tmp: + data = tmp.read() + # Note: the following regex are contingent on what the + # dump_live_files outputs. 
+ namesAndLevels = re.findall(r"\d+.sst level:\d+", data) + cfs = re.findall(r"(?<=column family name=)\w+", data) + # re.findall should not reorder the data. + # Therefore namesAndLevels[i] matches the data from cfs[i]. + for count, nameAndLevel in enumerate(namesAndLevels): + sstFilename = re.findall(r"\d+.sst",nameAndLevel)[0] + sstLevel = re.findall(r"(?<=level:)\d+", nameAndLevel)[0] + cf = cfs[count] + referenceMap[sstFilename] = [sstLevel, cf] + + # list_live_files_metadata: + # parse the output and create a map: + # [key: sstFilename]->[value:[LSM level, Column Family Name]] + testMap = {} + with open(dumpFilePath4, "r") as tmp: + data = tmp.read() + # Since for each SST file, all the information is contained + # on one line, the parsing is easy to perform and relies on + # the appearance of an "00xxx.sst" pattern. + sstLines = re.findall(r".*\d+.sst.*", data) + for line in sstLines: + sstFilename = re.findall(r"\d+.sst", line)[0] + sstLevel = re.findall(r"(?<=level )\d+",line)[0] + cf = re.findall(r"(?<=column family \')\w+(?=\')",line)[0] + testMap[sstFilename] = [sstLevel, cf] + + # Compare the map obtained from dump_live_files and the map + # obtained from list_live_files_metadata. Everything should match. + self.assertEqual(referenceMap,testMap) def getManifests(self, directory): return glob.glob(directory + "/MANIFEST-*") diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index f8f7e7181..08a22c0ad 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -94,6 +94,7 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, DropColumnFamilyCommand::Help(ret); DBFileDumperCommand::Help(ret); InternalDumpCommand::Help(ret); + DBLiveFilesMetadataDumperCommand::Help(ret); RepairCommand::Help(ret); BackupCommand::Help(ret); RestoreCommand::Help(ret);