tdlight/tddb/td/db/binlog/binlog_dump.cpp

150 lines
4.3 KiB
C++
Raw Normal View History

//
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2020
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
#include "td/db/binlog/Binlog.h"
#include "td/db/DbKey.h"
#include "td/utils/common.h"
#include "td/utils/format.h"
#include "td/utils/logging.h"
#include "td/utils/misc.h"
#include "td/utils/Slice.h"
#include "td/utils/StringBuilder.h"
#include "td/utils/tl_parsers.h"
#include <map>
struct Trie {
public:
Trie() {
nodes_.resize(1);
}
void add(td::Slice value) {
do_add(0, PSLICE() << value << '\0');
}
void dump() {
if (nodes_[0].sum == 0) { // division by zero
return;
}
LOG(PLAIN) << "TOTAL: " << nodes_[0].sum;
do_dump("", 0);
}
private:
struct FullNode {
int next[256] = {};
int sum = 0;
};
td::vector<FullNode> nodes_;
void do_add(int id, td::Slice value) {
nodes_[id].sum++;
if (value.empty()) {
return;
}
auto c = static_cast<td::uint8>(value[0]);
auto next_id = nodes_[id].next[c];
if (next_id == 0) {
next_id = static_cast<int>(nodes_.size());
nodes_.emplace_back();
nodes_[id].next[c] = next_id;
}
do_add(next_id, value.substr(1));
}
void do_dump(td::string path, int v) {
bool is_word_end = !path.empty() && path.back() == '\0';
bool need_stop = false;
int next_count = 0;
for (int c = 0; c < 256; c++) {
if (nodes_[v].next[c] != 0) {
need_stop |= c >= 128 || !(td::is_alpha(static_cast<char>(c)) || c == '.' || c == '_');
next_count++;
}
}
need_stop |= next_count == 0 || (next_count >= 2 && nodes_[v].sum <= nodes_[0].sum / 100);
if (is_word_end || need_stop) {
if (is_word_end) {
path.pop_back();
} else if (next_count != 1 || nodes_[v].next[0] == 0) {
path.push_back('*');
}
LOG(PLAIN) << nodes_[v].sum << " " << td::StringBuilder::FixedDouble(nodes_[v].sum * 100.0 / nodes_[0].sum, 2)
<< "% [" << td::format::escaped(path) << "]";
return;
}
for (int c = 0; c < 256; c++) {
auto next_id = nodes_[v].next[c];
if (next_id == 0) {
continue;
}
do_dump(path + static_cast<char>(c), next_id);
}
}
};
// Event type values for which main() parses a leading string key from the event data
// and collects it into a Trie.
enum Magic {
  ConfigPmcMagic = 0x1f18,
  BinlogPmcMagic = 0x4327
};
// Entry point: reads the binlog file named by argv[1], logs every event passed to the
// debug callback and then prints per-event-type size statistics and key summaries.
// Returns 1 if no file name was given and 0 otherwise.
int main(int argc, char *argv[]) {
  if (argc < 2) {
    LOG(PLAIN) << "Usage: binlog_dump <binlog_file_name>";
    return 1;
  }

  // Statistics for one event type; the map entry with key 0 accumulates all events.
  struct Info {
    std::size_t full_size = 0;        // bytes of events seen by the debug callback
    std::size_t compressed_size = 0;  // bytes of events seen by the main callback
    Trie trie;                        // keys of events seen by the debug callback
    Trie compressed_trie;             // keys of events seen by the main callback
  };
  std::map<td::uint64, Info> stats;

  // Only events of these types have their data parsed for a leading string key.
  auto has_string_key = [](auto type) {
    return type == ConfigPmcMagic || type == BinlogPmcMagic;
  };

  SET_VERBOSITY_LEVEL(VERBOSITY_NAME(ERROR));

  td::Binlog binlog;
  binlog
      .init(
          argv[1],
          // Main callback.
          // NOTE(review): presumably receives only the events that are still effective
          // after the binlog has been replayed - confirm against td::Binlog::init.
          [&](auto &event) {
            const auto event_size = event.raw_event_.size();
            stats[0].compressed_size += event_size;
            auto &type_stats = stats[event.type_];
            type_stats.compressed_size += event_size;
            if (has_string_key(event.type_)) {
              type_stats.compressed_trie.add(td::TlParser(event.data_).fetch_string<td::Slice>());
            }
          },
          td::DbKey::raw_key("cucumber"), td::DbKey::empty(), -1,
          // Debug callback: accumulates the "full" statistics and logs the event itself.
          [&](auto &event) {
            const auto event_size = event.raw_event_.size();
            stats[0].full_size += event_size;
            auto &type_stats = stats[event.type_];
            type_stats.full_size += event_size;
            if (has_string_key(event.type_)) {
              type_stats.trie.add(td::TlParser(event.data_).fetch_string<td::Slice>());
            }

            LOG(PLAIN) << "LogEvent[" << td::tag("id", td::format::as_hex(event.id_)) << td::tag("type", event.type_)
                       << td::tag("flags", event.flags_) << td::tag("size", event.data_.size())
                       << td::tag("data", td::format::escaped(event.data_)) << "]\n";
          })
      .ensure();

  for (auto &entry : stats) {
    const auto &type = entry.first;
    auto &type_stats = entry.second;
    LOG(PLAIN) << td::tag("handler", td::format::as_hex(type))
               << td::tag("full_size", td::format::as_size(type_stats.full_size))
               << td::tag("compressed_size", td::format::as_size(type_stats.compressed_size));
    type_stats.trie.dump();
    // the second summary is printed only when the two byte totals differ
    if (type_stats.full_size != type_stats.compressed_size) {
      type_stats.compressed_trie.dump();
    }
  }

  return 0;
}