2018-12-31 22:04:05 +03:00
|
|
|
//
|
2021-01-01 15:57:46 +03:00
|
|
|
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
|
2018-12-31 22:04:05 +03:00
|
|
|
//
|
|
|
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
|
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
//
|
|
|
|
#include "td/db/binlog/Binlog.h"
|
|
|
|
|
2019-10-22 20:01:15 +03:00
|
|
|
#include "td/db/DbKey.h"
|
|
|
|
|
2018-12-31 22:04:05 +03:00
|
|
|
#include "td/utils/common.h"
|
|
|
|
#include "td/utils/format.h"
|
|
|
|
#include "td/utils/logging.h"
|
2020-07-13 04:04:30 +03:00
|
|
|
#include "td/utils/misc.h"
|
2020-07-13 20:46:17 +03:00
|
|
|
#include "td/utils/port/Stat.h"
|
2020-07-13 03:40:03 +03:00
|
|
|
#include "td/utils/Slice.h"
|
2020-07-13 04:04:30 +03:00
|
|
|
#include "td/utils/StringBuilder.h"
|
2020-07-09 22:15:37 +03:00
|
|
|
#include "td/utils/tl_parsers.h"
|
2018-12-31 22:04:05 +03:00
|
|
|
|
|
|
|
#include <map>
|
|
|
|
|
2020-07-09 22:15:37 +03:00
|
|
|
struct Trie {
|
|
|
|
Trie() {
|
|
|
|
nodes_.resize(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
void add(td::Slice value) {
|
|
|
|
do_add(0, PSLICE() << value << '\0');
|
|
|
|
}
|
|
|
|
|
|
|
|
void dump() {
|
|
|
|
if (nodes_[0].sum == 0) { // division by zero
|
|
|
|
return;
|
|
|
|
}
|
2020-07-13 03:40:03 +03:00
|
|
|
LOG(PLAIN) << "TOTAL: " << nodes_[0].sum;
|
|
|
|
do_dump("", 0);
|
2020-07-09 22:15:37 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
struct FullNode {
|
|
|
|
int next[256] = {};
|
|
|
|
int sum = 0;
|
|
|
|
};
|
2020-07-13 03:40:03 +03:00
|
|
|
td::vector<FullNode> nodes_;
|
|
|
|
|
2020-07-09 22:15:37 +03:00
|
|
|
void do_add(int id, td::Slice value) {
|
|
|
|
nodes_[id].sum++;
|
|
|
|
if (value.empty()) {
|
|
|
|
return;
|
|
|
|
}
|
2020-07-13 03:40:03 +03:00
|
|
|
|
|
|
|
auto c = static_cast<td::uint8>(value[0]);
|
|
|
|
auto next_id = nodes_[id].next[c];
|
|
|
|
if (next_id == 0) {
|
|
|
|
next_id = static_cast<int>(nodes_.size());
|
2020-07-09 22:15:37 +03:00
|
|
|
nodes_.emplace_back();
|
|
|
|
nodes_[id].next[c] = next_id;
|
|
|
|
}
|
2020-07-13 03:40:03 +03:00
|
|
|
do_add(next_id, value.substr(1));
|
2020-07-09 22:15:37 +03:00
|
|
|
}
|
2020-07-13 03:40:03 +03:00
|
|
|
|
|
|
|
void do_dump(td::string path, int v) {
|
2020-07-13 04:04:30 +03:00
|
|
|
bool is_word_end = !path.empty() && path.back() == '\0';
|
|
|
|
|
|
|
|
bool need_stop = false;
|
|
|
|
int next_count = 0;
|
|
|
|
for (int c = 0; c < 256; c++) {
|
|
|
|
if (nodes_[v].next[c] != 0) {
|
|
|
|
need_stop |= c >= 128 || !(td::is_alpha(static_cast<char>(c)) || c == '.' || c == '_');
|
|
|
|
next_count++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
need_stop |= next_count == 0 || (next_count >= 2 && nodes_[v].sum <= nodes_[0].sum / 100);
|
|
|
|
|
|
|
|
if (is_word_end || need_stop) {
|
|
|
|
if (is_word_end) {
|
2020-07-13 03:40:03 +03:00
|
|
|
path.pop_back();
|
2020-07-13 04:04:30 +03:00
|
|
|
} else if (next_count != 1 || nodes_[v].next[0] == 0) {
|
2020-07-13 03:40:03 +03:00
|
|
|
path.push_back('*');
|
2020-07-09 22:15:37 +03:00
|
|
|
}
|
2020-07-13 04:04:30 +03:00
|
|
|
LOG(PLAIN) << nodes_[v].sum << " " << td::StringBuilder::FixedDouble(nodes_[v].sum * 100.0 / nodes_[0].sum, 2)
|
|
|
|
<< "% [" << td::format::escaped(path) << "]";
|
2020-07-09 22:15:37 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
for (int c = 0; c < 256; c++) {
|
|
|
|
auto next_id = nodes_[v].next[c];
|
|
|
|
if (next_id == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
2020-07-13 03:40:03 +03:00
|
|
|
do_dump(path + static_cast<char>(c), next_id);
|
2020-07-09 22:15:37 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Event types that main() treats specially: for events of these types a string key is parsed
// from the beginning of the event data and added to the per-type key statistics.
enum Magic {
  ConfigPmcMagic = 0x1f18,
  BinlogPmcMagic = 0x4327
};
|
|
|
|
|
2018-12-31 22:04:05 +03:00
|
|
|
int main(int argc, char *argv[]) {
|
|
|
|
if (argc < 2) {
|
2020-07-13 03:40:03 +03:00
|
|
|
LOG(PLAIN) << "Usage: binlog_dump <binlog_file_name>";
|
2018-12-31 22:04:05 +03:00
|
|
|
return 1;
|
|
|
|
}
|
2020-07-13 20:46:17 +03:00
|
|
|
td::string binlog_file_name = argv[1];
|
|
|
|
auto r_stat = td::stat(binlog_file_name);
|
|
|
|
if (r_stat.is_error() || r_stat.ok().size_ == 0 || !r_stat.ok().is_reg_) {
|
|
|
|
LOG(PLAIN) << "Wrong binlog file name specified";
|
|
|
|
LOG(PLAIN) << "Usage: binlog_dump <binlog_file_name>";
|
|
|
|
return 1;
|
|
|
|
}
|
2018-12-31 22:04:05 +03:00
|
|
|
|
|
|
|
struct Info {
|
|
|
|
std::size_t full_size = 0;
|
|
|
|
std::size_t compressed_size = 0;
|
2020-07-09 22:15:37 +03:00
|
|
|
Trie trie;
|
|
|
|
Trie compressed_trie;
|
2018-12-31 22:04:05 +03:00
|
|
|
};
|
|
|
|
std::map<td::uint64, Info> info;
|
|
|
|
|
|
|
|
SET_VERBOSITY_LEVEL(VERBOSITY_NAME(ERROR));
|
|
|
|
td::Binlog binlog;
|
|
|
|
binlog
|
2019-09-28 05:14:21 +03:00
|
|
|
.init(
|
2020-07-13 20:46:17 +03:00
|
|
|
binlog_file_name,
|
2019-09-28 05:14:21 +03:00
|
|
|
[&](auto &event) {
|
|
|
|
info[0].compressed_size += event.raw_event_.size();
|
|
|
|
info[event.type_].compressed_size += event.raw_event_.size();
|
2020-07-09 22:15:37 +03:00
|
|
|
if (event.type_ == ConfigPmcMagic || event.type_ == BinlogPmcMagic) {
|
2020-07-13 03:40:03 +03:00
|
|
|
auto key = td::TlParser(event.data_).fetch_string<td::Slice>();
|
|
|
|
info[event.type_].compressed_trie.add(key);
|
2020-07-09 22:15:37 +03:00
|
|
|
}
|
2019-09-28 05:14:21 +03:00
|
|
|
},
|
|
|
|
td::DbKey::raw_key("cucumber"), td::DbKey::empty(), -1,
|
|
|
|
[&](auto &event) mutable {
|
|
|
|
info[0].full_size += event.raw_event_.size();
|
|
|
|
info[event.type_].full_size += event.raw_event_.size();
|
2020-07-09 22:15:37 +03:00
|
|
|
if (event.type_ == ConfigPmcMagic || event.type_ == BinlogPmcMagic) {
|
2020-07-13 03:40:03 +03:00
|
|
|
auto key = td::TlParser(event.data_).fetch_string<td::Slice>();
|
|
|
|
info[event.type_].trie.add(key);
|
2020-07-09 22:15:37 +03:00
|
|
|
}
|
2019-09-28 05:14:21 +03:00
|
|
|
LOG(PLAIN) << "LogEvent[" << td::tag("id", td::format::as_hex(event.id_)) << td::tag("type", event.type_)
|
|
|
|
<< td::tag("flags", event.flags_) << td::tag("size", event.data_.size())
|
|
|
|
<< td::tag("data", td::format::escaped(event.data_)) << "]\n";
|
|
|
|
})
|
2018-12-31 22:04:05 +03:00
|
|
|
.ensure();
|
|
|
|
|
|
|
|
for (auto &it : info) {
|
2020-07-13 03:40:03 +03:00
|
|
|
LOG(PLAIN) << td::tag("handler", td::format::as_hex(it.first))
|
2018-12-31 22:04:05 +03:00
|
|
|
<< td::tag("full_size", td::format::as_size(it.second.full_size))
|
|
|
|
<< td::tag("compressed_size", td::format::as_size(it.second.compressed_size));
|
2020-07-09 22:15:37 +03:00
|
|
|
it.second.trie.dump();
|
2020-07-13 03:40:03 +03:00
|
|
|
if (it.second.full_size != it.second.compressed_size) {
|
|
|
|
it.second.compressed_trie.dump();
|
|
|
|
}
|
2018-12-31 22:04:05 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|