Add tg URLs parsing.

This commit is contained in:
levlam 2021-06-03 18:27:40 +03:00
parent 2fb9df33d2
commit a630c87178
3 changed files with 140 additions and 16 deletions

View File

@ -493,6 +493,81 @@ static bool is_url_unicode_symbol(uint32 c) {
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
}
// Checks whether the code point c is allowed inside the path part of a URL.
// A line feed, angle brackets, a double quote and guillemets always terminate the path;
// any other code point is accepted if is_url_unicode_symbol accepts it.
static bool is_url_path_symbol(uint32 c) {
  const bool is_path_terminator =
      c == '\n' || c == '<' || c == '>' || c == '"' || c == 0xab /* « */ || c == 0xbb /* » */;
  if (is_path_terminator) {
    return false;
  }
  return is_url_unicode_symbol(c);
}
// Finds all links with the scheme "tg" or "ton" in str and returns them as sub-slices of str
// in order of appearance. The scheme is matched case-insensitively. The matched pattern is
// '(tg|ton)://[a-z0-9_-]{1,253}([/?#][^\s\x{2000}-\x{200b}\x{200e}-\x{200f}\x{2016}-\x{206f}<>«»"]*)?'
static vector<Slice> match_tg_urls(Slice str) {
  vector<Slice> urls;
  const unsigned char *text_begin = str.ubegin();
  const unsigned char *text_end = str.uend();
  const unsigned char *cur = text_begin;

  // characters that are dropped from the end of the path part
  Slice trailing_junk(".:;,('?!`");

  // the shortest possible link is "tg://a", so at least 6 bytes must be left
  while (text_end - cur > 5) {
    // every link contains "://", so jump straight to the next colon
    cur = static_cast<const unsigned char *>(std::memchr(cur, ':', narrow_cast<int32>(text_end - cur)));
    if (cur == nullptr) {
      break;
    }

    // look backwards from the colon for one of the supported scheme names
    const unsigned char *scheme_begin = nullptr;
    const bool has_slashes = text_end - cur >= 3 && cur[1] == '/' && cur[2] == '/';
    if (has_slashes) {
      const auto prefix_size = cur - text_begin;
      if (prefix_size >= 2 && to_lower(cur[-2]) == 't' && to_lower(cur[-1]) == 'g') {
        scheme_begin = cur - 2;
      } else if (prefix_size >= 3 && to_lower(cur[-3]) == 't' && to_lower(cur[-2]) == 'o' &&
                 to_lower(cur[-1]) == 'n') {
        scheme_begin = cur - 3;
      }
    }
    if (scheme_begin == nullptr) {
      // not a link; continue the search after this colon
      ++cur;
      continue;
    }

    // skip "://" and consume up to 253 bytes of the domain
    const unsigned char *domain_begin = cur + 3;
    cur = domain_begin;
    while (cur != text_end && cur - domain_begin != 253 && is_alpha_digit_or_underscore_or_minus(*cur)) {
      ++cur;
    }
    if (cur == domain_begin) {
      // the domain must be non-empty
      continue;
    }

    const bool has_path = cur != text_end && (*cur == '/' || *cur == '?' || *cur == '#');
    if (has_path) {
      const unsigned char *path_first = cur + 1;
      const unsigned char *path_end = path_first;
      // extend the path over all allowed code points
      while (path_end != text_end) {
        uint32 code = 0;
        auto after_code = next_utf8_unsafe(path_end, &code, "match_tg_urls");
        if (!is_url_path_symbol(code)) {
          break;
        }
        path_end = after_code;
      }
      // drop trailing punctuation, which most likely belongs to the surrounding text
      while (path_end > path_first && trailing_junk.find(path_end[-1]) < trailing_junk.size()) {
        --path_end;
      }
      // a lone '/' is kept in the link, but a lone '?' or '#' is not
      if (*cur == '/' || path_end > path_first) {
        cur = path_end;
      }
    }

    urls.emplace_back(scheme_begin, cur);
  }
  return urls;
}
static vector<Slice> match_urls(Slice str) {
vector<Slice> result;
const unsigned char *begin = str.ubegin();
@ -537,20 +612,6 @@ static vector<Slice> match_urls(Slice str) {
return is_url_unicode_symbol(c);
};
const auto &is_path_symbol = [](uint32 c) {
switch (c) {
case '\n':
case '<':
case '>':
case '"':
case 0xab: // «
case 0xbb: // »
return false;
default:
return is_url_unicode_symbol(c);
}
};
Slice bad_path_end_chars(".:;,('?!`");
while (true) {
@ -624,7 +685,7 @@ static vector<Slice> match_urls(Slice str) {
while (path_end_ptr != end) {
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_urls 4");
if (!is_path_symbol(code)) {
if (!is_url_path_symbol(code)) {
break;
}
path_end_ptr = next_ptr;
@ -978,7 +1039,7 @@ static bool is_common_tld(Slice str) {
return tlds.count(str_lower) > 0;
}
Slice fix_url(Slice str) {
static Slice fix_url(Slice str) {
auto full_url = str;
bool has_protocol = false;
@ -1156,6 +1217,10 @@ vector<Slice> find_bank_card_numbers(Slice str) {
return result;
}
// Returns all tg:// and ton:// links found in str; the work is done by match_tg_urls.
vector<Slice> find_tg_urls(Slice str) {
  auto tg_urls = match_tg_urls(str);
  return tg_urls;
}
vector<std::pair<Slice, bool>> find_urls(Slice str) {
vector<std::pair<Slice, bool>> result;
for (auto url : match_urls(str)) {
@ -1395,6 +1460,7 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands) {
add_entities(MessageEntity::Type::Cashtag, find_cashtags);
// TODO find_phone_numbers
add_entities(MessageEntity::Type::BankCardNumber, find_bank_card_numbers);
add_entities(MessageEntity::Type::Url, find_tg_urls);
auto urls = find_urls(text);
for (auto &url : urls) {

View File

@ -142,6 +142,7 @@ vector<Slice> find_bot_commands(Slice str);
vector<Slice> find_hashtags(Slice str);
vector<Slice> find_cashtags(Slice str);
vector<Slice> find_bank_card_numbers(Slice str);
// returns all tg:// and ton:// URLs found in str as sub-slices of str
vector<Slice> find_tg_urls(Slice str);
bool is_email_address(Slice str);
vector<std::pair<Slice, bool>> find_urls(Slice str); // slice + is_email_address

View File

@ -214,6 +214,62 @@ TEST(MessageEntities, bank_card_number) {
check_bank_card_number("+1234567890128", {});
}
// Runs td::find_tg_urls on str and logs a FATAL message with the text, the found URLs and the
// expected URLs if the found URLs differ from expected.
static void check_tg_url(const td::string &str, const td::vector<td::string> &expected) {
  td::vector<td::string> found;
  for (const auto &url : td::find_tg_urls(str)) {
    found.push_back(url.str());
  }
  if (found == expected) {
    return;
  }
  LOG(FATAL) << td::tag("text", str) << td::tag("got", td::format::as_array(found))
             << td::tag("expected", td::format::as_array(expected));
}
// Table-driven checks of td::find_tg_urls: every entry is a text and the list of links expected in it.
TEST(MessageEntities, tg_url) {
  struct UrlCase {
    td::string text;
    td::vector<td::string> expected;
  };

  const td::vector<UrlCase> cases = {
      // scheme and domain
      {"", {}},
      {"tg://", {}},
      {"tg://a", {"tg://a"}},
      {"a", {}},
      {"stg://a", {"tg://a"}},
      {"asd asdas das ton:asd tg:test ton://resolve tg://resolve TON://_-RESOLVE_- TG://-_RESOLVE-_",
       {"ton://resolve", "tg://resolve", "TON://_-RESOLVE_-", "TG://-_RESOLVE-_"}},
      {"tg:test/", {}},
      {"tg:/test/", {}},
      // path, query and fragment, including trailing punctuation
      {"tg://test/", {"tg://test/"}},
      {"tg://test/?", {"tg://test/"}},
      {"tg://test/#", {"tg://test/#"}},
      {"tg://test?", {"tg://test"}},
      {"tg://test#", {"tg://test"}},
      {"tg://test/―asd―?asd=asd&asdas=―#――――", {"tg://test/―asd―?asd=asd&asdas=―#――――"}},
      {"tg://test/?asd", {"tg://test/?asd"}},
      {"tg://test/?.:;,('?!`.:;,('?!`", {"tg://test/"}},
      {"tg://test/#asdf", {"tg://test/#asdf"}},
      {"tg://test?asdf", {"tg://test?asdf"}},
      {"tg://test#asdf", {"tg://test#asdf"}},
      // characters which terminate the path
      {"tg://test?as‖df", {"tg://test?as"}},
      {"tg://test?as<df", {"tg://test?as"}},
      {"tg://test?as>df", {"tg://test?as"}},
      {"tg://test?as\"df", {"tg://test?as"}},
      {"tg://test?as«df", {"tg://test?as"}},
      {"tg://test?as»df", {"tg://test?as"}},
      // characters which are allowed inside the path
      {"tg://test?as(df", {"tg://test?as(df"}},
      {"tg://test?as)df", {"tg://test?as)df"}},
      {"tg://test?as[df", {"tg://test?as[df"}},
      {"tg://test?as]df", {"tg://test?as]df"}},
      {"tg://test?as{df", {"tg://test?as{df"}},
      {"tg://test?as'df", {"tg://test?as'df"}},
      {"tg://test?as}df", {"tg://test?as}df"}},
      {"tg://test?as$df", {"tg://test?as$df"}},
      {"tg://test?as%df", {"tg://test?as%df"}},
      // characters which are not allowed in the domain
      {"tg://%30/sccct", {}},
      {"tg://test:asd@google.com:80", {"tg://test"}},
      {"tg://google.com", {"tg://google"}},
      {"tg://google/.com", {"tg://google/.com"}},
      {"tg://127.0.0.1", {"tg://127"}},
      {"tg://б.а.н.а.на", {}},
  };

  for (const auto &url_case : cases) {
    check_tg_url(url_case.text, url_case.expected);
  }
}
static void check_is_email_address(const td::string &str, bool expected) {
bool result = td::is_email_address(str);
LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";
@ -455,6 +511,7 @@ TEST(MessageEntities, url) {
check_url("http://google_.com", {});
check_url("http://google._com_", {});
check_url("http://[2001:4860:0:2001::68]/", {}); // TODO
check_url("tg://resolve", {});
check_url("test.abd", {});
check_url("/.b/..a @.....@/. a.ba", {"a.ba"});
check_url("bbbbbbbbbbbbbb.@.@", {});