Add tg URLs parsing.
This commit is contained in:
parent
2fb9df33d2
commit
a630c87178
@ -493,6 +493,81 @@ static bool is_url_unicode_symbol(uint32 c) {
|
||||
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
|
||||
}
|
||||
|
||||
// Tells whether the code point c is accepted inside the path/query/fragment part of a URL.
// A line feed, '<', '>', '"' and the guillemets « » are always rejected; every other
// code point is judged by is_url_unicode_symbol.
static bool is_url_path_symbol(uint32 c) {
  const bool is_terminator = c == '\n' || c == '<' || c == '>' || c == '"' || c == 0xab /* « */ || c == 0xbb /* » */;
  if (is_terminator) {
    return false;
  }
  return is_url_unicode_symbol(c);
}
|
||||
|
||||
// Scans str for links with the "tg" or "ton" scheme and returns one sub-slice of str per match,
// in order of appearance. The accepted shape is described by the pattern
// '(tg|ton)://[a-z0-9_-]{1,253}([/?#][^\s\x{2000}-\x{200b}\x{200e}-\x{200f}\x{2016}-\x{206f}<>«»"]*)?'
// The scheme letters are compared after to_lower, and the scheme is not required to start at a word boundary.
static vector<Slice> match_tg_urls(Slice str) {
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  // characters which are stripped from the end of the path/query/fragment part
  Slice trailing_junk(".:;,('?!`");

  vector<Slice> urls;
  const unsigned char *cur = text_begin;
  // the shortest possible match ("tg://a") is 6 bytes long
  while (text_end - cur > 5) {
    // every candidate is anchored at the ':' of "://"
    cur = static_cast<const unsigned char *>(std::memchr(cur, ':', narrow_cast<int32>(text_end - cur)));
    if (cur == nullptr) {
      break;
    }

    // look backwards from the colon for one of the supported schemes
    const unsigned char *scheme_begin = nullptr;
    const bool has_slashes = text_end - cur >= 3 && cur[1] == '/' && cur[2] == '/';
    if (has_slashes) {
      const auto prefix_size = cur - text_begin;
      if (prefix_size >= 2 && to_lower(cur[-2]) == 't' && to_lower(cur[-1]) == 'g') {
        scheme_begin = cur - 2;
      } else if (prefix_size >= 3 && to_lower(cur[-3]) == 't' && to_lower(cur[-2]) == 'o' &&
                 to_lower(cur[-1]) == 'n') {
        scheme_begin = cur - 3;
      }
    }
    if (scheme_begin == nullptr) {
      // not a tg/ton link; resume the search right after this colon
      ++cur;
      continue;
    }

    // skip "://" and consume the host-like part, which is limited to 253 bytes
    cur += 3;
    const auto host_begin = cur;
    while (cur != text_end && cur - host_begin != 253 && is_alpha_digit_or_underscore_or_minus(*cur)) {
      ++cur;
    }
    if (cur == host_begin) {
      // nothing usable after "://", so this is not a link
      continue;
    }

    const bool has_tail = cur != text_end && (*cur == '/' || *cur == '?' || *cur == '#');
    if (has_tail) {
      const auto tail_min = cur + 1;  // position right after the '/', '?' or '#'
      auto tail_end = tail_min;
      // extend the tail while decoded code points are allowed in a URL path
      while (tail_end != text_end) {
        uint32 code = 0;
        auto after_code = next_utf8_unsafe(tail_end, &code, "match_tg_urls");
        if (!is_url_path_symbol(code)) {
          break;
        }
        tail_end = after_code;
      }
      // drop trailing punctuation, but never the delimiter itself
      while (tail_end > tail_min && trailing_junk.find(tail_end[-1]) < trailing_junk.size()) {
        --tail_end;
      }
      // a lone '/' is kept, whereas a lone '?' or '#' is left out of the link
      if (cur[0] == '/' || tail_end > tail_min) {
        cur = tail_end;
      }
    }

    urls.emplace_back(scheme_begin, cur);
  }
  return urls;
}
|
||||
|
||||
static vector<Slice> match_urls(Slice str) {
|
||||
vector<Slice> result;
|
||||
const unsigned char *begin = str.ubegin();
|
||||
@ -537,20 +612,6 @@ static vector<Slice> match_urls(Slice str) {
|
||||
return is_url_unicode_symbol(c);
|
||||
};
|
||||
|
||||
const auto &is_path_symbol = [](uint32 c) {
|
||||
switch (c) {
|
||||
case '\n':
|
||||
case '<':
|
||||
case '>':
|
||||
case '"':
|
||||
case 0xab: // «
|
||||
case 0xbb: // »
|
||||
return false;
|
||||
default:
|
||||
return is_url_unicode_symbol(c);
|
||||
}
|
||||
};
|
||||
|
||||
Slice bad_path_end_chars(".:;,('?!`");
|
||||
|
||||
while (true) {
|
||||
@ -624,7 +685,7 @@ static vector<Slice> match_urls(Slice str) {
|
||||
while (path_end_ptr != end) {
|
||||
uint32 code = 0;
|
||||
auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_urls 4");
|
||||
if (!is_path_symbol(code)) {
|
||||
if (!is_url_path_symbol(code)) {
|
||||
break;
|
||||
}
|
||||
path_end_ptr = next_ptr;
|
||||
@ -978,7 +1039,7 @@ static bool is_common_tld(Slice str) {
|
||||
return tlds.count(str_lower) > 0;
|
||||
}
|
||||
|
||||
Slice fix_url(Slice str) {
|
||||
static Slice fix_url(Slice str) {
|
||||
auto full_url = str;
|
||||
|
||||
bool has_protocol = false;
|
||||
@ -1156,6 +1217,10 @@ vector<Slice> find_bank_card_numbers(Slice str) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Externally visible entry point for tg/ton link detection: returns exactly what match_tg_urls finds in str.
vector<Slice> find_tg_urls(Slice str) {
  auto tg_urls = match_tg_urls(str);
  return tg_urls;
}
|
||||
|
||||
vector<std::pair<Slice, bool>> find_urls(Slice str) {
|
||||
vector<std::pair<Slice, bool>> result;
|
||||
for (auto url : match_urls(str)) {
|
||||
@ -1395,6 +1460,7 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands) {
|
||||
add_entities(MessageEntity::Type::Cashtag, find_cashtags);
|
||||
// TODO find_phone_numbers
|
||||
add_entities(MessageEntity::Type::BankCardNumber, find_bank_card_numbers);
|
||||
add_entities(MessageEntity::Type::Url, find_tg_urls);
|
||||
|
||||
auto urls = find_urls(text);
|
||||
for (auto &url : urls) {
|
||||
|
@ -142,6 +142,7 @@ vector<Slice> find_bot_commands(Slice str);
|
||||
vector<Slice> find_hashtags(Slice str);
|
||||
vector<Slice> find_cashtags(Slice str);
|
||||
vector<Slice> find_bank_card_numbers(Slice str);
|
||||
vector<Slice> find_tg_urls(Slice str);
|
||||
bool is_email_address(Slice str);
|
||||
vector<std::pair<Slice, bool>> find_urls(Slice str); // slice + is_email_address
|
||||
|
||||
|
@ -214,6 +214,62 @@ TEST(MessageEntities, bank_card_number) {
|
||||
check_bank_card_number("+1234567890128", {});
|
||||
}
|
||||
|
||||
// Runs td::find_tg_urls on str and compares the found links, converted to strings, with expected;
// a mismatch is reported through LOG(FATAL) together with the input text and both lists.
static void check_tg_url(const td::string &str, const td::vector<td::string> &expected) {
  td::vector<td::string> found;
  for (const auto &url : td::find_tg_urls(str)) {
    found.push_back(url.str());
  }
  if (found == expected) {
    return;
  }
  LOG(FATAL) << td::tag("text", str) << td::tag("got", td::format::as_array(found))
             << td::tag("expected", td::format::as_array(expected));
}
|
||||
|
||||
// Table of inputs and the tg/ton links expected to be found in them; the cases are checked in the listed order.
TEST(MessageEntities, tg_url) {
  struct TgUrlCase {
    td::string text;
    td::vector<td::string> expected;
  };

  const TgUrlCase cases[] = {
      // scheme and host detection
      {"", {}},
      {"tg://", {}},
      {"tg://a", {"tg://a"}},
      {"a", {}},
      {"stg://a", {"tg://a"}},
      {"asd asdas das ton:asd tg:test ton://resolve tg://resolve TON://_-RESOLVE_- TG://-_RESOLVE-_",
       {"ton://resolve", "tg://resolve", "TON://_-RESOLVE_-", "TG://-_RESOLVE-_"}},
      {"tg:test/", {}},
      {"tg:/test/", {}},

      // bare delimiters and trailing punctuation
      {"tg://test/", {"tg://test/"}},
      {"tg://test/?", {"tg://test/"}},
      {"tg://test/#", {"tg://test/#"}},
      {"tg://test?", {"tg://test"}},
      {"tg://test#", {"tg://test"}},
      {"tg://test/―asd―?asd=asd&asdas=―#――――", {"tg://test/―asd―?asd=asd&asdas=―#――――"}},
      {"tg://test/?asd", {"tg://test/?asd"}},
      {"tg://test/?.:;,('?!`.:;,('?!`", {"tg://test/"}},
      {"tg://test/#asdf", {"tg://test/#asdf"}},
      {"tg://test?asdf", {"tg://test?asdf"}},
      {"tg://test#asdf", {"tg://test#asdf"}},

      // symbols which terminate the query part
      {"tg://test?as‖df", {"tg://test?as"}},
      {"tg://test?as<df", {"tg://test?as"}},
      {"tg://test?as>df", {"tg://test?as"}},
      {"tg://test?as\"df", {"tg://test?as"}},
      {"tg://test?as«df", {"tg://test?as"}},
      {"tg://test?as»df", {"tg://test?as"}},

      // symbols which are kept inside the query part
      {"tg://test?as(df", {"tg://test?as(df"}},
      {"tg://test?as)df", {"tg://test?as)df"}},
      {"tg://test?as[df", {"tg://test?as[df"}},
      {"tg://test?as]df", {"tg://test?as]df"}},
      {"tg://test?as{df", {"tg://test?as{df"}},
      {"tg://test?as'df", {"tg://test?as'df"}},
      {"tg://test?as}df", {"tg://test?as}df"}},
      {"tg://test?as$df", {"tg://test?as$df"}},
      {"tg://test?as%df", {"tg://test?as%df"}},

      // symbols which are not allowed in the host part
      {"tg://%30/sccct", {}},
      {"tg://test:asd@google.com:80", {"tg://test"}},
      {"tg://google.com", {"tg://google"}},
      {"tg://google/.com", {"tg://google/.com"}},
      {"tg://127.0.0.1", {"tg://127"}},
      {"tg://б.а.н.а.на", {}},
  };

  for (const auto &test_case : cases) {
    check_tg_url(test_case.text, test_case.expected);
  }
}
|
||||
|
||||
static void check_is_email_address(const td::string &str, bool expected) {
|
||||
bool result = td::is_email_address(str);
|
||||
LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";
|
||||
@ -455,6 +511,7 @@ TEST(MessageEntities, url) {
|
||||
check_url("http://google_.com", {});
|
||||
check_url("http://google._com_", {});
|
||||
check_url("http://[2001:4860:0:2001::68]/", {}); // TODO
|
||||
check_url("tg://resolve", {});
|
||||
check_url("test.abd", {});
|
||||
check_url("/.b/..a @.....@/. a.ba", {"a.ba"});
|
||||
check_url("bbbbbbbbbbbbbb.@.@", {});
|
||||
|
Loading…
x
Reference in New Issue
Block a user