diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp
index 72f6b1c2e..31143923d 100644
--- a/td/telegram/MessageEntity.cpp
+++ b/td/telegram/MessageEntity.cpp
@@ -493,6 +493,81 @@ static bool is_url_unicode_symbol(uint32 c) {
   return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
 }
 
+static bool is_url_path_symbol(uint32 c) {
+  switch (c) {
+    case '\n':
+    case '<':
+    case '>':
+    case '"':
+    case 0xab:  // «
+    case 0xbb:  // »
+      return false;
+    default:
+      return is_url_unicode_symbol(c);
+  }
+}
+
+static vector<Slice> match_tg_urls(Slice str) {
+  vector<Slice> result;
+  const unsigned char *begin = str.ubegin();
+  const unsigned char *end = str.uend();
+  const unsigned char *ptr = begin;
+
+  // '(tg|ton)://[a-z0-9_-]{1,253}([/?#][^\s\x{2000}-\x{200b}\x{200e}-\x{200f}\x{2016}-\x{206f}<>«»"]*)?'
+
+  Slice bad_path_end_chars(".:;,('?!`");
+
+  while (end - ptr > 5) {
+    ptr = static_cast<const unsigned char *>(std::memchr(ptr, ':', narrow_cast<size_t>(end - ptr)));
+    if (ptr == nullptr) {
+      break;
+    }
+
+    const unsigned char *url_begin = nullptr;
+    if (end - ptr >= 3 && ptr[1] == '/' && ptr[2] == '/') {
+      if (ptr - begin >= 2 && to_lower(ptr[-2]) == 't' && to_lower(ptr[-1]) == 'g') {
+        url_begin = ptr - 2;
+      } else if (ptr - begin >= 3 && to_lower(ptr[-3]) == 't' && to_lower(ptr[-2]) == 'o' && to_lower(ptr[-1]) == 'n') {
+        url_begin = ptr - 3;
+      }
+    }
+    if (url_begin == nullptr) {
+      ++ptr;
+      continue;
+    }
+
+    ptr += 3;
+    auto domain_begin = ptr;
+    while (ptr != end && ptr - domain_begin != 253 && is_alpha_digit_or_underscore_or_minus(*ptr)) {
+      ptr++;
+    }
+    if (ptr == domain_begin) {
+      continue;
+    }
+
+    if (ptr != end && (*ptr == '/' || *ptr == '?' || *ptr == '#')) {
+      auto path_end_ptr = ptr + 1;
+      while (path_end_ptr != end) {
+        uint32 code = 0;
+        auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_tg_urls");
+        if (!is_url_path_symbol(code)) {
+          break;
+        }
+        path_end_ptr = next_ptr;
+      }
+      while (path_end_ptr > ptr + 1 && bad_path_end_chars.find(path_end_ptr[-1]) < bad_path_end_chars.size()) {
+        path_end_ptr--;
+      }
+      if (ptr[0] == '/' || path_end_ptr > ptr + 1) {
+        ptr = path_end_ptr;
+      }
+    }
+
+    result.emplace_back(url_begin, ptr);
+  }
+  return result;
+}
+
 static vector<Slice> match_urls(Slice str) {
   vector<Slice> result;
   const unsigned char *begin = str.ubegin();
@@ -537,20 +612,6 @@ static vector<Slice> match_urls(Slice str) {
     return is_url_unicode_symbol(c);
   };
 
-  const auto &is_path_symbol = [](uint32 c) {
-    switch (c) {
-      case '\n':
-      case '<':
-      case '>':
-      case '"':
-      case 0xab:  // «
-      case 0xbb:  // »
-        return false;
-      default:
-        return is_url_unicode_symbol(c);
-    }
-  };
-
   Slice bad_path_end_chars(".:;,('?!`");
 
   while (true) {
@@ -624,7 +685,7 @@ static vector<Slice> match_urls(Slice str) {
       while (path_end_ptr != end) {
         uint32 code = 0;
         auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_urls 4");
-        if (!is_path_symbol(code)) {
+        if (!is_url_path_symbol(code)) {
           break;
         }
         path_end_ptr = next_ptr;
@@ -978,7 +1039,7 @@ static bool is_common_tld(Slice str) {
   return tlds.count(str_lower) > 0;
 }
 
-Slice fix_url(Slice str) {
+static Slice fix_url(Slice str) {
   auto full_url = str;
 
   bool has_protocol = false;
@@ -1156,6 +1217,10 @@ vector<Slice> find_bank_card_numbers(Slice str) {
   return result;
 }
 
+vector<Slice> find_tg_urls(Slice str) {
+  return match_tg_urls(str);
+}
+
 vector<std::pair<Slice, bool>> find_urls(Slice str) {
   vector<std::pair<Slice, bool>> result;
   for (auto url : match_urls(str)) {
@@ -1395,6 +1460,7 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands) {
   add_entities(MessageEntity::Type::Cashtag, find_cashtags);
   // TODO find_phone_numbers
   add_entities(MessageEntity::Type::BankCardNumber, find_bank_card_numbers);
+  add_entities(MessageEntity::Type::Url, find_tg_urls);
 
   auto urls = find_urls(text);
   for (auto &url : urls) {
diff --git a/td/telegram/MessageEntity.h b/td/telegram/MessageEntity.h
index 4f385cd38..f731b29e5 100644
--- a/td/telegram/MessageEntity.h
+++ b/td/telegram/MessageEntity.h
@@ -142,6 +142,7 @@ vector<Slice> find_bot_commands(Slice str);
 vector<Slice> find_hashtags(Slice str);
 vector<Slice> find_cashtags(Slice str);
 vector<Slice> find_bank_card_numbers(Slice str);
+vector<Slice> find_tg_urls(Slice str);
 bool is_email_address(Slice str);
 vector<std::pair<Slice, bool>> find_urls(Slice str);  // slice + is_email_address
 
diff --git a/test/message_entities.cpp b/test/message_entities.cpp
index 767b082ed..86fae01ae 100644
--- a/test/message_entities.cpp
+++ b/test/message_entities.cpp
@@ -214,6 +214,62 @@ TEST(MessageEntities, bank_card_number) {
   check_bank_card_number("+1234567890128", {});
 }
 
+static void check_tg_url(const td::string &str, const td::vector<td::string> &expected) {
+  auto result_slice = td::find_tg_urls(str);
+  td::vector<td::string> result;
+  for (auto &it : result_slice) {
+    result.push_back(it.str());
+  }
+  if (result != expected) {
+    LOG(FATAL) << td::tag("text", str) << td::tag("got", td::format::as_array(result))
+               << td::tag("expected", td::format::as_array(expected));
+  }
+}
+
+TEST(MessageEntities, tg_url) {
+  check_tg_url("", {});
+  check_tg_url("tg://", {});
+  check_tg_url("tg://a", {"tg://a"});
+  check_tg_url("a", {});
+  check_tg_url("stg://a", {"tg://a"});
+  check_tg_url("asd asdas das ton:asd tg:test ton://resolve tg://resolve TON://_-RESOLVE_- TG://-_RESOLVE-_",
+               {"ton://resolve", "tg://resolve", "TON://_-RESOLVE_-", "TG://-_RESOLVE-_"});
+  check_tg_url("tg:test/", {});
+  check_tg_url("tg:/test/", {});
+  check_tg_url("tg://test/", {"tg://test/"});
+  check_tg_url("tg://test/?", {"tg://test/"});
+  check_tg_url("tg://test/#", {"tg://test/#"});
+  check_tg_url("tg://test?", {"tg://test"});
+  check_tg_url("tg://test#", {"tg://test"});
+  check_tg_url("tg://test/―asd―?asd=asd&asdas=―#――――", {"tg://test/―asd―?asd=asd&asdas=―#――――"});
+  check_tg_url("tg://test/?asd", {"tg://test/?asd"});
+  check_tg_url("tg://test/?.:;,('?!`.:;,('?!`", {"tg://test/"});
+  check_tg_url("tg://test/#asdf", {"tg://test/#asdf"});
+  check_tg_url("tg://test?asdf", {"tg://test?asdf"});
+  check_tg_url("tg://test#asdf", {"tg://test#asdf"});
+  check_tg_url("tg://test?as‖df", {"tg://test?as"});
+  check_tg_url("tg://test?as<df", {"tg://test?as"});
+  check_tg_url("tg://test?as>df", {"tg://test?as"});
+  check_tg_url("tg://test?as\"df", {"tg://test?as"});
+  check_tg_url("tg://test?as«df", {"tg://test?as"});
+  check_tg_url("tg://test?as»df", {"tg://test?as"});
+  check_tg_url("tg://test?as(df", {"tg://test?as(df"});
+  check_tg_url("tg://test?as)df", {"tg://test?as)df"});
+  check_tg_url("tg://test?as[df", {"tg://test?as[df"});
+  check_tg_url("tg://test?as]df", {"tg://test?as]df"});
+  check_tg_url("tg://test?as{df", {"tg://test?as{df"});
+  check_tg_url("tg://test?as'df", {"tg://test?as'df"});
+  check_tg_url("tg://test?as}df", {"tg://test?as}df"});
+  check_tg_url("tg://test?as$df", {"tg://test?as$df"});
+  check_tg_url("tg://test?as%df", {"tg://test?as%df"});
+  check_tg_url("tg://%30/sccct", {});
+  check_tg_url("tg://test:asd@google.com:80", {"tg://test"});
+  check_tg_url("tg://google.com", {"tg://google"});
+  check_tg_url("tg://google/.com", {"tg://google/.com"});
+  check_tg_url("tg://127.0.0.1", {"tg://127"});
+  check_tg_url("tg://б.а.н.а.на", {});
+}
+
 static void check_is_email_address(const td::string &str, bool expected) {
   bool result = td::is_email_address(str);
   LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";
@@ -455,6 +511,7 @@ TEST(MessageEntities, url) {
   check_url("http://google_.com", {});
   check_url("http://google._com_", {});
   check_url("http://[2001:4860:0:2001::68]/", {});  // TODO
+  check_url("tg://resolve", {});
   check_url("test.abd", {});
   check_url("/.b/..a @.....@/. a.ba", {"a.ba"});
   check_url("bbbbbbbbbbbbbb.@.@", {});