diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index cbe9f13f..aa275e4e 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -177,13 +177,14 @@ static vector match_mentions(Slice str) { break; } - uint32 prev = 0; if (ptr != begin) { + uint32 prev; next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev); - } - if (is_word_character(prev)) { - ptr++; - continue; + + if (is_word_character(prev)) { + ptr++; + continue; + } } auto mention_begin = ++ptr; while (ptr != end && is_alpha_digit_or_underscore(*ptr)) { @@ -220,13 +221,14 @@ static vector match_bot_commands(Slice str) { break; } - uint32 prev = 0; if (ptr != begin) { + uint32 prev; next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev); - } - if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') { - ptr++; - continue; + + if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') { + ptr++; + continue; + } } auto command_begin = ++ptr; @@ -264,6 +266,20 @@ static vector match_bot_commands(Slice str) { return result; } +static bool is_hashtag_letter(uint32 c, UnicodeSimpleCategory &category) { + category = get_unicode_simple_category(c); + if (c == '_' || c == 0x200c) { + return true; + } + switch (category) { + case UnicodeSimpleCategory::DecimalNumber: + case UnicodeSimpleCategory::Letter: + return true; + default: + return false; + } +} + static vector match_hashtags(Slice str) { vector result; const unsigned char *begin = str.ubegin(); @@ -274,19 +290,6 @@ static vector match_hashtags(Slice str) { // and at least one letter UnicodeSimpleCategory category; - const auto &is_hashtag_letter = [&category](uint32 c) { - category = get_unicode_simple_category(c); - if (c == '_' || c == 0x200c) { - return true; - } - switch (category) { - case UnicodeSimpleCategory::DecimalNumber: - case UnicodeSimpleCategory::Letter: - return true; - default: - return false; - } - }; while (true) { ptr = reinterpret_cast(std::memchr(ptr, '#', narrow_cast(end - ptr))); @@ -294,13 +297,14 @@ static vector match_hashtags(Slice str) { break; } - uint32 prev = 0; if (ptr != begin) { + uint32 prev; next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev); - } - if (is_hashtag_letter(prev)) { - ptr++; - continue; + + if (is_hashtag_letter(prev, category)) { + ptr++; + continue; + } } auto hashtag_begin = ++ptr; size_t hashtag_size = 0; @@ -309,7 +313,7 @@ static vector match_hashtags(Slice str) { while (ptr != end) { uint32 code; auto next_ptr = next_utf8_unsafe(ptr, &code); - if (!is_hashtag_letter(code)) { + if (!is_hashtag_letter(code, category)) { break; } ptr = next_ptr; @@ -339,6 +343,54 @@ static vector match_hashtags(Slice str) { return result; } +static vector match_cashtags(Slice str) { + vector result; + const unsigned char *begin = str.ubegin(); + const unsigned char *end = str.uend(); + const unsigned char *ptr = begin; + + // '/(?<=^|[^$\d_\pL\x{200c}])\$([A-Z]{3,8})(?![$\d_\pL\x{200c}])/u' + + UnicodeSimpleCategory category; + while (true) { + ptr = reinterpret_cast(std::memchr(ptr, '$', narrow_cast(end - ptr))); + if (ptr == nullptr) { + break; + } + + if (ptr != begin) { + uint32 prev; + next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev); + + if (is_hashtag_letter(prev, category) || prev == '$') { + ptr++; + continue; + } + } + + auto cashtag_begin = ++ptr; + while (ptr != end && 'Z' >= *ptr && *ptr >= 'A') { + ptr++; + } + auto cashtag_end = ptr; + auto cashtag_size = cashtag_end - cashtag_begin; + if (cashtag_size < 3 || cashtag_size > 8) { + continue; + } + + if (cashtag_end != end) { + uint32 code; + next_utf8_unsafe(ptr, &code); + if (is_hashtag_letter(code, category) || code == '$') { + continue; + } + } + + result.emplace_back(cashtag_begin - 1, cashtag_end); + } + return result; +} + static vector match_urls(Slice str) { vector result; const unsigned char *begin = str.ubegin(); @@ -941,6 +993,10 @@ vector find_hashtags(Slice str) { return match_hashtags(str); } +vector find_cashtags(Slice str) { + return match_cashtags(str); +} + vector> find_urls(Slice str) { vector> result; for (auto url : match_urls(str)) { diff --git a/td/telegram/MessageEntity.h b/td/telegram/MessageEntity.h index a74b9dcc..6a5e5001 100644 --- a/td/telegram/MessageEntity.h +++ b/td/telegram/MessageEntity.h @@ -145,6 +145,7 @@ vector find_entities(Slice text, bool skip_bot_commands, bool onl vector find_mentions(Slice str); vector find_bot_commands(Slice str); vector find_hashtags(Slice str); +vector find_cashtags(Slice str); bool is_email_address(Slice str); vector> find_urls(Slice str); // slice + is_email_address diff --git a/tdutils/td/utils/utf8.cpp b/tdutils/td/utils/utf8.cpp index 0e89c2bd..50f82d63 100644 --- a/tdutils/td/utils/utf8.cpp +++ b/tdutils/td/utils/utf8.cpp @@ -103,6 +103,9 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) { return ptr + 4; } UNREACHABLE(); + if (code) { + *code = 0; + } return ptr; } diff --git a/test/message_entities.cpp b/test/message_entities.cpp index 2d8be9b8..473a8714 100644 --- a/test/message_entities.cpp +++ b/test/message_entities.cpp @@ -109,6 +109,58 @@ TEST(MessageEntities, hashtag) { check_hashtag(u8"#a\u2122", {"#a"}); } +static void check_cashtag(string str, std::vector expected) { + auto result_slice = find_cashtags(str); + std::vector result; + for (auto &it : result_slice) { + result.push_back(it.str()); + } + if (result != expected) { + LOG(FATAL) << tag("text", str) << tag("got", format::as_array(result)) + << tag("expected", format::as_array(expected)); + } +} + +TEST(MessageEntities, cashtag) { + check_cashtag("", {}); + check_cashtag("$", {}); + check_cashtag("$$", {}); + check_cashtag("$$$", {}); + check_cashtag("$a", {}); + check_cashtag(" $a", {}); + check_cashtag("$a ", {}); + check_cashtag(" $я ", {}); + check_cashtag("$ab", {}); + check_cashtag("$abc", {}); + check_cashtag("$", {}); + check_cashtag("$A", {}); + check_cashtag("$AB", {}); + check_cashtag("$АBC", {}); + check_cashtag("$АВС", {}); + check_cashtag("$ABC", {"$ABC"}); + check_cashtag("$ABCD", {"$ABCD"}); + check_cashtag("$ABCDE", {"$ABCDE"}); + check_cashtag("$ABCDEF", {"$ABCDEF"}); + check_cashtag("$ABCDEFG", {"$ABCDEFG"}); + check_cashtag("$ABCDEFGH", {"$ABCDEFGH"}); + check_cashtag("$ABCDEFGHJ", {}); + check_cashtag("$ABCDEFGH1", {}); + check_cashtag(" $XYZ", {"$XYZ"}); + check_cashtag("$XYZ ", {"$XYZ"}); + check_cashtag(" $XYZ ", {"$XYZ"}); + check_cashtag(" $$XYZ ", {}); + check_cashtag(" $XYZ$ ", {}); + check_cashtag(" $ABC1 ", {}); + check_cashtag(" $1ABC ", {}); + check_cashtag(" 1$ABC ", {}); + check_cashtag(" А$ABC ", {}); + check_cashtag("$ABC$DEF $GHI $KLM", {"$GHI", "$KLM"}); + check_cashtag("$TEST", {"$TEST"}); + check_cashtag(u8"$ABC\u2122", {"$ABC"}); + check_cashtag(u8"\u2122$ABC", {"$ABC"}); + check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"}); +} + static void check_is_email_address(string str, bool expected) { bool result = is_email_address(str); LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";