Implement find_cashtags.
GitOrigin-RevId: 40026a19c1ee1f1ea9c0157d28bdc0b67cbf9c58
This commit is contained in:
parent
4daeaa593e
commit
a665c4e822
@ -177,13 +177,14 @@ static vector<Slice> match_mentions(Slice str) {
|
||||
break;
|
||||
}
|
||||
|
||||
uint32 prev = 0;
|
||||
if (ptr != begin) {
|
||||
uint32 prev;
|
||||
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
|
||||
}
|
||||
if (is_word_character(prev)) {
|
||||
ptr++;
|
||||
continue;
|
||||
|
||||
if (is_word_character(prev)) {
|
||||
ptr++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
auto mention_begin = ++ptr;
|
||||
while (ptr != end && is_alpha_digit_or_underscore(*ptr)) {
|
||||
@ -220,13 +221,14 @@ static vector<Slice> match_bot_commands(Slice str) {
|
||||
break;
|
||||
}
|
||||
|
||||
uint32 prev = 0;
|
||||
if (ptr != begin) {
|
||||
uint32 prev;
|
||||
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
|
||||
}
|
||||
if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') {
|
||||
ptr++;
|
||||
continue;
|
||||
|
||||
if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') {
|
||||
ptr++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
auto command_begin = ++ptr;
|
||||
@ -264,6 +266,20 @@ static vector<Slice> match_bot_commands(Slice str) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Reports whether the code point `c` is accepted as a hashtag letter.
//
// `category` is an output parameter: it is always overwritten with the result of
// get_unicode_simple_category(c), including when the function returns early for
// one of the specially accepted code points.
//
// Returns true for '_' and U+200C, and for any code point whose simple category is
// DecimalNumber or Letter; returns false otherwise.
static bool is_hashtag_letter(uint32 c, UnicodeSimpleCategory &category) {
  category = get_unicode_simple_category(c);

  // Underscore and U+200C are accepted regardless of the category computed above.
  const bool is_always_accepted = (c == '_' || c == 0x200c);
  if (is_always_accepted) {
    return true;
  }

  return category == UnicodeSimpleCategory::DecimalNumber || category == UnicodeSimpleCategory::Letter;
}
|
||||
|
||||
static vector<Slice> match_hashtags(Slice str) {
|
||||
vector<Slice> result;
|
||||
const unsigned char *begin = str.ubegin();
|
||||
@ -274,19 +290,6 @@ static vector<Slice> match_hashtags(Slice str) {
|
||||
// and at least one letter
|
||||
|
||||
UnicodeSimpleCategory category;
|
||||
const auto &is_hashtag_letter = [&category](uint32 c) {
|
||||
category = get_unicode_simple_category(c);
|
||||
if (c == '_' || c == 0x200c) {
|
||||
return true;
|
||||
}
|
||||
switch (category) {
|
||||
case UnicodeSimpleCategory::DecimalNumber:
|
||||
case UnicodeSimpleCategory::Letter:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
while (true) {
|
||||
ptr = reinterpret_cast<const unsigned char *>(std::memchr(ptr, '#', narrow_cast<int32>(end - ptr)));
|
||||
@ -294,13 +297,14 @@ static vector<Slice> match_hashtags(Slice str) {
|
||||
break;
|
||||
}
|
||||
|
||||
uint32 prev = 0;
|
||||
if (ptr != begin) {
|
||||
uint32 prev;
|
||||
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
|
||||
}
|
||||
if (is_hashtag_letter(prev)) {
|
||||
ptr++;
|
||||
continue;
|
||||
|
||||
if (is_hashtag_letter(prev, category)) {
|
||||
ptr++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
auto hashtag_begin = ++ptr;
|
||||
size_t hashtag_size = 0;
|
||||
@ -309,7 +313,7 @@ static vector<Slice> match_hashtags(Slice str) {
|
||||
while (ptr != end) {
|
||||
uint32 code;
|
||||
auto next_ptr = next_utf8_unsafe(ptr, &code);
|
||||
if (!is_hashtag_letter(code)) {
|
||||
if (!is_hashtag_letter(code, category)) {
|
||||
break;
|
||||
}
|
||||
ptr = next_ptr;
|
||||
@ -339,6 +343,54 @@ static vector<Slice> match_hashtags(Slice str) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Scans `str` for cashtags and returns a Slice for each one found, in order of
// appearance. Every returned Slice starts at the '$' sign and ends after the last
// uppercase letter.
//
// A cashtag is a '$' followed by 3 to 8 ASCII uppercase letters. The character
// before the '$' (if any) and the character after the letters (if any) must be
// neither '$' nor a hashtag letter as defined by is_hashtag_letter.
static vector<Slice> match_cashtags(Slice str) {
  vector<Slice> cashtags;
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();
  const unsigned char *cursor = text_begin;

  // '/(?<=^|[^$\d_\pL\x{200c}])\$([A-Z]{3,8})(?![$\d_\pL\x{200c}])/u'

  // is_hashtag_letter reports the category through an output parameter; the value is not used here.
  UnicodeSimpleCategory ignored_category;
  const auto breaks_cashtag = [&ignored_category](uint32 code_point) {
    return is_hashtag_letter(code_point, ignored_category) || code_point == '$';
  };

  for (;;) {
    const auto bytes_left = narrow_cast<int32>(text_end - cursor);
    cursor = reinterpret_cast<const unsigned char *>(std::memchr(cursor, '$', bytes_left));
    if (cursor == nullptr) {
      break;
    }
    const unsigned char *const dollar_pos = cursor;

    // Look-behind: reject the '$' if it directly follows a hashtag letter or another '$'.
    if (dollar_pos != text_begin) {
      uint32 preceding;
      next_utf8_unsafe(prev_utf8_unsafe(dollar_pos), &preceding);
      if (breaks_cashtag(preceding)) {
        cursor = dollar_pos + 1;
        continue;
      }
    }

    // Consume the run of ASCII uppercase letters that follows the '$'.
    const unsigned char *const letters_begin = dollar_pos + 1;
    cursor = letters_begin;
    while (cursor != text_end && *cursor >= 'A' && *cursor <= 'Z') {
      ++cursor;
    }

    const auto letters_count = cursor - letters_begin;
    const bool has_valid_length = letters_count >= 3 && letters_count <= 8;
    if (!has_valid_length) {
      // The search resumes from the first byte that is not an uppercase letter.
      continue;
    }

    // Look-ahead: reject the candidate if it is directly followed by a hashtag letter or '$'.
    if (cursor != text_end) {
      uint32 following;
      next_utf8_unsafe(cursor, &following);
      if (breaks_cashtag(following)) {
        continue;
      }
    }

    cashtags.emplace_back(dollar_pos, cursor);
  }

  return cashtags;
}
|
||||
|
||||
static vector<Slice> match_urls(Slice str) {
|
||||
vector<Slice> result;
|
||||
const unsigned char *begin = str.ubegin();
|
||||
@ -941,6 +993,10 @@ vector<Slice> find_hashtags(Slice str) {
|
||||
return match_hashtags(str);
|
||||
}
|
||||
|
||||
// Returns every cashtag found in `str`; the search itself is done by match_cashtags.
vector<Slice> find_cashtags(Slice str) {
  auto cashtags = match_cashtags(str);
  return cashtags;
}
|
||||
|
||||
vector<std::pair<Slice, bool>> find_urls(Slice str) {
|
||||
vector<std::pair<Slice, bool>> result;
|
||||
for (auto url : match_urls(str)) {
|
||||
|
@ -145,6 +145,7 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands, bool onl
|
||||
vector<Slice> find_mentions(Slice str);
|
||||
vector<Slice> find_bot_commands(Slice str);
|
||||
vector<Slice> find_hashtags(Slice str);
|
||||
vector<Slice> find_cashtags(Slice str);
|
||||
bool is_email_address(Slice str);
|
||||
vector<std::pair<Slice, bool>> find_urls(Slice str); // slice + is_email_address
|
||||
|
||||
|
@ -103,6 +103,9 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
|
||||
return ptr + 4;
|
||||
}
|
||||
UNREACHABLE();
|
||||
if (code) {
|
||||
*code = 0;
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
@ -109,6 +109,58 @@ TEST(MessageEntities, hashtag) {
|
||||
check_hashtag(u8"#a\u2122", {"#a"});
|
||||
}
|
||||
|
||||
// Test helper: runs find_cashtags on `str`, converts every found slice to a string
// and logs a FATAL message with the input, the actual result and the expectation
// when the result differs from `expected`.
static void check_cashtag(string str, std::vector<string> expected) {
  std::vector<string> actual;
  for (const auto &cashtag : find_cashtags(str)) {
    actual.push_back(cashtag.str());
  }

  if (actual == expected) {
    return;
  }

  LOG(FATAL) << tag("text", str) << tag("got", format::as_array(actual))
             << tag("expected", format::as_array(expected));
}
|
||||
|
||||
TEST(MessageEntities, cashtag) {
  // One input text together with the cashtags expected to be found in it, in order.
  struct CashtagCase {
    string text;
    std::vector<string> expected;
  };

  // The cases are checked in the listed order.
  const CashtagCase cases[] = {
      // No letters, lowercase letters, non-ASCII letters or fewer than three letters.
      {"", {}},
      {"$", {}},
      {"$$", {}},
      {"$$$", {}},
      {"$a", {}},
      {" $a", {}},
      {"$a ", {}},
      {" $я ", {}},
      {"$ab", {}},
      {"$abc", {}},
      {"$", {}},
      {"$A", {}},
      {"$AB", {}},
      {"$АBC", {}},
      {"$АВС", {}},
      // Three to eight uppercase ASCII letters are accepted; longer runs are not.
      {"$ABC", {"$ABC"}},
      {"$ABCD", {"$ABCD"}},
      {"$ABCDE", {"$ABCDE"}},
      {"$ABCDEF", {"$ABCDEF"}},
      {"$ABCDEFG", {"$ABCDEFG"}},
      {"$ABCDEFGH", {"$ABCDEFGH"}},
      {"$ABCDEFGHJ", {}},
      {"$ABCDEFGH1", {}},
      // Characters surrounding the candidate.
      {" $XYZ", {"$XYZ"}},
      {"$XYZ ", {"$XYZ"}},
      {" $XYZ ", {"$XYZ"}},
      {" $$XYZ ", {}},
      {" $XYZ$ ", {}},
      {" $ABC1 ", {}},
      {" $1ABC ", {}},
      {" 1$ABC ", {}},
      {" А$ABC ", {}},
      {"$ABC$DEF $GHI $KLM", {"$GHI", "$KLM"}},
      {"$TEST", {"$TEST"}},
      // U+2122 next to the candidate does not prevent a match.
      {u8"$ABC\u2122", {"$ABC"}},
      {u8"\u2122$ABC", {"$ABC"}},
      {u8"\u2122$ABC\u2122", {"$ABC"}},
  };

  for (const auto &test_case : cases) {
    check_cashtag(test_case.text, test_case.expected);
  }
}
|
||||
|
||||
static void check_is_email_address(string str, bool expected) {
|
||||
bool result = is_email_address(str);
|
||||
LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";
|
||||
|
Reference in New Issue
Block a user