Implement find_cashtags.

GitOrigin-RevId: 40026a19c1ee1f1ea9c0157d28bdc0b67cbf9c58
This commit is contained in:
levlam 2018-03-07 20:29:33 +03:00
parent 4daeaa593e
commit a665c4e822
4 changed files with 141 additions and 29 deletions

View File

@ -177,14 +177,15 @@ static vector<Slice> match_mentions(Slice str) {
break;
}
uint32 prev = 0;
if (ptr != begin) {
uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
}
if (is_word_character(prev)) {
ptr++;
continue;
}
}
auto mention_begin = ++ptr;
while (ptr != end && is_alpha_digit_or_underscore(*ptr)) {
ptr++;
@ -220,14 +221,15 @@ static vector<Slice> match_bot_commands(Slice str) {
break;
}
uint32 prev = 0;
if (ptr != begin) {
uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
}
if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') {
ptr++;
continue;
}
}
auto command_begin = ++ptr;
while (ptr != end && is_alpha_digit_or_underscore(*ptr)) {
@ -264,17 +266,7 @@ static vector<Slice> match_bot_commands(Slice str) {
return result;
}
static vector<Slice> match_hashtags(Slice str) {
vector<Slice> result;
const unsigned char *begin = str.ubegin();
const unsigned char *end = str.uend();
const unsigned char *ptr = begin;
// '/(?<=^|[^\d_\pL\x{200c}])#([\d_\pL\x{200c}]{1,256})(?![\d_\pL\x{200c}]*#)/u'
// and at least one letter
UnicodeSimpleCategory category;
const auto &is_hashtag_letter = [&category](uint32 c) {
static bool is_hashtag_letter(uint32 c, UnicodeSimpleCategory &category) {
category = get_unicode_simple_category(c);
if (c == '_' || c == 0x200c) {
return true;
@ -286,7 +278,18 @@ static vector<Slice> match_hashtags(Slice str) {
default:
return false;
}
};
}
static vector<Slice> match_hashtags(Slice str) {
vector<Slice> result;
const unsigned char *begin = str.ubegin();
const unsigned char *end = str.uend();
const unsigned char *ptr = begin;
// '/(?<=^|[^\d_\pL\x{200c}])#([\d_\pL\x{200c}]{1,256})(?![\d_\pL\x{200c}]*#)/u'
// and at least one letter
UnicodeSimpleCategory category;
while (true) {
ptr = reinterpret_cast<const unsigned char *>(std::memchr(ptr, '#', narrow_cast<int32>(end - ptr)));
@ -294,14 +297,15 @@ static vector<Slice> match_hashtags(Slice str) {
break;
}
uint32 prev = 0;
if (ptr != begin) {
uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
}
if (is_hashtag_letter(prev)) {
if (is_hashtag_letter(prev, category)) {
ptr++;
continue;
}
}
auto hashtag_begin = ++ptr;
size_t hashtag_size = 0;
const unsigned char *hashtag_end = nullptr;
@ -309,7 +313,7 @@ static vector<Slice> match_hashtags(Slice str) {
while (ptr != end) {
uint32 code;
auto next_ptr = next_utf8_unsafe(ptr, &code);
if (!is_hashtag_letter(code)) {
if (!is_hashtag_letter(code, category)) {
break;
}
ptr = next_ptr;
@ -339,6 +343,54 @@ static vector<Slice> match_hashtags(Slice str) {
return result;
}
// Collects every cashtag occurring in str.
//
// A cashtag is a '$' followed by 3 to 8 ASCII uppercase letters ('A'-'Z').
// The character right before the '$' (if any) and the character right after
// the last letter (if any) must be neither '$' nor a hashtag letter as decided
// by is_hashtag_letter. Each returned Slice starts at the '$' and ends after
// the last letter; matches are reported in order of appearance.
static vector<Slice> match_cashtags(Slice str) {
  vector<Slice> cashtags;
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  // '/(?<=^|[^$\d_\pL\x{200c}])\$([A-Z]{3,8})(?![$\d_\pL\x{200c}])/u'

  UnicodeSimpleCategory category;
  // True for code points that are not allowed to touch a cashtag on either side.
  const auto is_forbidden_neighbour = [&category](uint32 code_point) {
    return is_hashtag_letter(code_point, category) || code_point == '$';
  };

  const unsigned char *cursor = text_begin;
  for (;;) {
    const auto *dollar =
        reinterpret_cast<const unsigned char *>(std::memchr(cursor, '$', narrow_cast<int32>(text_end - cursor)));
    if (dollar == nullptr) {
      break;
    }
    // Whatever happens below, the next search resumes after this '$' at the earliest.
    cursor = dollar + 1;

    if (dollar != text_begin) {
      // Decode the code point that ends right before the '$'.
      uint32 before;
      next_utf8_unsafe(prev_utf8_unsafe(dollar), &before);
      if (is_forbidden_neighbour(before)) {
        continue;
      }
    }

    // Consume the run of ASCII uppercase letters following the '$'.
    const unsigned char *const letters_begin = cursor;
    while (cursor != text_end && *cursor >= 'A' && *cursor <= 'Z') {
      ++cursor;
    }
    const auto letter_count = cursor - letters_begin;
    if (letter_count < 3 || letter_count > 8) {
      continue;
    }

    if (cursor != text_end) {
      // Decode the code point that starts right after the letters.
      uint32 after;
      next_utf8_unsafe(cursor, &after);
      if (is_forbidden_neighbour(after)) {
        continue;
      }
    }

    cashtags.emplace_back(dollar, cursor);
  }
  return cashtags;
}
static vector<Slice> match_urls(Slice str) {
vector<Slice> result;
const unsigned char *begin = str.ubegin();
@ -941,6 +993,10 @@ vector<Slice> find_hashtags(Slice str) {
return match_hashtags(str);
}
// Public entry point for cashtag detection: returns exactly what match_cashtags
// produces for str.
vector<Slice> find_cashtags(Slice str) {
  vector<Slice> cashtags = match_cashtags(str);
  return cashtags;
}
vector<std::pair<Slice, bool>> find_urls(Slice str) {
vector<std::pair<Slice, bool>> result;
for (auto url : match_urls(str)) {

View File

@ -145,6 +145,7 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands, bool onl
vector<Slice> find_mentions(Slice str);
vector<Slice> find_bot_commands(Slice str);
// Returns the result of match_hashtags for str.
vector<Slice> find_hashtags(Slice str);
// Returns the cashtags found in str; each Slice covers the leading '$' and the letters after it.
vector<Slice> find_cashtags(Slice str);
bool is_email_address(Slice str);
vector<std::pair<Slice, bool>> find_urls(Slice str); // slice + is_email_address

View File

@ -103,6 +103,9 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
return ptr + 4;
}
UNREACHABLE();
if (code) {
*code = 0;
}
return ptr;
}

View File

@ -109,6 +109,58 @@ TEST(MessageEntities, hashtag) {
check_hashtag(u8"#a\u2122", {"#a"});
}
// Test helper: runs find_cashtags on str, converts every returned slice to a
// string and aborts with a FATAL log entry (input, actual and expected lists)
// when the outcome differs from expected.
static void check_cashtag(string str, std::vector<string> expected) {
  std::vector<string> found;
  for (auto &cashtag : find_cashtags(str)) {
    found.push_back(cashtag.str());
  }
  if (found == expected) {
    return;
  }
  LOG(FATAL) << tag("text", str) << tag("got", format::as_array(found))
             << tag("expected", format::as_array(expected));
}
// Checks find_cashtags against a list of inputs; every call compares the found
// cashtags (as strings, in order) with the expected list.
TEST(MessageEntities, cashtag) {
// Empty input and inputs consisting only of '$' contain no cashtag.
check_cashtag("", {});
check_cashtag("$", {});
check_cashtag("$$", {});
check_cashtag("$$$", {});
// Lowercase letters after '$' never form a cashtag.
check_cashtag("$a", {});
check_cashtag(" $a", {});
check_cashtag("$a ", {});
check_cashtag("", {});
check_cashtag("$ab", {});
check_cashtag("$abc", {});
check_cashtag("$", {});
// Fewer than three uppercase letters is too short.
check_cashtag("$A", {});
check_cashtag("$AB", {});
// Cyrillic look-alike letters are not ASCII 'A'-'Z', so these do not match.
check_cashtag("$АBC", {});
check_cashtag("$АВС", {});
// Three to eight uppercase ASCII letters are accepted.
check_cashtag("$ABC", {"$ABC"});
check_cashtag("$ABCD", {"$ABCD"});
check_cashtag("$ABCDE", {"$ABCDE"});
check_cashtag("$ABCDEF", {"$ABCDEF"});
check_cashtag("$ABCDEFG", {"$ABCDEFG"});
check_cashtag("$ABCDEFGH", {"$ABCDEFGH"});
// A ninth letter or a digit directly after the letters rejects the candidate.
check_cashtag("$ABCDEFGHJ", {});
check_cashtag("$ABCDEFGH1", {});
// Surrounding spaces do not prevent a match and are not part of the result.
check_cashtag(" $XYZ", {"$XYZ"});
check_cashtag("$XYZ ", {"$XYZ"});
check_cashtag(" $XYZ ", {"$XYZ"});
// An adjacent '$', digit or letter on either side rejects the candidate.
check_cashtag(" $$XYZ ", {});
check_cashtag(" $XYZ$ ", {});
check_cashtag(" $ABC1 ", {});
check_cashtag(" $1ABC ", {});
check_cashtag(" 1$ABC ", {});
check_cashtag(" А$ABC ", {});
// "$ABC" is followed by '$' and "$DEF" is preceded by a letter; only the last two match.
check_cashtag("$ABC$DEF $GHI $KLM", {"$GHI", "$KLM"});
check_cashtag("$TEST", {"$TEST"});
// U+2122 next to the cashtag on either side does not prevent a match.
check_cashtag(u8"$ABC\u2122", {"$ABC"});
check_cashtag(u8"\u2122$ABC", {"$ABC"});
check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
}
static void check_is_email_address(string str, bool expected) {
bool result = is_email_address(str);
LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";