Improve URL and email matching.

This commit is contained in:
levlam 2021-09-09 17:59:24 +03:00
parent 9bf5b5343b
commit f63196f9fa
2 changed files with 40 additions and 19 deletions

View File

@ -668,6 +668,7 @@ static vector<Slice> match_urls(Slice str) {
case '<':
case '>':
case '"':
case '@':
case 0xab: // «
case 0xbb: // »
return false;
@ -697,20 +698,34 @@ static vector<Slice> match_urls(Slice str) {
continue;
}
const unsigned char *last_at_ptr = nullptr;
const unsigned char *domain_end_ptr = begin + dot_pos;
while (domain_end_ptr != end) {
const unsigned char *domain_begin_ptr = begin + dot_pos;
while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls");
if (code == '@') {
last_at_ptr = domain_end_ptr;
}
if (!is_user_data_symbol(code)) {
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 0");
if (!is_domain_symbol(code)) {
domain_begin_ptr = next_ptr;
break;
}
domain_end_ptr = next_ptr;
}
domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1;
const unsigned char *last_at_ptr = nullptr;
const unsigned char *domain_end_ptr = begin + dot_pos;
if (domain_begin_ptr == begin || domain_begin_ptr[-1] != '@') {
// try to find '@' to the right if there is no '@' to the left
while (domain_end_ptr != end) {
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls");
if (code == '@') {
last_at_ptr = domain_end_ptr;
}
if (!is_user_data_symbol(code)) {
break;
}
domain_end_ptr = next_ptr;
}
domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1;
}
while (domain_end_ptr != end) {
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2");
@ -720,14 +735,15 @@ static vector<Slice> match_urls(Slice str) {
domain_end_ptr = next_ptr;
}
const unsigned char *domain_begin_ptr = begin + dot_pos;
while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3");
if (last_at_ptr == nullptr ? !is_domain_symbol(code) : !is_user_data_symbol(code)) {
domain_begin_ptr = next_ptr;
break;
if (last_at_ptr != nullptr) {
while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3");
if (!is_user_data_symbol(code)) {
domain_begin_ptr = next_ptr;
break;
}
}
}
// LOG(ERROR) << "Domain: " << Slice(domain_begin_ptr, domain_end_ptr);
@ -776,6 +792,9 @@ static vector<Slice> match_urls(Slice str) {
bool is_bad = false;
const unsigned char *url_begin_ptr = domain_begin_ptr;
if (url_begin_ptr != begin && url_begin_ptr[-1] == '@') {
if (last_at_ptr != nullptr) {
is_bad = true;
}
auto user_data_begin_ptr = url_begin_ptr - 1;
while (user_data_begin_ptr != begin) {
user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);

View File

@ -480,7 +480,7 @@ TEST(MessageEntities, url) {
check_url(".", {});
check_url("http://@google.com", {});
check_url("http://@goog.com", {}); // TODO: server fix
check_url("http://@@google.com", {"http://@@google.com"});
check_url("http://@@google.com", {});
check_url("http://a@google.com", {"http://a@google.com"});
check_url("http://test@google.com", {"http://test@google.com"});
check_url("google.com:᪉᪉᪉᪉᪉", {"google.com"});
@ -692,6 +692,8 @@ TEST(MessageEntities, url) {
check_url(".?", {});
check_url("http://test―@―google―.―com―/―–―‐―/―/―/―?―‑―#―――", {"http://test―@―google―.―com―/―–―‐―/―/―/―?―‑―#―――"});
check_url("http://google.com/‖", {"http://google.com/"});
check_url("a@b@c.com", {}, {});
check_url("a@b.com:c@1", {}, {"a@b.com"});
}
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,