Improve URL and email matching.

This commit is contained in:
levlam 2021-09-09 17:59:24 +03:00
parent 9bf5b5343b
commit f63196f9fa
2 changed files with 40 additions and 19 deletions

View File

@ -668,6 +668,7 @@ static vector<Slice> match_urls(Slice str) {
case '<': case '<':
case '>': case '>':
case '"': case '"':
case '@':
case 0xab: // « case 0xab: // «
case 0xbb: // » case 0xbb: // »
return false; return false;
@ -697,8 +698,21 @@ static vector<Slice> match_urls(Slice str) {
continue; continue;
} }
const unsigned char *domain_begin_ptr = begin + dot_pos;
while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 0");
if (!is_domain_symbol(code)) {
domain_begin_ptr = next_ptr;
break;
}
}
const unsigned char *last_at_ptr = nullptr; const unsigned char *last_at_ptr = nullptr;
const unsigned char *domain_end_ptr = begin + dot_pos; const unsigned char *domain_end_ptr = begin + dot_pos;
if (domain_begin_ptr == begin || domain_begin_ptr[-1] != '@') {
// try to find '@' to the right if there is no '@' to the left
while (domain_end_ptr != end) { while (domain_end_ptr != end) {
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls"); auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls");
@ -711,6 +725,7 @@ static vector<Slice> match_urls(Slice str) {
domain_end_ptr = next_ptr; domain_end_ptr = next_ptr;
} }
domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1; domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1;
}
while (domain_end_ptr != end) { while (domain_end_ptr != end) {
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2"); auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2");
@ -720,16 +735,17 @@ static vector<Slice> match_urls(Slice str) {
domain_end_ptr = next_ptr; domain_end_ptr = next_ptr;
} }
const unsigned char *domain_begin_ptr = begin + dot_pos; if (last_at_ptr != nullptr) {
while (domain_begin_ptr != begin) { while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr); domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3"); auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3");
if (last_at_ptr == nullptr ? !is_domain_symbol(code) : !is_user_data_symbol(code)) { if (!is_user_data_symbol(code)) {
domain_begin_ptr = next_ptr; domain_begin_ptr = next_ptr;
break; break;
} }
} }
}
// LOG(ERROR) << "Domain: " << Slice(domain_begin_ptr, domain_end_ptr); // LOG(ERROR) << "Domain: " << Slice(domain_begin_ptr, domain_end_ptr);
const unsigned char *url_end_ptr = domain_end_ptr; const unsigned char *url_end_ptr = domain_end_ptr;
@ -776,6 +792,9 @@ static vector<Slice> match_urls(Slice str) {
bool is_bad = false; bool is_bad = false;
const unsigned char *url_begin_ptr = domain_begin_ptr; const unsigned char *url_begin_ptr = domain_begin_ptr;
if (url_begin_ptr != begin && url_begin_ptr[-1] == '@') { if (url_begin_ptr != begin && url_begin_ptr[-1] == '@') {
if (last_at_ptr != nullptr) {
is_bad = true;
}
auto user_data_begin_ptr = url_begin_ptr - 1; auto user_data_begin_ptr = url_begin_ptr - 1;
while (user_data_begin_ptr != begin) { while (user_data_begin_ptr != begin) {
user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr); user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);

View File

@ -480,7 +480,7 @@ TEST(MessageEntities, url) {
check_url(".", {}); check_url(".", {});
check_url("http://@google.com", {}); check_url("http://@google.com", {});
check_url("http://@goog.com", {}); // TODO: server fix check_url("http://@goog.com", {}); // TODO: server fix
check_url("http://@@google.com", {"http://@@google.com"}); check_url("http://@@google.com", {});
check_url("http://a@google.com", {"http://a@google.com"}); check_url("http://a@google.com", {"http://a@google.com"});
check_url("http://test@google.com", {"http://test@google.com"}); check_url("http://test@google.com", {"http://test@google.com"});
check_url("google.com:᪉᪉᪉᪉᪉", {"google.com"}); check_url("google.com:᪉᪉᪉᪉᪉", {"google.com"});
@ -692,6 +692,8 @@ TEST(MessageEntities, url) {
check_url(".?", {}); check_url(".?", {});
check_url("http://test―@―google―.―com―/―–―‐―/―/―/―?―‑―#―――", {"http://test―@―google―.―com―/―–―‐―/―/―/―?―‑―#―――"}); check_url("http://test―@―google―.―com―/―–―‐―/―/―/―?―‑―#―――", {"http://test―@―google―.―com―/―–―‐―/―/―/―?―‑―#―――"});
check_url("http://google.com/‖", {"http://google.com/"}); check_url("http://google.com/‖", {"http://google.com/"});
check_url("a@b@c.com", {}, {});
check_url("a@b.com:c@1", {}, {"a@b.com"});
} }
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities, static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,