Improve URL and email matching.
This commit is contained in:
parent
9bf5b5343b
commit
f63196f9fa
@@ -668,6 +668,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
case '<':
|
case '<':
|
||||||
case '>':
|
case '>':
|
||||||
case '"':
|
case '"':
|
||||||
|
case '@':
|
||||||
case 0xab: // «
|
case 0xab: // «
|
||||||
case 0xbb: // »
|
case 0xbb: // »
|
||||||
return false;
|
return false;
|
||||||
@@ -697,8 +698,21 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const unsigned char *domain_begin_ptr = begin + dot_pos;
|
||||||
|
while (domain_begin_ptr != begin) {
|
||||||
|
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
|
||||||
|
uint32 code = 0;
|
||||||
|
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 0");
|
||||||
|
if (!is_domain_symbol(code)) {
|
||||||
|
domain_begin_ptr = next_ptr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const unsigned char *last_at_ptr = nullptr;
|
const unsigned char *last_at_ptr = nullptr;
|
||||||
const unsigned char *domain_end_ptr = begin + dot_pos;
|
const unsigned char *domain_end_ptr = begin + dot_pos;
|
||||||
|
if (domain_begin_ptr == begin || domain_begin_ptr[-1] != '@') {
|
||||||
|
// try to find '@' to the right if there is no '@' to the left
|
||||||
while (domain_end_ptr != end) {
|
while (domain_end_ptr != end) {
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls");
|
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls");
|
||||||
@@ -711,6 +725,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
domain_end_ptr = next_ptr;
|
domain_end_ptr = next_ptr;
|
||||||
}
|
}
|
||||||
domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1;
|
domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1;
|
||||||
|
}
|
||||||
while (domain_end_ptr != end) {
|
while (domain_end_ptr != end) {
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2");
|
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2");
|
||||||
@@ -720,16 +735,17 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
domain_end_ptr = next_ptr;
|
domain_end_ptr = next_ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
const unsigned char *domain_begin_ptr = begin + dot_pos;
|
if (last_at_ptr != nullptr) {
|
||||||
while (domain_begin_ptr != begin) {
|
while (domain_begin_ptr != begin) {
|
||||||
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
|
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3");
|
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3");
|
||||||
if (last_at_ptr == nullptr ? !is_domain_symbol(code) : !is_user_data_symbol(code)) {
|
if (!is_user_data_symbol(code)) {
|
||||||
domain_begin_ptr = next_ptr;
|
domain_begin_ptr = next_ptr;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// LOG(ERROR) << "Domain: " << Slice(domain_begin_ptr, domain_end_ptr);
|
// LOG(ERROR) << "Domain: " << Slice(domain_begin_ptr, domain_end_ptr);
|
||||||
|
|
||||||
const unsigned char *url_end_ptr = domain_end_ptr;
|
const unsigned char *url_end_ptr = domain_end_ptr;
|
||||||
@@ -776,6 +792,9 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
bool is_bad = false;
|
bool is_bad = false;
|
||||||
const unsigned char *url_begin_ptr = domain_begin_ptr;
|
const unsigned char *url_begin_ptr = domain_begin_ptr;
|
||||||
if (url_begin_ptr != begin && url_begin_ptr[-1] == '@') {
|
if (url_begin_ptr != begin && url_begin_ptr[-1] == '@') {
|
||||||
|
if (last_at_ptr != nullptr) {
|
||||||
|
is_bad = true;
|
||||||
|
}
|
||||||
auto user_data_begin_ptr = url_begin_ptr - 1;
|
auto user_data_begin_ptr = url_begin_ptr - 1;
|
||||||
while (user_data_begin_ptr != begin) {
|
while (user_data_begin_ptr != begin) {
|
||||||
user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);
|
user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);
|
||||||
|
@@ -480,7 +480,7 @@ TEST(MessageEntities, url) {
|
|||||||
check_url(".", {});
|
check_url(".", {});
|
||||||
check_url("http://@google.com", {});
|
check_url("http://@google.com", {});
|
||||||
check_url("http://@goog.com", {}); // TODO: server fix
|
check_url("http://@goog.com", {}); // TODO: server fix
|
||||||
check_url("http://@@google.com", {"http://@@google.com"});
|
check_url("http://@@google.com", {});
|
||||||
check_url("http://a@google.com", {"http://a@google.com"});
|
check_url("http://a@google.com", {"http://a@google.com"});
|
||||||
check_url("http://test@google.com", {"http://test@google.com"});
|
check_url("http://test@google.com", {"http://test@google.com"});
|
||||||
check_url("google.com:᪉᪉᪉᪉᪉", {"google.com"});
|
check_url("google.com:᪉᪉᪉᪉᪉", {"google.com"});
|
||||||
@@ -692,6 +692,8 @@ TEST(MessageEntities, url) {
|
|||||||
check_url(".?", {});
|
check_url(".?", {});
|
||||||
check_url("http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――", {"http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――"});
|
check_url("http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――", {"http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――"});
|
||||||
check_url("http://google.com/‖", {"http://google.com/"});
|
check_url("http://google.com/‖", {"http://google.com/"});
|
||||||
|
check_url("a@b@c.com", {}, {});
|
||||||
|
check_url("a@b.com:c@1", {}, {"a@b.com"});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,
|
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,
|
||||||
|
Loading…
Reference in New Issue
Block a user