Allow dashes in URLs.

This commit is contained in:
levlam 2021-06-03 15:21:26 +03:00
parent 1ab2f9fe9d
commit c9de490c7f
2 changed files with 13 additions and 12 deletions

View File

@ -485,6 +485,14 @@ static vector<Slice> match_bank_card_numbers(Slice str) {
return result;
}
// Tells whether the Unicode code point `c` is accepted as a URL symbol.
//
// Code points inside the General Punctuation block (U+2000..U+206F) are
// accepted only when they are Zero Width Non-Joiner (U+200C), Zero Width
// Joiner (U+200D), or one of the dashes U+2010..U+2015. Every code point
// outside that block is accepted unless get_unicode_simple_category()
// reports it as UnicodeSimpleCategory::Separator.
static bool is_url_unicode_symbol(uint32 c) {
  const bool in_general_punctuation = c >= 0x2000 && c <= 0x206f;
  if (!in_general_punctuation) {
    // Outside General Punctuation the decision is delegated to the category lookup.
    return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
  }

  // Inside General Punctuation only an explicit allow-list is accepted.
  const bool is_zero_width_joiner = c == 0x200c || c == 0x200d;
  const bool is_dash = c >= 0x2010 && c <= 0x2015;
  return is_zero_width_joiner || is_dash;
}
static vector<Slice> match_urls(Slice str) {
vector<Slice> result;
const unsigned char *begin = str.ubegin();
@ -518,10 +526,7 @@ static vector<Slice> match_urls(Slice str) {
case 0xbb: // »
return false;
default:
if (0x2000 <= c && c <= 0x206f) { // General Punctuation
return c == 0x200c || c == 0x200d; // Zero Width Non-Joiner/Joiner
}
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
return is_url_unicode_symbol(c);
}
};
@ -529,10 +534,7 @@ static vector<Slice> match_urls(Slice str) {
if (c < 0xc0) {
return c == '.' || is_alpha_digit_or_underscore_or_minus(c) || c == '~';
}
if (0x2000 <= c && c <= 0x206f) { // General Punctuation
return c == 0x200c || c == 0x200d; // Zero Width Non-Joiner/Joiner
}
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
return is_url_unicode_symbol(c);
};
const auto &is_path_symbol = [](uint32 c) {
@ -545,10 +547,7 @@ static vector<Slice> match_urls(Slice str) {
case 0xbb: // »
return false;
default:
if (0x2000 <= c && c <= 0x206f) { // General Punctuation
return c == 0x200c || c == 0x200d; // Zero Width Non-Joiner/Joiner
}
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
return is_url_unicode_symbol(c);
}
};

View File

@ -582,6 +582,8 @@ TEST(MessageEntities, url) {
check_url("👉http://ab.com/cdefgh-1IJ", {"http://ab.com/cdefgh-1IJ"});
check_url("...👉http://ab.com/cdefgh-1IJ", {}); // TODO
check_url(".?", {});
check_url("http://test―@―google―.―com―/―–―‐―/―/―/―?―‑―#―――", {"http://test―@―google―.―com―/―–―‐―/―/―/―?―‑―#―――"});
check_url("http://google.com/‖", {"http://google.com/"});
}
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,