Allow dashes in URLs.
This commit is contained in:
parent
1ab2f9fe9d
commit
c9de490c7f
@ -485,6 +485,14 @@ static vector<Slice> match_bank_card_numbers(Slice str) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Decides whether the Unicode code point `c` is accepted as a URL symbol.
//
// Inside the General Punctuation block (U+2000..U+206F) only the Zero Width
// Non-Joiner (U+200C), the Zero Width Joiner (U+200D) and the dashes
// U+2010..U+2015 are accepted. Any other code point is accepted unless its
// simple Unicode category is Separator.
static bool is_url_unicode_symbol(uint32 c) {
  const bool in_general_punctuation = c >= 0x2000 && c <= 0x206f;
  if (!in_general_punctuation) {
    // Outside General Punctuation: everything except separators is allowed.
    return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
  }

  switch (c) {
    case 0x200c:  // Zero Width Non-Joiner
    case 0x200d:  // Zero Width Joiner
      return true;
    default:
      // Various dashes
      return c >= 0x2010 && c <= 0x2015;
  }
}
|
||||||
|
|
||||||
static vector<Slice> match_urls(Slice str) {
|
static vector<Slice> match_urls(Slice str) {
|
||||||
vector<Slice> result;
|
vector<Slice> result;
|
||||||
const unsigned char *begin = str.ubegin();
|
const unsigned char *begin = str.ubegin();
|
||||||
@ -518,10 +526,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
case 0xbb: // »
|
case 0xbb: // »
|
||||||
return false;
|
return false;
|
||||||
default:
|
default:
|
||||||
if (0x2000 <= c && c <= 0x206f) { // General Punctuation
|
return is_url_unicode_symbol(c);
|
||||||
return c == 0x200c || c == 0x200d; // Zero Width Non-Joiner/Joiner
|
|
||||||
}
|
|
||||||
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -529,10 +534,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
if (c < 0xc0) {
|
if (c < 0xc0) {
|
||||||
return c == '.' || is_alpha_digit_or_underscore_or_minus(c) || c == '~';
|
return c == '.' || is_alpha_digit_or_underscore_or_minus(c) || c == '~';
|
||||||
}
|
}
|
||||||
if (0x2000 <= c && c <= 0x206f) { // General Punctuation
|
return is_url_unicode_symbol(c);
|
||||||
return c == 0x200c || c == 0x200d; // Zero Width Non-Joiner/Joiner
|
|
||||||
}
|
|
||||||
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto &is_path_symbol = [](uint32 c) {
|
const auto &is_path_symbol = [](uint32 c) {
|
||||||
@ -545,10 +547,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
case 0xbb: // »
|
case 0xbb: // »
|
||||||
return false;
|
return false;
|
||||||
default:
|
default:
|
||||||
if (0x2000 <= c && c <= 0x206f) { // General Punctuation
|
return is_url_unicode_symbol(c);
|
||||||
return c == 0x200c || c == 0x200d; // Zero Width Non-Joiner/Joiner
|
|
||||||
}
|
|
||||||
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -582,6 +582,8 @@ TEST(MessageEntities, url) {
|
|||||||
check_url("👉http://ab.com/cdefgh-1IJ", {"http://ab.com/cdefgh-1IJ"});
|
check_url("👉http://ab.com/cdefgh-1IJ", {"http://ab.com/cdefgh-1IJ"});
|
||||||
check_url("...👉http://ab.com/cdefgh-1IJ", {}); // TODO
|
check_url("...👉http://ab.com/cdefgh-1IJ", {}); // TODO
|
||||||
check_url(".?", {});
|
check_url(".?", {});
|
||||||
|
check_url("http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――", {"http://test―‑@―google―.―com―/―–―‐―/―/―/―?―‑―#―――"});
|
||||||
|
check_url("http://google.com/‖", {"http://google.com/"});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,
|
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user