Replace some offending characters in all strings.
GitOrigin-RevId: a942e9a71db7b5bc3c1be6b15f9b21d767c3d803
This commit is contained in:
parent
178211e069
commit
f42a955d75
@ -3473,6 +3473,8 @@ static Result<string> clean_input_string_with_entities(const string &text, vecto
|
||||
<< entity->offset + entity->length);
|
||||
}
|
||||
|
||||
replace_offending_characters(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -52,6 +52,20 @@ string clean_username(string str) {
|
||||
return trim(str);
|
||||
}
|
||||
|
||||
// Rewrites, in place, every U+200E/U+200F mark (UTF-8: E2 80 8E / E2 80 8F) that is directly
// followed by another such mark into U+200C (E2 80 8C), so that only the last mark of each
// consecutive run is kept. Only the third byte of a sequence is ever overwritten, therefore
// the length of str does not change.
// Regex form: "(\xe2\x80\x8f|\xe2\x80\x8e){N}(\xe2\x80\x8f|\xe2\x80\x8e)" -> "(\xe2\x80\x8c){N}$2"
void replace_offending_characters(string &str) {
  // mutable byte view of str, obtained through MutableSlice
  auto bytes = MutableSlice(str).ubegin();
  const size_t size = str.size();

  // true if the 3 bytes starting at offset i lie inside str and spell E2 80 8E or E2 80 8F
  auto is_mark_at = [bytes, size](size_t i) {
    return i + 2 < size && bytes[i] == 0xe2 && bytes[i + 1] == 0x80 &&
           (bytes[i + 2] == 0x8e || bytes[i + 2] == 0x8f);
  };

  size_t pos = 0;
  while (pos < size) {
    if (!is_mark_at(pos)) {
      pos++;
      continue;
    }

    // pos is at a mark; while another mark follows immediately, turn the current one into U+200C
    while (is_mark_at(pos + 3)) {
      bytes[pos + 2] = static_cast<unsigned char>(0x8c);
      pos += 3;
    }

    // step over the last mark of the run, which is left untouched
    pos += 3;
  }
}
|
||||
|
||||
bool clean_input_string(string &str) {
|
||||
constexpr size_t LENGTH_LIMIT = 35000; // server side limit
|
||||
if (!check_utf8(str)) {
|
||||
@ -133,6 +147,9 @@ bool clean_input_string(string &str) {
|
||||
}
|
||||
|
||||
str.resize(new_size);
|
||||
|
||||
replace_offending_characters(str);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,9 @@ string clean_name(string str, size_t max_length) TD_WARN_UNUSED_RESULT;
|
||||
// prepares username/stickername for search
|
||||
string clean_username(string str) TD_WARN_UNUSED_RESULT;
|
||||
|
||||
// replaces every U+200E/U+200F mark that is immediately followed by another such mark with U+200C;
// the string length is not changed
|
||||
void replace_offending_characters(string &str);
|
||||
|
||||
// removes control characters from the string, will fail if input string is not in UTF-8
|
||||
bool clean_input_string(string &str) TD_WARN_UNUSED_RESULT;
|
||||
|
||||
|
@ -1006,6 +1006,14 @@ TEST(MessageEntities, fix_formatted_text) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
check_fix_formatted_text(
|
||||
"\xe2\x80\x8f\xe2\x80\x8f \xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8f\xe2\x80\x8e "
|
||||
"\xe2\x80\x8f",
|
||||
{},
|
||||
"\xe2\x80\x8c\xe2\x80\x8f \xe2\x80\x8c\xe2\x80\x8c\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8c\xe2\x80\x8e "
|
||||
"\xe2\x80\x8f",
|
||||
{});
|
||||
}
|
||||
|
||||
static void check_parse_html(td::string text, const td::string &result, const td::vector<td::MessageEntity> &entities) {
|
||||
|
@ -66,6 +66,12 @@ TEST(StringCleaning, clean_input_string) {
|
||||
check_clean_input_string(
|
||||
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae\xe2\x80\xaf",
|
||||
"\xe2\x80\xa7\xe2\x80\xaf", true);
|
||||
check_clean_input_string(
|
||||
"\xe2\x80\x8f\xe2\x80\x8f \xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8f\xe2\x80\x8e "
|
||||
"\xe2\x80\x8f",
|
||||
"\xe2\x80\x8c\xe2\x80\x8f \xe2\x80\x8c\xe2\x80\x8c\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8c\xe2\x80\x8e "
|
||||
"\xe2\x80\x8f",
|
||||
true);
|
||||
check_clean_input_string("\xcc\xb3\xcc\xbf\xcc\x8a", "", true);
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user