Replace some offending characters in all strings.

GitOrigin-RevId: a942e9a71db7b5bc3c1be6b15f9b21d767c3d803
This commit is contained in:
levlam 2020-04-08 01:23:05 +03:00
parent 178211e069
commit f42a955d75
5 changed files with 36 additions and 0 deletions

View File

@ -3473,6 +3473,8 @@ static Result<string> clean_input_string_with_entities(const string &text, vecto
<< entity->offset + entity->length);
}
replace_offending_characters(result);
return result;
}

View File

@ -52,6 +52,20 @@ string clean_username(string str) {
return trim(str);
}
// Rewrites runs of consecutive directional marks in place:
// "(\xe2\x80\x8f|\xe2\x80\x8e){N}(\xe2\x80\x8f|\xe2\x80\x8e)" -> "(\xe2\x80\x8c){N}$2"
// i.e. every U+200E/U+200F that is immediately followed by another U+200E/U+200F becomes U+200C,
// and the last mark of each run is left untouched. Only the third byte of a sequence is overwritten,
// so the byte length of the string never changes.
void replace_offending_characters(string &str) {
  auto bytes = MutableSlice(str).ubegin();

  // Tells whether the 3-byte UTF-8 sequence of U+200E or U+200F starts at the given offset.
  // The comparisons short-circuit, so a byte is read only after the previous one has matched;
  // NOTE(review): near the end of the string this relies on the byte right after the last
  // character being readable and not matching (presumably the std::string terminator).
  auto is_direction_mark = [bytes](size_t offset) {
    if (bytes[offset] != 0xe2 || bytes[offset + 1] != 0x80) {
      return false;
    }
    return bytes[offset + 2] == 0x8e || bytes[offset + 2] == 0x8f;
  };

  size_t i = 0;
  while (i < str.size()) {
    if (!is_direction_mark(i)) {
      i++;
      continue;
    }

    // while the current mark is followed by one more mark, turn the current one into U+200C
    while (is_direction_mark(i + 3)) {
      bytes[i + 2] = static_cast<unsigned char>(0x8c);
      i += 3;
    }

    // skip the last mark of the run, which is kept as is
    i += 3;
  }
}
bool clean_input_string(string &str) {
constexpr size_t LENGTH_LIMIT = 35000; // server side limit
if (!check_utf8(str)) {
@ -133,6 +147,9 @@ bool clean_input_string(string &str) {
}
str.resize(new_size);
replace_offending_characters(str);
return true;
}

View File

@ -18,6 +18,9 @@ string clean_name(string str, size_t max_length) TD_WARN_UNUSED_RESULT;
// prepares username/stickername for search
string clean_username(string str) TD_WARN_UNUSED_RESULT;
// replaces some offending characters without changing string length:
// in every run of consecutive U+200E/U+200F marks, all marks except the last one are replaced with U+200C
void replace_offending_characters(string &str);
// removes control characters from the string, will fail if input string is not in UTF-8
bool clean_input_string(string &str) TD_WARN_UNUSED_RESULT;

View File

@ -1006,6 +1006,14 @@ TEST(MessageEntities, fix_formatted_text) {
}
}
}
check_fix_formatted_text(
"\xe2\x80\x8f\xe2\x80\x8f \xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8f\xe2\x80\x8e "
"\xe2\x80\x8f",
{},
"\xe2\x80\x8c\xe2\x80\x8f \xe2\x80\x8c\xe2\x80\x8c\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8c\xe2\x80\x8e "
"\xe2\x80\x8f",
{});
}
static void check_parse_html(td::string text, const td::string &result, const td::vector<td::MessageEntity> &entities) {

View File

@ -66,6 +66,12 @@ TEST(StringCleaning, clean_input_string) {
check_clean_input_string(
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae\xe2\x80\xaf",
"\xe2\x80\xa7\xe2\x80\xaf", true);
check_clean_input_string(
"\xe2\x80\x8f\xe2\x80\x8f \xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8f\xe2\x80\x8e "
"\xe2\x80\x8f",
"\xe2\x80\x8c\xe2\x80\x8f \xe2\x80\x8c\xe2\x80\x8c\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8c\xe2\x80\x8e "
"\xe2\x80\x8f",
true);
check_clean_input_string("\xcc\xb3\xcc\xbf\xcc\x8a", "", true);
}