Replace some offending characters in all strings.
GitOrigin-RevId: a942e9a71db7b5bc3c1be6b15f9b21d767c3d803
This commit is contained in:
parent
178211e069
commit
f42a955d75
@ -3473,6 +3473,8 @@ static Result<string> clean_input_string_with_entities(const string &text, vecto
|
|||||||
<< entity->offset + entity->length);
|
<< entity->offset + entity->length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
replace_offending_characters(result);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,6 +52,20 @@ string clean_username(string str) {
|
|||||||
return trim(str);
|
return trim(str);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Rewrites runs of directional marks in place, keeping the byte length of `str` unchanged.
//
// The byte sequences "\xe2\x80\x8e" and "\xe2\x80\x8f" are the UTF-8 encodings of
// U+200E (LEFT-TO-RIGHT MARK) and U+200F (RIGHT-TO-LEFT MARK). Whenever several of
// them appear back to back, every mark except the last one of the run has its final
// byte overwritten with 0x8c, turning it into "\xe2\x80\x8c" (U+200C):
//   "(\xe2\x80\x8f|\xe2\x80\x8e){N}(\xe2\x80\x8f|\xe2\x80\x8e)" -> "(\xe2\x80\x8c){N}$2"
// A mark that is not directly followed by another mark is left untouched.
void replace_offending_characters(string &str) {
  auto bytes = MutableSlice(str).ubegin();

  // True when a 3-byte directional mark begins at `offset`. The comparisons are
  // ordered so that a later byte is inspected only after the earlier ones matched.
  // NOTE(review): the lookahead can read the byte at index str.size(); this
  // presumably relies on the buffer being followed by a '\0' terminator - confirm
  // that MutableSlice exposes the string's own storage.
  const auto starts_mark = [bytes](size_t offset) {
    if (bytes[offset] != 0xe2 || bytes[offset + 1] != 0x80) {
      return false;
    }
    return bytes[offset + 2] == 0x8e || bytes[offset + 2] == 0x8f;
  };

  size_t pos = 0;
  while (pos < str.size()) {
    if (!starts_mark(pos)) {
      pos++;
      continue;
    }

    // `pos` is at a mark; neutralize it for as long as another mark follows immediately.
    while (starts_mark(pos + 3)) {
      bytes[pos + 2] = static_cast<unsigned char>(0x8c);
      pos += 3;
    }

    // Step over the last mark of the run, which is kept as is.
    pos += 3;
  }
}
|
||||||
|
|
||||||
bool clean_input_string(string &str) {
|
bool clean_input_string(string &str) {
|
||||||
constexpr size_t LENGTH_LIMIT = 35000; // server side limit
|
constexpr size_t LENGTH_LIMIT = 35000; // server side limit
|
||||||
if (!check_utf8(str)) {
|
if (!check_utf8(str)) {
|
||||||
@ -133,6 +147,9 @@ bool clean_input_string(string &str) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
str.resize(new_size);
|
str.resize(new_size);
|
||||||
|
|
||||||
|
replace_offending_characters(str);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,6 +18,9 @@ string clean_name(string str, size_t max_length) TD_WARN_UNUSED_RESULT;
|
|||||||
// prepares username/stickername for search
|
// prepares username/stickername for search
|
||||||
string clean_username(string str) TD_WARN_UNUSED_RESULT;
|
string clean_username(string str) TD_WARN_UNUSED_RESULT;
|
||||||
|
|
||||||
|
// replaces some offending characters without changing string length:
// in every run of consecutive "\xe2\x80\x8e" / "\xe2\x80\x8f" sequences (UTF-8 for U+200E / U+200F),
// all but the last one are overwritten in place with "\xe2\x80\x8c" (UTF-8 for U+200C)
|
||||||
|
void replace_offending_characters(string &str);
|
||||||
|
|
||||||
// removes control characters from the string, will fail if input string is not in UTF-8
|
// removes control characters from the string, will fail if input string is not in UTF-8
|
||||||
bool clean_input_string(string &str) TD_WARN_UNUSED_RESULT;
|
bool clean_input_string(string &str) TD_WARN_UNUSED_RESULT;
|
||||||
|
|
||||||
|
@ -1006,6 +1006,14 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
check_fix_formatted_text(
|
||||||
|
"\xe2\x80\x8f\xe2\x80\x8f \xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8f\xe2\x80\x8e "
|
||||||
|
"\xe2\x80\x8f",
|
||||||
|
{},
|
||||||
|
"\xe2\x80\x8c\xe2\x80\x8f \xe2\x80\x8c\xe2\x80\x8c\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8c\xe2\x80\x8e "
|
||||||
|
"\xe2\x80\x8f",
|
||||||
|
{});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_parse_html(td::string text, const td::string &result, const td::vector<td::MessageEntity> &entities) {
|
static void check_parse_html(td::string text, const td::string &result, const td::vector<td::MessageEntity> &entities) {
|
||||||
|
@ -66,6 +66,12 @@ TEST(StringCleaning, clean_input_string) {
|
|||||||
check_clean_input_string(
|
check_clean_input_string(
|
||||||
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae\xe2\x80\xaf",
|
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae\xe2\x80\xaf",
|
||||||
"\xe2\x80\xa7\xe2\x80\xaf", true);
|
"\xe2\x80\xa7\xe2\x80\xaf", true);
|
||||||
|
check_clean_input_string(
|
||||||
|
"\xe2\x80\x8f\xe2\x80\x8f \xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8f\xe2\x80\x8e "
|
||||||
|
"\xe2\x80\x8f",
|
||||||
|
"\xe2\x80\x8c\xe2\x80\x8f \xe2\x80\x8c\xe2\x80\x8c\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8c\xe2\x80\x8e "
|
||||||
|
"\xe2\x80\x8f",
|
||||||
|
true);
|
||||||
check_clean_input_string("\xcc\xb3\xcc\xbf\xcc\x8a", "", true);
|
check_clean_input_string("\xcc\xb3\xcc\xbf\xcc\x8a", "", true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user