diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index 1576623f..4524c8ce 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -3473,6 +3473,8 @@ static Result clean_input_string_with_entities(const string &text, vecto << entity->offset + entity->length); } + replace_offending_characters(result); + return result; } diff --git a/td/telegram/misc.cpp b/td/telegram/misc.cpp index 9405df4a..e6d9e944 100644 --- a/td/telegram/misc.cpp +++ b/td/telegram/misc.cpp @@ -52,6 +52,20 @@ string clean_username(string str) { return trim(str); } +void replace_offending_characters(string &str) { + // "(\xe2\x80\x8f|\xe2\x80\x8e){N}(\xe2\x80\x8f|\xe2\x80\x8e)" -> "(\xe2\x80\x8c){N}$2" + auto s = MutableSlice(str).ubegin(); + for (size_t pos = 0; pos < str.size(); pos++) { + if (s[pos] == 0xe2 && s[pos + 1] == 0x80 && (s[pos + 2] == 0x8e || s[pos + 2] == 0x8f)) { + while (s[pos + 3] == 0xe2 && s[pos + 4] == 0x80 && (s[pos + 5] == 0x8e || s[pos + 5] == 0x8f)) { + s[pos + 2] = static_cast(0x8c); + pos += 3; + } + pos += 2; + } + } +} + bool clean_input_string(string &str) { constexpr size_t LENGTH_LIMIT = 35000; // server side limit if (!check_utf8(str)) { @@ -133,6 +147,9 @@ bool clean_input_string(string &str) { } str.resize(new_size); + + replace_offending_characters(str); + return true; } diff --git a/td/telegram/misc.h b/td/telegram/misc.h index 2f033150..0b62e315 100644 --- a/td/telegram/misc.h +++ b/td/telegram/misc.h @@ -18,6 +18,9 @@ string clean_name(string str, size_t max_length) TD_WARN_UNUSED_RESULT; // prepares username/stickername for search string clean_username(string str) TD_WARN_UNUSED_RESULT; +// replaces some offending characters without changing string length +void replace_offending_characters(string &str); + // removes control characters from the string, will fail if input string is not in UTF-8 bool clean_input_string(string &str) TD_WARN_UNUSED_RESULT; diff --git a/test/message_entities.cpp b/test/message_entities.cpp index 9e1b5ab7..ae4b7a31 100644 --- a/test/message_entities.cpp +++ b/test/message_entities.cpp @@ -1006,6 +1006,14 @@ TEST(MessageEntities, fix_formatted_text) { } } } + + check_fix_formatted_text( + "\xe2\x80\x8f\xe2\x80\x8f \xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8f\xe2\x80\x8e " + "\xe2\x80\x8f", + {}, + "\xe2\x80\x8c\xe2\x80\x8f \xe2\x80\x8c\xe2\x80\x8c\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8c\xe2\x80\x8e " + "\xe2\x80\x8f", + {}); } static void check_parse_html(td::string text, const td::string &result, const td::vector &entities) { diff --git a/test/string_cleaning.cpp b/test/string_cleaning.cpp index 890c1a5b..4ac98fc2 100644 --- a/test/string_cleaning.cpp +++ b/test/string_cleaning.cpp @@ -66,6 +66,12 @@ TEST(StringCleaning, clean_input_string) { check_clean_input_string( "\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae\xe2\x80\xaf", "\xe2\x80\xa7\xe2\x80\xaf", true); + check_clean_input_string( + "\xe2\x80\x8f\xe2\x80\x8f \xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8f\xe2\x80\x8e " + "\xe2\x80\x8f", + "\xe2\x80\x8c\xe2\x80\x8f \xe2\x80\x8c\xe2\x80\x8c\xe2\x80\x8e\xe2\x80\x8c \xe2\x80\x8c\xe2\x80\x8e " + "\xe2\x80\x8f", + true); check_clean_input_string("\xcc\xb3\xcc\xbf\xcc\x8a", "", true); }