diff --git a/td/telegram/misc.cpp b/td/telegram/misc.cpp index d2a0abd0..f99e222d 100644 --- a/td/telegram/misc.cpp +++ b/td/telegram/misc.cpp @@ -156,8 +156,8 @@ bool clean_input_string(string &str) { string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) { static const char *space_characters[] = {u8"\u1680", u8"\u180E", u8"\u2000", u8"\u2001", u8"\u2002", u8"\u2003", u8"\u2004", u8"\u2005", u8"\u2006", u8"\u2007", - u8"\u2008", u8"\u2009", u8"\u200A", u8"\u200B", u8"\u202E", - u8"\u202F", u8"\u205F", u8"\u3000", u8"\uFEFF", u8"\uFFFC"}; + u8"\u2008", u8"\u2009", u8"\u200A", u8"\u202E", u8"\u202F", + u8"\u205F", u8"\u2800", u8"\u3000", u8"\uFFFC"}; static bool can_be_first[std::numeric_limits::max() + 1]; static bool can_be_first_inited = [&] { for (auto space_ch : space_characters) { @@ -197,9 +197,13 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) { Slice trimmed = trim(utf8_truncate(trim(Slice(str.c_str(), new_len)), max_length)); // check if there is some non-empty character, empty characters: + // "\xE2\x80\x8B", ZERO WIDTH SPACE // "\xE2\x80\x8C", ZERO WIDTH NON-JOINER // "\xE2\x80\x8D", ZERO WIDTH JOINER + // "\xE2\x80\x8E", LEFT-TO-RIGHT MARK + // "\xE2\x80\x8F", RIGHT-TO-LEFT MARK // "\xE2\x80\xAE", RIGHT-TO-LEFT OVERRIDE + // "\xEF\xBB\xBF", ZERO WIDTH NO-BREAK SPACE aka BYTE ORDER MARK // "\xC2\xA0", NO-BREAK SPACE for (i = 0;;) { if (i == trimmed.size()) { @@ -211,9 +215,15 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) { i++; continue; } - if (static_cast(trimmed[i]) == 0xE2 && static_cast(trimmed[i + 1]) == 0x80 && - (static_cast(trimmed[i + 2]) == 0x8C || static_cast(trimmed[i + 2]) == 0x8D || - static_cast(trimmed[i + 2]) == 0xAE)) { + if (static_cast(trimmed[i]) == 0xE2 && static_cast(trimmed[i + 1]) == 0x80) { + auto next = static_cast(trimmed[i + 2]); + if ((0x8B <= next && next <= 0x8F) || next == 0xAE) { + i += 3; + continue; + } + } + if (static_cast(trimmed[i]) == 0xEF && static_cast(trimmed[i + 1]) == 0xBB && + static_cast(trimmed[i + 2]) == 0xBF) { i += 3; continue; } diff --git a/test/string_cleaning.cpp b/test/string_cleaning.cpp index 4ac98fc2..71535b0f 100644 --- a/test/string_cleaning.cpp +++ b/test/string_cleaning.cpp @@ -85,11 +85,11 @@ TEST(StringCleaning, strip_empty_characters) { check_strip_empty_characters("/abc", 0, ""); check_strip_empty_characters("/abc", 10000000, "/abc"); string spaces = - u8"\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF" - u8"\uFFFC\uFFFC"; - string spaces_replace = " "; + u8"\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F\u205F\u2800\u3000\uFFFC" + u8"\uFFFC"; + string spaces_replace = " "; string rtlo = u8"\u202E"; - string empty = "\xE2\x80\x8C\xE2\x80\x8D\xE2\x80\xAE\xC2\xA0\xC2\xA0"; + string empty = "\xE2\x80\x8B\xE2\x80\x8C\xE2\x80\x8D\xE2\x80\x8E\xE2\x80\x8F\xE2\x80\xAE\xC2\xA0\xC2\xA0"; check_strip_empty_characters(spaces, 1000000, ""); check_strip_empty_characters(spaces + rtlo, 1000000, "");