Strip more empty characters.

This commit is contained in:
levlam 2024-02-01 21:16:01 +03:00
parent d6ba869457
commit db4bd6b133
2 changed files with 27 additions and 13 deletions

View File

@@ -169,6 +169,7 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
CHECK(std::strlen(space_ch) == 3); CHECK(std::strlen(space_ch) == 3);
can_be_first[static_cast<unsigned char>(space_ch[0])] = true; can_be_first[static_cast<unsigned char>(space_ch[0])] = true;
} }
can_be_first[0xF3] = true;
return true; return true;
}(); }();
CHECK(can_be_first_inited); CHECK(can_be_first_inited);
@@ -181,6 +182,14 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
size_t new_len = i; size_t new_len = i;
while (i < str.size()) { while (i < str.size()) {
if (can_be_first[static_cast<unsigned char>(str[i])] && i + 3 <= str.size()) { if (can_be_first[static_cast<unsigned char>(str[i])] && i + 3 <= str.size()) {
if (static_cast<unsigned char>(str[i]) == 0xF3) {
if (static_cast<unsigned char>(str[i + 1]) == 0xA0 && (static_cast<unsigned char>(str[i + 2]) & 0xFE) == 0x80 &&
i + 4 <= str.size()) {
str[new_len++] = ' ';
i += 4;
continue;
}
} else {
bool found = false; bool found = false;
for (auto space_ch : space_characters) { for (auto space_ch : space_characters) {
if (space_ch[0] == str[i] && space_ch[1] == str[i + 1] && space_ch[2] == str[i + 2]) { if (space_ch[0] == str[i] && space_ch[1] == str[i + 1] && space_ch[2] == str[i + 2]) {
@@ -197,6 +206,7 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
continue; continue;
} }
} }
}
str[new_len++] = str[i++]; str[new_len++] = str[i++];
} }
Slice trimmed = trim(utf8_truncate(trim(Slice(str.c_str(), new_len)), max_length)); Slice trimmed = trim(utf8_truncate(trim(Slice(str.c_str(), new_len)), max_length));

View File

@@ -112,4 +112,8 @@ TEST(StringCleaning, strip_empty_characters) {
check_strip_empty_characters( check_strip_empty_characters(
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae", 3, "\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae", 3,
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9"); "\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9");
check_strip_empty_characters(
"\xF3\x9F\xBF\xBF\xF3\xA0\x80\x80\xF3\xA0\x80\x81\xF3\xA0\x80\xBF\xF3\xA0\x81\x80\xF3\xA0\x81\x81\xF3\xA0\x81\xBF"
"\xF3\xA0\x82\x80",
9, "\xF3\x9F\xBF\xBF \xF3\xA0\x82\x80");
} }