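Strip more empty characters: also replace the four-byte UTF-8 sequences F3 A0 80 xx and F3 A0 81 xx (code points U+E0000–U+E007F) with a space.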
This commit is contained in:
parent
d6ba869457
commit
db4bd6b133
@ -169,6 +169,7 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
|
||||
CHECK(std::strlen(space_ch) == 3);
|
||||
can_be_first[static_cast<unsigned char>(space_ch[0])] = true;
|
||||
}
|
||||
can_be_first[0xF3] = true;
|
||||
return true;
|
||||
}();
|
||||
CHECK(can_be_first_inited);
|
||||
@ -181,20 +182,29 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
|
||||
size_t new_len = i;
|
||||
while (i < str.size()) {
|
||||
if (can_be_first[static_cast<unsigned char>(str[i])] && i + 3 <= str.size()) {
|
||||
bool found = false;
|
||||
for (auto space_ch : space_characters) {
|
||||
if (space_ch[0] == str[i] && space_ch[1] == str[i + 1] && space_ch[2] == str[i + 2]) {
|
||||
if (static_cast<unsigned char>(str[i + 2]) != 0xAE || static_cast<unsigned char>(str[i + 1]) != 0x80 ||
|
||||
static_cast<unsigned char>(str[i]) != 0xE2 || strip_rtlo) {
|
||||
found = true;
|
||||
}
|
||||
break;
|
||||
if (static_cast<unsigned char>(str[i]) == 0xF3) {
|
||||
if (static_cast<unsigned char>(str[i + 1]) == 0xA0 && (static_cast<unsigned char>(str[i + 2]) & 0xFE) == 0x80 &&
|
||||
i + 4 <= str.size()) {
|
||||
str[new_len++] = ' ';
|
||||
i += 4;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
bool found = false;
|
||||
for (auto space_ch : space_characters) {
|
||||
if (space_ch[0] == str[i] && space_ch[1] == str[i + 1] && space_ch[2] == str[i + 2]) {
|
||||
if (static_cast<unsigned char>(str[i + 2]) != 0xAE || static_cast<unsigned char>(str[i + 1]) != 0x80 ||
|
||||
static_cast<unsigned char>(str[i]) != 0xE2 || strip_rtlo) {
|
||||
found = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found) {
|
||||
str[new_len++] = ' ';
|
||||
i += 3;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (found) {
|
||||
str[new_len++] = ' ';
|
||||
i += 3;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
str[new_len++] = str[i++];
|
||||
|
@ -112,4 +112,8 @@ TEST(StringCleaning, strip_empty_characters) {
|
||||
check_strip_empty_characters(
|
||||
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae", 3,
|
||||
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9");
|
||||
check_strip_empty_characters(
|
||||
"\xF3\x9F\xBF\xBF\xF3\xA0\x80\x80\xF3\xA0\x80\x81\xF3\xA0\x80\xBF\xF3\xA0\x81\x80\xF3\xA0\x81\x81\xF3\xA0\x81\xBF"
|
||||
"\xF3\xA0\x82\x80",
|
||||
9, "\xF3\x9F\xBF\xBF \xF3\xA0\x82\x80");
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user