Strip more empty characters.
This commit is contained in:
parent
d6ba869457
commit
db4bd6b133
@ -169,6 +169,7 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
|
|||||||
CHECK(std::strlen(space_ch) == 3);
|
CHECK(std::strlen(space_ch) == 3);
|
||||||
can_be_first[static_cast<unsigned char>(space_ch[0])] = true;
|
can_be_first[static_cast<unsigned char>(space_ch[0])] = true;
|
||||||
}
|
}
|
||||||
|
can_be_first[0xF3] = true;
|
||||||
return true;
|
return true;
|
||||||
}();
|
}();
|
||||||
CHECK(can_be_first_inited);
|
CHECK(can_be_first_inited);
|
||||||
@ -181,20 +182,29 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
|
|||||||
size_t new_len = i;
|
size_t new_len = i;
|
||||||
while (i < str.size()) {
|
while (i < str.size()) {
|
||||||
if (can_be_first[static_cast<unsigned char>(str[i])] && i + 3 <= str.size()) {
|
if (can_be_first[static_cast<unsigned char>(str[i])] && i + 3 <= str.size()) {
|
||||||
bool found = false;
|
if (static_cast<unsigned char>(str[i]) == 0xF3) {
|
||||||
for (auto space_ch : space_characters) {
|
if (static_cast<unsigned char>(str[i + 1]) == 0xA0 && (static_cast<unsigned char>(str[i + 2]) & 0xFE) == 0x80 &&
|
||||||
if (space_ch[0] == str[i] && space_ch[1] == str[i + 1] && space_ch[2] == str[i + 2]) {
|
i + 4 <= str.size()) {
|
||||||
if (static_cast<unsigned char>(str[i + 2]) != 0xAE || static_cast<unsigned char>(str[i + 1]) != 0x80 ||
|
str[new_len++] = ' ';
|
||||||
static_cast<unsigned char>(str[i]) != 0xE2 || strip_rtlo) {
|
i += 4;
|
||||||
found = true;
|
continue;
|
||||||
}
|
}
|
||||||
break;
|
} else {
|
||||||
|
bool found = false;
|
||||||
|
for (auto space_ch : space_characters) {
|
||||||
|
if (space_ch[0] == str[i] && space_ch[1] == str[i + 1] && space_ch[2] == str[i + 2]) {
|
||||||
|
if (static_cast<unsigned char>(str[i + 2]) != 0xAE || static_cast<unsigned char>(str[i + 1]) != 0x80 ||
|
||||||
|
static_cast<unsigned char>(str[i]) != 0xE2 || strip_rtlo) {
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (found) {
|
||||||
|
str[new_len++] = ' ';
|
||||||
|
i += 3;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
if (found) {
|
|
||||||
str[new_len++] = ' ';
|
|
||||||
i += 3;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
str[new_len++] = str[i++];
|
str[new_len++] = str[i++];
|
||||||
|
@ -112,4 +112,8 @@ TEST(StringCleaning, strip_empty_characters) {
|
|||||||
check_strip_empty_characters(
|
check_strip_empty_characters(
|
||||||
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae", 3,
|
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xac\xe2\x80\xad\xe2\x80\xae", 3,
|
||||||
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9");
|
"\xe2\x80\xa7\xe2\x80\xa8\xe2\x80\xa9");
|
||||||
|
check_strip_empty_characters(
|
||||||
|
"\xF3\x9F\xBF\xBF\xF3\xA0\x80\x80\xF3\xA0\x80\x81\xF3\xA0\x80\xBF\xF3\xA0\x81\x80\xF3\xA0\x81\x81\xF3\xA0\x81\xBF"
|
||||||
|
"\xF3\xA0\x82\x80",
|
||||||
|
9, "\xF3\x9F\xBF\xBF \xF3\xA0\x82\x80");
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user