Update strip_empty_characters.
GitOrigin-RevId: 480beb7b352b28f59f65a63fd1d4550d8e16803f
This commit is contained in:
parent
c376c1ac08
commit
4d9b8cf016
@ -156,8 +156,8 @@ bool clean_input_string(string &str) {
|
|||||||
string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
|
string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
|
||||||
static const char *space_characters[] = {u8"\u1680", u8"\u180E", u8"\u2000", u8"\u2001", u8"\u2002",
|
static const char *space_characters[] = {u8"\u1680", u8"\u180E", u8"\u2000", u8"\u2001", u8"\u2002",
|
||||||
u8"\u2003", u8"\u2004", u8"\u2005", u8"\u2006", u8"\u2007",
|
u8"\u2003", u8"\u2004", u8"\u2005", u8"\u2006", u8"\u2007",
|
||||||
u8"\u2008", u8"\u2009", u8"\u200A", u8"\u200B", u8"\u202E",
|
u8"\u2008", u8"\u2009", u8"\u200A", u8"\u202E", u8"\u202F",
|
||||||
u8"\u202F", u8"\u205F", u8"\u3000", u8"\uFEFF", u8"\uFFFC"};
|
u8"\u205F", u8"\u2800", u8"\u3000", u8"\uFFFC"};
|
||||||
static bool can_be_first[std::numeric_limits<unsigned char>::max() + 1];
|
static bool can_be_first[std::numeric_limits<unsigned char>::max() + 1];
|
||||||
static bool can_be_first_inited = [&] {
|
static bool can_be_first_inited = [&] {
|
||||||
for (auto space_ch : space_characters) {
|
for (auto space_ch : space_characters) {
|
||||||
@ -197,9 +197,13 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
|
|||||||
Slice trimmed = trim(utf8_truncate(trim(Slice(str.c_str(), new_len)), max_length));
|
Slice trimmed = trim(utf8_truncate(trim(Slice(str.c_str(), new_len)), max_length));
|
||||||
|
|
||||||
// check if there is some non-empty character, empty characters:
|
// check if there is some non-empty character, empty characters:
|
||||||
|
// "\xE2\x80\x8B", ZERO WIDTH SPACE
|
||||||
// "\xE2\x80\x8C", ZERO WIDTH NON-JOINER
|
// "\xE2\x80\x8C", ZERO WIDTH NON-JOINER
|
||||||
// "\xE2\x80\x8D", ZERO WIDTH JOINER
|
// "\xE2\x80\x8D", ZERO WIDTH JOINER
|
||||||
|
// "\xE2\x80\x8E", LEFT-TO-RIGHT MARK
|
||||||
|
// "\xE2\x80\x8F", RIGHT-TO-LEFT MARK
|
||||||
// "\xE2\x80\xAE", RIGHT-TO-LEFT OVERRIDE
|
// "\xE2\x80\xAE", RIGHT-TO-LEFT OVERRIDE
|
||||||
|
// "\xEF\xBB\xBF", ZERO WIDTH NO-BREAK SPACE aka BYTE ORDER MARK
|
||||||
// "\xC2\xA0", NO-BREAK SPACE
|
// "\xC2\xA0", NO-BREAK SPACE
|
||||||
for (i = 0;;) {
|
for (i = 0;;) {
|
||||||
if (i == trimmed.size()) {
|
if (i == trimmed.size()) {
|
||||||
@ -211,9 +215,15 @@ string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
|
|||||||
i++;
|
i++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (static_cast<unsigned char>(trimmed[i]) == 0xE2 && static_cast<unsigned char>(trimmed[i + 1]) == 0x80 &&
|
if (static_cast<unsigned char>(trimmed[i]) == 0xE2 && static_cast<unsigned char>(trimmed[i + 1]) == 0x80) {
|
||||||
(static_cast<unsigned char>(trimmed[i + 2]) == 0x8C || static_cast<unsigned char>(trimmed[i + 2]) == 0x8D ||
|
auto next = static_cast<unsigned char>(trimmed[i + 2]);
|
||||||
static_cast<unsigned char>(trimmed[i + 2]) == 0xAE)) {
|
if ((0x8B <= next && next <= 0x8F) || next == 0xAE) {
|
||||||
|
i += 3;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (static_cast<unsigned char>(trimmed[i]) == 0xEF && static_cast<unsigned char>(trimmed[i + 1]) == 0xBB &&
|
||||||
|
static_cast<unsigned char>(trimmed[i + 2]) == 0xBF) {
|
||||||
i += 3;
|
i += 3;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -85,11 +85,11 @@ TEST(StringCleaning, strip_empty_characters) {
|
|||||||
check_strip_empty_characters("/abc", 0, "");
|
check_strip_empty_characters("/abc", 0, "");
|
||||||
check_strip_empty_characters("/abc", 10000000, "/abc");
|
check_strip_empty_characters("/abc", 10000000, "/abc");
|
||||||
string spaces =
|
string spaces =
|
||||||
u8"\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF"
|
u8"\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F\u205F\u2800\u3000\uFFFC"
|
||||||
u8"\uFFFC\uFFFC";
|
u8"\uFFFC";
|
||||||
string spaces_replace = " ";
|
string spaces_replace = " ";
|
||||||
string rtlo = u8"\u202E";
|
string rtlo = u8"\u202E";
|
||||||
string empty = "\xE2\x80\x8C\xE2\x80\x8D\xE2\x80\xAE\xC2\xA0\xC2\xA0";
|
string empty = "\xE2\x80\x8B\xE2\x80\x8C\xE2\x80\x8D\xE2\x80\x8E\xE2\x80\x8F\xE2\x80\xAE\xC2\xA0\xC2\xA0";
|
||||||
|
|
||||||
check_strip_empty_characters(spaces, 1000000, "");
|
check_strip_empty_characters(spaces, 1000000, "");
|
||||||
check_strip_empty_characters(spaces + rtlo, 1000000, "");
|
check_strip_empty_characters(spaces + rtlo, 1000000, "");
|
||||||
|
Reference in New Issue
Block a user