Better strip_empty_characters.

GitOrigin-RevId: 35863d02683e75da361712647d643866ae4800cf
This commit is contained in:
levlam 2018-03-21 17:54:39 +03:00
parent cfe4d9bdce
commit a92860a046
2 changed files with 13 additions and 9 deletions

View File

@@ -138,14 +138,14 @@ vector<tl_object_ptr<td_api::textEntity>> get_text_entities_object(const vector<
   return result;
 }
 
-static bool is_word_character(uint32 a) {
-  switch (get_unicode_simple_category(a)) {
+static bool is_word_character(uint32 code) {
+  switch (get_unicode_simple_category(code)) {
     case UnicodeSimpleCategory::Letter:
     case UnicodeSimpleCategory::DecimalNumber:
     case UnicodeSimpleCategory::Number:
       return true;
     default:
-      return a == '_';
+      return code == '_';
   }
 }
 
@@ -159,16 +159,16 @@ static bool is_word_boundary(uint32 a, uint32 b) {
 }
 */
 
-static bool is_alpha_digit(uint32 a) {
-  return ('0' <= a && a <= '9') || ('a' <= a && a <= 'z') || ('A' <= a && a <= 'Z');
+static bool is_alpha_digit(uint32 code) {
+  return ('0' <= code && code <= '9') || ('a' <= code && code <= 'z') || ('A' <= code && code <= 'Z');
 }
 
-static bool is_alpha_digit_or_underscore(uint32 a) {
-  return is_alpha_digit(a) || a == '_';
+static bool is_alpha_digit_or_underscore(uint32 code) {
+  return is_alpha_digit(code) || code == '_';
 }
 
-static bool is_alpha_digit_or_underscore_or_minus(uint32 a) {
-  return is_alpha_digit_or_underscore(a) || a == '-';
+static bool is_alpha_digit_or_underscore_or_minus(uint32 code) {
+  return is_alpha_digit_or_underscore(code) || code == '-';
 }
 
 // This functions just implements corresponding regexps

View File

@@ -186,6 +186,10 @@ string strip_empty_characters(string str, size_t max_length) {
       return string();
     }
 
+    if (trimmed[i] == ' ' || trimmed[i] == '\n') {
+      i++;
+      continue;
+    }
     if (static_cast<unsigned char>(trimmed[i]) == 0xE2 && static_cast<unsigned char>(trimmed[i + 1]) == 0x80 &&
         (static_cast<unsigned char>(trimmed[i + 2]) == 0x8C || static_cast<unsigned char>(trimmed[i + 2]) == 0x8D ||
          static_cast<unsigned char>(trimmed[i + 2]) == 0xAE)) {