Keep only Slice overload of utf8_utf16_substr.

This commit is contained in:
levlam 2022-09-22 12:08:34 +03:00
parent 5c5d19e76f
commit faa738d6a9
5 changed files with 45 additions and 40 deletions

View File

@ -16889,7 +16889,7 @@ tl_object_ptr<td_api::userFullInfo> ContactsManager::get_user_full_info_object(U
return true;
}
if (entity.type == MessageEntity::Type::Url &&
!LinkManager::is_internal_link(utf8_utf16_substr(Slice(bio.text), entity.offset, entity.length))) {
!LinkManager::is_internal_link(utf8_utf16_substr(bio.text, entity.offset, entity.length))) {
return true;
}
return false;

View File

@ -1793,7 +1793,7 @@ string get_first_url(const FormattedText &text) {
if (entity.length <= 4) {
continue;
}
Slice url = utf8_utf16_substr(text.text, entity.offset, entity.length);
auto url = utf8_utf16_substr(text.text, entity.offset, entity.length);
string scheme = to_lower(url.substr(0, 4));
if (scheme == "ton:" || begins_with(scheme, "tg:") || scheme == "ftp:" || is_plain_domain(url)) {
continue;

View File

@ -119,4 +119,41 @@ string utf8_encode(CSlice data) {
return PSTRING() << "url_decode(" << url_encode(data) << ')';
}
// Returns the length of a UTF-8 string measured in UTF-16 code units.
size_t utf8_utf16_length(Slice str) {
  size_t code_units = 0;
  for (auto ch : str) {
    // every byte accepted by is_utf8_character_first_code_unit contributes one unit
    const auto starts_character = is_utf8_character_first_code_unit(ch);
    // bytes of the form 11110xxx contribute one extra unit
    // (4-byte symbol => surrogate pair in UTF-16)
    const auto is_four_byte_lead = (ch & 0xf8) == 0xf0;
    code_units += starts_character + is_four_byte_lead;
  }
  return code_units;
}
// Truncates a UTF-8 string to at most `length` UTF-16 code units.
//
// The string is always cut at a character boundary. A character whose first
// byte is >= 0xf0 (>= 4 bytes in symbol) is a surrogate pair in UTF-16 and
// costs two code units; every other character costs one. A character is kept
// only if it fits entirely into the remaining budget, so a surrogate pair is
// never split and the unsigned budget can never underflow.
//
// Returns the longest prefix of `str` that fits, or `str` itself if the whole
// string fits.
Slice utf8_utf16_truncate(Slice str, size_t length) {
  for (size_t i = 0; i < str.size(); i++) {
    auto c = static_cast<unsigned char>(str[i]);
    if (!is_utf8_character_first_code_unit(c)) {
      // not the first byte of a character; it belongs to the previous one
      continue;
    }
    const size_t needed = c >= 0xf0 ? 2 : 1;  // >= 4 bytes in symbol => surrogate pair
    if (length < needed) {
      // The character starting at i does not fit. Previously length == 1 with
      // a surrogate pair decremented twice and wrapped around to SIZE_MAX.
      return str.substr(0, i);
    }
    length -= needed;
  }
  return str;
}
// Returns the part of a UTF-8 string that remains after skipping `offset`
// UTF-16 code units from its beginning.
Slice utf8_utf16_substr(Slice str, size_t offset) {
  if (offset != 0) {
    // the byte size of the prefix covering `offset` code units is the start of the result
    const auto prefix_size = utf8_utf16_truncate(str, offset).size();
    return str.substr(prefix_size);
  }
  // nothing to skip
  return str;
}
// Returns the part of a UTF-8 string that starts `offset` UTF-16 code units
// from its beginning and is truncated to `length` UTF-16 code units.
Slice utf8_utf16_substr(Slice str, size_t offset, size_t length) {
  // first drop the leading part, then cut the remainder to the requested length
  const auto tail = utf8_utf16_substr(str, offset);
  return utf8_utf16_truncate(tail, length);
}
} // namespace td

View File

@ -29,13 +29,7 @@ inline size_t utf8_length(Slice str) {
}
/// returns the number of UTF-16 code units needed to represent the UTF-8 string
inline size_t utf8_utf16_length(Slice str) {
  size_t code_units = 0;
  for (auto ch : str) {
    // every byte accepted by is_utf8_character_first_code_unit contributes one unit
    const auto starts_character = is_utf8_character_first_code_unit(ch);
    // bytes of the form 11110xxx contribute one extra unit
    // (4-byte symbol => surrogate pair in UTF-16)
    const auto is_four_byte_lead = (ch & 0xf8) == 0xf0;
    code_units += starts_character + is_four_byte_lead;
  }
  return code_units;
}
size_t utf8_utf16_length(Slice str);
/// appends a Unicode character using UTF-8 encoding
void append_utf8_character(string &str, uint32 ch);
@ -69,23 +63,7 @@ T utf8_truncate(T str, size_t length) {
}
/// truncates UTF-8 string to the given length given in UTF-16 code units
///
/// The string is always cut at a character boundary. A character whose first
/// byte is >= 0xf0 (>= 4 bytes in symbol) is a surrogate pair in UTF-16 and
/// costs two code units; every other character costs one. A character is kept
/// only if it fits entirely into the remaining budget, so a surrogate pair is
/// never split and the unsigned budget can never underflow.
///
/// T must provide size(), operator[] and substr(pos, count).
template <class T>
T utf8_utf16_truncate(T str, size_t length) {
  for (size_t i = 0; i < str.size(); i++) {
    auto c = static_cast<unsigned char>(str[i]);
    if (!is_utf8_character_first_code_unit(c)) {
      // not the first byte of a character; it belongs to the previous one
      continue;
    }
    const size_t needed = c >= 0xf0 ? 2 : 1;  // >= 4 bytes in symbol => surrogate pair
    if (length < needed) {
      // The character starting at i does not fit. Previously length == 1 with
      // a surrogate pair decremented twice and wrapped around to SIZE_MAX.
      return str.substr(0, i);
    }
    length -= needed;
  }
  return str;
}
Slice utf8_utf16_truncate(Slice str, size_t length);
template <class T>
T utf8_substr(T str, size_t offset) {
@ -101,19 +79,9 @@ T utf8_substr(T str, size_t offset, size_t length) {
return utf8_truncate(utf8_substr(str, offset), length);
}
/// returns the part of UTF-8 string remaining after skipping the given offset
/// given in UTF-16 code units
template <class T>
T utf8_utf16_substr(T str, size_t offset) {
  if (offset != 0) {
    // the byte size of the prefix covering `offset` code units is the start of the result
    const auto prefix_size = utf8_utf16_truncate(str, offset).size();
    return str.substr(prefix_size);
  }
  // nothing to skip
  return str;
}
Slice utf8_utf16_substr(Slice str, size_t offset);
/// returns the part of UTF-8 string that starts at the given offset and is
/// truncated to the given length, both given in UTF-16 code units
template <class T>
T utf8_utf16_substr(T str, size_t offset, size_t length) {
  // skip the first `offset` code units, then truncate the remainder to `length` code units
  return utf8_utf16_truncate(utf8_utf16_substr(str, offset), length);
}
Slice utf8_utf16_substr(Slice str, size_t offset, size_t length);
/// Returns UTF-8 string converted to lower case.
string utf8_to_lower(Slice str);

View File

@ -959,8 +959,8 @@ TEST(MessageEntities, fix_formatted_text) {
}
}
check_fix_formatted_text(str, entities, td::utf8_utf16_substr(str, 3, 11), fixed_entities, false, false, false,
false);
check_fix_formatted_text(str, entities, td::utf8_utf16_substr(str, 3, 11).str(), fixed_entities, false, false,
false, false);
}
for (td::string text : {"\t", "\r", "\n", "\t ", "\r ", "\n "}) {