From 137fd3beb981633f7fad1747baad19017f91e36e Mon Sep 17 00:00:00 2001 From: levlam Date: Mon, 9 Jan 2023 15:12:14 +0300 Subject: [PATCH] Replace text in-place in parse_html. --- td/telegram/MessageEntity.cpp | 18 +++++++++--------- tdutils/td/utils/utf8.cpp | 19 +++++++++++++++++++ tdutils/td/utils/utf8.h | 31 +++++++++++++++++-------------- 3 files changed, 45 insertions(+), 23 deletions(-) diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index 8b3b31e1f..0642f106a 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -2991,14 +2991,13 @@ static uint32 decode_html_entity(CSlice text, size_t &pos) { Result> parse_html(string &str) { auto str_size = str.size(); const char *text = str.c_str(); + auto result_end = MutableSlice(str).ubegin(); + const unsigned char *result_begin = result_end; vector entities; int32 utf16_offset = 0; bool need_recheck_utf8 = false; - auto buf = StackAllocator::alloc(str_size + 30); - StringBuilder new_text(buf.as_slice(), true); - struct EntityInfo { string tag_name; string argument; @@ -3025,7 +3024,8 @@ Result> parse_html(string &str) { // half of a surrogate pair need_recheck_utf8 = true; } - append_utf8_character(new_text, ch); + result_end = append_utf8_character_unsafe(result_end, ch); + CHECK(result_end <= result_begin + i); continue; } } @@ -3033,7 +3033,7 @@ Result> parse_html(string &str) { if (is_utf8_character_first_code_unit(c)) { utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair } - new_text.push_back(text[i]); + *result_end++ = c; continue; } @@ -3139,7 +3139,7 @@ Result> parse_html(string &str) { << "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos); } - nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, new_text.size()); + nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result_end - result_begin); } else { // end of an entity if (nested_entities.empty()) { @@ -3186,7 +3186,7 @@ Result> parse_html(string &str) { } else if (tag_name == "a") { auto url = std::move(nested_entities.back().argument); if (url.empty()) { - url = new_text.as_cslice().substr(nested_entities.back().entity_begin_pos).str(); + url = Slice(result_begin + nested_entities.back().entity_begin_pos, result_end).str(); } auto user_id = LinkManager::get_link_user_id(url); if (user_id.is_valid()) { @@ -3235,12 +3235,12 @@ Result> parse_html(string &str) { sort_entities(entities); - if (need_recheck_utf8 && !check_utf8(new_text.as_cslice())) { + str.resize(static_cast(result_end - result_begin)); + if (need_recheck_utf8 && !check_utf8(str)) { return Status::Error(400, "Text contains invalid Unicode characters after decoding HTML entities, check for unmatched " "surrogate code units"); } - str = new_text.as_cslice().str(); return std::move(entities); } diff --git a/tdutils/td/utils/utf8.cpp b/tdutils/td/utils/utf8.cpp index c280baba4..a2c9256f6 100644 --- a/tdutils/td/utils/utf8.cpp +++ b/tdutils/td/utils/utf8.cpp @@ -82,6 +82,25 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) { return ptr; } +unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code) { + if (code <= 0x7f) { + *ptr++ = static_cast(code); + } else if (code <= 0x7ff) { + *ptr++ = static_cast(0xc0 | (code >> 6)); + *ptr++ = static_cast(0x80 | (code & 0x3f)); + } else if (code <= 0xffff) { + *ptr++ = static_cast(0xe0 | (code >> 12)); + *ptr++ = static_cast(0x80 | ((code >> 6) & 0x3f)); + *ptr++ = static_cast(0x80 | (code & 0x3f)); + } else { + *ptr++ = static_cast(0xf0 | (code >> 18)); + *ptr++ = static_cast(0x80 | ((code >> 12) & 0x3f)); + *ptr++ = static_cast(0x80 | ((code >> 6) & 0x3f)); + *ptr++ = static_cast(0x80 | (code & 0x3f)); + } + return ptr; +} + string utf8_to_lower(Slice str) { string result; auto pos = str.ubegin(); diff --git a/tdutils/td/utils/utf8.h b/tdutils/td/utils/utf8.h index 13f4896b9..21a02eca1 100644 --- a/tdutils/td/utils/utf8.h +++ b/tdutils/td/utils/utf8.h @@ -33,21 +33,21 @@ size_t utf8_utf16_length(Slice str); /// appends a Unicode character using UTF-8 encoding template -void append_utf8_character(T &str, uint32 ch) { - if (ch <= 0x7f) { - str.push_back(static_cast(ch)); - } else if (ch <= 0x7ff) { - str.push_back(static_cast(0xc0 | (ch >> 6))); // implementation-defined - str.push_back(static_cast(0x80 | (ch & 0x3f))); - } else if (ch <= 0xffff) { - str.push_back(static_cast(0xe0 | (ch >> 12))); // implementation-defined - str.push_back(static_cast(0x80 | ((ch >> 6) & 0x3f))); - str.push_back(static_cast(0x80 | (ch & 0x3f))); +void append_utf8_character(T &str, uint32 code) { + if (code <= 0x7f) { + str.push_back(static_cast(code)); + } else if (code <= 0x7ff) { + str.push_back(static_cast(0xc0 | (code >> 6))); // implementation-defined + str.push_back(static_cast(0x80 | (code & 0x3f))); + } else if (code <= 0xffff) { + str.push_back(static_cast(0xe0 | (code >> 12))); // implementation-defined + str.push_back(static_cast(0x80 | ((code >> 6) & 0x3f))); + str.push_back(static_cast(0x80 | (code & 0x3f))); } else { - str.push_back(static_cast(0xf0 | (ch >> 18))); // implementation-defined - str.push_back(static_cast(0x80 | ((ch >> 12) & 0x3f))); - str.push_back(static_cast(0x80 | ((ch >> 6) & 0x3f))); - str.push_back(static_cast(0x80 | (ch & 0x3f))); + str.push_back(static_cast(0xf0 | (code >> 18))); // implementation-defined + str.push_back(static_cast(0x80 | ((code >> 12) & 0x3f))); + str.push_back(static_cast(0x80 | ((code >> 6) & 0x3f))); + str.push_back(static_cast(0x80 | (code & 0x3f))); } } @@ -62,6 +62,9 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) { /// moves pointer one UTF-8 character forward and saves code of the skipped character in *code const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code); +/// appends a Unicode character using UTF-8 encoding and returns updated pointer +unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code); + /// truncates UTF-8 string to the given length in Unicode characters template T utf8_truncate(T str, size_t length) {