Replace text in-place in parse_html.

This commit is contained in:
levlam 2023-01-09 15:12:14 +03:00
parent ac0de06b5f
commit 137fd3beb9
3 changed files with 45 additions and 23 deletions

View File

@ -2991,14 +2991,13 @@ static uint32 decode_html_entity(CSlice text, size_t &pos) {
Result<vector<MessageEntity>> parse_html(string &str) { Result<vector<MessageEntity>> parse_html(string &str) {
auto str_size = str.size(); auto str_size = str.size();
const char *text = str.c_str(); const char *text = str.c_str();
auto result_end = MutableSlice(str).ubegin();
const unsigned char *result_begin = result_end;
vector<MessageEntity> entities; vector<MessageEntity> entities;
int32 utf16_offset = 0; int32 utf16_offset = 0;
bool need_recheck_utf8 = false; bool need_recheck_utf8 = false;
auto buf = StackAllocator::alloc(str_size + 30);
StringBuilder new_text(buf.as_slice(), true);
struct EntityInfo { struct EntityInfo {
string tag_name; string tag_name;
string argument; string argument;
@ -3025,7 +3024,8 @@ Result<vector<MessageEntity>> parse_html(string &str) {
// half of a surrogate pair // half of a surrogate pair
need_recheck_utf8 = true; need_recheck_utf8 = true;
} }
append_utf8_character(new_text, ch); result_end = append_utf8_character_unsafe(result_end, ch);
CHECK(result_end <= result_begin + i);
continue; continue;
} }
} }
@ -3033,7 +3033,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
if (is_utf8_character_first_code_unit(c)) { if (is_utf8_character_first_code_unit(c)) {
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
} }
new_text.push_back(text[i]); *result_end++ = c;
continue; continue;
} }
@ -3139,7 +3139,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
<< "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos); << "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
} }
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, new_text.size()); nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result_end - result_begin);
} else { } else {
// end of an entity // end of an entity
if (nested_entities.empty()) { if (nested_entities.empty()) {
@ -3186,7 +3186,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
} else if (tag_name == "a") { } else if (tag_name == "a") {
auto url = std::move(nested_entities.back().argument); auto url = std::move(nested_entities.back().argument);
if (url.empty()) { if (url.empty()) {
url = new_text.as_cslice().substr(nested_entities.back().entity_begin_pos).str(); url = Slice(result_begin + nested_entities.back().entity_begin_pos, result_end).str();
} }
auto user_id = LinkManager::get_link_user_id(url); auto user_id = LinkManager::get_link_user_id(url);
if (user_id.is_valid()) { if (user_id.is_valid()) {
@ -3235,12 +3235,12 @@ Result<vector<MessageEntity>> parse_html(string &str) {
sort_entities(entities); sort_entities(entities);
if (need_recheck_utf8 && !check_utf8(new_text.as_cslice())) { str.resize(static_cast<size_t>(result_end - result_begin));
if (need_recheck_utf8 && !check_utf8(str)) {
return Status::Error(400, return Status::Error(400,
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched " "Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
"surrogate code units"); "surrogate code units");
} }
str = new_text.as_cslice().str();
return std::move(entities); return std::move(entities);
} }

View File

@ -82,6 +82,25 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
return ptr; return ptr;
} }
/// Encodes the character `code` as UTF-8 starting at `ptr` and returns the pointer
/// just past the last byte written. Writes 1 to 4 bytes depending on the value of
/// `code`; performs no bounds checking and no validation of `code`.
unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code) {
  // continuation byte: 10xxxxxx holding the six bits of `code` found at the given shift
  const auto tail = [code](int shift) { return static_cast<unsigned char>(0x80 | ((code >> shift) & 0x3f)); };

  if (code <= 0x7f) {
    // single byte: 0xxxxxxx
    ptr[0] = static_cast<unsigned char>(code);
    return ptr + 1;
  }
  if (code <= 0x7ff) {
    // two bytes: 110xxxxx 10xxxxxx
    ptr[0] = static_cast<unsigned char>(0xc0 | (code >> 6));
    ptr[1] = tail(0);
    return ptr + 2;
  }
  if (code <= 0xffff) {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    ptr[0] = static_cast<unsigned char>(0xe0 | (code >> 12));
    ptr[1] = tail(6);
    ptr[2] = tail(0);
    return ptr + 3;
  }
  // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  ptr[0] = static_cast<unsigned char>(0xf0 | (code >> 18));
  ptr[1] = tail(12);
  ptr[2] = tail(6);
  ptr[3] = tail(0);
  return ptr + 4;
}
string utf8_to_lower(Slice str) { string utf8_to_lower(Slice str) {
string result; string result;
auto pos = str.ubegin(); auto pos = str.ubegin();

View File

@ -33,21 +33,21 @@ size_t utf8_utf16_length(Slice str);
/// appends a Unicode character using UTF-8 encoding /// appends a Unicode character using UTF-8 encoding
template <class T> template <class T>
void append_utf8_character(T &str, uint32 ch) { void append_utf8_character(T &str, uint32 code) {
if (ch <= 0x7f) { if (code <= 0x7f) {
str.push_back(static_cast<char>(ch)); str.push_back(static_cast<char>(code));
} else if (ch <= 0x7ff) { } else if (code <= 0x7ff) {
str.push_back(static_cast<char>(0xc0 | (ch >> 6))); // implementation-defined str.push_back(static_cast<char>(0xc0 | (code >> 6))); // implementation-defined
str.push_back(static_cast<char>(0x80 | (ch & 0x3f))); str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
} else if (ch <= 0xffff) { } else if (code <= 0xffff) {
str.push_back(static_cast<char>(0xe0 | (ch >> 12))); // implementation-defined str.push_back(static_cast<char>(0xe0 | (code >> 12))); // implementation-defined
str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f))); str.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
str.push_back(static_cast<char>(0x80 | (ch & 0x3f))); str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
} else { } else {
str.push_back(static_cast<char>(0xf0 | (ch >> 18))); // implementation-defined str.push_back(static_cast<char>(0xf0 | (code >> 18))); // implementation-defined
str.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3f))); str.push_back(static_cast<char>(0x80 | ((code >> 12) & 0x3f)));
str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f))); str.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
str.push_back(static_cast<char>(0x80 | (ch & 0x3f))); str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
} }
} }
@ -62,6 +62,9 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code /// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code); const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
/// appends a Unicode character using UTF-8 encoding and returns updated pointer
unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code);
/// truncates UTF-8 string to the given length in Unicode characters /// truncates UTF-8 string to the given length in Unicode characters
template <class T> template <class T>
T utf8_truncate(T str, size_t length) { T utf8_truncate(T str, size_t length) {