Replace text in-place in parse_html.

This commit is contained in:
levlam 2023-01-09 15:12:14 +03:00
parent ac0de06b5f
commit 137fd3beb9
3 changed files with 45 additions and 23 deletions

View File

@ -2991,14 +2991,13 @@ static uint32 decode_html_entity(CSlice text, size_t &pos) {
Result<vector<MessageEntity>> parse_html(string &str) {
auto str_size = str.size();
const char *text = str.c_str();
auto result_end = MutableSlice(str).ubegin();
const unsigned char *result_begin = result_end;
vector<MessageEntity> entities;
int32 utf16_offset = 0;
bool need_recheck_utf8 = false;
auto buf = StackAllocator::alloc(str_size + 30);
StringBuilder new_text(buf.as_slice(), true);
struct EntityInfo {
string tag_name;
string argument;
@ -3025,7 +3024,8 @@ Result<vector<MessageEntity>> parse_html(string &str) {
// half of a surrogate pair
need_recheck_utf8 = true;
}
append_utf8_character(new_text, ch);
result_end = append_utf8_character_unsafe(result_end, ch);
CHECK(result_end <= result_begin + i);
continue;
}
}
@ -3033,7 +3033,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
if (is_utf8_character_first_code_unit(c)) {
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
new_text.push_back(text[i]);
*result_end++ = c;
continue;
}
@ -3139,7 +3139,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
<< "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
}
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, new_text.size());
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result_end - result_begin);
} else {
// end of an entity
if (nested_entities.empty()) {
@ -3186,7 +3186,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
} else if (tag_name == "a") {
auto url = std::move(nested_entities.back().argument);
if (url.empty()) {
url = new_text.as_cslice().substr(nested_entities.back().entity_begin_pos).str();
url = Slice(result_begin + nested_entities.back().entity_begin_pos, result_end).str();
}
auto user_id = LinkManager::get_link_user_id(url);
if (user_id.is_valid()) {
@ -3235,12 +3235,12 @@ Result<vector<MessageEntity>> parse_html(string &str) {
sort_entities(entities);
if (need_recheck_utf8 && !check_utf8(new_text.as_cslice())) {
str.resize(static_cast<size_t>(result_end - result_begin));
if (need_recheck_utf8 && !check_utf8(str)) {
return Status::Error(400,
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
"surrogate code units");
}
str = new_text.as_cslice().str();
return std::move(entities);
}

View File

@ -82,6 +82,25 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
return ptr;
}
/// Writes the UTF-8 encoding of the Unicode character `code` starting at `ptr`
/// and returns the pointer just past the last written byte.
/// From 1 to 4 bytes are written; the destination is not bounds-checked.
unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code) {
  // Select the leading byte and the number of 6-bit continuation bytes by code range.
  unsigned char lead_byte;
  int continuation_count;
  if (code <= 0x7f) {
    lead_byte = static_cast<unsigned char>(code);
    continuation_count = 0;
  } else if (code <= 0x7ff) {
    lead_byte = static_cast<unsigned char>(0xc0 | (code >> 6));
    continuation_count = 1;
  } else if (code <= 0xffff) {
    lead_byte = static_cast<unsigned char>(0xe0 | (code >> 12));
    continuation_count = 2;
  } else {
    lead_byte = static_cast<unsigned char>(0xf0 | (code >> 18));
    continuation_count = 3;
  }

  *ptr++ = lead_byte;
  // Emit continuation bytes (10xxxxxx) from the most significant 6-bit group down to the least.
  for (int shift = 6 * (continuation_count - 1); shift >= 0; shift -= 6) {
    *ptr++ = static_cast<unsigned char>(0x80 | ((code >> shift) & 0x3f));
  }
  return ptr;
}
string utf8_to_lower(Slice str) {
string result;
auto pos = str.ubegin();

View File

@ -33,21 +33,21 @@ size_t utf8_utf16_length(Slice str);
/// appends a Unicode character using UTF-8 encoding
template <class T>
void append_utf8_character(T &str, uint32 ch) {
if (ch <= 0x7f) {
str.push_back(static_cast<char>(ch));
} else if (ch <= 0x7ff) {
str.push_back(static_cast<char>(0xc0 | (ch >> 6))); // implementation-defined
str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
} else if (ch <= 0xffff) {
str.push_back(static_cast<char>(0xe0 | (ch >> 12))); // implementation-defined
str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f)));
str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
void append_utf8_character(T &str, uint32 code) {
if (code <= 0x7f) {
str.push_back(static_cast<char>(code));
} else if (code <= 0x7ff) {
str.push_back(static_cast<char>(0xc0 | (code >> 6))); // implementation-defined
str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
} else if (code <= 0xffff) {
str.push_back(static_cast<char>(0xe0 | (code >> 12))); // implementation-defined
str.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
} else {
str.push_back(static_cast<char>(0xf0 | (ch >> 18))); // implementation-defined
str.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3f)));
str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f)));
str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
str.push_back(static_cast<char>(0xf0 | (code >> 18))); // implementation-defined
str.push_back(static_cast<char>(0x80 | ((code >> 12) & 0x3f)));
str.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
}
}
@ -62,6 +62,9 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
/// appends a Unicode character using UTF-8 encoding and returns updated pointer;
/// writes from 1 to 4 bytes starting at ptr without any bounds checking
unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code);
/// truncates UTF-8 string to the given length in Unicode characters
template <class T>
T utf8_truncate(T str, size_t length) {