Optimize parsing of argument value in HTML tags.

This commit is contained in:
levlam 2023-01-09 17:14:31 +03:00
parent 1919671df3
commit eeb73d4eb2

View File

@ -2936,11 +2936,7 @@ FormattedText get_markdown_v3(FormattedText text) {
}
static uint32 decode_html_entity(CSlice text, size_t &pos) {
auto c = static_cast<unsigned char>(text[pos]);
if (c != '&') {
return 0;
}
CHECK(text[pos] == '&');
size_t end_pos = pos + 1;
uint32 res = 0;
if (text[pos + 1] == '#') {
@ -3016,15 +3012,15 @@ Result<vector<MessageEntity>> parse_html(string &str) {
for (size_t i = 0; i < str_size; i++) {
auto c = static_cast<unsigned char>(text[i]);
if (c == '&') {
auto ch = decode_html_entity(str, i);
if (ch != 0) {
auto code = decode_html_entity(str, i);
if (code != 0) {
i--; // i will be incremented in for
utf16_offset += 1 + (ch > 0xffff);
if (ch >= 0xd800 && ch <= 0xdfff) {
utf16_offset += 1 + (code > 0xffff);
if (code >= 0xd800 && code <= 0xdfff) {
// half of a surrogate pair
need_recheck_utf8 = true;
}
result_end = append_utf8_character_unsafe(result_end, ch);
result_end = append_utf8_character_unsafe(result_end, code);
CHECK(result_end <= result_begin + i);
continue;
}
@ -3104,19 +3100,23 @@ Result<vector<MessageEntity>> parse_html(string &str) {
} else {
// A string literal
char end_character = text[i++];
char *attribute_end = &str[i];
const char *attribute_begin = attribute_end;
while (text[i] != end_character && text[i] != 0) {
if (text[i] == '&') {
auto ch = decode_html_entity(str, i);
if (ch != 0) {
append_utf8_character(attribute_value, ch);
auto code = decode_html_entity(str, i);
if (code != 0) {
attribute_end = reinterpret_cast<char *>(
append_utf8_character_unsafe(reinterpret_cast<unsigned char *>(attribute_end), code));
continue;
}
}
attribute_value.push_back(text[i++]);
*attribute_end++ = text[i++];
}
if (text[i] == end_character) {
i++;
}
attribute_value.assign(attribute_begin, static_cast<size_t>(attribute_end - attribute_begin));
}
if (text[i] == 0) {
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);