Recheck UTF-8 in parse_html only if needed.
This commit is contained in:
parent
0fb64f97a4
commit
34226ac4ad
@ -2992,11 +2992,15 @@ static uint32 decode_html_entity(CSlice text, size_t &pos) {
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result) {
|
Result<vector<MessageEntity>> parse_html(string &str) {
|
||||||
|
auto str_size = str.size();
|
||||||
|
const char *text = str.c_str();
|
||||||
|
|
||||||
vector<MessageEntity> entities;
|
vector<MessageEntity> entities;
|
||||||
int32 utf16_offset = 0;
|
int32 utf16_offset = 0;
|
||||||
|
bool need_recheck_utf8 = false;
|
||||||
|
|
||||||
auto buf = StackAllocator::alloc(text.size() + 30);
|
auto buf = StackAllocator::alloc(str_size + 30);
|
||||||
StringBuilder new_text(buf.as_slice(), true);
|
StringBuilder new_text(buf.as_slice(), true);
|
||||||
|
|
||||||
struct EntityInfo {
|
struct EntityInfo {
|
||||||
@ -3014,13 +3018,17 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
};
|
};
|
||||||
vector<EntityInfo> nested_entities;
|
vector<EntityInfo> nested_entities;
|
||||||
|
|
||||||
for (size_t i = 0; i < text.size(); i++) {
|
for (size_t i = 0; i < str_size; i++) {
|
||||||
auto c = static_cast<unsigned char>(text[i]);
|
auto c = static_cast<unsigned char>(text[i]);
|
||||||
if (c == '&') {
|
if (c == '&') {
|
||||||
auto ch = decode_html_entity(text, i);
|
auto ch = decode_html_entity(str, i);
|
||||||
if (ch != 0) {
|
if (ch != 0) {
|
||||||
i--; // i will be incremented in for
|
i--; // i will be incremented in for
|
||||||
utf16_offset += 1 + (ch > 0xffff);
|
utf16_offset += 1 + (ch > 0xffff);
|
||||||
|
if (ch >= 0xd800 && ch <= 0xdfff) {
|
||||||
|
// half of a surrogate pair
|
||||||
|
need_recheck_utf8 = true;
|
||||||
|
}
|
||||||
append_utf8_character(new_text, ch);
|
append_utf8_character(new_text, ch);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -3043,7 +3051,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1));
|
string tag_name = to_lower(Slice(text + begin_pos + 1, i - begin_pos - 1));
|
||||||
if (tag_name != "a" && tag_name != "b" && tag_name != "strong" && tag_name != "i" && tag_name != "em" &&
|
if (tag_name != "a" && tag_name != "b" && tag_name != "strong" && tag_name != "i" && tag_name != "em" &&
|
||||||
tag_name != "s" && tag_name != "strike" && tag_name != "del" && tag_name != "u" && tag_name != "ins" &&
|
tag_name != "s" && tag_name != "strike" && tag_name != "del" && tag_name != "u" && tag_name != "ins" &&
|
||||||
tag_name != "tg-spoiler" && tag_name != "tg-emoji" && tag_name != "span" && tag_name != "pre" &&
|
tag_name != "tg-spoiler" && tag_name != "tg-emoji" && tag_name != "span" && tag_name != "pre" &&
|
||||||
@ -3064,7 +3072,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
while (!is_space(text[i]) && text[i] != '=') {
|
while (!is_space(text[i]) && text[i] != '=') {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos);
|
Slice attribute_name(text + attribute_begin_pos, i - attribute_begin_pos);
|
||||||
if (attribute_name.empty()) {
|
if (attribute_name.empty()) {
|
||||||
return Status::Error(
|
return Status::Error(
|
||||||
400, PSLICE() << "Empty attribute name in the tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
400, PSLICE() << "Empty attribute name in the tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||||
@ -3092,7 +3100,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
|
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
attribute_value = to_lower(text.substr(token_begin_pos, i - token_begin_pos));
|
attribute_value = to_lower(Slice(text + token_begin_pos, i - token_begin_pos));
|
||||||
|
|
||||||
if (!is_space(text[i]) && text[i] != '>') {
|
if (!is_space(text[i]) && text[i] != '>') {
|
||||||
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
|
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
|
||||||
@ -3102,7 +3110,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
char end_character = text[i++];
|
char end_character = text[i++];
|
||||||
while (text[i] != end_character && text[i] != 0) {
|
while (text[i] != end_character && text[i] != 0) {
|
||||||
if (text[i] == '&') {
|
if (text[i] == '&') {
|
||||||
auto ch = decode_html_entity(text, i);
|
auto ch = decode_html_entity(str, i);
|
||||||
if (ch != 0) {
|
if (ch != 0) {
|
||||||
append_utf8_character(attribute_value, ch);
|
append_utf8_character(attribute_value, ch);
|
||||||
continue;
|
continue;
|
||||||
@ -3145,7 +3153,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
while (!is_space(text[i]) && text[i] != '>') {
|
while (!is_space(text[i]) && text[i] != '>') {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
string end_tag_name = to_lower(text.substr(begin_pos + 2, i - begin_pos - 2));
|
string end_tag_name = to_lower(Slice(text + begin_pos + 2, i - begin_pos - 2));
|
||||||
while (is_space(text[i]) && text[i] != 0) {
|
while (is_space(text[i]) && text[i] != 0) {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
@ -3231,19 +3239,12 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
|
|
||||||
sort_entities(entities);
|
sort_entities(entities);
|
||||||
|
|
||||||
result = new_text.as_cslice().str();
|
if (need_recheck_utf8 && !check_utf8(new_text.as_cslice())) {
|
||||||
return std::move(entities);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result<vector<MessageEntity>> parse_html(string &text) {
|
|
||||||
string result;
|
|
||||||
TRY_RESULT(entities, do_parse_html(text, result));
|
|
||||||
if (!check_utf8(result)) {
|
|
||||||
return Status::Error(400,
|
return Status::Error(400,
|
||||||
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
|
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
|
||||||
"surrogate code units");
|
"surrogate code units");
|
||||||
}
|
}
|
||||||
text = std::move(result);
|
str = new_text.as_cslice().str();
|
||||||
return std::move(entities);
|
return std::move(entities);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -181,7 +181,7 @@ FormattedText parse_markdown_v3(FormattedText text);
|
|||||||
|
|
||||||
FormattedText get_markdown_v3(FormattedText text);
|
FormattedText get_markdown_v3(FormattedText text);
|
||||||
|
|
||||||
Result<vector<MessageEntity>> parse_html(string &text);
|
Result<vector<MessageEntity>> parse_html(string &str);
|
||||||
|
|
||||||
vector<tl_object_ptr<telegram_api::MessageEntity>> get_input_message_entities(const ContactsManager *contacts_manager,
|
vector<tl_object_ptr<telegram_api::MessageEntity>> get_input_message_entities(const ContactsManager *contacts_manager,
|
||||||
const vector<MessageEntity> &entities,
|
const vector<MessageEntity> &entities,
|
||||||
|
Loading…
Reference in New Issue
Block a user