Use Slice in parse_html.
GitOrigin-RevId: f0fa29d7fccf407dde8ec6085a285f1fa2c52aa2
This commit is contained in:
parent
f20ef8c0c5
commit
38d5be1ee9
@ -1406,7 +1406,7 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
|
|||||||
return entities;
|
return entities;
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32 decode_html_entity(const string &text, size_t &pos) {
|
static uint32 decode_html_entity(Slice text, size_t &pos) {
|
||||||
auto c = static_cast<unsigned char>(text[pos]);
|
auto c = static_cast<unsigned char>(text[pos]);
|
||||||
if (c != '&') {
|
if (c != '&') {
|
||||||
return 0;
|
return 0;
|
||||||
@ -1436,14 +1436,14 @@ static uint32 decode_html_entity(const string &text, size_t &pos) {
|
|||||||
while (is_alpha(text[end_pos])) {
|
while (is_alpha(text[end_pos])) {
|
||||||
end_pos++;
|
end_pos++;
|
||||||
}
|
}
|
||||||
string entity(text, pos + 1, end_pos - pos - 1);
|
Slice entity = text.substr(pos + 1, end_pos - pos - 1);
|
||||||
if (entity == "lt") {
|
if (entity == Slice("lt")) {
|
||||||
res = static_cast<uint32>('<');
|
res = static_cast<uint32>('<');
|
||||||
} else if (entity == "gt") {
|
} else if (entity == Slice("gt")) {
|
||||||
res = static_cast<uint32>('>');
|
res = static_cast<uint32>('>');
|
||||||
} else if (entity == "amp") {
|
} else if (entity == Slice("amp")) {
|
||||||
res = static_cast<uint32>('&');
|
res = static_cast<uint32>('&');
|
||||||
} else if (entity == "quot") {
|
} else if (entity == Slice("quot")) {
|
||||||
res = static_cast<uint32>('"');
|
res = static_cast<uint32>('"');
|
||||||
} else {
|
} else {
|
||||||
// unsupported literal entity
|
// unsupported literal entity
|
||||||
@ -1459,12 +1459,10 @@ static uint32 decode_html_entity(const string &text, size_t &pos) {
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
Result<vector<MessageEntity>> parse_html(string &text) {
|
static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||||
string result;
|
|
||||||
vector<MessageEntity> entities;
|
vector<MessageEntity> entities;
|
||||||
size_t size = text.size();
|
|
||||||
int32 utf16_offset = 0;
|
int32 utf16_offset = 0;
|
||||||
for (size_t i = 0; i < size; i++) {
|
for (size_t i = 0; i < text.size(); i++) {
|
||||||
auto c = static_cast<unsigned char>(text[i]);
|
auto c = static_cast<unsigned char>(text[i]);
|
||||||
if (c == '&') {
|
if (c == '&') {
|
||||||
auto ch = decode_html_entity(text, i);
|
auto ch = decode_html_entity(text, i);
|
||||||
@ -1495,8 +1493,7 @@ Result<vector<MessageEntity>> parse_html(string &text) {
|
|||||||
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
string tag_name(text, begin_pos + 1, i - begin_pos - 1);
|
string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1));
|
||||||
to_lower_inplace(tag_name);
|
|
||||||
if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" &&
|
if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" &&
|
||||||
tag_name != "pre" && tag_name != "code") {
|
tag_name != "pre" && tag_name != "code") {
|
||||||
return Status::Error(400,
|
return Status::Error(400,
|
||||||
@ -1516,7 +1513,7 @@ Result<vector<MessageEntity>> parse_html(string &text) {
|
|||||||
while (!is_space(text[i]) && text[i] != '=') {
|
while (!is_space(text[i]) && text[i] != '=') {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
string attribute_name(text, attribute_begin_pos, i - attribute_begin_pos);
|
Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos);
|
||||||
if (attribute_name.empty()) {
|
if (attribute_name.empty()) {
|
||||||
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \""
|
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \""
|
||||||
<< tag_name << "\" at byte offset " << begin_pos);
|
<< tag_name << "\" at byte offset " << begin_pos);
|
||||||
@ -1543,8 +1540,7 @@ Result<vector<MessageEntity>> parse_html(string &text) {
|
|||||||
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
|
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
attribute_value.assign(text, token_begin_pos, i - token_begin_pos);
|
attribute_value = to_lower(text.substr(token_begin_pos, i - token_begin_pos));
|
||||||
to_lower_inplace(attribute_value);
|
|
||||||
|
|
||||||
if (!is_space(text[i]) && text[i] != '>') {
|
if (!is_space(text[i]) && text[i] != '>') {
|
||||||
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
|
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
|
||||||
@ -1570,8 +1566,8 @@ Result<vector<MessageEntity>> parse_html(string &text) {
|
|||||||
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tag_name == "a" && attribute_name == "href") {
|
if (tag_name == "a" && attribute_name == Slice("href")) {
|
||||||
url = attribute_value;
|
url = std::move(attribute_value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
i++;
|
i++;
|
||||||
@ -1605,7 +1601,7 @@ Result<vector<MessageEntity>> parse_html(string &text) {
|
|||||||
while (!is_space(text[i]) && text[i] != '>') {
|
while (!is_space(text[i]) && text[i] != '>') {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
string end_tag_name(text, end_tag_begin_pos + 2, i - end_tag_begin_pos - 2);
|
Slice end_tag_name = text.substr(end_tag_begin_pos + 2, i - end_tag_begin_pos - 2);
|
||||||
while (is_space(text[i]) && text[i] != 0) {
|
while (is_space(text[i]) && text[i] != 0) {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
@ -1643,12 +1639,17 @@ Result<vector<MessageEntity>> parse_html(string &text) {
|
|||||||
utf16_offset += utf16_entity_length;
|
utf16_offset += utf16_entity_length;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return entities;
|
||||||
|
}
|
||||||
|
|
||||||
|
Result<vector<MessageEntity>> parse_html(string &text) {
|
||||||
|
string result;
|
||||||
|
TRY_RESULT(entities, do_parse_html(text, result));
|
||||||
if (!check_utf8(result)) {
|
if (!check_utf8(result)) {
|
||||||
return Status::Error(400,
|
return Status::Error(400,
|
||||||
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
|
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
|
||||||
"surrogate code units");
|
"surrogate code units");
|
||||||
}
|
}
|
||||||
|
|
||||||
text = result;
|
text = result;
|
||||||
return entities;
|
return entities;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user