Support nested entities in parse_html.

GitOrigin-RevId: c8a00262f5a8739d09b45ce710a5c7b920f2bfd4
This commit is contained in:
levlam 2019-09-26 18:36:45 +03:00
parent 3006357f7c
commit b1d1ea2e6c
2 changed files with 246 additions and 151 deletions

View File

@ -24,7 +24,7 @@
namespace td {
// Returns the priority value associated with an entity type.
//
// The value is read from a fixed table indexed by the numeric value of `type`, so the table must
// contain one element per MessageEntity::Type enumerator, in declaration order.
// NOTE(review): the table has 17 elements and the index is not range-checked here; confirm that it
// is kept in sync with the Type enum whenever an enumerator is added.
int MessageEntity::get_type_priority(Type type) {
  // Only the current table is kept; a second definition of `types` in the same scope is a redeclaration error.
  static const int types[] = {50, 50, 50, 50, 50, 90, 91, 20, 11, 10, 49, 49, 50, 50, 92, 93, 0};
  return types[static_cast<int32>(type)];
}
@ -1461,6 +1461,22 @@ static uint32 decode_html_entity(Slice text, size_t &pos) {
static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
vector<MessageEntity> entities;
int32 utf16_offset = 0;
struct EntityInfo {
string tag_name;
string url;
int32 entity_offset;
size_t entity_begin_pos;
EntityInfo(string tag_name, string url, int32 entity_offset, size_t entity_begin_pos)
: tag_name(std::move(tag_name))
, url(std::move(url))
, entity_offset(entity_offset)
, entity_begin_pos(entity_begin_pos) {
}
};
std::vector<EntityInfo> nested_entities;
for (size_t i = 0; i < text.size(); i++) {
auto c = static_cast<unsigned char>(text[i]);
if (c == '&') {
@ -1480,164 +1496,158 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
continue;
}
// we are at begin of the entity
size_t begin_pos = i++;
if (text[i] == '/') {
return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos);
}
while (!is_space(text[i]) && text[i] != '>') {
i++;
}
if (text[i] == 0) {
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
}
string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1));
if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" &&
tag_name != "pre" && tag_name != "code") {
return Status::Error(400,
PSLICE() << "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
}
string url;
// string language; TODO PreCode support
while (text[i] != '>') {
while (text[i] != 0 && is_space(text[i])) {
i++;
}
if (text[i] == '>') {
break;
}
auto attribute_begin_pos = i;
while (!is_space(text[i]) && text[i] != '=') {
i++;
}
Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos);
if (attribute_name.empty()) {
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \""
<< tag_name << "\" at byte offset " << begin_pos);
}
while (text[i] != 0 && is_space(text[i])) {
i++;
}
if (text[i] != '=') {
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \""
<< tag_name << "\" at byte offset " << begin_pos);
}
i++;
while (text[i] != 0 && is_space(text[i])) {
auto begin_pos = i++;
if (text[i] != '/') {
// begin of an entity
while (!is_space(text[i]) && text[i] != '>') {
i++;
}
if (text[i] == 0) {
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
}
string attribute_value;
if (text[i] != '\'' && text[i] != '"') {
// A name token (a sequence of letters, digits, periods, or hyphens). Name tokens are not case sensitive.
auto token_begin_pos = i;
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1));
if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" &&
tag_name != "pre" && tag_name != "code") {
return Status::Error(400, PSLICE()
<< "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
}
string url;
// string language; TODO PreCode support
while (text[i] != '>') {
while (text[i] != 0 && is_space(text[i])) {
i++;
}
attribute_value = to_lower(text.substr(token_begin_pos, i - token_begin_pos));
if (!is_space(text[i]) && text[i] != '>') {
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
if (text[i] == '>') {
break;
}
} else {
// A string literal
char end_character = text[i++];
while (text[i] != end_character && text[i] != 0) {
if (text[i] == '&') {
auto ch = decode_html_entity(text, i);
if (ch != 0) {
append_utf8_character(attribute_value, ch);
continue;
auto attribute_begin_pos = i;
while (!is_space(text[i]) && text[i] != '=') {
i++;
}
Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos);
if (attribute_name.empty()) {
return Status::Error(
400, PSLICE() << "Empty attribute name in the tag \"" << tag_name << "\" at byte offset " << begin_pos);
}
while (text[i] != 0 && is_space(text[i])) {
i++;
}
if (text[i] != '=') {
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of an attribute of the tag \""
<< tag_name << "\" at byte offset " << begin_pos);
}
i++;
while (text[i] != 0 && is_space(text[i])) {
i++;
}
if (text[i] == 0) {
return Status::Error(400, PSLICE()
<< "Unclosed start tag \"" << tag_name << "\" at byte offset " << begin_pos);
}
string attribute_value;
if (text[i] != '\'' && text[i] != '"') {
// A name token (a sequence of letters, digits, periods, or hyphens). Name tokens are not case sensitive.
auto token_begin_pos = i;
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
i++;
}
attribute_value = to_lower(text.substr(token_begin_pos, i - token_begin_pos));
if (!is_space(text[i]) && text[i] != '>') {
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
}
} else {
// A string literal
char end_character = text[i++];
while (text[i] != end_character && text[i] != 0) {
if (text[i] == '&') {
auto ch = decode_html_entity(text, i);
if (ch != 0) {
append_utf8_character(attribute_value, ch);
continue;
}
}
attribute_value.push_back(text[i++]);
}
if (text[i] == end_character) {
i++;
}
}
if (text[i] == 0) {
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
}
if (tag_name == "a" && attribute_name == Slice("href")) {
url = std::move(attribute_value);
}
}
nested_entities.emplace_back(std::move(tag_name), std::move(url), utf16_offset, result.size());
} else {
// end of an entity
if (nested_entities.empty()) {
return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos);
}
while (!is_space(text[i]) && text[i] != '>') {
i++;
}
Slice end_tag_name = text.substr(begin_pos + 2, i - begin_pos - 2);
while (is_space(text[i]) && text[i] != 0) {
i++;
}
if (text[i] != '>') {
return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << begin_pos);
}
string tag_name = std::move(nested_entities.back().tag_name);
if (!end_tag_name.empty() && end_tag_name != tag_name) {
return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << begin_pos << ", expected \"</"
<< tag_name << ">\", found \"</" << end_tag_name << ">\"");
}
if (utf16_offset > nested_entities.back().entity_offset) {
auto entity_offset = nested_entities.back().entity_offset;
auto entity_length = utf16_offset - entity_offset;
if (tag_name == "i" || tag_name == "em") {
entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
} else if (tag_name == "b" || tag_name == "strong") {
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
} else if (tag_name == "a") {
auto url = std::move(nested_entities.back().url);
if (url.empty()) {
url = result.substr(nested_entities.back().entity_begin_pos);
}
auto user_id = get_link_user_id(url);
if (user_id.is_valid()) {
entities.emplace_back(entity_offset, entity_length, user_id);
} else {
auto r_url = check_url(url);
if (r_url.is_ok()) {
entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok());
}
}
attribute_value.push_back(text[i++]);
}
if (text[i] == end_character) {
i++;
}
}
if (text[i] == 0) {
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
}
if (tag_name == "a" && attribute_name == Slice("href")) {
url = std::move(attribute_value);
}
}
i++;
int32 entity_offset = utf16_offset;
size_t entity_begin_pos = result.size();
while (text[i] != 0 && text[i] != '<') {
auto cur_ch = static_cast<unsigned char>(text[i]);
if (cur_ch == '&') {
auto ch = decode_html_entity(text, i);
if (ch != 0) {
utf16_offset += 1 + (ch > 0xffff);
append_utf8_character(result, ch);
continue;
}
}
if (is_utf8_character_first_code_unit(cur_ch)) {
utf16_offset += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogaite pair
}
result.push_back(text[i++]);
}
if (text[i] == 0) {
return Status::Error(400,
PSLICE() << "Can't find end tag corresponding to start tag at byte offset " << begin_pos);
}
auto end_tag_begin_pos = i++;
if (text[i] != '/') {
return Status::Error(400, PSLICE() << "Expected end tag at byte offset " << end_tag_begin_pos);
}
while (!is_space(text[i]) && text[i] != '>') {
i++;
}
Slice end_tag_name = text.substr(end_tag_begin_pos + 2, i - end_tag_begin_pos - 2);
while (is_space(text[i]) && text[i] != 0) {
i++;
}
if (text[i] != '>') {
return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << end_tag_begin_pos);
}
if (!end_tag_name.empty() && end_tag_name != tag_name) {
return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << end_tag_begin_pos
<< ", expected \"</" << tag_name << ">\", found\"</" << end_tag_name << ">\"");
}
if (utf16_offset > entity_offset) {
auto entity_length = utf16_offset - entity_offset;
if (tag_name == "i" || tag_name == "em") {
entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
} else if (tag_name == "b" || tag_name == "strong") {
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
} else if (tag_name == "a") {
if (url.empty()) {
url = result.substr(entity_begin_pos);
}
auto user_id = get_link_user_id(url);
if (user_id.is_valid()) {
entities.emplace_back(entity_offset, entity_length, user_id);
} else if (tag_name == "pre") {
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
} else if (tag_name == "code") {
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
} else {
auto r_url = check_url(url);
if (r_url.is_ok()) {
entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok());
}
UNREACHABLE();
}
} else if (tag_name == "pre") {
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
} else if (tag_name == "code") {
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
}
nested_entities.pop_back();
}
}
if (!nested_entities.empty()) {
return Status::Error(
400, PSLICE() << "Can't find end tag corresponding to start tag " << nested_entities.back().tag_name);
}
std::sort(entities.begin(), entities.end());
return entities;
}

View File

@ -15,7 +15,7 @@
REGISTER_TESTS(message_entities);
static void check_mention(td::string str, td::vector<td::string> expected) {
static void check_mention(const td::string &str, const td::vector<td::string> &expected) {
auto result_slice = td::find_mentions(str);
td::vector<td::string> result;
for (auto &it : result_slice) {
@ -44,7 +44,7 @@ TEST(MessageEntities, mention) {
{"@gif", "@wiki", "@vid", "@bing", "@pic", "@bold", "@imdb", "@coub", "@like", "@vote", "@bingg"});
};
static void check_bot_command(td::string str, td::vector<td::string> expected) {
static void check_bot_command(const td::string &str, const td::vector<td::string> &expected) {
auto result_slice = td::find_bot_commands(str);
td::vector<td::string> result;
for (auto &it : result_slice) {
@ -68,7 +68,7 @@ TEST(MessageEntities, bot_command) {
check_bot_command("/test/", {});
}
static void check_hashtag(td::string str, td::vector<td::string> expected) {
static void check_hashtag(const td::string &str, const td::vector<td::string> &expected) {
auto result_slice = td::find_hashtags(str);
td::vector<td::string> result;
for (auto &it : result_slice) {
@ -109,7 +109,7 @@ TEST(MessageEntities, hashtag) {
check_hashtag(u8"#a\u2122", {"#a"});
}
static void check_cashtag(td::string str, td::vector<td::string> expected) {
static void check_cashtag(const td::string &str, const td::vector<td::string> &expected) {
auto result_slice = td::find_cashtags(str);
td::vector<td::string> result;
for (auto &it : result_slice) {
@ -161,7 +161,7 @@ TEST(MessageEntities, cashtag) {
check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
}
static void check_is_email_address(td::string str, bool expected) {
static void check_is_email_address(const td::string &str, bool expected) {
bool result = td::is_email_address(str);
LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";
}
@ -279,7 +279,7 @@ TEST(MessageEntities, is_email_address) {
}
}
static void check_url(td::string str, td::vector<td::string> expected_urls,
static void check_url(const td::string &str, const td::vector<td::string> &expected_urls,
td::vector<td::string> expected_email_addresses = {}) {
auto result_slice = td::find_urls(str);
td::vector<td::string> result_urls;
@ -530,8 +530,9 @@ TEST(MessageEntities, url) {
check_url("...๐Ÿ‘‰http://ab.com/cdefgh-1IJ", {}); // TODO
}
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities, td::string expected_str,
td::vector<td::MessageEntity> expected_entities, bool allow_empty,
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,
const td::string &expected_str,
const td::vector<td::MessageEntity> &expected_entities, bool allow_empty,
bool skip_new_entities, bool skip_bot_commands, bool for_draft) {
ASSERT_TRUE(
td::fix_formatted_text(str, entities, allow_empty, skip_new_entities, skip_bot_commands, for_draft).is_ok());
@ -721,3 +722,87 @@ TEST(MessageEntities, fix_formatted_text) {
false);
}
}
// Success case: parsing `text` as HTML must succeed, produce exactly `entities`, and leave `result` in `text`.
// `text` is taken by value because it is compared against `result` after the call, i.e. td::parse_html is
// expected to rewrite its argument.
static void check_parse_html(td::string text, const td::string &result, const td::vector<td::MessageEntity> &entities) {
  auto parsed = td::parse_html(text);
  ASSERT_TRUE(parsed.is_ok());

  const auto &actual_entities = parsed.ok();
  ASSERT_EQ(entities, actual_entities);
  ASSERT_STREQ(result, text);
}
// Failure case: parsing `text` as HTML must fail with error code 400 and exactly the message `error_message`.
static void check_parse_html(td::string text, const td::string &error_message) {
  auto parsed = td::parse_html(text);
  ASSERT_TRUE(parsed.is_error());

  const auto &error = parsed.error();
  ASSERT_EQ(400, error.code());
  ASSERT_STREQ(error_message, error.message());
}
// Exercises td::parse_html through the two check_parse_html helpers: first the inputs that must be
// rejected (with the exact error message), then the inputs that must be accepted (with the exact
// resulting text and entities).
//
// The literals are byte-exact on purpose:
//  * U+1F3DF (stadium) takes 4 bytes in UTF-8 and 2 UTF-16 code units, so the prefix "🏟 🏟&lt;" is
//    13 bytes long in the input and 6 UTF-16 code units long in the output;
//  * "➡️" is U+27A1 followed by U+FE0F, i.e. 2 UTF-16 code units;
//  * whitespace inside the literals is significant, because entity offsets and lengths count it.
TEST(MessageEntities, parse_html) {
  td::string invalid_surrogate_pair_error_message =
      "Text contains invalid Unicode characters after decoding HTML entities, check for unmatched surrogate code units";

  // Numeric character references that decode to a lone surrogate code unit.
  check_parse_html("&#57311;", invalid_surrogate_pair_error_message);
  check_parse_html("&#xDFDF;", invalid_surrogate_pair_error_message);
  check_parse_html("&#xDFDF", invalid_surrogate_pair_error_message);

  // Malformed start tags and attributes.
  check_parse_html("🏟 🏟&lt;<abacaba", "Unclosed start tag at byte offset 13");
  check_parse_html("🏟 🏟&lt;<abac aba>", "Unsupported start tag \"abac\" at byte offset 13");
  check_parse_html("🏟 🏟&lt;<abac>", "Unsupported start tag \"abac\" at byte offset 13");
  check_parse_html("🏟 🏟&lt;<i =aba>", "Empty attribute name in the tag \"i\" at byte offset 13");
  check_parse_html("🏟 🏟&lt;<i aba>",
                   "Expected equal sign in declaration of an attribute of the tag \"i\" at byte offset 13");
  check_parse_html("🏟 🏟&lt;<i aba = ", "Unclosed start tag \"i\" at byte offset 13");
  // The name token must start at byte 27: 13 bytes of prefix, "<i aba" (6), three spaces, '=', four spaces.
  // NOTE(review): only the total amount of padding is implied by the expected offset; its distribution
  // around '=' is a reconstruction.
  check_parse_html("🏟 🏟&lt;<i aba   =    190azAz-.,", "Unexpected end of name token at byte offset 27");
  check_parse_html("🏟 🏟&lt;<i aba = \"&lt;&gt;&quot;>", "Unclosed start tag at byte offset 13");
  check_parse_html("🏟 🏟&lt;<i aba = \'&lt;&gt;&quot;>", "Unclosed start tag at byte offset 13");

  // Malformed or unmatched end tags.
  check_parse_html("🏟 🏟&lt;</", "Unexpected end tag at byte offset 13");
  check_parse_html("🏟 🏟&lt;<b></b></", "Unexpected end tag at byte offset 20");
  check_parse_html("🏟 🏟&lt;<i>a</i   ", "Unclosed end tag at byte offset 17");
  check_parse_html("🏟 🏟&lt;<i>a</em   >",
                   "Unmatched end tag at byte offset 17, expected \"</i>\", found \"</em>\"");

  // Text without tags; unknown or out-of-range character references are kept verbatim.
  check_parse_html("", "", {});
  check_parse_html("➡️ ➡️", "➡️ ➡️", {});
  check_parse_html("&lt;&gt;&amp;&quot;&laquo;&raquo;&#12345678;", "<>&\"&laquo;&raquo;&#12345678;", {});

  // Single and nested entities; offsets and lengths are measured in UTF-16 code units.
  check_parse_html("➡️ ➡️<i>➡️ ➡️</i>", "➡️ ➡️➡️ ➡️", {{td::MessageEntity::Type::Italic, 5, 5}});
  check_parse_html("🏟 🏟<i>🏟 &lt🏟</i>", "🏟 🏟🏟 <🏟", {{td::MessageEntity::Type::Italic, 5, 6}});
  check_parse_html("🏟 🏟<i>🏟 &gt;<b aba = caba>&lt🏟</b></i>", "🏟 🏟🏟 ><🏟",
                   {{td::MessageEntity::Type::Italic, 5, 7}, {td::MessageEntity::Type::Bold, 9, 3}});

  // Attributes of tags other than "a" are parsed and ignored.
  check_parse_html("🏟 🏟&lt;<i aba = 190azAz-. >a</i>", "🏟 🏟<a", {{td::MessageEntity::Type::Italic, 6, 1}});
  check_parse_html("🏟 🏟&lt;<i aba = 190azAz-.>a</i>", "🏟 🏟<a", {{td::MessageEntity::Type::Italic, 6, 1}});
  check_parse_html("🏟 🏟&lt;<i aba = \"&lt;&gt;&quot;\">a</i>", "🏟 🏟<a",
                   {{td::MessageEntity::Type::Italic, 6, 1}});
  check_parse_html("🏟 🏟&lt;<i aba = '&lt;&gt;&quot;'>a</i>", "🏟 🏟<a",
                   {{td::MessageEntity::Type::Italic, 6, 1}});

  // An end tag may omit its name and may contain whitespace before '>'.
  check_parse_html("🏟 🏟&lt;<i aba = '&lt;&gt;&quot;'>a</>", "🏟 🏟<a",
                   {{td::MessageEntity::Type::Italic, 6, 1}});
  check_parse_html("🏟 🏟&lt;<i>a</ >", "🏟 🏟<a", {{td::MessageEntity::Type::Italic, 6, 1}});
  check_parse_html("🏟 🏟&lt;<i>a</i >", "🏟 🏟<a", {{td::MessageEntity::Type::Italic, 6, 1}});

  // An empty entity produces no MessageEntity.
  check_parse_html("🏟 🏟&lt;<b></b>", "🏟 🏟<", {});

  // Deep nesting: two one-character groups, so the resulting text is two spaces.
  check_parse_html("<code><i><b> </b></i></code><i><b><code> </code></b></i>", "  ",
                   {{td::MessageEntity::Type::Code, 0, 1},
                    {td::MessageEntity::Type::Bold, 0, 1},
                    {td::MessageEntity::Type::Italic, 0, 1},
                    {td::MessageEntity::Type::Code, 1, 1},
                    {td::MessageEntity::Type::Bold, 1, 1},
                    {td::MessageEntity::Type::Italic, 1, 1}});
  // Siblings inside one parent: the italic entity spans all three spaces of the resulting text.
  check_parse_html("<i><b> </b> <code> </code></i>", "   ",
                   {{td::MessageEntity::Type::Italic, 0, 3},
                    {td::MessageEntity::Type::Bold, 0, 1},
                    {td::MessageEntity::Type::Code, 2, 1}});

  // Links: href given as a name token or as a quoted string, with entity decoding inside the value.
  check_parse_html("<a href=telegram.org> </a>", " ",
                   {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
  check_parse_html("<a href  =\"telegram.org\"   > </a>", " ",
                   {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
  check_parse_html("<a   href=  'telegram.org'   > </a>", " ",
                   {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
  check_parse_html("<a   href=  'telegram.org?&lt;'   > </a>", " ",
                   {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/?<"}});

  // Links without href: the link text itself is used as the URL, and dropped when it is not a valid URL.
  check_parse_html("<a> </a>", " ", {});
  check_parse_html("<a>telegram.org </a>", "telegram.org ", {});
  check_parse_html("<a>telegram.org</a>", "telegram.org",
                   {{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}});
  check_parse_html("<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>", "https://telegram.org/asdsa?asdasdwe#12e3we",
                   {{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}});
}