Support nested entities in parse_html.
GitOrigin-RevId: c8a00262f5a8739d09b45ce710a5c7b920f2bfd4
This commit is contained in:
parent
3006357f7c
commit
b1d1ea2e6c
@ -24,7 +24,7 @@
|
||||
namespace td {
|
||||
|
||||
int MessageEntity::get_type_priority(Type type) {
|
||||
static const int types[] = {5, 5, 5, 5, 5, 9, 9, 2, 1, 1, 5, 5, 5, 5, 9, 9, 0};
|
||||
static const int types[] = {50, 50, 50, 50, 50, 90, 91, 20, 11, 10, 49, 49, 50, 50, 92, 93, 0};
|
||||
return types[static_cast<int32>(type)];
|
||||
}
|
||||
|
||||
@ -1461,6 +1461,22 @@ static uint32 decode_html_entity(Slice text, size_t &pos) {
|
||||
static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||
vector<MessageEntity> entities;
|
||||
int32 utf16_offset = 0;
|
||||
|
||||
// Bookkeeping record for a start tag whose matching end tag has not been
// reached yet. One record is appended to nested_entities when a start tag is
// parsed, and the last record is consulted and popped when an end tag is parsed.
struct EntityInfo {
|
||||
// Tag name as produced by to_lower() on the start tag text.
string tag_name;
|
||||
// URL collected from the start tag's attributes; may be empty, in which case
// the "a" tag handling falls back to the text accumulated since entity_begin_pos.
// NOTE(review): the condition selecting which attribute supplies the URL is
// not visible in this view - confirm it is "href".
string url;
|
||||
// Value of utf16_offset at the moment the start tag was parsed; the entity
// length is computed as the current utf16_offset minus this value.
int32 entity_offset;
|
||||
// Value of result.size() at the moment the start tag was parsed, i.e. the
// byte position in the output string where the entity's text begins.
size_t entity_begin_pos;
|
||||
|
||||
// Takes both strings by value and moves them into the members.
EntityInfo(string tag_name, string url, int32 entity_offset, size_t entity_begin_pos)
|
||||
: tag_name(std::move(tag_name))
|
||||
, url(std::move(url))
|
||||
, entity_offset(entity_offset)
|
||||
, entity_begin_pos(entity_begin_pos) {
|
||||
}
|
||||
};
|
||||
std::vector<EntityInfo> nested_entities;
|
||||
|
||||
for (size_t i = 0; i < text.size(); i++) {
|
||||
auto c = static_cast<unsigned char>(text[i]);
|
||||
if (c == '&') {
|
||||
@ -1480,11 +1496,9 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// we are at begin of the entity
|
||||
size_t begin_pos = i++;
|
||||
if (text[i] == '/') {
|
||||
return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos);
|
||||
}
|
||||
auto begin_pos = i++;
|
||||
if (text[i] != '/') {
|
||||
// begin of an entity
|
||||
while (!is_space(text[i]) && text[i] != '>') {
|
||||
i++;
|
||||
}
|
||||
@ -1495,8 +1509,8 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||
string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1));
|
||||
if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" &&
|
||||
tag_name != "pre" && tag_name != "code") {
|
||||
return Status::Error(400,
|
||||
PSLICE() << "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||
return Status::Error(400, PSLICE()
|
||||
<< "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||
}
|
||||
|
||||
string url;
|
||||
@ -1514,14 +1528,14 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||
}
|
||||
Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos);
|
||||
if (attribute_name.empty()) {
|
||||
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \""
|
||||
<< tag_name << "\" at byte offset " << begin_pos);
|
||||
return Status::Error(
|
||||
400, PSLICE() << "Empty attribute name in the tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||
}
|
||||
while (text[i] != 0 && is_space(text[i])) {
|
||||
i++;
|
||||
}
|
||||
if (text[i] != '=') {
|
||||
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \""
|
||||
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of an attribute of the tag \""
|
||||
<< tag_name << "\" at byte offset " << begin_pos);
|
||||
}
|
||||
i++;
|
||||
@ -1529,7 +1543,8 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||
i++;
|
||||
}
|
||||
if (text[i] == 0) {
|
||||
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
||||
return Status::Error(400, PSLICE()
|
||||
<< "Unclosed start tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||
}
|
||||
|
||||
string attribute_value;
|
||||
@ -1569,58 +1584,42 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||
url = std::move(attribute_value);
|
||||
}
|
||||
}
|
||||
i++;
|
||||
|
||||
int32 entity_offset = utf16_offset;
|
||||
size_t entity_begin_pos = result.size();
|
||||
while (text[i] != 0 && text[i] != '<') {
|
||||
auto cur_ch = static_cast<unsigned char>(text[i]);
|
||||
if (cur_ch == '&') {
|
||||
auto ch = decode_html_entity(text, i);
|
||||
if (ch != 0) {
|
||||
utf16_offset += 1 + (ch > 0xffff);
|
||||
append_utf8_character(result, ch);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (is_utf8_character_first_code_unit(cur_ch)) {
|
||||
utf16_offset += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogaite pair
|
||||
}
|
||||
result.push_back(text[i++]);
|
||||
}
|
||||
if (text[i] == 0) {
|
||||
return Status::Error(400,
|
||||
PSLICE() << "Can't find end tag corresponding to start tag at byte offset " << begin_pos);
|
||||
nested_entities.emplace_back(std::move(tag_name), std::move(url), utf16_offset, result.size());
|
||||
} else {
|
||||
// end of an entity
|
||||
if (nested_entities.empty()) {
|
||||
return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos);
|
||||
}
|
||||
|
||||
auto end_tag_begin_pos = i++;
|
||||
if (text[i] != '/') {
|
||||
return Status::Error(400, PSLICE() << "Expected end tag at byte offset " << end_tag_begin_pos);
|
||||
}
|
||||
while (!is_space(text[i]) && text[i] != '>') {
|
||||
i++;
|
||||
}
|
||||
Slice end_tag_name = text.substr(end_tag_begin_pos + 2, i - end_tag_begin_pos - 2);
|
||||
Slice end_tag_name = text.substr(begin_pos + 2, i - begin_pos - 2);
|
||||
while (is_space(text[i]) && text[i] != 0) {
|
||||
i++;
|
||||
}
|
||||
if (text[i] != '>') {
|
||||
return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << end_tag_begin_pos);
|
||||
}
|
||||
if (!end_tag_name.empty() && end_tag_name != tag_name) {
|
||||
return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << end_tag_begin_pos
|
||||
<< ", expected \"</" << tag_name << ">\", found\"</" << end_tag_name << ">\"");
|
||||
return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << begin_pos);
|
||||
}
|
||||
|
||||
if (utf16_offset > entity_offset) {
|
||||
string tag_name = std::move(nested_entities.back().tag_name);
|
||||
if (!end_tag_name.empty() && end_tag_name != tag_name) {
|
||||
return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << begin_pos << ", expected \"</"
|
||||
<< tag_name << ">\", found \"</" << end_tag_name << ">\"");
|
||||
}
|
||||
|
||||
if (utf16_offset > nested_entities.back().entity_offset) {
|
||||
auto entity_offset = nested_entities.back().entity_offset;
|
||||
auto entity_length = utf16_offset - entity_offset;
|
||||
if (tag_name == "i" || tag_name == "em") {
|
||||
entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
|
||||
} else if (tag_name == "b" || tag_name == "strong") {
|
||||
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
|
||||
} else if (tag_name == "a") {
|
||||
auto url = std::move(nested_entities.back().url);
|
||||
if (url.empty()) {
|
||||
url = result.substr(entity_begin_pos);
|
||||
url = result.substr(nested_entities.back().entity_begin_pos);
|
||||
}
|
||||
auto user_id = get_link_user_id(url);
|
||||
if (user_id.is_valid()) {
|
||||
@ -1635,9 +1634,20 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
|
||||
} else if (tag_name == "code") {
|
||||
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
|
||||
} else {
|
||||
UNREACHABLE();
|
||||
}
|
||||
}
|
||||
nested_entities.pop_back();
|
||||
}
|
||||
}
|
||||
if (!nested_entities.empty()) {
|
||||
return Status::Error(
|
||||
400, PSLICE() << "Can't find end tag corresponding to start tag " << nested_entities.back().tag_name);
|
||||
}
|
||||
|
||||
std::sort(entities.begin(), entities.end());
|
||||
|
||||
return entities;
|
||||
}
|
||||
|
||||
|
@ -15,7 +15,7 @@
|
||||
|
||||
REGISTER_TESTS(message_entities);
|
||||
|
||||
static void check_mention(td::string str, td::vector<td::string> expected) {
|
||||
static void check_mention(const td::string &str, const td::vector<td::string> &expected) {
|
||||
auto result_slice = td::find_mentions(str);
|
||||
td::vector<td::string> result;
|
||||
for (auto &it : result_slice) {
|
||||
@ -44,7 +44,7 @@ TEST(MessageEntities, mention) {
|
||||
{"@gif", "@wiki", "@vid", "@bing", "@pic", "@bold", "@imdb", "@coub", "@like", "@vote", "@bingg"});
|
||||
};
|
||||
|
||||
static void check_bot_command(td::string str, td::vector<td::string> expected) {
|
||||
static void check_bot_command(const td::string &str, const td::vector<td::string> &expected) {
|
||||
auto result_slice = td::find_bot_commands(str);
|
||||
td::vector<td::string> result;
|
||||
for (auto &it : result_slice) {
|
||||
@ -68,7 +68,7 @@ TEST(MessageEntities, bot_command) {
|
||||
check_bot_command("/test/", {});
|
||||
}
|
||||
|
||||
static void check_hashtag(td::string str, td::vector<td::string> expected) {
|
||||
static void check_hashtag(const td::string &str, const td::vector<td::string> &expected) {
|
||||
auto result_slice = td::find_hashtags(str);
|
||||
td::vector<td::string> result;
|
||||
for (auto &it : result_slice) {
|
||||
@ -109,7 +109,7 @@ TEST(MessageEntities, hashtag) {
|
||||
check_hashtag(u8"#a\u2122", {"#a"});
|
||||
}
|
||||
|
||||
static void check_cashtag(td::string str, td::vector<td::string> expected) {
|
||||
static void check_cashtag(const td::string &str, const td::vector<td::string> &expected) {
|
||||
auto result_slice = td::find_cashtags(str);
|
||||
td::vector<td::string> result;
|
||||
for (auto &it : result_slice) {
|
||||
@ -161,7 +161,7 @@ TEST(MessageEntities, cashtag) {
|
||||
check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
|
||||
}
|
||||
|
||||
static void check_is_email_address(td::string str, bool expected) {
|
||||
static void check_is_email_address(const td::string &str, bool expected) {
|
||||
bool result = td::is_email_address(str);
|
||||
LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";
|
||||
}
|
||||
@ -279,7 +279,7 @@ TEST(MessageEntities, is_email_address) {
|
||||
}
|
||||
}
|
||||
|
||||
static void check_url(td::string str, td::vector<td::string> expected_urls,
|
||||
static void check_url(const td::string &str, const td::vector<td::string> &expected_urls,
|
||||
td::vector<td::string> expected_email_addresses = {}) {
|
||||
auto result_slice = td::find_urls(str);
|
||||
td::vector<td::string> result_urls;
|
||||
@ -530,8 +530,9 @@ TEST(MessageEntities, url) {
|
||||
check_url("...๐http://ab.com/cdefgh-1IJ", {}); // TODO
|
||||
}
|
||||
|
||||
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities, td::string expected_str,
|
||||
td::vector<td::MessageEntity> expected_entities, bool allow_empty,
|
||||
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,
|
||||
const td::string &expected_str,
|
||||
const td::vector<td::MessageEntity> &expected_entities, bool allow_empty,
|
||||
bool skip_new_entities, bool skip_bot_commands, bool for_draft) {
|
||||
ASSERT_TRUE(
|
||||
td::fix_formatted_text(str, entities, allow_empty, skip_new_entities, skip_bot_commands, for_draft).is_ok());
|
||||
@ -721,3 +722,87 @@ TEST(MessageEntities, fix_formatted_text) {
|
||||
false);
|
||||
}
|
||||
}
|
||||
|
||||
// Test helper for the success path of td::parse_html.
// Asserts that parsing `text` succeeds, that the returned entities equal
// `entities`, and that `text` afterwards equals `result`.
// `text` is taken by value and compared after the call, so td::parse_html
// presumably rewrites its argument in place - confirm against its declaration.
static void check_parse_html(td::string text, const td::string &result, const td::vector<td::MessageEntity> &entities) {
|
||||
auto r_entities = td::parse_html(text);
|
||||
ASSERT_TRUE(r_entities.is_ok());
|
||||
ASSERT_EQ(entities, r_entities.ok());
|
||||
ASSERT_STREQ(result, text);
|
||||
}
|
||||
|
||||
// Test helper for the failure path of td::parse_html.
// Asserts that parsing `text` returns an error, that the error code is 400,
// and that the error message equals `error_message` exactly.
static void check_parse_html(td::string text, const td::string &error_message) {
|
||||
auto r_entities = td::parse_html(text);
|
||||
ASSERT_TRUE(r_entities.is_error());
|
||||
ASSERT_EQ(400, r_entities.error().code());
|
||||
ASSERT_STREQ(error_message, r_entities.error().message());
|
||||
}
|
||||
|
||||
TEST(MessageEntities, parse_html) {
|
||||
td::string invalid_surrogate_pair_error_message =
|
||||
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched surrogate code units";
|
||||
check_parse_html("�", invalid_surrogate_pair_error_message);
|
||||
check_parse_html("�", invalid_surrogate_pair_error_message);
|
||||
check_parse_html("�", invalid_surrogate_pair_error_message);
|
||||
check_parse_html("๐ ๐<<abacaba", "Unclosed start tag at byte offset 13");
|
||||
check_parse_html("๐ ๐<<abac aba>", "Unsupported start tag \"abac\" at byte offset 13");
|
||||
check_parse_html("๐ ๐<<abac>", "Unsupported start tag \"abac\" at byte offset 13");
|
||||
check_parse_html("๐ ๐<<i =aba>", "Empty attribute name in the tag \"i\" at byte offset 13");
|
||||
check_parse_html("๐ ๐<<i aba>",
|
||||
"Expected equal sign in declaration of an attribute of the tag \"i\" at byte offset 13");
|
||||
check_parse_html("๐ ๐<<i aba = ", "Unclosed start tag \"i\" at byte offset 13");
|
||||
check_parse_html("๐ ๐<<i aba = 190azAz-.,", "Unexpected end of name token at byte offset 27");
|
||||
check_parse_html("๐ ๐<<i aba = \"<>">", "Unclosed start tag at byte offset 13");
|
||||
check_parse_html("๐ ๐<<i aba = \'<>">", "Unclosed start tag at byte offset 13");
|
||||
check_parse_html("๐ ๐<</", "Unexpected end tag at byte offset 13");
|
||||
check_parse_html("๐ ๐<<b></b></", "Unexpected end tag at byte offset 20");
|
||||
check_parse_html("๐ ๐<<i>a</i ", "Unclosed end tag at byte offset 17");
|
||||
check_parse_html("๐ ๐<<i>a</em >",
|
||||
"Unmatched end tag at byte offset 17, expected \"</i>\", found \"</em>\"");
|
||||
|
||||
check_parse_html("", "", {});
|
||||
check_parse_html("โก๏ธ โก๏ธ", "โก๏ธ โก๏ธ", {});
|
||||
check_parse_html("<>&"«»�", "<>&\"«»�", {});
|
||||
check_parse_html("โก๏ธ โก๏ธ<i>โก๏ธ โก๏ธ</i>", "โก๏ธ โก๏ธโก๏ธ โก๏ธ",
|
||||
{{td::MessageEntity::Type::Italic, 5, 5}});
|
||||
check_parse_html("๐ ๐<i>๐ <๐</i>", "๐ ๐๐ <๐", {{td::MessageEntity::Type::Italic, 5, 6}});
|
||||
check_parse_html("๐ ๐<i>๐ ><b aba = caba><๐</b></i>", "๐ ๐๐ ><๐",
|
||||
{{td::MessageEntity::Type::Italic, 5, 7}, {td::MessageEntity::Type::Bold, 9, 3}});
|
||||
check_parse_html("๐ ๐<<i aba = 190azAz-. >a</i>", "๐ ๐<a",
|
||||
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||
check_parse_html("๐ ๐<<i aba = 190azAz-.>a</i>", "๐ ๐<a",
|
||||
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||
check_parse_html("๐ ๐<<i aba = \"<>"\">a</i>", "๐ ๐<a",
|
||||
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||
check_parse_html("๐ ๐<<i aba = '<>"'>a</i>", "๐ ๐<a",
|
||||
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||
check_parse_html("๐ ๐<<i aba = '<>"'>a</>", "๐ ๐<a",
|
||||
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||
check_parse_html("๐ ๐<<i>a</ >", "๐ ๐<a", {{td::MessageEntity::Type::Italic, 6, 1}});
|
||||
check_parse_html("๐ ๐<<i>a</i >", "๐ ๐<a", {{td::MessageEntity::Type::Italic, 6, 1}});
|
||||
check_parse_html("๐ ๐<<b></b>", "๐ ๐<", {});
|
||||
check_parse_html("<code><i><b> </b></i></code><i><b><code> </code></b></i>", " ",
|
||||
{{td::MessageEntity::Type::Code, 0, 1},
|
||||
{td::MessageEntity::Type::Bold, 0, 1},
|
||||
{td::MessageEntity::Type::Italic, 0, 1},
|
||||
{td::MessageEntity::Type::Code, 1, 1},
|
||||
{td::MessageEntity::Type::Bold, 1, 1},
|
||||
{td::MessageEntity::Type::Italic, 1, 1}});
|
||||
check_parse_html("<i><b> </b> <code> </code></i>", " ",
|
||||
{{td::MessageEntity::Type::Italic, 0, 3},
|
||||
{td::MessageEntity::Type::Bold, 0, 1},
|
||||
{td::MessageEntity::Type::Code, 2, 1}});
|
||||
check_parse_html("<a href=telegram.org> </a>", " ",
|
||||
{{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
|
||||
check_parse_html("<a href =\"telegram.org\" > </a>", " ",
|
||||
{{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
|
||||
check_parse_html("<a href= 'telegram.org' > </a>", " ",
|
||||
{{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
|
||||
check_parse_html("<a href= 'telegram.org?<' > </a>", " ",
|
||||
{{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/?<"}});
|
||||
check_parse_html("<a> </a>", " ", {});
|
||||
check_parse_html("<a>telegram.org </a>", "telegram.org ", {});
|
||||
check_parse_html("<a>telegram.org</a>", "telegram.org",
|
||||
{{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}});
|
||||
check_parse_html("<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>", "https://telegram.org/asdsa?asdasdwe#12e3we",
|
||||
{{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}});
|
||||
}
|
||||
|
Reference in New Issue
Block a user