diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index 248eccb6..02140754 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -24,7 +24,7 @@ namespace td { int MessageEntity::get_type_priority(Type type) { - static const int types[] = {5, 5, 5, 5, 5, 9, 9, 2, 1, 1, 5, 5, 5, 5, 9, 9, 0}; + static const int types[] = {50, 50, 50, 50, 50, 90, 91, 20, 11, 10, 49, 49, 50, 50, 92, 93, 0}; return types[static_cast(type)]; } @@ -1461,6 +1461,22 @@ static uint32 decode_html_entity(Slice text, size_t &pos) { static Result> do_parse_html(Slice text, string &result) { vector entities; int32 utf16_offset = 0; + + struct EntityInfo { + string tag_name; + string url; + int32 entity_offset; + size_t entity_begin_pos; + + EntityInfo(string tag_name, string url, int32 entity_offset, size_t entity_begin_pos) + : tag_name(std::move(tag_name)) + , url(std::move(url)) + , entity_offset(entity_offset) + , entity_begin_pos(entity_begin_pos) { + } + }; + std::vector nested_entities; + for (size_t i = 0; i < text.size(); i++) { auto c = static_cast(text[i]); if (c == '&') { @@ -1480,164 +1496,158 @@ static Result> do_parse_html(Slice text, string &result) { continue; } - // we are at begin of the entity - size_t begin_pos = i++; - if (text[i] == '/') { - return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos); - } - while (!is_space(text[i]) && text[i] != '>') { - i++; - } - if (text[i] == 0) { - return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos); - } - - string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1)); - if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" && - tag_name != "pre" && tag_name != "code") { - return Status::Error(400, - PSLICE() << "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos); - } - - string url; - // string language; TODO PreCode support - while (text[i] != '>') { - while (text[i] != 0 && is_space(text[i])) { - i++; - } - if (text[i] == '>') { - break; - } - auto attribute_begin_pos = i; - while (!is_space(text[i]) && text[i] != '=') { - i++; - } - Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos); - if (attribute_name.empty()) { - return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \"" - << tag_name << "\" at byte offset " << begin_pos); - } - while (text[i] != 0 && is_space(text[i])) { - i++; - } - if (text[i] != '=') { - return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \"" - << tag_name << "\" at byte offset " << begin_pos); - } - i++; - while (text[i] != 0 && is_space(text[i])) { + auto begin_pos = i++; + if (text[i] != '/') { + // begin of an entity + while (!is_space(text[i]) && text[i] != '>') { i++; } if (text[i] == 0) { return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos); } - string attribute_value; - if (text[i] != '\'' && text[i] != '"') { - // A name token (a sequence of letters, digits, periods, or hyphens). Name tokens are not case sensitive. - auto token_begin_pos = i; - while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') { + string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1)); + if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" && + tag_name != "pre" && tag_name != "code") { + return Status::Error(400, PSLICE() + << "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos); + } + + string url; + // string language; TODO PreCode support + while (text[i] != '>') { + while (text[i] != 0 && is_space(text[i])) { i++; } - attribute_value = to_lower(text.substr(token_begin_pos, i - token_begin_pos)); - - if (!is_space(text[i]) && text[i] != '>') { - return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos); + if (text[i] == '>') { + break; } - } else { - // A string literal - char end_character = text[i++]; - while (text[i] != end_character && text[i] != 0) { - if (text[i] == '&') { - auto ch = decode_html_entity(text, i); - if (ch != 0) { - append_utf8_character(attribute_value, ch); - continue; + auto attribute_begin_pos = i; + while (!is_space(text[i]) && text[i] != '=') { + i++; + } + Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos); + if (attribute_name.empty()) { + return Status::Error( + 400, PSLICE() << "Empty attribute name in the tag \"" << tag_name << "\" at byte offset " << begin_pos); + } + while (text[i] != 0 && is_space(text[i])) { + i++; + } + if (text[i] != '=') { + return Status::Error(400, PSLICE() << "Expected equal sign in declaration of an attribute of the tag \"" + << tag_name << "\" at byte offset " << begin_pos); + } + i++; + while (text[i] != 0 && is_space(text[i])) { + i++; + } + if (text[i] == 0) { + return Status::Error(400, PSLICE() + << "Unclosed start tag \"" << tag_name << "\" at byte offset " << begin_pos); + } + + string attribute_value; + if (text[i] != '\'' && text[i] != '"') { + // A name token (a sequence of letters, digits, periods, or hyphens). Name tokens are not case sensitive. + auto token_begin_pos = i; + while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') { + i++; + } + attribute_value = to_lower(text.substr(token_begin_pos, i - token_begin_pos)); + + if (!is_space(text[i]) && text[i] != '>') { + return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos); + } + } else { + // A string literal + char end_character = text[i++]; + while (text[i] != end_character && text[i] != 0) { + if (text[i] == '&') { + auto ch = decode_html_entity(text, i); + if (ch != 0) { + append_utf8_character(attribute_value, ch); + continue; + } + } + attribute_value.push_back(text[i++]); + } + if (text[i] == end_character) { + i++; + } + } + if (text[i] == 0) { + return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos); + } + + if (tag_name == "a" && attribute_name == Slice("href")) { + url = std::move(attribute_value); + } + } + + nested_entities.emplace_back(std::move(tag_name), std::move(url), utf16_offset, result.size()); + } else { + // end of an entity + if (nested_entities.empty()) { + return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos); + } + + while (!is_space(text[i]) && text[i] != '>') { + i++; + } + Slice end_tag_name = text.substr(begin_pos + 2, i - begin_pos - 2); + while (is_space(text[i]) && text[i] != 0) { + i++; + } + if (text[i] != '>') { + return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << begin_pos); + } + + string tag_name = std::move(nested_entities.back().tag_name); + if (!end_tag_name.empty() && end_tag_name != tag_name) { + return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << begin_pos << ", expected \"\", found \"\""); + } + + if (utf16_offset > nested_entities.back().entity_offset) { + auto entity_offset = nested_entities.back().entity_offset; + auto entity_length = utf16_offset - entity_offset; + if (tag_name == "i" || tag_name == "em") { + entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length); + } else if (tag_name == "b" || tag_name == "strong") { + entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length); + } else if (tag_name == "a") { + auto url = std::move(nested_entities.back().url); + if (url.empty()) { + url = result.substr(nested_entities.back().entity_begin_pos); + } + auto user_id = get_link_user_id(url); + if (user_id.is_valid()) { + entities.emplace_back(entity_offset, entity_length, user_id); + } else { + auto r_url = check_url(url); + if (r_url.is_ok()) { + entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok()); } } - attribute_value.push_back(text[i++]); - } - if (text[i] == end_character) { - i++; - } - } - if (text[i] == 0) { - return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos); - } - - if (tag_name == "a" && attribute_name == Slice("href")) { - url = std::move(attribute_value); - } - } - i++; - - int32 entity_offset = utf16_offset; - size_t entity_begin_pos = result.size(); - while (text[i] != 0 && text[i] != '<') { - auto cur_ch = static_cast(text[i]); - if (cur_ch == '&') { - auto ch = decode_html_entity(text, i); - if (ch != 0) { - utf16_offset += 1 + (ch > 0xffff); - append_utf8_character(result, ch); - continue; - } - } - if (is_utf8_character_first_code_unit(cur_ch)) { - utf16_offset += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogaite pair - } - result.push_back(text[i++]); - } - if (text[i] == 0) { - return Status::Error(400, - PSLICE() << "Can't find end tag corresponding to start tag at byte offset " << begin_pos); - } - - auto end_tag_begin_pos = i++; - if (text[i] != '/') { - return Status::Error(400, PSLICE() << "Expected end tag at byte offset " << end_tag_begin_pos); - } - while (!is_space(text[i]) && text[i] != '>') { - i++; - } - Slice end_tag_name = text.substr(end_tag_begin_pos + 2, i - end_tag_begin_pos - 2); - while (is_space(text[i]) && text[i] != 0) { - i++; - } - if (text[i] != '>') { - return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << end_tag_begin_pos); - } - if (!end_tag_name.empty() && end_tag_name != tag_name) { - return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << end_tag_begin_pos - << ", expected \"\", found\"\""); - } - - if (utf16_offset > entity_offset) { - auto entity_length = utf16_offset - entity_offset; - if (tag_name == "i" || tag_name == "em") { - entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length); - } else if (tag_name == "b" || tag_name == "strong") { - entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length); - } else if (tag_name == "a") { - if (url.empty()) { - url = result.substr(entity_begin_pos); - } - auto user_id = get_link_user_id(url); - if (user_id.is_valid()) { - entities.emplace_back(entity_offset, entity_length, user_id); + } else if (tag_name == "pre") { + entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length); + } else if (tag_name == "code") { + entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length); } else { - auto r_url = check_url(url); - if (r_url.is_ok()) { - entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok()); - } + UNREACHABLE(); } - } else if (tag_name == "pre") { - entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length); - } else if (tag_name == "code") { - entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length); } + nested_entities.pop_back(); } } + if (!nested_entities.empty()) { + return Status::Error( + 400, PSLICE() << "Can't find end tag corresponding to start tag " << nested_entities.back().tag_name); + } + + std::sort(entities.begin(), entities.end()); + return entities; } diff --git a/test/message_entities.cpp b/test/message_entities.cpp index 8e376d21..3481f397 100644 --- a/test/message_entities.cpp +++ b/test/message_entities.cpp @@ -15,7 +15,7 @@ REGISTER_TESTS(message_entities); -static void check_mention(td::string str, td::vector expected) { +static void check_mention(const td::string &str, const td::vector &expected) { auto result_slice = td::find_mentions(str); td::vector result; for (auto &it : result_slice) { @@ -44,7 +44,7 @@ TEST(MessageEntities, mention) { {"@gif", "@wiki", "@vid", "@bing", "@pic", "@bold", "@imdb", "@coub", "@like", "@vote", "@bingg"}); }; -static void check_bot_command(td::string str, td::vector expected) { +static void check_bot_command(const td::string &str, const td::vector &expected) { auto result_slice = td::find_bot_commands(str); td::vector result; for (auto &it : result_slice) { @@ -68,7 +68,7 @@ TEST(MessageEntities, bot_command) { check_bot_command("/test/", {}); } -static void check_hashtag(td::string str, td::vector expected) { +static void check_hashtag(const td::string &str, const td::vector &expected) { auto result_slice = td::find_hashtags(str); td::vector result; for (auto &it : result_slice) { @@ -109,7 +109,7 @@ TEST(MessageEntities, hashtag) { check_hashtag(u8"#a\u2122", {"#a"}); } -static void check_cashtag(td::string str, td::vector expected) { +static void check_cashtag(const td::string &str, const td::vector &expected) { auto result_slice = td::find_cashtags(str); td::vector result; for (auto &it : result_slice) { @@ -161,7 +161,7 @@ TEST(MessageEntities, cashtag) { check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"}); } -static void check_is_email_address(td::string str, bool expected) { +static void check_is_email_address(const td::string &str, bool expected) { bool result = td::is_email_address(str); LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")"; } @@ -279,7 +279,7 @@ TEST(MessageEntities, is_email_address) { } } -static void check_url(td::string str, td::vector expected_urls, +static void check_url(const td::string &str, const td::vector &expected_urls, td::vector expected_email_addresses = {}) { auto result_slice = td::find_urls(str); td::vector result_urls; @@ -530,8 +530,9 @@ TEST(MessageEntities, url) { check_url("...👉http://ab.com/cdefgh-1IJ", {}); // TODO } -static void check_fix_formatted_text(td::string str, td::vector entities, td::string expected_str, - td::vector expected_entities, bool allow_empty, +static void check_fix_formatted_text(td::string str, td::vector entities, + const td::string &expected_str, + const td::vector &expected_entities, bool allow_empty, bool skip_new_entities, bool skip_bot_commands, bool for_draft) { ASSERT_TRUE( td::fix_formatted_text(str, entities, allow_empty, skip_new_entities, skip_bot_commands, for_draft).is_ok()); @@ -721,3 +722,87 @@ TEST(MessageEntities, fix_formatted_text) { false); } } + +static void check_parse_html(td::string text, const td::string &result, const td::vector &entities) { + auto r_entities = td::parse_html(text); + ASSERT_TRUE(r_entities.is_ok()); + ASSERT_EQ(entities, r_entities.ok()); + ASSERT_STREQ(result, text); +} + +static void check_parse_html(td::string text, const td::string &error_message) { + auto r_entities = td::parse_html(text); + ASSERT_TRUE(r_entities.is_error()); + ASSERT_EQ(400, r_entities.error().code()); + ASSERT_STREQ(error_message, r_entities.error().message()); +} + +TEST(MessageEntities, parse_html) { + td::string invalid_surrogate_pair_error_message = + "Text contains invalid Unicode characters after decoding HTML entities, check for unmatched surrogate code units"; + check_parse_html("�", invalid_surrogate_pair_error_message); + check_parse_html("�", invalid_surrogate_pair_error_message); + check_parse_html("�", invalid_surrogate_pair_error_message); + check_parse_html("🏟 🏟<", "Unsupported start tag \"abac\" at byte offset 13"); + check_parse_html("🏟 🏟<", "Unsupported start tag \"abac\" at byte offset 13"); + check_parse_html("🏟 🏟<", "Empty attribute name in the tag \"i\" at byte offset 13"); + check_parse_html("🏟 🏟<", + "Expected equal sign in declaration of an attribute of the tag \"i\" at byte offset 13"); + check_parse_html("🏟 🏟<", "Unclosed start tag at byte offset 13"); + check_parse_html("🏟 🏟<", "Unclosed start tag at byte offset 13"); + check_parse_html("🏟 🏟<aa", + "Unmatched end tag at byte offset 17, expected \"\", found \"\""); + + check_parse_html("", "", {}); + check_parse_html("➡️ ➡️", "➡️ ➡️", {}); + check_parse_html("<>&"«»�", "<>&\"«»�", {}); + check_parse_html("➡️ ➡️➡️ ➡️", "➡️ ➡️➡️ ➡️", + {{td::MessageEntity::Type::Italic, 5, 5}}); + check_parse_html("🏟 🏟🏟 <🏟", "🏟 🏟🏟 <🏟", {{td::MessageEntity::Type::Italic, 5, 6}}); + check_parse_html("🏟 🏟🏟 ><🏟", "🏟 🏟🏟 ><🏟", + {{td::MessageEntity::Type::Italic, 5, 7}, {td::MessageEntity::Type::Bold, 9, 3}}); + check_parse_html("🏟 🏟<a", "🏟 🏟a", "🏟 🏟a", "🏟 🏟a", "🏟 🏟a", "🏟 🏟a", "🏟 🏟a", "🏟 🏟", "🏟 🏟<", {}); + check_parse_html(" ", " ", + {{td::MessageEntity::Type::Code, 0, 1}, + {td::MessageEntity::Type::Bold, 0, 1}, + {td::MessageEntity::Type::Italic, 0, 1}, + {td::MessageEntity::Type::Code, 1, 1}, + {td::MessageEntity::Type::Bold, 1, 1}, + {td::MessageEntity::Type::Italic, 1, 1}}); + check_parse_html(" ", " ", + {{td::MessageEntity::Type::Italic, 0, 3}, + {td::MessageEntity::Type::Bold, 0, 1}, + {td::MessageEntity::Type::Code, 2, 1}}); + check_parse_html(" ", " ", + {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}}); + check_parse_html(" ", " ", + {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}}); + check_parse_html(" ", " ", + {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}}); + check_parse_html(" ", " ", + {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/?<"}}); + check_parse_html(" ", " ", {}); + check_parse_html("telegram.org ", "telegram.org ", {}); + check_parse_html("telegram.org", "telegram.org", + {{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}}); + check_parse_html("https://telegram.org/asdsa?asdasdwe#12e3we", "https://telegram.org/asdsa?asdasdwe#12e3we", + {{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}}); +}