From fdf70df4924a188844c424498628509849da956e Mon Sep 17 00:00:00 2001 From: levlam Date: Thu, 3 Oct 2019 02:31:06 +0300 Subject: [PATCH] Add parse_markdown_v2. GitOrigin-RevId: ea2ce8bad64becc53d2e6466019469dffec2dc27 --- td/telegram/MessageEntity.cpp | 292 ++++++++++++++++++++++++++++------ td/telegram/MessageEntity.h | 4 + test/message_entities.cpp | 130 ++++++++++++++- 3 files changed, 378 insertions(+), 48 deletions(-) diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index 02140754..2d953032 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -28,71 +28,52 @@ int MessageEntity::get_type_priority(Type type) { return types[static_cast(type)]; } -StringBuilder &operator<<(StringBuilder &string_builder, const MessageEntity &message_entity) { - bool has_argument = false; - string_builder << '['; - switch (message_entity.type) { +StringBuilder &operator<<(StringBuilder &string_builder, const MessageEntity::Type &message_entity_type) { + switch (message_entity_type) { case MessageEntity::Type::Mention: - string_builder << "Mention"; - break; + return string_builder << "Mention"; case MessageEntity::Type::Hashtag: - string_builder << "Hashtag"; - break; + return string_builder << "Hashtag"; case MessageEntity::Type::BotCommand: - string_builder << "BotCommand"; - break; + return string_builder << "BotCommand"; case MessageEntity::Type::Url: - string_builder << "Url"; - break; + return string_builder << "Url"; case MessageEntity::Type::EmailAddress: - string_builder << "EmailAddress"; - break; + return string_builder << "EmailAddress"; case MessageEntity::Type::Bold: - string_builder << "Bold"; - break; + return string_builder << "Bold"; case MessageEntity::Type::Italic: - string_builder << "Italic"; - break; + return string_builder << "Italic"; case MessageEntity::Type::Underline: - string_builder << "Underline"; - break; + return string_builder << "Underline"; case MessageEntity::Type::Strikethrough: - string_builder << "Strikethrough"; - break; + return string_builder << "Strikethrough"; case MessageEntity::Type::BlockQuote: - string_builder << "BlockQuote"; - break; + return string_builder << "BlockQuote"; case MessageEntity::Type::Code: - string_builder << "Code"; - break; + return string_builder << "Code"; case MessageEntity::Type::Pre: - string_builder << "Pre"; - break; + return string_builder << "Pre"; case MessageEntity::Type::PreCode: - string_builder << "PreCode"; - has_argument = true; - break; + return string_builder << "PreCode"; case MessageEntity::Type::TextUrl: - string_builder << "TextUrl"; - has_argument = true; - break; + return string_builder << "TextUrl"; case MessageEntity::Type::MentionName: - string_builder << "MentionName"; - break; + return string_builder << "MentionName"; case MessageEntity::Type::Cashtag: - string_builder << "Cashtag"; - break; + return string_builder << "Cashtag"; case MessageEntity::Type::PhoneNumber: - string_builder << "PhoneNumber"; - break; + return string_builder << "PhoneNumber"; default: UNREACHABLE(); - string_builder << "Impossible"; - break; + return string_builder << "Impossible"; } +} - string_builder << ", offset = " << message_entity.offset << ", length = " << message_entity.length; - if (has_argument) { +StringBuilder &operator<<(StringBuilder &string_builder, const MessageEntity &message_entity) { + string_builder << '[' << message_entity.type << ", offset = " << message_entity.offset + << ", length = " << message_entity.length; + if (!message_entity.argument.empty()) { string_builder << ", argument = \"" << message_entity.argument << "\""; } if (message_entity.user_id.is_valid()) { @@ -1322,7 +1303,7 @@ Result> parse_markdown(string &text) { i += 2; is_pre = true; size_t language_end = i; - while (language_end < size && !is_space(text[language_end]) && text[language_end] != '`') { + while (!is_space(text[language_end]) && text[language_end] != '`') { language_end++; } if (i != language_end && language_end < size && text[language_end] != '`') { @@ -1405,7 +1386,224 @@ Result> parse_markdown(string &text) { return entities; } -static uint32 decode_html_entity(Slice text, size_t &pos) { +static Result> do_parse_markdown_v2(CSlice text, string &result) { + vector entities; + int32 utf16_offset = 0; + + struct EntityInfo { + MessageEntity::Type type; + string argument; + int32 entity_offset; + size_t entity_byte_offset; + size_t entity_begin_pos; + + EntityInfo(MessageEntity::Type type, string argument, int32 entity_offset, size_t entity_byte_offset, + size_t entity_begin_pos) + : type(type) + , argument(std::move(argument)) + , entity_offset(entity_offset) + , entity_byte_offset(entity_byte_offset) + , entity_begin_pos(entity_begin_pos) { + } + }; + std::vector nested_entities; + + for (size_t i = 0; i < text.size(); i++) { + auto c = static_cast(text[i]); + if (c == '\\' && text[i + 1] > 0 && text[i + 1] <= 126) { + i++; + utf16_offset += 1; + result += text[i]; + continue; + } + + Slice reserved_characters("_*[]()~`>#+=|{}.!"); + if (!nested_entities.empty()) { + switch (nested_entities.back().type) { + case MessageEntity::Type::Code: + case MessageEntity::Type::Pre: + case MessageEntity::Type::PreCode: + reserved_characters = Slice("`"); + break; + default: + break; + } + } + + if (reserved_characters.find(text[i]) == Slice::npos) { + if (is_utf8_character_first_code_unit(c)) { + utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogaite pair + } + result.push_back(text[i]); + continue; + } + + bool is_end_of_an_entity = false; + if (!nested_entities.empty()) { + is_end_of_an_entity = [&] { + switch (nested_entities.back().type) { + case MessageEntity::Type::Bold: + return c == '*'; + case MessageEntity::Type::Italic: + return c == '_' && text[i + 1] != '_'; + case MessageEntity::Type::Code: + return c == '`'; + case MessageEntity::Type::Pre: + case MessageEntity::Type::PreCode: + return c == '`' && text[i + 1] == '`' && text[i + 2] == '`'; + case MessageEntity::Type::TextUrl: + return c == ']'; + case MessageEntity::Type::Underline: + return c == '_' && text[i + 1] == '_'; + case MessageEntity::Type::Strikethrough: + return c == '~'; + default: + UNREACHABLE(); + return false; + } + }(); + } + + if (!is_end_of_an_entity) { + // begin of an entity + MessageEntity::Type type; + string argument; + int32 entity_byte_offset = i; + switch (c) { + case '_': + if (text[i + 1] == '_') { + type = MessageEntity::Type::Underline; + i++; + } else { + type = MessageEntity::Type::Italic; + } + break; + case '*': + type = MessageEntity::Type::Bold; + break; + case '~': + type = MessageEntity::Type::Strikethrough; + break; + case '[': + type = MessageEntity::Type::TextUrl; + break; + case '`': + if (text[i + 1] == '`' && text[i + 2] == '`') { + i += 3; + type = MessageEntity::Type::Pre; + size_t language_end = i; + while (!is_space(text[language_end]) && text[language_end] != '`') { + language_end++; + } + if (i != language_end && language_end < text.size() && text[language_end] != '`') { + type = MessageEntity::Type::PreCode; + argument = text.substr(i, language_end - i).str(); + i = language_end; + } + // skip one new line in the beginning of the text + if (text[i] == '\n' || text[i] == '\r') { + if ((text[i + 1] == '\n' || text[i + 1] == '\r') && text[i] != text[i + 1]) { + i += 2; + } else { + i++; + } + } + + i--; + } else { + type = MessageEntity::Type::Code; + } + break; + default: + return Status::Error( + 400, PSLICE() << "Character '" << text[i] << "' is reserved and must be escaped with the preceding '\\'"); + } + nested_entities.emplace_back(type, std::move(argument), utf16_offset, entity_byte_offset, result.size()); + } else { + // end of an entity + auto type = nested_entities.back().type; + auto argument = std::move(nested_entities.back().argument); + UserId user_id; + bool skip_entity = utf16_offset == nested_entities.back().entity_offset; + switch (type) { + case MessageEntity::Type::Bold: + case MessageEntity::Type::Italic: + case MessageEntity::Type::Code: + case MessageEntity::Type::Strikethrough: + break; + case MessageEntity::Type::Underline: + i++; + break; + case MessageEntity::Type::Pre: + case MessageEntity::Type::PreCode: + i += 2; + break; + case MessageEntity::Type::TextUrl: { + string url; + if (text[i + 1] != '(') { + // use text as a url + url = result.substr(nested_entities.back().entity_begin_pos); + } else { + i += 2; + auto url_begin_pos = i; + while (i < text.size() && text[i] != ')') { + if (text[i] == '\\' && text[i + 1] > 0 && text[i + 1] <= 126) { + url += text[i + 1]; + i += 2; + continue; + } + url += text[i++]; + } + if (text[i] != ')') { + return Status::Error(400, PSLICE() << "Can't find end of a URL at byte offset " << url_begin_pos); + } + } + user_id = get_link_user_id(url); + if (!user_id.is_valid()) { + auto r_url = check_url(url); + if (r_url.is_error()) { + skip_entity = true; + } else { + argument = r_url.move_as_ok(); + } + } + break; + } + default: + UNREACHABLE(); + return false; + } + + if (!skip_entity) { + auto entity_offset = nested_entities.back().entity_offset; + auto entity_length = utf16_offset - entity_offset; + if (user_id.is_valid()) { + entities.emplace_back(entity_offset, entity_length, user_id); + } else { + entities.emplace_back(type, entity_offset, entity_length, std::move(argument)); + } + } + nested_entities.pop_back(); + } + } + if (!nested_entities.empty()) { + return Status::Error(400, PSLICE() << "Can't find end of " << nested_entities.back().type + << " entity at byte offset " << nested_entities.back().entity_byte_offset); + } + + std::sort(entities.begin(), entities.end()); + + return entities; +} + +Result> parse_markdown_v2(string &text) { + string result; + TRY_RESULT(entities, do_parse_markdown_v2(text, result)); + text = result; + return entities; +} + +static uint32 decode_html_entity(CSlice text, size_t &pos) { auto c = static_cast(text[pos]); if (c != '&') { return 0; @@ -1458,7 +1656,7 @@ static uint32 decode_html_entity(Slice text, size_t &pos) { return res; } -static Result> do_parse_html(Slice text, string &result) { +static Result> do_parse_html(CSlice text, string &result) { vector entities; int32 utf16_offset = 0; diff --git a/td/telegram/MessageEntity.h b/td/telegram/MessageEntity.h index b9f20b6d..09a96901 100644 --- a/td/telegram/MessageEntity.h +++ b/td/telegram/MessageEntity.h @@ -97,6 +97,8 @@ class MessageEntity { static int get_type_priority(Type type); }; +StringBuilder &operator<<(StringBuilder &string_builder, const MessageEntity::Type &message_entity_type); + StringBuilder &operator<<(StringBuilder &string_builder, const MessageEntity &message_entity); struct FormattedText { @@ -140,6 +142,8 @@ string get_first_url(Slice text, const vector &entities); Result> parse_markdown(string &text); +Result> parse_markdown_v2(string &text); + Result> parse_html(string &text); vector> get_input_message_entities(const ContactsManager *contacts_manager, diff --git a/test/message_entities.cpp b/test/message_entities.cpp index 3481f397..e249f6c5 100644 --- a/test/message_entities.cpp +++ b/test/message_entities.cpp @@ -6,6 +6,7 @@ // #include "td/telegram/MessageEntity.h" +#include "td/utils/common.h" #include "td/utils/format.h" #include "td/utils/logging.h" #include "td/utils/tests.h" @@ -730,7 +731,7 @@ static void check_parse_html(td::string text, const td::string &result, const td ASSERT_STREQ(result, text); } -static void check_parse_html(td::string text, const td::string &error_message) { +static void check_parse_html(td::string text, td::Slice error_message) { auto r_entities = td::parse_html(text); ASSERT_TRUE(r_entities.is_error()); ASSERT_EQ(400, r_entities.error().code()); @@ -764,6 +765,8 @@ TEST(MessageEntities, parse_html) { check_parse_html("<>&"«»�", "<>&\"«»�", {}); check_parse_html("➡️ ➡️➡️ ➡️", "➡️ ➡️➡️ ➡️", {{td::MessageEntity::Type::Italic, 5, 5}}); + check_parse_html("➡️ ➡️➡️ ➡️➡️ ➡️", "➡️ ➡️➡️ ➡️➡️ ➡️", + {{td::MessageEntity::Type::Italic, 5, 5}, {td::MessageEntity::Type::Bold, 10, 5}}); check_parse_html("🏟 🏟🏟 <🏟", "🏟 🏟🏟 <🏟", {{td::MessageEntity::Type::Italic, 5, 6}}); check_parse_html("🏟 🏟🏟 ><🏟", "🏟 🏟🏟 ><🏟", {{td::MessageEntity::Type::Italic, 5, 7}, {td::MessageEntity::Type::Bold, 9, 3}}); @@ -777,6 +780,8 @@ TEST(MessageEntities, parse_html) { {{td::MessageEntity::Type::Italic, 6, 1}}); check_parse_html("🏟 🏟<a", "🏟 🏟🏟 🏟<", "🏟 🏟<🏟 🏟<", + {{td::MessageEntity::Type::Italic, 6, 6}}); check_parse_html("🏟 🏟<a", "🏟 🏟a", "🏟 🏟", "🏟 🏟<", {}); @@ -806,3 +811,126 @@ TEST(MessageEntities, parse_html) { check_parse_html("https://telegram.org/asdsa?asdasdwe#12e3we", "https://telegram.org/asdsa?asdasdwe#12e3we", {{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}}); } + +static void check_parse_markdown(td::string text, const td::string &result, + const td::vector &entities) { + auto r_entities = td::parse_markdown_v2(text); + ASSERT_TRUE(r_entities.is_ok()); + ASSERT_EQ(entities, r_entities.ok()); + ASSERT_STREQ(result, text); +} + +static void check_parse_markdown(td::string text, td::Slice error_message) { + auto r_entities = td::parse_markdown_v2(text); + ASSERT_TRUE(r_entities.is_error()); + ASSERT_EQ(400, r_entities.error().code()); + ASSERT_STREQ(error_message, r_entities.error().message()); +} + +TEST(MessageEntities, parse_markdown) { + td::Slice reserved_characters("]()>#+=|{}.!"); + td::Slice begin_characters("_*[~`"); + for (char c = 1; c < 126; c++) { + if (begin_characters.find(c) != td::Slice::npos) { + continue; + } + + td::string text(1, c); + if (reserved_characters.find(c) == td::Slice::npos) { + check_parse_markdown(text, text, {}); + } else { + check_parse_markdown( + text, PSLICE() << "Character '" << c << "' is reserved and must be escaped with the preceding '\\'"); + + td::string escaped_text = "\\" + text; + check_parse_markdown(escaped_text, text, {}); + } + } + + check_parse_markdown("🏟 🏟_abacaba", "Can't find end of Italic entity at byte offset 9"); + check_parse_markdown("🏟 🏟_abac * asd ", "Can't find end of Bold entity at byte offset 15"); + check_parse_markdown("🏟 🏟_abac * asd _", "Can't find end of Italic entity at byte offset 21"); + check_parse_markdown("🏟 🏟`", "Can't find end of Code entity at byte offset 9"); + check_parse_markdown("🏟 🏟```", "Can't find end of Pre entity at byte offset 9"); + check_parse_markdown("🏟 🏟```a", "Can't find end of Pre entity at byte offset 9"); + check_parse_markdown("🏟 🏟```a ", "Can't find end of PreCode entity at byte offset 9"); + check_parse_markdown("🏟 🏟__🏟 🏟_", "Can't find end of Italic entity at byte offset 20"); + check_parse_markdown("🏟 🏟_🏟 🏟__", "Can't find end of Underline entity at byte offset 19"); + check_parse_markdown("🏟 🏟```🏟 🏟`", "Can't find end of Code entity at byte offset 21"); + check_parse_markdown("🏟 🏟```🏟 🏟_", "Can't find end of PreCode entity at byte offset 9"); + check_parse_markdown("🏟 🏟```🏟 🏟\\`", "Can't find end of PreCode entity at byte offset 9"); + check_parse_markdown("[telegram\\.org](asd\\)", "Can't find end of a URL at byte offset 16"); + check_parse_markdown("[telegram\\.org](", "Can't find end of a URL at byte offset 16"); + check_parse_markdown("[telegram\\.org](asd", "Can't find end of a URL at byte offset 16"); + check_parse_markdown("🏟 🏟__🏟 _🏟___", "Can't find end of Italic entity at byte offset 23"); + check_parse_markdown("🏟 🏟__", "Can't find end of Underline entity at byte offset 9"); + + check_parse_markdown("", "", {}); + check_parse_markdown("\\\\", "\\", {}); + check_parse_markdown("\\\\\\", "\\\\", {}); + check_parse_markdown("\\\\\\\\\\_\\*\\`", "\\\\_*`", {}); + check_parse_markdown("➡️ ➡️", "➡️ ➡️", {}); + check_parse_markdown("🏟 🏟``", "🏟 🏟", {}); + check_parse_markdown("🏟 🏟_abac \\* asd _", "🏟 🏟abac * asd ", {{td::MessageEntity::Type::Italic, 5, 11}}); + check_parse_markdown("🏟 \\.🏟_🏟\\. 🏟_", "🏟 .🏟🏟. 🏟", {{td::MessageEntity::Type::Italic, 6, 6}}); + check_parse_markdown("\\\\\\a\\b\\c\\d\\e\\f\\1\\2\\3\\4\\➡️\\", "\\abcdef1234\\➡️\\", {}); + check_parse_markdown("➡️ ➡️_➡️ ➡️_", "➡️ ➡️➡️ ➡️", + {{td::MessageEntity::Type::Italic, 5, 5}}); + check_parse_markdown("➡️ ➡️_➡️ ➡️_*➡️ ➡️*", "➡️ ➡️➡️ ➡️➡️ ➡️", + {{td::MessageEntity::Type::Italic, 5, 5}, {td::MessageEntity::Type::Bold, 10, 5}}); + check_parse_markdown("🏟 🏟_🏟 \\.🏟_", "🏟 🏟🏟 .🏟", {{td::MessageEntity::Type::Italic, 5, 6}}); + check_parse_markdown("🏟 🏟_🏟 *🏟*_", "🏟 🏟🏟 🏟", + {{td::MessageEntity::Type::Italic, 5, 5}, {td::MessageEntity::Type::Bold, 8, 2}}); + check_parse_markdown("🏟 🏟_🏟 __🏟___", "🏟 🏟🏟 🏟", + {{td::MessageEntity::Type::Italic, 5, 5}, {td::MessageEntity::Type::Underline, 8, 2}}); + check_parse_markdown("🏟 🏟__🏟 _🏟_ __", "🏟 🏟🏟 🏟 ", + {{td::MessageEntity::Type::Underline, 5, 6}, {td::MessageEntity::Type::Italic, 8, 2}}); + check_parse_markdown("🏟 🏟__🏟 _🏟_\\___", "🏟 🏟🏟 🏟_", + {{td::MessageEntity::Type::Underline, 5, 6}, {td::MessageEntity::Type::Italic, 8, 2}}); + check_parse_markdown("🏟 🏟`🏟 🏟```", "🏟 🏟🏟 🏟", {{td::MessageEntity::Type::Code, 5, 5}}); + check_parse_markdown("🏟 🏟```🏟 🏟```", "🏟 🏟 🏟", + {{td::MessageEntity::Type::PreCode, 5, 3, "🏟"}}); + check_parse_markdown("🏟 🏟```🏟\n🏟```", "🏟 🏟🏟", + {{td::MessageEntity::Type::PreCode, 5, 2, "🏟"}}); + check_parse_markdown("🏟 🏟```🏟\r🏟```", "🏟 🏟🏟", + {{td::MessageEntity::Type::PreCode, 5, 2, "🏟"}}); + check_parse_markdown("🏟 🏟```🏟\n\r🏟```", "🏟 🏟🏟", + {{td::MessageEntity::Type::PreCode, 5, 2, "🏟"}}); + check_parse_markdown("🏟 🏟```🏟\r\n🏟```", "🏟 🏟🏟", + {{td::MessageEntity::Type::PreCode, 5, 2, "🏟"}}); + check_parse_markdown("🏟 🏟```🏟\n\n🏟```", "🏟 🏟\n🏟", + {{td::MessageEntity::Type::PreCode, 5, 3, "🏟"}}); + check_parse_markdown("🏟 🏟```🏟\r\r🏟```", "🏟 🏟\r🏟", + {{td::MessageEntity::Type::PreCode, 5, 3, "🏟"}}); + check_parse_markdown("🏟 🏟```🏟 \\\\\\`🏟```", "🏟 🏟 \\`🏟", + {{td::MessageEntity::Type::PreCode, 5, 5, "🏟"}}); + check_parse_markdown("🏟 🏟**", "🏟 🏟", {}); + check_parse_markdown("🏟 🏟``", "🏟 🏟", {}); + check_parse_markdown("🏟 🏟``````", "🏟 🏟", {}); + check_parse_markdown("🏟 🏟____", "🏟 🏟", {}); + check_parse_markdown("`_* *_`__*` `*__", "_* *_ ", + {{td::MessageEntity::Type::Code, 0, 5}, + {td::MessageEntity::Type::Code, 5, 1}, + {td::MessageEntity::Type::Bold, 5, 1}, + {td::MessageEntity::Type::Underline, 5, 1}}); + check_parse_markdown("_* * ` `_", " ", + {{td::MessageEntity::Type::Italic, 0, 3}, + {td::MessageEntity::Type::Bold, 0, 1}, + {td::MessageEntity::Type::Code, 2, 1}}); + check_parse_markdown("[](telegram.org)", "", {}); + check_parse_markdown("[ ](telegram.org)", " ", {{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}}); + check_parse_markdown("[ ](as)", " ", {}); + check_parse_markdown("[telegram\\.org]", "telegram.org", + {{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}}); + check_parse_markdown("[telegram\\.org]a", "telegram.orga", + {{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}}); + check_parse_markdown("[telegram\\.org](telegram.dog)", "telegram.org", + {{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.dog/"}}); + check_parse_markdown("[telegram\\.org](https://telegram.dog?)", "telegram.org", + {{td::MessageEntity::Type::TextUrl, 0, 12, "https://telegram.dog/?"}}); + check_parse_markdown("[telegram\\.org](https://telegram.dog?\\\\\\()", "telegram.org", + {{td::MessageEntity::Type::TextUrl, 0, 12, "https://telegram.dog/?\\("}}); + check_parse_markdown("[telegram\\.org]()", "telegram.org", {}); + check_parse_markdown("[telegram\\.org](asdasd)", "telegram.org", {}); + check_parse_markdown("[telegram\\.org](tg:user?id=123456)", "telegram.org", {{0, 12, td::UserId(123456)}}); +}