From 9c67f426826c2e7bf34a0b1acbbf233b5f55cc1d Mon Sep 17 00:00:00 2001 From: levlam Date: Thu, 12 Mar 2020 06:22:14 +0300 Subject: [PATCH] Add td_api::getMarkdownText. GitOrigin-RevId: b463cc2c92052c552d66d774450ffa7bb4bc132e --- td/generate/scheme/td_api.tl | 3 + td/generate/scheme/td_api.tlo | Bin 167840 -> 167928 bytes td/telegram/MessageEntity.cpp | 195 ++++++++++++++++++++++++++++------ td/telegram/MessageEntity.h | 2 + td/telegram/Td.cpp | 27 ++++- td/telegram/Td.h | 3 + test/message_entities.cpp | 73 ++++++++++++- 7 files changed, 263 insertions(+), 40 deletions(-) diff --git a/td/generate/scheme/td_api.tl b/td/generate/scheme/td_api.tl index 6f36c92a..f9c4e7c9 100644 --- a/td/generate/scheme/td_api.tl +++ b/td/generate/scheme/td_api.tl @@ -3428,6 +3428,9 @@ parseTextEntities text:string parse_mode:TextParseMode = FormattedText; //@text The text to parse. For example, "__italic__ ~~strikethrough~~ **bold** `code` ```pre``` __[italic__ text_url](telegram.org) __italic**bold italic__bold**" parseMarkdown text:formattedText = FormattedText; +//@description Replaces text entities with Markdown formatting in a human-friendly format. Entities that can't be represented in Markdown unambiguously are kept as is. This is an offline method. Can be called before authorization. Can be called synchronously @text The text +getMarkdownText text:formattedText = FormattedText; + //@description Returns the MIME type of a file, guessed by its extension. Returns an empty string on failure. This is an offline method. Can be called before authorization. 
Can be called synchronously @file_name The name of the file or path to the file getFileMimeType file_name:string = Text; diff --git a/td/generate/scheme/td_api.tlo b/td/generate/scheme/td_api.tlo index a6fdd0ae14ad3b8703c3f29becbfc5a6dfd332b9..095ff3672fdf07f7ec38680a38a4b983f29caf55 100644 GIT binary patch delta 68 zcmV-K0K5O7p9=V&3V^f$_}c++xB1%vCrlV9a?S}4XJvFvVRCC^Z+C7~Wq5QIg$7eYIsl`}3!(r3 diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index 933458a2..b702492a 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -1204,6 +1204,12 @@ static constexpr int32 get_pre_entities_mask() { get_entity_type_mask(MessageEntity::Type::PreCode); } +static constexpr int32 get_user_entities_mask() { + return get_splittable_entities_mask() | get_blockquote_entities_mask() | + get_entity_type_mask(MessageEntity::Type::TextUrl) | get_entity_type_mask(MessageEntity::Type::MentionName) | + get_pre_entities_mask(); +} + static int32 is_splittable_entity(MessageEntity::Type type) { return (get_entity_type_mask(type) & get_splittable_entities_mask()) != 0; } @@ -1220,6 +1226,10 @@ static int32 is_pre_entity(MessageEntity::Type type) { return (get_entity_type_mask(type) & get_pre_entities_mask()) != 0; } +static int32 is_user_entity(MessageEntity::Type type) { + return (get_entity_type_mask(type) & get_user_entities_mask()) != 0; +} + static constexpr size_t SPLITTABLE_ENTITY_TYPE_COUNT = 4; static size_t get_splittable_entity_type_index(MessageEntity::Type type) { @@ -2118,6 +2128,13 @@ static vector find_splittable_entities_v3(Slice text, const vecto for (auto &entity : entities) { unallowed_boundaries.insert(entity.offset); unallowed_boundaries.insert(entity.offset + entity.length); + if (entity.type == MessageEntity::Type::Mention || entity.type == MessageEntity::Type::Hashtag || + entity.type == MessageEntity::Type::BotCommand || entity.type == MessageEntity::Type::Cashtag || + entity.type == MessageEntity::Type::PhoneNumber || 
entity.type == MessageEntity::Type::BankCardNumber) { + for (int32 i = 1; i < entity.length; i++) { + unallowed_boundaries.insert(entity.offset + i); + } + } } auto found_entities = find_entities(text, false, false); @@ -2440,6 +2457,128 @@ FormattedText parse_markdown_v3(FormattedText text) { return result; } +// text entities must be valid +FormattedText get_markdown_v3(FormattedText text) { + if (text.entities.empty()) { + return text; + } + + check_is_sorted(text.entities); + for (auto &entity : text.entities) { + if (!is_user_entity(entity.type)) { + return text; + } + } + + FormattedText result; + struct EntityInfo { + const MessageEntity *entity; + int32 utf16_added_before; + + EntityInfo(MessageEntity *entity, int32 utf16_added_before) + : entity(entity), utf16_added_before(utf16_added_before) { + } + }; + vector nested_entities_stack; + size_t current_entity = 0; + + int32 utf16_offset = 0; + int32 utf16_added = 0; + + for (size_t pos = 0; pos <= text.text.size(); pos++) { + auto c = static_cast(text.text[pos]); + if (is_utf8_character_first_code_unit(c)) { + while (!nested_entities_stack.empty()) { + const auto *entity = nested_entities_stack.back().entity; + auto entity_end = entity->offset + entity->length; + if (utf16_offset < entity_end) { + break; + } + + CHECK(utf16_offset == entity_end); + + switch (entity->type) { + case MessageEntity::Type::Italic: + result.text += "__"; + utf16_added += 2; + break; + case MessageEntity::Type::Bold: + result.text += "**"; + utf16_added += 2; + break; + case MessageEntity::Type::Strikethrough: + result.text += "~~"; + utf16_added += 2; + break; + case MessageEntity::Type::TextUrl: + result.text += "]("; + result.text += entity->argument; + result.text += ')'; + utf16_added += 3 + entity->argument.size(); + break; + case MessageEntity::Type::Code: + result.text += '`'; + utf16_added++; + break; + case MessageEntity::Type::Pre: + result.text += "```"; + utf16_added += 3; + break; + default: + 
result.entities.push_back(*entity); + result.entities.back().offset += nested_entities_stack.back().utf16_added_before; + result.entities.back().length += utf16_added - nested_entities_stack.back().utf16_added_before; + break; + } + nested_entities_stack.pop_back(); + } + + while (current_entity < text.entities.size() && utf16_offset >= text.entities[current_entity].offset) { + CHECK(utf16_offset == text.entities[current_entity].offset); + switch (text.entities[current_entity].type) { + case MessageEntity::Type::Italic: + result.text += "__"; + utf16_added += 2; + break; + case MessageEntity::Type::Bold: + result.text += "**"; + utf16_added += 2; + break; + case MessageEntity::Type::Strikethrough: + result.text += "~~"; + utf16_added += 2; + break; + case MessageEntity::Type::TextUrl: + result.text += '['; + utf16_added++; + break; + case MessageEntity::Type::Code: + result.text += '`'; + utf16_added++; + break; + case MessageEntity::Type::Pre: + result.text += "```"; + utf16_added += 3; + break; + } + nested_entities_stack.emplace_back(&text.entities[current_entity++], utf16_added); + } + utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair + } + if (pos == text.text.size()) { + break; + } + + result.text.push_back(text.text[pos]); + } + + sort_entities(result.entities); + if (parse_markdown_v3(result) != text) { + return text; + } + return result; +} + static uint32 decode_html_entity(CSlice text, size_t &pos) { auto c = static_cast(text[pos]); if (c != '&') { @@ -2730,16 +2869,10 @@ vector> get_input_message_entities(co const char *source) { vector> result; for (auto &entity : entities) { + if (!is_user_entity(entity.type)) { + continue; + } switch (entity.type) { - case MessageEntity::Type::Mention: - case MessageEntity::Type::Hashtag: - case MessageEntity::Type::BotCommand: - case MessageEntity::Type::Url: - case MessageEntity::Type::EmailAddress: - case MessageEntity::Type::Cashtag: - case MessageEntity::Type::PhoneNumber: - case 
MessageEntity::Type::BankCardNumber: - continue; case MessageEntity::Type::Bold: result.push_back(make_tl_object(entity.offset, entity.length)); break; @@ -2775,6 +2908,14 @@ vector> get_input_message_entities(co std::move(input_user))); break; } + case MessageEntity::Type::Mention: + case MessageEntity::Type::Hashtag: + case MessageEntity::Type::BotCommand: + case MessageEntity::Type::Url: + case MessageEntity::Type::EmailAddress: + case MessageEntity::Type::Cashtag: + case MessageEntity::Type::PhoneNumber: + case MessageEntity::Type::BankCardNumber: default: UNREACHABLE(); } @@ -2872,44 +3013,28 @@ Result> get_message_entities(const ContactsManager *contac switch (entity->type_->get_id()) { case td_api::textEntityTypeMention::ID: - if (allow_all) { - entities.emplace_back(MessageEntity::Type::Mention, entity->offset_, entity->length_); - } + entities.emplace_back(MessageEntity::Type::Mention, entity->offset_, entity->length_); break; case td_api::textEntityTypeHashtag::ID: - if (allow_all) { - entities.emplace_back(MessageEntity::Type::Hashtag, entity->offset_, entity->length_); - } + entities.emplace_back(MessageEntity::Type::Hashtag, entity->offset_, entity->length_); break; case td_api::textEntityTypeBotCommand::ID: - if (allow_all) { - entities.emplace_back(MessageEntity::Type::BotCommand, entity->offset_, entity->length_); - } + entities.emplace_back(MessageEntity::Type::BotCommand, entity->offset_, entity->length_); break; case td_api::textEntityTypeUrl::ID: - if (allow_all) { - entities.emplace_back(MessageEntity::Type::Url, entity->offset_, entity->length_); - } + entities.emplace_back(MessageEntity::Type::Url, entity->offset_, entity->length_); break; case td_api::textEntityTypeEmailAddress::ID: - if (allow_all) { - entities.emplace_back(MessageEntity::Type::EmailAddress, entity->offset_, entity->length_); - } + entities.emplace_back(MessageEntity::Type::EmailAddress, entity->offset_, entity->length_); break; case td_api::textEntityTypeCashtag::ID: - if 
(allow_all) { - entities.emplace_back(MessageEntity::Type::Cashtag, entity->offset_, entity->length_); - } + entities.emplace_back(MessageEntity::Type::Cashtag, entity->offset_, entity->length_); break; case td_api::textEntityTypePhoneNumber::ID: - if (allow_all) { - entities.emplace_back(MessageEntity::Type::PhoneNumber, entity->offset_, entity->length_); - } + entities.emplace_back(MessageEntity::Type::PhoneNumber, entity->offset_, entity->length_); break; case td_api::textEntityTypeBankCardNumber::ID: - if (allow_all) { - entities.emplace_back(MessageEntity::Type::BankCardNumber, entity->offset_, entity->length_); - } + entities.emplace_back(MessageEntity::Type::BankCardNumber, entity->offset_, entity->length_); break; case td_api::textEntityTypeBold::ID: entities.emplace_back(MessageEntity::Type::Bold, entity->offset_, entity->length_); @@ -2962,6 +3087,10 @@ Result> get_message_entities(const ContactsManager *contac default: UNREACHABLE(); } + CHECK(!entities.empty()); + if (!allow_all && !is_user_entity(entities.back().type)) { + entities.pop_back(); + } } return entities; } diff --git a/td/telegram/MessageEntity.h b/td/telegram/MessageEntity.h index 47ad7302..ea922950 100644 --- a/td/telegram/MessageEntity.h +++ b/td/telegram/MessageEntity.h @@ -151,6 +151,8 @@ Result> parse_markdown_v2(string &text); FormattedText parse_markdown_v3(FormattedText text); +FormattedText get_markdown_v3(FormattedText text); + Result> parse_html(string &text); vector> get_input_message_entities(const ContactsManager *contacts_manager, diff --git a/td/telegram/Td.cpp b/td/telegram/Td.cpp index d36a002c..641dac9c 100644 --- a/td/telegram/Td.cpp +++ b/td/telegram/Td.cpp @@ -3081,6 +3081,7 @@ bool Td::is_synchronous_request(int32 id) { case td_api::getTextEntities::ID: case td_api::parseTextEntities::ID: case td_api::parseMarkdown::ID: + case td_api::getMarkdownText::ID: case td_api::getFileMimeType::ID: case td_api::getFileExtension::ID: case td_api::cleanFileName::ID: @@ -3304,6 
+3305,7 @@ td_api::object_ptr Td::static_request(td_api::object_ptr Td::do_static_request(td_api::parseMarkdown & auto parsed_text = parse_markdown_v3({std::move(request.text_->text_), std::move(entities)}); fix_formatted_text(parsed_text.text, parsed_text.entities, true, true, true, true).ensure(); - return make_tl_object(std::move(parsed_text.text), - get_text_entities_object(parsed_text.entities)); + return get_formatted_text_object(parsed_text); +} + +td_api::object_ptr Td::do_static_request(td_api::getMarkdownText &request) { + if (request.text_ == nullptr) { + return make_error(400, "Text must be non-empty"); + } + + auto r_entities = get_message_entities(nullptr, std::move(request.text_->entities_)); + if (r_entities.is_error()) { + return make_error(400, r_entities.error().message()); + } + auto entities = r_entities.move_as_ok(); + auto status = fix_formatted_text(request.text_->text_, entities, true, true, true, true); + if (status.is_error()) { + return make_error(400, status.error().message()); + } + + return get_formatted_text_object(get_markdown_v3({std::move(request.text_->text_), std::move(entities)})); } td_api::object_ptr Td::do_static_request(const td_api::getFileMimeType &request) { diff --git a/td/telegram/Td.h b/td/telegram/Td.h index 8905eefb..ce2bcb29 100644 --- a/td/telegram/Td.h +++ b/td/telegram/Td.h @@ -1047,6 +1047,8 @@ class Td final : public NetQueryCallback { void on_request(uint64 id, const td_api::parseMarkdown &request); + void on_request(uint64 id, const td_api::getMarkdownText &request); + void on_request(uint64 id, const td_api::getFileMimeType &request); void on_request(uint64 id, const td_api::getFileExtension &request); @@ -1099,6 +1101,7 @@ class Td final : public NetQueryCallback { static td_api::object_ptr do_static_request(const td_api::getTextEntities &request); static td_api::object_ptr do_static_request(td_api::parseTextEntities &request); static td_api::object_ptr do_static_request(td_api::parseMarkdown &request); + 
static td_api::object_ptr do_static_request(td_api::getMarkdownText &request); static td_api::object_ptr do_static_request(const td_api::getFileMimeType &request); static td_api::object_ptr do_static_request(const td_api::getFileExtension &request); static td_api::object_ptr do_static_request(const td_api::cleanFileName &request); diff --git a/test/message_entities.cpp b/test/message_entities.cpp index 936b0704..aa72ae29 100644 --- a/test/message_entities.cpp +++ b/test/message_entities.cpp @@ -1256,19 +1256,24 @@ TEST(MessageEntities, parse_markdown) { check_parse_markdown("[telegram\\.org](tg:user?id=123456)", "telegram.org", {{0, 12, td::UserId(123456)}}); } -static void check_parse_markdown_v3(td::string text, td::vector entities, const td::string &result, - const td::vector &result_entities, bool fix = false) { +static void check_parse_markdown_v3(td::string text, td::vector entities, + const td::string &result_text, const td::vector &result_entities, + bool fix = false) { auto parsed_text = td::parse_markdown_v3({std::move(text), std::move(entities)}); if (fix) { ASSERT_TRUE(fix_formatted_text(parsed_text.text, parsed_text.entities, true, true, true, true).is_ok()); } - ASSERT_STREQ(result, parsed_text.text); + ASSERT_STREQ(result_text, parsed_text.text); ASSERT_EQ(result_entities, parsed_text.entities); + if (fix) { + auto markdown_text = td::get_markdown_v3(parsed_text); + ASSERT_TRUE(parsed_text == markdown_text || parsed_text == td::parse_markdown_v3(markdown_text)); + } } -static void check_parse_markdown_v3(td::string text, const td::string &result, +static void check_parse_markdown_v3(td::string text, const td::string &result_text, const td::vector &result_entities, bool fix = false) { - check_parse_markdown_v3(std::move(text), td::vector(), result, result_entities, fix); + check_parse_markdown_v3(std::move(text), td::vector(), result_text, result_entities, fix); } TEST(MessageEntities, parse_markdown_v3) { @@ -1293,6 +1298,9 @@ TEST(MessageEntities, 
parse_markdown_v3) { check_parse_markdown_v3("` `a", " a", {{td::MessageEntity::Type::Code, 0, 1}}, true); check_parse_markdown_v3("`\n`a", "\na", {}, true); check_parse_markdown_v3("``", "``", {}); + check_parse_markdown_v3("`a````b```", "`a````b```", {}); + check_parse_markdown_v3("ab", {{td::MessageEntity::Type::Code, 0, 1}, {td::MessageEntity::Type::Pre, 1, 1}}, "ab", + {{td::MessageEntity::Type::Code, 0, 1}, {td::MessageEntity::Type::Pre, 1, 1}}); check_parse_markdown_v3("[a](b[c](t.me)", "[a](b[c](t.me)", {}); check_parse_markdown_v3("[](t.me)", "[](t.me)", {}); @@ -1411,6 +1419,9 @@ TEST(MessageEntities, parse_markdown_v3) { {td::MessageEntity::Type::TextUrl, 3, 4, "http://t.me/"}, {td::MessageEntity::Type::Italic, 3, 2}}, true); + check_parse_markdown_v3("__a #test__test", "__a #test__test", {}); + check_parse_markdown_v3("a #testtest", {{td::MessageEntity::Type::Italic, 0, 7}}, "a #testtest", + {{td::MessageEntity::Type::Italic, 0, 7}}); // TODO parse_markdown_v3 is not idempotent now, which is bad check_parse_markdown_v3( @@ -1551,5 +1562,57 @@ TEST(MessageEntities, parse_markdown_v3) { text = std::move(parsed_text); } ASSERT_EQ(text, td::parse_markdown_v3(text)); + auto markdown_text = td::get_markdown_v3(text); + ASSERT_TRUE(text == markdown_text || text == td::parse_markdown_v3(markdown_text)); } } + +static void check_get_markdown_v3(td::string result_text, td::vector result_entities, + const td::string &text, const td::vector &entities) { + auto markdown_text = td::get_markdown_v3({std::move(text), std::move(entities)}); + ASSERT_STREQ(result_text, markdown_text.text); + ASSERT_EQ(result_entities, markdown_text.entities); +} + +TEST(MessageEntities, get_markdown_v3) { + check_get_markdown_v3("``` ```", {}, " ", {{td::MessageEntity::Type::Pre, 0, 1}}); + check_get_markdown_v3("` `", {}, " ", {{td::MessageEntity::Type::Code, 0, 1}}); + check_get_markdown_v3("`\n`", {}, "\n", {{td::MessageEntity::Type::Code, 0, 1}}); + check_get_markdown_v3("ab", 
{{td::MessageEntity::Type::Code, 0, 1}, {td::MessageEntity::Type::Pre, 1, 1}}, "ab", + {{td::MessageEntity::Type::Code, 0, 1}, {td::MessageEntity::Type::Pre, 1, 1}}); + + check_get_markdown_v3("[ ](http://t.me/)", {}, " ", {{td::MessageEntity::Type::TextUrl, 0, 1, "http://t.me/"}}); + check_get_markdown_v3("[ ]t.me[)](http://t.me/) [ ](t.me)", {{25, 1, td::UserId(1)}}, "[ ]t.me) [ ](t.me)", + {{td::MessageEntity::Type::TextUrl, 7, 1, "http://t.me/"}, {9, 1, td::UserId(1)}}); + + check_get_markdown_v3("__ __", {}, " ", {{td::MessageEntity::Type::Italic, 0, 1}}); + check_get_markdown_v3("** **", {}, " ", {{td::MessageEntity::Type::Bold, 0, 1}}); + check_get_markdown_v3("~~ ~~", {}, " ", {{td::MessageEntity::Type::Strikethrough, 0, 1}}); + check_get_markdown_v3("__a__ **b** ~~c~~ d", {{td::MessageEntity::Type::PreCode, 18, 1, "C++"}}, "a b c d", + {{td::MessageEntity::Type::Italic, 0, 1}, + {td::MessageEntity::Type::Bold, 2, 1}, + {td::MessageEntity::Type::Strikethrough, 4, 1}, + {td::MessageEntity::Type::PreCode, 6, 1, "C++"}}); + check_get_markdown_v3("`ab` ```cd``` ef", {{td::MessageEntity::Type::PreCode, 14, 2, "C++"}}, "ab cd ef", + {{td::MessageEntity::Type::Code, 0, 2}, + {td::MessageEntity::Type::Pre, 3, 2}, + {td::MessageEntity::Type::PreCode, 6, 2, "C++"}}); + check_get_markdown_v3("__asd__[__ab__cd](http://t.me/)", {}, "asdabcd", + {{td::MessageEntity::Type::Italic, 0, 3}, + {td::MessageEntity::Type::TextUrl, 3, 4, "http://t.me/"}, + {td::MessageEntity::Type::Italic, 3, 2}}); + + check_get_markdown_v3("__ab", {{td::MessageEntity::Type::Italic, 3, 1}}, "__ab", + {{td::MessageEntity::Type::Italic, 3, 1}}); + check_get_markdown_v3("__ab__**__cd__**~~**__ef__gh**ij~~", {}, "abcdefghij", + {{td::MessageEntity::Type::Italic, 0, 2}, + {td::MessageEntity::Type::Bold, 2, 2}, + {td::MessageEntity::Type::Italic, 2, 2}, + {td::MessageEntity::Type::Strikethrough, 4, 6}, + {td::MessageEntity::Type::Bold, 4, 4}, + {td::MessageEntity::Type::Italic, 4, 2}}); + 
check_get_markdown_v3("[**__bold italic link__**](http://example.com/)", {}, "bold italic link", + {{td::MessageEntity::Type::TextUrl, 0, 16, "http://example.com/"}, + {td::MessageEntity::Type::Bold, 0, 16}, + {td::MessageEntity::Type::Italic, 0, 16}}); +}