From 8e8a7e0f1886275a2ac5f2b315fec048ec608e7f Mon Sep 17 00:00:00 2001 From: levlam Date: Mon, 6 Nov 2023 00:45:24 +0300 Subject: [PATCH] Support language code specification for PreCode entities in parseMarkdown. --- td/telegram/MessageEntity.cpp | 35 ++++++++++++++++++++++++++++------- test/message_entities.cpp | 13 +++++++++++++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index 99bd4f9e1..0e4e84a1a 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -2711,11 +2711,35 @@ static FormattedText parse_pre_entities_v3(Slice text) { if (end_tag_end - end_tag_begin == j - i) { // end tag found CHECK(entity_length > 0); - entities.emplace_back(j - i == 3 ? MessageEntity::Type::Pre : MessageEntity::Type::Code, utf16_offset, - entity_length); - result.append(text.begin() + j, end_tag_begin - j); + auto entity_begin = j; + string language_code; + if (j - i == 3) { + size_t language_code_end = j; + while (language_code_end < end_tag_begin - 1 && 33 <= text[language_code_end] && + text[language_code_end] <= 126) { + language_code_end++; + } + if (language_code_end < end_tag_begin - 1 && text[language_code_end] == '\n' && + (language_code_end != entity_begin || i == 0 || text[i - 1] == '\n')) { + language_code = text.substr(entity_begin, language_code_end - entity_begin).str(); + entity_begin = language_code_end + 1; + entity_length -= entity_begin - j; + CHECK(entity_length > 0); + } + } + if (!language_code.empty()) { + entities.emplace_back(MessageEntity::Type::PreCode, utf16_offset, entity_length, + std::move(language_code)); + } else { + entities.emplace_back(j - i == 3 ? MessageEntity::Type::Pre : MessageEntity::Type::Code, utf16_offset, + entity_length); + } + result.append(text.begin() + entity_begin, end_tag_begin - entity_begin); utf16_offset += entity_length; i = end_tag_end - 1; + if (end_tag_end < size && text[end_tag_end] == '\n') { + i++; + } is_found = true; break; } else { @@ -2774,10 +2798,7 @@ static FormattedText parse_pre_entities_v3(Slice text, vector ent result_text_utf16_length += part_end - max_end; } else { FormattedText parsed_text = parse_pre_entities_v3(parsed_part_text); - int32 new_skipped_length = 0; - for (auto &entity : parsed_text.entities) { - new_skipped_length += (entity.type == MessageEntity::Type::Pre ? 6 : 2); - } + int32 new_skipped_length = parsed_part_text.size() - parsed_text.text.size(); CHECK(new_skipped_length < part_end - max_end); result.text += parsed_text.text; for (auto &entity : parsed_text.entities) { diff --git a/test/message_entities.cpp b/test/message_entities.cpp index 0cc7bd8f9..b8267e111 100644 --- a/test/message_entities.cpp +++ b/test/message_entities.cpp @@ -1835,6 +1835,19 @@ TEST(MessageEntities, parse_markdown_v3) { {td::MessageEntity::Type::Italic, 123, 17}, {td::MessageEntity::Type::Bold, 129, 15}, {td::MessageEntity::Type::Spoiler, 145, 7}}); + check_parse_markdown_v3("```\nsome code\n```", "some code\n", {{td::MessageEntity::Type::Pre, 0, 10}}); + check_parse_markdown_v3("asd\n```\nsome code\n```cabab", "asd\nsome code\ncabab", + {{td::MessageEntity::Type::Pre, 4, 10}}); + check_parse_markdown_v3("asd\naba```\nsome code\n```cabab", "asd\naba\nsome code\ncabab", + {{td::MessageEntity::Type::Pre, 7, 11}}); + check_parse_markdown_v3("asd\naba```\nsome code\n```\ncabab", "asd\naba\nsome code\ncabab", + {{td::MessageEntity::Type::Pre, 7, 11}}); + check_parse_markdown_v3("asd\naba```a b\nsome code\n```\ncabab", "asd\nabaa b\nsome code\ncabab", + {{td::MessageEntity::Type::Pre, 7, 14}}); + check_parse_markdown_v3("asd\naba```a!@#$%^&*(b\nsome code\n```\ncabab", "asd\nabasome code\ncabab", + {{td::MessageEntity::Type::PreCode, 7, 10, "a!@#$%^&*(b"}}); + check_parse_markdown_v3("```aba\n```", "aba\n", {{td::MessageEntity::Type::Pre, 0, 4}}); + check_parse_markdown_v3("```\n```", "\n", {{td::MessageEntity::Type::Pre, 0, 1}}); td::vector parts{"a", " #test__a", "__", "**", "~~", "||", "[", "](t.me)", "`"}; td::vector types{