From 4c052a5efa682ce29d15de4427fffc0604e9c755 Mon Sep 17 00:00:00 2001 From: levlam Date: Mon, 4 Dec 2023 18:33:40 +0300 Subject: [PATCH] Don't remove whitespace-only entities. --- td/telegram/MessageEntity.cpp | 60 ++---------------------------- test/message_entities.cpp | 70 ++++++++++++++--------------------- 2 files changed, 30 insertions(+), 100 deletions(-) diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index 5f6c9c7d2..2db4f3ba5 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -4107,11 +4107,10 @@ static Result clean_input_string_with_entities(const string &text, vecto } replace_offending_characters(result); - return result; } -// removes entities containing whitespaces only +// removes empty entities // entities must be sorted by offset and length, but not necessary by type // returns {last_non_whitespace_pos, last_non_whitespace_utf16_offset} static std::pair remove_invalid_entities(const string &text, vector &entities) { @@ -4129,8 +4128,6 @@ static std::pair remove_invalid_entities(const string &text, vect } // check_is_sorted(entities); - vector nested_entities_stack; - size_t current_entity = 0; size_t last_non_whitespace_pos = text.size(); @@ -4139,49 +4136,7 @@ static std::pair remove_invalid_entities(const string &text, vect remove_empty_entities(entities); - for (size_t pos = 0; pos <= text.size(); pos++) { - while (!nested_entities_stack.empty()) { - auto *entity = nested_entities_stack.back(); - auto entity_end = entity->offset + entity->length; - if (utf16_offset < entity_end) { - break; - } - - if (last_non_whitespace_utf16_offset >= entity->offset || is_hidden_data_entity(entity->type)) { - // keep entity - // TODO check entity for validness, for example, that mentions, hashtags, cashtags and URLs are valid - } else { - entity->length = 0; - } - - nested_entities_stack.pop_back(); - } - while (current_entity < entities.size() && utf16_offset >= entities[current_entity].offset) { - nested_entities_stack.push_back(&entities[current_entity++]); - } - - if (pos == text.size()) { - break; - } - - if (!nested_entities_stack.empty() && nested_entities_stack.back()->offset == utf16_offset && - (text[pos] == '\n' || text[pos] == ' ')) { - // entities was fixed, so there can't be more than one splittable entity of each type, one blockquote and - // one continuous entity for the given offset - for (size_t i = nested_entities_stack.size(); i > 0; i--) { - auto *entity = nested_entities_stack[i - 1]; - if (entity->offset != utf16_offset || is_hidden_data_entity(entity->type)) { - break; - } - entity->offset++; - entity->length--; - if (entity->length == 0) { - CHECK(i == nested_entities_stack.size()); - nested_entities_stack.pop_back(); - } - } - } - + for (size_t pos = 0; pos < text.size(); pos++) { auto c = static_cast(text[pos]); switch (c) { case '\n': @@ -4199,11 +4154,6 @@ static std::pair remove_invalid_entities(const string &text, vect utf16_offset++; } - CHECK(nested_entities_stack.empty()); - CHECK(current_entity == entities.size()); - - remove_empty_entities(entities); - return {last_non_whitespace_pos, last_non_whitespace_utf16_offset}; } @@ -4429,7 +4379,6 @@ Status fix_formatted_text(string &text, vector &entities, bool al CHECK(last_non_whitespace_pos < result.size()); result.resize(last_non_whitespace_pos + 1); while (!entities.empty() && entities.back().offset > last_non_whitespace_utf16_offset) { - CHECK(is_hidden_data_entity(entities.back().type)); entities.pop_back(); } bool need_sort = false; @@ -4490,9 +4439,6 @@ Status fix_formatted_text(string &text, vector &entities, bool al merge_new_entities(entities, find_media_timestamp_entities(text)); } - // new whitespace-only entities could be added after splitting of entities - remove_invalid_entities(text, entities); - return Status::OK(); } @@ -4807,7 +4753,7 @@ int32 search_quote(FormattedText &&text, FormattedText &"e, int32 quote_posi }); remove_empty_entities(text.entities); fix_entities(text.entities); - remove_invalid_entities(text.text, text.entities); + remove_empty_entities(text.entities); }; int32 length = text_length(text.text); int32 quote_length = text_length(quote.text); diff --git a/test/message_entities.cpp b/test/message_entities.cpp index 73c4ab36f..e0ddb2e54 100644 --- a/test/message_entities.cpp +++ b/test/message_entities.cpp @@ -787,17 +787,13 @@ TEST(MessageEntities, fix_formatted_text) { entities.emplace_back(td::MessageEntity::Type::Bold, 0, i); td::vector fixed_entities; - if (i != 33) { - fixed_entities.emplace_back(td::MessageEntity::Type::Bold, 32, i - 33); - } + fixed_entities.emplace_back(td::MessageEntity::Type::Bold, 0, i - 1 /* deleted \r */); check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, true); - td::string expected_str; + td::string expected_str = fixed_str.substr(0, 33); if (i != 33) { - fixed_entities.back().offset = 0; - fixed_entities.back().length = 1; + fixed_entities.back().length = 33; } - expected_str = "a"; check_fix_formatted_text(str, entities, expected_str, fixed_entities, false, false, false, false); } @@ -809,8 +805,13 @@ TEST(MessageEntities, fix_formatted_text) { check_fix_formatted_text(str, entities, true, true, true, true); check_fix_formatted_text(str, entities, false, false, false, false); } else { - check_fix_formatted_text(str, entities, str, {}, true, true, true, true); - check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {}, false, false, false, false); + check_fix_formatted_text(str, entities, str, {{td::MessageEntity::Type::Bold, i, 1}}, true, true, true, true); + if (i == 2) { + check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {{td::MessageEntity::Type::Bold, i, 1}}, + false, false, false, false); + } else { + check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {}, false, false, false, false); + } } } @@ -850,18 +851,9 @@ TEST(MessageEntities, fix_formatted_text) { fixed_str = skip_trim ? "aba \n caba " : "aba \n caba"; auto fixed_length = offset <= 4 && offset + length >= 5 ? length - 1 : length; auto fixed_offset = offset >= 5 ? offset - 1 : offset; - if (static_cast(fixed_offset) >= fixed_str.size()) { - fixed_length = 0; - } while (static_cast(fixed_offset + fixed_length) > fixed_str.size()) { fixed_length--; } - if (type == td::MessageEntity::Type::Bold || type == td::MessageEntity::Type::Url) { - while (fixed_length > 0 && (fixed_str[fixed_offset] == ' ' || fixed_str[fixed_offset] == '\n')) { - fixed_offset++; - fixed_length--; - } - } td::vector entities; entities.emplace_back(type, offset, length); @@ -872,17 +864,11 @@ TEST(MessageEntities, fix_formatted_text) { } td::vector fixed_entities; if (fixed_length > 0) { - for (auto i = 0; i < length; i++) { - if (!td::is_space(str[offset + i]) || type == td::MessageEntity::Type::TextUrl || - type == td::MessageEntity::Type::MentionName) { - fixed_entities.emplace_back(type, fixed_offset, fixed_length); - if (type == td::MessageEntity::Type::TextUrl) { - fixed_entities.back().argument = "t.me"; - } else if (type == td::MessageEntity::Type::MentionName) { - fixed_entities.back().user_id = user_id; - } - break; - } + fixed_entities.emplace_back(type, fixed_offset, fixed_length); + if (type == td::MessageEntity::Type::TextUrl) { + fixed_entities.back().argument = "t.me"; + } else if (type == td::MessageEntity::Type::MentionName) { + fixed_entities.back().user_id = user_id; } } check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, skip_trim); @@ -904,13 +890,7 @@ TEST(MessageEntities, fix_formatted_text) { td::vector fixed_entities; if (length > 0) { - if (offset == 3) { - if (length >= 2) { - fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset + 1, length - 1); - } - } else { - fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset, length); - } + fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset, length); } check_fix_formatted_text(str, entities, str, fixed_entities, true, false, false, false); @@ -958,6 +938,9 @@ TEST(MessageEntities, fix_formatted_text) { if (i < 4) { fixed_entities.emplace_back(td::MessageEntity::Type::Bold, i * 3, 2); } + if (i < 3) { + fixed_entities.emplace_back(td::MessageEntity::Type::Italic, i * 3 + 2, 1); + } } check_fix_formatted_text(str, entities, td::utf8_utf16_substr(str, 3, 11).str(), fixed_entities, false, false, @@ -974,10 +957,10 @@ TEST(MessageEntities, fix_formatted_text) { check_fix_formatted_text("a \r", {{td::MessageEntity::Type::Bold, 0, 3}, {td::MessageEntity::Type::Underline, 2, 1}}, "a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true); check_fix_formatted_text("a \r ", {{td::MessageEntity::Type::Bold, 0, 4}, {td::MessageEntity::Type::Underline, 2, 1}}, - "a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true); - check_fix_formatted_text( - "a \r b", {{td::MessageEntity::Type::Bold, 0, 5}, {td::MessageEntity::Type::Underline, 2, 1}}, "a b", - {{td::MessageEntity::Type::Bold, 0, 2}, {td::MessageEntity::Type::Bold, 3, 1}}, true, false, false, true); + "a ", {{td::MessageEntity::Type::Bold, 0, 3}}, true, false, false, true); + check_fix_formatted_text("a \r b", + {{td::MessageEntity::Type::Bold, 0, 5}, {td::MessageEntity::Type::Underline, 2, 1}}, "a b", + {{td::MessageEntity::Type::Bold, 0, 4}}, true, false, false, true); check_fix_formatted_text("a\rbc\r", {{td::MessageEntity::Type::Italic, 0, 1}, @@ -1014,6 +997,7 @@ TEST(MessageEntities, fix_formatted_text) { check_fix_formatted_text("@tests @tests", {{td::MessageEntity::Type::Italic, 0, 13}}, "@tests @tests", {{td::MessageEntity::Type::Mention, 0, 6}, {td::MessageEntity::Type::Italic, 0, 6}, + {td::MessageEntity::Type::Italic, 6, 1}, {td::MessageEntity::Type::Mention, 7, 6}, {td::MessageEntity::Type::Italic, 7, 6}}); @@ -1113,7 +1097,7 @@ TEST(MessageEntities, fix_formatted_text) { check_fix_formatted_text("example.com a", {{td::MessageEntity::Type::Italic, 0, 13}}, "example.com a", {{td::MessageEntity::Type::Url, 0, 11}, {td::MessageEntity::Type::Italic, 0, 11}, - {td::MessageEntity::Type::Italic, 12, 1}}); + {td::MessageEntity::Type::Italic, 11, 2}}); check_fix_formatted_text("a example.com", {{td::MessageEntity::Type::Italic, 0, 13}}, "a example.com", {{td::MessageEntity::Type::Italic, 0, 2}, {td::MessageEntity::Type::Url, 2, 11}, @@ -1679,8 +1663,8 @@ TEST(MessageEntities, parse_markdown_v3) { check_parse_markdown_v3("__ __", " ", {{td::MessageEntity::Type::Italic, 0, 1}}); check_parse_markdown_v3("__\n__", "\n", {{td::MessageEntity::Type::Italic, 0, 1}}); - check_parse_markdown_v3("__ __a", " a", {}, true); - check_parse_markdown_v3("__\n__a", "\na", {}, true); + check_parse_markdown_v3("__ __a", " a", {{td::MessageEntity::Type::Italic, 0, 1}}, true); + check_parse_markdown_v3("__\n__a", "\na", {{td::MessageEntity::Type::Italic, 0, 1}}, true); check_parse_markdown_v3("**** __a__ **b** ~~c~~ ||d||", "**** a b c d", {{td::MessageEntity::Type::Italic, 5, 1}, {td::MessageEntity::Type::Bold, 7, 1},