Don't remove whitespace-only entities.

This commit is contained in:
levlam 2023-12-04 18:33:40 +03:00
parent 9a9e3be8cf
commit 4c052a5efa
2 changed files with 30 additions and 100 deletions

View File

@ -4107,11 +4107,10 @@ static Result<string> clean_input_string_with_entities(const string &text, vecto
} }
replace_offending_characters(result); replace_offending_characters(result);
return result; return result;
} }
// removes entities containing whitespaces only // removes empty entities
// entities must be sorted by offset and length, but not necessary by type // entities must be sorted by offset and length, but not necessary by type
// returns {last_non_whitespace_pos, last_non_whitespace_utf16_offset} // returns {last_non_whitespace_pos, last_non_whitespace_utf16_offset}
static std::pair<size_t, int32> remove_invalid_entities(const string &text, vector<MessageEntity> &entities) { static std::pair<size_t, int32> remove_invalid_entities(const string &text, vector<MessageEntity> &entities) {
@ -4129,8 +4128,6 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
} }
// check_is_sorted(entities); // check_is_sorted(entities);
vector<MessageEntity *> nested_entities_stack;
size_t current_entity = 0;
size_t last_non_whitespace_pos = text.size(); size_t last_non_whitespace_pos = text.size();
@ -4139,49 +4136,7 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
remove_empty_entities(entities); remove_empty_entities(entities);
for (size_t pos = 0; pos <= text.size(); pos++) { for (size_t pos = 0; pos < text.size(); pos++) {
while (!nested_entities_stack.empty()) {
auto *entity = nested_entities_stack.back();
auto entity_end = entity->offset + entity->length;
if (utf16_offset < entity_end) {
break;
}
if (last_non_whitespace_utf16_offset >= entity->offset || is_hidden_data_entity(entity->type)) {
// keep entity
// TODO check entity for validness, for example, that mentions, hashtags, cashtags and URLs are valid
} else {
entity->length = 0;
}
nested_entities_stack.pop_back();
}
while (current_entity < entities.size() && utf16_offset >= entities[current_entity].offset) {
nested_entities_stack.push_back(&entities[current_entity++]);
}
if (pos == text.size()) {
break;
}
if (!nested_entities_stack.empty() && nested_entities_stack.back()->offset == utf16_offset &&
(text[pos] == '\n' || text[pos] == ' ')) {
// entities was fixed, so there can't be more than one splittable entity of each type, one blockquote and
// one continuous entity for the given offset
for (size_t i = nested_entities_stack.size(); i > 0; i--) {
auto *entity = nested_entities_stack[i - 1];
if (entity->offset != utf16_offset || is_hidden_data_entity(entity->type)) {
break;
}
entity->offset++;
entity->length--;
if (entity->length == 0) {
CHECK(i == nested_entities_stack.size());
nested_entities_stack.pop_back();
}
}
}
auto c = static_cast<unsigned char>(text[pos]); auto c = static_cast<unsigned char>(text[pos]);
switch (c) { switch (c) {
case '\n': case '\n':
@ -4199,11 +4154,6 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
utf16_offset++; utf16_offset++;
} }
CHECK(nested_entities_stack.empty());
CHECK(current_entity == entities.size());
remove_empty_entities(entities);
return {last_non_whitespace_pos, last_non_whitespace_utf16_offset}; return {last_non_whitespace_pos, last_non_whitespace_utf16_offset};
} }
@ -4429,7 +4379,6 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
CHECK(last_non_whitespace_pos < result.size()); CHECK(last_non_whitespace_pos < result.size());
result.resize(last_non_whitespace_pos + 1); result.resize(last_non_whitespace_pos + 1);
while (!entities.empty() && entities.back().offset > last_non_whitespace_utf16_offset) { while (!entities.empty() && entities.back().offset > last_non_whitespace_utf16_offset) {
CHECK(is_hidden_data_entity(entities.back().type));
entities.pop_back(); entities.pop_back();
} }
bool need_sort = false; bool need_sort = false;
@ -4490,9 +4439,6 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
merge_new_entities(entities, find_media_timestamp_entities(text)); merge_new_entities(entities, find_media_timestamp_entities(text));
} }
// new whitespace-only entities could be added after splitting of entities
remove_invalid_entities(text, entities);
return Status::OK(); return Status::OK();
} }
@ -4807,7 +4753,7 @@ int32 search_quote(FormattedText &&text, FormattedText &&quote, int32 quote_posi
}); });
remove_empty_entities(text.entities); remove_empty_entities(text.entities);
fix_entities(text.entities); fix_entities(text.entities);
remove_invalid_entities(text.text, text.entities); remove_empty_entities(text.entities);
}; };
int32 length = text_length(text.text); int32 length = text_length(text.text);
int32 quote_length = text_length(quote.text); int32 quote_length = text_length(quote.text);

View File

@ -787,17 +787,13 @@ TEST(MessageEntities, fix_formatted_text) {
entities.emplace_back(td::MessageEntity::Type::Bold, 0, i); entities.emplace_back(td::MessageEntity::Type::Bold, 0, i);
td::vector<td::MessageEntity> fixed_entities; td::vector<td::MessageEntity> fixed_entities;
if (i != 33) { fixed_entities.emplace_back(td::MessageEntity::Type::Bold, 0, i - 1 /* deleted \r */);
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, 32, i - 33);
}
check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, true); check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, true);
td::string expected_str; td::string expected_str = fixed_str.substr(0, 33);
if (i != 33) { if (i != 33) {
fixed_entities.back().offset = 0; fixed_entities.back().length = 33;
fixed_entities.back().length = 1;
} }
expected_str = "a";
check_fix_formatted_text(str, entities, expected_str, fixed_entities, false, false, false, false); check_fix_formatted_text(str, entities, expected_str, fixed_entities, false, false, false, false);
} }
@ -809,10 +805,15 @@ TEST(MessageEntities, fix_formatted_text) {
check_fix_formatted_text(str, entities, true, true, true, true); check_fix_formatted_text(str, entities, true, true, true, true);
check_fix_formatted_text(str, entities, false, false, false, false); check_fix_formatted_text(str, entities, false, false, false, false);
} else { } else {
check_fix_formatted_text(str, entities, str, {}, true, true, true, true); check_fix_formatted_text(str, entities, str, {{td::MessageEntity::Type::Bold, i, 1}}, true, true, true, true);
if (i == 2) {
check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {{td::MessageEntity::Type::Bold, i, 1}},
false, false, false, false);
} else {
check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {}, false, false, false, false); check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {}, false, false, false, false);
} }
} }
}
str = " /test @abaca #ORD $ABC telegram.org "; str = " /test @abaca #ORD $ABC telegram.org ";
for (auto skip_trim : {false, true}) { for (auto skip_trim : {false, true}) {
@ -850,18 +851,9 @@ TEST(MessageEntities, fix_formatted_text) {
fixed_str = skip_trim ? "aba \n caba " : "aba \n caba"; fixed_str = skip_trim ? "aba \n caba " : "aba \n caba";
auto fixed_length = offset <= 4 && offset + length >= 5 ? length - 1 : length; auto fixed_length = offset <= 4 && offset + length >= 5 ? length - 1 : length;
auto fixed_offset = offset >= 5 ? offset - 1 : offset; auto fixed_offset = offset >= 5 ? offset - 1 : offset;
if (static_cast<size_t>(fixed_offset) >= fixed_str.size()) {
fixed_length = 0;
}
while (static_cast<size_t>(fixed_offset + fixed_length) > fixed_str.size()) { while (static_cast<size_t>(fixed_offset + fixed_length) > fixed_str.size()) {
fixed_length--; fixed_length--;
} }
if (type == td::MessageEntity::Type::Bold || type == td::MessageEntity::Type::Url) {
while (fixed_length > 0 && (fixed_str[fixed_offset] == ' ' || fixed_str[fixed_offset] == '\n')) {
fixed_offset++;
fixed_length--;
}
}
td::vector<td::MessageEntity> entities; td::vector<td::MessageEntity> entities;
entities.emplace_back(type, offset, length); entities.emplace_back(type, offset, length);
@ -872,18 +864,12 @@ TEST(MessageEntities, fix_formatted_text) {
} }
td::vector<td::MessageEntity> fixed_entities; td::vector<td::MessageEntity> fixed_entities;
if (fixed_length > 0) { if (fixed_length > 0) {
for (auto i = 0; i < length; i++) {
if (!td::is_space(str[offset + i]) || type == td::MessageEntity::Type::TextUrl ||
type == td::MessageEntity::Type::MentionName) {
fixed_entities.emplace_back(type, fixed_offset, fixed_length); fixed_entities.emplace_back(type, fixed_offset, fixed_length);
if (type == td::MessageEntity::Type::TextUrl) { if (type == td::MessageEntity::Type::TextUrl) {
fixed_entities.back().argument = "t.me"; fixed_entities.back().argument = "t.me";
} else if (type == td::MessageEntity::Type::MentionName) { } else if (type == td::MessageEntity::Type::MentionName) {
fixed_entities.back().user_id = user_id; fixed_entities.back().user_id = user_id;
} }
break;
}
}
} }
check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, skip_trim); check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, skip_trim);
} }
@ -904,14 +890,8 @@ TEST(MessageEntities, fix_formatted_text) {
td::vector<td::MessageEntity> fixed_entities; td::vector<td::MessageEntity> fixed_entities;
if (length > 0) { if (length > 0) {
if (offset == 3) {
if (length >= 2) {
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset + 1, length - 1);
}
} else {
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset, length); fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset, length);
} }
}
check_fix_formatted_text(str, entities, str, fixed_entities, true, false, false, false); check_fix_formatted_text(str, entities, str, fixed_entities, true, false, false, false);
check_fix_formatted_text(str, entities, str, fixed_entities, false, false, false, true); check_fix_formatted_text(str, entities, str, fixed_entities, false, false, false, true);
@ -958,6 +938,9 @@ TEST(MessageEntities, fix_formatted_text) {
if (i < 4) { if (i < 4) {
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, i * 3, 2); fixed_entities.emplace_back(td::MessageEntity::Type::Bold, i * 3, 2);
} }
if (i < 3) {
fixed_entities.emplace_back(td::MessageEntity::Type::Italic, i * 3 + 2, 1);
}
} }
check_fix_formatted_text(str, entities, td::utf8_utf16_substr(str, 3, 11).str(), fixed_entities, false, false, check_fix_formatted_text(str, entities, td::utf8_utf16_substr(str, 3, 11).str(), fixed_entities, false, false,
@ -974,10 +957,10 @@ TEST(MessageEntities, fix_formatted_text) {
check_fix_formatted_text("a \r", {{td::MessageEntity::Type::Bold, 0, 3}, {td::MessageEntity::Type::Underline, 2, 1}}, check_fix_formatted_text("a \r", {{td::MessageEntity::Type::Bold, 0, 3}, {td::MessageEntity::Type::Underline, 2, 1}},
"a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true); "a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true);
check_fix_formatted_text("a \r ", {{td::MessageEntity::Type::Bold, 0, 4}, {td::MessageEntity::Type::Underline, 2, 1}}, check_fix_formatted_text("a \r ", {{td::MessageEntity::Type::Bold, 0, 4}, {td::MessageEntity::Type::Underline, 2, 1}},
"a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true); "a ", {{td::MessageEntity::Type::Bold, 0, 3}}, true, false, false, true);
check_fix_formatted_text( check_fix_formatted_text("a \r b",
"a \r b", {{td::MessageEntity::Type::Bold, 0, 5}, {td::MessageEntity::Type::Underline, 2, 1}}, "a b", {{td::MessageEntity::Type::Bold, 0, 5}, {td::MessageEntity::Type::Underline, 2, 1}}, "a b",
{{td::MessageEntity::Type::Bold, 0, 2}, {td::MessageEntity::Type::Bold, 3, 1}}, true, false, false, true); {{td::MessageEntity::Type::Bold, 0, 4}}, true, false, false, true);
check_fix_formatted_text("a\rbc\r", check_fix_formatted_text("a\rbc\r",
{{td::MessageEntity::Type::Italic, 0, 1}, {{td::MessageEntity::Type::Italic, 0, 1},
@ -1014,6 +997,7 @@ TEST(MessageEntities, fix_formatted_text) {
check_fix_formatted_text("@tests @tests", {{td::MessageEntity::Type::Italic, 0, 13}}, "@tests @tests", check_fix_formatted_text("@tests @tests", {{td::MessageEntity::Type::Italic, 0, 13}}, "@tests @tests",
{{td::MessageEntity::Type::Mention, 0, 6}, {{td::MessageEntity::Type::Mention, 0, 6},
{td::MessageEntity::Type::Italic, 0, 6}, {td::MessageEntity::Type::Italic, 0, 6},
{td::MessageEntity::Type::Italic, 6, 1},
{td::MessageEntity::Type::Mention, 7, 6}, {td::MessageEntity::Type::Mention, 7, 6},
{td::MessageEntity::Type::Italic, 7, 6}}); {td::MessageEntity::Type::Italic, 7, 6}});
@ -1113,7 +1097,7 @@ TEST(MessageEntities, fix_formatted_text) {
check_fix_formatted_text("example.com a", {{td::MessageEntity::Type::Italic, 0, 13}}, "example.com a", check_fix_formatted_text("example.com a", {{td::MessageEntity::Type::Italic, 0, 13}}, "example.com a",
{{td::MessageEntity::Type::Url, 0, 11}, {{td::MessageEntity::Type::Url, 0, 11},
{td::MessageEntity::Type::Italic, 0, 11}, {td::MessageEntity::Type::Italic, 0, 11},
{td::MessageEntity::Type::Italic, 12, 1}}); {td::MessageEntity::Type::Italic, 11, 2}});
check_fix_formatted_text("a example.com", {{td::MessageEntity::Type::Italic, 0, 13}}, "a example.com", check_fix_formatted_text("a example.com", {{td::MessageEntity::Type::Italic, 0, 13}}, "a example.com",
{{td::MessageEntity::Type::Italic, 0, 2}, {{td::MessageEntity::Type::Italic, 0, 2},
{td::MessageEntity::Type::Url, 2, 11}, {td::MessageEntity::Type::Url, 2, 11},
@ -1679,8 +1663,8 @@ TEST(MessageEntities, parse_markdown_v3) {
check_parse_markdown_v3("__ __", " ", {{td::MessageEntity::Type::Italic, 0, 1}}); check_parse_markdown_v3("__ __", " ", {{td::MessageEntity::Type::Italic, 0, 1}});
check_parse_markdown_v3("__\n__", "\n", {{td::MessageEntity::Type::Italic, 0, 1}}); check_parse_markdown_v3("__\n__", "\n", {{td::MessageEntity::Type::Italic, 0, 1}});
check_parse_markdown_v3("__ __a", " a", {}, true); check_parse_markdown_v3("__ __a", " a", {{td::MessageEntity::Type::Italic, 0, 1}}, true);
check_parse_markdown_v3("__\n__a", "\na", {}, true); check_parse_markdown_v3("__\n__a", "\na", {{td::MessageEntity::Type::Italic, 0, 1}}, true);
check_parse_markdown_v3("**** __a__ **b** ~~c~~ ||d||", "**** a b c d", check_parse_markdown_v3("**** __a__ **b** ~~c~~ ||d||", "**** a b c d",
{{td::MessageEntity::Type::Italic, 5, 1}, {{td::MessageEntity::Type::Italic, 5, 1},
{td::MessageEntity::Type::Bold, 7, 1}, {td::MessageEntity::Type::Bold, 7, 1},