Don't remove whitespace-only entities.

This commit is contained in:
levlam 2023-12-04 18:33:40 +03:00
parent 9a9e3be8cf
commit 4c052a5efa
2 changed files with 30 additions and 100 deletions

View File

@ -4107,11 +4107,10 @@ static Result<string> clean_input_string_with_entities(const string &text, vecto
}
replace_offending_characters(result);
return result;
}
// removes entities containing whitespaces only
// removes empty entities
// entities must be sorted by offset and length, but not necessarily by type
// returns {last_non_whitespace_pos, last_non_whitespace_utf16_offset}
static std::pair<size_t, int32> remove_invalid_entities(const string &text, vector<MessageEntity> &entities) {
@ -4129,8 +4128,6 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
}
// check_is_sorted(entities);
vector<MessageEntity *> nested_entities_stack;
size_t current_entity = 0;
size_t last_non_whitespace_pos = text.size();
@ -4139,49 +4136,7 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
remove_empty_entities(entities);
for (size_t pos = 0; pos <= text.size(); pos++) {
while (!nested_entities_stack.empty()) {
auto *entity = nested_entities_stack.back();
auto entity_end = entity->offset + entity->length;
if (utf16_offset < entity_end) {
break;
}
if (last_non_whitespace_utf16_offset >= entity->offset || is_hidden_data_entity(entity->type)) {
// keep entity
// TODO check entity for validness, for example, that mentions, hashtags, cashtags and URLs are valid
} else {
entity->length = 0;
}
nested_entities_stack.pop_back();
}
while (current_entity < entities.size() && utf16_offset >= entities[current_entity].offset) {
nested_entities_stack.push_back(&entities[current_entity++]);
}
if (pos == text.size()) {
break;
}
if (!nested_entities_stack.empty() && nested_entities_stack.back()->offset == utf16_offset &&
(text[pos] == '\n' || text[pos] == ' ')) {
// entities were fixed, so there can't be more than one splittable entity of each type, one blockquote and
// one continuous entity for the given offset
for (size_t i = nested_entities_stack.size(); i > 0; i--) {
auto *entity = nested_entities_stack[i - 1];
if (entity->offset != utf16_offset || is_hidden_data_entity(entity->type)) {
break;
}
entity->offset++;
entity->length--;
if (entity->length == 0) {
CHECK(i == nested_entities_stack.size());
nested_entities_stack.pop_back();
}
}
}
for (size_t pos = 0; pos < text.size(); pos++) {
auto c = static_cast<unsigned char>(text[pos]);
switch (c) {
case '\n':
@ -4199,11 +4154,6 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
utf16_offset++;
}
CHECK(nested_entities_stack.empty());
CHECK(current_entity == entities.size());
remove_empty_entities(entities);
return {last_non_whitespace_pos, last_non_whitespace_utf16_offset};
}
@ -4429,7 +4379,6 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
CHECK(last_non_whitespace_pos < result.size());
result.resize(last_non_whitespace_pos + 1);
while (!entities.empty() && entities.back().offset > last_non_whitespace_utf16_offset) {
CHECK(is_hidden_data_entity(entities.back().type));
entities.pop_back();
}
bool need_sort = false;
@ -4490,9 +4439,6 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
merge_new_entities(entities, find_media_timestamp_entities(text));
}
// new whitespace-only entities could be added after splitting of entities
remove_invalid_entities(text, entities);
return Status::OK();
}
@ -4807,7 +4753,7 @@ int32 search_quote(FormattedText &&text, FormattedText &&quote, int32 quote_posi
});
remove_empty_entities(text.entities);
fix_entities(text.entities);
remove_invalid_entities(text.text, text.entities);
remove_empty_entities(text.entities);
};
int32 length = text_length(text.text);
int32 quote_length = text_length(quote.text);

View File

@ -787,17 +787,13 @@ TEST(MessageEntities, fix_formatted_text) {
entities.emplace_back(td::MessageEntity::Type::Bold, 0, i);
td::vector<td::MessageEntity> fixed_entities;
if (i != 33) {
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, 32, i - 33);
}
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, 0, i - 1 /* deleted \r */);
check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, true);
td::string expected_str;
td::string expected_str = fixed_str.substr(0, 33);
if (i != 33) {
fixed_entities.back().offset = 0;
fixed_entities.back().length = 1;
fixed_entities.back().length = 33;
}
expected_str = "a";
check_fix_formatted_text(str, entities, expected_str, fixed_entities, false, false, false, false);
}
@ -809,8 +805,13 @@ TEST(MessageEntities, fix_formatted_text) {
check_fix_formatted_text(str, entities, true, true, true, true);
check_fix_formatted_text(str, entities, false, false, false, false);
} else {
check_fix_formatted_text(str, entities, str, {}, true, true, true, true);
check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {}, false, false, false, false);
check_fix_formatted_text(str, entities, str, {{td::MessageEntity::Type::Bold, i, 1}}, true, true, true, true);
if (i == 2) {
check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {{td::MessageEntity::Type::Bold, i, 1}},
false, false, false, false);
} else {
check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {}, false, false, false, false);
}
}
}
@ -850,18 +851,9 @@ TEST(MessageEntities, fix_formatted_text) {
fixed_str = skip_trim ? "aba \n caba " : "aba \n caba";
auto fixed_length = offset <= 4 && offset + length >= 5 ? length - 1 : length;
auto fixed_offset = offset >= 5 ? offset - 1 : offset;
if (static_cast<size_t>(fixed_offset) >= fixed_str.size()) {
fixed_length = 0;
}
while (static_cast<size_t>(fixed_offset + fixed_length) > fixed_str.size()) {
fixed_length--;
}
if (type == td::MessageEntity::Type::Bold || type == td::MessageEntity::Type::Url) {
while (fixed_length > 0 && (fixed_str[fixed_offset] == ' ' || fixed_str[fixed_offset] == '\n')) {
fixed_offset++;
fixed_length--;
}
}
td::vector<td::MessageEntity> entities;
entities.emplace_back(type, offset, length);
@ -872,17 +864,11 @@ TEST(MessageEntities, fix_formatted_text) {
}
td::vector<td::MessageEntity> fixed_entities;
if (fixed_length > 0) {
for (auto i = 0; i < length; i++) {
if (!td::is_space(str[offset + i]) || type == td::MessageEntity::Type::TextUrl ||
type == td::MessageEntity::Type::MentionName) {
fixed_entities.emplace_back(type, fixed_offset, fixed_length);
if (type == td::MessageEntity::Type::TextUrl) {
fixed_entities.back().argument = "t.me";
} else if (type == td::MessageEntity::Type::MentionName) {
fixed_entities.back().user_id = user_id;
}
break;
}
fixed_entities.emplace_back(type, fixed_offset, fixed_length);
if (type == td::MessageEntity::Type::TextUrl) {
fixed_entities.back().argument = "t.me";
} else if (type == td::MessageEntity::Type::MentionName) {
fixed_entities.back().user_id = user_id;
}
}
check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, skip_trim);
@ -904,13 +890,7 @@ TEST(MessageEntities, fix_formatted_text) {
td::vector<td::MessageEntity> fixed_entities;
if (length > 0) {
if (offset == 3) {
if (length >= 2) {
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset + 1, length - 1);
}
} else {
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset, length);
}
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset, length);
}
check_fix_formatted_text(str, entities, str, fixed_entities, true, false, false, false);
@ -958,6 +938,9 @@ TEST(MessageEntities, fix_formatted_text) {
if (i < 4) {
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, i * 3, 2);
}
if (i < 3) {
fixed_entities.emplace_back(td::MessageEntity::Type::Italic, i * 3 + 2, 1);
}
}
check_fix_formatted_text(str, entities, td::utf8_utf16_substr(str, 3, 11).str(), fixed_entities, false, false,
@ -974,10 +957,10 @@ TEST(MessageEntities, fix_formatted_text) {
check_fix_formatted_text("a \r", {{td::MessageEntity::Type::Bold, 0, 3}, {td::MessageEntity::Type::Underline, 2, 1}},
"a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true);
check_fix_formatted_text("a \r ", {{td::MessageEntity::Type::Bold, 0, 4}, {td::MessageEntity::Type::Underline, 2, 1}},
"a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true);
check_fix_formatted_text(
"a \r b", {{td::MessageEntity::Type::Bold, 0, 5}, {td::MessageEntity::Type::Underline, 2, 1}}, "a b",
{{td::MessageEntity::Type::Bold, 0, 2}, {td::MessageEntity::Type::Bold, 3, 1}}, true, false, false, true);
"a ", {{td::MessageEntity::Type::Bold, 0, 3}}, true, false, false, true);
check_fix_formatted_text("a \r b",
{{td::MessageEntity::Type::Bold, 0, 5}, {td::MessageEntity::Type::Underline, 2, 1}}, "a b",
{{td::MessageEntity::Type::Bold, 0, 4}}, true, false, false, true);
check_fix_formatted_text("a\rbc\r",
{{td::MessageEntity::Type::Italic, 0, 1},
@ -1014,6 +997,7 @@ TEST(MessageEntities, fix_formatted_text) {
check_fix_formatted_text("@tests @tests", {{td::MessageEntity::Type::Italic, 0, 13}}, "@tests @tests",
{{td::MessageEntity::Type::Mention, 0, 6},
{td::MessageEntity::Type::Italic, 0, 6},
{td::MessageEntity::Type::Italic, 6, 1},
{td::MessageEntity::Type::Mention, 7, 6},
{td::MessageEntity::Type::Italic, 7, 6}});
@ -1113,7 +1097,7 @@ TEST(MessageEntities, fix_formatted_text) {
check_fix_formatted_text("example.com a", {{td::MessageEntity::Type::Italic, 0, 13}}, "example.com a",
{{td::MessageEntity::Type::Url, 0, 11},
{td::MessageEntity::Type::Italic, 0, 11},
{td::MessageEntity::Type::Italic, 12, 1}});
{td::MessageEntity::Type::Italic, 11, 2}});
check_fix_formatted_text("a example.com", {{td::MessageEntity::Type::Italic, 0, 13}}, "a example.com",
{{td::MessageEntity::Type::Italic, 0, 2},
{td::MessageEntity::Type::Url, 2, 11},
@ -1679,8 +1663,8 @@ TEST(MessageEntities, parse_markdown_v3) {
check_parse_markdown_v3("__ __", " ", {{td::MessageEntity::Type::Italic, 0, 1}});
check_parse_markdown_v3("__\n__", "\n", {{td::MessageEntity::Type::Italic, 0, 1}});
check_parse_markdown_v3("__ __a", " a", {}, true);
check_parse_markdown_v3("__\n__a", "\na", {}, true);
check_parse_markdown_v3("__ __a", " a", {{td::MessageEntity::Type::Italic, 0, 1}}, true);
check_parse_markdown_v3("__\n__a", "\na", {{td::MessageEntity::Type::Italic, 0, 1}}, true);
check_parse_markdown_v3("**** __a__ **b** ~~c~~ ||d||", "**** a b c d",
{{td::MessageEntity::Type::Italic, 5, 1},
{td::MessageEntity::Type::Bold, 7, 1},