Don't remove whitespace-only entities.
This commit is contained in:
parent
9a9e3be8cf
commit
4c052a5efa
@ -4107,11 +4107,10 @@ static Result<string> clean_input_string_with_entities(const string &text, vecto
|
|||||||
}
|
}
|
||||||
|
|
||||||
replace_offending_characters(result);
|
replace_offending_characters(result);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// removes entities containing whitespaces only
|
// removes empty entities
|
||||||
// entities must be sorted by offset and length, but not necessary by type
|
// entities must be sorted by offset and length, but not necessary by type
|
||||||
// returns {last_non_whitespace_pos, last_non_whitespace_utf16_offset}
|
// returns {last_non_whitespace_pos, last_non_whitespace_utf16_offset}
|
||||||
static std::pair<size_t, int32> remove_invalid_entities(const string &text, vector<MessageEntity> &entities) {
|
static std::pair<size_t, int32> remove_invalid_entities(const string &text, vector<MessageEntity> &entities) {
|
||||||
@ -4129,8 +4128,6 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
|
|||||||
}
|
}
|
||||||
|
|
||||||
// check_is_sorted(entities);
|
// check_is_sorted(entities);
|
||||||
vector<MessageEntity *> nested_entities_stack;
|
|
||||||
size_t current_entity = 0;
|
|
||||||
|
|
||||||
size_t last_non_whitespace_pos = text.size();
|
size_t last_non_whitespace_pos = text.size();
|
||||||
|
|
||||||
@ -4139,49 +4136,7 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
|
|||||||
|
|
||||||
remove_empty_entities(entities);
|
remove_empty_entities(entities);
|
||||||
|
|
||||||
for (size_t pos = 0; pos <= text.size(); pos++) {
|
for (size_t pos = 0; pos < text.size(); pos++) {
|
||||||
while (!nested_entities_stack.empty()) {
|
|
||||||
auto *entity = nested_entities_stack.back();
|
|
||||||
auto entity_end = entity->offset + entity->length;
|
|
||||||
if (utf16_offset < entity_end) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (last_non_whitespace_utf16_offset >= entity->offset || is_hidden_data_entity(entity->type)) {
|
|
||||||
// keep entity
|
|
||||||
// TODO check entity for validness, for example, that mentions, hashtags, cashtags and URLs are valid
|
|
||||||
} else {
|
|
||||||
entity->length = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
nested_entities_stack.pop_back();
|
|
||||||
}
|
|
||||||
while (current_entity < entities.size() && utf16_offset >= entities[current_entity].offset) {
|
|
||||||
nested_entities_stack.push_back(&entities[current_entity++]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (pos == text.size()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!nested_entities_stack.empty() && nested_entities_stack.back()->offset == utf16_offset &&
|
|
||||||
(text[pos] == '\n' || text[pos] == ' ')) {
|
|
||||||
// entities was fixed, so there can't be more than one splittable entity of each type, one blockquote and
|
|
||||||
// one continuous entity for the given offset
|
|
||||||
for (size_t i = nested_entities_stack.size(); i > 0; i--) {
|
|
||||||
auto *entity = nested_entities_stack[i - 1];
|
|
||||||
if (entity->offset != utf16_offset || is_hidden_data_entity(entity->type)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
entity->offset++;
|
|
||||||
entity->length--;
|
|
||||||
if (entity->length == 0) {
|
|
||||||
CHECK(i == nested_entities_stack.size());
|
|
||||||
nested_entities_stack.pop_back();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto c = static_cast<unsigned char>(text[pos]);
|
auto c = static_cast<unsigned char>(text[pos]);
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case '\n':
|
case '\n':
|
||||||
@ -4199,11 +4154,6 @@ static std::pair<size_t, int32> remove_invalid_entities(const string &text, vect
|
|||||||
|
|
||||||
utf16_offset++;
|
utf16_offset++;
|
||||||
}
|
}
|
||||||
CHECK(nested_entities_stack.empty());
|
|
||||||
CHECK(current_entity == entities.size());
|
|
||||||
|
|
||||||
remove_empty_entities(entities);
|
|
||||||
|
|
||||||
return {last_non_whitespace_pos, last_non_whitespace_utf16_offset};
|
return {last_non_whitespace_pos, last_non_whitespace_utf16_offset};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4429,7 +4379,6 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
|
|||||||
CHECK(last_non_whitespace_pos < result.size());
|
CHECK(last_non_whitespace_pos < result.size());
|
||||||
result.resize(last_non_whitespace_pos + 1);
|
result.resize(last_non_whitespace_pos + 1);
|
||||||
while (!entities.empty() && entities.back().offset > last_non_whitespace_utf16_offset) {
|
while (!entities.empty() && entities.back().offset > last_non_whitespace_utf16_offset) {
|
||||||
CHECK(is_hidden_data_entity(entities.back().type));
|
|
||||||
entities.pop_back();
|
entities.pop_back();
|
||||||
}
|
}
|
||||||
bool need_sort = false;
|
bool need_sort = false;
|
||||||
@ -4490,9 +4439,6 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
|
|||||||
merge_new_entities(entities, find_media_timestamp_entities(text));
|
merge_new_entities(entities, find_media_timestamp_entities(text));
|
||||||
}
|
}
|
||||||
|
|
||||||
// new whitespace-only entities could be added after splitting of entities
|
|
||||||
remove_invalid_entities(text, entities);
|
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4807,7 +4753,7 @@ int32 search_quote(FormattedText &&text, FormattedText &&quote, int32 quote_posi
|
|||||||
});
|
});
|
||||||
remove_empty_entities(text.entities);
|
remove_empty_entities(text.entities);
|
||||||
fix_entities(text.entities);
|
fix_entities(text.entities);
|
||||||
remove_invalid_entities(text.text, text.entities);
|
remove_empty_entities(text.entities);
|
||||||
};
|
};
|
||||||
int32 length = text_length(text.text);
|
int32 length = text_length(text.text);
|
||||||
int32 quote_length = text_length(quote.text);
|
int32 quote_length = text_length(quote.text);
|
||||||
|
@ -787,17 +787,13 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
entities.emplace_back(td::MessageEntity::Type::Bold, 0, i);
|
entities.emplace_back(td::MessageEntity::Type::Bold, 0, i);
|
||||||
|
|
||||||
td::vector<td::MessageEntity> fixed_entities;
|
td::vector<td::MessageEntity> fixed_entities;
|
||||||
if (i != 33) {
|
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, 0, i - 1 /* deleted \r */);
|
||||||
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, 32, i - 33);
|
|
||||||
}
|
|
||||||
check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, true);
|
check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, true);
|
||||||
|
|
||||||
td::string expected_str;
|
td::string expected_str = fixed_str.substr(0, 33);
|
||||||
if (i != 33) {
|
if (i != 33) {
|
||||||
fixed_entities.back().offset = 0;
|
fixed_entities.back().length = 33;
|
||||||
fixed_entities.back().length = 1;
|
|
||||||
}
|
}
|
||||||
expected_str = "a";
|
|
||||||
check_fix_formatted_text(str, entities, expected_str, fixed_entities, false, false, false, false);
|
check_fix_formatted_text(str, entities, expected_str, fixed_entities, false, false, false, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -809,10 +805,15 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
check_fix_formatted_text(str, entities, true, true, true, true);
|
check_fix_formatted_text(str, entities, true, true, true, true);
|
||||||
check_fix_formatted_text(str, entities, false, false, false, false);
|
check_fix_formatted_text(str, entities, false, false, false, false);
|
||||||
} else {
|
} else {
|
||||||
check_fix_formatted_text(str, entities, str, {}, true, true, true, true);
|
check_fix_formatted_text(str, entities, str, {{td::MessageEntity::Type::Bold, i, 1}}, true, true, true, true);
|
||||||
|
if (i == 2) {
|
||||||
|
check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {{td::MessageEntity::Type::Bold, i, 1}},
|
||||||
|
false, false, false, false);
|
||||||
|
} else {
|
||||||
check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {}, false, false, false, false);
|
check_fix_formatted_text(str, entities, str.substr(0, str.size() - 2), {}, false, false, false, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
str = " /test @abaca #ORD $ABC telegram.org ";
|
str = " /test @abaca #ORD $ABC telegram.org ";
|
||||||
for (auto skip_trim : {false, true}) {
|
for (auto skip_trim : {false, true}) {
|
||||||
@ -850,18 +851,9 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
fixed_str = skip_trim ? "aba \n caba " : "aba \n caba";
|
fixed_str = skip_trim ? "aba \n caba " : "aba \n caba";
|
||||||
auto fixed_length = offset <= 4 && offset + length >= 5 ? length - 1 : length;
|
auto fixed_length = offset <= 4 && offset + length >= 5 ? length - 1 : length;
|
||||||
auto fixed_offset = offset >= 5 ? offset - 1 : offset;
|
auto fixed_offset = offset >= 5 ? offset - 1 : offset;
|
||||||
if (static_cast<size_t>(fixed_offset) >= fixed_str.size()) {
|
|
||||||
fixed_length = 0;
|
|
||||||
}
|
|
||||||
while (static_cast<size_t>(fixed_offset + fixed_length) > fixed_str.size()) {
|
while (static_cast<size_t>(fixed_offset + fixed_length) > fixed_str.size()) {
|
||||||
fixed_length--;
|
fixed_length--;
|
||||||
}
|
}
|
||||||
if (type == td::MessageEntity::Type::Bold || type == td::MessageEntity::Type::Url) {
|
|
||||||
while (fixed_length > 0 && (fixed_str[fixed_offset] == ' ' || fixed_str[fixed_offset] == '\n')) {
|
|
||||||
fixed_offset++;
|
|
||||||
fixed_length--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
td::vector<td::MessageEntity> entities;
|
td::vector<td::MessageEntity> entities;
|
||||||
entities.emplace_back(type, offset, length);
|
entities.emplace_back(type, offset, length);
|
||||||
@ -872,18 +864,12 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
}
|
}
|
||||||
td::vector<td::MessageEntity> fixed_entities;
|
td::vector<td::MessageEntity> fixed_entities;
|
||||||
if (fixed_length > 0) {
|
if (fixed_length > 0) {
|
||||||
for (auto i = 0; i < length; i++) {
|
|
||||||
if (!td::is_space(str[offset + i]) || type == td::MessageEntity::Type::TextUrl ||
|
|
||||||
type == td::MessageEntity::Type::MentionName) {
|
|
||||||
fixed_entities.emplace_back(type, fixed_offset, fixed_length);
|
fixed_entities.emplace_back(type, fixed_offset, fixed_length);
|
||||||
if (type == td::MessageEntity::Type::TextUrl) {
|
if (type == td::MessageEntity::Type::TextUrl) {
|
||||||
fixed_entities.back().argument = "t.me";
|
fixed_entities.back().argument = "t.me";
|
||||||
} else if (type == td::MessageEntity::Type::MentionName) {
|
} else if (type == td::MessageEntity::Type::MentionName) {
|
||||||
fixed_entities.back().user_id = user_id;
|
fixed_entities.back().user_id = user_id;
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, skip_trim);
|
check_fix_formatted_text(str, entities, fixed_str, fixed_entities, true, false, false, skip_trim);
|
||||||
}
|
}
|
||||||
@ -904,14 +890,8 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
|
|
||||||
td::vector<td::MessageEntity> fixed_entities;
|
td::vector<td::MessageEntity> fixed_entities;
|
||||||
if (length > 0) {
|
if (length > 0) {
|
||||||
if (offset == 3) {
|
|
||||||
if (length >= 2) {
|
|
||||||
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset + 1, length - 1);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset, length);
|
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, offset, length);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
check_fix_formatted_text(str, entities, str, fixed_entities, true, false, false, false);
|
check_fix_formatted_text(str, entities, str, fixed_entities, true, false, false, false);
|
||||||
check_fix_formatted_text(str, entities, str, fixed_entities, false, false, false, true);
|
check_fix_formatted_text(str, entities, str, fixed_entities, false, false, false, true);
|
||||||
@ -958,6 +938,9 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
if (i < 4) {
|
if (i < 4) {
|
||||||
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, i * 3, 2);
|
fixed_entities.emplace_back(td::MessageEntity::Type::Bold, i * 3, 2);
|
||||||
}
|
}
|
||||||
|
if (i < 3) {
|
||||||
|
fixed_entities.emplace_back(td::MessageEntity::Type::Italic, i * 3 + 2, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
check_fix_formatted_text(str, entities, td::utf8_utf16_substr(str, 3, 11).str(), fixed_entities, false, false,
|
check_fix_formatted_text(str, entities, td::utf8_utf16_substr(str, 3, 11).str(), fixed_entities, false, false,
|
||||||
@ -974,10 +957,10 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
check_fix_formatted_text("a \r", {{td::MessageEntity::Type::Bold, 0, 3}, {td::MessageEntity::Type::Underline, 2, 1}},
|
check_fix_formatted_text("a \r", {{td::MessageEntity::Type::Bold, 0, 3}, {td::MessageEntity::Type::Underline, 2, 1}},
|
||||||
"a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true);
|
"a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true);
|
||||||
check_fix_formatted_text("a \r ", {{td::MessageEntity::Type::Bold, 0, 4}, {td::MessageEntity::Type::Underline, 2, 1}},
|
check_fix_formatted_text("a \r ", {{td::MessageEntity::Type::Bold, 0, 4}, {td::MessageEntity::Type::Underline, 2, 1}},
|
||||||
"a ", {{td::MessageEntity::Type::Bold, 0, 2}}, true, false, false, true);
|
"a ", {{td::MessageEntity::Type::Bold, 0, 3}}, true, false, false, true);
|
||||||
check_fix_formatted_text(
|
check_fix_formatted_text("a \r b",
|
||||||
"a \r b", {{td::MessageEntity::Type::Bold, 0, 5}, {td::MessageEntity::Type::Underline, 2, 1}}, "a b",
|
{{td::MessageEntity::Type::Bold, 0, 5}, {td::MessageEntity::Type::Underline, 2, 1}}, "a b",
|
||||||
{{td::MessageEntity::Type::Bold, 0, 2}, {td::MessageEntity::Type::Bold, 3, 1}}, true, false, false, true);
|
{{td::MessageEntity::Type::Bold, 0, 4}}, true, false, false, true);
|
||||||
|
|
||||||
check_fix_formatted_text("a\rbc\r",
|
check_fix_formatted_text("a\rbc\r",
|
||||||
{{td::MessageEntity::Type::Italic, 0, 1},
|
{{td::MessageEntity::Type::Italic, 0, 1},
|
||||||
@ -1014,6 +997,7 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
check_fix_formatted_text("@tests @tests", {{td::MessageEntity::Type::Italic, 0, 13}}, "@tests @tests",
|
check_fix_formatted_text("@tests @tests", {{td::MessageEntity::Type::Italic, 0, 13}}, "@tests @tests",
|
||||||
{{td::MessageEntity::Type::Mention, 0, 6},
|
{{td::MessageEntity::Type::Mention, 0, 6},
|
||||||
{td::MessageEntity::Type::Italic, 0, 6},
|
{td::MessageEntity::Type::Italic, 0, 6},
|
||||||
|
{td::MessageEntity::Type::Italic, 6, 1},
|
||||||
{td::MessageEntity::Type::Mention, 7, 6},
|
{td::MessageEntity::Type::Mention, 7, 6},
|
||||||
{td::MessageEntity::Type::Italic, 7, 6}});
|
{td::MessageEntity::Type::Italic, 7, 6}});
|
||||||
|
|
||||||
@ -1113,7 +1097,7 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
check_fix_formatted_text("example.com a", {{td::MessageEntity::Type::Italic, 0, 13}}, "example.com a",
|
check_fix_formatted_text("example.com a", {{td::MessageEntity::Type::Italic, 0, 13}}, "example.com a",
|
||||||
{{td::MessageEntity::Type::Url, 0, 11},
|
{{td::MessageEntity::Type::Url, 0, 11},
|
||||||
{td::MessageEntity::Type::Italic, 0, 11},
|
{td::MessageEntity::Type::Italic, 0, 11},
|
||||||
{td::MessageEntity::Type::Italic, 12, 1}});
|
{td::MessageEntity::Type::Italic, 11, 2}});
|
||||||
check_fix_formatted_text("a example.com", {{td::MessageEntity::Type::Italic, 0, 13}}, "a example.com",
|
check_fix_formatted_text("a example.com", {{td::MessageEntity::Type::Italic, 0, 13}}, "a example.com",
|
||||||
{{td::MessageEntity::Type::Italic, 0, 2},
|
{{td::MessageEntity::Type::Italic, 0, 2},
|
||||||
{td::MessageEntity::Type::Url, 2, 11},
|
{td::MessageEntity::Type::Url, 2, 11},
|
||||||
@ -1679,8 +1663,8 @@ TEST(MessageEntities, parse_markdown_v3) {
|
|||||||
|
|
||||||
check_parse_markdown_v3("__ __", " ", {{td::MessageEntity::Type::Italic, 0, 1}});
|
check_parse_markdown_v3("__ __", " ", {{td::MessageEntity::Type::Italic, 0, 1}});
|
||||||
check_parse_markdown_v3("__\n__", "\n", {{td::MessageEntity::Type::Italic, 0, 1}});
|
check_parse_markdown_v3("__\n__", "\n", {{td::MessageEntity::Type::Italic, 0, 1}});
|
||||||
check_parse_markdown_v3("__ __a", " a", {}, true);
|
check_parse_markdown_v3("__ __a", " a", {{td::MessageEntity::Type::Italic, 0, 1}}, true);
|
||||||
check_parse_markdown_v3("__\n__a", "\na", {}, true);
|
check_parse_markdown_v3("__\n__a", "\na", {{td::MessageEntity::Type::Italic, 0, 1}}, true);
|
||||||
check_parse_markdown_v3("**** __a__ **b** ~~c~~ ||d||", "**** a b c d",
|
check_parse_markdown_v3("**** __a__ **b** ~~c~~ ||d||", "**** a b c d",
|
||||||
{{td::MessageEntity::Type::Italic, 5, 1},
|
{{td::MessageEntity::Type::Italic, 5, 1},
|
||||||
{td::MessageEntity::Type::Bold, 7, 1},
|
{td::MessageEntity::Type::Bold, 7, 1},
|
||||||
|
Loading…
Reference in New Issue
Block a user