Improve parse_markdown_v3 test and fix parse_text_url_entities_v3.

GitOrigin-RevId: 337b87cbb885f92f30c9c95603b0ac37693d104f
This commit is contained in:
levlam 2020-03-12 01:44:42 +03:00
parent ee0e60e097
commit ed7e486668
3 changed files with 52 additions and 38 deletions

View File

@ -1169,9 +1169,10 @@ static void check_is_sorted_impl(const vector<MessageEntity> &entities, int line
LOG_CHECK(std::is_sorted(entities.begin(), entities.end())) << line << " " << entities;
}
// Verifies that adjacent entities do not overlap: every entity must end at or
// before the offset of the entity that follows it in the vector.
static void check_non_intersecting(const vector<MessageEntity> &entities) {
// The macro forwards the call site's __LINE__ to the implementation.
#define check_non_intersecting(entities) check_non_intersecting_impl(entities, __LINE__)
static void check_non_intersecting_impl(const vector<MessageEntity> &entities, int line) {
// `i + 1 < size()` means the body never runs for empty or single-element vectors
for (size_t i = 0; i + 1 < entities.size(); i++) {
CHECK(entities[i].offset + entities[i].length <= entities[i + 1].offset);
// the line number and the entities are streamed into the check's message
LOG_CHECK(entities[i].offset + entities[i].length <= entities[i + 1].offset) << line << " " << entities;
}
}
@ -1273,6 +1274,11 @@ static bool are_entities_valid(const vector<MessageEntity> &entities) {
// continuous and blockquote can't be contained in continuous
return false;
}
if ((nested_entity_type_mask & get_splittable_entities_mask()) != 0) {
// the previous nested entity may need to be split for consistency
// alternatively, better entity merging needs to be implemented
return false;
}
}
if (is_splittable_entity(entity.type)) {
@ -1955,16 +1961,16 @@ static FormattedText parse_text_url_entities_v3(Slice text, vector<MessageEntity
result_text_utf16_length += max_end - part_begin;
}
size_t splittable_entity_pos[SPLITTABLE_ENTITY_TYPE_COUNT] = {};
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
check_non_intersecting(part_splittable_entities[index]);
}
if (part_end != max_end) {
// try to find text_url entities in the left part
auto parsed_part_text = utf8_utf16_substr(text, 0, part_end - max_end);
text = text.substr(parsed_part_text.size());
size_t splittable_entity_pos[SPLITTABLE_ENTITY_TYPE_COUNT] = {};
vector<Slice> text_urls = find_text_url_entities_v3(parsed_part_text);
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
check_non_intersecting(part_splittable_entities[index]);
}
int32 text_utf16_offset = max_end;
size_t prev_pos = 0;
@ -2047,39 +2053,37 @@ static FormattedText parse_text_url_entities_v3(Slice text, vector<MessageEntity
result.text.append(parsed_part_text.begin() + prev_pos, parsed_part_text.size() - prev_pos);
result_text_utf16_length += part_end - text_utf16_offset;
}
// now add all splittable entities from [text_utf16_offset, part_end)
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
auto &pos = splittable_entity_pos[index];
auto &splittable_entities = part_splittable_entities[index];
while (pos < splittable_entities.size() && splittable_entities[pos].offset < part_end) {
if (splittable_entities[pos].offset + splittable_entities[pos].length > part_end) {
// begins before end of the segment, but ends after it
// need to keep the entity for future segments, so split the entity
// entities don't intersect each other, so there can be at most one such entity
result.entities.emplace_back(splittable_entities[pos].type,
splittable_entities[pos].offset - skipped_length,
part_end - splittable_entities[pos].offset);
// now add all left splittable entities from [part_begin, part_end)
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
auto &pos = splittable_entity_pos[index];
auto &splittable_entities = part_splittable_entities[index];
while (pos < splittable_entities.size() && splittable_entities[pos].offset < part_end) {
if (splittable_entities[pos].offset + splittable_entities[pos].length > part_end) {
// begins before end of the segment, but ends after it
// need to keep the entity for future segments, so split the entity
// entities don't intersect each other, so there can be at most one such entity
result.entities.emplace_back(splittable_entities[pos].type, splittable_entities[pos].offset - skipped_length,
part_end - splittable_entities[pos].offset);
splittable_entities[pos].length =
splittable_entities[pos].offset + splittable_entities[pos].length - part_end;
splittable_entities[pos].offset = part_end;
} else {
result.entities.emplace_back(splittable_entities[pos].type,
splittable_entities[pos].offset - skipped_length,
splittable_entities[pos].length);
pos++;
}
}
if (pos == splittable_entities.size()) {
splittable_entities.clear();
splittable_entities[pos].length =
splittable_entities[pos].offset + splittable_entities[pos].length - part_end;
splittable_entities[pos].offset = part_end;
} else {
CHECK(pos == splittable_entities.size() - 1);
CHECK(!text.empty());
splittable_entities[0] = std::move(splittable_entities.back());
splittable_entities.resize(1);
result.entities.emplace_back(splittable_entities[pos].type, splittable_entities[pos].offset - skipped_length,
splittable_entities[pos].length);
pos++;
}
}
if (pos == splittable_entities.size()) {
splittable_entities.clear();
} else {
CHECK(pos == splittable_entities.size() - 1);
CHECK(!text.empty());
splittable_entities[0] = std::move(splittable_entities.back());
splittable_entities.resize(1);
}
}
part_begin = part_end;

View File

@ -2694,7 +2694,8 @@ class CliClient final : public Actor {
} else if (op == "gtes") {
execute(td_api::make_object<td_api::getTextEntities>(args));
} else if (op == "pm") {
send_request(td_api::make_object<td_api::parseMarkdown>(as_formatted_text(args)));
send_request(
td_api::make_object<td_api::parseMarkdown>(td_api::make_object<td_api::formattedText>(args, Auto())));
} else if (op == "pte") {
send_request(
td_api::make_object<td_api::parseTextEntities>(args, td_api::make_object<td_api::textParseModeMarkdown>(2)));

View File

@ -1502,7 +1502,7 @@ TEST(MessageEntities, parse_markdown_v3) {
{td::MessageEntity::Type::Italic, 123, 17},
{td::MessageEntity::Type::Bold, 129, 15}});
td::vector<td::string> parts{"a", " #test ", "__", "**", "~~", "[", "](t.me)", "`"};
td::vector<td::string> parts{"a", " #test__a", "__", "**", "~~", "[", "](t.me)", "`"};
td::vector<td::MessageEntity::Type> types{
td::MessageEntity::Type::Bold, td::MessageEntity::Type::Italic, td::MessageEntity::Type::Underline,
td::MessageEntity::Type::Strikethrough, td::MessageEntity::Type::Code, td::MessageEntity::Type::Pre,
@ -1527,7 +1527,16 @@ TEST(MessageEntities, parse_markdown_v3) {
entities.emplace_back(type, offset, length);
}
ASSERT_TRUE(fix_formatted_text(str, entities, true, true, true, true).is_ok());
td::parse_markdown_v3({std::move(str), std::move(entities)});
td::FormattedText text{std::move(str), std::move(entities)};
while (true) {
ASSERT_TRUE(fix_formatted_text(text.text, text.entities, true, true, true, true).is_ok());
auto parsed_text = td::parse_markdown_v3(text);
ASSERT_TRUE(fix_formatted_text(parsed_text.text, parsed_text.entities, true, true, true, true).is_ok());
if (parsed_text == text) {
break;
}
text = std::move(parsed_text);
}
ASSERT_EQ(text, td::parse_markdown_v3(text));
}
}