Improve parse_markdown_v3 test and fix parse_text_url_entities_v3.
GitOrigin-RevId: 337b87cbb885f92f30c9c95603b0ac37693d104f
This commit is contained in:
parent
ee0e60e097
commit
ed7e486668
@ -1169,9 +1169,10 @@ static void check_is_sorted_impl(const vector<MessageEntity> &entities, int line
|
||||
LOG_CHECK(std::is_sorted(entities.begin(), entities.end())) << line << " " << entities;
|
||||
}
|
||||
|
||||
static void check_non_intersecting(const vector<MessageEntity> &entities) {
|
||||
#define check_non_intersecting(entities) check_non_intersecting_impl(entities, __LINE__)
|
||||
static void check_non_intersecting_impl(const vector<MessageEntity> &entities, int line) {
|
||||
for (size_t i = 0; i + 1 < entities.size(); i++) {
|
||||
CHECK(entities[i].offset + entities[i].length <= entities[i + 1].offset);
|
||||
LOG_CHECK(entities[i].offset + entities[i].length <= entities[i + 1].offset) << line << " " << entities;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1273,6 +1274,11 @@ static bool are_entities_valid(const vector<MessageEntity> &entities) {
|
||||
// continuous and blockquote can't be contained in continuous
|
||||
return false;
|
||||
}
|
||||
if ((nested_entity_type_mask & get_splittable_entities_mask()) != 0) {
|
||||
// the previous nested entity may need to be split for consistency
|
||||
// alternatively, better entity merging needs to be implemented
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_splittable_entity(entity.type)) {
|
||||
@ -1955,16 +1961,16 @@ static FormattedText parse_text_url_entities_v3(Slice text, vector<MessageEntity
|
||||
result_text_utf16_length += max_end - part_begin;
|
||||
}
|
||||
|
||||
size_t splittable_entity_pos[SPLITTABLE_ENTITY_TYPE_COUNT] = {};
|
||||
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
|
||||
check_non_intersecting(part_splittable_entities[index]);
|
||||
}
|
||||
if (part_end != max_end) {
|
||||
// try to find text_url entities in the left part
|
||||
auto parsed_part_text = utf8_utf16_substr(text, 0, part_end - max_end);
|
||||
text = text.substr(parsed_part_text.size());
|
||||
|
||||
size_t splittable_entity_pos[SPLITTABLE_ENTITY_TYPE_COUNT] = {};
|
||||
vector<Slice> text_urls = find_text_url_entities_v3(parsed_part_text);
|
||||
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
|
||||
check_non_intersecting(part_splittable_entities[index]);
|
||||
}
|
||||
|
||||
int32 text_utf16_offset = max_end;
|
||||
size_t prev_pos = 0;
|
||||
@ -2047,39 +2053,37 @@ static FormattedText parse_text_url_entities_v3(Slice text, vector<MessageEntity
|
||||
|
||||
result.text.append(parsed_part_text.begin() + prev_pos, parsed_part_text.size() - prev_pos);
|
||||
result_text_utf16_length += part_end - text_utf16_offset;
|
||||
}
|
||||
|
||||
// now add all splittable entities from [text_utf16_offset, part_end)
|
||||
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
|
||||
auto &pos = splittable_entity_pos[index];
|
||||
auto &splittable_entities = part_splittable_entities[index];
|
||||
while (pos < splittable_entities.size() && splittable_entities[pos].offset < part_end) {
|
||||
if (splittable_entities[pos].offset + splittable_entities[pos].length > part_end) {
|
||||
// begins before end of the segment, but ends after it
|
||||
// need to keep the entity for future segments, so split the entity
|
||||
// entities don't intersect each other, so there can be at most one such entity
|
||||
result.entities.emplace_back(splittable_entities[pos].type,
|
||||
splittable_entities[pos].offset - skipped_length,
|
||||
part_end - splittable_entities[pos].offset);
|
||||
// now add all left splittable entities from [part_begin, part_end)
|
||||
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
|
||||
auto &pos = splittable_entity_pos[index];
|
||||
auto &splittable_entities = part_splittable_entities[index];
|
||||
while (pos < splittable_entities.size() && splittable_entities[pos].offset < part_end) {
|
||||
if (splittable_entities[pos].offset + splittable_entities[pos].length > part_end) {
|
||||
// begins before end of the segment, but ends after it
|
||||
// need to keep the entity for future segments, so split the entity
|
||||
// entities don't intersect each other, so there can be at most one such entity
|
||||
result.entities.emplace_back(splittable_entities[pos].type, splittable_entities[pos].offset - skipped_length,
|
||||
part_end - splittable_entities[pos].offset);
|
||||
|
||||
splittable_entities[pos].length =
|
||||
splittable_entities[pos].offset + splittable_entities[pos].length - part_end;
|
||||
splittable_entities[pos].offset = part_end;
|
||||
} else {
|
||||
result.entities.emplace_back(splittable_entities[pos].type,
|
||||
splittable_entities[pos].offset - skipped_length,
|
||||
splittable_entities[pos].length);
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
if (pos == splittable_entities.size()) {
|
||||
splittable_entities.clear();
|
||||
splittable_entities[pos].length =
|
||||
splittable_entities[pos].offset + splittable_entities[pos].length - part_end;
|
||||
splittable_entities[pos].offset = part_end;
|
||||
} else {
|
||||
CHECK(pos == splittable_entities.size() - 1);
|
||||
CHECK(!text.empty());
|
||||
splittable_entities[0] = std::move(splittable_entities.back());
|
||||
splittable_entities.resize(1);
|
||||
result.entities.emplace_back(splittable_entities[pos].type, splittable_entities[pos].offset - skipped_length,
|
||||
splittable_entities[pos].length);
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
if (pos == splittable_entities.size()) {
|
||||
splittable_entities.clear();
|
||||
} else {
|
||||
CHECK(pos == splittable_entities.size() - 1);
|
||||
CHECK(!text.empty());
|
||||
splittable_entities[0] = std::move(splittable_entities.back());
|
||||
splittable_entities.resize(1);
|
||||
}
|
||||
}
|
||||
|
||||
part_begin = part_end;
|
||||
|
@ -2694,7 +2694,8 @@ class CliClient final : public Actor {
|
||||
} else if (op == "gtes") {
|
||||
execute(td_api::make_object<td_api::getTextEntities>(args));
|
||||
} else if (op == "pm") {
|
||||
send_request(td_api::make_object<td_api::parseMarkdown>(as_formatted_text(args)));
|
||||
send_request(
|
||||
td_api::make_object<td_api::parseMarkdown>(td_api::make_object<td_api::formattedText>(args, Auto())));
|
||||
} else if (op == "pte") {
|
||||
send_request(
|
||||
td_api::make_object<td_api::parseTextEntities>(args, td_api::make_object<td_api::textParseModeMarkdown>(2)));
|
||||
|
@ -1502,7 +1502,7 @@ TEST(MessageEntities, parse_markdown_v3) {
|
||||
{td::MessageEntity::Type::Italic, 123, 17},
|
||||
{td::MessageEntity::Type::Bold, 129, 15}});
|
||||
|
||||
td::vector<td::string> parts{"a", " #test ", "__", "**", "~~", "[", "](t.me)", "`"};
|
||||
td::vector<td::string> parts{"a", " #test__a", "__", "**", "~~", "[", "](t.me)", "`"};
|
||||
td::vector<td::MessageEntity::Type> types{
|
||||
td::MessageEntity::Type::Bold, td::MessageEntity::Type::Italic, td::MessageEntity::Type::Underline,
|
||||
td::MessageEntity::Type::Strikethrough, td::MessageEntity::Type::Code, td::MessageEntity::Type::Pre,
|
||||
@ -1527,7 +1527,16 @@ TEST(MessageEntities, parse_markdown_v3) {
|
||||
entities.emplace_back(type, offset, length);
|
||||
}
|
||||
|
||||
ASSERT_TRUE(fix_formatted_text(str, entities, true, true, true, true).is_ok());
|
||||
td::parse_markdown_v3({std::move(str), std::move(entities)});
|
||||
td::FormattedText text{std::move(str), std::move(entities)};
|
||||
while (true) {
|
||||
ASSERT_TRUE(fix_formatted_text(text.text, text.entities, true, true, true, true).is_ok());
|
||||
auto parsed_text = td::parse_markdown_v3(text);
|
||||
ASSERT_TRUE(fix_formatted_text(parsed_text.text, parsed_text.entities, true, true, true, true).is_ok());
|
||||
if (parsed_text == text) {
|
||||
break;
|
||||
}
|
||||
text = std::move(parsed_text);
|
||||
}
|
||||
ASSERT_EQ(text, td::parse_markdown_v3(text));
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user