Add BlockQuote parsing in MarkdownV2.

This commit is contained in:
levlam 2023-10-30 18:18:13 +03:00
parent bdbee0765b
commit ea450564b0
2 changed files with 83 additions and 3 deletions

View File

@ -2010,8 +2010,9 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
, entity_begin_pos(entity_begin_pos) {
}
};
std::vector<EntityInfo> nested_entities;
vector<EntityInfo> nested_entities;
bool have_blockquote = false;
for (size_t i = 0; i < text.size(); i++) {
auto c = static_cast<unsigned char>(text[i]);
if (c == '\\' && text[i + 1] > 0 && text[i + 1] <= 126) {
@ -2021,7 +2022,7 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
continue;
}
Slice reserved_characters("_*[]()~`>#+-=|{}.!");
Slice reserved_characters("_*[]()~`>#+-=|{}.!\n");
if (!nested_entities.empty()) {
switch (nested_entities.back().type) {
case MessageEntity::Type::Code:
@ -2045,6 +2046,9 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
bool is_end_of_an_entity = false;
if (!nested_entities.empty()) {
is_end_of_an_entity = [&] {
if (have_blockquote && c == '\n' && (i + 1 == text.size() || text[i + 1] != '>')) {
return true;
}
switch (nested_entities.back().type) {
case MessageEntity::Type::Bold:
return c == '*';
@ -2065,6 +2069,8 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
return c == '|' && text[i + 1] == '|';
case MessageEntity::Type::CustomEmoji:
return c == ']';
case MessageEntity::Type::BlockQuote:
return false;
default:
UNREACHABLE();
return false;
@ -2140,14 +2146,42 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
<< "' is reserved and must be escaped with the preceding '\\'");
}
break;
case '\n':
utf16_offset += 1;
text[result_size++] = '\n';
type = MessageEntity::Type::Size;
if (i + 1 < text.size() && text[i + 1] == '>') {
i++;
if (!have_blockquote) {
type = MessageEntity::Type::BlockQuote;
have_blockquote = true;
}
}
break;
case '>':
if (i == 0) {
type = MessageEntity::Type::BlockQuote;
have_blockquote = true;
} else {
return Status::Error(400, PSLICE() << "Character '" << text[i]
<< "' is reserved and must be escaped with the preceding '\\'");
}
break;
default:
return Status::Error(
400, PSLICE() << "Character '" << text[i] << "' is reserved and must be escaped with the preceding '\\'");
}
if (type == MessageEntity::Type::Size) {
continue;
}
nested_entities.emplace_back(type, std::move(argument), utf16_offset, entity_byte_offset, result_size);
} else {
// end of an entity
auto type = nested_entities.back().type;
if (c == '\n' && type != MessageEntity::Type::BlockQuote) {
return Status::Error(400, PSLICE() << "Can't find end of " << nested_entities.back().type
<< " entity at byte offset " << nested_entities.back().entity_byte_offset);
}
auto argument = std::move(nested_entities.back().argument);
UserId user_id;
CustomEmojiId custom_emoji_id;
@ -2220,6 +2254,12 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
TRY_RESULT_ASSIGN(custom_emoji_id, LinkManager::get_link_custom_emoji_id(url));
break;
}
case MessageEntity::Type::BlockQuote:
CHECK(have_blockquote);
have_blockquote = false;
text[result_size++] = text[i];
utf16_offset += 1;
break;
default:
UNREACHABLE();
return false;
@ -2239,6 +2279,18 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
nested_entities.pop_back();
}
}
if (have_blockquote) {
CHECK(!nested_entities.empty());
if (nested_entities.back().type == MessageEntity::Type::BlockQuote) {
have_blockquote = false;
auto entity_offset = nested_entities.back().entity_offset;
auto entity_length = utf16_offset - entity_offset;
if (entity_length != 0) {
entities.emplace_back(MessageEntity::Type::BlockQuote, entity_offset, entity_length);
}
nested_entities.pop_back();
}
}
if (!nested_entities.empty()) {
return Status::Error(400, PSLICE() << "Can't find end of " << nested_entities.back().type
<< " entity at byte offset " << nested_entities.back().entity_byte_offset);

View File

@ -1375,6 +1375,9 @@ TEST(MessageEntities, parse_html) {
static void check_parse_markdown(td::string text, const td::string &result,
const td::vector<td::MessageEntity> &entities) {
auto r_entities = td::parse_markdown_v2(text);
if (r_entities.is_error()) {
LOG(ERROR) << r_entities.error();
}
ASSERT_TRUE(r_entities.is_ok());
ASSERT_EQ(entities, r_entities.ok());
ASSERT_STREQ(result, text);
@ -1389,7 +1392,7 @@ static void check_parse_markdown(td::string text, td::Slice error_message) {
TEST(MessageEntities, parse_markdown) {
td::Slice reserved_characters("]()>#+-=|{}.!");
td::Slice begin_characters("_*[~`");
td::Slice begin_characters("_*[~`>");
for (char c = 1; c < 126; c++) {
if (begin_characters.find(c) != td::Slice::npos) {
continue;
@ -1426,6 +1429,7 @@ TEST(MessageEntities, parse_markdown) {
check_parse_markdown("🏟 🏟__", "Can't find end of Underline entity at byte offset 9");
check_parse_markdown("🏟 🏟||test\\|", "Can't find end of Spoiler entity at byte offset 9");
check_parse_markdown("🏟 🏟!", "Character '!' is reserved and must be escaped with the preceding '\\'");
check_parse_markdown("🏟 🏟>", "Character '>' is reserved and must be escaped with the preceding '\\'");
check_parse_markdown("🏟 🏟![", "Can't find end of CustomEmoji entity at byte offset 9");
check_parse_markdown("🏟 🏟![👍", "Can't find end of CustomEmoji entity at byte offset 9");
check_parse_markdown("🏟 🏟![👍]", "Custom emoji entity must contain a tg://emoji URL");
@ -1435,6 +1439,7 @@ TEST(MessageEntities, parse_markdown) {
check_parse_markdown("🏟 🏟![👍](tg://emoji#test)", "Custom emoji URL must have an emoji identifier");
check_parse_markdown("🏟 🏟![👍](tg://emoji?test=1#&id=25)", "Custom emoji URL must have an emoji identifier");
check_parse_markdown("🏟 🏟![👍](tg://emoji?test=1231&id=025)", "Invalid custom emoji identifier specified");
check_parse_markdown(">*b\n>ld \n>bo\nld*\nasd\ndef", "Can't find end of Bold entity at byte offset 1");
check_parse_markdown("", "", {});
check_parse_markdown("\\\\", "\\", {});
@ -1499,6 +1504,29 @@ TEST(MessageEntities, parse_markdown) {
{{0, 12, td::UserId(static_cast<td::int64>(123456))}});
check_parse_markdown("🏟 🏟![👍](TG://EMoJI/?test=1231&id=25#id=32)a", "🏟 🏟👍a",
{{td::MessageEntity::Type::CustomEmoji, 5, 2, td::CustomEmojiId(static_cast<td::int64>(25))}});
check_parse_markdown("> \n> \n>", " \n \n", {{td::MessageEntity::Type::BlockQuote, 0, 4}});
check_parse_markdown("> \\>\n \\> \n>", " >\n > \n", {{td::MessageEntity::Type::BlockQuote, 0, 3}});
check_parse_markdown("abc\n> \n> \n>\ndef", "abc\n \n \n\ndef", {{td::MessageEntity::Type::BlockQuote, 4, 5}});
check_parse_markdown(">", "", {});
check_parse_markdown(">a", "a", {{td::MessageEntity::Type::BlockQuote, 0, 1}});
check_parse_markdown(
">*bold _italic bold ~italic bold strikethrough ||italic bold strikethrough spoiler||~ __underline italic "
"bold___ bold*",
"bold italic bold italic bold strikethrough italic bold strikethrough spoiler underline italic bold bold",
{{td::MessageEntity::Type::BlockQuote, 0, 103},
{td::MessageEntity::Type::Bold, 0, 103},
{td::MessageEntity::Type::Italic, 5, 93},
{td::MessageEntity::Type::Strikethrough, 17, 59},
{td::MessageEntity::Type::Spoiler, 43, 33},
{td::MessageEntity::Type::Underline, 77, 21}});
check_parse_markdown(">*b\n>ld \n>bo\n>ld*\nasd\ndef", "b\nld \nbo\nld\nasd\ndef",
{{td::MessageEntity::Type::BlockQuote, 0, 12}, {td::MessageEntity::Type::Bold, 0, 11}});
check_parse_markdown("*a\n>b\n>ld \n>bo\n>ld\nasd*\ndef", "a\nb\nld \nbo\nld\nasd\ndef",
{{td::MessageEntity::Type::Bold, 0, 17}, {td::MessageEntity::Type::BlockQuote, 2, 12}});
check_parse_markdown(">`b\n>ld \n>bo\nld`\n>asd\ndef", "b\n>ld \n>bo\nld\nasd\ndef",
{{td::MessageEntity::Type::BlockQuote, 0, 18}, {td::MessageEntity::Type::Code, 0, 13}});
check_parse_markdown("`>b\n>ld \n>bo\nld`\n>asd\ndef", ">b\n>ld \n>bo\nld\nasd\ndef",
{{td::MessageEntity::Type::Code, 0, 14}, {td::MessageEntity::Type::BlockQuote, 15, 4}});
}
static void check_parse_markdown_v3(td::string text, td::vector<td::MessageEntity> entities,