Allow \r and other empty entities as a separator between MarkdownV2 blockquotes.

This commit is contained in:
levlam 2024-01-10 14:36:27 +03:00
parent ce8a4b6d21
commit da031b3faa
2 changed files with 68 additions and 44 deletions

View File

@ -2013,12 +2013,16 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
vector<EntityInfo> nested_entities;
bool have_blockquote = false;
bool can_start_blockquote = true;
for (size_t i = 0; i < text.size(); i++) {
auto c = static_cast<unsigned char>(text[i]);
if (c == '\\' && text[i + 1] > 0 && text[i + 1] <= 126) {
i++;
utf16_offset += 1;
text[result_size++] = text[i];
if (text[i] != '\r') {
can_start_blockquote = (text[i] == '\n');
}
continue;
}
@ -2038,46 +2042,48 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
if (reserved_characters.find(text[i]) == Slice::npos) {
if (is_utf8_character_first_code_unit(c)) {
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
if (c != '\r') {
can_start_blockquote = false;
}
}
text[result_size++] = text[i];
continue;
}
bool is_end_of_an_entity = false;
if (!nested_entities.empty()) {
is_end_of_an_entity = [&] {
if (have_blockquote && c == '\n' && (i + 1 == text.size() || text[i + 1] != '>')) {
return true;
}
switch (nested_entities.back().type) {
case MessageEntity::Type::Bold:
return c == '*';
case MessageEntity::Type::Italic:
return c == '_' && text[i + 1] != '_';
case MessageEntity::Type::Code:
return c == '`';
case MessageEntity::Type::Pre:
case MessageEntity::Type::PreCode:
return c == '`' && text[i + 1] == '`' && text[i + 2] == '`';
case MessageEntity::Type::TextUrl:
return c == ']';
case MessageEntity::Type::Underline:
return c == '_' && text[i + 1] == '_';
case MessageEntity::Type::Strikethrough:
return c == '~';
case MessageEntity::Type::Spoiler:
return c == '|' && text[i + 1] == '|';
case MessageEntity::Type::CustomEmoji:
return c == ']';
case MessageEntity::Type::BlockQuote:
return false;
default:
UNREACHABLE();
return false;
}
}();
}
bool is_end_of_an_entity = [&] {
if (nested_entities.empty()) {
return false;
}
if (have_blockquote && c == '\n' && (i + 1 == text.size() || text[i + 1] != '>')) {
return true;
}
switch (nested_entities.back().type) {
case MessageEntity::Type::Bold:
return c == '*';
case MessageEntity::Type::Italic:
return c == '_' && text[i + 1] != '_';
case MessageEntity::Type::Code:
return c == '`';
case MessageEntity::Type::Pre:
case MessageEntity::Type::PreCode:
return c == '`' && text[i + 1] == '`' && text[i + 2] == '`';
case MessageEntity::Type::TextUrl:
return c == ']';
case MessageEntity::Type::Underline:
return c == '_' && text[i + 1] == '_';
case MessageEntity::Type::Strikethrough:
return c == '~';
case MessageEntity::Type::Spoiler:
return c == '|' && text[i + 1] == '|';
case MessageEntity::Type::CustomEmoji:
return c == ']';
case MessageEntity::Type::BlockQuote:
return false;
default:
UNREACHABLE();
return false;
}
}();
if (!is_end_of_an_entity) {
// begin of an entity
MessageEntity::Type type;
@ -2149,19 +2155,17 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
case '\n':
utf16_offset += 1;
text[result_size++] = '\n';
can_start_blockquote = true;
type = MessageEntity::Type::Size;
if (i + 1 < text.size() && text[i + 1] == '>') {
i++;
if (!have_blockquote) {
break;
case '>':
if (can_start_blockquote) {
if (have_blockquote) {
type = MessageEntity::Type::Size;
} else {
type = MessageEntity::Type::BlockQuote;
have_blockquote = true;
}
}
break;
case '>':
if (i == 0) {
type = MessageEntity::Type::BlockQuote;
have_blockquote = true;
} else {
return Status::Error(400, PSLICE() << "Character '" << text[i]
<< "' is reserved and must be escaped with the preceding '\\'");
@ -2258,6 +2262,7 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
CHECK(have_blockquote);
have_blockquote = false;
text[result_size++] = text[i];
can_start_blockquote = true;
utf16_offset += 1;
skip_entity = false;
break;

View File

@ -1424,6 +1424,7 @@ TEST(MessageEntities, parse_markdown) {
check_parse_markdown("🏟 🏟![πŸ‘](tg://emoji?test=1#&id=25)", "Custom emoji URL must have an emoji identifier");
check_parse_markdown("🏟 🏟![πŸ‘](tg://emoji?test=1231&id=025)", "Invalid custom emoji identifier specified");
check_parse_markdown(">*b\n>ld \n>bo\nld*\nasd\ndef", "Can't find end of Bold entity at byte offset 1");
check_parse_markdown(">\n*a*>2", "Character '>' is reserved and must be escaped with the preceding '\\'");
check_parse_markdown("", "", {});
check_parse_markdown("\\\\", "\\", {});
@ -1493,6 +1494,8 @@ TEST(MessageEntities, parse_markdown) {
check_parse_markdown("abc\n> \n> \n>\ndef", "abc\n \n \n\ndef", {{td::MessageEntity::Type::BlockQuote, 4, 5}});
check_parse_markdown(">", "", {});
check_parse_markdown(">a", "a", {{td::MessageEntity::Type::BlockQuote, 0, 1}});
check_parse_markdown("\r>a", "\ra", {{td::MessageEntity::Type::BlockQuote, 1, 1}});
check_parse_markdown("\r\r>\r\ra\r\n\r", "\r\r\r\ra\r\n\r", {{td::MessageEntity::Type::BlockQuote, 2, 5}});
check_parse_markdown(
">*bold _italic bold ~italic bold strikethrough ||italic bold strikethrough spoiler||~ __underline italic "
"bold___ bold*",
@ -1513,6 +1516,22 @@ TEST(MessageEntities, parse_markdown) {
{{td::MessageEntity::Type::Code, 0, 14}, {td::MessageEntity::Type::BlockQuote, 15, 4}});
check_parse_markdown(">1", "1", {{td::MessageEntity::Type::BlockQuote, 0, 1}});
check_parse_markdown(">\n1", "\n1", {{td::MessageEntity::Type::BlockQuote, 0, 1}});
check_parse_markdown(">\n\r>2", "\n\r2",
{{td::MessageEntity::Type::BlockQuote, 0, 1}, {td::MessageEntity::Type::BlockQuote, 2, 1}});
check_parse_markdown(">\n**>2", "\n2",
{{td::MessageEntity::Type::BlockQuote, 0, 1}, {td::MessageEntity::Type::BlockQuote, 1, 1}});
// check_parse_markdown("*>abcd*", "abcd",
// {{td::MessageEntity::Type::BlockQuote, 0, 4}, {td::MessageEntity::Type::Bold, 0, 4}});
check_parse_markdown(">*abcd*", "abcd",
{{td::MessageEntity::Type::BlockQuote, 0, 4}, {td::MessageEntity::Type::Bold, 0, 4}});
// check_parse_markdown(">*abcd\n*", "abcd\n",
// {{td::MessageEntity::Type::BlockQuote, 0, 5}, {td::MessageEntity::Type::Bold, 0, 5}});
check_parse_markdown(">*abcd*\n", "abcd\n",
{{td::MessageEntity::Type::BlockQuote, 0, 5}, {td::MessageEntity::Type::Bold, 0, 4}});
check_parse_markdown("*>abcd\n*", "abcd\n",
{{td::MessageEntity::Type::BlockQuote, 0, 5}, {td::MessageEntity::Type::Bold, 0, 5}});
check_parse_markdown("abc\n>def\n>def\n\r>ghi2\njkl", "abc\ndef\ndef\n\rghi2\njkl",
{{td::MessageEntity::Type::BlockQuote, 4, 8}, {td::MessageEntity::Type::BlockQuote, 13, 5}});
}
static void check_parse_markdown_v3(td::string text, td::vector<td::MessageEntity> entities,