Add td_api::getMarkdownText.

GitOrigin-RevId: b463cc2c92052c552d66d774450ffa7bb4bc132e
This commit is contained in:
levlam 2020-03-12 06:22:14 +03:00
parent e6efab096a
commit 9c67f42682
7 changed files with 263 additions and 40 deletions

View File

@ -3428,6 +3428,9 @@ parseTextEntities text:string parse_mode:TextParseMode = FormattedText;
//@text The text to parse. For example, "__italic__ ~~strikethrough~~ **bold** `code` ```pre``` __[italic__ text_url](telegram.org) __italic**bold italic__bold**"
parseMarkdown text:formattedText = FormattedText;
//@description Replaces text entities with Markdown formatting in a human-friendly format. Entities that can't be represented in Markdown unambiguously are kept as is. This is an offline method. Can be called before authorization. Can be called synchronously @text The text
getMarkdownText text:formattedText = FormattedText;
//@description Returns the MIME type of a file, guessed by its extension. Returns an empty string on failure. This is an offline method. Can be called before authorization. Can be called synchronously @file_name The name of the file or path to the file
getFileMimeType file_name:string = Text;

Binary file not shown.

View File

@ -1204,6 +1204,12 @@ static constexpr int32 get_pre_entities_mask() {
get_entity_type_mask(MessageEntity::Type::PreCode);
}
// Returns the combined bit mask of the entity types treated as "user" entities:
// pre entities, MentionName, TextUrl, blockquote entities and splittable entities.
static constexpr int32 get_user_entities_mask() {
  return get_pre_entities_mask() | get_entity_type_mask(MessageEntity::Type::MentionName) |
         get_entity_type_mask(MessageEntity::Type::TextUrl) | get_blockquote_entities_mask() |
         get_splittable_entities_mask();
}
// Returns a non-zero value if the bit of the given entity type is set in the mask of splittable entities.
static int32 is_splittable_entity(MessageEntity::Type type) {
  const auto type_mask = get_entity_type_mask(type);
  const bool is_splittable = (type_mask & get_splittable_entities_mask()) != 0;
  return is_splittable;
}
@ -1220,6 +1226,10 @@ static int32 is_pre_entity(MessageEntity::Type type) {
return (get_entity_type_mask(type) & get_pre_entities_mask()) != 0;
}
// Returns a non-zero value if the bit of the given entity type is set in the mask returned by
// get_user_entities_mask().
static int32 is_user_entity(MessageEntity::Type type) {
  const auto type_mask = get_entity_type_mask(type);
  const bool is_user = (type_mask & get_user_entities_mask()) != 0;
  return is_user;
}
static constexpr size_t SPLITTABLE_ENTITY_TYPE_COUNT = 4;
static size_t get_splittable_entity_type_index(MessageEntity::Type type) {
@ -2118,6 +2128,13 @@ static vector<MessageEntity> find_splittable_entities_v3(Slice text, const vecto
for (auto &entity : entities) {
unallowed_boundaries.insert(entity.offset);
unallowed_boundaries.insert(entity.offset + entity.length);
if (entity.type == MessageEntity::Type::Mention || entity.type == MessageEntity::Type::Hashtag ||
entity.type == MessageEntity::Type::BotCommand || entity.type == MessageEntity::Type::Cashtag ||
entity.type == MessageEntity::Type::PhoneNumber || entity.type == MessageEntity::Type::BankCardNumber) {
for (int32 i = 1; i < entity.length; i++) {
unallowed_boundaries.insert(entity.offset + i);
}
}
}
auto found_entities = find_entities(text, false, false);
@ -2440,6 +2457,128 @@ FormattedText parse_markdown_v3(FormattedText text) {
return result;
}
// Replaces Italic, Bold, Strikethrough, TextUrl, Code and Pre entities of the text with Markdown markup;
// all other entities are kept, with their offset and length adjusted for the inserted markup.
// The text is returned unchanged if it has no entities, if it contains an entity for which is_user_entity
// is false, or if parse_markdown_v3 applied to the produced text doesn't give the original text back.
// text entities must be valid
FormattedText get_markdown_v3(FormattedText text) {
  if (text.entities.empty()) {
    return text;
  }

  check_is_sorted(text.entities);
  for (auto &entity : text.entities) {
    if (!is_user_entity(entity.type)) {
      return text;
    }
  }

  FormattedText result;
  struct EntityInfo {
    const MessageEntity *entity;
    int32 utf16_added_before;  // value of utf16_added right after the entity was opened

    EntityInfo(const MessageEntity *entity, int32 utf16_added_before)
        : entity(entity), utf16_added_before(utf16_added_before) {
    }
  };

  // returns the number of UTF-16 code units in a UTF-8 encoded string; entity offsets and lengths are
  // compared against UTF-16 positions below, so inserted text must be counted in the same units
  auto get_utf16_length = [](const string &str) {
    int32 length = 0;
    for (auto ch : str) {
      auto code_unit = static_cast<unsigned char>(ch);
      if (is_utf8_character_first_code_unit(code_unit)) {
        length += 1 + (code_unit >= 0xf0);  // >= 4 bytes in symbol => surrogate pair
      }
    }
    return length;
  };

  vector<EntityInfo> nested_entities_stack;
  size_t current_entity = 0;

  int32 utf16_offset = 0;  // position in the original text
  int32 utf16_added = 0;   // number of UTF-16 code units of markup inserted so far
  // the loop intentionally visits pos == text.text.size() to close entities ending at the end of the text
  for (size_t pos = 0; pos <= text.text.size(); pos++) {
    auto c = static_cast<unsigned char>(text.text[pos]);
    if (is_utf8_character_first_code_unit(c)) {
      // close entities ending at the current position, the most recently opened one first
      while (!nested_entities_stack.empty()) {
        const auto *entity = nested_entities_stack.back().entity;
        auto entity_end = entity->offset + entity->length;
        if (utf16_offset < entity_end) {
          break;
        }

        CHECK(utf16_offset == entity_end);

        switch (entity->type) {
          case MessageEntity::Type::Italic:
            result.text += "__";
            utf16_added += 2;
            break;
          case MessageEntity::Type::Bold:
            result.text += "**";
            utf16_added += 2;
            break;
          case MessageEntity::Type::Strikethrough:
            result.text += "~~";
            utf16_added += 2;
            break;
          case MessageEntity::Type::TextUrl:
            result.text += "](";
            result.text += entity->argument;
            result.text += ')';
            // count the URL in UTF-16 code units, not in bytes
            utf16_added += 3 + get_utf16_length(entity->argument);
            break;
          case MessageEntity::Type::Code:
            result.text += '`';
            utf16_added++;
            break;
          case MessageEntity::Type::Pre:
            result.text += "```";
            utf16_added += 3;
            break;
          default:
            // the entity has no markup; keep it, shifted by the markup inserted before it and
            // extended by the markup inserted inside it
            result.entities.push_back(*entity);
            result.entities.back().offset += nested_entities_stack.back().utf16_added_before;
            result.entities.back().length += utf16_added - nested_entities_stack.back().utf16_added_before;
            break;
        }
        nested_entities_stack.pop_back();
      }

      // open entities beginning at the current position
      while (current_entity < text.entities.size() && utf16_offset >= text.entities[current_entity].offset) {
        CHECK(utf16_offset == text.entities[current_entity].offset);
        switch (text.entities[current_entity].type) {
          case MessageEntity::Type::Italic:
            result.text += "__";
            utf16_added += 2;
            break;
          case MessageEntity::Type::Bold:
            result.text += "**";
            utf16_added += 2;
            break;
          case MessageEntity::Type::Strikethrough:
            result.text += "~~";
            utf16_added += 2;
            break;
          case MessageEntity::Type::TextUrl:
            result.text += '[';
            utf16_added++;
            break;
          case MessageEntity::Type::Code:
            result.text += '`';
            utf16_added++;
            break;
          case MessageEntity::Type::Pre:
            result.text += "```";
            utf16_added += 3;
            break;
          default:
            // no opening markup; the entity is emitted as is when it is closed
            break;
        }
        nested_entities_stack.emplace_back(&text.entities[current_entity++], utf16_added);
      }
      utf16_offset += 1 + (c >= 0xf0);  // >= 4 bytes in symbol => surrogate pair
    }
    if (pos == text.text.size()) {
      break;
    }

    result.text.push_back(text.text[pos]);
  }

  sort_entities(result.entities);
  // use the Markdown representation only if it can be parsed back to exactly the original text
  if (parse_markdown_v3(result) != text) {
    return text;
  }
  return result;
}
static uint32 decode_html_entity(CSlice text, size_t &pos) {
auto c = static_cast<unsigned char>(text[pos]);
if (c != '&') {
@ -2730,16 +2869,10 @@ vector<tl_object_ptr<telegram_api::MessageEntity>> get_input_message_entities(co
const char *source) {
vector<tl_object_ptr<telegram_api::MessageEntity>> result;
for (auto &entity : entities) {
if (!is_user_entity(entity.type)) {
continue;
}
switch (entity.type) {
case MessageEntity::Type::Mention:
case MessageEntity::Type::Hashtag:
case MessageEntity::Type::BotCommand:
case MessageEntity::Type::Url:
case MessageEntity::Type::EmailAddress:
case MessageEntity::Type::Cashtag:
case MessageEntity::Type::PhoneNumber:
case MessageEntity::Type::BankCardNumber:
continue;
case MessageEntity::Type::Bold:
result.push_back(make_tl_object<telegram_api::messageEntityBold>(entity.offset, entity.length));
break;
@ -2775,6 +2908,14 @@ vector<tl_object_ptr<telegram_api::MessageEntity>> get_input_message_entities(co
std::move(input_user)));
break;
}
case MessageEntity::Type::Mention:
case MessageEntity::Type::Hashtag:
case MessageEntity::Type::BotCommand:
case MessageEntity::Type::Url:
case MessageEntity::Type::EmailAddress:
case MessageEntity::Type::Cashtag:
case MessageEntity::Type::PhoneNumber:
case MessageEntity::Type::BankCardNumber:
default:
UNREACHABLE();
}
@ -2872,44 +3013,28 @@ Result<vector<MessageEntity>> get_message_entities(const ContactsManager *contac
switch (entity->type_->get_id()) {
case td_api::textEntityTypeMention::ID:
if (allow_all) {
entities.emplace_back(MessageEntity::Type::Mention, entity->offset_, entity->length_);
}
entities.emplace_back(MessageEntity::Type::Mention, entity->offset_, entity->length_);
break;
case td_api::textEntityTypeHashtag::ID:
if (allow_all) {
entities.emplace_back(MessageEntity::Type::Hashtag, entity->offset_, entity->length_);
}
entities.emplace_back(MessageEntity::Type::Hashtag, entity->offset_, entity->length_);
break;
case td_api::textEntityTypeBotCommand::ID:
if (allow_all) {
entities.emplace_back(MessageEntity::Type::BotCommand, entity->offset_, entity->length_);
}
entities.emplace_back(MessageEntity::Type::BotCommand, entity->offset_, entity->length_);
break;
case td_api::textEntityTypeUrl::ID:
if (allow_all) {
entities.emplace_back(MessageEntity::Type::Url, entity->offset_, entity->length_);
}
entities.emplace_back(MessageEntity::Type::Url, entity->offset_, entity->length_);
break;
case td_api::textEntityTypeEmailAddress::ID:
if (allow_all) {
entities.emplace_back(MessageEntity::Type::EmailAddress, entity->offset_, entity->length_);
}
entities.emplace_back(MessageEntity::Type::EmailAddress, entity->offset_, entity->length_);
break;
case td_api::textEntityTypeCashtag::ID:
if (allow_all) {
entities.emplace_back(MessageEntity::Type::Cashtag, entity->offset_, entity->length_);
}
entities.emplace_back(MessageEntity::Type::Cashtag, entity->offset_, entity->length_);
break;
case td_api::textEntityTypePhoneNumber::ID:
if (allow_all) {
entities.emplace_back(MessageEntity::Type::PhoneNumber, entity->offset_, entity->length_);
}
entities.emplace_back(MessageEntity::Type::PhoneNumber, entity->offset_, entity->length_);
break;
case td_api::textEntityTypeBankCardNumber::ID:
if (allow_all) {
entities.emplace_back(MessageEntity::Type::BankCardNumber, entity->offset_, entity->length_);
}
entities.emplace_back(MessageEntity::Type::BankCardNumber, entity->offset_, entity->length_);
break;
case td_api::textEntityTypeBold::ID:
entities.emplace_back(MessageEntity::Type::Bold, entity->offset_, entity->length_);
@ -2962,6 +3087,10 @@ Result<vector<MessageEntity>> get_message_entities(const ContactsManager *contac
default:
UNREACHABLE();
}
CHECK(!entities.empty());
if (!allow_all && !is_user_entity(entities.back().type)) {
entities.pop_back();
}
}
return entities;
}

View File

@ -151,6 +151,8 @@ Result<vector<MessageEntity>> parse_markdown_v2(string &text);
FormattedText parse_markdown_v3(FormattedText text);
FormattedText get_markdown_v3(FormattedText text);
Result<vector<MessageEntity>> parse_html(string &text);
vector<tl_object_ptr<telegram_api::MessageEntity>> get_input_message_entities(const ContactsManager *contacts_manager,

View File

@ -3081,6 +3081,7 @@ bool Td::is_synchronous_request(int32 id) {
case td_api::getTextEntities::ID:
case td_api::parseTextEntities::ID:
case td_api::parseMarkdown::ID:
case td_api::getMarkdownText::ID:
case td_api::getFileMimeType::ID:
case td_api::getFileExtension::ID:
case td_api::cleanFileName::ID:
@ -3304,6 +3305,7 @@ td_api::object_ptr<td_api::Object> Td::static_request(td_api::object_ptr<td_api:
switch (function_id) {
case td_api::parseTextEntities::ID:
case td_api::parseMarkdown::ID:
case td_api::getMarkdownText::ID:
case td_api::getFileMimeType::ID:
case td_api::getFileExtension::ID:
case td_api::cleanFileName::ID:
@ -7453,6 +7455,10 @@ void Td::on_request(uint64 id, const td_api::parseMarkdown &request) {
UNREACHABLE();
}
// getMarkdownText is listed in Td::is_synchronous_request and in the switch of Td::static_request, so it is
// presumably always answered through do_static_request and is not expected to reach this handler.
void Td::on_request(uint64 id, const td_api::getMarkdownText &request) {
UNREACHABLE();
}
void Td::on_request(uint64 id, const td_api::getFileMimeType &request) {
UNREACHABLE();
}
@ -7572,8 +7578,25 @@ td_api::object_ptr<td_api::Object> Td::do_static_request(td_api::parseMarkdown &
auto parsed_text = parse_markdown_v3({std::move(request.text_->text_), std::move(entities)});
fix_formatted_text(parsed_text.text, parsed_text.entities, true, true, true, true).ensure();
return make_tl_object<td_api::formattedText>(std::move(parsed_text.text),
get_text_entities_object(parsed_text.entities));
return get_formatted_text_object(parsed_text);
}
// Handles td_api::getMarkdownText: converts the entities of the request, fixes the formatted text and returns
// the result of get_markdown_v3 as an API object. Any failure is reported as an error with code 400.
td_api::object_ptr<td_api::Object> Td::do_static_request(td_api::getMarkdownText &request) {
  auto &formatted_text = request.text_;
  if (formatted_text == nullptr) {
    return make_error(400, "Text must be non-empty");
  }

  auto entities_result = get_message_entities(nullptr, std::move(formatted_text->entities_));
  if (entities_result.is_error()) {
    return make_error(400, entities_result.error().message());
  }

  auto message_entities = entities_result.move_as_ok();
  auto fix_status = fix_formatted_text(formatted_text->text_, message_entities, true, true, true, true);
  if (fix_status.is_error()) {
    return make_error(400, fix_status.error().message());
  }

  auto markdown_text = get_markdown_v3({std::move(formatted_text->text_), std::move(message_entities)});
  return get_formatted_text_object(markdown_text);
}
td_api::object_ptr<td_api::Object> Td::do_static_request(const td_api::getFileMimeType &request) {

View File

@ -1047,6 +1047,8 @@ class Td final : public NetQueryCallback {
void on_request(uint64 id, const td_api::parseMarkdown &request);
void on_request(uint64 id, const td_api::getMarkdownText &request);
void on_request(uint64 id, const td_api::getFileMimeType &request);
void on_request(uint64 id, const td_api::getFileExtension &request);
@ -1099,6 +1101,7 @@ class Td final : public NetQueryCallback {
static td_api::object_ptr<td_api::Object> do_static_request(const td_api::getTextEntities &request);
static td_api::object_ptr<td_api::Object> do_static_request(td_api::parseTextEntities &request);
static td_api::object_ptr<td_api::Object> do_static_request(td_api::parseMarkdown &request);
static td_api::object_ptr<td_api::Object> do_static_request(td_api::getMarkdownText &request);
static td_api::object_ptr<td_api::Object> do_static_request(const td_api::getFileMimeType &request);
static td_api::object_ptr<td_api::Object> do_static_request(const td_api::getFileExtension &request);
static td_api::object_ptr<td_api::Object> do_static_request(const td_api::cleanFileName &request);

View File

@ -1256,19 +1256,24 @@ TEST(MessageEntities, parse_markdown) {
check_parse_markdown("[telegram\\.org](tg:user?id=123456)", "telegram.org", {{0, 12, td::UserId(123456)}});
}
static void check_parse_markdown_v3(td::string text, td::vector<td::MessageEntity> entities, const td::string &result,
const td::vector<td::MessageEntity> &result_entities, bool fix = false) {
static void check_parse_markdown_v3(td::string text, td::vector<td::MessageEntity> entities,
const td::string &result_text, const td::vector<td::MessageEntity> &result_entities,
bool fix = false) {
auto parsed_text = td::parse_markdown_v3({std::move(text), std::move(entities)});
if (fix) {
ASSERT_TRUE(fix_formatted_text(parsed_text.text, parsed_text.entities, true, true, true, true).is_ok());
}
ASSERT_STREQ(result, parsed_text.text);
ASSERT_STREQ(result_text, parsed_text.text);
ASSERT_EQ(result_entities, parsed_text.entities);
if (fix) {
auto markdown_text = td::get_markdown_v3(parsed_text);
ASSERT_TRUE(parsed_text == markdown_text || parsed_text == td::parse_markdown_v3(markdown_text));
}
}
static void check_parse_markdown_v3(td::string text, const td::string &result,
static void check_parse_markdown_v3(td::string text, const td::string &result_text,
const td::vector<td::MessageEntity> &result_entities, bool fix = false) {
check_parse_markdown_v3(std::move(text), td::vector<td::MessageEntity>(), result, result_entities, fix);
check_parse_markdown_v3(std::move(text), td::vector<td::MessageEntity>(), result_text, result_entities, fix);
}
TEST(MessageEntities, parse_markdown_v3) {
@ -1293,6 +1298,9 @@ TEST(MessageEntities, parse_markdown_v3) {
check_parse_markdown_v3("` `a", " a", {{td::MessageEntity::Type::Code, 0, 1}}, true);
check_parse_markdown_v3("`\n`a", "\na", {}, true);
check_parse_markdown_v3("``", "``", {});
check_parse_markdown_v3("`a````b```", "`a````b```", {});
check_parse_markdown_v3("ab", {{td::MessageEntity::Type::Code, 0, 1}, {td::MessageEntity::Type::Pre, 1, 1}}, "ab",
{{td::MessageEntity::Type::Code, 0, 1}, {td::MessageEntity::Type::Pre, 1, 1}});
check_parse_markdown_v3("[a](b[c](t.me)", "[a](b[c](t.me)", {});
check_parse_markdown_v3("[](t.me)", "[](t.me)", {});
@ -1411,6 +1419,9 @@ TEST(MessageEntities, parse_markdown_v3) {
{td::MessageEntity::Type::TextUrl, 3, 4, "http://t.me/"},
{td::MessageEntity::Type::Italic, 3, 2}},
true);
check_parse_markdown_v3("__a #test__test", "__a #test__test", {});
check_parse_markdown_v3("a #testtest", {{td::MessageEntity::Type::Italic, 0, 7}}, "a #testtest",
{{td::MessageEntity::Type::Italic, 0, 7}});
// TODO parse_markdown_v3 is not idempotent now, which is bad
check_parse_markdown_v3(
@ -1551,5 +1562,57 @@ TEST(MessageEntities, parse_markdown_v3) {
text = std::move(parsed_text);
}
ASSERT_EQ(text, td::parse_markdown_v3(text));
auto markdown_text = td::get_markdown_v3(text);
ASSERT_TRUE(text == markdown_text || text == td::parse_markdown_v3(markdown_text));
}
}
static void check_get_markdown_v3(td::string result_text, td::vector<td::MessageEntity> result_entities,
const td::string &text, const td::vector<td::MessageEntity> &entities) {
auto markdown_text = td::get_markdown_v3({std::move(text), std::move(entities)});
ASSERT_STREQ(result_text, markdown_text.text);
ASSERT_EQ(result_entities, markdown_text.entities);
}
// Test cases for td::get_markdown_v3. Arguments of check_get_markdown_v3 are, in order:
// expected text, expected entities, input text, input entities.
TEST(MessageEntities, get_markdown_v3) {
// single Pre and Code entities are replaced with their markup
check_get_markdown_v3("``` ```", {}, " ", {{td::MessageEntity::Type::Pre, 0, 1}});
check_get_markdown_v3("` `", {}, " ", {{td::MessageEntity::Type::Code, 0, 1}});
check_get_markdown_v3("`\n`", {}, "\n", {{td::MessageEntity::Type::Code, 0, 1}});
// Code immediately followed by Pre: the expected output is equal to the input
check_get_markdown_v3("ab", {{td::MessageEntity::Type::Code, 0, 1}, {td::MessageEntity::Type::Pre, 1, 1}}, "ab",
{{td::MessageEntity::Type::Code, 0, 1}, {td::MessageEntity::Type::Pre, 1, 1}});
check_get_markdown_v3("[ ](http://t.me/)", {}, " ", {{td::MessageEntity::Type::TextUrl, 0, 1, "http://t.me/"}});
// the TextUrl entity becomes markup, the second entity is kept and its offset moves from 9 to 25
check_get_markdown_v3("[ ]t.me[)](http://t.me/) [ ](t.me)", {{25, 1, td::UserId(1)}}, "[ ]t.me) [ ](t.me)",
{{td::MessageEntity::Type::TextUrl, 7, 1, "http://t.me/"}, {9, 1, td::UserId(1)}});
check_get_markdown_v3("__ __", {}, " ", {{td::MessageEntity::Type::Italic, 0, 1}});
check_get_markdown_v3("** **", {}, " ", {{td::MessageEntity::Type::Bold, 0, 1}});
check_get_markdown_v3("~~ ~~", {}, " ", {{td::MessageEntity::Type::Strikethrough, 0, 1}});
// the PreCode entity stays an entity; its offset moves from 6 to 18 because of the inserted markup
check_get_markdown_v3("__a__ **b** ~~c~~ d", {{td::MessageEntity::Type::PreCode, 18, 1, "C++"}}, "a b c d",
{{td::MessageEntity::Type::Italic, 0, 1},
{td::MessageEntity::Type::Bold, 2, 1},
{td::MessageEntity::Type::Strikethrough, 4, 1},
{td::MessageEntity::Type::PreCode, 6, 1, "C++"}});
check_get_markdown_v3("`ab` ```cd``` ef", {{td::MessageEntity::Type::PreCode, 14, 2, "C++"}}, "ab cd ef",
{{td::MessageEntity::Type::Code, 0, 2},
{td::MessageEntity::Type::Pre, 3, 2},
{td::MessageEntity::Type::PreCode, 6, 2, "C++"}});
// Italic nested at the beginning of a TextUrl that directly follows another Italic
check_get_markdown_v3("__asd__[__ab__cd](http://t.me/)", {}, "asdabcd",
{{td::MessageEntity::Type::Italic, 0, 3},
{td::MessageEntity::Type::TextUrl, 3, 4, "http://t.me/"},
{td::MessageEntity::Type::Italic, 3, 2}});
// the input text already contains "__": the expected output is equal to the input
check_get_markdown_v3("__ab", {{td::MessageEntity::Type::Italic, 3, 1}}, "__ab",
{{td::MessageEntity::Type::Italic, 3, 1}});
// several levels of nested entities that begin at the same offset
check_get_markdown_v3("__ab__**__cd__**~~**__ef__gh**ij~~", {}, "abcdefghij",
{{td::MessageEntity::Type::Italic, 0, 2},
{td::MessageEntity::Type::Bold, 2, 2},
{td::MessageEntity::Type::Italic, 2, 2},
{td::MessageEntity::Type::Strikethrough, 4, 6},
{td::MessageEntity::Type::Bold, 4, 4},
{td::MessageEntity::Type::Italic, 4, 2}});
// three entities covering exactly the same range
check_get_markdown_v3("[**__bold italic link__**](http://example.com/)", {}, "bold italic link",
{{td::MessageEntity::Type::TextUrl, 0, 16, "http://example.com/"},
{td::MessageEntity::Type::Bold, 0, 16},
{td::MessageEntity::Type::Italic, 0, 16}});
}