tdlight/td/telegram/MessageEntity.cpp
2024-06-17 00:11:23 +03:00

4759 lines
178 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2024
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
#include "td/telegram/MessageEntity.h"
#include "td/telegram/Dependencies.h"
#include "td/telegram/DialogManager.h"
#include "td/telegram/LinkManager.h"
#include "td/telegram/misc.h"
#include "td/telegram/OptionManager.h"
#include "td/telegram/SecretChatLayer.h"
#include "td/telegram/StickersManager.h"
#include "td/telegram/Td.h"
#include "td/telegram/UserManager.h"
#include "td/actor/MultiPromise.h"
#include "td/utils/algorithm.h"
#include "td/utils/format.h"
#include "td/utils/HashTableUtils.h"
#include "td/utils/logging.h"
#include "td/utils/misc.h"
#include "td/utils/Promise.h"
#include "td/utils/SliceBuilder.h"
#include "td/utils/StringBuilder.h"
#include "td/utils/unicode.h"
#include "td/utils/utf8.h"
#include <algorithm>
#include <cstring>
#include <limits>
#include <tuple>
namespace td {
int MessageEntity::get_type_priority(Type type) {
  // Priority table indexed by the numeric value of the entity type; the trailing comment on each row
  // names the entity type the row is written for.
  static const int type_priorities[] = {
      50,  // Mention
      50,  // Hashtag
      50,  // BotCommand
      50,  // Url
      50,  // EmailAddress
      90,  // Bold
      91,  // Italic
      20,  // Code
      11,  // Pre
      10,  // PreCode
      49,  // TextUrl
      49,  // MentionName
      50,  // Cashtag
      50,  // PhoneNumber
      92,  // Underline
      93,  // Strikethrough
      0,   // BlockQuote
      50,  // BankCardNumber
      50,  // MediaTimestamp
      94,  // Spoiler
      99,  // CustomEmoji
      0,   // ExpandableBlockQuote
  };
  // the table must contain exactly one row per entity type
  static_assert(sizeof(type_priorities) / sizeof(type_priorities[0]) == static_cast<size_t>(MessageEntity::Type::Size),
                "");
  const auto row = static_cast<int32>(type);
  return type_priorities[row];
}
// Writes the human-readable name of an entity type to the string builder.
StringBuilder &operator<<(StringBuilder &string_builder, const MessageEntity::Type &message_entity_type) {
  using Type = MessageEntity::Type;
  const char *type_name = nullptr;
  switch (message_entity_type) {
    case Type::Mention:
      type_name = "Mention";
      break;
    case Type::Hashtag:
      type_name = "Hashtag";
      break;
    case Type::BotCommand:
      type_name = "BotCommand";
      break;
    case Type::Url:
      type_name = "Url";
      break;
    case Type::EmailAddress:
      type_name = "EmailAddress";
      break;
    case Type::Bold:
      type_name = "Bold";
      break;
    case Type::Italic:
      type_name = "Italic";
      break;
    case Type::Underline:
      type_name = "Underline";
      break;
    case Type::Strikethrough:
      type_name = "Strikethrough";
      break;
    case Type::BlockQuote:
      type_name = "BlockQuote";
      break;
    case Type::Code:
      type_name = "Code";
      break;
    case Type::Pre:
      type_name = "Pre";
      break;
    case Type::PreCode:
      type_name = "PreCode";
      break;
    case Type::TextUrl:
      type_name = "TextUrl";
      break;
    case Type::MentionName:
      type_name = "MentionName";
      break;
    case Type::Cashtag:
      type_name = "Cashtag";
      break;
    case Type::PhoneNumber:
      type_name = "PhoneNumber";
      break;
    case Type::BankCardNumber:
      type_name = "BankCardNumber";
      break;
    case Type::MediaTimestamp:
      type_name = "MediaTimestamp";
      break;
    case Type::Spoiler:
      type_name = "Spoiler";
      break;
    case Type::CustomEmoji:
      type_name = "CustomEmoji";
      break;
    case Type::ExpandableBlockQuote:
      type_name = "ExpandableBlockQuote";
      break;
    default:
      // every known type is handled above
      UNREACHABLE();
      type_name = "Impossible";
      break;
  }
  return string_builder << type_name;
}
// Writes a bracketed debug description of the entity: type, offset and length are always printed,
// the remaining fields only when they carry a value.
StringBuilder &operator<<(StringBuilder &string_builder, const MessageEntity &message_entity) {
  const auto &entity = message_entity;

  string_builder << '[' << entity.type;
  string_builder << ", offset = " << entity.offset;
  string_builder << ", length = " << entity.length;

  const bool has_media_timestamp = entity.media_timestamp >= 0;
  if (has_media_timestamp) {
    string_builder << ", media_timestamp = \"" << entity.media_timestamp << "\"";
  }

  const bool has_argument = !entity.argument.empty();
  if (has_argument) {
    string_builder << ", argument = \"" << entity.argument << "\"";
  }

  if (entity.user_id.is_valid()) {
    string_builder << ", " << entity.user_id;
  }

  if (entity.custom_emoji_id.is_valid()) {
    string_builder << ", " << entity.custom_emoji_id;
  }

  return string_builder << ']';
}
// Creates the td_api object that describes the type of this entity.
tl_object_ptr<td_api::TextEntityType> MessageEntity::get_text_entity_type_object() const {
  switch (type) {
    // types that carry extra data
    case Type::PreCode:
      return td_api::make_object<td_api::textEntityTypePreCode>(argument);
    case Type::TextUrl:
      return td_api::make_object<td_api::textEntityTypeTextUrl>(argument);
    case Type::MentionName:
      // user_manager can't be used here, because the function can be called from a static request
      return td_api::make_object<td_api::textEntityTypeMentionName>(user_id.get());
    case Type::MediaTimestamp:
      return td_api::make_object<td_api::textEntityTypeMediaTimestamp>(media_timestamp);
    case Type::CustomEmoji:
      return td_api::make_object<td_api::textEntityTypeCustomEmoji>(custom_emoji_id.get());

    // types without extra data
    case Type::Mention:
      return td_api::make_object<td_api::textEntityTypeMention>();
    case Type::Hashtag:
      return td_api::make_object<td_api::textEntityTypeHashtag>();
    case Type::BotCommand:
      return td_api::make_object<td_api::textEntityTypeBotCommand>();
    case Type::Url:
      return td_api::make_object<td_api::textEntityTypeUrl>();
    case Type::EmailAddress:
      return td_api::make_object<td_api::textEntityTypeEmailAddress>();
    case Type::Cashtag:
      return td_api::make_object<td_api::textEntityTypeCashtag>();
    case Type::PhoneNumber:
      return td_api::make_object<td_api::textEntityTypePhoneNumber>();
    case Type::BankCardNumber:
      return td_api::make_object<td_api::textEntityTypeBankCardNumber>();
    case Type::Bold:
      return td_api::make_object<td_api::textEntityTypeBold>();
    case Type::Italic:
      return td_api::make_object<td_api::textEntityTypeItalic>();
    case Type::Underline:
      return td_api::make_object<td_api::textEntityTypeUnderline>();
    case Type::Strikethrough:
      return td_api::make_object<td_api::textEntityTypeStrikethrough>();
    case Type::Spoiler:
      return td_api::make_object<td_api::textEntityTypeSpoiler>();
    case Type::Code:
      return td_api::make_object<td_api::textEntityTypeCode>();
    case Type::Pre:
      return td_api::make_object<td_api::textEntityTypePre>();
    case Type::BlockQuote:
      return td_api::make_object<td_api::textEntityTypeBlockQuote>();
    case Type::ExpandableBlockQuote:
      return td_api::make_object<td_api::textEntityTypeExpandableBlockQuote>();

    default:
      UNREACHABLE();
      return nullptr;
  }
}
// Creates the td_api::textEntity object for this entity from its offset, length and type object.
tl_object_ptr<td_api::textEntity> MessageEntity::get_text_entity_object() const {
  auto type_object = get_text_entity_type_object();
  return make_tl_object<td_api::textEntity>(offset, length, std::move(type_object));
}
// Converts entities to td_api::textEntity objects.
// Bot commands are omitted if skip_bot_commands is set, media timestamps are omitted if they exceed
// max_media_timestamp, and entities without a type object are omitted as well.
vector<tl_object_ptr<td_api::textEntity>> get_text_entities_object(const vector<MessageEntity> &entities,
                                                                   bool skip_bot_commands, int32 max_media_timestamp) {
  vector<tl_object_ptr<td_api::textEntity>> entity_objects;
  entity_objects.reserve(entities.size());

  for (const auto &entity : entities) {
    const bool is_skipped_bot_command = skip_bot_commands && entity.type == MessageEntity::Type::BotCommand;
    const bool is_excessive_timestamp =
        entity.type == MessageEntity::Type::MediaTimestamp && entity.media_timestamp > max_media_timestamp;
    if (is_skipped_bot_command || is_excessive_timestamp) {
      continue;
    }

    auto object = entity.get_text_entity_object();
    if (object->type_ == nullptr) {
      continue;
    }
    entity_objects.push_back(std::move(object));
  }

  return entity_objects;
}
// Writes the quoted text followed by the list of its entities.
StringBuilder &operator<<(StringBuilder &string_builder, const FormattedText &text) {
  string_builder << '"' << text.text;
  string_builder << "\" with entities " << text.entities;
  return string_builder;
}
// Creates a td_api::formattedText object from the text and its entities; the entities are filtered
// by get_text_entities_object according to skip_bot_commands and max_media_timestamp.
td_api::object_ptr<td_api::formattedText> get_formatted_text_object(const FormattedText &text, bool skip_bot_commands,
                                                                    int32 max_media_timestamp) {
  auto entity_objects = get_text_entities_object(text.entities, skip_bot_commands, max_media_timestamp);
  return td_api::make_object<td_api::formattedText>(text.text, std::move(entity_objects));
}
// Returns true for code points categorized as a letter, a decimal number or a number, and for '_'.
static bool is_word_character(uint32 code) {
  const auto category = get_unicode_simple_category(code);
  const bool is_letter_or_number = category == UnicodeSimpleCategory::Letter ||
                                   category == UnicodeSimpleCategory::DecimalNumber ||
                                   category == UnicodeSimpleCategory::Number;
  if (is_letter_or_number) {
    return true;
  }
  return code == '_';
}
/*
static bool is_word_boundary(uint32 a, uint32 b) {
return is_word_character(a) ^ is_word_character(b);
}
*/
// Returns true if the code is an ASCII digit or an ASCII Latin letter of any case.
static bool is_alpha_digit(uint32 code) {
  if ('0' <= code && code <= '9') {
    return true;
  }
  if ('a' <= code && code <= 'z') {
    return true;
  }
  return 'A' <= code && code <= 'Z';
}
// Returns true if the code is '_' or is accepted by is_alpha_digit.
static bool is_alpha_digit_or_underscore(uint32 code) {
  if (code == '_') {
    return true;
  }
  return is_alpha_digit(code);
}
// Returns true if the code is '-' or is accepted by is_alpha_digit_or_underscore.
static bool is_alpha_digit_or_underscore_or_minus(uint32 code) {
  if (code == '-') {
    return true;
  }
  return is_alpha_digit_or_underscore(code);
}
// These functions just implement the corresponding regexps.
// All other fixes are applied in other functions.
// Returns all substrings of str, including the leading '@', that match the regexp
// '/(?<=\B)@([a-zA-Z0-9_]{2,32})(?=\b)/u'
static vector<Slice> match_mentions(Slice str) {
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  vector<Slice> mentions;
  const unsigned char *cursor = text_begin;
  for (;;) {
    const auto at_ptr =
        static_cast<const unsigned char *>(std::memchr(cursor, '@', narrow_cast<int32>(text_end - cursor)));
    if (at_ptr == nullptr) {
      break;
    }
    const auto name_begin = at_ptr + 1;
    cursor = name_begin;

    // the '@' must not directly follow a word character
    if (at_ptr != text_begin) {
      uint32 prev_code;
      next_utf8_unsafe(prev_utf8_unsafe(at_ptr), &prev_code);
      if (is_word_character(prev_code)) {
        continue;
      }
    }

    // consume the username characters
    while (cursor != text_end && is_alpha_digit_or_underscore(*cursor)) {
      ++cursor;
    }
    const auto name_length = cursor - name_begin;
    if (name_length < 2 || name_length > 32) {
      continue;
    }

    // the username must not be directly followed by a word character
    uint32 next_code = 0;
    if (cursor != text_end) {
      next_utf8_unsafe(cursor, &next_code);
    }
    if (!is_word_character(next_code)) {
      mentions.emplace_back(at_ptr, cursor);
    }
  }
  return mentions;
}
// Returns all substrings of str, including the leading '/', that match the regexp
// '/(?<!\b|[\/<>])\/([a-zA-Z0-9_]{1,64})(?:@([a-zA-Z0-9_]{3,32}))?(?!\B|[\/<>])/u'
static vector<Slice> match_bot_commands(Slice str) {
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  // characters which may neither directly precede nor directly follow a command
  const auto is_forbidden_neighbour = [](uint32 code) {
    return is_word_character(code) || code == '/' || code == '<' || code == '>';
  };
  // skips a run of [a-zA-Z0-9_] bytes and returns the position right after it
  const auto skip_name = [text_end](const unsigned char *pos) {
    while (pos != text_end && is_alpha_digit_or_underscore(*pos)) {
      ++pos;
    }
    return pos;
  };

  vector<Slice> commands;
  const unsigned char *cursor = text_begin;
  for (;;) {
    const auto slash_ptr =
        static_cast<const unsigned char *>(std::memchr(cursor, '/', narrow_cast<int32>(text_end - cursor)));
    if (slash_ptr == nullptr) {
      break;
    }
    const auto name_begin = slash_ptr + 1;
    cursor = name_begin;

    if (slash_ptr != text_begin) {
      uint32 prev_code;
      next_utf8_unsafe(prev_utf8_unsafe(slash_ptr), &prev_code);
      if (is_forbidden_neighbour(prev_code)) {
        continue;
      }
    }

    // the command name
    cursor = skip_name(name_begin);
    const auto name_length = cursor - name_begin;
    if (name_length < 1 || name_length > 64) {
      continue;
    }

    // the optional "@bot_username" suffix
    if (cursor != text_end && *cursor == '@') {
      const auto bot_name_begin = cursor + 1;
      cursor = skip_name(bot_name_begin);
      const auto bot_name_length = cursor - bot_name_begin;
      if (bot_name_length < 3 || bot_name_length > 32) {
        continue;
      }
    }

    uint32 next_code = 0;
    if (cursor != text_end) {
      next_utf8_unsafe(cursor, &next_code);
    }
    if (is_forbidden_neighbour(next_code)) {
      continue;
    }
    commands.emplace_back(slash_ptr, cursor);
  }
  return commands;
}
// Checks whether the code point can be a part of a hashtag.
// The simple Unicode category of the code point is always stored in category, whatever the result is.
static bool is_hashtag_letter(uint32 c, UnicodeSimpleCategory &category) {
  category = get_unicode_simple_category(c);

  // code points which are accepted regardless of their category
  const bool is_in_allowed_range = 0xd80 <= c && c <= 0xdff;
  const bool is_always_allowed = c == '_' || c == 0x200c || c == 0xb7 || is_in_allowed_range;
  if (is_always_allowed) {
    return true;
  }
  return category == UnicodeSimpleCategory::DecimalNumber || category == UnicodeSimpleCategory::Letter;
}
// Returns all substrings of str, including the leading '#', that match the regexp
// '/(?<=^|[^\d_\pL\x{200c}\x{0d80}-\x{0dff}])#([\d_\pL\x{200c}\x{0d80}-\x{0dff}]{1,256})(?![\d_\pL\x{200c}\x{0d80}-\x{0dff}]*#)/u'
// and contain at least one letter
static vector<Slice> match_hashtags(Slice str) {
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  vector<Slice> hashtags;
  UnicodeSimpleCategory category;
  const unsigned char *cursor = text_begin;
  for (;;) {
    const auto hash_ptr =
        static_cast<const unsigned char *>(std::memchr(cursor, '#', narrow_cast<int32>(text_end - cursor)));
    if (hash_ptr == nullptr) {
      break;
    }
    cursor = hash_ptr + 1;

    // the '#' must not directly follow a hashtag letter
    if (hash_ptr != text_begin) {
      uint32 prev_code;
      next_utf8_unsafe(prev_utf8_unsafe(hash_ptr), &prev_code);
      if (is_hashtag_letter(prev_code, category)) {
        continue;
      }
    }

    // Consume all hashtag letters, but count only the first 256 of them;
    // tag_end is fixed right after the 256th letter if there are that many
    size_t counted_letters = 0;
    const unsigned char *tag_end = nullptr;
    bool has_letter = false;
    while (cursor != text_end) {
      uint32 code;
      const auto after_code = next_utf8_unsafe(cursor, &code);
      if (!is_hashtag_letter(code, category)) {
        break;
      }
      cursor = after_code;
      if (counted_letters == 255) {
        tag_end = cursor;
      }
      if (counted_letters != 256) {
        has_letter |= category == UnicodeSimpleCategory::Letter;
        counted_letters++;
      }
    }
    if (tag_end == nullptr) {
      tag_end = cursor;
    }

    const bool is_empty = counted_letters < 1;
    const bool is_followed_by_hash = cursor != text_end && cursor[0] == '#';
    if (is_empty || is_followed_by_hash || !has_letter) {
      continue;
    }
    hashtags.emplace_back(hash_ptr, tag_end);
  }
  return hashtags;
}
// Returns all substrings of str, including the leading '$', that match the regexp
// '/(?<=^|[^$\d_\pL\x{200c}\x{0d80}-\x{0dff}])\$(1INCH|[A-Z]{1,8})(?![$\d_\pL\x{200c}\x{0d80}-\x{0dff}])/u'
static vector<Slice> match_cashtags(Slice str) {
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  vector<Slice> cashtags;
  UnicodeSimpleCategory category;
  const unsigned char *cursor = text_begin;
  for (;;) {
    const auto dollar_ptr =
        static_cast<const unsigned char *>(std::memchr(cursor, '$', narrow_cast<int32>(text_end - cursor)));
    if (dollar_ptr == nullptr) {
      break;
    }
    const auto ticker_begin = dollar_ptr + 1;
    cursor = ticker_begin;

    // the '$' must not directly follow a hashtag letter or another '$'
    if (dollar_ptr != text_begin) {
      uint32 prev_code;
      next_utf8_unsafe(prev_utf8_unsafe(dollar_ptr), &prev_code);
      if (is_hashtag_letter(prev_code, category) || prev_code == '$') {
        continue;
      }
    }

    // the ticker is either the literal "1INCH" or a run of uppercase ASCII letters
    if (text_end - cursor >= 5 && Slice(cursor, cursor + 5) == Slice("1INCH")) {
      cursor += 5;
    } else {
      while (cursor != text_end && 'A' <= *cursor && *cursor <= 'Z') {
        ++cursor;
      }
    }
    const auto ticker_length = cursor - ticker_begin;
    if (ticker_length < 1 || ticker_length > 8) {
      continue;
    }

    // the ticker must not be directly followed by a hashtag letter or a '$'
    if (cursor != text_end) {
      uint32 next_code;
      next_utf8_unsafe(cursor, &next_code);
      if (is_hashtag_letter(next_code, category) || next_code == '$') {
        continue;
      }
    }
    cashtags.emplace_back(dollar_ptr, cursor);
  }
  return cashtags;
}
// Returns all maximal runs of digits and ':' in str, which contain a ':' preceded by at least one
// character of the run and directly followed by a digit, and which are not adjacent to a word character.
static vector<Slice> match_media_timestamps(Slice str) {
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  const auto is_timestamp_symbol = [](unsigned char c) {
    return c == ':' || is_digit(c);
  };

  vector<Slice> timestamps;
  const unsigned char *cursor = text_begin;
  for (;;) {
    const auto colon_ptr =
        static_cast<const unsigned char *>(std::memchr(cursor, ':', narrow_cast<int32>(text_end - cursor)));
    if (colon_ptr == nullptr) {
      break;
    }

    // extend the run to the left and to the right of the found ':'
    auto run_begin = colon_ptr;
    while (run_begin != text_begin && is_timestamp_symbol(run_begin[-1])) {
      --run_begin;
    }
    auto run_end = colon_ptr + 1;
    while (run_end != text_end && is_timestamp_symbol(*run_end)) {
      ++run_end;
    }

    // the search is always continued after the run
    cursor = run_end;

    if (run_begin == colon_ptr || run_end == colon_ptr + 1 || !is_digit(colon_ptr[1])) {
      continue;
    }

    if (run_begin != text_begin) {
      uint32 prev_code;
      next_utf8_unsafe(prev_utf8_unsafe(run_begin), &prev_code);
      if (is_word_character(prev_code)) {
        continue;
      }
    }
    if (run_end != text_end) {
      uint32 next_code;
      next_utf8_unsafe(run_end, &next_code);
      if (is_word_character(next_code)) {
        continue;
      }
    }
    timestamps.emplace_back(run_begin, run_end);
  }
  return timestamps;
}
// Returns all substrings of str that look like a bank card number, following the regexp
// '/(?<=^|[^+_\pL\d-.,])[\d -]{13,}([^_\pL\d-]|$)/'
// and containing from 13 to 19 digits. The validity of the numbers isn't checked here.
static vector<Slice> match_bank_card_numbers(Slice str) {
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  // bytes which can be a part of a card number
  const auto is_card_symbol = [](unsigned char c) {
    return is_digit(c) || c == ' ' || c == '-';
  };
  const auto is_letter = [](uint32 code) {
    return get_unicode_simple_category(code) == UnicodeSimpleCategory::Letter;
  };

  vector<Slice> card_numbers;
  const unsigned char *cursor = text_begin;
  for (;;) {
    // find the next digit
    while (cursor != text_end && !is_digit(*cursor)) {
      ++cursor;
    }
    if (cursor == text_end) {
      break;
    }

    if (cursor != text_begin) {
      uint32 prev_code;
      next_utf8_unsafe(prev_utf8_unsafe(cursor), &prev_code);
      const bool is_bad_prefix = prev_code == '.' || prev_code == ',' || prev_code == '+' || prev_code == '-' ||
                                 prev_code == '_' || is_letter(prev_code);
      if (is_bad_prefix) {
        // skip the whole candidate
        while (cursor != text_end && is_card_symbol(*cursor)) {
          ++cursor;
        }
        continue;
      }
    }

    const auto number_begin = cursor;
    size_t digit_count = 0;
    while (cursor != text_end && is_card_symbol(*cursor)) {
      const bool has_only_digits = digit_count == static_cast<size_t>(cursor - number_begin);
      if (*cursor == ' ' && digit_count >= 16 && digit_count <= 19 && has_only_digits) {
        // continuous card number
        break;
      }
      if (is_digit(*cursor)) {
        digit_count++;
      }
      ++cursor;
    }
    if (digit_count < 13 || digit_count > 19) {
      continue;
    }

    // drop trailing spaces and dashes
    auto number_end = cursor;
    while (!is_digit(number_end[-1])) {
      --number_end;
    }
    const auto number_size = static_cast<size_t>(number_end - number_begin);
    if (number_size > 2 * digit_count - 1) {
      continue;
    }

    if (number_end != text_end) {
      uint32 next_code;
      next_utf8_unsafe(number_end, &next_code);
      if (next_code == '-' || next_code == '_' || is_letter(next_code)) {
        continue;
      }
    }
    card_numbers.emplace_back(number_begin, number_end);
  }
  return card_numbers;
}
// Checks whether the code point is allowed in a URL: inside the General Punctuation block only
// Zero Width Non-Joiner/Joiner and various dashes are allowed, outside of it everything except separators.
static bool is_url_unicode_symbol(uint32 c) {
  const bool is_general_punctuation = 0x2000 <= c && c <= 0x206f;
  if (!is_general_punctuation) {
    return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
  }

  const bool is_zero_width_joiner = c == 0x200c || c == 0x200d;
  const bool is_dash = 0x2010 <= c && c <= 0x2015;
  return is_zero_width_joiner || is_dash;
}
// Checks whether the code point is allowed in a URL path: line feed, angle brackets, double quote
// and guillemets are forbidden, everything else is checked by is_url_unicode_symbol.
static bool is_url_path_symbol(uint32 c) {
  const bool is_forbidden = c == '\n' || c == '<' || c == '>' || c == '"' ||
                            c == 0xab /* « */ || c == 0xbb /* » */;
  if (is_forbidden) {
    return false;
  }
  return is_url_unicode_symbol(c);
}
// Returns all substrings of str that match the regexp
// '(tg|ton)://[a-z0-9_-]{1,253}([/?#][^\s\x{2000}-\x{200b}\x{200e}-\x{200f}\x{2016}-\x{206f}<>«»"]*)?'
// The protocol is matched case-insensitively.
static vector<Slice> match_tg_urls(Slice str) {
  const unsigned char *const text_begin = str.ubegin();
  const unsigned char *const text_end = str.uend();

  // characters which are stripped from the end of the path
  Slice bad_path_end_chars(".:;,('?!`");

  vector<Slice> urls;
  const unsigned char *cursor = text_begin;
  while (text_end - cursor > 5) {
    const auto colon_ptr =
        static_cast<const unsigned char *>(std::memchr(cursor, ':', narrow_cast<int32>(text_end - cursor)));
    if (colon_ptr == nullptr) {
      break;
    }

    // the ':' must be a part of "tg://" or "ton://"
    const unsigned char *scheme_begin = nullptr;
    if (text_end - colon_ptr >= 3 && colon_ptr[1] == '/' && colon_ptr[2] == '/') {
      const auto prefix_length = colon_ptr - text_begin;
      if (prefix_length >= 2 && to_lower(colon_ptr[-2]) == 't' && to_lower(colon_ptr[-1]) == 'g') {
        scheme_begin = colon_ptr - 2;
      } else if (prefix_length >= 3 && to_lower(colon_ptr[-3]) == 't' && to_lower(colon_ptr[-2]) == 'o' &&
                 to_lower(colon_ptr[-1]) == 'n') {
        scheme_begin = colon_ptr - 3;
      }
    }
    if (scheme_begin == nullptr) {
      cursor = colon_ptr + 1;
      continue;
    }

    // the domain consists of at most 253 bytes
    const auto host_begin = colon_ptr + 3;
    cursor = host_begin;
    while (cursor != text_end && cursor - host_begin != 253 && is_alpha_digit_or_underscore_or_minus(*cursor)) {
      ++cursor;
    }
    if (cursor == host_begin) {
      continue;
    }

    // the optional path, query or fragment
    if (cursor != text_end && (*cursor == '/' || *cursor == '?' || *cursor == '#')) {
      const auto path_begin = cursor + 1;
      auto path_end = path_begin;
      while (path_end != text_end) {
        uint32 code = 0;
        const auto after_code = next_utf8_unsafe(path_end, &code);
        if (!is_url_path_symbol(code)) {
          break;
        }
        path_end = after_code;
      }
      while (path_end > path_begin && bad_path_end_chars.find(path_end[-1]) < bad_path_end_chars.size()) {
        --path_end;
      }
      // a lone '?' or '#' isn't included in the URL, while a lone '/' is
      if (*cursor == '/' || path_end > path_begin) {
        cursor = path_end;
      }
    }

    urls.emplace_back(scheme_begin, cursor);
  }
  return urls;
}
// Finds URL-like substrings of str.
// Every '.' in the text is treated as a possible part of a domain name; the candidate is then extended
// to the left and to the right to cover the domain, an optional port, an optional path, optional user data
// before '@' and an optional http/https/ftp protocol prefix.
// Returns slices pointing into str; candidates marked as bad are skipped.
static vector<Slice> match_urls(Slice str) {
vector<Slice> result;
// begin always points to the start of the not yet processed part of the text; end never changes
const unsigned char *begin = str.ubegin();
const unsigned char *end = str.uend();
// symbols which are accepted while scanning the protocol backwards
const auto &is_protocol_symbol = [](uint32 c) {
if (c < 0x80) {
// do not allow dots in the protocol
return is_alpha_digit(c) || c == '+' || c == '-';
}
// add unicode letters and digits to later discard protocol as invalid
return get_unicode_simple_category(c) != UnicodeSimpleCategory::Separator;
};
// symbols which are accepted in the user data part located before '@'
const auto &is_user_data_symbol = [](uint32 c) {
switch (c) {
case '\n':
case '/':
case '[':
case ']':
case '{':
case '}':
case '(':
case ')':
case '\'':
case '`':
case '<':
case '>':
case '"':
case '@':
case 0xab: // «
case 0xbb: // »
return false;
default:
return is_url_unicode_symbol(c);
}
};
// symbols which are accepted in the domain name
const auto &is_domain_symbol = [](uint32 c) {
if (c < 0xc0) {
return c == '.' || is_alpha_digit_or_underscore_or_minus(c) || c == '~';
}
return is_url_unicode_symbol(c);
};
// characters which are stripped from the end of the path
Slice bad_path_end_chars(".:;,('?!`");
while (true) {
// stop if there is no dot left or the dot is the last character of the text
auto dot_pos = str.find('.');
if (dot_pos > str.size() || dot_pos + 1 == str.size()) {
break;
}
if (str[dot_pos + 1] == ' ') {
// fast path
str = str.substr(dot_pos + 2);
begin = str.ubegin();
continue;
}
// move backwards from the dot while the previous code point is a domain symbol
const unsigned char *domain_begin_ptr = begin + dot_pos;
while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code);
if (!is_domain_symbol(code)) {
domain_begin_ptr = next_ptr;
break;
}
}
// move forwards from the dot over domain symbols and '@', remembering the last seen '@'
const unsigned char *last_at_ptr = nullptr;
const unsigned char *domain_end_ptr = begin + dot_pos;
while (domain_end_ptr != end) {
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code);
if (code == '@') {
last_at_ptr = domain_end_ptr;
} else if (!is_domain_symbol(code)) {
break;
}
domain_end_ptr = next_ptr;
}
if (last_at_ptr != nullptr) {
// an '@' was found after the dot, so continue moving backwards while user data symbols are met
while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code);
if (!is_user_data_symbol(code)) {
domain_begin_ptr = next_ptr;
break;
}
}
}
// LOG(ERROR) << "Domain: " << Slice(domain_begin_ptr, domain_end_ptr);
const unsigned char *url_end_ptr = domain_end_ptr;
// optional port: ':' followed by digits
if (url_end_ptr != end && url_end_ptr[0] == ':') {
auto port_end_ptr = url_end_ptr + 1;
while (port_end_ptr != end && is_digit(port_end_ptr[0])) {
port_end_ptr++;
}
// skip leading zeros before checking the port value
auto port_begin_ptr = url_end_ptr + 1;
while (port_begin_ptr != port_end_ptr && *port_begin_ptr == '0') {
port_begin_ptr++;
}
// the port is accepted only if it has from 1 to 5 significant digits and doesn't exceed 65535
if (port_begin_ptr != port_end_ptr && narrow_cast<int>(port_end_ptr - port_begin_ptr) <= 5 &&
to_integer<uint32>(Slice(port_begin_ptr, port_end_ptr)) <= 65535) {
url_end_ptr = port_end_ptr;
}
}
// LOG(ERROR) << "Domain_port: " << Slice(domain_begin_ptr, url_end_ptr);
// optional path, query or fragment
if (url_end_ptr != end && (url_end_ptr[0] == '/' || url_end_ptr[0] == '?' || url_end_ptr[0] == '#')) {
auto path_end_ptr = url_end_ptr + 1;
while (path_end_ptr != end) {
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(path_end_ptr, &code);
if (!is_url_path_symbol(code)) {
break;
}
path_end_ptr = next_ptr;
}
// strip bad characters from the end of the path
while (path_end_ptr > url_end_ptr + 1 && bad_path_end_chars.find(path_end_ptr[-1]) < bad_path_end_chars.size()) {
path_end_ptr--;
}
// a lone '?' or '#' isn't included in the URL, while a lone '/' is
if (url_end_ptr[0] == '/' || path_end_ptr > url_end_ptr + 1) {
url_end_ptr = path_end_ptr;
}
}
// strip trailing dots, but never go back beyond the character following the found dot
while (url_end_ptr > begin + dot_pos + 1 && url_end_ptr[-1] == '.') {
url_end_ptr--;
}
// LOG(ERROR) << "Domain_port_path: " << Slice(domain_begin_ptr, url_end_ptr);
bool is_bad = false;
const unsigned char *url_begin_ptr = domain_begin_ptr;
// user data located before an '@', which directly precedes the domain
if (url_begin_ptr != begin && url_begin_ptr[-1] == '@') {
if (last_at_ptr != nullptr) {
// there is an '@' both before the domain and after the dot
is_bad = true;
}
auto user_data_begin_ptr = url_begin_ptr - 1;
while (user_data_begin_ptr != begin) {
user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code);
if (!is_user_data_symbol(code)) {
user_data_begin_ptr = next_ptr;
break;
}
}
if (user_data_begin_ptr == url_begin_ptr - 1) {
// the user data is empty
is_bad = true;
}
url_begin_ptr = user_data_begin_ptr;
}
// LOG(ERROR) << "User_data_port_path: " << Slice(url_begin_ptr, url_end_ptr);
if (url_begin_ptr != begin) {
Slice prefix(begin, url_begin_ptr);
if (prefix.size() >= 6 && ends_with(prefix, "://")) {
// the candidate is preceded by "://", so find the protocol located before it
auto protocol_begin_ptr = url_begin_ptr - 3;
while (protocol_begin_ptr != begin) {
protocol_begin_ptr = prev_utf8_unsafe(protocol_begin_ptr);
uint32 code = 0;
auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code);
if (!is_protocol_symbol(code)) {
protocol_begin_ptr = next_ptr;
break;
}
}
// only protocols ending with "http", "https" and "ftp" are accepted, excluding "shttp", "tftp" and "sftp";
// the URL is extended backwards by the length of the matched suffix plus "://"
auto protocol = to_lower(Slice(protocol_begin_ptr, url_begin_ptr - 3));
if (ends_with(protocol, "http") && protocol != "shttp") {
url_begin_ptr = url_begin_ptr - 7;
} else if (ends_with(protocol, "https")) {
url_begin_ptr = url_begin_ptr - 8;
} else if (ends_with(protocol, "ftp") && protocol != "tftp" && protocol != "sftp") {
url_begin_ptr = url_begin_ptr - 6;
} else {
is_bad = true;
}
} else {
// without a protocol the candidate must not directly follow a word character, '/', '#' or '@'
auto prefix_end = prefix.uend();
auto prefix_back = prev_utf8_unsafe(prefix_end);
uint32 code = 0;
next_utf8_unsafe(prefix_back, &code);
if (is_word_character(code) || code == '/' || code == '#' || code == '@') {
is_bad = true;
}
}
}
// LOG(ERROR) << "Full: " << Slice(url_begin_ptr, url_end_ptr) << " " << is_bad;
if (!is_bad) {
// the candidate is added only if it contains something after the found dot
if (url_end_ptr > begin + dot_pos + 1) {
result.emplace_back(url_begin_ptr, url_end_ptr);
}
// skip dots located directly after the candidate
while (url_end_ptr != end && url_end_ptr[0] == '.') {
url_end_ptr++;
}
} else {
// move back to the position right after the last dot of the bad candidate
while (url_end_ptr[-1] != '.') {
url_end_ptr--;
}
}
// always advance past the found dot
if (url_end_ptr <= begin + dot_pos) {
url_end_ptr = begin + dot_pos + 1;
}
// continue the search in the remaining part of the text
str = str.substr(url_end_ptr - begin);
begin = url_end_ptr;
}
return result;
}
// Checks whether the digits contained in str form a valid bank card number: the Luhn checksum
// must be correct and the number of digits must be allowed for the payment system chosen by the prefix.
// Non-digit characters are ignored; the string is expected to contain from 13 to 19 digits.
static bool is_valid_bank_card(Slice str) {
  const size_t MIN_CARD_LENGTH = 13;
  const size_t MAX_CARD_LENGTH = 19;

  // collect the digits
  char digits[MAX_CARD_LENGTH];
  size_t digit_count = 0;
  for (auto c : str) {
    if (!is_digit(c)) {
      continue;
    }
    CHECK(digit_count < MAX_CARD_LENGTH);
    digits[digit_count] = c;
    digit_count++;
  }
  CHECK(digit_count >= MIN_CARD_LENGTH);

  // Luhn algorithm: every second digit, counting from the right, is doubled
  int32 checksum = 0;
  bool is_doubled = false;
  for (size_t pos = digit_count; pos-- > 0;) {
    const int32 digit = digits[pos] - '0';
    if (is_doubled) {
      checksum += (digit < 5 ? 2 * digit : 2 * digit - 9);
    } else {
      checksum += digit;
    }
    is_doubled = !is_doubled;
  }
  if (checksum % 10 != 0) {
    return false;
  }

  const int32 first_digit = digits[0] - '0';
  const int32 first_two_digits = first_digit * 10 + (digits[1] - '0');
  const int32 first_four_digits = first_two_digits * 100 + (digits[2] - '0') * 10 + (digits[3] - '0');

  const bool is_visa = first_digit == 4;
  if (is_visa) {
    return digit_count == 13 || digit_count == 16 || digit_count == 18 || digit_count == 19;
  }

  const bool is_mastercard =
      (51 <= first_two_digits && first_two_digits <= 55) || (2221 <= first_four_digits && first_four_digits <= 2720);
  if (is_mastercard) {
    return digit_count == 16;
  }

  const bool is_american_express = first_two_digits == 34 || first_two_digits == 37;
  if (is_american_express) {
    return digit_count == 15;
  }

  const bool is_union_pay = first_two_digits == 62 || first_two_digits == 81;
  if (is_union_pay) {
    return digit_count >= 16;
  }

  const bool is_mir = 2200 <= first_four_digits && first_four_digits <= 2204;
  if (is_mir) {
    return digit_count == 16;
  }

  return true;  // skip length check
}
// Checks whether the whole string matches the regexp
// /^([a-z0-9_-]{0,26}[.+]){0,10}[a-z0-9_-]{1,35}@(([a-z0-9][a-z0-9_-]{0,28})?[a-z0-9][.]){1,6}[a-z]{2,8}$/i
bool is_email_address(Slice str) {
  Slice userdata;
  Slice domain;
  std::tie(userdata, domain) = split(str, '@');
  if (domain.empty()) {
    return false;
  }

  // the user data consists of parts separated by '.' or '+'
  size_t part_begin = 0;
  size_t userdata_part_count = 0;
  for (size_t pos = 0; pos < userdata.size(); pos++) {
    const auto c = userdata[pos];
    const bool is_separator = c == '.' || c == '+';
    if (is_separator) {
      if (pos - part_begin >= 27) {
        return false;
      }
      userdata_part_count++;
      part_begin = pos + 1;
      continue;
    }
    if (!is_alpha_digit_or_underscore_or_minus(c)) {
      return false;
    }
  }
  // count the last part, which has its own length limits
  userdata_part_count++;
  if (userdata_part_count >= 12) {
    return false;
  }
  const auto last_part_length = userdata.size() - part_begin;
  if (last_part_length == 0 || last_part_length >= 36) {
    return false;
  }

  const vector<Slice> labels = full_split(domain, '.');
  const auto label_count = labels.size();
  if (label_count <= 1 || label_count > 7) {
    return false;
  }

  // the last label must consist of 2-8 letters
  const Slice last_label = labels.back();
  if (last_label.size() <= 1 || last_label.size() >= 9) {
    return false;
  }
  for (auto c : last_label) {
    if (!is_alpha(c)) {
      return false;
    }
  }

  // all other labels must be non-empty, shorter than 31 characters and must begin and end with a letter or a digit
  for (size_t i = 0; i + 1 < label_count; i++) {
    const Slice label = labels[i];
    if (label.empty() || label.size() >= 31) {
      return false;
    }
    for (auto c : label) {
      if (!is_alpha_digit_or_underscore_or_minus(c)) {
        return false;
      }
    }
    if (!is_alpha_digit(label[0]) || !is_alpha_digit(label.back())) {
      return false;
    }
  }
  return true;
}
// Returns true if str is one of the top-level domains from the hard-coded list below.
// A string consisting only of the ASCII letters 'a'-'z' is looked up as is. Any other string is
// lowercased with utf8_to_lower before the lookup, and it is rejected without a lookup if
// lowercasing changed the string while leaving everything after its first character intact.
static bool is_common_tld(Slice str) {
static const FlatHashSet<Slice, SliceHash> tlds(
{"aaa", "aarp", "abb", "abbott", "abbvie", "abc", "able", "abogado", "abudhabi", "ac", "academy", "accenture",
"accountant", "accountants", "aco", "actor", "ad", "ads", "adult", "ae", "aeg", "aero", "aetna", "af", "afl",
"africa", "ag", "agakhan", "agency", "ai", "aig", "airbus", "airforce", "airtel", "akdn", "al", "alibaba",
"alipay", "allfinanz", "allstate", "ally", "alsace", "alstom", "am", "amazon", "americanexpress",
"americanfamily", "amex", "amfam", "amica", "amsterdam", "analytics", "android", "anquan", "anz", "ao", "aol",
"apartments", "app", "apple", "aq", "aquarelle", "ar", "arab", "aramco", "archi", "army", "arpa", "art", "arte",
"as", "asda", "asia", "associates", "at", "athleta", "attorney", "au", "auction", "audi", "audible", "audio",
"auspost", "author", "auto", "autos", "aw", "aws", "ax", "axa", "az", "azure", "ba", "baby", "baidu", "banamex",
"band", "bank", "bar", "barcelona", "barclaycard", "barclays", "barefoot", "bargains", "baseball", "basketball",
"bauhaus", "bayern", "bb", "bbc", "bbt", "bbva", "bcg", "bcn", "bd", "be", "beats", "beauty", "beer", "bentley",
"berlin", "best", "bestbuy", "bet", "bf", "bg", "bh", "bharti", "bi", "bible", "bid", "bike", "bing", "bingo",
"bio", "biz", "bj", "black", "blackfriday", "blockbuster", "blog", "bloomberg", "blue", "bm", "bms", "bmw", "bn",
"bnpparibas", "bo", "boats", "boehringer", "bofa", "bom", "bond", "boo", "book", "booking", "bosch", "bostik",
"boston", "bot", "boutique", "box", "br", "bradesco", "bridgestone", "broadway", "broker", "brother", "brussels",
"bs", "bt", "build", "builders", "business", "buy", "buzz", "bv", "bw", "by", "bz", "bzh", "ca", "cab", "cafe",
"cal", "call", "calvinklein", "cam", "camera", "camp", "canon", "capetown", "capital", "capitalone", "car",
"caravan", "cards", "care", "career", "careers", "cars", "casa", "case", "cash", "casino", "cat", "catering",
"catholic", "cba", "cbn", "cbre", "cc", "cd", "center", "ceo", "cern", "cf", "cfa", "cfd", "cg", "ch", "chanel",
"channel", "charity", "chase", "chat", "cheap", "chintai", "christmas", "chrome", "church", "ci", "cipriani",
"circle", "cisco", "citadel", "citi", "citic", "city", "ck", "cl", "claims", "cleaning", "click", "clinic",
"clinique", "clothing", "cloud", "club", "clubmed", "cm", "cn", "co", "coach", "codes", "coffee", "college",
"cologne", "com", "commbank", "community", "company", "compare", "computer", "comsec", "condos", "construction",
"consulting", "contact", "contractors", "cooking", "cool", "coop", "corsica", "country", "coupon", "coupons",
"courses", "cpa", "cr", "credit", "creditcard", "creditunion", "cricket", "crown", "crs", "cruise", "cruises",
"cu", "cuisinella", "cv", "cw", "cx", "cy", "cymru", "cyou", "cz", "dabur", "dad", "dance", "data", "date",
"dating", "datsun", "day", "dclk", "dds", "de", "deal", "dealer", "deals", "degree", "delivery", "dell",
"deloitte", "delta", "democrat", "dental", "dentist", "desi", "design", "dev", "dhl", "diamonds", "diet",
"digital", "direct", "directory", "discount", "discover", "dish", "diy", "dj", "dk", "dm", "dnp", "do", "docs",
"doctor", "dog", "domains", "dot", "download", "drive", "dtv", "dubai", "dunlop", "dupont", "durban", "dvag",
"dvr", "dz", "earth", "eat", "ec", "eco", "edeka", "edu", "education", "ee", "eg", "email", "emerck", "energy",
"engineer", "engineering", "enterprises", "epson", "equipment", "er", "ericsson", "erni", "es", "esq", "estate",
"et", "eu", "eurovision", "eus", "events", "exchange", "expert", "exposed", "express", "extraspace", "fage",
"fail", "fairwinds", "faith", "family", "fan", "fans", "farm", "farmers", "fashion", "fast", "fedex", "feedback",
"ferrari", "ferrero", "fi", "fidelity", "fido", "film", "final", "finance", "financial", "fire", "firestone",
"firmdale", "fish", "fishing", "fit", "fitness", "fj", "fk", "flickr", "flights", "flir", "florist", "flowers",
"fly", "fm", "fo", "foo", "food", "football", "ford", "forex", "forsale", "forum", "foundation", "fox", "fr",
"free", "fresenius", "frl", "frogans", "frontier", "ftr", "fujitsu", "fun", "fund", "furniture", "futbol", "fyi",
"ga", "gal", "gallery", "gallo", "gallup", "game", "games", "gap", "garden", "gay", "gb", "gbiz", "gd", "gdn",
"ge", "gea", "gent", "genting", "george", "gf", "gg", "ggee", "gh", "gi", "gift", "gifts", "gives", "giving",
"gl", "glass", "gle", "global", "globo", "gm", "gmail", "gmbh", "gmo", "gmx", "gn", "godaddy", "gold",
"goldpoint", "golf", "goo", "goodyear", "goog", "google", "gop", "got", "gov", "gp", "gq", "gr", "grainger",
"graphics", "gratis", "green", "gripe", "grocery", "group", "gs", "gt", "gu", "gucci", "guge", "guide",
"guitars", "guru", "gw", "gy", "hair", "hamburg", "hangout", "haus", "hbo", "hdfc", "hdfcbank", "health",
"healthcare", "help", "helsinki", "here", "hermes", "hiphop", "hisamitsu", "hitachi", "hiv", "hk", "hkt", "hm",
"hn", "hockey", "holdings", "holiday", "homedepot", "homegoods", "homes", "homesense", "honda", "horse",
"hospital", "host", "hosting", "hot", "hotels", "hotmail", "house", "how", "hr", "hsbc", "ht", "hu", "hughes",
"hyatt", "hyundai", "ibm", "icbc", "ice", "icu", "id", "ie", "ieee", "ifm", "ikano", "il", "im", "imamat",
"imdb", "immo", "immobilien", "in", "inc", "industries", "infiniti", "info", "ing", "ink", "institute",
"insurance", "insure", "int", "international", "intuit", "investments", "io", "ipiranga", "iq", "ir", "irish",
"is", "ismaili", "ist", "istanbul", "it", "itau", "itv", "jaguar", "java", "jcb", "je", "jeep", "jetzt",
"jewelry", "jio", "jll", "jm", "jmp", "jnj", "jo", "jobs", "joburg", "jot", "joy", "jp", "jpmorgan", "jprs",
"juegos", "juniper", "kaufen", "kddi", "ke", "kerryhotels", "kerrylogistics", "kerryproperties", "kfh", "kg",
"kh", "ki", "kia", "kids", "kim", "kindle", "kitchen", "kiwi", "km", "kn", "koeln", "komatsu", "kosher", "kp",
"kpmg", "kpn", "kr", "krd", "kred", "kuokgroup", "kw", "ky", "kyoto", "kz", "la", "lacaixa", "lamborghini",
"lamer", "lancaster", "land", "landrover", "lanxess", "lasalle", "lat", "latino", "latrobe", "law", "lawyer",
"lb", "lc", "lds", "lease", "leclerc", "lefrak", "legal", "lego", "lexus", "lgbt", "li", "lidl", "life",
"lifeinsurance", "lifestyle", "lighting", "like", "lilly", "limited", "limo", "lincoln", "link", "lipsy", "live",
"living", "lk", "llc", "llp", "loan", "loans", "locker", "locus", "lol", "london", "lotte", "lotto", "love",
"lpl", "lplfinancial", "lr", "ls", "lt", "ltd", "ltda", "lu", "lundbeck", "luxe", "luxury", "lv", "ly", "ma",
"madrid", "maif", "maison", "makeup", "man", "management", "mango", "map", "market", "marketing", "markets",
"marriott", "marshalls", "mattel", "mba", "mc", "mckinsey", "md", "me", "med", "media", "meet", "melbourne",
"meme", "memorial", "men", "menu", "merckmsd", "mg", "mh", "miami", "microsoft", "mil", "mini", "mint", "mit",
"mitsubishi", "mk", "ml", "mlb", "mls", "mm", "mma", "mn", "mo", "mobi", "mobile", "moda", "moe", "moi", "mom",
"monash", "money", "monster", "mormon", "mortgage", "moscow", "moto", "motorcycles", "mov", "movie", "mp", "mq",
"mr", "ms", "msd", "mt", "mtn", "mtr", "mu", "museum", "music", "mv", "mw", "mx", "my", "mz", "na", "nab",
"nagoya", "name", "navy", "nba", "nc", "ne", "nec", "net", "netbank", "netflix", "network", "neustar", "new",
"news", "next", "nextdirect", "nexus", "nf", "nfl", "ng", "ngo", "nhk", "ni", "nico", "nike", "nikon", "ninja",
"nissan", "nissay", "nl", "no", "nokia", "norton", "now", "nowruz", "nowtv", "np", "nr", "nra", "nrw", "ntt",
"nu", "nyc", "nz", "obi", "observer", "office", "okinawa", "olayan", "olayangroup", "ollo", "om", "omega", "one",
"ong", "onl", "online", "ooo", "open", "oracle", "orange", "org", "organic", "origins", "osaka", "otsuka", "ott",
"ovh", "pa", "page", "panasonic", "paris", "pars", "partners", "parts", "party", "pay", "pccw", "pe", "pet",
"pf", "pfizer", "pg", "ph", "pharmacy", "phd", "philips", "phone", "photo", "photography", "photos", "physio",
"pics", "pictet", "pictures", "pid", "pin", "ping", "pink", "pioneer", "pizza", "pk", "pl", "place", "play",
"playstation", "plumbing", "plus", "pm", "pn", "pnc", "pohl", "poker", "politie", "porn", "post", "pr",
"pramerica", "praxi", "press", "prime", "pro", "prod", "productions", "prof", "progressive", "promo",
"properties", "property", "protection", "pru", "prudential", "ps", "pt", "pub", "pw", "pwc", "py", "qa", "qpon",
"quebec", "quest", "racing", "radio", "re", "read", "realestate", "realtor", "realty", "recipes", "red",
"redstone", "redumbrella", "rehab", "reise", "reisen", "reit", "reliance", "ren", "rent", "rentals", "repair",
"report", "republican", "rest", "restaurant", "review", "reviews", "rexroth", "rich", "richardli", "ricoh",
"ril", "rio", "rip", "ro", "rocks", "rodeo", "rogers", "room", "rs", "rsvp", "ru", "rugby", "ruhr", "run", "rw",
"rwe", "ryukyu", "sa", "saarland", "safe", "safety", "sakura", "sale", "salon", "samsclub", "samsung", "sandvik",
"sandvikcoromant", "sanofi", "sap", "sarl", "sas", "save", "saxo", "sb", "sbi", "sbs", "sc", "scb", "schaeffler",
"schmidt", "scholarships", "school", "schule", "schwarz", "science", "scot", "sd", "se", "search", "seat",
"secure", "security", "seek", "select", "sener", "services", "seven", "sew", "sex", "sexy", "sfr", "sg", "sh",
"shangrila", "sharp", "shaw", "shell", "shia", "shiksha", "shoes", "shop", "shopping", "shouji", "show", "si",
"silk", "sina", "singles", "site", "sj", "sk", "ski", "skin", "sky", "skype", "sl", "sling", "sm", "smart",
"smile", "sn", "sncf", "so", "soccer", "social", "softbank", "software", "sohu", "solar", "solutions", "song",
"sony", "soy", "spa", "space", "sport", "spot", "sr", "srl", "ss", "st", "stada", "staples", "star", "statebank",
"statefarm", "stc", "stcgroup", "stockholm", "storage", "store", "stream", "studio", "study", "style", "su",
"sucks", "supplies", "supply", "support", "surf", "surgery", "suzuki", "sv", "swatch", "swiss", "sx", "sy",
"sydney", "systems", "sz", "tab", "taipei", "talk", "taobao", "target", "tatamotors", "tatar", "tattoo", "tax",
"taxi", "tc", "tci", "td", "tdk", "team", "tech", "technology", "tel", "temasek", "tennis", "teva", "tf", "tg",
"th", "thd", "theater", "theatre", "tiaa", "tickets", "tienda", "tips", "tires", "tirol", "tj", "tjmaxx", "tjx",
"tk", "tkmaxx", "tl", "tm", "tmall", "tn", "to", "today", "tokyo", "tools", "top", "toray", "toshiba", "total",
"tours", "town", "toyota", "toys", "tr", "trade", "trading", "training", "travel", "travelers",
"travelersinsurance", "trust", "trv", "tt", "tube", "tui", "tunes", "tushu", "tv", "tvs", "tw", "tz", "ua",
"ubank", "ubs", "ug", "uk", "unicom", "university", "uno", "uol", "ups", "us", "uy", "uz", "va", "vacations",
"vana", "vanguard", "vc", "ve", "vegas", "ventures", "verisign", "versicherung", "vet", "vg", "vi", "viajes",
"video", "vig", "viking", "villas", "vin", "vip", "virgin", "visa", "vision", "viva", "vivo", "vlaanderen", "vn",
"vodka", "volvo", "vote", "voting", "voto", "voyage", "vu", "wales", "walmart", "walter", "wang", "wanggou",
"watch", "watches", "weather", "weatherchannel", "webcam", "weber", "website", "wed", "wedding", "weibo", "weir",
"wf", "whoswho", "wien", "wiki", "williamhill", "win", "windows", "wine", "winners", "wme", "wolterskluwer",
"woodside", "work", "works", "world", "wow", "ws", "wtc", "wtf", "xbox", "xerox", "xihuan", "xin", "कॉम",
"セール", "佛山", "ಭಾರತ", "慈善", "集团", "在线", "한국", "ଭାରତ", "点看", "คอม", "ভাৰত", "ভারত", "八卦", "ישראל",
"موقع", "বাংলা", "公益", "公司", "香格里拉", "网站", "移动", "我爱你", "москва", "қаз", "католик", "онлайн",
"сайт", "联通", "срб", "бг", "бел", "קום", "时尚", "微博", "淡马锡", "ファッション", "орг", "नेट", "ストア",
"アマゾン", "삼성", "சிங்கப்பூர்", "商标", "商店", "商城", "дети", "мкд", "ею", "ポイント", "新闻", "家電", "كوم",
"中文网", "中信", "中国", "中國", "娱乐", "谷歌", "భారత్", "ලංකා", "電訊盈科", "购物", "クラウド", "ભારત", "通販",
"भारतम्", "भारत", "भारोत", "网店", "संगठन", "餐厅", "网络", "ком", "укр", "香港", "亚马逊", "食品", "飞利浦",
"台湾", "台灣", "手机", "мон", "الجزائر", "عمان", "ارامكو", "ایران", "العليان", "امارات", "بازار", "موريتانيا",
"پاکستان", "الاردن", "بارت", "بھارت", "المغرب", "ابوظبي", "البحرين", "السعودية", "ڀارت", "كاثوليك", "سودان",
"همراه", "عراق", "مليسيا", "澳門", "닷컴", "政府", "شبكة", "بيتك", "عرب", "გე", "机构", "组织机构", "健康",
"ไทย", "سورية", "招聘", "рус", "рф", "تونس", "大拿", "ລາວ", "みんな", "グーグル", "ευ", "ελ", "世界", "書籍",
"ഭാരതം", "ਭਾਰਤ", "网址", "닷넷", "コム", "天主教", "游戏", "vermögensberater", "vermögensberatung", "企业",
"信息", "嘉里大酒店", "嘉里", "مصر", "قطر", "广东", "இலங்கை", "இந்தியா", "հայ", "新加坡", "فلسطين", "政务", "xxx",
"xyz", "yachts", "yahoo", "yamaxun", "yandex", "ye", "yodobashi", "yoga", "yokohama", "you", "youtube", "yt",
"yun", "za", "zappos", "zara", "zero", "zip", "zm", "zone", "zuerich",
// comment for clang-format to prevent it from placing all strings on separate lines
"zw"});
// check whether str consists only of the ASCII letters 'a'-'z'
bool is_lower = true;
for (auto c : str) {
// the unsigned comparison is true for every c outside of the range 'a'..'z'
if (static_cast<uint32>(c - 'a') > 'z' - 'a') {
is_lower = false;
break;
}
}
if (is_lower) {
// fast path
return tlds.count(str) > 0;
}
string str_lower = utf8_to_lower(str);
// reject strings, in which lowercasing changed something, but left everything after the first character intact
if (str_lower != str && utf8_substr(Slice(str_lower), 1) == utf8_substr(str, 1)) {
return false;
}
return tlds.count(str_lower) > 0;
}
// Validates a URL candidate and trims unwanted characters from its end.
// Returns the URL as a prefix of str, possibly shortened at the end of its path,
// or an empty Slice if the candidate must not be treated as a URL.
static Slice fix_url(Slice str) {
auto full_url = str;
// skip a leading "http://", "https://" or "ftp://" (compared case-insensitively);
// full_url still contains the protocol
bool has_protocol = false;
auto str_begin = to_lower(str.substr(0, 8));
if (begins_with(str_begin, "http://") || begins_with(str_begin, "https://") || begins_with(str_begin, "ftp://")) {
auto pos = str.find(':');
str = str.substr(pos + 3);
has_protocol = true;
}
// the domain ends at the first '/', '?' or '#'; everything after it is called path below
auto domain_end = std::min({str.size(), str.find('/'), str.find('?'), str.find('#')}); // TODO server: str.find('#')
auto domain = str.substr(0, domain_end);
auto path = str.substr(domain_end);
// drop everything up to and including the first '@' from the domain
auto at_pos = domain.find('@');
if (at_pos < domain.size()) {
domain.remove_prefix(at_pos + 1);
}
// drop everything beginning from the last ':' from the domain
// NOTE(review): presumably truncate is a no-op when there is no ':' in the domain - confirm
domain.truncate(domain.rfind(':'));
// explicitly reject the domain "teiegram.org" in any letter case
if (domain.size() == 12 && (domain[0] == 't' || domain[0] == 'T')) {
string domain_lower = domain.str();
to_lower_inplace(domain_lower);
if (domain_lower == "teiegram.org") {
return Slice();
}
}
// scan the path and stop before the first closing bracket that has no matching opening bracket;
// balance holds the number of currently unclosed '(', '[' and '{' respectively
int32 balance[3] = {0, 0, 0};
size_t path_pos;
for (path_pos = 0; path_pos < path.size(); path_pos++) {
switch (path[path_pos]) {
case '(':
balance[0]++;
break;
case '[':
balance[1]++;
break;
case '{':
balance[2]++;
break;
case ')':
balance[0]--;
break;
case ']':
balance[1]--;
break;
case '}':
balance[2]--;
break;
}
if (balance[0] < 0 || balance[1] < 0 || balance[2] < 0) {
break;
}
}
// additionally drop the listed punctuation characters from the end of the kept part of the path
Slice bad_path_end_chars(".:;,('?!`");
while (path_pos > 0 && bad_path_end_chars.find(path[path_pos - 1]) < bad_path_end_chars.size()) {
path_pos--;
}
// remove the dropped tail of the path from the result
full_url.remove_suffix(path.size() - path_pos);
// validate the domain parts separated by '.'
// prev is the index of the first character of the current part; after the loop it points to the last part
size_t prev = 0;
size_t domain_part_count = 0;
bool has_non_digit = false;
bool is_ipv4 = true;
for (size_t i = 0; i <= domain.size(); i++) {
if (i == domain.size() || domain[i] == '.') {
// end of a part: it must contain from 1 to 63 bytes and must not end with '-'
auto part_size = i - prev;
if (part_size == 0 || part_size >= 64 || domain[i - 1] == '-') {
return Slice();
}
if (is_ipv4) {
// is_ipv4 is still true only if all characters seen so far are digits or '.';
// a part of an IPv4 address has at most 3 digits, is not greater than 255 and has no leading zero
if (part_size > 3) {
is_ipv4 = false;
}
if (part_size == 3 &&
(domain[prev] >= '3' || (domain[prev] == '2' && (domain[prev + 1] >= '6' ||
(domain[prev + 1] == '5' && domain[prev + 2] >= '6'))))) {
is_ipv4 = false;
}
if (domain[prev] == '0' && part_size >= 2) {
is_ipv4 = false;
}
}
domain_part_count++;
if (i != domain.size()) {
prev = i + 1;
}
} else if (!is_digit(domain[i])) {
is_ipv4 = false;
has_non_digit = true;
}
}
// a domain consisting of a single part is never accepted
if (domain_part_count == 1) {
return Slice();
}
// 4 valid numeric parts are accepted as an IPv4 address
if (is_ipv4 && domain_part_count == 4) {
return full_url;
}
// any other domain consisting only of digits and dots is rejected
if (!has_non_digit) {
return Slice();
}
// the last part of the domain is the top-level domain; it must have at least 2 characters
auto tld = domain.substr(prev);
if (utf8_length(tld) <= 1) {
return Slice();
}
if (begins_with(tld, "xn--")) {
// a top-level domain with the prefix "xn--" must have at least 2 more characters, all of them letters or digits
if (tld.size() <= 5) {
return Slice();
}
for (auto c : tld.substr(4)) {
if (!is_alpha_digit(c)) {
return Slice();
}
}
} else {
// any other top-level domain can't contain '_' or '-' and
// must be from the list of common top-level domains if the URL has no explicit protocol
if (tld.find('_') < tld.size()) {
return Slice();
}
if (tld.find('-') < tld.size()) {
return Slice();
}
if (!has_protocol && !is_common_tld(tld)) {
return Slice();
}
}
// prev points to the first character of the top-level domain; step back to the preceding '.'
// and check that the domain part right before the top-level domain doesn't contain '_'
CHECK(prev > 0);
prev--;
while (prev-- > 0) {
if (domain[prev] == '_') {
return Slice();
} else if (domain[prev] == '.') {
break;
}
}
return full_url;
}
// Returns the set of usernames, which find_mentions accepts despite being shorter than 4 characters.
// The set is created once on the first call.
const FlatHashSet<Slice, SliceHash> &get_valid_short_usernames() {
  static const FlatHashSet<Slice, SliceHash> short_usernames{"gif", "vid", "pic"};
  return short_usernames;
}
// Returns all mentions found in str by match_mentions, except those whose username, i.e. the mention
// without its first character, is shorter than 4 characters and isn't a known valid short username.
vector<Slice> find_mentions(Slice str) {
  const auto is_invalid_mention = [](Slice mention) {
    // skip the leading character of the mention
    mention.remove_prefix(1);
    if (mention.size() > 3) {
      return false;
    }
    // short usernames are compared case-insensitively
    const auto lowered_username = to_lower(mention);
    return get_valid_short_usernames().count(lowered_username) == 0;
  };

  auto result = match_mentions(str);
  td::remove_if(result, is_invalid_mention);
  return result;
}
// Returns all bot commands found in str; forwards to match_bot_commands without additional filtering.
vector<Slice> find_bot_commands(Slice str) {
return match_bot_commands(str);
}
// Returns all hashtags found in str; forwards to match_hashtags without additional filtering.
vector<Slice> find_hashtags(Slice str) {
return match_hashtags(str);
}
// Returns all cashtags found in str; forwards to match_cashtags without additional filtering.
vector<Slice> find_cashtags(Slice str) {
return match_cashtags(str);
}
// Returns the bank card number candidates found in str by match_bank_card_numbers,
// which are accepted by is_valid_bank_card, in the order they were found.
vector<Slice> find_bank_card_numbers(Slice str) {
  vector<Slice> valid_bank_cards;
  for (auto candidate : match_bank_card_numbers(str)) {
    if (!is_valid_bank_card(candidate)) {
      continue;
    }
    valid_bank_cards.emplace_back(candidate);
  }
  return valid_bank_cards;
}
// Returns all URLs found in str by match_tg_urls without additional filtering.
vector<Slice> find_tg_urls(Slice str) {
return match_tg_urls(str);
}
// Finds URLs and email addresses in str.
// Returns pairs (found text, is_email_address). For a candidate of the form "mailto:<email address>"
// only the email address itself is returned. All other candidates are passed through fix_url and
// are dropped if it returns an empty Slice.
vector<std::pair<Slice, bool>> find_urls(Slice str) {
  vector<std::pair<Slice, bool>> found;
  for (auto candidate : match_urls(str)) {
    if (is_email_address(candidate)) {
      found.emplace_back(candidate, true);
      continue;
    }
    if (begins_with(candidate, "mailto:")) {
      auto address = candidate.substr(7);
      if (is_email_address(address)) {
        found.emplace_back(address, true);
        continue;
      }
    }

    auto fixed_url = fix_url(candidate);
    if (!fixed_url.empty()) {
      found.emplace_back(fixed_url, false);
    }
  }
  return found;
}
// Finds media timestamps of the form "m:ss" or "h:mm:ss" in str.
// Returns pairs (found text, timestamp in seconds). The seconds part must consist of exactly 2 characters
// and be less than 60. In the 2-part form minutes have 1-4 characters; in the 3-part form hours and minutes
// have 1-2 characters each and minutes must be less than 60.
vector<std::pair<Slice, int32>> find_media_timestamps(Slice str) {
  vector<std::pair<Slice, int32>> timestamps;
  for (auto candidate : match_media_timestamps(str)) {
    const vector<Slice> parts = full_split(candidate, ':');
    CHECK(parts.size() >= 2);
    if (parts.size() > 3) {
      continue;
    }

    const Slice seconds_part = parts.back();
    if (seconds_part.size() != 2) {
      continue;
    }
    const auto seconds = to_integer<int32>(seconds_part);
    if (seconds >= 60) {
      continue;
    }

    if (parts.size() == 2) {
      // minutes:seconds
      const Slice minutes_part = parts[0];
      if (minutes_part.empty() || minutes_part.size() > 4) {
        continue;
      }
      const auto minutes = to_integer<int32>(minutes_part);
      timestamps.emplace_back(candidate, minutes * 60 + seconds);
      continue;
    }

    // hours:minutes:seconds
    const Slice hours_part = parts[0];
    const Slice minutes_part = parts[1];
    if (hours_part.empty() || hours_part.size() > 2 || minutes_part.empty() || minutes_part.size() > 2) {
      continue;
    }
    const auto minutes = to_integer<int32>(minutes_part);
    if (minutes >= 60) {
      continue;
    }
    const auto hours = to_integer<int32>(hours_part);
    timestamps.emplace_back(candidate, hours * 3600 + minutes * 60 + seconds);
  }
  return timestamps;
}
// Removes entities with a non-positive length, TextUrl entities with an empty argument,
// MentionName entities with an invalid user identifier and
// CustomEmoji entities with an invalid custom emoji identifier.
void remove_empty_entities(vector<MessageEntity> &entities) {
  const auto is_empty_entity = [](const MessageEntity &entity) {
    if (entity.length <= 0) {
      return true;
    }
    if (entity.type == MessageEntity::Type::TextUrl) {
      return entity.argument.empty();
    }
    if (entity.type == MessageEntity::Type::MentionName) {
      return !entity.user_id.is_valid();
    }
    if (entity.type == MessageEntity::Type::CustomEmoji) {
      return !entity.custom_emoji_id.is_valid();
    }
    return false;
  };
  td::remove_if(entities, is_empty_entity);
}
// Returns the length of the text as computed by utf8_utf16_length, narrowed to int32.
static int32 text_length(Slice text) {
return narrow_cast<int32>(utf8_utf16_length(text));
}
// Sorts the entities using their operator<; does nothing if they are already sorted.
static void sort_entities(vector<MessageEntity> &entities) {
  if (!std::is_sorted(entities.begin(), entities.end())) {
    std::sort(entities.begin(), entities.end());
  }
}
// Checks that the entities are sorted according to their operator<.
// The macro passes the line of the call site, which is added to the log message together with
// the entities if the check fails.
#define check_is_sorted(entities) check_is_sorted_impl((entities), __LINE__)
static void check_is_sorted_impl(const vector<MessageEntity> &entities, int line) {
LOG_CHECK(std::is_sorted(entities.begin(), entities.end())) << line << " " << entities;
}
// Checks that each entity ends not later than the next entity in the list begins.
// The macro passes the line of the call site, which is added to the log message together with
// the entities if the check fails.
#define check_non_intersecting(entities) check_non_intersecting_impl((entities), __LINE__)
static void check_non_intersecting_impl(const vector<MessageEntity> &entities, int line) {
// i + 1 < size() instead of i < size() - 1 to avoid unsigned underflow for an empty list
for (size_t i = 0; i + 1 < entities.size(); i++) {
LOG_CHECK(entities[i].offset + entities[i].length <= entities[i + 1].offset) << line << " " << entities;
}
}
// Returns a bit mask with the only bit set, whose index is the numeric value of the entity type.
static constexpr int32 get_entity_type_mask(MessageEntity::Type type) {
return 1 << static_cast<int32>(type);
}
// Returns the bit mask of the entity types treated as splittable:
// Bold, Italic, Underline, Strikethrough and Spoiler.
static constexpr int32 get_splittable_entities_mask() {
return get_entity_type_mask(MessageEntity::Type::Bold) | get_entity_type_mask(MessageEntity::Type::Italic) |
get_entity_type_mask(MessageEntity::Type::Underline) |
get_entity_type_mask(MessageEntity::Type::Strikethrough) | get_entity_type_mask(MessageEntity::Type::Spoiler);
}
// Returns the bit mask of the block quote entity types: BlockQuote and ExpandableBlockQuote.
static constexpr int32 get_blockquote_entities_mask() {
return get_entity_type_mask(MessageEntity::Type::BlockQuote) |
get_entity_type_mask(MessageEntity::Type::ExpandableBlockQuote);
}
// Returns the bit mask of the entity types treated as continuous: Mention, Hashtag, BotCommand, Url,
// EmailAddress, TextUrl, MentionName, Cashtag, PhoneNumber, BankCardNumber, MediaTimestamp and CustomEmoji.
static constexpr int32 get_continuous_entities_mask() {
return get_entity_type_mask(MessageEntity::Type::Mention) | get_entity_type_mask(MessageEntity::Type::Hashtag) |
get_entity_type_mask(MessageEntity::Type::BotCommand) | get_entity_type_mask(MessageEntity::Type::Url) |
get_entity_type_mask(MessageEntity::Type::EmailAddress) | get_entity_type_mask(MessageEntity::Type::TextUrl) |
get_entity_type_mask(MessageEntity::Type::MentionName) | get_entity_type_mask(MessageEntity::Type::Cashtag) |
get_entity_type_mask(MessageEntity::Type::PhoneNumber) |
get_entity_type_mask(MessageEntity::Type::BankCardNumber) |
get_entity_type_mask(MessageEntity::Type::MediaTimestamp) |
get_entity_type_mask(MessageEntity::Type::CustomEmoji);
}
// Returns the bit mask of the preformatted entity types: Pre, Code and PreCode.
static constexpr int32 get_pre_entities_mask() {
return get_entity_type_mask(MessageEntity::Type::Pre) | get_entity_type_mask(MessageEntity::Type::Code) |
get_entity_type_mask(MessageEntity::Type::PreCode);
}
// Returns the bit mask of the entity types treated as user entities: all splittable, block quote and
// preformatted entity types, and also TextUrl, MentionName and CustomEmoji.
static constexpr int32 get_user_entities_mask() {
return get_splittable_entities_mask() | get_blockquote_entities_mask() |
get_entity_type_mask(MessageEntity::Type::TextUrl) | get_entity_type_mask(MessageEntity::Type::MentionName) |
get_entity_type_mask(MessageEntity::Type::CustomEmoji) | get_pre_entities_mask();
}
// Returns a non-zero value if the type is contained in get_splittable_entities_mask(), and 0 otherwise.
// NOTE(review): the result is a boolean stored in int32; bool looks like the intended return type - confirm
static int32 is_splittable_entity(MessageEntity::Type type) {
return (get_entity_type_mask(type) & get_splittable_entities_mask()) != 0;
}
// Returns a non-zero value if the type is contained in get_blockquote_entities_mask(), and 0 otherwise.
// NOTE(review): the result is a boolean stored in int32; bool looks like the intended return type - confirm
static int32 is_blockquote_entity(MessageEntity::Type type) {
return (get_entity_type_mask(type) & get_blockquote_entities_mask()) != 0;
}
// Returns a non-zero value if the type is contained in get_continuous_entities_mask(), and 0 otherwise.
// NOTE(review): the result is a boolean stored in int32; bool looks like the intended return type - confirm
static int32 is_continuous_entity(MessageEntity::Type type) {
return (get_entity_type_mask(type) & get_continuous_entities_mask()) != 0;
}
// Returns a non-zero value if the type is contained in get_pre_entities_mask(), and 0 otherwise.
// NOTE(review): the result is a boolean stored in int32; bool looks like the intended return type - confirm
static int32 is_pre_entity(MessageEntity::Type type) {
return (get_entity_type_mask(type) & get_pre_entities_mask()) != 0;
}
// Returns a non-zero value if the type is contained in get_user_entities_mask(), and 0 otherwise.
// NOTE(review): the result is a boolean stored in int32; bool looks like the intended return type - confirm
static int32 is_user_entity(MessageEntity::Type type) {
return (get_entity_type_mask(type) & get_user_entities_mask()) != 0;
}
// Number of splittable entity types, i.e. the number of different values returned by
// get_splittable_entity_type_index.
static constexpr size_t SPLITTABLE_ENTITY_TYPE_COUNT = 5;

// Maps a splittable entity type to its index in the range [0, SPLITTABLE_ENTITY_TYPE_COUNT):
// Bold -> 0, Italic -> 1, Underline -> 2, Strikethrough -> 3, Spoiler -> 4.
// Must be called only for splittable entity types. The mapping is written explicitly, so that it
// doesn't depend on the declaration order of MessageEntity::Type and a call with any other type
// fails loudly instead of silently returning an out-of-range index.
static size_t get_splittable_entity_type_index(MessageEntity::Type type) {
  switch (type) {
    case MessageEntity::Type::Bold:
      return 0;
    case MessageEntity::Type::Italic:
      return 1;
    case MessageEntity::Type::Underline:
      return 2;
    case MessageEntity::Type::Strikethrough:
      return 3;
    case MessageEntity::Type::Spoiler:
      return 4;
    default:
      // not a splittable entity type
      UNREACHABLE();
      return 0;
  }
}
// Checks whether the sorted list of entities needs no further fixing.
// Returns false as soon as an entity is found, which intersects a previous entity without being nested
// into it, violates one of the nesting rules checked below, or can be merged with a previous splittable
// entity of the same type. Returns true otherwise, in particular for an empty list.
static bool are_entities_valid(const vector<MessageEntity> &entities) {
if (entities.empty()) {
return true;
}
check_is_sorted(entities);
// end_pos[index] is the end of the last seen splittable entity with the given type index, or -1
int32 end_pos[SPLITTABLE_ENTITY_TYPE_COUNT];
std::fill_n(end_pos, SPLITTABLE_ENTITY_TYPE_COUNT, -1);
// stack of the entities containing the current position, the innermost entity is at the back
vector<const MessageEntity *> nested_entities_stack;
// union of the type masks of the entities in the stack; the types in the stack are always different,
// so the mask can be maintained with += and -=
int32 nested_entity_type_mask = 0;
for (auto &entity : entities) {
while (!nested_entities_stack.empty() &&
entity.offset >= nested_entities_stack.back()->offset + nested_entities_stack.back()->length) {
// remove non-intersecting entities from the stack
nested_entity_type_mask -= get_entity_type_mask(nested_entities_stack.back()->type);
nested_entities_stack.pop_back();
}
if (!nested_entities_stack.empty()) {
if (entity.offset + entity.length > nested_entities_stack.back()->offset + nested_entities_stack.back()->length) {
// entity intersects some previous entity
return false;
}
if ((nested_entity_type_mask & get_entity_type_mask(entity.type)) != 0) {
// entity has the same type as one of the previous nested
return false;
}
auto parent_type = nested_entities_stack.back()->type;
if (is_pre_entity(parent_type)) {
// Pre and Code can't contain nested entities
return false;
}
// parents are not pre after this point
if (is_pre_entity(entity.type) && (nested_entity_type_mask & ~get_blockquote_entities_mask()) != 0) {
// Pre and Code can't be part of other entities, except blockquote
return false;
}
if ((is_continuous_entity(entity.type) || is_blockquote_entity(entity.type)) &&
(nested_entity_type_mask & get_continuous_entities_mask()) != 0) {
// continuous and blockquote can't be part of other continuous entity
return false;
}
if (is_blockquote_entity(entity.type) && (nested_entity_type_mask & get_blockquote_entities_mask()) != 0) {
// blockquote entities can't be nested
return false;
}
if ((nested_entity_type_mask & get_splittable_entities_mask()) != 0) {
// the previous nested entity may be needed to split for consistency
// alternatively, better entity merging needs to be implemented
return false;
}
}
if (is_splittable_entity(entity.type)) {
auto index = get_splittable_entity_type_index(entity.type);
// a splittable entity, which begins not later than the previous entity of the same type ends
if (end_pos[index] >= entity.offset) {
// the entities can be merged
return false;
}
end_pos[index] = entity.offset + entity.length;
}
nested_entities_stack.push_back(&entity);
nested_entity_type_mask += get_entity_type_mask(entity.type);
}
return true;
}
// Removes all intersecting entities, including nested ones.
// The entities must be sorted and have a positive length. An entity is kept only if it begins
// not earlier than the last kept entity ends; the relative order of the kept entities is preserved.
static void remove_intersecting_entities(vector<MessageEntity> &entities) {
  check_is_sorted(entities);
  size_t kept_count = 0;
  int32 covered_end = 0;  // end of the last kept entity
  for (size_t i = 0; i < entities.size(); i++) {
    CHECK(entities[i].length > 0);
    auto &candidate = entities[i];
    if (candidate.offset < covered_end) {
      // the entity begins inside of the last kept entity
      continue;
    }
    covered_end = candidate.offset + candidate.length;
    if (kept_count != i) {
      entities[kept_count] = std::move(candidate);
    }
    kept_count++;
  }
  entities.erase(entities.begin() + kept_count, entities.end());
}
// Removes from entities all entities which begin before a block quote and end inside or after it,
// or begin inside a block quote and end after it.
// entities and blockquote_entities must be pre-sorted and non-overlapping;
// elements of blockquote_entities which aren't block quotes are ignored.
static void remove_entities_intersecting_blockquote(vector<MessageEntity> &entities,
                                                    const vector<MessageEntity> &blockquote_entities) {
  check_non_intersecting(entities);
  check_non_intersecting(blockquote_entities);
  if (blockquote_entities.empty()) {
    // fast path
    return;
  }

  const auto blockquote_end = blockquote_entities.end();
  auto blockquote_it = blockquote_entities.begin();
  size_t kept_count = 0;
  for (size_t i = 0; i < entities.size(); i++) {
    auto &entity = entities[i];
    const auto entity_begin = entity.offset;
    const auto entity_end = entity.offset + entity.length;

    // find the first block quote, which ends after the beginning of the entity
    while (blockquote_it != blockquote_end) {
      if (is_blockquote_entity(blockquote_it->type) &&
          blockquote_it->offset + blockquote_it->length > entity_begin) {
        break;
      }
      ++blockquote_it;
    }

    if (blockquote_it != blockquote_end) {
      const auto quote_begin = blockquote_it->offset;
      const auto quote_end = blockquote_it->offset + blockquote_it->length;
      const bool ends_after_quote = quote_end < entity_end;
      const bool contains_quote_begin = entity_begin < quote_begin && quote_begin < entity_end;
      if (ends_after_quote || contains_quote_begin) {
        // the entity crosses a border of the block quote
        continue;
      }
    }

    if (kept_count != i) {
      entities[kept_count] = std::move(entity);
    }
    kept_count++;
  }
  entities.erase(entities.begin() + kept_count, entities.end());
}
// keeps only non-intersecting entities
// fixes entity offsets from UTF-8 to UTF-16 offsets
// The text is traversed only once: the entities are sorted and non-intersecting after the first two calls,
// so each entity begins not earlier than the previous one ends.
static void fix_entity_offsets(Slice text, vector<MessageEntity> &entities) {
if (entities.empty()) {
return;
}
sort_entities(entities);
remove_intersecting_entities(entities);
const unsigned char *begin = text.ubegin();
// ptr is the current position in the text, utf16_pos is the same position in UTF-16 code units
const unsigned char *ptr = begin;
const unsigned char *end = text.uend();
int32 utf16_pos = 0;
for (auto &entity : entities) {
// number of the entity boundaries (begin and end), which still have to be converted
int cnt = 2;
// remember the byte boundaries before the entity is overwritten with UTF-16 values
auto entity_begin = entity.offset;
auto entity_end = entity.offset + entity.length;
auto pos = static_cast<int32>(ptr - begin);
// the entity can begin exactly at the current position
if (entity_begin == pos) {
cnt--;
entity.offset = utf16_pos;
}
uint32 skipped_code = 0;
// advance by one UTF-8 character at a time until both boundaries are converted
while (ptr != end && cnt > 0) {
unsigned char c = ptr[0];
// a character with the first byte >= 0xf0 takes 2 UTF-16 code units, any other character takes 1
utf16_pos += 1 + (c >= 0xf0);
ptr = next_utf8_unsafe(ptr, &skipped_code);
pos = static_cast<int32>(ptr - begin);
if (entity_begin == pos) {
cnt--;
entity.offset = utf16_pos;
} else if (entity_end == pos) {
cnt--;
entity.length = utf16_pos - entity.offset;
}
}
// both boundaries of the entity must have been reached at character boundaries within the text
CHECK(cnt == 0);
}
}
// Finds mentions, bot commands, hashtags, cashtags, bank card numbers, URLs, email addresses and
// media timestamps in the text. Bot commands and media timestamps are skipped if requested.
// The entities are collected with byte offsets within text and are then passed through
// fix_entity_offsets, which keeps only non-intersecting entities and converts offsets to UTF-16.
vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands, bool skip_media_timestamps) {
  vector<MessageEntity> entities;

  // byte offset and byte length of a part of the text
  const auto get_offset = [&text](Slice part) {
    return narrow_cast<int32>(part.begin() - text.begin());
  };
  const auto get_length = [](Slice part) {
    return narrow_cast<int32>(part.size());
  };
  const auto add_entities = [&](MessageEntity::Type type, vector<Slice> (*find_entities_f)(Slice)) {
    for (auto &part : find_entities_f(text)) {
      entities.emplace_back(type, get_offset(part), get_length(part));
    }
  };

  add_entities(MessageEntity::Type::Mention, find_mentions);
  if (!skip_bot_commands) {
    add_entities(MessageEntity::Type::BotCommand, find_bot_commands);
  }
  add_entities(MessageEntity::Type::Hashtag, find_hashtags);
  add_entities(MessageEntity::Type::Cashtag, find_cashtags);
  // TODO find_phone_numbers
  add_entities(MessageEntity::Type::BankCardNumber, find_bank_card_numbers);
  add_entities(MessageEntity::Type::Url, find_tg_urls);

  // find_urls returns both URLs and email addresses
  for (auto &url : find_urls(text)) {
    const auto type = url.second ? MessageEntity::Type::EmailAddress : MessageEntity::Type::Url;
    entities.emplace_back(type, get_offset(url.first), get_length(url.first));
  }

  if (!skip_media_timestamps) {
    for (auto &media_timestamp : find_media_timestamps(text)) {
      entities.emplace_back(MessageEntity::Type::MediaTimestamp, get_offset(media_timestamp.first),
                            get_length(media_timestamp.first), media_timestamp.second);
    }
  }

  fix_entity_offsets(text, entities);
  return entities;
}
// Finds only media timestamps in the text and returns them as MediaTimestamp entities,
// processed by fix_entity_offsets.
static vector<MessageEntity> find_media_timestamp_entities(Slice text) {
  vector<MessageEntity> result;
  for (auto &media_timestamp : find_media_timestamps(text)) {
    const Slice part = media_timestamp.first;
    const auto byte_offset = narrow_cast<int32>(part.begin() - text.begin());
    const auto byte_length = narrow_cast<int32>(part.size());
    result.emplace_back(MessageEntity::Type::MediaTimestamp, byte_offset, byte_length, media_timestamp.second);
  }
  fix_entity_offsets(text, result);
  return result;
}
// Merges two lists of entities into one list ordered by position.
// All old entities are kept; a new entity is kept only if it ends not later than the next old entity
// begins and begins not earlier than all previous old entities end, i.e. new entities intersecting
// an old entity are dropped.
static vector<MessageEntity> merge_entities(vector<MessageEntity> old_entities, vector<MessageEntity> new_entities) {
  if (new_entities.empty()) {
    return old_entities;
  }
  if (old_entities.empty()) {
    return new_entities;
  }

  vector<MessageEntity> merged;
  merged.reserve(old_entities.size() + new_entities.size());

  const size_t new_count = new_entities.size();
  size_t new_pos = 0;
  for (auto &old_entity : old_entities) {
    const auto old_begin = old_entity.offset;
    const auto old_end = old_entity.offset + old_entity.length;

    // new entities located entirely before the old entity are kept
    for (; new_pos < new_count; new_pos++) {
      auto &new_entity = new_entities[new_pos];
      if (new_entity.offset + new_entity.length > old_begin) {
        break;
      }
      merged.push_back(std::move(new_entity));
    }

    merged.push_back(std::move(old_entity));

    // new entities beginning before the end of the old entity are dropped
    while (new_pos < new_count && new_entities[new_pos].offset < old_end) {
      new_pos++;
    }
  }

  // the remaining new entities are located after all old entities
  for (; new_pos < new_count; new_pos++) {
    merged.push_back(std::move(new_entities[new_pos]));
  }
  return merged;
}
// Returns true if the URL contains none of the characters '/', '?' and '#'.
static bool is_plain_domain(Slice url) {
  const auto contains = [&url](char c) {
    return url.find(c) < url.size();
  };
  return !contains('/') && !contains('?') && !contains('#');
}
// Returns the first suitable URL from the entities of the text, or an empty Slice if there is none.
// For Url entities the URL is the text of the entity; entities of length 4 or less and plain domains
// without '/', '?' and '#' are skipped. For TextUrl entities the URL is the argument of the entity.
// URLs beginning with "ton:", "tg:" or "ftp:" (in any letter case) are always skipped.
Slice get_first_url(const FormattedText &text) {
  const auto has_skipped_scheme = [](Slice url) {
    const string scheme = to_lower(url.substr(0, 4));
    return scheme == "ton:" || begins_with(scheme, "tg:") || scheme == "ftp:";
  };

  for (auto &entity : text.entities) {
    switch (entity.type) {
      case MessageEntity::Type::Url: {
        if (entity.length <= 4) {
          continue;
        }
        // NOTE(review): if utf8_utf16_substr returns an owning string for a string argument,
        // the returned Slice would point to a destroyed local - confirm that it returns a view
        auto url = utf8_utf16_substr(text.text, entity.offset, entity.length);
        if (has_skipped_scheme(url) || is_plain_domain(url)) {
          continue;
        }
        return url;
      }
      case MessageEntity::Type::TextUrl: {
        Slice url = entity.argument;
        if (has_skipped_scheme(url)) {
          continue;
        }
        return url;
      }
      // all other known entity types never contain a URL
      case MessageEntity::Type::Mention:
      case MessageEntity::Type::Hashtag:
      case MessageEntity::Type::BotCommand:
      case MessageEntity::Type::EmailAddress:
      case MessageEntity::Type::Bold:
      case MessageEntity::Type::Italic:
      case MessageEntity::Type::Underline:
      case MessageEntity::Type::Strikethrough:
      case MessageEntity::Type::BlockQuote:
      case MessageEntity::Type::Code:
      case MessageEntity::Type::Pre:
      case MessageEntity::Type::PreCode:
      case MessageEntity::Type::MentionName:
      case MessageEntity::Type::Cashtag:
      case MessageEntity::Type::PhoneNumber:
      case MessageEntity::Type::BankCardNumber:
      case MessageEntity::Type::MediaTimestamp:
      case MessageEntity::Type::Spoiler:
      case MessageEntity::Type::CustomEmoji:
      case MessageEntity::Type::ExpandableBlockQuote:
        break;
      default:
        UNREACHABLE();
    }
  }
  return Slice();
}
bool is_visible_url(const FormattedText &text, const string &url) {
if (url.empty()) {
return false;
}
auto url_size = static_cast<int32>(utf8_utf16_length(url));
auto cur_offset = 0;
Slice left_text = text.text;
for (auto &entity : text.entities) {
if (entity.type == MessageEntity::Type::Url && url_size == entity.length) {
CHECK(entity.offset >= cur_offset);
left_text = utf8_utf16_substr(left_text, entity.offset - cur_offset);
cur_offset = entity.offset;
if (begins_with(left_text, url)) {
return true;
}
}
}
return false;
}
// Parses legacy Markdown markup in the text in place.
// Supported markup: _italic_, *bold*, `code`, ```pre```, ```language pre-code``` and [text](url);
// the characters '_', '*', '`' and '[' can be escaped with a preceding '\\' outside of entities.
// On success the markup is removed from the text, the text is truncated to the resulting size and
// the found entities are returned; entity offsets and lengths are counted in UTF-16 code units of the resulting text.
// Returns an error with code 400 if the end of an entity can't be found; the text can be partially changed in this case.
Result<vector<MessageEntity>> parse_markdown(string &text) {
  // the resulting text is built in place in the first result_size bytes of the text; always result_size <= i
  size_t result_size = 0;
  vector<MessageEntity> entities;
  size_t size = text.size();
  int32 utf16_offset = 0;  // UTF-16 length of the resulting text
  for (size_t i = 0; i < size; i++) {
    auto c = static_cast<unsigned char>(text[i]);
    // text[size] is a valid access to the terminating '\0', so it is safe to look one character ahead
    if (c == '\\' && (text[i + 1] == '_' || text[i + 1] == '*' || text[i + 1] == '`' || text[i + 1] == '[')) {
      // an escaped markup character; keep only the character itself
      i++;
      text[result_size++] = text[i];
      utf16_offset++;
      continue;
    }
    if (c != '_' && c != '*' && c != '`' && c != '[') {
      // an ordinary character
      if (is_utf8_character_first_code_unit(c)) {
        utf16_offset += 1 + (c >= 0xf0);  // >= 4 bytes in symbol => surrogate pair
      }
      text[result_size++] = text[i];
      continue;
    }

    // we are at begin of the entity
    size_t begin_pos = i;
    char end_character = text[i];
    bool is_pre = false;
    if (c == '[') {
      end_character = ']';
    }
    i++;

    string language;
    if (c == '`' && text[i] == '`' && text[i + 1] == '`') {
      i += 2;
      is_pre = true;
      size_t language_end = i;
      while (!is_space(text[language_end]) && text[language_end] != '`') {
        language_end++;
      }
      if (i != language_end && language_end < size && text[language_end] != '`') {
        language.assign(text, i, language_end - i);
        i = language_end;
      }
      // skip one new line in the beginning of the text
      if (text[i] == '\n' || text[i] == '\r') {
        if ((text[i + 1] == '\n' || text[i + 1] == '\r') && text[i] != text[i + 1]) {
          i += 2;
        } else {
          i++;
        }
      }
    }

    int32 entity_offset = utf16_offset;
    // byte offset of the entity text in the resulting text
    size_t entity_result_begin = result_size;
    // copy the entity text till the end tag; escaping isn't processed inside of entities
    while (i < size && (text[i] != end_character || (is_pre && !(text[i + 1] == '`' && text[i + 2] == '`')))) {
      auto cur_ch = static_cast<unsigned char>(text[i]);
      if (is_utf8_character_first_code_unit(cur_ch)) {
        utf16_offset += 1 + (cur_ch >= 0xf0);  // >= 4 bytes in symbol => surrogate pair
      }
      text[result_size++] = text[i++];
    }
    if (i == size) {
      return Status::Error(400, PSLICE() << "Can't find end of the entity starting at byte offset " << begin_pos);
    }

    // empty entities are ignored
    if (entity_offset != utf16_offset) {
      auto entity_length = utf16_offset - entity_offset;
      switch (c) {
        case '_':
          entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
          break;
        case '*':
          entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
          break;
        case '[': {
          string url;
          if (text[i + 1] != '(') {
            // use text as a URL
            // the entity text must be taken from the resulting text, because the bytes of the source text
            // beginning from begin_pos + 1 could have been already overwritten while the entity text was copied
            url.assign(text, entity_result_begin, result_size - entity_result_begin);
          } else {
            i += 2;
            // a missing ')' isn't an error; the URL lasts till the end of the text in this case
            while (i < size && text[i] != ')') {
              url.push_back(text[i++]);
            }
          }
          auto user_id = LinkManager::get_link_user_id(url);
          if (user_id.is_valid()) {
            entities.emplace_back(entity_offset, entity_length, user_id);
          } else {
            url = LinkManager::get_checked_link(url);
            if (!url.empty()) {
              entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, std::move(url));
            }
          }
          break;
        }
        case '`':
          if (is_pre) {
            if (language.empty()) {
              entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
            } else {
              entities.emplace_back(MessageEntity::Type::PreCode, entity_offset, entity_length, language);
            }
          } else {
            entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
          }
          break;
        default:
          UNREACHABLE();
      }
    }
    if (is_pre) {
      // skip the remaining two characters of the end tag
      i += 2;
    }
  }
  text.resize(result_size);
  return std::move(entities);
}
// Parses MarkdownV2 markup in the text in place.
// On success the markup is removed from the text, the text is truncated to the resulting size and
// the found entities are returned sorted; entity offsets and lengths are counted in UTF-16 code units.
// Entities can be nested; any character with code from 1 to 126 can be escaped with a preceding '\\'.
// Returns an error with code 400 if a reserved character isn't escaped or the end of an entity or a URL
// can't be found; an error of LinkManager::get_link_custom_emoji_id is returned as is.
Result<vector<MessageEntity>> parse_markdown_v2(string &text) {
// the resulting text is built in place in the first result_size bytes of the text
size_t result_size = 0;
vector<MessageEntity> entities;
// UTF-16 length of the resulting text
int32 utf16_offset = 0;
// description of an entity, which has been begun, but hasn't been ended yet
struct EntityInfo {
MessageEntity::Type type;
// language of a PreCode entity; empty for other entities
string argument;
// UTF-16 offset of the entity in the resulting text
int32 entity_offset;
// byte offset in the source text, at which the markup of the entity begins
size_t entity_byte_offset;
// byte offset of the entity in the resulting text
size_t entity_begin_pos;
EntityInfo(MessageEntity::Type type, string argument, int32 entity_offset, size_t entity_byte_offset,
size_t entity_begin_pos)
: type(type)
, argument(std::move(argument))
, entity_offset(entity_offset)
, entity_byte_offset(entity_byte_offset)
, entity_begin_pos(entity_begin_pos) {
}
};
// stack of the currently opened entities; the innermost entity is the last
vector<EntityInfo> nested_entities;
// true, if there is an opened BlockQuote entity
bool have_blockquote = false;
// true, if '>' at the current position is treated as a block quote mark;
// set after a new line and reset by ordinary characters other than '\r'
bool can_start_blockquote = true;
for (size_t i = 0; i < text.size(); i++) {
auto c = static_cast<unsigned char>(text[i]);
if (c == '\\' && text[i + 1] > 0 && text[i + 1] <= 126) {
// an escaped character; keep only the character itself
i++;
utf16_offset += 1;
text[result_size++] = text[i];
if (text[i] != '\r') {
can_start_blockquote = (text[i] == '\n');
}
continue;
}
Slice reserved_characters("_*[]()~`>#+-=|{}.!\n");
if (!nested_entities.empty()) {
// only '`' is special inside Code, Pre and PreCode entities
switch (nested_entities.back().type) {
case MessageEntity::Type::Code:
case MessageEntity::Type::Pre:
case MessageEntity::Type::PreCode:
reserved_characters = Slice("`");
break;
default:
break;
}
}
if (reserved_characters.find(text[i]) == Slice::npos) {
// an ordinary character; copy it to the resulting text
if (is_utf8_character_first_code_unit(c)) {
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
if (c != '\r') {
can_start_blockquote = false;
}
}
text[result_size++] = text[i];
continue;
}
// check whether the reserved character ends the innermost opened entity or the opened block quote
bool is_end_of_an_entity = [&] {
if (nested_entities.empty()) {
return false;
}
// a new line, which isn't followed by '>', ends the opened block quote
if (have_blockquote && c == '\n' && (i + 1 == text.size() || text[i + 1] != '>')) {
return true;
}
switch (nested_entities.back().type) {
case MessageEntity::Type::Bold:
return c == '*';
case MessageEntity::Type::Italic:
return c == '_' && text[i + 1] != '_';
case MessageEntity::Type::Code:
return c == '`';
case MessageEntity::Type::Pre:
case MessageEntity::Type::PreCode:
return c == '`' && text[i + 1] == '`' && text[i + 2] == '`';
case MessageEntity::Type::TextUrl:
return c == ']';
case MessageEntity::Type::Underline:
return c == '_' && text[i + 1] == '_';
case MessageEntity::Type::Strikethrough:
return c == '~';
case MessageEntity::Type::Spoiler:
return c == '|' && text[i + 1] == '|';
case MessageEntity::Type::CustomEmoji:
return c == ']';
case MessageEntity::Type::BlockQuote:
return false;
default:
UNREACHABLE();
return false;
}
}();
if (!is_end_of_an_entity) {
// begin of an entity
MessageEntity::Type type;
string argument;
auto entity_byte_offset = i;
switch (c) {
case '_':
if (text[i + 1] == '_') {
type = MessageEntity::Type::Underline;
i++;
} else {
type = MessageEntity::Type::Italic;
}
break;
case '*':
type = MessageEntity::Type::Bold;
break;
case '~':
type = MessageEntity::Type::Strikethrough;
break;
case '|':
if (text[i + 1] == '|') {
i++;
type = MessageEntity::Type::Spoiler;
} else {
return Status::Error(400, PSLICE() << "Character '" << text[i]
<< "' is reserved and must be escaped with the preceding '\\'");
}
break;
case '[':
type = MessageEntity::Type::TextUrl;
break;
case '`':
if (text[i + 1] == '`' && text[i + 2] == '`') {
i += 3;
type = MessageEntity::Type::Pre;
// a non-empty sequence of non-space characters immediately after the begin tag is the language,
// unless the sequence lasts till a '`' or the end of the text
size_t language_end = i;
while (!is_space(text[language_end]) && text[language_end] != '`') {
language_end++;
}
if (i != language_end && language_end < text.size() && text[language_end] != '`') {
type = MessageEntity::Type::PreCode;
argument = text.substr(i, language_end - i);
i = language_end;
}
// skip one new line in the beginning of the text
if (text[i] == '\n' || text[i] == '\r') {
if ((text[i + 1] == '\n' || text[i + 1] == '\r') && text[i] != text[i + 1]) {
i += 2;
} else {
i++;
}
}
// i points to the first character of the entity text now; compensate the increment of the main loop
i--;
} else {
type = MessageEntity::Type::Code;
}
break;
case '!':
if (text[i + 1] == '[') {
i++;
type = MessageEntity::Type::CustomEmoji;
} else {
return Status::Error(400, PSLICE() << "Character '" << text[i]
<< "' is reserved and must be escaped with the preceding '\\'");
}
break;
case '\n':
// the new line is kept in the text; Type::Size is used as a mark that no entity is begun
utf16_offset += 1;
text[result_size++] = '\n';
can_start_blockquote = true;
type = MessageEntity::Type::Size;
break;
case '>':
if (can_start_blockquote) {
if (have_blockquote) {
// the next line of the opened block quote; the character is just skipped
type = MessageEntity::Type::Size;
} else {
type = MessageEntity::Type::BlockQuote;
have_blockquote = true;
}
} else {
return Status::Error(400, PSLICE() << "Character '" << text[i]
<< "' is reserved and must be escaped with the preceding '\\'");
}
break;
default:
return Status::Error(
400, PSLICE() << "Character '" << text[i] << "' is reserved and must be escaped with the preceding '\\'");
}
if (type == MessageEntity::Type::Size) {
continue;
}
nested_entities.emplace_back(type, std::move(argument), utf16_offset, entity_byte_offset, result_size);
} else {
// end of an entity
auto type = nested_entities.back().type;
if (c == '\n' && type != MessageEntity::Type::BlockQuote) {
// the block quote is ended by the new line, but the innermost entity isn't the block quote;
// this is allowed only if the innermost entity is a Spoiler begun by "||" located immediately before
// the new line or before '\r' preceding the new line; the block quote is expandable in this case
if (type != MessageEntity::Type::Spoiler || !(nested_entities.back().entity_byte_offset == i - 2 ||
(nested_entities.back().entity_byte_offset == i - 3 &&
result_size != 0 && text[result_size - 1] == '\r'))) {
return Status::Error(400, PSLICE() << "Can't find end of " << nested_entities.back().type
<< " entity at byte offset " << nested_entities.back().entity_byte_offset);
}
nested_entities.pop_back();
CHECK(!nested_entities.empty());
type = nested_entities.back().type;
if (type != MessageEntity::Type::BlockQuote) {
CHECK(type != MessageEntity::Type::Spoiler);
return Status::Error(400, PSLICE() << "Can't find end of " << nested_entities.back().type
<< " entity at byte offset " << nested_entities.back().entity_byte_offset);
}
type = MessageEntity::Type::ExpandableBlockQuote;
}
auto argument = std::move(nested_entities.back().argument);
UserId user_id;
CustomEmojiId custom_emoji_id;
// empty entities aren't added
bool skip_entity = utf16_offset == nested_entities.back().entity_offset;
switch (type) {
case MessageEntity::Type::Bold:
case MessageEntity::Type::Italic:
case MessageEntity::Type::Code:
case MessageEntity::Type::Strikethrough:
break;
case MessageEntity::Type::Underline:
case MessageEntity::Type::Spoiler:
// skip the second character of the end tag
i++;
break;
case MessageEntity::Type::Pre:
case MessageEntity::Type::PreCode:
// skip the remaining two characters of the end tag
i += 2;
break;
case MessageEntity::Type::TextUrl: {
string url;
if (text[i + 1] != '(') {
// use text as a URL
url = text.substr(nested_entities.back().entity_begin_pos,
result_size - nested_entities.back().entity_begin_pos);
} else {
i += 2;
auto url_begin_pos = i;
// read the URL till the first unescaped ')'
while (i < text.size() && text[i] != ')') {
if (text[i] == '\\' && text[i + 1] > 0 && text[i + 1] <= 126) {
url += text[i + 1];
i += 2;
continue;
}
url += text[i++];
}
if (text[i] != ')') {
return Status::Error(400, PSLICE() << "Can't find end of a URL at byte offset " << url_begin_pos);
}
}
// the link is either a link to a user or an ordinary link; entities with invalid links are skipped
user_id = LinkManager::get_link_user_id(url);
if (!user_id.is_valid()) {
url = LinkManager::get_checked_link(url);
if (url.empty()) {
skip_entity = true;
} else {
argument = std::move(url);
}
}
break;
}
case MessageEntity::Type::CustomEmoji: {
if (text[i + 1] != '(') {
return Status::Error(400, "Custom emoji entity must contain a tg://emoji URL");
}
i += 2;
string url;
auto url_begin_pos = i;
// read the URL till the first unescaped ')'
while (i < text.size() && text[i] != ')') {
if (text[i] == '\\' && text[i + 1] > 0 && text[i + 1] <= 126) {
url += text[i + 1];
i += 2;
continue;
}
url += text[i++];
}
if (text[i] != ')') {
return Status::Error(400, PSLICE()
<< "Can't find end of a custom emoji URL at byte offset " << url_begin_pos);
}
TRY_RESULT_ASSIGN(custom_emoji_id, LinkManager::get_link_custom_emoji_id(url));
break;
}
case MessageEntity::Type::BlockQuote:
case MessageEntity::Type::ExpandableBlockQuote:
// the ending new line is kept in the text and is included in the block quote, which is always added
CHECK(have_blockquote);
have_blockquote = false;
text[result_size++] = text[i];
can_start_blockquote = true;
utf16_offset += 1;
skip_entity = false;
break;
default:
// must never be executed
UNREACHABLE();
return false;
}
if (!skip_entity) {
auto entity_offset = nested_entities.back().entity_offset;
auto entity_length = utf16_offset - entity_offset;
if (user_id.is_valid()) {
entities.emplace_back(entity_offset, entity_length, user_id);
} else if (custom_emoji_id.is_valid()) {
entities.emplace_back(type, entity_offset, entity_length, custom_emoji_id);
} else {
entities.emplace_back(type, entity_offset, entity_length, std::move(argument));
}
}
nested_entities.pop_back();
}
}
if (have_blockquote) {
// the text has ended inside of a block quote
CHECK(!nested_entities.empty());
auto type = MessageEntity::Type::BlockQuote;
// a Spoiler begun by "||" at the very end of the text marks the block quote as expandable
if (nested_entities.back().type == MessageEntity::Type::Spoiler &&
nested_entities.back().entity_byte_offset == text.size() - 2) {
nested_entities.pop_back();
CHECK(!nested_entities.empty());
type = MessageEntity::Type::ExpandableBlockQuote;
}
if (nested_entities.back().type == MessageEntity::Type::BlockQuote) {
have_blockquote = false;
auto entity_offset = nested_entities.back().entity_offset;
auto entity_length = utf16_offset - entity_offset;
if (entity_length != 0) {
entities.emplace_back(type, entity_offset, entity_length);
}
nested_entities.pop_back();
}
}
if (!nested_entities.empty()) {
// some entity has been begun, but hasn't been ended
return Status::Error(400, PSLICE() << "Can't find end of " << nested_entities.back().type
<< " entity at byte offset " << nested_entities.back().entity_byte_offset);
}
sort_entities(entities);
text.resize(result_size);
return std::move(entities);
}
// Finds all [text](url) constructions in the text, which have a non-empty text and
// a URL accepted by LinkManager::get_checked_link.
// Two consecutive slices are returned for each found construction: "[text]" and "(url)", both with the brackets.
static vector<Slice> find_text_url_entities_v3(Slice text) {
  vector<Slice> result;
  const size_t size = text.size();
  size_t pos = 0;
  while (pos < size) {
    if (text[pos] != '[') {
      pos++;
      continue;
    }

    const size_t open_bracket_pos = pos;
    size_t close_bracket_pos = open_bracket_pos + 1;
    while (close_bracket_pos < size && text[close_bracket_pos] != ']') {
      close_bracket_pos++;
    }
    // the scanned characters are never processed again to prevent quadratic asymptotic
    pos = close_bracket_pos + 1;
    if (close_bracket_pos == size || close_bracket_pos == open_bracket_pos + 1) {
      // there is no closing bracket or the text is empty
      continue;
    }

    const size_t open_parenthesis_pos = close_bracket_pos + 1;
    if (open_parenthesis_pos == size || text[open_parenthesis_pos] != '(') {
      continue;
    }
    size_t close_parenthesis_pos = open_parenthesis_pos + 1;
    while (close_parenthesis_pos < size && text[close_parenthesis_pos] != ')') {
      close_parenthesis_pos++;
    }
    // prevent quadratic asymptotic, disallows [a](b[c](t.me)
    pos = close_parenthesis_pos + 1;
    if (close_parenthesis_pos == size) {
      continue;
    }

    Slice url = text.substr(open_parenthesis_pos + 1, close_parenthesis_pos - open_parenthesis_pos - 1);
    if (LinkManager::get_checked_link(url).empty()) {
      continue;
    }
    result.push_back(text.substr(open_bracket_pos, close_bracket_pos - open_bracket_pos + 1));
    result.push_back(text.substr(open_parenthesis_pos, close_parenthesis_pos - open_parenthesis_pos + 1));
  }
  return result;
}
// Finds [text](url) constructions in the parts of the text, which aren't covered by the given continuous entities,
// replaces them with the text and adds corresponding TextUrl entities.
// Returns the new text together with the new TextUrl entities and the given entities adjusted to the new text;
// splittable entities can be split into multiple entities of the same type.
// entities must be valid for the text
static FormattedText parse_text_url_entities_v3(Slice text, const vector<MessageEntity> &entities) {
// continuous entities can't intersect TextUrl entities,
// so try to find new TextUrl entities only between the predetermined continuous entities
Slice debug_initial_text = text;
FormattedText result;
// UTF-16 length of result.text
int32 result_text_utf16_length = 0;
// continuous entities from the current kept part with offsets already adjusted to the resulting text
vector<MessageEntity> part_entities;
// not yet added splittable entities grouped by type; offsets are relative to the original text
vector<MessageEntity> part_splittable_entities[SPLITTABLE_ENTITY_TYPE_COUNT];
// UTF-16 offset in the original text, from which the text hasn't been processed yet
int32 part_begin = 0;
// maximum UTF-16 end offset of the already seen continuous entities
int32 max_end = 0;
// total UTF-16 length of the markup removed from the text so far
int32 skipped_length = 0;
// processes the original text till part_end; text always begins from the first unprocessed character
auto add_part = [&](int32 part_end) {
// we have [part_begin, max_end) kept part and [max_end, part_end) part to parse text_url entities
if (max_end != part_begin) {
// add all entities from the kept part
auto kept_part_text = utf8_utf16_substr(text, 0, max_end - part_begin);
text = text.substr(kept_part_text.size());
result.text.append(kept_part_text.begin(), kept_part_text.size());
append(result.entities, std::move(part_entities));
part_entities.clear();
result_text_utf16_length += max_end - part_begin;
}
// index of the first not yet processed splittable entity for each type
size_t splittable_entity_pos[SPLITTABLE_ENTITY_TYPE_COUNT] = {};
for (const auto &splittable_entities : part_splittable_entities) {
check_non_intersecting(splittable_entities);
}
if (part_end != max_end) {
// try to find text_url entities in the left part
auto parsed_part_text = utf8_utf16_substr(text, 0, part_end - max_end);
text = text.substr(parsed_part_text.size());
// pairs of slices "[text]" and "(url)"
vector<Slice> text_urls = find_text_url_entities_v3(parsed_part_text);
// UTF-16 offset in the original text, till which parsed_part_text has been processed
int32 text_utf16_offset = max_end;
// byte offset in parsed_part_text, till which parsed_part_text has been processed
size_t prev_pos = 0;
for (size_t i = 0; i < text_urls.size(); i += 2) {
// byte positions of '[', ']', '(' and ')' in parsed_part_text
auto text_begin_pos = static_cast<size_t>(text_urls[i].begin() - parsed_part_text.begin());
auto text_end_pos = text_begin_pos + text_urls[i].size() - 1;
auto url_begin_pos = static_cast<size_t>(text_urls[i + 1].begin() - parsed_part_text.begin());
auto url_end_pos = url_begin_pos + text_urls[i + 1].size() - 1;
CHECK(parsed_part_text[text_begin_pos] == '[');
CHECK(parsed_part_text[text_end_pos] == ']');
CHECK(url_begin_pos == text_end_pos + 1);
CHECK(parsed_part_text[url_begin_pos] == '(');
CHECK(parsed_part_text[url_end_pos] == ')');
// append the text before the construction as is
Slice before_text_url = parsed_part_text.substr(prev_pos, text_begin_pos - prev_pos);
auto before_text_url_utf16_length = text_length(before_text_url);
result_text_utf16_length += before_text_url_utf16_length;
result.text.append(before_text_url.begin(), before_text_url.size());
text_utf16_offset += before_text_url_utf16_length;
// append the text of the link and add the TextUrl entity
Slice text_url = parsed_part_text.substr(text_begin_pos + 1, text_end_pos - text_begin_pos - 1);
auto text_url_utf16_length = text_length(text_url);
Slice url = parsed_part_text.substr(url_begin_pos + 1, url_end_pos - url_begin_pos - 1);
auto url_utf16_length = text_length(url);
result.entities.emplace_back(MessageEntity::Type::TextUrl, result_text_utf16_length, text_url_utf16_length,
LinkManager::get_checked_link(url));
result.text.append(text_url.begin(), text_url.size());
result_text_utf16_length += text_url_utf16_length;
// UTF-16 length of the whole "[text](url)" construction in the original text
auto initial_utf16_length = 1 + text_url_utf16_length + 1 + 1 + url_utf16_length + 1;
// adjust splittable entities, removing deleted parts from them
// in the segment [text_utf16_offset, text_utf16_offset + initial_utf16_length)
// the first character and the last (url_utf16_length + 3) characters are deleted
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
auto &pos = splittable_entity_pos[index];
auto &splittable_entities = part_splittable_entities[index];
while (pos < splittable_entities.size() &&
splittable_entities[pos].offset < text_utf16_offset + initial_utf16_length) {
auto offset = splittable_entities[pos].offset;
auto length = splittable_entities[pos].length;
if (offset + length > text_utf16_offset + 1 + text_url_utf16_length) {
// ends after last removed part; truncate length
length = text_utf16_offset + 1 + text_url_utf16_length - offset;
}
// take into account the removed '['
if (offset >= text_utf16_offset + 1) {
offset--;
} else if (offset + length >= text_utf16_offset + 1) {
length--;
}
if (length > 0) {
CHECK(offset >= skipped_length);
CHECK(offset - skipped_length + length <= result_text_utf16_length);
if (offset < text_utf16_offset && offset + length > text_utf16_offset) {
// entity intersects start on the new text_url entity; split it
result.entities.emplace_back(splittable_entities[pos].type, offset - skipped_length,
text_utf16_offset - offset);
length -= text_utf16_offset - offset;
offset = text_utf16_offset;
}
result.entities.emplace_back(splittable_entities[pos].type, offset - skipped_length, length);
}
if (splittable_entities[pos].offset + splittable_entities[pos].length >
text_utf16_offset + initial_utf16_length) {
// begins before end of the segment, but ends after it
// need to keep the entity for future segments, so split the entity
splittable_entities[pos].length = splittable_entities[pos].offset + splittable_entities[pos].length -
(text_utf16_offset + initial_utf16_length);
splittable_entities[pos].offset = text_utf16_offset + initial_utf16_length;
} else {
pos++;
}
}
}
text_utf16_offset += initial_utf16_length;
// the characters '[', ']', '(', ')' and the URL have been removed
skipped_length += 2 + 2 + url_utf16_length;
prev_pos = url_end_pos + 1;
}
// append the text after the last found construction as is
result.text.append(parsed_part_text.begin() + prev_pos, parsed_part_text.size() - prev_pos);
result_text_utf16_length += part_end - text_utf16_offset;
}
// now add all left splittable entities from [part_begin, part_end)
for (size_t index = 0; index < SPLITTABLE_ENTITY_TYPE_COUNT; index++) {
auto &pos = splittable_entity_pos[index];
auto &splittable_entities = part_splittable_entities[index];
while (pos < splittable_entities.size() && splittable_entities[pos].offset < part_end) {
if (splittable_entities[pos].offset + splittable_entities[pos].length > part_end) {
// begins before end of the segment, but ends after it
// need to keep the entity for future segments, so split the entity
// entities don't intersect each other, so there can be at most one such entity
result.entities.emplace_back(splittable_entities[pos].type, splittable_entities[pos].offset - skipped_length,
part_end - splittable_entities[pos].offset);
splittable_entities[pos].length =
splittable_entities[pos].offset + splittable_entities[pos].length - part_end;
splittable_entities[pos].offset = part_end;
} else {
result.entities.emplace_back(splittable_entities[pos].type, splittable_entities[pos].offset - skipped_length,
splittable_entities[pos].length);
pos++;
}
}
if (pos == splittable_entities.size()) {
splittable_entities.clear();
} else {
// only the last entity can be left; keep it as the only entity for the next part
CHECK(pos == splittable_entities.size() - 1);
LOG_CHECK(!text.empty()) << '"' << debug_initial_text << "\" " << entities;
splittable_entities[0] = std::move(splittable_entities.back());
splittable_entities.resize(1);
}
}
part_begin = part_end;
};
for (const auto &entity : entities) {
if (is_splittable_entity(entity.type)) {
// splittable entities are only collected here; they are added to the result by add_part
auto index = get_splittable_entity_type_index(entity.type);
part_splittable_entities[index].push_back(entity);
continue;
}
CHECK(is_continuous_entity(entity.type));
if (entity.offset > max_end) {
// found a gap from max_end to entity.offset between predetermined entities
add_part(entity.offset);
} else {
CHECK(entity.offset == max_end);
}
max_end = entity.offset + entity.length;
part_entities.push_back(entity);
part_entities.back().offset -= skipped_length;
}
// process the rest of the text
add_part(part_begin + text_length(text));
return result;
}
// Finds __italic__, **bold**, ~~strikethrough~~ and ||spoiler|| entities in the text.
// Offsets and lengths of the returned entities are counted in UTF-16 code units of the given text
// and don't include the two-character markers surrounding the entities.
// A marker isn't recognized if a boundary of a given or found entity is located between its two characters.
static vector<MessageEntity> find_splittable_entities_v3(Slice text, const vector<MessageEntity> &entities) {
  // UTF-16 positions, which can't be located between two characters of a marker; all positions are increased by 1
  FlatHashSet<int32, Hash<int32>> unallowed_boundaries;

  // returns true for the entity types, all inner positions of which are unallowed
  auto is_unsplittable_type = [](MessageEntity::Type type) {
    switch (type) {
      case MessageEntity::Type::Mention:
      case MessageEntity::Type::Hashtag:
      case MessageEntity::Type::BotCommand:
      case MessageEntity::Type::Cashtag:
      case MessageEntity::Type::PhoneNumber:
      case MessageEntity::Type::BankCardNumber:
        return true;
      default:
        return false;
    }
  };
  for (auto &entity : entities) {
    unallowed_boundaries.insert(entity.offset + 1);
    unallowed_boundaries.insert(entity.offset + entity.length + 1);
    if (is_unsplittable_type(entity.type)) {
      for (int32 shift = 1; shift < entity.length; shift++) {
        unallowed_boundaries.insert(entity.offset + shift + 1);
      }
    }
  }

  // all positions inside of the entities found in the text, except EmailAddress and Url, are unallowed also
  auto found_entities = find_entities(text, false, true);
  td::remove_if(found_entities, [](const auto &entity) {
    return entity.type == MessageEntity::Type::EmailAddress || entity.type == MessageEntity::Type::Url;
  });
  for (auto &entity : found_entities) {
    for (int32 shift = 0; shift <= entity.length; shift++) {
      unallowed_boundaries.insert(entity.offset + shift + 1);
    }
  }

  vector<MessageEntity> result;
  // UTF-16 offset of the currently opened entity of each type; 0 if there is no opened entity
  int32 splittable_entity_offset[SPLITTABLE_ENTITY_TYPE_COUNT] = {};
  int32 utf16_offset = 0;
  for (size_t i = 0; i + 1 < text.size(); i++) {
    auto c = static_cast<unsigned char>(text[i]);
    if (is_utf8_character_first_code_unit(c)) {
      utf16_offset += 1 + (c >= 0xf0);  // >= 4 bytes in symbol => surrogate pair
    }

    // Type::Size is used for characters, which can't be a part of a marker
    auto type = MessageEntity::Type::Size;
    switch (c) {
      case '_':
        type = MessageEntity::Type::Italic;
        break;
      case '*':
        type = MessageEntity::Type::Bold;
        break;
      case '~':
        type = MessageEntity::Type::Strikethrough;
        break;
      case '|':
        type = MessageEntity::Type::Spoiler;
        break;
      default:
        break;
    }
    if (type == MessageEntity::Type::Size || text[i] != text[i + 1] ||
        unallowed_boundaries.count(utf16_offset + 1) != 0) {
      continue;
    }

    // find the end of the sequence of the same characters, which isn't split by an unallowed boundary
    size_t run_end = i + 2;
    while (run_end != text.size() && text[run_end] == text[i] &&
           unallowed_boundaries.count(utf16_offset + static_cast<int32>(run_end - i)) == 0) {
      run_end++;
    }
    if (run_end == i + 2) {
      // exactly two characters form a marker, which either ends the opened entity or begins a new entity
      auto &opened_entity_offset = splittable_entity_offset[get_splittable_entity_type_index(type)];
      if (opened_entity_offset == 0) {
        opened_entity_offset = utf16_offset + 1;
      } else {
        auto length = utf16_offset - opened_entity_offset - 1;
        if (length > 0) {
          result.emplace_back(type, opened_entity_offset, length);
        }
        opened_entity_offset = 0;
      }
    }

    // skip the whole sequence
    utf16_offset += narrow_cast<int32>(run_end - i - 1);
    i = run_end - 1;
  }
  return result;
}
// Parses __italic__, ~~strikethrough~~, **bold**, ||spoiler|| and [text_url](telegram.org) entities in the text
// and returns the text without the markup together with the found and the given entities adjusted to the new text.
// The given entities must be valid and can contain only splittable and continuous entities.
static FormattedText parse_markdown_v3_without_pre(Slice text, vector<MessageEntity> entities) {
  check_is_sorted(entities);

  // must outlive text, which can point to it
  FormattedText parsed_text_url_text;
  if (text.find('[') != string::npos) {
    parsed_text_url_text = parse_text_url_entities_v3(text, entities);
    text = parsed_text_url_text.text;
    entities = std::move(parsed_text_url_text.entities);
  }
  // splittable entities are sorted only within a fixed type now

  auto is_marker_candidate_at = [&text](size_t pos) {
    switch (text[pos]) {
      case '_':
      case '*':
      case '~':
      case '|':
        return text[pos] == text[pos + 1];
      default:
        return false;
    }
  };
  bool have_splittable_entities = false;
  for (size_t i = 0; i + 1 < text.size() && !have_splittable_entities; i++) {
    have_splittable_entities = is_marker_candidate_at(i);
  }
  if (!have_splittable_entities) {
    // fast path
    sort_entities(entities);
    return {text.str(), std::move(entities)};
  }

  auto found_splittable_entities = find_splittable_entities_v3(text, entities);

  // UTF-16 positions of the second characters of all markers, which must be removed
  vector<int32> removed_pos;
  for (auto &entity : found_splittable_entities) {
    removed_pos.push_back(entity.offset - 1);
    removed_pos.push_back(entity.offset + entity.length + 1);
  }
  std::sort(removed_pos.begin(), removed_pos.end());

  // copy the text without the markers
  string new_text;
  CHECK(text.size() >= 2 * removed_pos.size());
  new_text.reserve(text.size() - 2 * removed_pos.size());
  size_t next_removed = 0;
  int32 utf16_offset = 0;
  size_t i = 0;
  while (i < text.size()) {
    auto c = static_cast<unsigned char>(text[i]);
    if (is_utf8_character_first_code_unit(c)) {
      utf16_offset += 1 + (c >= 0xf0);  // >= 4 bytes in symbol => surrogate pair
    }
    const bool is_marker = next_removed < removed_pos.size() && utf16_offset == removed_pos[next_removed];
    if (!is_marker) {
      new_text += text[i];
      i++;
      continue;
    }

    // skip both characters of the marker
    CHECK(next_removed + 1 == removed_pos.size() || removed_pos[next_removed + 1] >= removed_pos[next_removed] + 2);
    next_removed++;
    utf16_offset++;
    i += 2;
  }
  CHECK(next_removed == removed_pos.size());

  combine(entities, std::move(found_splittable_entities));

  // shift and shrink all entities according to the removed markers
  auto get_removed_count = [&removed_pos](int32 position) {
    return narrow_cast<int32>(std::upper_bound(removed_pos.begin(), removed_pos.end(), position) -
                              removed_pos.begin());
  };
  for (auto &entity : entities) {
    auto removed_before_begin = get_removed_count(entity.offset);
    auto removed_before_end = get_removed_count(entity.offset + entity.length);
    entity.length -= 2 * (removed_before_end - removed_before_begin);
    entity.offset -= 2 * removed_before_begin;
    CHECK(entity.offset >= 0);
    CHECK(entity.length >= 0);
    CHECK(entity.offset + entity.length <= utf16_offset);
  }

  remove_empty_entities(entities);
  sort_entities(entities);
  return {std::move(new_text), std::move(entities)};
}
// Finds `code`, ```pre``` and ```language<new line>pre-code``` entities in the text.
// Returns the text without the tags and the language line together with the found Code, Pre and PreCode entities;
// entity offsets and lengths are counted in UTF-16 code units of the returned text.
// Sequences of '`', which don't form an entity, are kept in the text as is.
static FormattedText parse_pre_entities_v3(Slice text) {
string result;
vector<MessageEntity> entities;
size_t size = text.size();
// UTF-16 length of the result
int32 utf16_offset = 0;
for (size_t i = 0; i < size; i++) {
auto c = static_cast<unsigned char>(text[i]);
if (c != '`') {
// an ordinary character
if (is_utf8_character_first_code_unit(c)) {
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
result.push_back(text[i]);
continue;
}
// [i, j) is the maximal sequence of consecutive '`'
size_t j = i + 1;
while (j < size && text[j] == '`') {
j++;
}
// only sequences of exactly 1 or 3 characters can be a begin tag
if (j - i == 1 || j - i == 3) {
// trying to find end of the entity
// UTF-16 length of the text between the begin tag and end_tag_begin
int32 entity_length = 0;
bool is_found = false;
for (size_t end_tag_begin = j; end_tag_begin < size; end_tag_begin++) {
auto cur_c = static_cast<unsigned char>(text[end_tag_begin]);
if (cur_c == '`') {
// possible end tag
size_t end_tag_end = end_tag_begin + 1;
while (end_tag_end < size && text[end_tag_end] == '`') {
end_tag_end++;
}
// the end tag must be a maximal sequence of '`' of the same length as the begin tag
if (end_tag_end - end_tag_begin == j - i) {
// end tag found
CHECK(entity_length > 0);
auto entity_begin = j;
string language_code;
if (j - i == 3) {
// a sequence of characters with codes from 33 to 126 immediately after the begin tag,
// which is followed by '\n' and at least one more character before the end tag, is the language code
size_t language_code_end = j;
while (language_code_end < end_tag_begin - 1 && 33 <= text[language_code_end] &&
text[language_code_end] <= 126) {
language_code_end++;
}
if (language_code_end < end_tag_begin - 1 && text[language_code_end] == '\n') {
language_code = text.substr(entity_begin, language_code_end - entity_begin).str();
// the language code and the new line aren't included in the entity and in the resulting text
entity_begin = language_code_end + 1;
entity_length -= static_cast<int32>(entity_begin - j);
CHECK(entity_length > 0);
}
}
if (!language_code.empty()) {
entities.emplace_back(MessageEntity::Type::PreCode, utf16_offset, entity_length,
std::move(language_code));
} else {
entities.emplace_back(j - i == 3 ? MessageEntity::Type::Pre : MessageEntity::Type::Code, utf16_offset,
entity_length);
}
result.append(text.begin() + entity_begin, end_tag_begin - entity_begin);
utf16_offset += entity_length;
// continue from the first character after the end tag
i = end_tag_end - 1;
is_found = true;
break;
} else {
// not an end tag, skip
entity_length += narrow_cast<int32>(end_tag_end - end_tag_begin);
end_tag_begin = end_tag_end - 1;
}
} else if (is_utf8_character_first_code_unit(cur_c)) {
entity_length += 1 + (cur_c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
}
if (is_found) {
continue;
}
}
// the sequence isn't a begin tag of an entity; keep it in the text as is
result.append(text.begin() + i, j - i);
utf16_offset += narrow_cast<int32>(j - i);
i = j - 1;
}
return {std::move(result), std::move(entities)};
}
// Finds Code, Pre and PreCode entities in the parts of the text, which aren't covered by the given entities.
// Returns the text without the found markup together with the given entities shifted accordingly and
// the found entities.
// The given entities must be valid for the text.
static FormattedText parse_pre_entities_v3(Slice text, vector<MessageEntity> entities) {
  // nothing can intersect pre entities, so ignore all '`' inside the predetermined entities
  // and try to find new pre entities only between the predetermined entities
  FormattedText result;
  int32 result_text_utf16_length = 0;
  int32 part_begin = 0;      // UTF-16 offset in the original text, till which the text has been processed
  int32 max_end = 0;         // maximum UTF-16 end offset of the already seen entities
  int32 skipped_length = 0;  // total length of the removed markup

  // removes the given number of UTF-16 code units from the beginning of the unprocessed text and returns them
  auto cut_prefix = [&text](int32 utf16_length) {
    auto prefix = utf8_utf16_substr(text, 0, utf16_length);
    text = text.substr(prefix.size());
    return prefix;
  };

  auto add_part = [&](int32 part_end) {
    // we have [part_begin, max_end) kept part and [max_end, part_end) part to parse pre entities
    CHECK(part_begin == result_text_utf16_length + skipped_length);

    const auto kept_part_length = max_end - part_begin;
    if (kept_part_length != 0) {
      // the kept part is appended as is
      auto kept_part_text = cut_prefix(kept_part_length);
      result.text.append(kept_part_text.begin(), kept_part_text.size());
      result_text_utf16_length += kept_part_length;
    }

    const auto parsed_part_length = part_end - max_end;
    if (parsed_part_length != 0) {
      // try to find pre entities in the left part
      auto parsed_part_text = cut_prefix(parsed_part_length);
      if (parsed_part_text.find('`') != string::npos) {
        FormattedText parsed_text = parse_pre_entities_v3(parsed_part_text);
        auto new_skipped_length = static_cast<int32>(parsed_part_text.size() - parsed_text.text.size());
        CHECK(new_skipped_length < parsed_part_length);
        result.text += parsed_text.text;
        for (auto &found_entity : parsed_text.entities) {
          found_entity.offset += result_text_utf16_length;
        }
        append(result.entities, std::move(parsed_text.entities));
        result_text_utf16_length += parsed_part_length - new_skipped_length;
        skipped_length += new_skipped_length;
      } else {
        // fast path, no pre entities; just append the text
        result.text.append(parsed_part_text.begin(), parsed_part_text.size());
        result_text_utf16_length += parsed_part_length;
      }
    }

    part_begin = part_end;
  };

  for (auto &entity : entities) {
    if (entity.offset > max_end) {
      // found a gap from max_end to entity.offset between predetermined entities
      add_part(entity.offset);
    }
    max_end = td::max(max_end, entity.offset + entity.length);

    // the entity is kept, but it is shifted by the length of the markup removed before it
    result.entities.push_back(std::move(entity));
    result.entities.back().offset -= skipped_length;
  }
  // process the rest of the text
  add_part(part_begin + text_length(text));

  return result;
}
// Parses Markdown v3 markup in the text, which contains no block quote entities.
// Pre entities are found first; then the parts of the text between pre entities are parsed independently,
// while the text inside of pre entities is kept as is.
// The given entities must be valid and can contain only splittable, continuous and pre entities.
static FormattedText parse_markdown_v3_without_blockquote(FormattedText text) {
  if (text.text.find('`') != string::npos) {
    text = parse_pre_entities_v3(text.text, std::move(text.entities));
    check_is_sorted(text.entities);
  }

  const bool have_pre = std::any_of(text.entities.begin(), text.entities.end(),
                                    [](const MessageEntity &entity) { return is_pre_entity(entity.type); });
  if (!have_pre) {
    // fast path
    return parse_markdown_v3_without_pre(text.text, std::move(text.entities));
  }

  FormattedText result;
  int32 result_text_utf16_length = 0;
  // entities from the current part with offsets relative to the beginning of the part
  vector<MessageEntity> part_entities;
  int32 part_begin = 0;  // UTF-16 offset of the current part in text.text
  int32 max_end = 0;     // maximum UTF-16 end offset of the already seen entities
  Slice left_text = text.text;

  // parses the part of the text from part_begin till part_end with the collected entities
  // and appends the parsed part to the result
  auto add_part = [&](int32 part_end) {
    auto part_text = utf8_utf16_substr(left_text, 0, part_end - part_begin);
    left_text = left_text.substr(part_text.size());

    FormattedText parsed_part = parse_markdown_v3_without_pre(part_text, std::move(part_entities));
    part_entities.clear();

    result.text += parsed_part.text;
    for (auto &parsed_entity : parsed_part.entities) {
      parsed_entity.offset += result_text_utf16_length;
    }
    append(result.entities, std::move(parsed_part.entities));
    result_text_utf16_length += text_length(parsed_part.text);
    part_begin = part_end;
  };

  for (size_t i = 0; i < text.entities.size(); i++) {
    auto &entity = text.entities[i];
    CHECK(is_splittable_entity(entity.type) || is_pre_entity(entity.type) || is_continuous_entity(entity.type));
    const auto entity_end = entity.offset + entity.length;
    if (!is_pre_entity(entity.type)) {
      // the entity will be processed together with the part, to which it belongs
      CHECK(entity.offset >= part_begin);
      part_entities.push_back(entity);
      part_entities.back().offset -= part_begin;
    } else {
      // a pre entity can't intersect other entities
      CHECK(entity.offset >= max_end);
      CHECK(i + 1 == text.entities.size() || text.entities[i + 1].offset >= entity_end);

      // parse the part before the pre entity
      add_part(entity.offset);

      // append the pre entity with its text as is
      auto pre_text = utf8_utf16_substr(left_text, 0, entity.length);
      left_text = left_text.substr(pre_text.size());
      result.text.append(pre_text.begin(), pre_text.size());
      result.entities.push_back(entity);
      result.entities.back().offset = result_text_utf16_length;
      result_text_utf16_length += entity.length;
      part_begin = entity_end;
    }
    max_end = td::max(max_end, entity_end);
  }
  // parse the rest of the text
  add_part(part_begin + text_length(left_text));

  return result;
}
// Parses Markdown v3 markup in the text.
// Text entities must be valid.
// Returned entities must be resplit and fixed.
// The content of every block quote is parsed separately from the text around it, and the length
// of the block quote is recalculated from the length of its parsed content.
FormattedText parse_markdown_v3(FormattedText text) {
  const bool contains_blockquote =
      std::any_of(text.entities.begin(), text.entities.end(),
                  [](const MessageEntity &entity) { return is_blockquote_entity(entity.type); });
  if (!contains_blockquote) {
    // fast path: the text can be parsed as a whole
    return parse_markdown_v3_without_blockquote(std::move(text));
  }

  FormattedText parsed;
  int32 parsed_utf16_length = 0;
  vector<MessageEntity> pending_entities;
  int32 segment_begin = 0;
  int32 last_entity_end = 0;
  Slice remaining = text.text;

  // parses the text from segment_begin to segment_end together with the pending entities
  // and appends the outcome to the result
  auto flush_segment = [&](int32 segment_end) {
    auto segment_text = utf8_utf16_substr(remaining, 0, segment_end - segment_begin);
    remaining = remaining.substr(segment_text.size());

    FormattedText segment = parse_markdown_v3_without_blockquote({segment_text.str(), std::move(pending_entities)});
    pending_entities.clear();

    for (auto &segment_entity : segment.entities) {
      segment_entity.offset += parsed_utf16_length;
    }
    parsed.text += segment.text;
    append(parsed.entities, std::move(segment.entities));
    parsed_utf16_length += text_length(segment.text);
    segment_begin = segment_end;
  };

  const auto entity_count = text.entities.size();
  for (size_t index = 0; index < entity_count; index++) {
    const auto &current = text.entities[index];
    const auto current_end = current.offset + current.length;
    CHECK(is_splittable_entity(current.type) || is_pre_entity(current.type) || is_continuous_entity(current.type) ||
          is_blockquote_entity(current.type));
    if (!is_blockquote_entity(current.type)) {
      // the entity will be passed to the parser with an offset relative to the segment begin
      CHECK(current.offset >= segment_begin);
      pending_entities.push_back(current);
      pending_entities.back().offset -= segment_begin;
    } else {
      // parse everything before the block quote
      CHECK(current.offset >= last_entity_end);
      flush_segment(current.offset);

      const auto quote_offset = parsed_utf16_length;
      parsed.entities.push_back(current);
      parsed.entities.back().offset = quote_offset;
      const auto quote_index = parsed.entities.size() - 1;

      // collect all entities, which end inside the block quote
      while (index + 1 < entity_count &&
             text.entities[index + 1].offset + text.entities[index + 1].length <= current_end) {
        index++;
        const auto &nested = text.entities[index];
        CHECK(is_splittable_entity(nested.type) || is_pre_entity(nested.type) || is_continuous_entity(nested.type));
        CHECK(nested.offset >= segment_begin);
        pending_entities.push_back(nested);
        pending_entities.back().offset -= segment_begin;
      }
      CHECK(index + 1 == entity_count || text.entities[index + 1].offset >= current_end);

      // parse the content of the block quote and fix the block quote length
      flush_segment(current_end);
      parsed.entities[quote_index].length = parsed_utf16_length - quote_offset;
    }
    last_entity_end = td::max(last_entity_end, current_end);
  }
  flush_segment(segment_begin + text_length(remaining));
  return parsed;
}
// Converts a formatted text to a text with Markdown v3 markup.
// Text entities must be valid.
// Italic, Bold, Strikethrough, Spoiler, TextUrl, Code and Pre entities, and PreCode entities with
// a language code consisting only of characters with codes from 33 to 126, are replaced with
// markup inserted into the text. All other entities are kept as entities with offset and length
// adjusted for the inserted markup.
// The text is returned unchanged if it has no entities, if it contains an entity for which
// is_user_entity is false, or if parsing the generated text with parse_markdown_v3 doesn't
// give back the original text.
FormattedText get_markdown_v3(FormattedText text) {
if (text.entities.empty()) {
return text;
}
check_is_sorted(text.entities);
for (auto &entity : text.entities) {
if (!is_user_entity(entity.type)) {
return text;
}
}
FormattedText result;
// an opened entity together with the total number of UTF-16 code units,
// which had been added to the result right after the entity was opened
struct EntityInfo {
const MessageEntity *entity;
int32 utf16_added_before;
EntityInfo(MessageEntity *entity, int32 utf16_added_before)
: entity(entity), utf16_added_before(utf16_added_before) {
}
};
// stack of entities, which are opened, but not closed yet
vector<EntityInfo> nested_entities_stack;
// index of the next entity to open
size_t current_entity = 0;
// offset of the current character in the original text in UTF-16 code units
int32 utf16_offset = 0;
// total number of UTF-16 code units added to the result as markup
int32 utf16_added = 0;
// returns true if all characters of the language code have codes from 33 to 126
auto is_valid_language_code = [](Slice code) {
for (auto c : code) {
if (c < 33 || c > 126) {
return false;
}
}
return true;
};
// the loop makes one extra iteration with pos == text.text.size()
// to close the entities, which end at the end of the text
for (size_t pos = 0; pos <= text.text.size(); pos++) {
auto c = static_cast<unsigned char>(text.text[pos]);
if (is_utf8_character_first_code_unit(c)) {
// close all entities, which end at the current position, innermost first
while (!nested_entities_stack.empty()) {
const auto *entity = nested_entities_stack.back().entity;
auto entity_end = entity->offset + entity->length;
if (utf16_offset < entity_end) {
break;
}
CHECK(utf16_offset == entity_end);
// true if the entity can't be represented by markup and must be kept as an entity
bool need_entity = false;
switch (entity->type) {
case MessageEntity::Type::Italic:
result.text += "__";
utf16_added += 2;
break;
case MessageEntity::Type::Bold:
result.text += "**";
utf16_added += 2;
break;
case MessageEntity::Type::Strikethrough:
result.text += "~~";
utf16_added += 2;
break;
case MessageEntity::Type::Spoiler:
result.text += "||";
utf16_added += 2;
break;
case MessageEntity::Type::TextUrl:
result.text += "](";
result.text += entity->argument;
result.text += ')';
// 3 is the length of "](" and ")"; the URL is counted by its size in bytes
utf16_added += narrow_cast<int32>(3 + entity->argument.size());
break;
case MessageEntity::Type::Code:
result.text += '`';
utf16_added++;
break;
case MessageEntity::Type::Pre:
result.text += "```";
utf16_added += 3;
break;
case MessageEntity::Type::PreCode:
// must use the same condition as the code opening the entity below
if (is_valid_language_code(entity->argument)) {
result.text += "```";
utf16_added += 3;
} else {
need_entity = true;
}
break;
default:
need_entity = true;
break;
}
if (need_entity) {
// shift the entity by the markup added before it and extend it by the markup added inside of it
result.entities.push_back(*entity);
result.entities.back().offset += nested_entities_stack.back().utf16_added_before;
result.entities.back().length += utf16_added - nested_entities_stack.back().utf16_added_before;
}
nested_entities_stack.pop_back();
}
// open all entities, which begin at the current position
while (current_entity < text.entities.size() && utf16_offset >= text.entities[current_entity].offset) {
CHECK(utf16_offset == text.entities[current_entity].offset);
switch (text.entities[current_entity].type) {
case MessageEntity::Type::Italic:
result.text += "__";
utf16_added += 2;
break;
case MessageEntity::Type::Bold:
result.text += "**";
utf16_added += 2;
break;
case MessageEntity::Type::Strikethrough:
result.text += "~~";
utf16_added += 2;
break;
case MessageEntity::Type::Spoiler:
result.text += "||";
utf16_added += 2;
break;
case MessageEntity::Type::TextUrl:
result.text += '[';
utf16_added++;
break;
case MessageEntity::Type::Code:
result.text += '`';
utf16_added++;
break;
case MessageEntity::Type::Pre:
result.text += "```";
utf16_added += 3;
// add a line feed after the opening markup, unless the entity text already begins with it
if (c != '\n') {
result.text += "\n";
utf16_added++;
}
break;
case MessageEntity::Type::PreCode:
if (is_valid_language_code(text.entities[current_entity].argument)) {
result.text += "```";
utf16_added += 3;
if (!text.entities[current_entity].argument.empty()) {
result.text += text.entities[current_entity].argument;
utf16_added += static_cast<int32>(text.entities[current_entity].argument.size());
}
// add a line feed after the opening markup, unless the entity text already begins with it
if (c != '\n') {
result.text += "\n";
utf16_added++;
}
}
break;
default:
// keep as is
break;
}
// utf16_added already includes the opening markup of the entity
nested_entities_stack.emplace_back(&text.entities[current_entity++], utf16_added);
}
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
if (pos == text.text.size()) {
break;
}
result.text.push_back(text.text[pos]);
}
sort_entities(result.entities);
// use the generated text only if it is parsed back exactly to the original formatted text
if (parse_markdown_v3(result) != text) {
return text;
}
return result;
}
// Decodes the HTML character reference, which begins at text[pos]; text[pos] must be '&'.
// Supported are decimal ("&#NNN") and hexadecimal ("&#xHHH") numeric references and
// the named references "lt", "gt", "amp" and "quot". The terminating ';' is optional.
// Returns the decoded code point and moves pos past the reference on success.
// Returns 0 and keeps pos unchanged if the reference is unsupported or invalid.
static uint32 decode_html_entity(CSlice text, size_t &pos) {
  CHECK(text[pos] == '&');
  const size_t name_begin = pos + 1;
  size_t cursor = name_begin;
  uint32 code = 0;
  if (text[name_begin] != '#') {
    // named character reference
    while (is_alpha(text[cursor])) {
      cursor++;
    }
    Slice name = text.substr(name_begin, cursor - name_begin);
    if (name == Slice("lt")) {
      code = static_cast<uint32>('<');
    } else if (name == Slice("gt")) {
      code = static_cast<uint32>('>');
    } else if (name == Slice("amp")) {
      code = static_cast<uint32>('&');
    } else if (name == Slice("quot")) {
      code = static_cast<uint32>('"');
    } else {
      // unsupported literal entity
      return 0;
    }
  } else {
    // numeric character reference; skip '#'
    cursor++;
    if (text[cursor] == 'x') {
      // hexadecimal numeric character reference
      cursor++;
      for (; is_hex_digit(text[cursor]); cursor++) {
        code = code * 16 + hex_to_int(text[cursor]);
      }
    } else {
      // decimal numeric character reference
      for (; is_digit(text[cursor]); cursor++) {
        code = code * 10 + text[cursor] - '0';
      }
    }
    // the length limit also rejects all references, which are long enough to overflow the code
    const bool is_too_long = cursor - pos >= 10;
    if (code == 0 || code >= 0x10ffff || is_too_long) {
      return 0;
    }
  }

  // consume the optional terminating semicolon
  pos = text[cursor] == ';' ? cursor + 1 : cursor;
  return code;
}
// Parses HTML-style markup in str: removes the supported tags, decodes HTML character references
// and returns the sorted list of entities described by the tags.
// Supported tags are a, b, strong, i, em, s, strike, del, u, ins, tg-spoiler, tg-emoji, span,
// pre, code and blockquote.
// The string is rewritten in place: the decoded text is written over the already processed part
// of the markup and the string is shrunk to the decoded text on success.
// Returns an error with code 400 if the markup is malformed, a tag is unsupported, a custom emoji
// identifier is invalid, or the decoded text isn't a valid UTF-8 string.
// NOTE(review): on error str can be left partially rewritten, because it isn't restored.
Result<vector<MessageEntity>> parse_html(string &str) {
auto str_size = str.size();
const char *text = str.c_str();
// result_end is the write position in the buffer of str; it never gets ahead of the read position i
auto result_end = MutableSlice(str).ubegin();
const unsigned char *result_begin = result_end;
vector<MessageEntity> entities;
// length of the decoded text in UTF-16 code units
int32 utf16_offset = 0;
bool need_recheck_utf8 = false;
// description of an opened tag: its name, the argument extracted from its attributes,
// and the position of its content in the decoded text in UTF-16 code units and in bytes
struct EntityInfo {
string tag_name;
string argument;
int32 entity_offset;
size_t entity_begin_pos;
EntityInfo(string &&tag_name, string &&argument, int32 entity_offset, size_t entity_begin_pos)
: tag_name(std::move(tag_name))
, argument(std::move(argument))
, entity_offset(entity_offset)
, entity_begin_pos(entity_begin_pos) {
}
};
// stack of tags, which are opened, but not closed yet
vector<EntityInfo> nested_entities;
for (size_t i = 0; i < str_size; i++) {
auto c = static_cast<unsigned char>(text[i]);
if (c == '&') {
// on success decode_html_entity moves i past the character reference
auto code = decode_html_entity(str, i);
if (code != 0) {
i--; // i will be incremented in for
utf16_offset += 1 + (code > 0xffff);
if (code >= 0xd800 && code <= 0xdfff) {
// half of a surrogate pair
need_recheck_utf8 = true;
}
result_end = append_utf8_character_unsafe(result_end, code);
CHECK(result_end <= result_begin + i);
continue;
}
}
if (c != '<') {
// an ordinary character, including '&', which doesn't begin a supported character reference
if (is_utf8_character_first_code_unit(c)) {
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
*result_end++ = c;
continue;
}
auto begin_pos = i++;
if (text[i] != '/') {
// begin of an entity
while (!is_space(text[i]) && text[i] != '>') {
i++;
}
if (text[i] == 0) {
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
}
string tag_name = to_lower(Slice(text + begin_pos + 1, i - begin_pos - 1));
if (tag_name != "a" && tag_name != "b" && tag_name != "strong" && tag_name != "i" && tag_name != "em" &&
tag_name != "s" && tag_name != "strike" && tag_name != "del" && tag_name != "u" && tag_name != "ins" &&
tag_name != "tg-spoiler" && tag_name != "tg-emoji" && tag_name != "span" && tag_name != "pre" &&
tag_name != "code" && tag_name != "blockquote") {
return Status::Error(400, PSLICE()
<< "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
}
// the value extracted from the attributes, which are meaningful for the tag
string argument;
// parse attributes of the tag till the closing '>'
while (text[i] != '>') {
while (text[i] != 0 && is_space(text[i])) {
i++;
}
if (text[i] == '>') {
break;
}
auto attribute_begin_pos = i;
while (!is_space(text[i]) && text[i] != '=' && text[i] != '>' && text[i] != '/' && text[i] != '"' &&
text[i] != '\'') {
i++;
}
Slice attribute_name(text + attribute_begin_pos, i - attribute_begin_pos);
if (attribute_name.empty()) {
return Status::Error(
400, PSLICE() << "Empty attribute name in the tag \"" << tag_name << "\" at byte offset " << begin_pos);
}
while (text[i] != 0 && is_space(text[i])) {
i++;
}
if (text[i] != '=') {
// an attribute without a value
if (text[i] == 0) {
return Status::Error(400, PSLICE()
<< "Unclosed start tag \"" << tag_name << "\" at byte offset " << begin_pos);
}
if (tag_name == "blockquote" && attribute_name == Slice("expandable")) {
argument = "1";
}
continue;
}
i++;
while (text[i] != 0 && is_space(text[i])) {
i++;
}
if (text[i] == 0) {
return Status::Error(400, PSLICE()
<< "Unclosed start tag \"" << tag_name << "\" at byte offset " << begin_pos);
}
string attribute_value;
if (text[i] != '\'' && text[i] != '"') {
// A name token (a sequence of letters, digits, periods, or hyphens). Name tokens are not case sensitive.
auto token_begin_pos = i;
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
i++;
}
attribute_value = to_lower(Slice(text + token_begin_pos, i - token_begin_pos));
if (!is_space(text[i]) && text[i] != '>') {
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
}
} else {
// A string literal
// character references in the literal are decoded in place in the buffer of str,
// attribute_end is the write position
char end_character = text[i++];
char *attribute_end = &str[i];
const char *attribute_begin = attribute_end;
while (text[i] != end_character && text[i] != 0) {
if (text[i] == '&') {
auto code = decode_html_entity(str, i);
if (code != 0) {
attribute_end = reinterpret_cast<char *>(
append_utf8_character_unsafe(reinterpret_cast<unsigned char *>(attribute_end), code));
continue;
}
}
*attribute_end++ = text[i++];
}
if (text[i] == end_character) {
i++;
}
attribute_value.assign(attribute_begin, static_cast<size_t>(attribute_end - attribute_begin));
}
if (text[i] == 0) {
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
}
// keep only the attributes, which are meaningful for the tag; all other attributes are ignored
if (tag_name == "a" && attribute_name == Slice("href")) {
argument = std::move(attribute_value);
} else if (tag_name == "code" && attribute_name == Slice("class") &&
begins_with(attribute_value, "language-")) {
// 9 is the length of the prefix "language-"
argument = attribute_value.substr(9);
} else if (tag_name == "span" && attribute_name == Slice("class") && begins_with(attribute_value, "tg-")) {
// 3 is the length of the prefix "tg-"
argument = attribute_value.substr(3);
} else if (tag_name == "tg-emoji" && attribute_name == Slice("emoji-id")) {
argument = std::move(attribute_value);
} else if (tag_name == "blockquote" && attribute_name == Slice("expandable")) {
argument = "1";
}
}
if (tag_name == "span" && argument != "spoiler") {
return Status::Error(400, PSLICE()
<< "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
}
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result_end - result_begin);
} else {
// end of an entity
if (nested_entities.empty()) {
return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos);
}
while (!is_space(text[i]) && text[i] != '>') {
i++;
}
string end_tag_name = to_lower(Slice(text + begin_pos + 2, i - begin_pos - 2));
while (is_space(text[i]) && text[i] != 0) {
i++;
}
if (text[i] != '>') {
return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << begin_pos);
}
// an end tag with an empty name closes the last opened tag
const string &tag_name = nested_entities.back().tag_name;
if (!end_tag_name.empty() && end_tag_name != tag_name) {
return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << begin_pos << ", expected \"</"
<< tag_name << ">\", found \"</" << end_tag_name << ">\"");
}
// tags with empty content produce no entity
if (utf16_offset > nested_entities.back().entity_offset) {
auto entity_offset = nested_entities.back().entity_offset;
auto entity_length = utf16_offset - entity_offset;
if (tag_name == "i" || tag_name == "em") {
entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
} else if (tag_name == "b" || tag_name == "strong") {
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
} else if (tag_name == "s" || tag_name == "strike" || tag_name == "del") {
entities.emplace_back(MessageEntity::Type::Strikethrough, entity_offset, entity_length);
} else if (tag_name == "u" || tag_name == "ins") {
entities.emplace_back(MessageEntity::Type::Underline, entity_offset, entity_length);
} else if (tag_name == "tg-spoiler" || (tag_name == "span" && nested_entities.back().argument == "spoiler")) {
entities.emplace_back(MessageEntity::Type::Spoiler, entity_offset, entity_length);
} else if (tag_name == "tg-emoji") {
auto r_document_id = to_integer_safe<int64>(nested_entities.back().argument);
if (r_document_id.is_error() || r_document_id.ok() == 0) {
return Status::Error(400, "Invalid custom emoji identifier specified");
}
entities.emplace_back(MessageEntity::Type::CustomEmoji, entity_offset, entity_length,
CustomEmojiId(r_document_id.ok()));
} else if (tag_name == "a") {
auto url = std::move(nested_entities.back().argument);
if (url.empty()) {
// there is no href value, so the decoded content of the tag is used as the URL
url = Slice(result_begin + nested_entities.back().entity_begin_pos, result_end).str();
}
// a link to a user becomes an entity with the user identifier; a link, for which
// get_checked_link returns an empty string, produces no entity
auto user_id = LinkManager::get_link_user_id(url);
if (user_id.is_valid()) {
entities.emplace_back(entity_offset, entity_length, user_id);
} else {
url = LinkManager::get_checked_link(url);
if (!url.empty()) {
entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, std::move(url));
}
}
} else if (tag_name == "pre") {
// <pre><code class="language-...">: the just added Code entity with a language
// covering the same text is turned into a PreCode entity
if (!entities.empty() && entities.back().type == MessageEntity::Type::Code &&
entities.back().offset == entity_offset && entities.back().length == entity_length &&
!entities.back().argument.empty()) {
entities.back().type = MessageEntity::Type::PreCode;
} else {
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
}
} else if (tag_name == "code") {
// <code class="language-..."><pre>: the just added Pre entity covering the same text
// is turned into a PreCode entity
if (!entities.empty() && entities.back().type == MessageEntity::Type::Pre &&
entities.back().offset == entity_offset && entities.back().length == entity_length &&
!nested_entities.back().argument.empty()) {
entities.back().type = MessageEntity::Type::PreCode;
entities.back().argument = std::move(nested_entities.back().argument);
} else {
// the language is kept temporarily to allow merging with an enclosing <pre> tag
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length,
nested_entities.back().argument);
}
} else if (tag_name == "blockquote") {
if (!nested_entities.back().argument.empty()) {
entities.emplace_back(MessageEntity::Type::ExpandableBlockQuote, entity_offset, entity_length);
} else {
entities.emplace_back(MessageEntity::Type::BlockQuote, entity_offset, entity_length);
}
} else {
UNREACHABLE();
}
}
nested_entities.pop_back();
}
}
if (!nested_entities.empty()) {
return Status::Error(
400, PSLICE() << "Can't find end tag corresponding to start tag \"" << nested_entities.back().tag_name << '"');
}
// drop the language of Code entities, which weren't merged into PreCode entities
for (auto &entity : entities) {
if (entity.type == MessageEntity::Type::Code && !entity.argument.empty()) {
entity.argument.clear();
}
}
sort_entities(entities);
// shrink the string to the decoded text
str.resize(static_cast<size_t>(result_end - result_begin));
if (need_recheck_utf8 && !check_utf8(str)) {
return Status::Error(400,
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
"surrogate code units");
}
return std::move(entities);
}
// Converts message entities to secret chat API entities for a secret chat with the given layer.
// Entities, which aren't sent to secret chats or require a newer layer, are omitted.
vector<tl_object_ptr<secret_api::MessageEntity>> get_input_secret_message_entities(
    const vector<MessageEntity> &entities, int32 layer) {
  const bool supports_new_entities = layer >= static_cast<int32>(SecretChatLayer::NewEntities);
  const bool supports_spoiler_and_custom_emoji =
      layer >= static_cast<int32>(SecretChatLayer::SpoilerAndCustomEmojiEntities);

  vector<tl_object_ptr<secret_api::MessageEntity>> secret_entities;
  for (const auto &entity : entities) {
    const auto &offset = entity.offset;
    const auto &length = entity.length;
    switch (entity.type) {
      // entities, which are never sent to secret chats
      case MessageEntity::Type::Cashtag:
      case MessageEntity::Type::BotCommand:
      case MessageEntity::Type::PhoneNumber:
      case MessageEntity::Type::BankCardNumber:
      case MessageEntity::Type::MentionName:
      case MessageEntity::Type::MediaTimestamp:
        break;

      // block quotes aren't sent for any layer; the disabled conversion for layers
      // beginning from SecretChatLayer::NewEntities is kept for reference:
      // result.push_back(make_tl_object<secret_api::messageEntityBlockquote>(0, false /*ignored*/, entity.offset, entity.length));
      // result.push_back(make_tl_object<secret_api::messageEntityBlockquote>(
      //     secret_api::messageEntityBlockquote::COLLAPSED_MASK, false /*ignored*/, entity.offset, entity.length));
      case MessageEntity::Type::BlockQuote:
      case MessageEntity::Type::ExpandableBlockQuote:
        break;

      // entities, which are supported by all layers
      case MessageEntity::Type::Mention:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityMention>(offset, length));
        break;
      case MessageEntity::Type::Hashtag:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityHashtag>(offset, length));
        break;
      case MessageEntity::Type::Url:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityUrl>(offset, length));
        break;
      case MessageEntity::Type::EmailAddress:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityEmail>(offset, length));
        break;
      case MessageEntity::Type::Bold:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityBold>(offset, length));
        break;
      case MessageEntity::Type::Italic:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityItalic>(offset, length));
        break;
      case MessageEntity::Type::Code:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityCode>(offset, length));
        break;
      case MessageEntity::Type::Pre:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityPre>(offset, length, string()));
        break;
      case MessageEntity::Type::PreCode:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityPre>(offset, length, entity.argument));
        break;
      case MessageEntity::Type::TextUrl:
        secret_entities.push_back(make_tl_object<secret_api::messageEntityTextUrl>(offset, length, entity.argument));
        break;

      // entities, which require SecretChatLayer::NewEntities
      case MessageEntity::Type::Underline:
        if (supports_new_entities) {
          secret_entities.push_back(make_tl_object<secret_api::messageEntityUnderline>(offset, length));
        }
        break;
      case MessageEntity::Type::Strikethrough:
        if (supports_new_entities) {
          secret_entities.push_back(make_tl_object<secret_api::messageEntityStrike>(offset, length));
        }
        break;

      // entities, which require SecretChatLayer::SpoilerAndCustomEmojiEntities
      case MessageEntity::Type::Spoiler:
        if (supports_spoiler_and_custom_emoji) {
          secret_entities.push_back(make_tl_object<secret_api::messageEntitySpoiler>(offset, length));
        }
        break;
      case MessageEntity::Type::CustomEmoji:
        if (supports_spoiler_and_custom_emoji) {
          secret_entities.push_back(
              make_tl_object<secret_api::messageEntityCustomEmoji>(offset, length, entity.custom_emoji_id.get()));
        }
        break;

      default:
        UNREACHABLE();
    }
  }
  return secret_entities;
}
// Converts text entities received through the TDLib API to message entities.
// Entities without an object or without a type are skipped.
// If allow_all is false, then only entities, for which is_user_entity is true, are kept.
// If user_manager is not null, then it is used to check that the mentioned users are accessible.
// Returns an error if a string isn't encoded in UTF-8, a link, a media timestamp or
// a custom emoji identifier is invalid, or a mentioned user isn't accessible.
Result<vector<MessageEntity>> get_message_entities(const UserManager *user_manager,
                                                   vector<tl_object_ptr<td_api::textEntity>> &&input_entities,
                                                   bool allow_all) {
  vector<MessageEntity> entities;
  entities.reserve(input_entities.size());
  for (auto &input_entity : input_entities) {
    if (input_entity == nullptr || input_entity->type_ == nullptr) {
      continue;
    }

    auto offset = input_entity->offset_;
    auto length = input_entity->length_;
    auto *entity_type = input_entity->type_.get();

    // adds an entity, which is completely described by its type, offset and length
    auto add_simple_entity = [&](MessageEntity::Type type) {
      entities.emplace_back(type, offset, length);
    };

    switch (entity_type->get_id()) {
      case td_api::textEntityTypeMention::ID:
        add_simple_entity(MessageEntity::Type::Mention);
        break;
      case td_api::textEntityTypeHashtag::ID:
        add_simple_entity(MessageEntity::Type::Hashtag);
        break;
      case td_api::textEntityTypeBotCommand::ID:
        add_simple_entity(MessageEntity::Type::BotCommand);
        break;
      case td_api::textEntityTypeUrl::ID:
        add_simple_entity(MessageEntity::Type::Url);
        break;
      case td_api::textEntityTypeEmailAddress::ID:
        add_simple_entity(MessageEntity::Type::EmailAddress);
        break;
      case td_api::textEntityTypeCashtag::ID:
        add_simple_entity(MessageEntity::Type::Cashtag);
        break;
      case td_api::textEntityTypePhoneNumber::ID:
        add_simple_entity(MessageEntity::Type::PhoneNumber);
        break;
      case td_api::textEntityTypeBankCardNumber::ID:
        add_simple_entity(MessageEntity::Type::BankCardNumber);
        break;
      case td_api::textEntityTypeBold::ID:
        add_simple_entity(MessageEntity::Type::Bold);
        break;
      case td_api::textEntityTypeItalic::ID:
        add_simple_entity(MessageEntity::Type::Italic);
        break;
      case td_api::textEntityTypeUnderline::ID:
        add_simple_entity(MessageEntity::Type::Underline);
        break;
      case td_api::textEntityTypeStrikethrough::ID:
        add_simple_entity(MessageEntity::Type::Strikethrough);
        break;
      case td_api::textEntityTypeBlockQuote::ID:
        add_simple_entity(MessageEntity::Type::BlockQuote);
        break;
      case td_api::textEntityTypeCode::ID:
        add_simple_entity(MessageEntity::Type::Code);
        break;
      case td_api::textEntityTypePre::ID:
        add_simple_entity(MessageEntity::Type::Pre);
        break;
      case td_api::textEntityTypeSpoiler::ID:
        add_simple_entity(MessageEntity::Type::Spoiler);
        break;
      case td_api::textEntityTypeExpandableBlockQuote::ID:
        add_simple_entity(MessageEntity::Type::ExpandableBlockQuote);
        break;
      case td_api::textEntityTypePreCode::ID: {
        auto pre_code = static_cast<td_api::textEntityTypePreCode *>(entity_type);
        if (!clean_input_string(pre_code->language_)) {
          return Status::Error(400, "MessageEntityPreCode.language must be encoded in UTF-8");
        }
        entities.emplace_back(MessageEntity::Type::PreCode, offset, length, pre_code->language_);
        break;
      }
      case td_api::textEntityTypeTextUrl::ID: {
        auto text_url = static_cast<td_api::textEntityTypeTextUrl *>(entity_type);
        if (!clean_input_string(text_url->url_)) {
          return Status::Error(400, "MessageEntityTextUrl.url must be encoded in UTF-8");
        }
        auto user_id = LinkManager::get_link_user_id(text_url->url_);
        if (user_id.is_valid()) {
          // a link to a user is converted to a mention of the user
          if (user_manager != nullptr) {
            TRY_STATUS(user_manager->get_input_user(user_id));
          }
          entities.emplace_back(offset, length, user_id);
        } else {
          auto r_url = LinkManager::check_link(text_url->url_);
          if (r_url.is_error()) {
            return Status::Error(400, PSTRING() << "Entity " << r_url.error().message());
          }
          entities.emplace_back(MessageEntity::Type::TextUrl, offset, length, r_url.move_as_ok());
        }
        break;
      }
      case td_api::textEntityTypeMentionName::ID: {
        auto mention_name = static_cast<const td_api::textEntityTypeMentionName *>(entity_type);
        UserId user_id(mention_name->user_id_);
        if (user_manager != nullptr) {
          TRY_STATUS(user_manager->get_input_user(user_id));
        }
        entities.emplace_back(offset, length, user_id);
        break;
      }
      case td_api::textEntityTypeMediaTimestamp::ID: {
        auto media_timestamp = static_cast<const td_api::textEntityTypeMediaTimestamp *>(entity_type);
        if (media_timestamp->media_timestamp_ < 0) {
          return Status::Error(400, "Invalid media timestamp specified");
        }
        entities.emplace_back(MessageEntity::Type::MediaTimestamp, offset, length,
                              media_timestamp->media_timestamp_);
        break;
      }
      case td_api::textEntityTypeCustomEmoji::ID: {
        auto custom_emoji = static_cast<const td_api::textEntityTypeCustomEmoji *>(entity_type);
        CustomEmojiId custom_emoji_id(custom_emoji->custom_emoji_id_);
        if (!custom_emoji_id.is_valid()) {
          return Status::Error(400, "Invalid custom emoji identifier specified");
        }
        entities.emplace_back(MessageEntity::Type::CustomEmoji, offset, length, custom_emoji_id);
        break;
      }
      default:
        UNREACHABLE();
    }

    // every handled type has added exactly one entity; drop it if it isn't allowed
    CHECK(!entities.empty());
    const bool is_allowed = allow_all || is_user_entity(entities.back().type);
    if (!is_allowed) {
      entities.pop_back();
    }
  }
  return std::move(entities);
}
// Converts message entities received from the server to message entities.
// Unknown entities are skipped. Text URLs with an invalid link and user mentions with an invalid
// or inaccessible user are skipped and reported as errors to the log; source is used only to
// describe the origin of the entities in these log messages.
vector<MessageEntity> get_message_entities(const UserManager *user_manager,
                                           vector<tl_object_ptr<telegram_api::MessageEntity>> &&server_entities,
                                           const char *source) {
  vector<MessageEntity> entities;
  entities.reserve(server_entities.size());

  // adds an entity, which is completely described by its type, offset and length
  auto add_simple_entity = [&entities](MessageEntity::Type type, const auto *typed_entity) {
    entities.emplace_back(type, typed_entity->offset_, typed_entity->length_);
  };

  for (auto &server_entity : server_entities) {
    auto *raw_entity = server_entity.get();
    switch (raw_entity->get_id()) {
      case telegram_api::messageEntityUnknown::ID:
        break;
      case telegram_api::messageEntityMention::ID:
        add_simple_entity(MessageEntity::Type::Mention,
                          static_cast<const telegram_api::messageEntityMention *>(raw_entity));
        break;
      case telegram_api::messageEntityHashtag::ID:
        add_simple_entity(MessageEntity::Type::Hashtag,
                          static_cast<const telegram_api::messageEntityHashtag *>(raw_entity));
        break;
      case telegram_api::messageEntityCashtag::ID:
        add_simple_entity(MessageEntity::Type::Cashtag,
                          static_cast<const telegram_api::messageEntityCashtag *>(raw_entity));
        break;
      case telegram_api::messageEntityPhone::ID:
        add_simple_entity(MessageEntity::Type::PhoneNumber,
                          static_cast<const telegram_api::messageEntityPhone *>(raw_entity));
        break;
      case telegram_api::messageEntityBotCommand::ID:
        add_simple_entity(MessageEntity::Type::BotCommand,
                          static_cast<const telegram_api::messageEntityBotCommand *>(raw_entity));
        break;
      case telegram_api::messageEntityBankCard::ID:
        add_simple_entity(MessageEntity::Type::BankCardNumber,
                          static_cast<const telegram_api::messageEntityBankCard *>(raw_entity));
        break;
      case telegram_api::messageEntityUrl::ID:
        add_simple_entity(MessageEntity::Type::Url, static_cast<const telegram_api::messageEntityUrl *>(raw_entity));
        break;
      case telegram_api::messageEntityEmail::ID:
        add_simple_entity(MessageEntity::Type::EmailAddress,
                          static_cast<const telegram_api::messageEntityEmail *>(raw_entity));
        break;
      case telegram_api::messageEntityBold::ID:
        add_simple_entity(MessageEntity::Type::Bold, static_cast<const telegram_api::messageEntityBold *>(raw_entity));
        break;
      case telegram_api::messageEntityItalic::ID:
        add_simple_entity(MessageEntity::Type::Italic,
                          static_cast<const telegram_api::messageEntityItalic *>(raw_entity));
        break;
      case telegram_api::messageEntityUnderline::ID:
        add_simple_entity(MessageEntity::Type::Underline,
                          static_cast<const telegram_api::messageEntityUnderline *>(raw_entity));
        break;
      case telegram_api::messageEntityStrike::ID:
        add_simple_entity(MessageEntity::Type::Strikethrough,
                          static_cast<const telegram_api::messageEntityStrike *>(raw_entity));
        break;
      case telegram_api::messageEntitySpoiler::ID:
        add_simple_entity(MessageEntity::Type::Spoiler,
                          static_cast<const telegram_api::messageEntitySpoiler *>(raw_entity));
        break;
      case telegram_api::messageEntityCode::ID:
        add_simple_entity(MessageEntity::Type::Code, static_cast<const telegram_api::messageEntityCode *>(raw_entity));
        break;
      case telegram_api::messageEntityBlockquote::ID: {
        auto blockquote = static_cast<const telegram_api::messageEntityBlockquote *>(raw_entity);
        add_simple_entity(
            blockquote->collapsed_ ? MessageEntity::Type::ExpandableBlockQuote : MessageEntity::Type::BlockQuote,
            blockquote);
        break;
      }
      case telegram_api::messageEntityPre::ID: {
        // the pointer isn't const, because the language is moved out of the server object
        auto pre = static_cast<telegram_api::messageEntityPre *>(raw_entity);
        if (!pre->language_.empty()) {
          entities.emplace_back(MessageEntity::Type::PreCode, pre->offset_, pre->length_, std::move(pre->language_));
        } else {
          entities.emplace_back(MessageEntity::Type::Pre, pre->offset_, pre->length_);
        }
        break;
      }
      case telegram_api::messageEntityTextUrl::ID: {
        auto text_url = static_cast<const telegram_api::messageEntityTextUrl *>(raw_entity);
        auto r_url = LinkManager::check_link(text_url->url_);
        if (r_url.is_error()) {
          LOG(ERROR) << "Entity " << r_url.error().message() << " from " << source;
          continue;
        }
        entities.emplace_back(MessageEntity::Type::TextUrl, text_url->offset_, text_url->length_, r_url.move_as_ok());
        break;
      }
      case telegram_api::messageEntityMentionName::ID: {
        auto mention_name = static_cast<const telegram_api::messageEntityMentionName *>(raw_entity);
        UserId user_id(mention_name->user_id_);
        if (!user_id.is_valid()) {
          LOG(ERROR) << "Receive invalid " << user_id << " in MentionName from " << source;
          continue;
        }
        if (user_manager == nullptr) {
          LOG(ERROR) << "Receive unknown " << user_id << " in MentionName from " << source;
          continue;
        }
        auto r_input_user = user_manager->get_input_user(user_id);
        if (r_input_user.is_error()) {
          LOG(ERROR) << "Receive wrong " << user_id << ": " << r_input_user.error() << " from " << source;
          continue;
        }
        entities.emplace_back(mention_name->offset_, mention_name->length_, user_id);
        break;
      }
      case telegram_api::messageEntityCustomEmoji::ID: {
        auto custom_emoji = static_cast<const telegram_api::messageEntityCustomEmoji *>(raw_entity);
        entities.emplace_back(MessageEntity::Type::CustomEmoji, custom_emoji->offset_, custom_emoji->length_,
                              CustomEmojiId(custom_emoji->document_id_));
        break;
      }
      default:
        UNREACHABLE();
    }
  }
  return entities;
}
// Converts entities of a message received in a secret chat to MessageEntity objects.
// Unknown entities, mentions, hashtags, cashtags, phone numbers, bank card numbers, bot commands and
// name mentions are dropped here. Processing stops as soon as MAX_SECRET_CHAT_ENTITIES entities are collected;
// at most MAX_CUSTOM_EMOJI_ENTITIES of them can be custom emoji.
// If is_premium is false and some custom emoji were kept, their stickers are requested from StickersManager,
// and a promise obtained from load_data_multipromise is set when the request completes.
vector<MessageEntity> get_message_entities(Td *td, vector<tl_object_ptr<secret_api::MessageEntity>> &&secret_entities,
                                           bool is_premium, MultiPromiseActor &load_data_multipromise) {
  constexpr size_t MAX_SECRET_CHAT_ENTITIES = 1000;
  constexpr size_t MAX_CUSTOM_EMOJI_ENTITIES = 100;

  vector<MessageEntity> entities;
  entities.reserve(secret_entities.size());
  vector<CustomEmojiId> custom_emoji_ids;

  // appends an entity, which is fully described by its type, offset and length
  auto add_plain_entity = [&entities](MessageEntity::Type type, const auto *entity) {
    entities.emplace_back(type, entity->offset_, entity->length_);
  };

  for (auto &secret_entity : secret_entities) {
    auto *raw_entity = secret_entity.get();
    switch (secret_entity->get_id()) {
      // unknown entities are ignored
      case secret_api::messageEntityUnknown::ID:
      // skip, will find it ourselves
      case secret_api::messageEntityMention::ID:
      case secret_api::messageEntityHashtag::ID:
      case secret_api::messageEntityCashtag::ID:
      case secret_api::messageEntityPhone::ID:
      case secret_api::messageEntityBankCard::ID:
      // skip all bot commands in secret chats
      case secret_api::messageEntityBotCommand::ID:
      // skip all name mentions in secret chats
      case secret_api::messageEntityMentionName::ID:
        break;
      case secret_api::messageEntityUrl::ID:
        // TODO skip URL when find_urls will be better
        add_plain_entity(MessageEntity::Type::Url, static_cast<const secret_api::messageEntityUrl *>(raw_entity));
        break;
      case secret_api::messageEntityEmail::ID:
        // TODO skip emails when find_urls will be better
        add_plain_entity(MessageEntity::Type::EmailAddress,
                         static_cast<const secret_api::messageEntityEmail *>(raw_entity));
        break;
      case secret_api::messageEntityBold::ID:
        add_plain_entity(MessageEntity::Type::Bold, static_cast<const secret_api::messageEntityBold *>(raw_entity));
        break;
      case secret_api::messageEntityItalic::ID:
        add_plain_entity(MessageEntity::Type::Italic,
                         static_cast<const secret_api::messageEntityItalic *>(raw_entity));
        break;
      case secret_api::messageEntityUnderline::ID:
        add_plain_entity(MessageEntity::Type::Underline,
                         static_cast<const secret_api::messageEntityUnderline *>(raw_entity));
        break;
      case secret_api::messageEntityStrike::ID:
        add_plain_entity(MessageEntity::Type::Strikethrough,
                         static_cast<const secret_api::messageEntityStrike *>(raw_entity));
        break;
      case secret_api::messageEntityBlockquote::ID:
        add_plain_entity(MessageEntity::Type::BlockQuote,
                         static_cast<const secret_api::messageEntityBlockquote *>(raw_entity));
        break;
      case secret_api::messageEntityCode::ID:
        add_plain_entity(MessageEntity::Type::Code, static_cast<const secret_api::messageEntityCode *>(raw_entity));
        break;
      case secret_api::messageEntitySpoiler::ID:
        add_plain_entity(MessageEntity::Type::Spoiler,
                         static_cast<const secret_api::messageEntitySpoiler *>(raw_entity));
        break;
      case secret_api::messageEntityPre::ID: {
        auto *pre = static_cast<secret_api::messageEntityPre *>(raw_entity);
        auto &language = pre->language_;
        if (!clean_input_string(language)) {
          // an invalid language is dropped, but the entity itself is kept
          LOG(WARNING) << "Wrong language in entity: \"" << language << '"';
          language.clear();
        }
        if (language.empty()) {
          entities.emplace_back(MessageEntity::Type::Pre, pre->offset_, pre->length_);
        } else {
          entities.emplace_back(MessageEntity::Type::PreCode, pre->offset_, pre->length_, std::move(language));
        }
        break;
      }
      case secret_api::messageEntityTextUrl::ID: {
        auto *text_url = static_cast<secret_api::messageEntityTextUrl *>(raw_entity);
        if (!clean_input_string(text_url->url_)) {
          LOG(WARNING) << "Wrong URL entity: \"" << text_url->url_ << '"';
          continue;
        }
        auto r_url = LinkManager::check_link(text_url->url_);
        if (r_url.is_error()) {
          LOG(WARNING) << "Entity " << r_url.error().message();
          continue;
        }
        entities.emplace_back(MessageEntity::Type::TextUrl, text_url->offset_, text_url->length_,
                              r_url.move_as_ok());
        break;
      }
      case secret_api::messageEntityCustomEmoji::ID: {
        auto *custom_emoji = static_cast<const secret_api::messageEntityCustomEmoji *>(raw_entity);
        CustomEmojiId custom_emoji_id(custom_emoji->document_id_);
        const bool is_allowed =
            is_premium || !td->stickers_manager_->is_premium_custom_emoji(custom_emoji_id, false);
        if (is_allowed && custom_emoji_ids.size() < MAX_CUSTOM_EMOJI_ENTITIES) {
          entities.emplace_back(MessageEntity::Type::CustomEmoji, custom_emoji->offset_, custom_emoji->length_,
                                custom_emoji_id);
          custom_emoji_ids.push_back(custom_emoji_id);
        }
        break;
      }
      default:
        UNREACHABLE();
    }
    if (entities.size() >= MAX_SECRET_CHAT_ENTITIES) {
      break;
    }
  }

  const bool need_preload = !is_premium && !custom_emoji_ids.empty();
  if (need_preload) {
    // preload custom emoji to check that they aren't premium
    td->stickers_manager_->get_custom_emoji_stickers(
        std::move(custom_emoji_ids), true,
        PromiseCreator::lambda(
            [promise = load_data_multipromise.get_promise()](td_api::object_ptr<td_api::stickers> result) mutable {
              promise.set_value(Unit());
            }));
  }

  return entities;
}
// Creates a telegram_api::textWithEntities object from the text and the entities of the given formatted text.
telegram_api::object_ptr<telegram_api::textWithEntities> get_input_text_with_entities(const UserManager *user_manager,
                                                                                      const FormattedText &text,
                                                                                      const char *source) {
  auto input_entities = get_input_message_entities(user_manager, text.entities, source);
  return telegram_api::make_object<telegram_api::textWithEntities>(text.text, std::move(input_entities));
}
// Creates a FormattedText from a text and entities received from the server.
// The entities are fixed with fix_formatted_text without searching for new entities. If that fails, the error
// is logged, the text is cleaned (or cleared if it can't be cleaned) and the entities are replaced with
// the ones found in the text by find_entities.
FormattedText get_formatted_text(const UserManager *user_manager, string &&text,
                                 vector<telegram_api::object_ptr<telegram_api::MessageEntity>> &&server_entities,
                                 bool skip_media_timestamps, bool skip_trim, const char *source) {
  auto parsed_entities = get_message_entities(user_manager, std::move(server_entities), source);
  auto fix_status = fix_formatted_text(text, parsed_entities, true, true, true, skip_media_timestamps, skip_trim);
  if (!fix_status.is_error()) {
    return FormattedText{std::move(text), std::move(parsed_entities)};
  }

  LOG(ERROR) << "Receive error " << fix_status << " from " << source << " while parsing \"" << text << "\"("
             << hex_encode(text) << ')';
  const bool is_cleaned = clean_input_string(text);
  if (!is_cleaned) {
    text.clear();
  }
  parsed_entities = find_entities(text, true, skip_media_timestamps);
  return FormattedText{std::move(text), std::move(parsed_entities)};
}
// Creates a FormattedText from a non-null telegram_api::textWithEntities object
// by forwarding its text and entities to the overload accepting them separately.
FormattedText get_formatted_text(const UserManager *user_manager,
                                 telegram_api::object_ptr<telegram_api::textWithEntities> text_with_entities,
                                 bool skip_media_timestamps, bool skip_trim, const char *source) {
  CHECK(text_with_entities != nullptr);
  auto &server_text = text_with_entities->text_;
  auto &server_entities = text_with_entities->entities_;
  return get_formatted_text(user_manager, std::move(server_text), std::move(server_entities), skip_media_timestamps,
                            skip_trim, source);
}
// like clean_input_string but also fixes entities
// entities must be sorted, can be nested, but must not intersect each other
//
// Returns a cleaned copy of the text and adjusts offset and length of the entities in place, so that they
// describe the same characters in the cleaned text.
// Returns an error with code 400 if an entity begins or ends in the middle of a surrogate pair,
// begins after the end of the text, or ends after the end of the text.
// Some entities may already be adjusted when an error is returned.
static Result<string> clean_input_string_with_entities(const string &text, vector<MessageEntity> &entities) {
check_is_sorted(entities);
// an entity, which has begun but hasn't ended yet, together with the number of UTF-16 code units,
// which had been removed from the text before the entity began
struct EntityInfo {
MessageEntity *entity;
int32 utf16_skipped_before;
EntityInfo(MessageEntity *entity, int32 utf16_skipped_before)
: entity(entity), utf16_skipped_before(utf16_skipped_before) {
}
};
// currently open entities; the innermost entity is at the back
vector<EntityInfo> nested_entities_stack;
// index of the first entity, which hasn't begun yet
size_t current_entity = 0;
// offset in UTF-16 code units in the original text
int32 utf16_offset = 0;
// number of UTF-16 code units removed from the original text so far
int32 utf16_skipped = 0;
size_t text_size = text.size();
string result;
result.reserve(text_size);
// the loop also visits pos == text_size to close and open entities located at the end of the text;
// NOTE(review): relies on text[text_size] being readable for a const string and on the read value
// being treated as a beginning of a character - confirm against is_utf8_character_first_code_unit
for (size_t pos = 0; pos <= text_size; pos++) {
auto c = static_cast<unsigned char>(text[pos]);
bool is_utf8_character_begin = is_utf8_character_first_code_unit(c);
if (is_utf8_character_begin) {
// close all entities, which end at the current offset
while (!nested_entities_stack.empty()) {
auto *entity = nested_entities_stack.back().entity;
auto entity_end = entity->offset + entity->length;
if (utf16_offset < entity_end) {
break;
}
if (utf16_offset != entity_end) {
// the end of the entity was jumped over, which is expected to happen only for a surrogate pair
CHECK(utf16_offset == entity_end + 1);
return Status::Error(400, PSLICE() << "Entity beginning at UTF-16 offset " << entity->offset
<< " ends in a middle of a UTF-16 symbol at byte offset " << pos);
}
// shift the entity by the code units removed before it and shrink it by the code units removed inside it
auto skipped_before_current_entity = nested_entities_stack.back().utf16_skipped_before;
entity->offset -= skipped_before_current_entity;
entity->length -= utf16_skipped - skipped_before_current_entity;
nested_entities_stack.pop_back();
}
// open all entities, which begin at the current offset
while (current_entity < entities.size() && utf16_offset >= entities[current_entity].offset) {
if (utf16_offset != entities[current_entity].offset) {
// the beginning of the entity was jumped over, which is expected to happen only for a surrogate pair
CHECK(utf16_offset == entities[current_entity].offset + 1);
return Status::Error(400, PSLICE() << "Entity begins in a middle of a UTF-16 symbol at byte offset " << pos);
}
nested_entities_stack.emplace_back(&entities[current_entity++], utf16_skipped);
}
}
if (pos == text_size) {
break;
}
switch (c) {
// remove control characters
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
case 6:
case 7:
case 8:
case 9:
// allow '\n'
case 11:
case 12:
// ignore '\r'
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
case 20:
case 21:
case 22:
case 23:
case 24:
case 25:
case 26:
case 27:
case 28:
case 29:
case 30:
case 31:
case 32:
// the character is replaced with a space, so no UTF-16 code units are skipped
result.push_back(' ');
utf16_offset++;
break;
case '\r':
// skip
utf16_offset++;
utf16_skipped++;
break;
default:
// UTF-16 length of a character is counted at its first byte; continuation bytes are just copied
if (is_utf8_character_begin) {
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
// remove characters encoded as 0xe2 0x80 0xa8-0xae, i.e. U+2028-U+202E
if (c == 0xe2 && pos + 2 < text_size) {
auto next = static_cast<unsigned char>(text[pos + 1]);
if (next == 0x80) {
next = static_cast<unsigned char>(text[pos + 2]);
if (0xa8 <= next && next <= 0xae) {
pos += 2;
utf16_skipped++;
break;
}
}
}
// remove characters encoded as 0xcc 0xb3, 0xcc 0xbf and 0xcc 0x8a, i.e. U+0333, U+033F and U+030A
if (c == 0xcc && pos + 1 < text_size) {
auto next = static_cast<unsigned char>(text[pos + 1]);
// remove vertical lines
if (next == 0xb3 || next == 0xbf || next == 0x8a) {
pos++;
utf16_skipped++;
break;
}
}
result.push_back(text[pos]);
break;
}
}
// entities left unopened begin after the end of the text
if (current_entity != entities.size()) {
return Status::Error(400, PSLICE() << "Entity begins after the end of the text at UTF-16 offset "
<< entities[current_entity].offset);
}
// entities left open end after the end of the text
if (!nested_entities_stack.empty()) {
auto *entity = nested_entities_stack.back().entity;
return Status::Error(400, PSLICE() << "Entity beginning at UTF-16 offset " << entity->offset
<< " ends after the end of the text at UTF-16 offset "
<< entity->offset + entity->length);
}
replace_offending_characters(result);
return result;
}
// removes empty entities
// entities must be sorted by offset and length, but not necessary by type
// returns {last_non_whitespace_pos, last_non_whitespace_utf16_offset}
static std::pair<size_t, int32> remove_invalid_entities(const string &text, vector<MessageEntity> &entities) {
if (entities.empty()) {
// fast path
for (size_t pos = 0; pos < text.size(); pos++) {
auto back_pos = text.size() - pos - 1;
auto c = text[back_pos];
if (c != '\n' && c != ' ') {
return {back_pos, 0 /*unused*/};
}
}
return {text.size(), -1};
}
// check_is_sorted(entities);
size_t last_non_whitespace_pos = text.size();
int32 utf16_offset = 0;
int32 last_non_whitespace_utf16_offset = -1;
remove_empty_entities(entities);
for (size_t pos = 0; pos < text.size(); pos++) {
auto c = static_cast<unsigned char>(text[pos]);
switch (c) {
case '\n':
case 32:
break;
default:
while (!is_utf8_character_first_code_unit(static_cast<unsigned char>(text[pos + 1]))) {
pos++;
}
utf16_offset += (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
last_non_whitespace_pos = pos;
last_non_whitespace_utf16_offset = utf16_offset;
break;
}
utf16_offset++;
}
return {last_non_whitespace_pos, last_non_whitespace_utf16_offset};
}
// entities must contain only splittable entities
//
// Rebuilds the list of splittable entities: intersecting or adjacent entities of the same type are merged,
// and the resulting ranges are cut at the beginning and at the end of each entity from other_entities.
// Parts of splittable entities lying inside a pre entity from other_entities are dropped.
// Both lists must be sorted; entities is replaced with the new sorted list.
static void split_entities(vector<MessageEntity> &entities, const vector<MessageEntity> &other_entities) {
check_is_sorted(entities);
check_is_sorted(other_entities);
// currently accumulated range [begin_pos, end_pos) for each splittable type;
// end_pos == 0 means that there is no accumulated range for the type
int32 begin_pos[SPLITTABLE_ENTITY_TYPE_COUNT] = {};
int32 end_pos[SPLITTABLE_ENTITY_TYPE_COUNT] = {};
// the next splittable entity to process; shared between all calls of add_entities
auto it = entities.begin();
vector<MessageEntity> result;
// processes all splittable entities beginning before end_offset and
// emits all parts of accumulated ranges lying before end_offset
auto add_entities = [&](int32 end_offset) {
// emits parts of accumulated ranges lying before offset
auto flush_entities = [&](int32 offset) {
for (auto type : {MessageEntity::Type::Bold, MessageEntity::Type::Italic, MessageEntity::Type::Underline,
MessageEntity::Type::Strikethrough, MessageEntity::Type::Spoiler}) {
auto index = get_splittable_entity_type_index(type);
if (end_pos[index] != 0 && begin_pos[index] < offset) {
if (end_pos[index] <= offset) {
// the range ends before offset: emit it fully and forget it
result.emplace_back(type, begin_pos[index], end_pos[index] - begin_pos[index]);
begin_pos[index] = 0;
end_pos[index] = 0;
} else {
// the range crosses offset: emit the part before offset and keep the rest
result.emplace_back(type, begin_pos[index], offset - begin_pos[index]);
begin_pos[index] = offset;
}
}
}
};
while (it != entities.end()) {
if (it->offset >= end_offset) {
break;
}
CHECK(is_splittable_entity(it->type));
auto index = get_splittable_entity_type_index(it->type);
if (it->offset <= end_pos[index] && end_pos[index] != 0) {
// the entity intersects or touches the accumulated range of the same type: extend the range
if (it->offset + it->length > end_pos[index]) {
end_pos[index] = it->offset + it->length;
}
} else {
// the entity begins a new range of its type
flush_entities(it->offset);
begin_pos[index] = it->offset;
end_pos[index] = it->offset + it->length;
}
++it;
}
flush_entities(end_offset);
};
// entities from other_entities, which have begun, but weren't closed yet; the innermost one is at the back
vector<const MessageEntity *> nested_entities_stack;
// closes all other entities ending at or before offset and then splits accumulated ranges at offset
auto add_offset = [&](int32 offset) {
while (!nested_entities_stack.empty() &&
offset >= nested_entities_stack.back()->offset + nested_entities_stack.back()->length) {
// remove non-intersecting entities from the stack
auto old_size = result.size();
add_entities(nested_entities_stack.back()->offset + nested_entities_stack.back()->length);
if (is_pre_entity(nested_entities_stack.back()->type)) {
// discard everything emitted while closing a pre entity
result.resize(old_size);
}
nested_entities_stack.pop_back();
}
add_entities(offset);
};
for (auto &other_entity : other_entities) {
add_offset(other_entity.offset);
nested_entities_stack.push_back(&other_entity);
}
// close all remaining other entities and emit all remaining ranges
add_offset(std::numeric_limits<int32>::max());
entities = std::move(result);
// entities are sorted only by offset now, re-sort if needed
sort_entities(entities);
}
// Splits the splittable entities at the bounds of the other entities and
// returns all the entities combined into one sorted list.
static vector<MessageEntity> resplit_entities(vector<MessageEntity> &&splittable_entities,
                                              vector<MessageEntity> &&entities) {
  if (splittable_entities.empty()) {
    // nothing to split
    return std::move(entities);
  }

  split_entities(splittable_entities, entities);  // can merge some entities
  if (entities.empty()) {
    // nothing to combine with
    return std::move(splittable_entities);
  }

  combine(entities, std::move(splittable_entities));
  sort_entities(entities);
  return std::move(entities);
}
// Sorts the entities and, if they aren't valid after that, rebuilds them:
// intersecting continuous and blockquote entities are removed, and splittable entities are re-split.
void fix_entities(vector<MessageEntity> &entities) {
  sort_entities(entities);
  if (are_entities_valid(entities)) {
    // fast path
    return;
  }

  // distribute the entities by their kind
  vector<MessageEntity> plain_entities;
  vector<MessageEntity> quote_entities;
  vector<MessageEntity> formatting_entities;
  for (auto &entity : entities) {
    auto &bucket = is_splittable_entity(entity.type)
                       ? formatting_entities
                       : (is_blockquote_entity(entity.type) ? quote_entities : plain_entities);
    bucket.push_back(std::move(entity));
  }

  // continuous entities can't intersect each other
  remove_intersecting_entities(plain_entities);

  if (!quote_entities.empty()) {
    // blockquote entities can't intersect each other
    remove_intersecting_entities(quote_entities);

    // blockquote entities can contain continuous entities, but can't intersect them in the other ways
    remove_entities_intersecting_blockquote(plain_entities, quote_entities);

    combine(plain_entities, std::move(quote_entities));
    sort_entities(plain_entities);
  }

  // must be called once to not merge some adjacent entities
  entities = resplit_entities(std::move(formatting_entities), std::move(plain_entities));
  check_is_sorted(entities);
}
// Merges non-intersecting new_entities into the sorted list of entities.
// New entities intersecting blockquote entities are removed by remove_entities_intersecting_blockquote,
// and splittable entities are re-split at the bounds of the merged entities.
static void merge_new_entities(vector<MessageEntity> &entities, vector<MessageEntity> new_entities) {
  check_is_sorted(entities);
  if (new_entities.empty()) {
    // fast path
    return;
  }
  check_non_intersecting(new_entities);

  // distribute the existing entities by their kind
  vector<MessageEntity> plain_entities;
  vector<MessageEntity> quote_entities;
  vector<MessageEntity> formatting_entities;
  for (auto &entity : entities) {
    auto &bucket = is_splittable_entity(entity.type)
                       ? formatting_entities
                       : (is_blockquote_entity(entity.type) ? quote_entities : plain_entities);
    bucket.push_back(std::move(entity));
  }

  remove_entities_intersecting_blockquote(new_entities, quote_entities);

  // merge before combining with blockquote entities
  plain_entities = merge_entities(std::move(plain_entities), std::move(new_entities));

  if (!quote_entities.empty()) {
    combine(plain_entities, std::move(quote_entities));
    sort_entities(plain_entities);
  }

  // must be called once to not merge some adjacent entities
  entities = resplit_entities(std::move(formatting_entities), std::move(plain_entities));
  check_is_sorted(entities);
}
// Cleans the text and brings the entities into a consistent state; both are changed in place.
// text - the text to fix; must be encoded in UTF-8, otherwise an error is returned
// entities - the entities of the text; offset and length of each entity must be in [0, 1000000]
// allow_empty - if true, then a text without non-whitespace characters results in an empty text without entities;
//               otherwise an error is returned for such text
// skip_new_entities - if false, then entities found by find_entities are merged into the entities
// skip_bot_commands - passed to find_entities
// skip_media_timestamps - passed to find_entities; if false and skip_new_entities is true, then only entities
//                         found by find_media_timestamp_entities are merged into the entities
// skip_trim - if true, then leading and trailing whitespaces aren't removed
// ltrim_count - if non-null, receives the number of characters removed from the beginning of the text;
//               it is written only after the text is known to contain a non-whitespace character
// All errors have code 400. The text and the entities can be left partially processed if an error is returned.
Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool allow_empty, bool skip_new_entities,
bool skip_bot_commands, bool skip_media_timestamps, bool skip_trim, int32 *ltrim_count) {
string result;
if (entities.empty()) {
// fast path
if (!clean_input_string(text)) {
return Status::Error(400, "Strings must be encoded in UTF-8");
}
// from now on the cleaned text is in result, and text is moved-from until it is assigned again
result = std::move(text);
} else {
if (!check_utf8(text)) {
return Status::Error(400, "Strings must be encoded in UTF-8");
}
// the bounds also guarantee that offset + length can't overflow int32
for (auto &entity : entities) {
if (entity.offset < 0 || entity.offset > 1000000) {
return Status::Error(400, PSLICE() << "Receive an entity with incorrect offset " << entity.offset);
}
if (entity.length < 0 || entity.length > 1000000) {
return Status::Error(400, PSLICE() << "Receive an entity with incorrect length " << entity.length);
}
}
remove_empty_entities(entities);
fix_entities(entities);
TRY_RESULT_ASSIGN(result, clean_input_string_with_entities(text, entities));
}
// now entities are still sorted by offset and length, but not type,
// because some characters could be deleted and after that some entities begin to share a common end
size_t last_non_whitespace_pos;
int32 last_non_whitespace_utf16_offset;
std::tie(last_non_whitespace_pos, last_non_whitespace_utf16_offset) = remove_invalid_entities(result, entities);
// -1 means that the text contains only spaces and line feeds
if (last_non_whitespace_utf16_offset == -1) {
if (allow_empty) {
text.clear();
entities.clear();
return Status::OK();
}
return Status::Error(400, "Text must be non-empty");
}
// re-fix entities if needed after removal of some characters
// the sort order can be incorrect by type
// some splittable entities may be needed to be concatenated
fix_entities(entities);
if (ltrim_count != nullptr) {
*ltrim_count = 0;
}
if (skip_trim) {
text = std::move(result);
} else {
// rtrim
CHECK(last_non_whitespace_pos < result.size());
result.resize(last_non_whitespace_pos + 1);
// remove entities beginning after the last non-whitespace character
while (!entities.empty() && entities.back().offset > last_non_whitespace_utf16_offset) {
entities.pop_back();
}
// cut entities ending after the last non-whitespace character
bool need_sort = false;
for (auto &entity : entities) {
if (entity.offset + entity.length > last_non_whitespace_utf16_offset + 1) {
entity.length = last_non_whitespace_utf16_offset + 1 - entity.offset;
need_sort = true;
CHECK(entity.length > 0);
}
}
if (need_sort) {
sort_entities(entities);
}
// ltrim
// leading spaces and line feeds are removed only up to the beginning of the first entity;
// the UTF-16 offset of the entity can be used as a byte position, because only one-byte characters are skipped
size_t first_non_whitespaces_pos = 0;
size_t first_entity_begin_pos = entities.empty() ? result.size() : entities[0].offset;
while (first_non_whitespaces_pos < first_entity_begin_pos &&
(result[first_non_whitespaces_pos] == ' ' || result[first_non_whitespaces_pos] == '\n')) {
first_non_whitespaces_pos++;
}
if (first_non_whitespaces_pos > 0) {
auto offset = narrow_cast<int32>(first_non_whitespaces_pos);
if (ltrim_count != nullptr) {
*ltrim_count = offset;
}
text = result.substr(first_non_whitespaces_pos);
// shift all entities by the number of removed characters
for (auto &entity : entities) {
entity.offset -= offset;
CHECK(entity.offset >= 0);
}
} else {
text = std::move(result);
}
}
LOG_CHECK(check_utf8(text)) << text;
if (!allow_empty && is_empty_string(text)) {
return Status::Error(400, "Text must be non-empty");
}
constexpr size_t LENGTH_LIMIT = 35000; // server side limit
if (text.size() > LENGTH_LIMIT) {
// truncate the text to at most LENGTH_LIMIT bytes without cutting a UTF-8 character
size_t new_size = LENGTH_LIMIT;
while (!is_utf8_character_first_code_unit(text[new_size])) {
new_size--;
}
text.resize(new_size);
// remove entities, which don't fit entirely into the truncated text
td::remove_if(entities, [text_utf16_length = text_length(text)](const auto &entity) {
return entity.offset + entity.length > text_utf16_length;
});
}
if (!skip_new_entities) {
merge_new_entities(entities, find_entities(text, skip_bot_commands, skip_media_timestamps));
} else if (!skip_media_timestamps) {
merge_new_entities(entities, find_media_timestamp_entities(text));
}
return Status::OK();
}
// Creates a FormattedText for a message from its text and entities received from the server.
// If the entities can't be fixed, then the text is cleaned (or cleared if it can't be cleaned) and
// the entities are replaced with the ones found in the text by find_entities. The failure is logged,
// unless the message is from an album or has a non-zero send_date not exceeding the approximate date of
// the server-side fix.
FormattedText get_message_text(const UserManager *user_manager, string message_text,
                               vector<tl_object_ptr<telegram_api::MessageEntity>> &&server_entities,
                               bool skip_new_entities, bool skip_media_timestamps, int32 send_date, bool from_album,
                               const char *source) {
  auto entities = get_message_entities(user_manager, std::move(server_entities), source);

  // keep the received values for the error message
  auto original_message_text = message_text;
  auto original_entities = entities;

  auto fix_status =
      fix_formatted_text(message_text, entities, true, skip_new_entities, true, skip_media_timestamps, false);
  if (!fix_status.is_error()) {
    return FormattedText{std::move(message_text), std::move(entities)};
  }

  // message entities in media albums can be wrong because of a long time ago fixed server-side bug
  const bool is_sent_after_fix = send_date == 0 || send_date > 1600340000;  // approximate fix date
  if (!from_album && is_sent_after_fix) {
    LOG(ERROR) << "Receive error " << fix_status << " while parsing message text from " << source << " sent at "
               << send_date << " with content \"" << original_message_text << "\" -> \"" << message_text
               << "\" with entities " << format::as_array(original_entities) << " -> " << format::as_array(entities);
  }

  const bool is_cleaned = clean_input_string(message_text);
  if (!is_cleaned) {
    message_text.clear();
  }
  entities = find_entities(message_text, false, skip_media_timestamps);
  return FormattedText{std::move(message_text), std::move(entities)};
}
// Truncates the text with utf8_truncate and fixes the entities accordingly:
// entities beginning after the new end of the text and continuous entities crossing it are removed,
// other entities crossing it are shortened.
void truncate_formatted_text(FormattedText &text, size_t length) {
  const auto truncated_size = utf8_truncate(Slice(text.text), length).size();
  if (truncated_size == text.text.size()) {
    // nothing was cut
    return;
  }
  text.text.resize(truncated_size);

  const auto new_utf16_length = narrow_cast<int32>(utf8_utf16_length(text.text));
  for (auto &entity : text.entities) {
    if (entity.offset + entity.length <= new_utf16_length) {
      // the entity fits into the truncated text
      continue;
    }
    const bool is_dropped = entity.offset >= new_utf16_length || is_continuous_entity(entity.type);
    // dropped entities are marked with zero length; the others are truncated
    entity.length = is_dropped ? 0 : new_utf16_length - entity.offset;
  }
  remove_empty_entities(text.entities);
}
// Creates a FormattedText from a td_api::formattedText object.
// A null text results in an empty FormattedText if allow_empty is true and in an error with code 400 otherwise.
// If the option "always_parse_markdown" is set, the text is additionally processed with parse_markdown_v3.
// Entities, which can't be used in the dialog, are removed by remove_unallowed_entities.
Result<FormattedText> get_formatted_text(const Td *td, DialogId dialog_id,
                                         td_api::object_ptr<td_api::formattedText> &&text, bool is_bot,
                                         bool allow_empty, bool skip_media_timestamps, bool skip_trim,
                                         int32 *ltrim_count) {
  if (text == nullptr) {
    if (!allow_empty) {
      return Status::Error(400, "Text must be non-empty");
    }
    return FormattedText();
  }

  auto user_manager = td->user_manager_.get();
  TRY_RESULT(entities, get_message_entities(user_manager, std::move(text->entities_)));

  const bool need_skip_bot_commands = need_always_skip_bot_commands(user_manager, dialog_id, is_bot);
  const bool parse_markdown = td->option_manager_->get_option_boolean("always_parse_markdown");
  const bool skip_new_entities = is_bot && td->option_manager_->get_option_integer("session_count") > 1;
  const bool skip_timestamps = is_bot || skip_media_timestamps;

  // if Markdown is to be parsed, new entities are searched only after the parsing
  TRY_STATUS(fix_formatted_text(text->text_, entities, allow_empty, skip_new_entities || parse_markdown,
                                skip_new_entities || need_skip_bot_commands, skip_timestamps || parse_markdown,
                                skip_trim, ltrim_count));

  FormattedText result{std::move(text->text_), std::move(entities)};
  if (parse_markdown) {
    result = parse_markdown_v3(std::move(result));
    fix_formatted_text(result.text, result.entities, allow_empty, false, need_skip_bot_commands, skip_timestamps,
                       skip_trim, nullptr)
        .ensure();
  }
  remove_unallowed_entities(td, result, dialog_id);
  return std::move(result);
}
// Adds user identifiers of all entities of the text to the dependencies; does nothing for a null text.
void add_formatted_text_dependencies(Dependencies &dependencies, const FormattedText *text) {
  if (text != nullptr) {
    for (const auto &entity : text->entities) {
      dependencies.add(entity.user_id);
    }
  }
}
bool has_media_timestamps(const FormattedText *text, int32 min_media_timestamp, int32 max_media_timestamp) {
if (text == nullptr) {
return false;
}
for (auto &entity : text->entities) {
if (entity.type == MessageEntity::Type::MediaTimestamp && min_media_timestamp <= entity.media_timestamp &&
entity.media_timestamp <= max_media_timestamp) {
return true;
}
}
return false;
}
bool has_bot_commands(const FormattedText *text) {
if (text == nullptr) {
return false;
}
for (auto &entity : text->entities) {
if (entity.type == MessageEntity::Type::BotCommand) {
return true;
}
}
return false;
}
// Returns true if bot commands must never be searched for in texts sent to the dialog:
// the dialog is invalid, or the current user isn't a bot and the dialog is a chat with the replies bot,
// with a non-bot user, or a secret chat with an unknown or non-bot user.
bool need_always_skip_bot_commands(const UserManager *user_manager, DialogId dialog_id, bool is_bot) {
  if (!dialog_id.is_valid()) {
    return true;
  }
  if (is_bot) {
    return false;
  }

  switch (dialog_id.get_type()) {
    case DialogType::User: {
      auto peer_user_id = dialog_id.get_user_id();
      if (peer_user_id == UserManager::get_replies_bot_user_id()) {
        return true;
      }
      return !user_manager->is_user_bot(peer_user_id);
    }
    case DialogType::SecretChat: {
      auto peer_user_id = user_manager->get_secret_chat_user_id(dialog_id.get_secret_chat_id());
      if (!peer_user_id.is_valid()) {
        return true;
      }
      return !user_manager->is_user_bot(peer_user_id);
    }
    case DialogType::Chat:
    case DialogType::Channel:
    case DialogType::None:
      return false;
    default:
      UNREACHABLE();
      return false;
  }
}
// Converts user entities to telegram_api objects to be sent to the server.
// Entities, which aren't user entities, are skipped. All custom emoji entities are added; the number of other
// entities is limited by MAX_USER_ENTITY_COUNT, and non-splittable entities are counted first.
// Splittable entities are merged by split_entities and added after all other entities.
// user_manager must be non-null if there is a MentionName entity within the limit.
vector<tl_object_ptr<telegram_api::MessageEntity>> get_input_message_entities(const UserManager *user_manager,
                                                                              const vector<MessageEntity> &entities,
                                                                              const char *source) {
  constexpr size_t MAX_USER_ENTITY_COUNT = 100;  // server-side limit
  size_t counted_entity_count = 0;

  vector<tl_object_ptr<telegram_api::MessageEntity>> input_entities;
  vector<MessageEntity> formatting_entities;
  for (const auto &entity : entities) {
    const auto type = entity.type;
    if (!is_user_entity(type)) {
      continue;
    }
    if (is_splittable_entity(type)) {
      // will be processed after all other entities
      formatting_entities.push_back(entity);
      continue;
    }
    if (type == MessageEntity::Type::CustomEmoji) {
      // custom emoji aren't counted towards the limit
      input_entities.push_back(make_tl_object<telegram_api::messageEntityCustomEmoji>(
          entity.offset, entity.length, entity.custom_emoji_id.get()));
      continue;
    }
    if (counted_entity_count >= MAX_USER_ENTITY_COUNT) {
      continue;
    }
    counted_entity_count++;

    switch (type) {
      case MessageEntity::Type::BlockQuote:
        input_entities.push_back(
            make_tl_object<telegram_api::messageEntityBlockquote>(0, false /*ignored*/, entity.offset, entity.length));
        break;
      case MessageEntity::Type::ExpandableBlockQuote:
        input_entities.push_back(make_tl_object<telegram_api::messageEntityBlockquote>(
            telegram_api::messageEntityBlockquote::COLLAPSED_MASK, false /*ignored*/, entity.offset, entity.length));
        break;
      case MessageEntity::Type::Code:
        input_entities.push_back(make_tl_object<telegram_api::messageEntityCode>(entity.offset, entity.length));
        break;
      case MessageEntity::Type::Pre:
        input_entities.push_back(
            make_tl_object<telegram_api::messageEntityPre>(entity.offset, entity.length, string()));
        break;
      case MessageEntity::Type::PreCode:
        input_entities.push_back(
            make_tl_object<telegram_api::messageEntityPre>(entity.offset, entity.length, entity.argument));
        break;
      case MessageEntity::Type::TextUrl:
        input_entities.push_back(
            make_tl_object<telegram_api::messageEntityTextUrl>(entity.offset, entity.length, entity.argument));
        break;
      case MessageEntity::Type::MentionName: {
        CHECK(user_manager != nullptr);
        auto input_user = user_manager->get_input_user_force(entity.user_id);
        input_entities.push_back(make_tl_object<telegram_api::inputMessageEntityMentionName>(
            entity.offset, entity.length, std::move(input_user)));
        break;
      }
      default:
        UNREACHABLE();
    }
  }

  // merge intersecting and adjacent splittable entities of the same type
  split_entities(formatting_entities, vector<MessageEntity>());
  for (const auto &entity : formatting_entities) {
    if (counted_entity_count >= MAX_USER_ENTITY_COUNT) {
      break;
    }
    counted_entity_count++;

    switch (entity.type) {
      case MessageEntity::Type::Bold:
        input_entities.push_back(make_tl_object<telegram_api::messageEntityBold>(entity.offset, entity.length));
        break;
      case MessageEntity::Type::Italic:
        input_entities.push_back(make_tl_object<telegram_api::messageEntityItalic>(entity.offset, entity.length));
        break;
      case MessageEntity::Type::Underline:
        input_entities.push_back(make_tl_object<telegram_api::messageEntityUnderline>(entity.offset, entity.length));
        break;
      case MessageEntity::Type::Strikethrough:
        input_entities.push_back(make_tl_object<telegram_api::messageEntityStrike>(entity.offset, entity.length));
        break;
      case MessageEntity::Type::Spoiler:
        input_entities.push_back(make_tl_object<telegram_api::messageEntitySpoiler>(entity.offset, entity.length));
        break;
      default:
        UNREACHABLE();
    }
  }

  return input_entities;
}
// Converts entities of the text to telegram_api objects;
// returns an empty list if the text is null or has no entities.
vector<tl_object_ptr<telegram_api::MessageEntity>> get_input_message_entities(const UserManager *user_manager,
                                                                              const FormattedText *text,
                                                                              const char *source) {
  if (text == nullptr || text->entities.empty()) {
    return {};
  }
  return get_input_message_entities(user_manager, text->entities, source);
}
void remove_premium_custom_emoji_entities(const Td *td, vector<MessageEntity> &entities, bool remove_unknown) {
td::remove_if(entities, [&](const MessageEntity &entity) {
return entity.type == MessageEntity::Type::CustomEmoji &&
td->stickers_manager_->is_premium_custom_emoji(entity.custom_emoji_id, remove_unknown);
});
}
// Removes entities, which can't be used in the dialog:
// entities not supported by the layer of a secret chat, and premium custom emoji
// if they can't be used in the dialog.
void remove_unallowed_entities(const Td *td, FormattedText &text, DialogId dialog_id) {
  auto &entities = text.entities;
  if (entities.empty()) {
    return;
  }

  if (dialog_id.get_type() == DialogType::SecretChat) {
    auto layer = td->user_manager_->get_secret_chat_layer(dialog_id.get_secret_chat_id());
    const bool lacks_new_entities = layer < static_cast<int32>(SecretChatLayer::NewEntities);
    const bool lacks_spoiler_and_custom_emoji =
        layer < static_cast<int32>(SecretChatLayer::SpoilerAndCustomEmojiEntities);

    td::remove_if(entities, [lacks_new_entities, lacks_spoiler_and_custom_emoji](const MessageEntity &entity) -> bool {
      const auto type = entity.type;
      if (type == MessageEntity::Type::Underline || type == MessageEntity::Type::Strikethrough ||
          type == MessageEntity::Type::BlockQuote || type == MessageEntity::Type::ExpandableBlockQuote) {
        return lacks_new_entities;
      }
      if (type == MessageEntity::Type::Spoiler || type == MessageEntity::Type::CustomEmoji) {
        return lacks_spoiler_and_custom_emoji;
      }
      return false;
    });

    if (lacks_new_entities) {
      sort_entities(entities);
      remove_intersecting_entities(entities);
    }
  }

  const bool can_use_premium_custom_emoji = td->dialog_manager_->can_use_premium_custom_emoji_in_dialog(dialog_id);
  if (!can_use_premium_custom_emoji) {
    remove_premium_custom_emoji_entities(td, entities, true);
  }
}
} // namespace td