Support nested entities in parse_html.
GitOrigin-RevId: c8a00262f5a8739d09b45ce710a5c7b920f2bfd4
This commit is contained in:
parent
3006357f7c
commit
b1d1ea2e6c
@ -24,7 +24,7 @@
|
|||||||
namespace td {
|
namespace td {
|
||||||
|
|
||||||
int MessageEntity::get_type_priority(Type type) {
|
int MessageEntity::get_type_priority(Type type) {
|
||||||
static const int types[] = {5, 5, 5, 5, 5, 9, 9, 2, 1, 1, 5, 5, 5, 5, 9, 9, 0};
|
static const int types[] = {50, 50, 50, 50, 50, 90, 91, 20, 11, 10, 49, 49, 50, 50, 92, 93, 0};
|
||||||
return types[static_cast<int32>(type)];
|
return types[static_cast<int32>(type)];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1461,6 +1461,22 @@ static uint32 decode_html_entity(Slice text, size_t &pos) {
|
|||||||
static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
||||||
vector<MessageEntity> entities;
|
vector<MessageEntity> entities;
|
||||||
int32 utf16_offset = 0;
|
int32 utf16_offset = 0;
|
||||||
|
|
||||||
|
struct EntityInfo {
|
||||||
|
string tag_name;
|
||||||
|
string url;
|
||||||
|
int32 entity_offset;
|
||||||
|
size_t entity_begin_pos;
|
||||||
|
|
||||||
|
EntityInfo(string tag_name, string url, int32 entity_offset, size_t entity_begin_pos)
|
||||||
|
: tag_name(std::move(tag_name))
|
||||||
|
, url(std::move(url))
|
||||||
|
, entity_offset(entity_offset)
|
||||||
|
, entity_begin_pos(entity_begin_pos) {
|
||||||
|
}
|
||||||
|
};
|
||||||
|
std::vector<EntityInfo> nested_entities;
|
||||||
|
|
||||||
for (size_t i = 0; i < text.size(); i++) {
|
for (size_t i = 0; i < text.size(); i++) {
|
||||||
auto c = static_cast<unsigned char>(text[i]);
|
auto c = static_cast<unsigned char>(text[i]);
|
||||||
if (c == '&') {
|
if (c == '&') {
|
||||||
@ -1480,164 +1496,158 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// we are at begin of the entity
|
auto begin_pos = i++;
|
||||||
size_t begin_pos = i++;
|
if (text[i] != '/') {
|
||||||
if (text[i] == '/') {
|
// begin of an entity
|
||||||
return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos);
|
while (!is_space(text[i]) && text[i] != '>') {
|
||||||
}
|
|
||||||
while (!is_space(text[i]) && text[i] != '>') {
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
if (text[i] == 0) {
|
|
||||||
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1));
|
|
||||||
if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" &&
|
|
||||||
tag_name != "pre" && tag_name != "code") {
|
|
||||||
return Status::Error(400,
|
|
||||||
PSLICE() << "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
string url;
|
|
||||||
// string language; TODO PreCode support
|
|
||||||
while (text[i] != '>') {
|
|
||||||
while (text[i] != 0 && is_space(text[i])) {
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
if (text[i] == '>') {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
auto attribute_begin_pos = i;
|
|
||||||
while (!is_space(text[i]) && text[i] != '=') {
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos);
|
|
||||||
if (attribute_name.empty()) {
|
|
||||||
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \""
|
|
||||||
<< tag_name << "\" at byte offset " << begin_pos);
|
|
||||||
}
|
|
||||||
while (text[i] != 0 && is_space(text[i])) {
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
if (text[i] != '=') {
|
|
||||||
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of attribute of the tag \""
|
|
||||||
<< tag_name << "\" at byte offset " << begin_pos);
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
while (text[i] != 0 && is_space(text[i])) {
|
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
if (text[i] == 0) {
|
if (text[i] == 0) {
|
||||||
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
string attribute_value;
|
string tag_name = to_lower(text.substr(begin_pos + 1, i - begin_pos - 1));
|
||||||
if (text[i] != '\'' && text[i] != '"') {
|
if (tag_name != "em" && tag_name != "strong" && tag_name != "a" && tag_name != "b" && tag_name != "i" &&
|
||||||
// A name token (a sequence of letters, digits, periods, or hyphens). Name tokens are not case sensitive.
|
tag_name != "pre" && tag_name != "code") {
|
||||||
auto token_begin_pos = i;
|
return Status::Error(400, PSLICE()
|
||||||
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
|
<< "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
string url;
|
||||||
|
// string language; TODO PreCode support
|
||||||
|
while (text[i] != '>') {
|
||||||
|
while (text[i] != 0 && is_space(text[i])) {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
attribute_value = to_lower(text.substr(token_begin_pos, i - token_begin_pos));
|
if (text[i] == '>') {
|
||||||
|
break;
|
||||||
if (!is_space(text[i]) && text[i] != '>') {
|
|
||||||
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
|
|
||||||
}
|
}
|
||||||
} else {
|
auto attribute_begin_pos = i;
|
||||||
// A string literal
|
while (!is_space(text[i]) && text[i] != '=') {
|
||||||
char end_character = text[i++];
|
i++;
|
||||||
while (text[i] != end_character && text[i] != 0) {
|
}
|
||||||
if (text[i] == '&') {
|
Slice attribute_name = text.substr(attribute_begin_pos, i - attribute_begin_pos);
|
||||||
auto ch = decode_html_entity(text, i);
|
if (attribute_name.empty()) {
|
||||||
if (ch != 0) {
|
return Status::Error(
|
||||||
append_utf8_character(attribute_value, ch);
|
400, PSLICE() << "Empty attribute name in the tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||||
continue;
|
}
|
||||||
|
while (text[i] != 0 && is_space(text[i])) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
if (text[i] != '=') {
|
||||||
|
return Status::Error(400, PSLICE() << "Expected equal sign in declaration of an attribute of the tag \""
|
||||||
|
<< tag_name << "\" at byte offset " << begin_pos);
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
while (text[i] != 0 && is_space(text[i])) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
if (text[i] == 0) {
|
||||||
|
return Status::Error(400, PSLICE()
|
||||||
|
<< "Unclosed start tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
string attribute_value;
|
||||||
|
if (text[i] != '\'' && text[i] != '"') {
|
||||||
|
// A name token (a sequence of letters, digits, periods, or hyphens). Name tokens are not case sensitive.
|
||||||
|
auto token_begin_pos = i;
|
||||||
|
while (is_alnum(text[i]) || text[i] == '.' || text[i] == '-') {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
attribute_value = to_lower(text.substr(token_begin_pos, i - token_begin_pos));
|
||||||
|
|
||||||
|
if (!is_space(text[i]) && text[i] != '>') {
|
||||||
|
return Status::Error(400, PSLICE() << "Unexpected end of name token at byte offset " << token_begin_pos);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// A string literal
|
||||||
|
char end_character = text[i++];
|
||||||
|
while (text[i] != end_character && text[i] != 0) {
|
||||||
|
if (text[i] == '&') {
|
||||||
|
auto ch = decode_html_entity(text, i);
|
||||||
|
if (ch != 0) {
|
||||||
|
append_utf8_character(attribute_value, ch);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
attribute_value.push_back(text[i++]);
|
||||||
|
}
|
||||||
|
if (text[i] == end_character) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (text[i] == 0) {
|
||||||
|
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tag_name == "a" && attribute_name == Slice("href")) {
|
||||||
|
url = std::move(attribute_value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nested_entities.emplace_back(std::move(tag_name), std::move(url), utf16_offset, result.size());
|
||||||
|
} else {
|
||||||
|
// end of an entity
|
||||||
|
if (nested_entities.empty()) {
|
||||||
|
return Status::Error(400, PSLICE() << "Unexpected end tag at byte offset " << begin_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!is_space(text[i]) && text[i] != '>') {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
Slice end_tag_name = text.substr(begin_pos + 2, i - begin_pos - 2);
|
||||||
|
while (is_space(text[i]) && text[i] != 0) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
if (text[i] != '>') {
|
||||||
|
return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << begin_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
string tag_name = std::move(nested_entities.back().tag_name);
|
||||||
|
if (!end_tag_name.empty() && end_tag_name != tag_name) {
|
||||||
|
return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << begin_pos << ", expected \"</"
|
||||||
|
<< tag_name << ">\", found \"</" << end_tag_name << ">\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (utf16_offset > nested_entities.back().entity_offset) {
|
||||||
|
auto entity_offset = nested_entities.back().entity_offset;
|
||||||
|
auto entity_length = utf16_offset - entity_offset;
|
||||||
|
if (tag_name == "i" || tag_name == "em") {
|
||||||
|
entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
|
||||||
|
} else if (tag_name == "b" || tag_name == "strong") {
|
||||||
|
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
|
||||||
|
} else if (tag_name == "a") {
|
||||||
|
auto url = std::move(nested_entities.back().url);
|
||||||
|
if (url.empty()) {
|
||||||
|
url = result.substr(nested_entities.back().entity_begin_pos);
|
||||||
|
}
|
||||||
|
auto user_id = get_link_user_id(url);
|
||||||
|
if (user_id.is_valid()) {
|
||||||
|
entities.emplace_back(entity_offset, entity_length, user_id);
|
||||||
|
} else {
|
||||||
|
auto r_url = check_url(url);
|
||||||
|
if (r_url.is_ok()) {
|
||||||
|
entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
attribute_value.push_back(text[i++]);
|
} else if (tag_name == "pre") {
|
||||||
}
|
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
|
||||||
if (text[i] == end_character) {
|
} else if (tag_name == "code") {
|
||||||
i++;
|
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
|
||||||
}
|
|
||||||
}
|
|
||||||
if (text[i] == 0) {
|
|
||||||
return Status::Error(400, PSLICE() << "Unclosed start tag at byte offset " << begin_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tag_name == "a" && attribute_name == Slice("href")) {
|
|
||||||
url = std::move(attribute_value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
|
|
||||||
int32 entity_offset = utf16_offset;
|
|
||||||
size_t entity_begin_pos = result.size();
|
|
||||||
while (text[i] != 0 && text[i] != '<') {
|
|
||||||
auto cur_ch = static_cast<unsigned char>(text[i]);
|
|
||||||
if (cur_ch == '&') {
|
|
||||||
auto ch = decode_html_entity(text, i);
|
|
||||||
if (ch != 0) {
|
|
||||||
utf16_offset += 1 + (ch > 0xffff);
|
|
||||||
append_utf8_character(result, ch);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (is_utf8_character_first_code_unit(cur_ch)) {
|
|
||||||
utf16_offset += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogaite pair
|
|
||||||
}
|
|
||||||
result.push_back(text[i++]);
|
|
||||||
}
|
|
||||||
if (text[i] == 0) {
|
|
||||||
return Status::Error(400,
|
|
||||||
PSLICE() << "Can't find end tag corresponding to start tag at byte offset " << begin_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto end_tag_begin_pos = i++;
|
|
||||||
if (text[i] != '/') {
|
|
||||||
return Status::Error(400, PSLICE() << "Expected end tag at byte offset " << end_tag_begin_pos);
|
|
||||||
}
|
|
||||||
while (!is_space(text[i]) && text[i] != '>') {
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
Slice end_tag_name = text.substr(end_tag_begin_pos + 2, i - end_tag_begin_pos - 2);
|
|
||||||
while (is_space(text[i]) && text[i] != 0) {
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
if (text[i] != '>') {
|
|
||||||
return Status::Error(400, PSLICE() << "Unclosed end tag at byte offset " << end_tag_begin_pos);
|
|
||||||
}
|
|
||||||
if (!end_tag_name.empty() && end_tag_name != tag_name) {
|
|
||||||
return Status::Error(400, PSLICE() << "Unmatched end tag at byte offset " << end_tag_begin_pos
|
|
||||||
<< ", expected \"</" << tag_name << ">\", found\"</" << end_tag_name << ">\"");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (utf16_offset > entity_offset) {
|
|
||||||
auto entity_length = utf16_offset - entity_offset;
|
|
||||||
if (tag_name == "i" || tag_name == "em") {
|
|
||||||
entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
|
|
||||||
} else if (tag_name == "b" || tag_name == "strong") {
|
|
||||||
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
|
|
||||||
} else if (tag_name == "a") {
|
|
||||||
if (url.empty()) {
|
|
||||||
url = result.substr(entity_begin_pos);
|
|
||||||
}
|
|
||||||
auto user_id = get_link_user_id(url);
|
|
||||||
if (user_id.is_valid()) {
|
|
||||||
entities.emplace_back(entity_offset, entity_length, user_id);
|
|
||||||
} else {
|
} else {
|
||||||
auto r_url = check_url(url);
|
UNREACHABLE();
|
||||||
if (r_url.is_ok()) {
|
|
||||||
entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else if (tag_name == "pre") {
|
|
||||||
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
|
|
||||||
} else if (tag_name == "code") {
|
|
||||||
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
|
|
||||||
}
|
}
|
||||||
|
nested_entities.pop_back();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!nested_entities.empty()) {
|
||||||
|
return Status::Error(
|
||||||
|
400, PSLICE() << "Can't find end tag corresponding to start tag " << nested_entities.back().tag_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::sort(entities.begin(), entities.end());
|
||||||
|
|
||||||
return entities;
|
return entities;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
REGISTER_TESTS(message_entities);
|
REGISTER_TESTS(message_entities);
|
||||||
|
|
||||||
static void check_mention(td::string str, td::vector<td::string> expected) {
|
static void check_mention(const td::string &str, const td::vector<td::string> &expected) {
|
||||||
auto result_slice = td::find_mentions(str);
|
auto result_slice = td::find_mentions(str);
|
||||||
td::vector<td::string> result;
|
td::vector<td::string> result;
|
||||||
for (auto &it : result_slice) {
|
for (auto &it : result_slice) {
|
||||||
@ -44,7 +44,7 @@ TEST(MessageEntities, mention) {
|
|||||||
{"@gif", "@wiki", "@vid", "@bing", "@pic", "@bold", "@imdb", "@coub", "@like", "@vote", "@bingg"});
|
{"@gif", "@wiki", "@vid", "@bing", "@pic", "@bold", "@imdb", "@coub", "@like", "@vote", "@bingg"});
|
||||||
};
|
};
|
||||||
|
|
||||||
static void check_bot_command(td::string str, td::vector<td::string> expected) {
|
static void check_bot_command(const td::string &str, const td::vector<td::string> &expected) {
|
||||||
auto result_slice = td::find_bot_commands(str);
|
auto result_slice = td::find_bot_commands(str);
|
||||||
td::vector<td::string> result;
|
td::vector<td::string> result;
|
||||||
for (auto &it : result_slice) {
|
for (auto &it : result_slice) {
|
||||||
@ -68,7 +68,7 @@ TEST(MessageEntities, bot_command) {
|
|||||||
check_bot_command("/test/", {});
|
check_bot_command("/test/", {});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_hashtag(td::string str, td::vector<td::string> expected) {
|
static void check_hashtag(const td::string &str, const td::vector<td::string> &expected) {
|
||||||
auto result_slice = td::find_hashtags(str);
|
auto result_slice = td::find_hashtags(str);
|
||||||
td::vector<td::string> result;
|
td::vector<td::string> result;
|
||||||
for (auto &it : result_slice) {
|
for (auto &it : result_slice) {
|
||||||
@ -109,7 +109,7 @@ TEST(MessageEntities, hashtag) {
|
|||||||
check_hashtag(u8"#a\u2122", {"#a"});
|
check_hashtag(u8"#a\u2122", {"#a"});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_cashtag(td::string str, td::vector<td::string> expected) {
|
static void check_cashtag(const td::string &str, const td::vector<td::string> &expected) {
|
||||||
auto result_slice = td::find_cashtags(str);
|
auto result_slice = td::find_cashtags(str);
|
||||||
td::vector<td::string> result;
|
td::vector<td::string> result;
|
||||||
for (auto &it : result_slice) {
|
for (auto &it : result_slice) {
|
||||||
@ -161,7 +161,7 @@ TEST(MessageEntities, cashtag) {
|
|||||||
check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
|
check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_is_email_address(td::string str, bool expected) {
|
static void check_is_email_address(const td::string &str, bool expected) {
|
||||||
bool result = td::is_email_address(str);
|
bool result = td::is_email_address(str);
|
||||||
LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";
|
LOG_IF(FATAL, result != expected) << "Expected " << expected << " as result of is_email_address(" << str << ")";
|
||||||
}
|
}
|
||||||
@ -279,7 +279,7 @@ TEST(MessageEntities, is_email_address) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_url(td::string str, td::vector<td::string> expected_urls,
|
static void check_url(const td::string &str, const td::vector<td::string> &expected_urls,
|
||||||
td::vector<td::string> expected_email_addresses = {}) {
|
td::vector<td::string> expected_email_addresses = {}) {
|
||||||
auto result_slice = td::find_urls(str);
|
auto result_slice = td::find_urls(str);
|
||||||
td::vector<td::string> result_urls;
|
td::vector<td::string> result_urls;
|
||||||
@ -530,8 +530,9 @@ TEST(MessageEntities, url) {
|
|||||||
check_url("...πhttp://ab.com/cdefgh-1IJ", {}); // TODO
|
check_url("...πhttp://ab.com/cdefgh-1IJ", {}); // TODO
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities, td::string expected_str,
|
static void check_fix_formatted_text(td::string str, td::vector<td::MessageEntity> entities,
|
||||||
td::vector<td::MessageEntity> expected_entities, bool allow_empty,
|
const td::string &expected_str,
|
||||||
|
const td::vector<td::MessageEntity> &expected_entities, bool allow_empty,
|
||||||
bool skip_new_entities, bool skip_bot_commands, bool for_draft) {
|
bool skip_new_entities, bool skip_bot_commands, bool for_draft) {
|
||||||
ASSERT_TRUE(
|
ASSERT_TRUE(
|
||||||
td::fix_formatted_text(str, entities, allow_empty, skip_new_entities, skip_bot_commands, for_draft).is_ok());
|
td::fix_formatted_text(str, entities, allow_empty, skip_new_entities, skip_bot_commands, for_draft).is_ok());
|
||||||
@ -721,3 +722,87 @@ TEST(MessageEntities, fix_formatted_text) {
|
|||||||
false);
|
false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void check_parse_html(td::string text, const td::string &result, const td::vector<td::MessageEntity> &entities) {
|
||||||
|
auto r_entities = td::parse_html(text);
|
||||||
|
ASSERT_TRUE(r_entities.is_ok());
|
||||||
|
ASSERT_EQ(entities, r_entities.ok());
|
||||||
|
ASSERT_STREQ(result, text);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void check_parse_html(td::string text, const td::string &error_message) {
|
||||||
|
auto r_entities = td::parse_html(text);
|
||||||
|
ASSERT_TRUE(r_entities.is_error());
|
||||||
|
ASSERT_EQ(400, r_entities.error().code());
|
||||||
|
ASSERT_STREQ(error_message, r_entities.error().message());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(MessageEntities, parse_html) {
|
||||||
|
td::string invalid_surrogate_pair_error_message =
|
||||||
|
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched surrogate code units";
|
||||||
|
check_parse_html("�", invalid_surrogate_pair_error_message);
|
||||||
|
check_parse_html("�", invalid_surrogate_pair_error_message);
|
||||||
|
check_parse_html("�", invalid_surrogate_pair_error_message);
|
||||||
|
check_parse_html("π π<<abacaba", "Unclosed start tag at byte offset 13");
|
||||||
|
check_parse_html("π π<<abac aba>", "Unsupported start tag \"abac\" at byte offset 13");
|
||||||
|
check_parse_html("π π<<abac>", "Unsupported start tag \"abac\" at byte offset 13");
|
||||||
|
check_parse_html("π π<<i =aba>", "Empty attribute name in the tag \"i\" at byte offset 13");
|
||||||
|
check_parse_html("π π<<i aba>",
|
||||||
|
"Expected equal sign in declaration of an attribute of the tag \"i\" at byte offset 13");
|
||||||
|
check_parse_html("π π<<i aba = ", "Unclosed start tag \"i\" at byte offset 13");
|
||||||
|
check_parse_html("π π<<i aba = 190azAz-.,", "Unexpected end of name token at byte offset 27");
|
||||||
|
check_parse_html("π π<<i aba = \"<>">", "Unclosed start tag at byte offset 13");
|
||||||
|
check_parse_html("π π<<i aba = \'<>">", "Unclosed start tag at byte offset 13");
|
||||||
|
check_parse_html("π π<</", "Unexpected end tag at byte offset 13");
|
||||||
|
check_parse_html("π π<<b></b></", "Unexpected end tag at byte offset 20");
|
||||||
|
check_parse_html("π π<<i>a</i ", "Unclosed end tag at byte offset 17");
|
||||||
|
check_parse_html("π π<<i>a</em >",
|
||||||
|
"Unmatched end tag at byte offset 17, expected \"</i>\", found \"</em>\"");
|
||||||
|
|
||||||
|
check_parse_html("", "", {});
|
||||||
|
check_parse_html("β‘οΈ β‘οΈ", "β‘οΈ β‘οΈ", {});
|
||||||
|
check_parse_html("<>&"«»�", "<>&\"«»�", {});
|
||||||
|
check_parse_html("β‘οΈ β‘οΈ<i>β‘οΈ β‘οΈ</i>", "β‘οΈ β‘οΈβ‘οΈ β‘οΈ",
|
||||||
|
{{td::MessageEntity::Type::Italic, 5, 5}});
|
||||||
|
check_parse_html("π π<i>π <π</i>", "π ππ <π", {{td::MessageEntity::Type::Italic, 5, 6}});
|
||||||
|
check_parse_html("π π<i>π ><b aba = caba><π</b></i>", "π ππ ><π",
|
||||||
|
{{td::MessageEntity::Type::Italic, 5, 7}, {td::MessageEntity::Type::Bold, 9, 3}});
|
||||||
|
check_parse_html("π π<<i aba = 190azAz-. >a</i>", "π π<a",
|
||||||
|
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||||
|
check_parse_html("π π<<i aba = 190azAz-.>a</i>", "π π<a",
|
||||||
|
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||||
|
check_parse_html("π π<<i aba = \"<>"\">a</i>", "π π<a",
|
||||||
|
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||||
|
check_parse_html("π π<<i aba = '<>"'>a</i>", "π π<a",
|
||||||
|
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||||
|
check_parse_html("π π<<i aba = '<>"'>a</>", "π π<a",
|
||||||
|
{{td::MessageEntity::Type::Italic, 6, 1}});
|
||||||
|
check_parse_html("π π<<i>a</ >", "π π<a", {{td::MessageEntity::Type::Italic, 6, 1}});
|
||||||
|
check_parse_html("π π<<i>a</i >", "π π<a", {{td::MessageEntity::Type::Italic, 6, 1}});
|
||||||
|
check_parse_html("π π<<b></b>", "π π<", {});
|
||||||
|
check_parse_html("<code><i><b> </b></i></code><i><b><code> </code></b></i>", " ",
|
||||||
|
{{td::MessageEntity::Type::Code, 0, 1},
|
||||||
|
{td::MessageEntity::Type::Bold, 0, 1},
|
||||||
|
{td::MessageEntity::Type::Italic, 0, 1},
|
||||||
|
{td::MessageEntity::Type::Code, 1, 1},
|
||||||
|
{td::MessageEntity::Type::Bold, 1, 1},
|
||||||
|
{td::MessageEntity::Type::Italic, 1, 1}});
|
||||||
|
check_parse_html("<i><b> </b> <code> </code></i>", " ",
|
||||||
|
{{td::MessageEntity::Type::Italic, 0, 3},
|
||||||
|
{td::MessageEntity::Type::Bold, 0, 1},
|
||||||
|
{td::MessageEntity::Type::Code, 2, 1}});
|
||||||
|
check_parse_html("<a href=telegram.org> </a>", " ",
|
||||||
|
{{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
|
||||||
|
check_parse_html("<a href =\"telegram.org\" > </a>", " ",
|
||||||
|
{{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
|
||||||
|
check_parse_html("<a href= 'telegram.org' > </a>", " ",
|
||||||
|
{{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/"}});
|
||||||
|
check_parse_html("<a href= 'telegram.org?<' > </a>", " ",
|
||||||
|
{{td::MessageEntity::Type::TextUrl, 0, 1, "http://telegram.org/?<"}});
|
||||||
|
check_parse_html("<a> </a>", " ", {});
|
||||||
|
check_parse_html("<a>telegram.org </a>", "telegram.org ", {});
|
||||||
|
check_parse_html("<a>telegram.org</a>", "telegram.org",
|
||||||
|
{{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}});
|
||||||
|
check_parse_html("<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>", "https://telegram.org/asdsa?asdasdwe#12e3we",
|
||||||
|
{{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}});
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user