Support PreCode in parse_html.

GitOrigin-RevId: 20a3bcb168ecd45f416e7df17247997eacecbc11
This commit is contained in:
levlam 2019-10-03 16:39:50 +03:00
parent 5a07029e6b
commit 6114516b40
2 changed files with 48 additions and 10 deletions

View File

@ -1662,13 +1662,13 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
struct EntityInfo { struct EntityInfo {
string tag_name; string tag_name;
string url; string argument;
int32 entity_offset; int32 entity_offset;
size_t entity_begin_pos; size_t entity_begin_pos;
EntityInfo(string tag_name, string url, int32 entity_offset, size_t entity_begin_pos) EntityInfo(string tag_name, string argument, int32 entity_offset, size_t entity_begin_pos)
: tag_name(std::move(tag_name)) : tag_name(std::move(tag_name))
, url(std::move(url)) , argument(std::move(argument))
, entity_offset(entity_offset) , entity_offset(entity_offset)
, entity_begin_pos(entity_begin_pos) { , entity_begin_pos(entity_begin_pos) {
} }
@ -1711,8 +1711,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
<< "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos); << "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
} }
string url; string argument;
// string language; TODO PreCode support
while (text[i] != '>') { while (text[i] != '>') {
while (text[i] != 0 && is_space(text[i])) { while (text[i] != 0 && is_space(text[i])) {
i++; i++;
@ -1779,11 +1778,14 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
} }
if (tag_name == "a" && attribute_name == Slice("href")) { if (tag_name == "a" && attribute_name == Slice("href")) {
url = std::move(attribute_value); argument = std::move(attribute_value);
}
if (tag_name == "code" && attribute_name == Slice("class") && begins_with(attribute_value, "language-")) {
argument = attribute_value.substr(9);
} }
} }
nested_entities.emplace_back(std::move(tag_name), std::move(url), utf16_offset, result.size()); nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result.size());
} else { } else {
// end of an entity // end of an entity
if (nested_entities.empty()) { if (nested_entities.empty()) {
@ -1815,7 +1817,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
} else if (tag_name == "b" || tag_name == "strong") { } else if (tag_name == "b" || tag_name == "strong") {
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length); entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
} else if (tag_name == "a") { } else if (tag_name == "a") {
auto url = std::move(nested_entities.back().url); auto url = std::move(nested_entities.back().argument);
if (url.empty()) { if (url.empty()) {
url = result.substr(nested_entities.back().entity_begin_pos); url = result.substr(nested_entities.back().entity_begin_pos);
} }
@ -1829,9 +1831,23 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
} }
} }
} else if (tag_name == "pre") { } else if (tag_name == "pre") {
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length); if (!entities.empty() && entities.back().type == MessageEntity::Type::Code &&
entities.back().offset == entity_offset && entities.back().length == entity_length &&
!entities.back().argument.empty()) {
entities.back().type = MessageEntity::Type::PreCode;
} else {
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
}
} else if (tag_name == "code") { } else if (tag_name == "code") {
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length); if (!entities.empty() && entities.back().type == MessageEntity::Type::Pre &&
entities.back().offset == entity_offset && entities.back().length == entity_length &&
!nested_entities.back().argument.empty()) {
entities.back().type = MessageEntity::Type::PreCode;
entities.back().argument = std::move(nested_entities.back().argument);
} else {
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length,
nested_entities.back().argument);
}
} else { } else {
UNREACHABLE(); UNREACHABLE();
} }
@ -1844,6 +1860,12 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
400, PSLICE() << "Can't find end tag corresponding to start tag " << nested_entities.back().tag_name); 400, PSLICE() << "Can't find end tag corresponding to start tag " << nested_entities.back().tag_name);
} }
for (auto &entity : entities) {
if (entity.type == MessageEntity::Type::Code && !entity.argument.empty()) {
entity.argument.clear();
}
}
std::sort(entities.begin(), entities.end()); std::sort(entities.begin(), entities.end());
return entities; return entities;

View File

@ -810,6 +810,22 @@ TEST(MessageEntities, parse_html) {
{{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}}); {{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}});
check_parse_html("<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>", "https://telegram.org/asdsa?asdasdwe#12e3we", check_parse_html("<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>", "https://telegram.org/asdsa?asdasdwe#12e3we",
{{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}}); {{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}});
check_parse_html("🏟 🏟&lt;<pre >🏟 🏟&lt;</>", "🏟 🏟<🏟 🏟<",
{{td::MessageEntity::Type::Pre, 6, 6}});
check_parse_html("🏟 🏟&lt;<code >🏟 🏟&lt;</>", "🏟 🏟<🏟 🏟<",
{{td::MessageEntity::Type::Code, 6, 6}});
check_parse_html("🏟 🏟&lt;<pre><code>🏟 🏟&lt;</code></>", "🏟 🏟<🏟 🏟<",
{{td::MessageEntity::Type::Pre, 6, 6}, {td::MessageEntity::Type::Code, 6, 6}});
check_parse_html("🏟 🏟&lt;<pre><code class=\"language-\">🏟 🏟&lt;</code></>", "🏟 🏟<🏟 🏟<",
{{td::MessageEntity::Type::Pre, 6, 6}, {td::MessageEntity::Type::Code, 6, 6}});
check_parse_html("🏟 🏟&lt;<pre><code class=\"language-fift\">🏟 🏟&lt;</></>", "🏟 🏟<🏟 🏟<",
{{td::MessageEntity::Type::PreCode, 6, 6, "fift"}});
check_parse_html("🏟 🏟&lt;<code class=\"language-fift\"><pre>🏟 🏟&lt;</></>", "🏟 🏟<🏟 🏟<",
{{td::MessageEntity::Type::PreCode, 6, 6, "fift"}});
check_parse_html("🏟 🏟&lt;<pre><code class=\"language-fift\">🏟 🏟&lt;</> </>", "🏟 🏟<🏟 🏟< ",
{{td::MessageEntity::Type::Pre, 6, 7}, {td::MessageEntity::Type::Code, 6, 6}});
check_parse_html("🏟 🏟&lt;<pre> <code class=\"language-fift\">🏟 🏟&lt;</></>", "🏟 🏟< 🏟 🏟<",
{{td::MessageEntity::Type::Pre, 6, 7}, {td::MessageEntity::Type::Code, 7, 6}});
} }
static void check_parse_markdown(td::string text, const td::string &result, static void check_parse_markdown(td::string text, const td::string &result,