Support PreCode in parse_html.
GitOrigin-RevId: 20a3bcb168ecd45f416e7df17247997eacecbc11
This commit is contained in:
parent
5a07029e6b
commit
6114516b40
@ -1662,13 +1662,13 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
|
|
||||||
struct EntityInfo {
|
struct EntityInfo {
|
||||||
string tag_name;
|
string tag_name;
|
||||||
string url;
|
string argument;
|
||||||
int32 entity_offset;
|
int32 entity_offset;
|
||||||
size_t entity_begin_pos;
|
size_t entity_begin_pos;
|
||||||
|
|
||||||
EntityInfo(string tag_name, string url, int32 entity_offset, size_t entity_begin_pos)
|
EntityInfo(string tag_name, string argument, int32 entity_offset, size_t entity_begin_pos)
|
||||||
: tag_name(std::move(tag_name))
|
: tag_name(std::move(tag_name))
|
||||||
, url(std::move(url))
|
, argument(std::move(argument))
|
||||||
, entity_offset(entity_offset)
|
, entity_offset(entity_offset)
|
||||||
, entity_begin_pos(entity_begin_pos) {
|
, entity_begin_pos(entity_begin_pos) {
|
||||||
}
|
}
|
||||||
@ -1711,8 +1711,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
<< "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
<< "Unsupported start tag \"" << tag_name << "\" at byte offset " << begin_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
string url;
|
string argument;
|
||||||
// string language; TODO PreCode support
|
|
||||||
while (text[i] != '>') {
|
while (text[i] != '>') {
|
||||||
while (text[i] != 0 && is_space(text[i])) {
|
while (text[i] != 0 && is_space(text[i])) {
|
||||||
i++;
|
i++;
|
||||||
@ -1779,11 +1778,14 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (tag_name == "a" && attribute_name == Slice("href")) {
|
if (tag_name == "a" && attribute_name == Slice("href")) {
|
||||||
url = std::move(attribute_value);
|
argument = std::move(attribute_value);
|
||||||
|
}
|
||||||
|
if (tag_name == "code" && attribute_name == Slice("class") && begins_with(attribute_value, "language-")) {
|
||||||
|
argument = attribute_value.substr(9);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
nested_entities.emplace_back(std::move(tag_name), std::move(url), utf16_offset, result.size());
|
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result.size());
|
||||||
} else {
|
} else {
|
||||||
// end of an entity
|
// end of an entity
|
||||||
if (nested_entities.empty()) {
|
if (nested_entities.empty()) {
|
||||||
@ -1815,7 +1817,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
} else if (tag_name == "b" || tag_name == "strong") {
|
} else if (tag_name == "b" || tag_name == "strong") {
|
||||||
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
|
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
|
||||||
} else if (tag_name == "a") {
|
} else if (tag_name == "a") {
|
||||||
auto url = std::move(nested_entities.back().url);
|
auto url = std::move(nested_entities.back().argument);
|
||||||
if (url.empty()) {
|
if (url.empty()) {
|
||||||
url = result.substr(nested_entities.back().entity_begin_pos);
|
url = result.substr(nested_entities.back().entity_begin_pos);
|
||||||
}
|
}
|
||||||
@ -1829,9 +1831,23 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (tag_name == "pre") {
|
} else if (tag_name == "pre") {
|
||||||
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
|
if (!entities.empty() && entities.back().type == MessageEntity::Type::Code &&
|
||||||
|
entities.back().offset == entity_offset && entities.back().length == entity_length &&
|
||||||
|
!entities.back().argument.empty()) {
|
||||||
|
entities.back().type = MessageEntity::Type::PreCode;
|
||||||
|
} else {
|
||||||
|
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
|
||||||
|
}
|
||||||
} else if (tag_name == "code") {
|
} else if (tag_name == "code") {
|
||||||
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
|
if (!entities.empty() && entities.back().type == MessageEntity::Type::Pre &&
|
||||||
|
entities.back().offset == entity_offset && entities.back().length == entity_length &&
|
||||||
|
!nested_entities.back().argument.empty()) {
|
||||||
|
entities.back().type = MessageEntity::Type::PreCode;
|
||||||
|
entities.back().argument = std::move(nested_entities.back().argument);
|
||||||
|
} else {
|
||||||
|
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length,
|
||||||
|
nested_entities.back().argument);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
UNREACHABLE();
|
UNREACHABLE();
|
||||||
}
|
}
|
||||||
@ -1844,6 +1860,12 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
|||||||
400, PSLICE() << "Can't find end tag corresponding to start tag " << nested_entities.back().tag_name);
|
400, PSLICE() << "Can't find end tag corresponding to start tag " << nested_entities.back().tag_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (auto &entity : entities) {
|
||||||
|
if (entity.type == MessageEntity::Type::Code && !entity.argument.empty()) {
|
||||||
|
entity.argument.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::sort(entities.begin(), entities.end());
|
std::sort(entities.begin(), entities.end());
|
||||||
|
|
||||||
return entities;
|
return entities;
|
||||||
|
@ -810,6 +810,22 @@ TEST(MessageEntities, parse_html) {
|
|||||||
{{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}});
|
{{td::MessageEntity::Type::TextUrl, 0, 12, "http://telegram.org/"}});
|
||||||
check_parse_html("<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>", "https://telegram.org/asdsa?asdasdwe#12e3we",
|
check_parse_html("<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>", "https://telegram.org/asdsa?asdasdwe#12e3we",
|
||||||
{{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}});
|
{{td::MessageEntity::Type::TextUrl, 0, 42, "https://telegram.org/asdsa?asdasdwe#12e3we"}});
|
||||||
|
check_parse_html("๐ ๐<<pre >๐ ๐<</>", "๐ ๐<๐ ๐<",
|
||||||
|
{{td::MessageEntity::Type::Pre, 6, 6}});
|
||||||
|
check_parse_html("๐ ๐<<code >๐ ๐<</>", "๐ ๐<๐ ๐<",
|
||||||
|
{{td::MessageEntity::Type::Code, 6, 6}});
|
||||||
|
check_parse_html("๐ ๐<<pre><code>๐ ๐<</code></>", "๐ ๐<๐ ๐<",
|
||||||
|
{{td::MessageEntity::Type::Pre, 6, 6}, {td::MessageEntity::Type::Code, 6, 6}});
|
||||||
|
check_parse_html("๐ ๐<<pre><code class=\"language-\">๐ ๐<</code></>", "๐ ๐<๐ ๐<",
|
||||||
|
{{td::MessageEntity::Type::Pre, 6, 6}, {td::MessageEntity::Type::Code, 6, 6}});
|
||||||
|
check_parse_html("๐ ๐<<pre><code class=\"language-fift\">๐ ๐<</></>", "๐ ๐<๐ ๐<",
|
||||||
|
{{td::MessageEntity::Type::PreCode, 6, 6, "fift"}});
|
||||||
|
check_parse_html("๐ ๐<<code class=\"language-fift\"><pre>๐ ๐<</></>", "๐ ๐<๐ ๐<",
|
||||||
|
{{td::MessageEntity::Type::PreCode, 6, 6, "fift"}});
|
||||||
|
check_parse_html("๐ ๐<<pre><code class=\"language-fift\">๐ ๐<</> </>", "๐ ๐<๐ ๐< ",
|
||||||
|
{{td::MessageEntity::Type::Pre, 6, 7}, {td::MessageEntity::Type::Code, 6, 6}});
|
||||||
|
check_parse_html("๐ ๐<<pre> <code class=\"language-fift\">๐ ๐<</></>", "๐ ๐< ๐ ๐<",
|
||||||
|
{{td::MessageEntity::Type::Pre, 6, 7}, {td::MessageEntity::Type::Code, 7, 6}});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void check_parse_markdown(td::string text, const td::string &result,
|
static void check_parse_markdown(td::string text, const td::string &result,
|
||||||
|
Reference in New Issue
Block a user