Use entity_offset instead of utf16_entity_length.

GitOrigin-RevId: 8e02eac54c534eb33b37ef2b2f345e9c0781164a
This commit is contained in:
levlam 2019-09-25 02:05:42 +03:00
parent 38d5be1ee9
commit 3006357f7c

View File

@ -1339,11 +1339,11 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
}
}
int32 utf16_entity_length = 0;
int32 entity_offset = utf16_offset;
while (i < size && (text[i] != end_character || (is_pre && !(text[i + 1] == '`' && text[i + 2] == '`')))) {
auto cur_ch = static_cast<unsigned char>(text[i]);
if (is_utf8_character_first_code_unit(cur_ch)) {
utf16_entity_length += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogate pair
utf16_offset += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
result.push_back(text[i++]);
}
@ -1351,13 +1351,14 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
return Status::Error(400, PSLICE() << "Can't find end of the entity starting at byte offset " << begin_pos);
}
if (utf16_entity_length > 0) {
if (entity_offset != utf16_offset) {
auto entity_length = utf16_offset - entity_offset;
switch (c) {
case '_':
entities.emplace_back(MessageEntity::Type::Italic, utf16_offset, utf16_entity_length);
entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
break;
case '*':
entities.emplace_back(MessageEntity::Type::Bold, utf16_offset, utf16_entity_length);
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
break;
case '[': {
string url;
@ -1372,12 +1373,11 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
}
auto user_id = get_link_user_id(url);
if (user_id.is_valid()) {
entities.emplace_back(utf16_offset, utf16_entity_length, user_id);
entities.emplace_back(entity_offset, entity_length, user_id);
} else {
auto r_url = check_url(url);
if (r_url.is_ok()) {
entities.emplace_back(MessageEntity::Type::TextUrl, utf16_offset, utf16_entity_length,
r_url.move_as_ok());
entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok());
}
}
break;
@ -1385,18 +1385,17 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
case '`':
if (is_pre) {
if (language.empty()) {
entities.emplace_back(MessageEntity::Type::Pre, utf16_offset, utf16_entity_length);
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
} else {
entities.emplace_back(MessageEntity::Type::PreCode, utf16_offset, utf16_entity_length, language);
entities.emplace_back(MessageEntity::Type::PreCode, entity_offset, entity_length, language);
}
} else {
entities.emplace_back(MessageEntity::Type::Code, utf16_offset, utf16_entity_length);
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
}
break;
default:
UNREACHABLE();
}
utf16_offset += utf16_entity_length;
}
if (is_pre) {
i += 2;
@ -1572,20 +1571,20 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
}
i++;
int32 utf16_entity_length = 0;
int32 entity_offset = utf16_offset;
size_t entity_begin_pos = result.size();
while (text[i] != 0 && text[i] != '<') {
auto cur_ch = static_cast<unsigned char>(text[i]);
if (cur_ch == '&') {
auto ch = decode_html_entity(text, i);
if (ch != 0) {
utf16_entity_length += 1 + (ch > 0xffff);
utf16_offset += 1 + (ch > 0xffff);
append_utf8_character(result, ch);
continue;
}
}
if (is_utf8_character_first_code_unit(cur_ch)) {
utf16_entity_length += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogate pair
utf16_offset += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
result.push_back(text[i++]);
}
@ -1613,30 +1612,30 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
<< ", expected \"</" << tag_name << ">\", found\"</" << end_tag_name << ">\"");
}
if (utf16_entity_length > 0) {
if (utf16_offset > entity_offset) {
auto entity_length = utf16_offset - entity_offset;
if (tag_name == "i" || tag_name == "em") {
entities.emplace_back(MessageEntity::Type::Italic, utf16_offset, utf16_entity_length);
entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
} else if (tag_name == "b" || tag_name == "strong") {
entities.emplace_back(MessageEntity::Type::Bold, utf16_offset, utf16_entity_length);
entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
} else if (tag_name == "a") {
if (url.empty()) {
url = result.substr(entity_begin_pos);
}
auto user_id = get_link_user_id(url);
if (user_id.is_valid()) {
entities.emplace_back(utf16_offset, utf16_entity_length, user_id);
entities.emplace_back(entity_offset, entity_length, user_id);
} else {
auto r_url = check_url(url);
if (r_url.is_ok()) {
entities.emplace_back(MessageEntity::Type::TextUrl, utf16_offset, utf16_entity_length, r_url.move_as_ok());
entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok());
}
}
} else if (tag_name == "pre") {
entities.emplace_back(MessageEntity::Type::Pre, utf16_offset, utf16_entity_length);
entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
} else if (tag_name == "code") {
entities.emplace_back(MessageEntity::Type::Code, utf16_offset, utf16_entity_length);
entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
}
utf16_offset += utf16_entity_length;
}
}
return entities;