Use entity_offset instead of utf16_entity_length.

GitOrigin-RevId: 8e02eac54c534eb33b37ef2b2f345e9c0781164a
This commit is contained in:
levlam 2019-09-25 02:05:42 +03:00
parent 38d5be1ee9
commit 3006357f7c

View File

@ -1339,11 +1339,11 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
} }
} }
int32 utf16_entity_length = 0; int32 entity_offset = utf16_offset;
while (i < size && (text[i] != end_character || (is_pre && !(text[i + 1] == '`' && text[i + 2] == '`')))) { while (i < size && (text[i] != end_character || (is_pre && !(text[i + 1] == '`' && text[i + 2] == '`')))) {
auto cur_ch = static_cast<unsigned char>(text[i]); auto cur_ch = static_cast<unsigned char>(text[i]);
if (is_utf8_character_first_code_unit(cur_ch)) { if (is_utf8_character_first_code_unit(cur_ch)) {
utf16_entity_length += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogate pair utf16_offset += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogate pair
} }
result.push_back(text[i++]); result.push_back(text[i++]);
} }
@ -1351,13 +1351,14 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
return Status::Error(400, PSLICE() << "Can't find end of the entity starting at byte offset " << begin_pos); return Status::Error(400, PSLICE() << "Can't find end of the entity starting at byte offset " << begin_pos);
} }
if (utf16_entity_length > 0) { if (entity_offset != utf16_offset) {
auto entity_length = utf16_offset - entity_offset;
switch (c) { switch (c) {
case '_': case '_':
entities.emplace_back(MessageEntity::Type::Italic, utf16_offset, utf16_entity_length); entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
break; break;
case '*': case '*':
entities.emplace_back(MessageEntity::Type::Bold, utf16_offset, utf16_entity_length); entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
break; break;
case '[': { case '[': {
string url; string url;
@ -1372,12 +1373,11 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
} }
auto user_id = get_link_user_id(url); auto user_id = get_link_user_id(url);
if (user_id.is_valid()) { if (user_id.is_valid()) {
entities.emplace_back(utf16_offset, utf16_entity_length, user_id); entities.emplace_back(entity_offset, entity_length, user_id);
} else { } else {
auto r_url = check_url(url); auto r_url = check_url(url);
if (r_url.is_ok()) { if (r_url.is_ok()) {
entities.emplace_back(MessageEntity::Type::TextUrl, utf16_offset, utf16_entity_length, entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok());
r_url.move_as_ok());
} }
} }
break; break;
@ -1385,18 +1385,17 @@ Result<vector<MessageEntity>> parse_markdown(string &text) {
case '`': case '`':
if (is_pre) { if (is_pre) {
if (language.empty()) { if (language.empty()) {
entities.emplace_back(MessageEntity::Type::Pre, utf16_offset, utf16_entity_length); entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
} else { } else {
entities.emplace_back(MessageEntity::Type::PreCode, utf16_offset, utf16_entity_length, language); entities.emplace_back(MessageEntity::Type::PreCode, entity_offset, entity_length, language);
} }
} else { } else {
entities.emplace_back(MessageEntity::Type::Code, utf16_offset, utf16_entity_length); entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
} }
break; break;
default: default:
UNREACHABLE(); UNREACHABLE();
} }
utf16_offset += utf16_entity_length;
} }
if (is_pre) { if (is_pre) {
i += 2; i += 2;
@ -1572,20 +1571,20 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
} }
i++; i++;
int32 utf16_entity_length = 0; int32 entity_offset = utf16_offset;
size_t entity_begin_pos = result.size(); size_t entity_begin_pos = result.size();
while (text[i] != 0 && text[i] != '<') { while (text[i] != 0 && text[i] != '<') {
auto cur_ch = static_cast<unsigned char>(text[i]); auto cur_ch = static_cast<unsigned char>(text[i]);
if (cur_ch == '&') { if (cur_ch == '&') {
auto ch = decode_html_entity(text, i); auto ch = decode_html_entity(text, i);
if (ch != 0) { if (ch != 0) {
utf16_entity_length += 1 + (ch > 0xffff); utf16_offset += 1 + (ch > 0xffff);
append_utf8_character(result, ch); append_utf8_character(result, ch);
continue; continue;
} }
} }
if (is_utf8_character_first_code_unit(cur_ch)) { if (is_utf8_character_first_code_unit(cur_ch)) {
utf16_entity_length += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogate pair utf16_offset += 1 + (cur_ch >= 0xf0); // >= 4 bytes in symbol => surrogate pair
} }
result.push_back(text[i++]); result.push_back(text[i++]);
} }
@ -1613,30 +1612,30 @@ static Result<vector<MessageEntity>> do_parse_html(Slice text, string &result) {
<< ", expected \"</" << tag_name << ">\", found\"</" << end_tag_name << ">\""); << ", expected \"</" << tag_name << ">\", found\"</" << end_tag_name << ">\"");
} }
if (utf16_entity_length > 0) { if (utf16_offset > entity_offset) {
auto entity_length = utf16_offset - entity_offset;
if (tag_name == "i" || tag_name == "em") { if (tag_name == "i" || tag_name == "em") {
entities.emplace_back(MessageEntity::Type::Italic, utf16_offset, utf16_entity_length); entities.emplace_back(MessageEntity::Type::Italic, entity_offset, entity_length);
} else if (tag_name == "b" || tag_name == "strong") { } else if (tag_name == "b" || tag_name == "strong") {
entities.emplace_back(MessageEntity::Type::Bold, utf16_offset, utf16_entity_length); entities.emplace_back(MessageEntity::Type::Bold, entity_offset, entity_length);
} else if (tag_name == "a") { } else if (tag_name == "a") {
if (url.empty()) { if (url.empty()) {
url = result.substr(entity_begin_pos); url = result.substr(entity_begin_pos);
} }
auto user_id = get_link_user_id(url); auto user_id = get_link_user_id(url);
if (user_id.is_valid()) { if (user_id.is_valid()) {
entities.emplace_back(utf16_offset, utf16_entity_length, user_id); entities.emplace_back(entity_offset, entity_length, user_id);
} else { } else {
auto r_url = check_url(url); auto r_url = check_url(url);
if (r_url.is_ok()) { if (r_url.is_ok()) {
entities.emplace_back(MessageEntity::Type::TextUrl, utf16_offset, utf16_entity_length, r_url.move_as_ok()); entities.emplace_back(MessageEntity::Type::TextUrl, entity_offset, entity_length, r_url.move_as_ok());
} }
} }
} else if (tag_name == "pre") { } else if (tag_name == "pre") {
entities.emplace_back(MessageEntity::Type::Pre, utf16_offset, utf16_entity_length); entities.emplace_back(MessageEntity::Type::Pre, entity_offset, entity_length);
} else if (tag_name == "code") { } else if (tag_name == "code") {
entities.emplace_back(MessageEntity::Type::Code, utf16_offset, utf16_entity_length); entities.emplace_back(MessageEntity::Type::Code, entity_offset, entity_length);
} }
utf16_offset += utf16_entity_length;
} }
} }
return entities; return entities;