Replace text in-place in parse_html.
This commit is contained in:
parent
ac0de06b5f
commit
137fd3beb9
|
@ -2991,14 +2991,13 @@ static uint32 decode_html_entity(CSlice text, size_t &pos) {
|
||||||
Result<vector<MessageEntity>> parse_html(string &str) {
|
Result<vector<MessageEntity>> parse_html(string &str) {
|
||||||
auto str_size = str.size();
|
auto str_size = str.size();
|
||||||
const char *text = str.c_str();
|
const char *text = str.c_str();
|
||||||
|
auto result_end = MutableSlice(str).ubegin();
|
||||||
|
const unsigned char *result_begin = result_end;
|
||||||
|
|
||||||
vector<MessageEntity> entities;
|
vector<MessageEntity> entities;
|
||||||
int32 utf16_offset = 0;
|
int32 utf16_offset = 0;
|
||||||
bool need_recheck_utf8 = false;
|
bool need_recheck_utf8 = false;
|
||||||
|
|
||||||
auto buf = StackAllocator::alloc(str_size + 30);
|
|
||||||
StringBuilder new_text(buf.as_slice(), true);
|
|
||||||
|
|
||||||
struct EntityInfo {
|
struct EntityInfo {
|
||||||
string tag_name;
|
string tag_name;
|
||||||
string argument;
|
string argument;
|
||||||
|
@ -3025,7 +3024,8 @@ Result<vector<MessageEntity>> parse_html(string &str) {
|
||||||
// half of a surrogate pair
|
// half of a surrogate pair
|
||||||
need_recheck_utf8 = true;
|
need_recheck_utf8 = true;
|
||||||
}
|
}
|
||||||
append_utf8_character(new_text, ch);
|
result_end = append_utf8_character_unsafe(result_end, ch);
|
||||||
|
CHECK(result_end <= result_begin + i);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3033,7 +3033,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
|
||||||
if (is_utf8_character_first_code_unit(c)) {
|
if (is_utf8_character_first_code_unit(c)) {
|
||||||
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
|
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
|
||||||
}
|
}
|
||||||
new_text.push_back(text[i]);
|
*result_end++ = c;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3139,7 +3139,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
|
||||||
<< "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
|
<< "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, new_text.size());
|
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result_end - result_begin);
|
||||||
} else {
|
} else {
|
||||||
// end of an entity
|
// end of an entity
|
||||||
if (nested_entities.empty()) {
|
if (nested_entities.empty()) {
|
||||||
|
@ -3186,7 +3186,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
|
||||||
} else if (tag_name == "a") {
|
} else if (tag_name == "a") {
|
||||||
auto url = std::move(nested_entities.back().argument);
|
auto url = std::move(nested_entities.back().argument);
|
||||||
if (url.empty()) {
|
if (url.empty()) {
|
||||||
url = new_text.as_cslice().substr(nested_entities.back().entity_begin_pos).str();
|
url = Slice(result_begin + nested_entities.back().entity_begin_pos, result_end).str();
|
||||||
}
|
}
|
||||||
auto user_id = LinkManager::get_link_user_id(url);
|
auto user_id = LinkManager::get_link_user_id(url);
|
||||||
if (user_id.is_valid()) {
|
if (user_id.is_valid()) {
|
||||||
|
@ -3235,12 +3235,12 @@ Result<vector<MessageEntity>> parse_html(string &str) {
|
||||||
|
|
||||||
sort_entities(entities);
|
sort_entities(entities);
|
||||||
|
|
||||||
if (need_recheck_utf8 && !check_utf8(new_text.as_cslice())) {
|
str.resize(static_cast<size_t>(result_end - result_begin));
|
||||||
|
if (need_recheck_utf8 && !check_utf8(str)) {
|
||||||
return Status::Error(400,
|
return Status::Error(400,
|
||||||
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
|
"Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
|
||||||
"surrogate code units");
|
"surrogate code units");
|
||||||
}
|
}
|
||||||
str = new_text.as_cslice().str();
|
|
||||||
return std::move(entities);
|
return std::move(entities);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -82,6 +82,25 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Writes the UTF-8 encoding of `code` into the buffer starting at `ptr` and
/// returns the pointer just past the last byte written (1 to 4 bytes).
/// No bounds check and no validation of `code` is performed here: the caller
/// must guarantee that enough writable bytes are available at `ptr`.
unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code) {
|
||||||

// up to 7 bits: the code is stored as a single byte
if (code <= 0x7f) {
|
||||||

*ptr++ = static_cast<unsigned char>(code);
|
||||||

// up to 11 bits: 0xc0 lead byte followed by one 0x80 continuation byte
} else if (code <= 0x7ff) {
|
||||||

*ptr++ = static_cast<unsigned char>(0xc0 | (code >> 6));
|
||||||

*ptr++ = static_cast<unsigned char>(0x80 | (code & 0x3f));
|
||||||

// up to 16 bits: 0xe0 lead byte followed by two continuation bytes
} else if (code <= 0xffff) {
|
||||||

*ptr++ = static_cast<unsigned char>(0xe0 | (code >> 12));
|
||||||

*ptr++ = static_cast<unsigned char>(0x80 | ((code >> 6) & 0x3f));
|
||||||

*ptr++ = static_cast<unsigned char>(0x80 | (code & 0x3f));
|
||||||

// everything else: 0xf0 lead byte followed by three continuation bytes
} else {
|
||||||

*ptr++ = static_cast<unsigned char>(0xf0 | (code >> 18));
|
||||||

*ptr++ = static_cast<unsigned char>(0x80 | ((code >> 12) & 0x3f));
|
||||||

*ptr++ = static_cast<unsigned char>(0x80 | ((code >> 6) & 0x3f));
|
||||||

*ptr++ = static_cast<unsigned char>(0x80 | (code & 0x3f));
|
||||||

}
|
||||||

// hand the advanced write position back to the caller
return ptr;
|
||||||

}
|
||||||
|
|
||||||
string utf8_to_lower(Slice str) {
|
string utf8_to_lower(Slice str) {
|
||||||
string result;
|
string result;
|
||||||
auto pos = str.ubegin();
|
auto pos = str.ubegin();
|
||||||
|
|
|
@ -33,21 +33,21 @@ size_t utf8_utf16_length(Slice str);
|
||||||
|
|
||||||
/// appends a Unicode character using UTF-8 encoding
|
/// appends a Unicode character using UTF-8 encoding
|
||||||
template <class T>
|
template <class T>
|
||||||
void append_utf8_character(T &str, uint32 ch) {
|
void append_utf8_character(T &str, uint32 code) {
|
||||||
if (ch <= 0x7f) {
|
if (code <= 0x7f) {
|
||||||
str.push_back(static_cast<char>(ch));
|
str.push_back(static_cast<char>(code));
|
||||||
} else if (ch <= 0x7ff) {
|
} else if (code <= 0x7ff) {
|
||||||
str.push_back(static_cast<char>(0xc0 | (ch >> 6))); // implementation-defined
|
str.push_back(static_cast<char>(0xc0 | (code >> 6))); // implementation-defined
|
||||||
str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
|
str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
|
||||||
} else if (ch <= 0xffff) {
|
} else if (code <= 0xffff) {
|
||||||
str.push_back(static_cast<char>(0xe0 | (ch >> 12))); // implementation-defined
|
str.push_back(static_cast<char>(0xe0 | (code >> 12))); // implementation-defined
|
||||||
str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f)));
|
str.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
|
||||||
str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
|
str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
|
||||||
} else {
|
} else {
|
||||||
str.push_back(static_cast<char>(0xf0 | (ch >> 18))); // implementation-defined
|
str.push_back(static_cast<char>(0xf0 | (code >> 18))); // implementation-defined
|
||||||
str.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3f)));
|
str.push_back(static_cast<char>(0x80 | ((code >> 12) & 0x3f)));
|
||||||
str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f)));
|
str.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
|
||||||
str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
|
str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -62,6 +62,9 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
|
||||||
/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
|
/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
|
||||||
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
|
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
|
||||||
|
|
||||||
|
/// writes a Unicode character in UTF-8 encoding at ptr without any bounds checking and returns the pointer past the written bytes
|
||||||
|
unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code);
|
||||||
|
|
||||||
/// truncates UTF-8 string to the given length in Unicode characters
|
/// truncates UTF-8 string to the given length in Unicode characters
|
||||||
template <class T>
|
template <class T>
|
||||||
T utf8_truncate(T str, size_t length) {
|
T utf8_truncate(T str, size_t length) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user