Use StringBuilder to create new string in parse_html.
This commit is contained in:
parent
2ba41ac279
commit
0fb64f97a4
@ -24,6 +24,8 @@
|
||||
#include "td/utils/misc.h"
|
||||
#include "td/utils/Promise.h"
|
||||
#include "td/utils/SliceBuilder.h"
|
||||
#include "td/utils/StackAllocator.h"
|
||||
#include "td/utils/StringBuilder.h"
|
||||
#include "td/utils/unicode.h"
|
||||
#include "td/utils/utf8.h"
|
||||
|
||||
@ -2994,6 +2996,9 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
||||
vector<MessageEntity> entities;
|
||||
int32 utf16_offset = 0;
|
||||
|
||||
auto buf = StackAllocator::alloc(text.size() + 30);
|
||||
StringBuilder new_text(buf.as_slice(), true);
|
||||
|
||||
struct EntityInfo {
|
||||
string tag_name;
|
||||
string argument;
|
||||
@ -3016,7 +3021,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
||||
if (ch != 0) {
|
||||
i--; // i will be incremented in for
|
||||
utf16_offset += 1 + (ch > 0xffff);
|
||||
append_utf8_character(result, ch);
|
||||
append_utf8_character(new_text, ch);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -3024,7 +3029,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
||||
if (is_utf8_character_first_code_unit(c)) {
|
||||
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
|
||||
}
|
||||
result.push_back(text[i]);
|
||||
new_text.push_back(text[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -3130,7 +3135,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
||||
<< "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
|
||||
}
|
||||
|
||||
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result.size());
|
||||
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, new_text.size());
|
||||
} else {
|
||||
// end of an entity
|
||||
if (nested_entities.empty()) {
|
||||
@ -3177,7 +3182,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
||||
} else if (tag_name == "a") {
|
||||
auto url = std::move(nested_entities.back().argument);
|
||||
if (url.empty()) {
|
||||
url = result.substr(nested_entities.back().entity_begin_pos);
|
||||
url = new_text.as_cslice().substr(nested_entities.back().entity_begin_pos).str();
|
||||
}
|
||||
auto user_id = LinkManager::get_link_user_id(url);
|
||||
if (user_id.is_valid()) {
|
||||
@ -3226,6 +3231,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
|
||||
|
||||
sort_entities(entities);
|
||||
|
||||
result = new_text.as_cslice().str();
|
||||
return std::move(entities);
|
||||
}
|
||||
|
||||
|
@ -32,6 +32,16 @@ class StringBuilder {
|
||||
current_ptr_--;
|
||||
}
|
||||
|
||||
void push_back(char c) {
|
||||
if (unlikely(end_ptr_ <= current_ptr_)) {
|
||||
if (!reserve_inner(RESERVED_SIZE)) {
|
||||
on_error();
|
||||
return;
|
||||
}
|
||||
}
|
||||
*current_ptr_++ = c;
|
||||
}
|
||||
|
||||
MutableCSlice as_cslice() {
|
||||
if (current_ptr_ >= end_ptr_ + RESERVED_SIZE) {
|
||||
std::abort(); // shouldn't happen
|
||||
@ -40,6 +50,10 @@ class StringBuilder {
|
||||
return MutableCSlice(begin_ptr_, current_ptr_);
|
||||
}
|
||||
|
||||
size_t size() {
|
||||
return static_cast<size_t>(current_ptr_ - begin_ptr_);
|
||||
}
|
||||
|
||||
// Returns the current value of error_flag_.
// NOTE(review): presumably the flag is raised by on_error() when the buffer
// cannot grow - confirm against the rest of the class.
bool is_error() const {
|
||||
return error_flag_;
|
||||
}
|
||||
@ -132,6 +146,7 @@ class StringBuilder {
|
||||
}
|
||||
return reserve_inner(RESERVED_SIZE);
|
||||
}
|
||||
|
||||
bool reserve(size_t size) {
|
||||
if (end_ptr_ > current_ptr_ && static_cast<size_t>(end_ptr_ - current_ptr_) >= size) {
|
||||
return true;
|
||||
|
@ -62,24 +62,6 @@ bool check_utf8(CSlice str) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Appends the UTF-8 encoding of the code point `ch` to `str`.
// The encoded length is picked by value range: up to 0x7f -> 1 byte,
// up to 0x7ff -> 2 bytes, up to 0xffff -> 3 bytes, anything larger -> 4 bytes.
// No validation of `ch` is performed.
void append_utf8_character(string &str, uint32 ch) {
  // Stores one already-composed byte; the narrowing to char is implementation-defined.
  const auto put = [&str](uint32 byte) { str.push_back(static_cast<char>(byte)); };
  // Builds a continuation byte (10xxxxxx) from the six bits of `ch` starting at `shift`.
  const auto tail = [ch](int shift) -> uint32 { return 0x80 | ((ch >> shift) & 0x3f); };

  if (ch <= 0x7f) {
    put(ch);
    return;
  }
  if (ch <= 0x7ff) {
    put(0xc0 | (ch >> 6));
    put(tail(0));
    return;
  }
  if (ch <= 0xffff) {
    put(0xe0 | (ch >> 12));
    put(tail(6));
    put(tail(0));
    return;
  }
  put(0xf0 | (ch >> 18));
  put(tail(12));
  put(tail(6));
  put(tail(0));
}
|
||||
|
||||
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
|
||||
uint32 a = ptr[0];
|
||||
if ((a & 0x80) == 0) {
|
||||
|
@ -32,7 +32,24 @@ inline size_t utf8_length(Slice str) {
|
||||
size_t utf8_utf16_length(Slice str);
|
||||
|
||||
/// appends a Unicode character using UTF-8 encoding
|
||||
void append_utf8_character(string &str, uint32 ch);
|
||||
// Writes the UTF-8 encoding of the code point `ch` into `str`, one byte at a
// time through `str.push_back(char)`; that member is the only thing required
// of T. Code points up to 0x7f take one byte, up to 0x7ff two, up to 0xffff
// three, and everything above four. `ch` is not validated.
template <class T>
void append_utf8_character(T &str, uint32 ch) {
  // ASCII range: the code point is its own single-byte encoding.
  if (ch <= 0x7f) {
    str.push_back(static_cast<char>(ch));
    return;
  }

  // Select the lead-byte marker and the number of continuation bytes after it.
  uint32 lead_marker;
  int tail_count;
  if (ch <= 0x7ff) {
    lead_marker = 0xc0;
    tail_count = 1;
  } else if (ch <= 0xffff) {
    lead_marker = 0xe0;
    tail_count = 2;
  } else {
    lead_marker = 0xf0;
    tail_count = 3;
  }

  // The lead byte holds the bits left above the continuation payload.
  str.push_back(static_cast<char>(lead_marker | (ch >> (6 * tail_count))));  // implementation-defined

  // Each continuation byte is 10xxxxxx with six payload bits, most significant group first.
  for (int shift = 6 * (tail_count - 1); shift >= 0; shift -= 6) {
    str.push_back(static_cast<char>(0x80 | ((ch >> shift) & 0x3f)));
  }
}
|
||||
|
||||
/// moves pointer one UTF-8 character back
|
||||
inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user