Use StringBuilder to create new string in parse_html.

This commit is contained in:
levlam 2023-01-09 12:43:15 +03:00
parent 2ba41ac279
commit 0fb64f97a4
4 changed files with 43 additions and 23 deletions

View File

@ -24,6 +24,8 @@
#include "td/utils/misc.h"
#include "td/utils/Promise.h"
#include "td/utils/SliceBuilder.h"
#include "td/utils/StackAllocator.h"
#include "td/utils/StringBuilder.h"
#include "td/utils/unicode.h"
#include "td/utils/utf8.h"
@ -2994,6 +2996,9 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
vector<MessageEntity> entities;
int32 utf16_offset = 0;
auto buf = StackAllocator::alloc(text.size() + 30);
StringBuilder new_text(buf.as_slice(), true);
struct EntityInfo {
string tag_name;
string argument;
@ -3016,7 +3021,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
if (ch != 0) {
i--; // i will be incremented in for
utf16_offset += 1 + (ch > 0xffff);
append_utf8_character(result, ch);
append_utf8_character(new_text, ch);
continue;
}
}
@ -3024,7 +3029,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
if (is_utf8_character_first_code_unit(c)) {
utf16_offset += 1 + (c >= 0xf0); // >= 4 bytes in symbol => surrogate pair
}
result.push_back(text[i]);
new_text.push_back(text[i]);
continue;
}
@ -3130,7 +3135,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
<< "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
}
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result.size());
nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, new_text.size());
} else {
// end of an entity
if (nested_entities.empty()) {
@ -3177,7 +3182,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
} else if (tag_name == "a") {
auto url = std::move(nested_entities.back().argument);
if (url.empty()) {
url = result.substr(nested_entities.back().entity_begin_pos);
url = new_text.as_cslice().substr(nested_entities.back().entity_begin_pos).str();
}
auto user_id = LinkManager::get_link_user_id(url);
if (user_id.is_valid()) {
@ -3226,6 +3231,7 @@ static Result<vector<MessageEntity>> do_parse_html(CSlice text, string &result)
sort_entities(entities);
result = new_text.as_cslice().str();
return std::move(entities);
}

View File

@ -32,6 +32,16 @@ class StringBuilder {
current_ptr_--;
}
// Appends a single character to the builder.
// When no room is left before end_ptr_, first tries to obtain RESERVED_SIZE more bytes via
// reserve_inner(); if that fails, on_error() is called and the character is not written.
void push_back(char c) {
  const bool is_full = end_ptr_ <= current_ptr_;
  // reserve_inner() is reached only when the buffer is full (short-circuit evaluation)
  if (unlikely(is_full) && !reserve_inner(RESERVED_SIZE)) {
    on_error();
    return;
  }
  *current_ptr_ = c;
  ++current_ptr_;
}
MutableCSlice as_cslice() {
if (current_ptr_ >= end_ptr_ + RESERVED_SIZE) {
std::abort(); // shouldn't happen
@ -40,6 +50,10 @@ class StringBuilder {
return MutableCSlice(begin_ptr_, current_ptr_);
}
// Returns the distance from begin_ptr_ to current_ptr_, i.e. the number of characters
// currently held between the start of the buffer and the write position.
size_t size() {
  const auto written = current_ptr_ - begin_ptr_;
  return static_cast<size_t>(written);
}
// Returns the current value of error_flag_.
// NOTE(review): presumably the flag is raised by on_error() when the buffer cannot grow
// (see push_back) - confirm against the on_error() implementation.
bool is_error() const {
return error_flag_;
}
@ -132,6 +146,7 @@ class StringBuilder {
}
return reserve_inner(RESERVED_SIZE);
}
bool reserve(size_t size) {
if (end_ptr_ > current_ptr_ && static_cast<size_t>(end_ptr_ - current_ptr_) >= size) {
return true;

View File

@ -62,24 +62,6 @@ bool check_utf8(CSlice str) {
return false;
}
// Appends the code point `ch` to `str` as 1-4 UTF-8 bytes.
// The byte count is selected by the thresholds 0x7f / 0x7ff / 0xffff; everything above 0xffff
// is written as a 4-byte sequence. `ch` is not validated in any way.
void append_utf8_character(string &str, uint32 ch) {
  if (ch <= 0x7f) {
    // plain ASCII is stored as is
    str.push_back(static_cast<char>(ch));
    return;
  }

  // choose the lead-byte marker and the number of continuation bytes that follow it
  uint32 lead_marker = 0xf0;
  int tail_count = 3;
  if (ch <= 0x7ff) {
    lead_marker = 0xc0;
    tail_count = 1;
  } else if (ch <= 0xffff) {
    lead_marker = 0xe0;
    tail_count = 2;
  }

  // the lead byte carries the bits above all continuation groups;
  // the conversion to char is implementation-defined
  str.push_back(static_cast<char>(lead_marker | (ch >> (6 * tail_count))));

  // continuation bytes: 10xxxxxx, six bits each, most significant group first
  for (int shift = 6 * (tail_count - 1); shift >= 0; shift -= 6) {
    str.push_back(static_cast<char>(0x80 | ((ch >> shift) & 0x3f)));
  }
}
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
uint32 a = ptr[0];
if ((a & 0x80) == 0) {

View File

@ -32,7 +32,24 @@ inline size_t utf8_length(Slice str) {
size_t utf8_utf16_length(Slice str);
/// appends a Unicode character using UTF-8 encoding
void append_utf8_character(string &str, uint32 ch);
// Generic UTF-8 encoder: appends the code point `ch` to `str` as 1-4 bytes.
// T only has to provide push_back(char). `ch` is not validated; the sequence length is chosen
// solely by the thresholds 0x7f / 0x7ff / 0xffff.
template <class T>
void append_utf8_character(T &str, uint32 ch) {
  // emits one continuation byte (10xxxxxx) holding the six bits of ch that start at bit `shift`
  auto push_continuation = [&str, ch](int shift) {
    str.push_back(static_cast<char>(0x80 | ((ch >> shift) & 0x3f)));
  };

  if (ch <= 0x7f) {
    // 1 byte: plain ASCII
    str.push_back(static_cast<char>(ch));
    return;
  }
  if (ch <= 0x7ff) {
    // 2 bytes: 110xxxxx 10xxxxxx
    str.push_back(static_cast<char>(0xc0 | (ch >> 6)));  // implementation-defined
    push_continuation(0);
    return;
  }
  if (ch <= 0xffff) {
    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    str.push_back(static_cast<char>(0xe0 | (ch >> 12)));  // implementation-defined
    push_continuation(6);
    push_continuation(0);
    return;
  }
  // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  str.push_back(static_cast<char>(0xf0 | (ch >> 18)));  // implementation-defined
  push_continuation(12);
  push_continuation(6);
  push_continuation(0);
}
/// moves pointer one UTF-8 character back
inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {