Replace text in-place in parse_html.

2023-01-09 15:12:14 +03:00 · 2023-01-09 15:12:14 +03:00 · 137fd3beb9
commit 137fd3beb9
parent ac0de06b5f
3 changed files with 45 additions and 23 deletions
--- a/td/telegram/MessageEntity.cpp
+++ b/td/telegram/MessageEntity.cpp
@ -2991,14 +2991,13 @@ static uint32 decode_html_entity(CSlice text, size_t &pos) {
 Result<vector<MessageEntity>> parse_html(string &str) {
  auto str_size = str.size();
  const char *text = str.c_str();
  auto result_end = MutableSlice(str).ubegin();
  const unsigned char *result_begin = result_end;
  vector<MessageEntity> entities;
  int32 utf16_offset = 0;
  bool need_recheck_utf8 = false;
-  auto buf = StackAllocator::alloc(str_size + 30);
-  StringBuilder new_text(buf.as_slice(), true);
  struct EntityInfo {
    string tag_name;
    string argument;
@ -3025,7 +3024,8 @@ Result<vector<MessageEntity>> parse_html(string &str) {
          // half of a surrogate pair
          need_recheck_utf8 = true;
        }
-        append_utf8_character(new_text, ch);
+        result_end = append_utf8_character_unsafe(result_end, ch);
        CHECK(result_end <= result_begin + i);
        continue;
      }
    }
@ -3033,7 +3033,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
      if (is_utf8_character_first_code_unit(c)) {
        utf16_offset += 1 + (c >= 0xf0);  // >= 4 bytes in symbol => surrogate pair
      }
-      new_text.push_back(text[i]);
+      *result_end++ = c;
      continue;
    }
@ -3139,7 +3139,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
                                      << "Tag \"span\" must have class \"tg-spoiler\" at byte offset " << begin_pos);
      }
-      nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, new_text.size());
+      nested_entities.emplace_back(std::move(tag_name), std::move(argument), utf16_offset, result_end - result_begin);
    } else {
      // end of an entity
      if (nested_entities.empty()) {
@ -3186,7 +3186,7 @@ Result<vector<MessageEntity>> parse_html(string &str) {
        } else if (tag_name == "a") {
          auto url = std::move(nested_entities.back().argument);
          if (url.empty()) {
-            url = new_text.as_cslice().substr(nested_entities.back().entity_begin_pos).str();
+            url = Slice(result_begin + nested_entities.back().entity_begin_pos, result_end).str();
          }
          auto user_id = LinkManager::get_link_user_id(url);
          if (user_id.is_valid()) {
@ -3235,12 +3235,12 @@ Result<vector<MessageEntity>> parse_html(string &str) {
  sort_entities(entities);
-  if (need_recheck_utf8 && !check_utf8(new_text.as_cslice())) {
+  str.resize(static_cast<size_t>(result_end - result_begin));
  if (need_recheck_utf8 && !check_utf8(str)) {
    return Status::Error(400,
                         "Text contains invalid Unicode characters after decoding HTML entities, check for unmatched "
                         "surrogate code units");
  }
-  str = new_text.as_cslice().str();
  return std::move(entities);
 }
--- a/tdutils/td/utils/utf8.cpp
+++ b/tdutils/td/utils/utf8.cpp
@ -82,6 +82,25 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
  return ptr;
 }
 // Encodes `code` as UTF-8 into the buffer starting at `ptr` and returns the pointer
 // just past the last byte written. Between 1 and 4 bytes are written depending on the
 // magnitude of `code`. The function performs no bounds checking and no validation of
 // `code`, so the caller must guarantee that enough writable bytes are available at `ptr`.
 unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code) {
  if (code <= 0x7f) {
    // 1 byte: the value is stored unchanged
    *ptr++ = static_cast<unsigned char>(code);
  } else if (code <= 0x7ff) {
    // 2 bytes: 0xc0 lead byte with the high bits, then one 0x80 continuation byte with the low 6 bits
    *ptr++ = static_cast<unsigned char>(0xc0 | (code >> 6));
    *ptr++ = static_cast<unsigned char>(0x80 | (code & 0x3f));
  } else if (code <= 0xffff) {
    // 3 bytes: 0xe0 lead byte, then two 0x80 continuation bytes carrying 6 bits each
    *ptr++ = static_cast<unsigned char>(0xe0 | (code >> 12));
    *ptr++ = static_cast<unsigned char>(0x80 | ((code >> 6) & 0x3f));
    *ptr++ = static_cast<unsigned char>(0x80 | (code & 0x3f));
  } else {
    // 4 bytes: every value above 0xffff lands here; the lead byte is not masked,
    // so no upper limit on `code` is enforced by this function
    *ptr++ = static_cast<unsigned char>(0xf0 | (code >> 18));
    *ptr++ = static_cast<unsigned char>(0x80 | ((code >> 12) & 0x3f));
    *ptr++ = static_cast<unsigned char>(0x80 | ((code >> 6) & 0x3f));
    *ptr++ = static_cast<unsigned char>(0x80 | (code & 0x3f));
  }
  return ptr;
 }
 string utf8_to_lower(Slice str) {
  string result;
  auto pos = str.ubegin();
--- a/tdutils/td/utils/utf8.h
+++ b/tdutils/td/utils/utf8.h
@ -33,21 +33,21 @@ size_t utf8_utf16_length(Slice str);
 /// appends a Unicode character using UTF-8 encoding
 template <class T>
-void append_utf8_character(T &str, uint32 ch) {
+void append_utf8_character(T &str, uint32 code) {
-  if (ch <= 0x7f) {
+  if (code <= 0x7f) {
-    str.push_back(static_cast<char>(ch));
+    str.push_back(static_cast<char>(code));
-  } else if (ch <= 0x7ff) {
+  } else if (code <= 0x7ff) {
-    str.push_back(static_cast<char>(0xc0 | (ch >> 6)));  // implementation-defined
+    str.push_back(static_cast<char>(0xc0 | (code >> 6)));  // implementation-defined
-    str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
+    str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
-  } else if (ch <= 0xffff) {
+  } else if (code <= 0xffff) {
-    str.push_back(static_cast<char>(0xe0 | (ch >> 12)));  // implementation-defined
+    str.push_back(static_cast<char>(0xe0 | (code >> 12)));  // implementation-defined
-    str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f)));
+    str.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
-    str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
+    str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
  } else {
-    str.push_back(static_cast<char>(0xf0 | (ch >> 18)));  // implementation-defined
+    str.push_back(static_cast<char>(0xf0 | (code >> 18)));  // implementation-defined
-    str.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3f)));
+    str.push_back(static_cast<char>(0x80 | ((code >> 12) & 0x3f)));
-    str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f)));
+    str.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3f)));
-    str.push_back(static_cast<char>(0x80 | (ch & 0x3f)));
+    str.push_back(static_cast<char>(0x80 | (code & 0x3f)));
  }
 }
@ -62,6 +62,9 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
 /// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
 const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
 /// appends a Unicode character using UTF-8 encoding and returns updated pointer;
 /// performs no bounds checking: up to 4 bytes are written starting at ptr
 unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code);
 /// truncates UTF-8 string to the given length in Unicode characters
 template <class T>
 T utf8_truncate(T str, size_t length) {