From 806e570a725263ae2607b21585fc67084008f2a4 Mon Sep 17 00:00:00 2001
From: levlam <levlam@telegram.org>
Date: Mon, 8 Oct 2018 15:53:05 +0300
Subject: [PATCH] Add source to next_utf8_unsafe.

GitOrigin-RevId: e8e5a47096461c0e76a64eb26cb848651d4d61e8
---
 td/telegram/MessageEntity.cpp       | 32 ++++++++++++++---------------
 td/telegram/MessagesDb.cpp          |  2 +-
 td/telegram/MessagesManager.cpp     |  4 ++--
 tdutils/td/utils/Hints.cpp          |  2 +-
 tdutils/td/utils/filesystem.cpp     |  2 +-
 tdutils/td/utils/port/IPAddress.cpp |  2 +-
 tdutils/td/utils/translit.cpp       |  4 ++--
 tdutils/td/utils/utf8.cpp           |  6 +++---
 tdutils/td/utils/utf8.h             |  2 +-
 9 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp
index 194af981a..3324400c8 100644
--- a/td/telegram/MessageEntity.cpp
+++ b/td/telegram/MessageEntity.cpp
@@ -189,7 +189,7 @@ static vector<Slice> match_mentions(Slice str) {
 
     if (ptr != begin) {
       uint32 prev;
-      next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
+      next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_mentions");
 
       if (is_word_character(prev)) {
         ptr++;
@@ -207,7 +207,7 @@ static vector<Slice> match_mentions(Slice str) {
     }
     uint32 next = 0;
     if (ptr != end) {
-      next_utf8_unsafe(ptr, &next);
+      next_utf8_unsafe(ptr, &next, "match_mentions 2");
     }
     if (is_word_character(next)) {
       continue;
@@ -233,7 +233,7 @@ static vector<Slice> match_bot_commands(Slice str) {
 
     if (ptr != begin) {
       uint32 prev;
-      next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
+      next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_bot_commands");
 
       if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') {
         ptr++;
@@ -266,7 +266,7 @@ static vector<Slice> match_bot_commands(Slice str) {
 
     uint32 next = 0;
     if (ptr != end) {
-      next_utf8_unsafe(ptr, &next);
+      next_utf8_unsafe(ptr, &next, "match_bot_commands 2");
     }
     if (is_word_character(next) || next == '/' || next == '<' || next == '>') {
       continue;
@@ -309,7 +309,7 @@ static vector<Slice> match_hashtags(Slice str) {
 
     if (ptr != begin) {
       uint32 prev;
-      next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
+      next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_hashtags");
 
       if (is_hashtag_letter(prev, category)) {
         ptr++;
@@ -322,7 +322,7 @@ static vector<Slice> match_hashtags(Slice str) {
     bool was_letter = false;
     while (ptr != end) {
       uint32 code;
-      auto next_ptr = next_utf8_unsafe(ptr, &code);
+      auto next_ptr = next_utf8_unsafe(ptr, &code, "match_hashtags 2");
       if (!is_hashtag_letter(code, category)) {
         break;
       }
@@ -370,7 +370,7 @@ static vector<Slice> match_cashtags(Slice str) {
 
     if (ptr != begin) {
       uint32 prev;
-      next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
+      next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_cashtags");
 
       if (is_hashtag_letter(prev, category) || prev == '$') {
         ptr++;
@@ -390,7 +390,7 @@ static vector<Slice> match_cashtags(Slice str) {
 
     if (cashtag_end != end) {
       uint32 code;
-      next_utf8_unsafe(ptr, &code);
+      next_utf8_unsafe(ptr, &code, "match_cashtags 2");
       if (is_hashtag_letter(code, category) || code == '$') {
         continue;
       }
@@ -480,7 +480,7 @@ static vector<Slice> match_urls(Slice str) {
     const unsigned char *domain_end_ptr = begin + dot_pos;
     while (domain_end_ptr != end) {
       uint32 code = 0;
-      auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code);
+      auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls");
       if (code == '@') {
         last_at_ptr = domain_end_ptr;
       }
@@ -492,7 +492,7 @@ static vector<Slice> match_urls(Slice str) {
     domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1;
     while (domain_end_ptr != end) {
       uint32 code = 0;
-      auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code);
+      auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2");
       if (!is_domain_symbol(code)) {
         break;
       }
@@ -503,7 +503,7 @@ static vector<Slice> match_urls(Slice str) {
     while (domain_begin_ptr != begin) {
       domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
       uint32 code = 0;
-      auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code);
+      auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3");
       if (last_at_ptr == nullptr ? !is_domain_symbol(code) : !is_user_data_symbol(code)) {
         domain_begin_ptr = next_ptr;
         break;
@@ -534,7 +534,7 @@ static vector<Slice> match_urls(Slice str) {
       auto path_end_ptr = url_end_ptr + 1;
       while (path_end_ptr != end) {
         uint32 code = 0;
-        auto next_ptr = next_utf8_unsafe(path_end_ptr, &code);
+        auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_urls 4");
         if (!is_path_symbol(code)) {
           break;
         }
@@ -559,7 +559,7 @@ static vector<Slice> match_urls(Slice str) {
       while (user_data_begin_ptr != begin) {
         user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);
         uint32 code = 0;
-        auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code);
+        auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code, "match_urls 5");
         if (!is_user_data_symbol(code)) {
           user_data_begin_ptr = next_ptr;
           break;
@@ -579,7 +579,7 @@ static vector<Slice> match_urls(Slice str) {
         while (protocol_begin_ptr != begin) {
           protocol_begin_ptr = prev_utf8_unsafe(protocol_begin_ptr);
           uint32 code = 0;
-          auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code);
+          auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code, "match_urls 6");
           if (!is_protocol_symbol(code)) {
             protocol_begin_ptr = next_ptr;
             break;
@@ -601,7 +601,7 @@ static vector<Slice> match_urls(Slice str) {
         auto prefix_end = prefix.uend();
         auto prefix_back = prev_utf8_unsafe(prefix_end);
         uint32 code = 0;
-        next_utf8_unsafe(prefix_back, &code);
+        next_utf8_unsafe(prefix_back, &code, "match_urls 7");
         if (is_word_character(code) || code == '/' || code == '#' || code == '@') {
           is_bad = true;
         }
@@ -1117,7 +1117,7 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands, bool onl
     while (ptr != end && cnt > 0) {
       unsigned char c = ptr[0];
       utf16_pos += 1 + (c >= 0xf0);
-      ptr = next_utf8_unsafe(ptr, nullptr);
+      ptr = next_utf8_unsafe(ptr, nullptr, "find_entities");
 
       pos = static_cast<int32>(ptr - begin);
       if (entity_begin == pos) {
diff --git a/td/telegram/MessagesDb.cpp b/td/telegram/MessagesDb.cpp
index fe96da3b3..c1ae707e9 100644
--- a/td/telegram/MessagesDb.cpp
+++ b/td/telegram/MessagesDb.cpp
@@ -530,7 +530,7 @@ class MessagesDbImpl : public MessagesDbSyncInterface {
     for (auto ptr = query.ubegin(), end = query.uend(); ptr < end;) {
       uint32 code;
       auto code_ptr = ptr;
-      ptr = next_utf8_unsafe(ptr, &code);
+      ptr = next_utf8_unsafe(ptr, &code, "prepare_query");
       if (is_word_character(code)) {
         if (!in_word) {
           in_word = true;
diff --git a/td/telegram/MessagesManager.cpp b/td/telegram/MessagesManager.cpp
index 446269cf5..94a38a7de 100644
--- a/td/telegram/MessagesManager.cpp
+++ b/td/telegram/MessagesManager.cpp
@@ -22083,14 +22083,14 @@ void MessagesManager::update_used_hashtags(DialogId dialog_id, const Message *m)
     }
     while (utf16_pos < entity.offset && ptr < end) {
       utf16_pos += 1 + (ptr[0] >= 0xf0);
-      ptr = next_utf8_unsafe(ptr, nullptr);
+      ptr = next_utf8_unsafe(ptr, nullptr, "update_used_hashtags");
     }
     CHECK(utf16_pos == entity.offset);
     auto from = ptr;
 
     while (utf16_pos < entity.offset + entity.length && ptr < end) {
       utf16_pos += 1 + (ptr[0] >= 0xf0);
-      ptr = next_utf8_unsafe(ptr, nullptr);
+      ptr = next_utf8_unsafe(ptr, nullptr, "update_used_hashtags 2");
     }
     CHECK(utf16_pos == entity.offset + entity.length);
     auto to = ptr;
diff --git a/tdutils/td/utils/Hints.cpp b/tdutils/td/utils/Hints.cpp
index 8b333f205..3bfb9999a 100644
--- a/tdutils/td/utils/Hints.cpp
+++ b/tdutils/td/utils/Hints.cpp
@@ -41,7 +41,7 @@ vector<string> Hints::get_words(Slice name) {
   auto end = name.uend();
   while (pos != end) {
     uint32 code;
-    pos = next_utf8_unsafe(pos, &code);
+    pos = next_utf8_unsafe(pos, &code, "get_words");
 
     code = prepare_search_character(code);
     if (code == 0) {
diff --git a/tdutils/td/utils/filesystem.cpp b/tdutils/td/utils/filesystem.cpp
index b22418151..220d015b3 100644
--- a/tdutils/td/utils/filesystem.cpp
+++ b/tdutils/td/utils/filesystem.cpp
@@ -82,7 +82,7 @@ static std::string clean_filename_part(Slice name, int max_length) {
   int size = 0;
   for (auto *it = name.ubegin(); it != name.uend() && size < max_length;) {
     uint32 code;
-    it = next_utf8_unsafe(it, &code);
+    it = next_utf8_unsafe(it, &code, "clean_filename_part");
     if (!is_ok(code)) {
       code = ' ';
     }
diff --git a/tdutils/td/utils/port/IPAddress.cpp b/tdutils/td/utils/port/IPAddress.cpp
index 85707c9e2..38c1c9a24 100644
--- a/tdutils/td/utils/port/IPAddress.cpp
+++ b/tdutils/td/utils/port/IPAddress.cpp
@@ -50,7 +50,7 @@ static void punycode(string &result, Slice part) {
   auto end = part.uend();
   while (begin != end) {
     uint32 code;
-    begin = next_utf8_unsafe(begin, &code);
+    begin = next_utf8_unsafe(begin, &code, "punycode");
     if (code <= 127u) {
       result += to_lower(static_cast<char>(code));
       processed++;
diff --git a/tdutils/td/utils/translit.cpp b/tdutils/td/utils/translit.cpp
index 2a2a9ec5b..81abdfcbc 100644
--- a/tdutils/td/utils/translit.cpp
+++ b/tdutils/td/utils/translit.cpp
@@ -55,7 +55,7 @@ void add_word_transliterations(vector<string> &result, Slice word, bool allow_pa
   auto end = word.uend();
   while (pos != end) {
     uint32 code;
-    pos = next_utf8_unsafe(pos, &code);
+    pos = next_utf8_unsafe(pos, &code, "add_word_transliterations");
     auto it = simple_rules.find(code);
     if (it != simple_rules.end()) {
       s += it->second;
@@ -88,7 +88,7 @@ void add_word_transliterations(vector<string> &result, Slice word, bool allow_pa
     }
 
     uint32 code;
-    pos = next_utf8_unsafe(pos, &code);
+    pos = next_utf8_unsafe(pos, &code, "add_word_transliterations 2");
     auto it = simple_rules.find(code);
     if (it != simple_rules.end()) {
       s += it->second;
diff --git a/tdutils/td/utils/utf8.cpp b/tdutils/td/utils/utf8.cpp
index 50f82d639..2e584553a 100644
--- a/tdutils/td/utils/utf8.cpp
+++ b/tdutils/td/utils/utf8.cpp
@@ -79,7 +79,7 @@ void append_utf8_character(string &str, uint32 ch) {
   }
 }
 
-const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
+const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source) {
   uint32 a = ptr[0];
   if ((a & 0x80) == 0) {
     if (code) {
@@ -102,7 +102,7 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
     }
     return ptr + 4;
   }
-  UNREACHABLE();
+  LOG(FATAL) << a << " " << source;
   if (code) {
     *code = 0;
   }
@@ -115,7 +115,7 @@ string utf8_to_lower(Slice str) {
   auto end = str.uend();
   while (pos != end) {
     uint32 code;
-    pos = next_utf8_unsafe(pos, &code);
+    pos = next_utf8_unsafe(pos, &code, "utf8_to_lower");
     append_utf8_character(result, unicode_to_lower(code));
   }
   return result;
diff --git a/tdutils/td/utils/utf8.h b/tdutils/td/utils/utf8.h
index 6be1952c1..b719e664c 100644
--- a/tdutils/td/utils/utf8.h
+++ b/tdutils/td/utils/utf8.h
@@ -40,7 +40,7 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
 }
 
 /// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
-const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
+const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source);
 
 /// truncates UTF-8 string to the given length in Unicode characters
 template <class T>