From 806e570a725263ae2607b21585fc67084008f2a4 Mon Sep 17 00:00:00 2001 From: levlam Date: Mon, 8 Oct 2018 15:53:05 +0300 Subject: [PATCH] Add source to next_utf8_unsafe. GitOrigin-RevId: e8e5a47096461c0e76a64eb26cb848651d4d61e8 --- td/telegram/MessageEntity.cpp | 32 ++++++++++++++--------------- td/telegram/MessagesDb.cpp | 2 +- td/telegram/MessagesManager.cpp | 4 ++-- tdutils/td/utils/Hints.cpp | 2 +- tdutils/td/utils/filesystem.cpp | 2 +- tdutils/td/utils/port/IPAddress.cpp | 2 +- tdutils/td/utils/translit.cpp | 4 ++-- tdutils/td/utils/utf8.cpp | 6 +++--- tdutils/td/utils/utf8.h | 2 +- 9 files changed, 28 insertions(+), 28 deletions(-) diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp index 194af981a..3324400c8 100644 --- a/td/telegram/MessageEntity.cpp +++ b/td/telegram/MessageEntity.cpp @@ -189,7 +189,7 @@ static vector match_mentions(Slice str) { if (ptr != begin) { uint32 prev; - next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev); + next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_mentions"); if (is_word_character(prev)) { ptr++; @@ -207,7 +207,7 @@ static vector match_mentions(Slice str) { } uint32 next = 0; if (ptr != end) { - next_utf8_unsafe(ptr, &next); + next_utf8_unsafe(ptr, &next, "match_mentions 2"); } if (is_word_character(next)) { continue; @@ -233,7 +233,7 @@ static vector match_bot_commands(Slice str) { if (ptr != begin) { uint32 prev; - next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev); + next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_bot_commands"); if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') { ptr++; @@ -266,7 +266,7 @@ static vector match_bot_commands(Slice str) { uint32 next = 0; if (ptr != end) { - next_utf8_unsafe(ptr, &next); + next_utf8_unsafe(ptr, &next, "match_bot_commands 2"); } if (is_word_character(next) || next == '/' || next == '<' || next == '>') { continue; @@ -309,7 +309,7 @@ static vector match_hashtags(Slice str) { if (ptr != begin) { uint32 prev; - next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev); + next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_hashtags"); if (is_hashtag_letter(prev, category)) { ptr++; @@ -322,7 +322,7 @@ static vector match_hashtags(Slice str) { bool was_letter = false; while (ptr != end) { uint32 code; - auto next_ptr = next_utf8_unsafe(ptr, &code); + auto next_ptr = next_utf8_unsafe(ptr, &code, "match_hashtags 2"); if (!is_hashtag_letter(code, category)) { break; } @@ -370,7 +370,7 @@ static vector match_cashtags(Slice str) { if (ptr != begin) { uint32 prev; - next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev); + next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_cashtags"); if (is_hashtag_letter(prev, category) || prev == '$') { ptr++; @@ -390,7 +390,7 @@ static vector match_cashtags(Slice str) { if (cashtag_end != end) { uint32 code; - next_utf8_unsafe(ptr, &code); + next_utf8_unsafe(ptr, &code, "match_cashtags 2"); if (is_hashtag_letter(code, category) || code == '$') { continue; } @@ -480,7 +480,7 @@ static vector match_urls(Slice str) { const unsigned char *domain_end_ptr = begin + dot_pos; while (domain_end_ptr != end) { uint32 code = 0; - auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code); + auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls"); if (code == '@') { last_at_ptr = domain_end_ptr; } @@ -492,7 +492,7 @@ static vector match_urls(Slice str) { domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1; while (domain_end_ptr != end) { uint32 code = 0; - auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code); + auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2"); if (!is_domain_symbol(code)) { break; } @@ -503,7 +503,7 @@ static vector match_urls(Slice str) { while (domain_begin_ptr != begin) { domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr); uint32 code = 0; - auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code); + auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3"); if (last_at_ptr == nullptr ? !is_domain_symbol(code) : !is_user_data_symbol(code)) { domain_begin_ptr = next_ptr; break; @@ -534,7 +534,7 @@ static vector match_urls(Slice str) { auto path_end_ptr = url_end_ptr + 1; while (path_end_ptr != end) { uint32 code = 0; - auto next_ptr = next_utf8_unsafe(path_end_ptr, &code); + auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_urls 4"); if (!is_path_symbol(code)) { break; } @@ -559,7 +559,7 @@ static vector match_urls(Slice str) { while (user_data_begin_ptr != begin) { user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr); uint32 code = 0; - auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code); + auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code, "match_urls 5"); if (!is_user_data_symbol(code)) { user_data_begin_ptr = next_ptr; break; @@ -579,7 +579,7 @@ static vector match_urls(Slice str) { while (protocol_begin_ptr != begin) { protocol_begin_ptr = prev_utf8_unsafe(protocol_begin_ptr); uint32 code = 0; - auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code); + auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code, "match_urls 6"); if (!is_protocol_symbol(code)) { protocol_begin_ptr = next_ptr; break; @@ -601,7 +601,7 @@ static vector match_urls(Slice str) { auto prefix_end = prefix.uend(); auto prefix_back = prev_utf8_unsafe(prefix_end); uint32 code = 0; - next_utf8_unsafe(prefix_back, &code); + next_utf8_unsafe(prefix_back, &code, "match_urls 7"); if (is_word_character(code) || code == '/' || code == '#' || code == '@') { is_bad = true; } @@ -1117,7 +1117,7 @@ vector find_entities(Slice text, bool skip_bot_commands, bool onl while (ptr != end && cnt > 0) { unsigned char c = ptr[0]; utf16_pos += 1 + (c >= 0xf0); - ptr = next_utf8_unsafe(ptr, nullptr); + ptr = next_utf8_unsafe(ptr, nullptr, "match_urls 8"); pos = static_cast(ptr - begin); if (entity_begin == pos) { diff --git a/td/telegram/MessagesDb.cpp b/td/telegram/MessagesDb.cpp index fe96da3b3..c1ae707e9 100644 --- a/td/telegram/MessagesDb.cpp +++ b/td/telegram/MessagesDb.cpp @@ -530,7 +530,7 @@ class MessagesDbImpl : public MessagesDbSyncInterface { for (auto ptr = query.ubegin(), end = query.uend(); ptr < end;) { uint32 code; auto code_ptr = ptr; - ptr = next_utf8_unsafe(ptr, &code); + ptr = next_utf8_unsafe(ptr, &code, "prepare_query"); if (is_word_character(code)) { if (!in_word) { in_word = true; diff --git a/td/telegram/MessagesManager.cpp b/td/telegram/MessagesManager.cpp index 446269cf5..94a38a7de 100644 --- a/td/telegram/MessagesManager.cpp +++ b/td/telegram/MessagesManager.cpp @@ -22083,14 +22083,14 @@ void MessagesManager::update_used_hashtags(DialogId dialog_id, const Message *m) } while (utf16_pos < entity.offset && ptr < end) { utf16_pos += 1 + (ptr[0] >= 0xf0); - ptr = next_utf8_unsafe(ptr, nullptr); + ptr = next_utf8_unsafe(ptr, nullptr, "update_used_hashtags"); } CHECK(utf16_pos == entity.offset); auto from = ptr; while (utf16_pos < entity.offset + entity.length && ptr < end) { utf16_pos += 1 + (ptr[0] >= 0xf0); - ptr = next_utf8_unsafe(ptr, nullptr); + ptr = next_utf8_unsafe(ptr, nullptr, "update_used_hashtags 2"); } CHECK(utf16_pos == entity.offset + entity.length); auto to = ptr; diff --git a/tdutils/td/utils/Hints.cpp b/tdutils/td/utils/Hints.cpp index 8b333f205..3bfb9999a 100644 --- a/tdutils/td/utils/Hints.cpp +++ b/tdutils/td/utils/Hints.cpp @@ -41,7 +41,7 @@ vector Hints::get_words(Slice name) { auto end = name.uend(); while (pos != end) { uint32 code; - pos = next_utf8_unsafe(pos, &code); + pos = next_utf8_unsafe(pos, &code, "get_words"); code = prepare_search_character(code); if (code == 0) { diff --git a/tdutils/td/utils/filesystem.cpp b/tdutils/td/utils/filesystem.cpp index b22418151..220d015b3 100644 --- a/tdutils/td/utils/filesystem.cpp +++ b/tdutils/td/utils/filesystem.cpp @@ -82,7 +82,7 @@ static std::string clean_filename_part(Slice name, int max_length) { int size = 0; for (auto *it = name.ubegin(); it != name.uend() && size < max_length;) { uint32 code; - it = next_utf8_unsafe(it, &code); + it = next_utf8_unsafe(it, &code, "clean_filename_part"); if (!is_ok(code)) { code = ' '; } diff --git a/tdutils/td/utils/port/IPAddress.cpp b/tdutils/td/utils/port/IPAddress.cpp index 85707c9e2..38c1c9a24 100644 --- a/tdutils/td/utils/port/IPAddress.cpp +++ b/tdutils/td/utils/port/IPAddress.cpp @@ -50,7 +50,7 @@ static void punycode(string &result, Slice part) { auto end = part.uend(); while (begin != end) { uint32 code; - begin = next_utf8_unsafe(begin, &code); + begin = next_utf8_unsafe(begin, &code, "punycode"); if (code <= 127u) { result += to_lower(static_cast(code)); processed++; diff --git a/tdutils/td/utils/translit.cpp b/tdutils/td/utils/translit.cpp index 2a2a9ec5b..81abdfcbc 100644 --- a/tdutils/td/utils/translit.cpp +++ b/tdutils/td/utils/translit.cpp @@ -55,7 +55,7 @@ void add_word_transliterations(vector &result, Slice word, bool allow_pa auto end = word.uend(); while (pos != end) { uint32 code; - pos = next_utf8_unsafe(pos, &code); + pos = next_utf8_unsafe(pos, &code, "add_word_transliterations"); auto it = simple_rules.find(code); if (it != simple_rules.end()) { s += it->second; @@ -88,7 +88,7 @@ void add_word_transliterations(vector &result, Slice word, bool allow_pa } uint32 code; - pos = next_utf8_unsafe(pos, &code); + pos = next_utf8_unsafe(pos, &code, "add_word_transliterations 2"); auto it = simple_rules.find(code); if (it != simple_rules.end()) { s += it->second; diff --git a/tdutils/td/utils/utf8.cpp b/tdutils/td/utils/utf8.cpp index 50f82d639..2e584553a 100644 --- a/tdutils/td/utils/utf8.cpp +++ b/tdutils/td/utils/utf8.cpp @@ -79,7 +79,7 @@ void append_utf8_character(string &str, uint32 ch) { } } -const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) { +const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source) { uint32 a = ptr[0]; if ((a & 0x80) == 0) { if (code) { @@ -102,7 +102,7 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) { } return ptr + 4; } - UNREACHABLE(); + LOG(FATAL) << a << " " << source; if (code) { *code = 0; } @@ -115,7 +115,7 @@ string utf8_to_lower(Slice str) { auto end = str.uend(); while (pos != end) { uint32 code; - pos = next_utf8_unsafe(pos, &code); + pos = next_utf8_unsafe(pos, &code, "utf8_to_lower"); append_utf8_character(result, unicode_to_lower(code)); } return result; diff --git a/tdutils/td/utils/utf8.h b/tdutils/td/utils/utf8.h index 6be1952c1..b719e664c 100644 --- a/tdutils/td/utils/utf8.h +++ b/tdutils/td/utils/utf8.h @@ -40,7 +40,7 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) { } /// moves pointer one UTF-8 character forward and saves code of the skipped character in *code -const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code); +const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source); /// truncates UTF-8 string to the given length in Unicode characters template