Remove the `source` parameter from next_utf8_unsafe.

This commit is contained in:
levlam 2022-08-19 16:37:44 +03:00
parent a8b3573a00
commit 96cca84a60
12 changed files with 41 additions and 42 deletions

View File

@ -5975,14 +5975,14 @@ void update_used_hashtags(Td *td, const MessageContent *content) {
} }
while (utf16_pos < entity.offset && ptr < end) { while (utf16_pos < entity.offset && ptr < end) {
utf16_pos += 1 + (ptr[0] >= 0xf0); utf16_pos += 1 + (ptr[0] >= 0xf0);
ptr = next_utf8_unsafe(ptr, &skipped_code, "update_used_hashtags"); ptr = next_utf8_unsafe(ptr, &skipped_code);
} }
CHECK(utf16_pos == entity.offset); CHECK(utf16_pos == entity.offset);
auto from = ptr; auto from = ptr;
while (utf16_pos < entity.offset + entity.length && ptr < end) { while (utf16_pos < entity.offset + entity.length && ptr < end) {
utf16_pos += 1 + (ptr[0] >= 0xf0); utf16_pos += 1 + (ptr[0] >= 0xf0);
ptr = next_utf8_unsafe(ptr, &skipped_code, "update_used_hashtags 2"); ptr = next_utf8_unsafe(ptr, &skipped_code);
} }
CHECK(utf16_pos == entity.offset + entity.length); CHECK(utf16_pos == entity.offset + entity.length);
auto to = ptr; auto to = ptr;

View File

@ -262,7 +262,7 @@ static vector<Slice> match_mentions(Slice str) {
if (ptr != begin) { if (ptr != begin) {
uint32 prev; uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_mentions"); next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
if (is_word_character(prev)) { if (is_word_character(prev)) {
ptr++; ptr++;
@ -280,7 +280,7 @@ static vector<Slice> match_mentions(Slice str) {
} }
uint32 next = 0; uint32 next = 0;
if (ptr != end) { if (ptr != end) {
next_utf8_unsafe(ptr, &next, "match_mentions 2"); next_utf8_unsafe(ptr, &next);
} }
if (is_word_character(next)) { if (is_word_character(next)) {
continue; continue;
@ -306,7 +306,7 @@ static vector<Slice> match_bot_commands(Slice str) {
if (ptr != begin) { if (ptr != begin) {
uint32 prev; uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_bot_commands"); next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') { if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') {
ptr++; ptr++;
@ -339,7 +339,7 @@ static vector<Slice> match_bot_commands(Slice str) {
uint32 next = 0; uint32 next = 0;
if (ptr != end) { if (ptr != end) {
next_utf8_unsafe(ptr, &next, "match_bot_commands 2"); next_utf8_unsafe(ptr, &next);
} }
if (is_word_character(next) || next == '/' || next == '<' || next == '>') { if (is_word_character(next) || next == '/' || next == '<' || next == '>') {
continue; continue;
@ -382,7 +382,7 @@ static vector<Slice> match_hashtags(Slice str) {
if (ptr != begin) { if (ptr != begin) {
uint32 prev; uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_hashtags"); next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
if (is_hashtag_letter(prev, category)) { if (is_hashtag_letter(prev, category)) {
ptr++; ptr++;
@ -395,7 +395,7 @@ static vector<Slice> match_hashtags(Slice str) {
bool was_letter = false; bool was_letter = false;
while (ptr != end) { while (ptr != end) {
uint32 code; uint32 code;
auto next_ptr = next_utf8_unsafe(ptr, &code, "match_hashtags 2"); auto next_ptr = next_utf8_unsafe(ptr, &code);
if (!is_hashtag_letter(code, category)) { if (!is_hashtag_letter(code, category)) {
break; break;
} }
@ -443,7 +443,7 @@ static vector<Slice> match_cashtags(Slice str) {
if (ptr != begin) { if (ptr != begin) {
uint32 prev; uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_cashtags"); next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
if (is_hashtag_letter(prev, category) || prev == '$') { if (is_hashtag_letter(prev, category) || prev == '$') {
ptr++; ptr++;
@ -467,7 +467,7 @@ static vector<Slice> match_cashtags(Slice str) {
if (cashtag_end != end) { if (cashtag_end != end) {
uint32 code; uint32 code;
next_utf8_unsafe(ptr, &code, "match_cashtags 2"); next_utf8_unsafe(ptr, &code);
if (is_hashtag_letter(code, category) || code == '$') { if (is_hashtag_letter(code, category) || code == '$') {
continue; continue;
} }
@ -506,7 +506,7 @@ static vector<Slice> match_media_timestamps(Slice str) {
if (media_timestamp_begin != begin) { if (media_timestamp_begin != begin) {
uint32 prev; uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(media_timestamp_begin), &prev, "match_media_timestamps 1"); next_utf8_unsafe(prev_utf8_unsafe(media_timestamp_begin), &prev);
if (is_word_character(prev)) { if (is_word_character(prev)) {
continue; continue;
@ -514,7 +514,7 @@ static vector<Slice> match_media_timestamps(Slice str) {
} }
if (media_timestamp_end != end) { if (media_timestamp_end != end) {
uint32 next; uint32 next;
next_utf8_unsafe(media_timestamp_end, &next, "match_media_timestamps 2"); next_utf8_unsafe(media_timestamp_end, &next);
if (is_word_character(next)) { if (is_word_character(next)) {
continue; continue;
@ -546,7 +546,7 @@ static vector<Slice> match_bank_card_numbers(Slice str) {
} }
if (ptr != begin) { if (ptr != begin) {
uint32 prev; uint32 prev;
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_bank_card_numbers"); next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
if (prev == '.' || prev == ',' || prev == '+' || prev == '-' || prev == '_' || if (prev == '.' || prev == ',' || prev == '+' || prev == '-' || prev == '_' ||
get_unicode_simple_category(prev) == UnicodeSimpleCategory::Letter) { get_unicode_simple_category(prev) == UnicodeSimpleCategory::Letter) {
@ -582,7 +582,7 @@ static vector<Slice> match_bank_card_numbers(Slice str) {
} }
if (card_number_end != end) { if (card_number_end != end) {
uint32 next; uint32 next;
next_utf8_unsafe(card_number_end, &next, "match_bank_card_numbers 2"); next_utf8_unsafe(card_number_end, &next);
if (next == '-' || next == '_' || get_unicode_simple_category(next) == UnicodeSimpleCategory::Letter) { if (next == '-' || next == '_' || get_unicode_simple_category(next) == UnicodeSimpleCategory::Letter) {
continue; continue;
} }
@ -657,7 +657,7 @@ static vector<Slice> match_tg_urls(Slice str) {
auto path_end_ptr = ptr + 1; auto path_end_ptr = ptr + 1;
while (path_end_ptr != end) { while (path_end_ptr != end) {
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_tg_urls"); auto next_ptr = next_utf8_unsafe(path_end_ptr, &code);
if (!is_url_path_symbol(code)) { if (!is_url_path_symbol(code)) {
break; break;
} }
@ -739,7 +739,7 @@ static vector<Slice> match_urls(Slice str) {
while (domain_begin_ptr != begin) { while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr); domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 0"); auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code);
if (!is_domain_symbol(code)) { if (!is_domain_symbol(code)) {
domain_begin_ptr = next_ptr; domain_begin_ptr = next_ptr;
break; break;
@ -752,7 +752,7 @@ static vector<Slice> match_urls(Slice str) {
// try to find '@' to the right if there is no '@' to the left // try to find '@' to the right if there is no '@' to the left
while (domain_end_ptr != end) { while (domain_end_ptr != end) {
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls"); auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code);
if (code == '@') { if (code == '@') {
last_at_ptr = domain_end_ptr; last_at_ptr = domain_end_ptr;
} }
@ -765,7 +765,7 @@ static vector<Slice> match_urls(Slice str) {
} }
while (domain_end_ptr != end) { while (domain_end_ptr != end) {
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2"); auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code);
if (!is_domain_symbol(code)) { if (!is_domain_symbol(code)) {
break; break;
} }
@ -776,7 +776,7 @@ static vector<Slice> match_urls(Slice str) {
while (domain_begin_ptr != begin) { while (domain_begin_ptr != begin) {
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr); domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3"); auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code);
if (!is_user_data_symbol(code)) { if (!is_user_data_symbol(code)) {
domain_begin_ptr = next_ptr; domain_begin_ptr = next_ptr;
break; break;
@ -808,7 +808,7 @@ static vector<Slice> match_urls(Slice str) {
auto path_end_ptr = url_end_ptr + 1; auto path_end_ptr = url_end_ptr + 1;
while (path_end_ptr != end) { while (path_end_ptr != end) {
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_urls 4"); auto next_ptr = next_utf8_unsafe(path_end_ptr, &code);
if (!is_url_path_symbol(code)) { if (!is_url_path_symbol(code)) {
break; break;
} }
@ -836,7 +836,7 @@ static vector<Slice> match_urls(Slice str) {
while (user_data_begin_ptr != begin) { while (user_data_begin_ptr != begin) {
user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr); user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code, "match_urls 5"); auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code);
if (!is_user_data_symbol(code)) { if (!is_user_data_symbol(code)) {
user_data_begin_ptr = next_ptr; user_data_begin_ptr = next_ptr;
break; break;
@ -856,7 +856,7 @@ static vector<Slice> match_urls(Slice str) {
while (protocol_begin_ptr != begin) { while (protocol_begin_ptr != begin) {
protocol_begin_ptr = prev_utf8_unsafe(protocol_begin_ptr); protocol_begin_ptr = prev_utf8_unsafe(protocol_begin_ptr);
uint32 code = 0; uint32 code = 0;
auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code, "match_urls 6"); auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code);
if (!is_protocol_symbol(code)) { if (!is_protocol_symbol(code)) {
protocol_begin_ptr = next_ptr; protocol_begin_ptr = next_ptr;
break; break;
@ -876,7 +876,7 @@ static vector<Slice> match_urls(Slice str) {
auto prefix_end = prefix.uend(); auto prefix_end = prefix.uend();
auto prefix_back = prev_utf8_unsafe(prefix_end); auto prefix_back = prev_utf8_unsafe(prefix_end);
uint32 code = 0; uint32 code = 0;
next_utf8_unsafe(prefix_back, &code, "match_urls 7"); next_utf8_unsafe(prefix_back, &code);
if (is_word_character(code) || code == '/' || code == '#' || code == '@') { if (is_word_character(code) || code == '/' || code == '#' || code == '@') {
is_bad = true; is_bad = true;
} }
@ -1669,7 +1669,7 @@ static void fix_entity_offsets(Slice text, vector<MessageEntity> &entities) {
while (ptr != end && cnt > 0) { while (ptr != end && cnt > 0) {
unsigned char c = ptr[0]; unsigned char c = ptr[0];
utf16_pos += 1 + (c >= 0xf0); utf16_pos += 1 + (c >= 0xf0);
ptr = next_utf8_unsafe(ptr, &skipped_code, "fix_entity_offsets"); ptr = next_utf8_unsafe(ptr, &skipped_code);
pos = static_cast<int32>(ptr - begin); pos = static_cast<int32>(ptr - begin);
if (entity_begin == pos) { if (entity_begin == pos) {

View File

@ -736,7 +736,7 @@ class MessagesDbImpl final : public MessagesDbSyncInterface {
for (auto ptr = query.ubegin(), end = query.uend(); ptr < end;) { for (auto ptr = query.ubegin(), end = query.uend(); ptr < end;) {
uint32 code; uint32 code;
auto code_ptr = ptr; auto code_ptr = ptr;
ptr = next_utf8_unsafe(ptr, &code, "prepare_query"); ptr = next_utf8_unsafe(ptr, &code);
if (is_word_character(code)) { if (is_word_character(code)) {
if (!in_word) { if (!in_word) {
in_word = true; in_word = true;

View File

@ -8460,7 +8460,7 @@ vector<string> StickersManager::get_emoji_language_codes(const vector<string> &i
} }
if (!text.empty()) { if (!text.empty()) {
uint32 code = 0; uint32 code = 0;
next_utf8_unsafe(text.ubegin(), &code, "get_emoji_language_codes"); next_utf8_unsafe(text.ubegin(), &code);
if ((0x410 <= code && code <= 0x44F) || code == 0x401 || code == 0x451) { if ((0x410 <= code && code <= 0x44F) || code == 0x401 || code == 0x451) {
// the first letter is cyrillic // the first letter is cyrillic
if (!td::contains(language_codes, "ru") && !td::contains(language_codes, "uk") && if (!td::contains(language_codes, "ru") && !td::contains(language_codes, "uk") &&

View File

@ -34,7 +34,7 @@ vector<string> Hints::fix_words(vector<string> words) {
return words; return words;
} }
vector<string> Hints::get_words(Slice name, bool is_search) { vector<string> Hints::get_words(Slice name) {
bool in_word = false; bool in_word = false;
string word; string word;
vector<string> words; vector<string> words;
@ -42,7 +42,7 @@ vector<string> Hints::get_words(Slice name, bool is_search) {
auto end = name.uend(); auto end = name.uend();
while (pos != end) { while (pos != end) {
uint32 code; uint32 code;
pos = next_utf8_unsafe(pos, &code, is_search ? "get_words_search" : "get_words_add"); pos = next_utf8_unsafe(pos, &code);
code = prepare_search_character(code); code = prepare_search_character(code);
if (code == 0) { if (code == 0) {
@ -94,7 +94,7 @@ void Hints::add(KeyT key, Slice name) {
return; return;
} }
vector<string> old_transliterations; vector<string> old_transliterations;
for (auto &old_word : get_words(it->second, false)) { for (auto &old_word : get_words(it->second)) {
delete_word(old_word, key, word_to_keys_); delete_word(old_word, key, word_to_keys_);
for (auto &w : get_word_transliterations(old_word, false)) { for (auto &w : get_word_transliterations(old_word, false)) {
@ -116,7 +116,7 @@ void Hints::add(KeyT key, Slice name) {
} }
vector<string> transliterations; vector<string> transliterations;
for (auto &word : get_words(name, false)) { for (auto &word : get_words(name)) {
add_word(word, key, word_to_keys_); add_word(word, key, word_to_keys_);
for (auto &w : get_word_transliterations(word, false)) { for (auto &w : get_word_transliterations(word, false)) {
@ -166,7 +166,7 @@ std::pair<size_t, vector<Hints::KeyT>> Hints::search(Slice query, int32 limit, b
return {key_to_name_.size(), std::move(results)}; return {key_to_name_.size(), std::move(results)};
} }
auto words = get_words(query, true); auto words = get_words(query);
if (return_all_for_empty_query && words.empty()) { if (return_all_for_empty_query && words.empty()) {
results.reserve(key_to_name_.size()); results.reserve(key_to_name_.size());
for (auto &it : key_to_name_) { for (auto &it : key_to_name_) {

View File

@ -52,7 +52,7 @@ class Hints {
static vector<string> fix_words(vector<string> words); static vector<string> fix_words(vector<string> words);
static vector<string> get_words(Slice name, bool is_search); static vector<string> get_words(Slice name);
static void add_search_results(vector<KeyT> &results, const string &word, static void add_search_results(vector<KeyT> &results, const string &word,
const std::map<string, vector<KeyT>> &word_to_keys); const std::map<string, vector<KeyT>> &word_to_keys);

View File

@ -133,7 +133,7 @@ static string clean_filename_part(Slice name, int max_length) {
int size = 0; int size = 0;
for (auto *it = name.ubegin(); it != name.uend() && size < max_length;) { for (auto *it = name.ubegin(); it != name.uend() && size < max_length;) {
uint32 code; uint32 code;
it = next_utf8_unsafe(it, &code, "clean_filename_part"); it = next_utf8_unsafe(it, &code);
if (!is_ok(code)) { if (!is_ok(code)) {
if (prepare_search_character(code) == 0) { if (prepare_search_character(code) == 0) {
continue; continue;

View File

@ -53,7 +53,7 @@ static void punycode(string &result, Slice part) {
auto end = part.uend(); auto end = part.uend();
while (begin != end) { while (begin != end) {
uint32 code; uint32 code;
begin = next_utf8_unsafe(begin, &code, "punycode"); begin = next_utf8_unsafe(begin, &code);
if (code <= 127u) { if (code <= 127u) {
result += to_lower(static_cast<char>(code)); result += to_lower(static_cast<char>(code));
processed++; processed++;

View File

@ -56,7 +56,7 @@ static void add_word_transliterations(vector<string> &result, Slice word, bool a
auto end = word.uend(); auto end = word.uend();
while (pos != end) { while (pos != end) {
uint32 code; uint32 code;
pos = next_utf8_unsafe(pos, &code, "add_word_transliterations"); pos = next_utf8_unsafe(pos, &code);
auto it = simple_rules.find(code); auto it = simple_rules.find(code);
if (it != simple_rules.end()) { if (it != simple_rules.end()) {
s += it->second; s += it->second;
@ -89,7 +89,7 @@ static void add_word_transliterations(vector<string> &result, Slice word, bool a
} }
uint32 code; uint32 code;
pos = next_utf8_unsafe(pos, &code, "add_word_transliterations 2"); pos = next_utf8_unsafe(pos, &code);
auto it = simple_rules.find(code); auto it = simple_rules.find(code);
if (it != simple_rules.end()) { if (it != simple_rules.end()) {
s += it->second; s += it->second;

View File

@ -6,7 +6,6 @@
// //
#include "td/utils/utf8.h" #include "td/utils/utf8.h"
#include "td/utils/logging.h"
#include "td/utils/misc.h" #include "td/utils/misc.h"
#include "td/utils/SliceBuilder.h" #include "td/utils/SliceBuilder.h"
#include "td/utils/unicode.h" #include "td/utils/unicode.h"
@ -81,7 +80,7 @@ void append_utf8_character(string &str, uint32 ch) {
} }
} }
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source) { const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
uint32 a = ptr[0]; uint32 a = ptr[0];
if ((a & 0x80) == 0) { if ((a & 0x80) == 0) {
*code = a; *code = a;
@ -96,7 +95,7 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, co
*code = ((a & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f); *code = ((a & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f);
return ptr + 4; return ptr + 4;
} }
LOG(FATAL) << a << " " << source; UNREACHABLE();
*code = 0; *code = 0;
return ptr; return ptr;
} }
@ -107,7 +106,7 @@ string utf8_to_lower(Slice str) {
auto end = str.uend(); auto end = str.uend();
while (pos != end) { while (pos != end) {
uint32 code; uint32 code;
pos = next_utf8_unsafe(pos, &code, "utf8_to_lower"); pos = next_utf8_unsafe(pos, &code);
append_utf8_character(result, unicode_to_lower(code)); append_utf8_character(result, unicode_to_lower(code));
} }
return result; return result;

View File

@ -49,7 +49,7 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
} }
/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code /// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source); const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
/// truncates UTF-8 string to the given length in Unicode characters /// truncates UTF-8 string to the given length in Unicode characters
template <class T> template <class T>

View File

@ -50,7 +50,7 @@ class RangeSet {
RangeSet res; RangeSet res;
for (auto begin = data.ubegin(); begin != data.uend();) { for (auto begin = data.ubegin(); begin != data.uend();) {
uint32 size; uint32 size;
begin = next_utf8_unsafe(begin, &size, "RangeSet"); begin = next_utf8_unsafe(begin, &size);
if (!is_empty && size != 0) { if (!is_empty && size != 0) {
res.ranges_.push_back({curr * BIT_SIZE, (curr + size) * BIT_SIZE}); res.ranges_.push_back({curr * BIT_SIZE, (curr + size) * BIT_SIZE});