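Add a source argument to next_utf8_unsafe; it is logged together with the first byte in a LOG(FATAL) that replaces the former UNREACHABLE().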
GitOrigin-RevId: e8e5a47096461c0e76a64eb26cb848651d4d61e8
This commit is contained in:
parent
08f04112af
commit
806e570a72
@ -189,7 +189,7 @@ static vector<Slice> match_mentions(Slice str) {
|
|||||||
|
|
||||||
if (ptr != begin) {
|
if (ptr != begin) {
|
||||||
uint32 prev;
|
uint32 prev;
|
||||||
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
|
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_mentions");
|
||||||
|
|
||||||
if (is_word_character(prev)) {
|
if (is_word_character(prev)) {
|
||||||
ptr++;
|
ptr++;
|
||||||
@ -207,7 +207,7 @@ static vector<Slice> match_mentions(Slice str) {
|
|||||||
}
|
}
|
||||||
uint32 next = 0;
|
uint32 next = 0;
|
||||||
if (ptr != end) {
|
if (ptr != end) {
|
||||||
next_utf8_unsafe(ptr, &next);
|
next_utf8_unsafe(ptr, &next, "match_mentions 2");
|
||||||
}
|
}
|
||||||
if (is_word_character(next)) {
|
if (is_word_character(next)) {
|
||||||
continue;
|
continue;
|
||||||
@ -233,7 +233,7 @@ static vector<Slice> match_bot_commands(Slice str) {
|
|||||||
|
|
||||||
if (ptr != begin) {
|
if (ptr != begin) {
|
||||||
uint32 prev;
|
uint32 prev;
|
||||||
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
|
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_bot_commands");
|
||||||
|
|
||||||
if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') {
|
if (is_word_character(prev) || prev == '/' || prev == '<' || prev == '>') {
|
||||||
ptr++;
|
ptr++;
|
||||||
@ -266,7 +266,7 @@ static vector<Slice> match_bot_commands(Slice str) {
|
|||||||
|
|
||||||
uint32 next = 0;
|
uint32 next = 0;
|
||||||
if (ptr != end) {
|
if (ptr != end) {
|
||||||
next_utf8_unsafe(ptr, &next);
|
next_utf8_unsafe(ptr, &next, "match_bot_commands 2");
|
||||||
}
|
}
|
||||||
if (is_word_character(next) || next == '/' || next == '<' || next == '>') {
|
if (is_word_character(next) || next == '/' || next == '<' || next == '>') {
|
||||||
continue;
|
continue;
|
||||||
@ -309,7 +309,7 @@ static vector<Slice> match_hashtags(Slice str) {
|
|||||||
|
|
||||||
if (ptr != begin) {
|
if (ptr != begin) {
|
||||||
uint32 prev;
|
uint32 prev;
|
||||||
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
|
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_hashtags");
|
||||||
|
|
||||||
if (is_hashtag_letter(prev, category)) {
|
if (is_hashtag_letter(prev, category)) {
|
||||||
ptr++;
|
ptr++;
|
||||||
@ -322,7 +322,7 @@ static vector<Slice> match_hashtags(Slice str) {
|
|||||||
bool was_letter = false;
|
bool was_letter = false;
|
||||||
while (ptr != end) {
|
while (ptr != end) {
|
||||||
uint32 code;
|
uint32 code;
|
||||||
auto next_ptr = next_utf8_unsafe(ptr, &code);
|
auto next_ptr = next_utf8_unsafe(ptr, &code, "match_hashtags 2");
|
||||||
if (!is_hashtag_letter(code, category)) {
|
if (!is_hashtag_letter(code, category)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -370,7 +370,7 @@ static vector<Slice> match_cashtags(Slice str) {
|
|||||||
|
|
||||||
if (ptr != begin) {
|
if (ptr != begin) {
|
||||||
uint32 prev;
|
uint32 prev;
|
||||||
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev);
|
next_utf8_unsafe(prev_utf8_unsafe(ptr), &prev, "match_cashtags");
|
||||||
|
|
||||||
if (is_hashtag_letter(prev, category) || prev == '$') {
|
if (is_hashtag_letter(prev, category) || prev == '$') {
|
||||||
ptr++;
|
ptr++;
|
||||||
@ -390,7 +390,7 @@ static vector<Slice> match_cashtags(Slice str) {
|
|||||||
|
|
||||||
if (cashtag_end != end) {
|
if (cashtag_end != end) {
|
||||||
uint32 code;
|
uint32 code;
|
||||||
next_utf8_unsafe(ptr, &code);
|
next_utf8_unsafe(ptr, &code, "match_cashtags 2");
|
||||||
if (is_hashtag_letter(code, category) || code == '$') {
|
if (is_hashtag_letter(code, category) || code == '$') {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -480,7 +480,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
const unsigned char *domain_end_ptr = begin + dot_pos;
|
const unsigned char *domain_end_ptr = begin + dot_pos;
|
||||||
while (domain_end_ptr != end) {
|
while (domain_end_ptr != end) {
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code);
|
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls");
|
||||||
if (code == '@') {
|
if (code == '@') {
|
||||||
last_at_ptr = domain_end_ptr;
|
last_at_ptr = domain_end_ptr;
|
||||||
}
|
}
|
||||||
@ -492,7 +492,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1;
|
domain_end_ptr = last_at_ptr == nullptr ? begin + dot_pos : last_at_ptr + 1;
|
||||||
while (domain_end_ptr != end) {
|
while (domain_end_ptr != end) {
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code);
|
auto next_ptr = next_utf8_unsafe(domain_end_ptr, &code, "match_urls 2");
|
||||||
if (!is_domain_symbol(code)) {
|
if (!is_domain_symbol(code)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -503,7 +503,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
while (domain_begin_ptr != begin) {
|
while (domain_begin_ptr != begin) {
|
||||||
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
|
domain_begin_ptr = prev_utf8_unsafe(domain_begin_ptr);
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code);
|
auto next_ptr = next_utf8_unsafe(domain_begin_ptr, &code, "match_urls 3");
|
||||||
if (last_at_ptr == nullptr ? !is_domain_symbol(code) : !is_user_data_symbol(code)) {
|
if (last_at_ptr == nullptr ? !is_domain_symbol(code) : !is_user_data_symbol(code)) {
|
||||||
domain_begin_ptr = next_ptr;
|
domain_begin_ptr = next_ptr;
|
||||||
break;
|
break;
|
||||||
@ -534,7 +534,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
auto path_end_ptr = url_end_ptr + 1;
|
auto path_end_ptr = url_end_ptr + 1;
|
||||||
while (path_end_ptr != end) {
|
while (path_end_ptr != end) {
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(path_end_ptr, &code);
|
auto next_ptr = next_utf8_unsafe(path_end_ptr, &code, "match_urls 4");
|
||||||
if (!is_path_symbol(code)) {
|
if (!is_path_symbol(code)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -559,7 +559,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
while (user_data_begin_ptr != begin) {
|
while (user_data_begin_ptr != begin) {
|
||||||
user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);
|
user_data_begin_ptr = prev_utf8_unsafe(user_data_begin_ptr);
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code);
|
auto next_ptr = next_utf8_unsafe(user_data_begin_ptr, &code, "match_urls 5");
|
||||||
if (!is_user_data_symbol(code)) {
|
if (!is_user_data_symbol(code)) {
|
||||||
user_data_begin_ptr = next_ptr;
|
user_data_begin_ptr = next_ptr;
|
||||||
break;
|
break;
|
||||||
@ -579,7 +579,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
while (protocol_begin_ptr != begin) {
|
while (protocol_begin_ptr != begin) {
|
||||||
protocol_begin_ptr = prev_utf8_unsafe(protocol_begin_ptr);
|
protocol_begin_ptr = prev_utf8_unsafe(protocol_begin_ptr);
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code);
|
auto next_ptr = next_utf8_unsafe(protocol_begin_ptr, &code, "match_urls 6");
|
||||||
if (!is_protocol_symbol(code)) {
|
if (!is_protocol_symbol(code)) {
|
||||||
protocol_begin_ptr = next_ptr;
|
protocol_begin_ptr = next_ptr;
|
||||||
break;
|
break;
|
||||||
@ -601,7 +601,7 @@ static vector<Slice> match_urls(Slice str) {
|
|||||||
auto prefix_end = prefix.uend();
|
auto prefix_end = prefix.uend();
|
||||||
auto prefix_back = prev_utf8_unsafe(prefix_end);
|
auto prefix_back = prev_utf8_unsafe(prefix_end);
|
||||||
uint32 code = 0;
|
uint32 code = 0;
|
||||||
next_utf8_unsafe(prefix_back, &code);
|
next_utf8_unsafe(prefix_back, &code, "match_urls 7");
|
||||||
if (is_word_character(code) || code == '/' || code == '#' || code == '@') {
|
if (is_word_character(code) || code == '/' || code == '#' || code == '@') {
|
||||||
is_bad = true;
|
is_bad = true;
|
||||||
}
|
}
|
||||||
@ -1117,7 +1117,7 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands, bool onl
|
|||||||
while (ptr != end && cnt > 0) {
|
while (ptr != end && cnt > 0) {
|
||||||
unsigned char c = ptr[0];
|
unsigned char c = ptr[0];
|
||||||
utf16_pos += 1 + (c >= 0xf0);
|
utf16_pos += 1 + (c >= 0xf0);
|
||||||
ptr = next_utf8_unsafe(ptr, nullptr);
|
ptr = next_utf8_unsafe(ptr, nullptr, "match_urls 8");
|
||||||
|
|
||||||
pos = static_cast<int32>(ptr - begin);
|
pos = static_cast<int32>(ptr - begin);
|
||||||
if (entity_begin == pos) {
|
if (entity_begin == pos) {
|
||||||
|
@ -530,7 +530,7 @@ class MessagesDbImpl : public MessagesDbSyncInterface {
|
|||||||
for (auto ptr = query.ubegin(), end = query.uend(); ptr < end;) {
|
for (auto ptr = query.ubegin(), end = query.uend(); ptr < end;) {
|
||||||
uint32 code;
|
uint32 code;
|
||||||
auto code_ptr = ptr;
|
auto code_ptr = ptr;
|
||||||
ptr = next_utf8_unsafe(ptr, &code);
|
ptr = next_utf8_unsafe(ptr, &code, "prepare_query");
|
||||||
if (is_word_character(code)) {
|
if (is_word_character(code)) {
|
||||||
if (!in_word) {
|
if (!in_word) {
|
||||||
in_word = true;
|
in_word = true;
|
||||||
|
@ -22083,14 +22083,14 @@ void MessagesManager::update_used_hashtags(DialogId dialog_id, const Message *m)
|
|||||||
}
|
}
|
||||||
while (utf16_pos < entity.offset && ptr < end) {
|
while (utf16_pos < entity.offset && ptr < end) {
|
||||||
utf16_pos += 1 + (ptr[0] >= 0xf0);
|
utf16_pos += 1 + (ptr[0] >= 0xf0);
|
||||||
ptr = next_utf8_unsafe(ptr, nullptr);
|
ptr = next_utf8_unsafe(ptr, nullptr, "update_used_hashtags");
|
||||||
}
|
}
|
||||||
CHECK(utf16_pos == entity.offset);
|
CHECK(utf16_pos == entity.offset);
|
||||||
auto from = ptr;
|
auto from = ptr;
|
||||||
|
|
||||||
while (utf16_pos < entity.offset + entity.length && ptr < end) {
|
while (utf16_pos < entity.offset + entity.length && ptr < end) {
|
||||||
utf16_pos += 1 + (ptr[0] >= 0xf0);
|
utf16_pos += 1 + (ptr[0] >= 0xf0);
|
||||||
ptr = next_utf8_unsafe(ptr, nullptr);
|
ptr = next_utf8_unsafe(ptr, nullptr, "update_used_hashtags 2");
|
||||||
}
|
}
|
||||||
CHECK(utf16_pos == entity.offset + entity.length);
|
CHECK(utf16_pos == entity.offset + entity.length);
|
||||||
auto to = ptr;
|
auto to = ptr;
|
||||||
|
@ -41,7 +41,7 @@ vector<string> Hints::get_words(Slice name) {
|
|||||||
auto end = name.uend();
|
auto end = name.uend();
|
||||||
while (pos != end) {
|
while (pos != end) {
|
||||||
uint32 code;
|
uint32 code;
|
||||||
pos = next_utf8_unsafe(pos, &code);
|
pos = next_utf8_unsafe(pos, &code, "get_words");
|
||||||
|
|
||||||
code = prepare_search_character(code);
|
code = prepare_search_character(code);
|
||||||
if (code == 0) {
|
if (code == 0) {
|
||||||
|
@ -82,7 +82,7 @@ static std::string clean_filename_part(Slice name, int max_length) {
|
|||||||
int size = 0;
|
int size = 0;
|
||||||
for (auto *it = name.ubegin(); it != name.uend() && size < max_length;) {
|
for (auto *it = name.ubegin(); it != name.uend() && size < max_length;) {
|
||||||
uint32 code;
|
uint32 code;
|
||||||
it = next_utf8_unsafe(it, &code);
|
it = next_utf8_unsafe(it, &code, "clean_filename_part");
|
||||||
if (!is_ok(code)) {
|
if (!is_ok(code)) {
|
||||||
code = ' ';
|
code = ' ';
|
||||||
}
|
}
|
||||||
|
@ -50,7 +50,7 @@ static void punycode(string &result, Slice part) {
|
|||||||
auto end = part.uend();
|
auto end = part.uend();
|
||||||
while (begin != end) {
|
while (begin != end) {
|
||||||
uint32 code;
|
uint32 code;
|
||||||
begin = next_utf8_unsafe(begin, &code);
|
begin = next_utf8_unsafe(begin, &code, "punycode");
|
||||||
if (code <= 127u) {
|
if (code <= 127u) {
|
||||||
result += to_lower(static_cast<char>(code));
|
result += to_lower(static_cast<char>(code));
|
||||||
processed++;
|
processed++;
|
||||||
|
@ -55,7 +55,7 @@ void add_word_transliterations(vector<string> &result, Slice word, bool allow_pa
|
|||||||
auto end = word.uend();
|
auto end = word.uend();
|
||||||
while (pos != end) {
|
while (pos != end) {
|
||||||
uint32 code;
|
uint32 code;
|
||||||
pos = next_utf8_unsafe(pos, &code);
|
pos = next_utf8_unsafe(pos, &code, "add_word_transliterations");
|
||||||
auto it = simple_rules.find(code);
|
auto it = simple_rules.find(code);
|
||||||
if (it != simple_rules.end()) {
|
if (it != simple_rules.end()) {
|
||||||
s += it->second;
|
s += it->second;
|
||||||
@ -88,7 +88,7 @@ void add_word_transliterations(vector<string> &result, Slice word, bool allow_pa
|
|||||||
}
|
}
|
||||||
|
|
||||||
uint32 code;
|
uint32 code;
|
||||||
pos = next_utf8_unsafe(pos, &code);
|
pos = next_utf8_unsafe(pos, &code, "add_word_transliterations 2");
|
||||||
auto it = simple_rules.find(code);
|
auto it = simple_rules.find(code);
|
||||||
if (it != simple_rules.end()) {
|
if (it != simple_rules.end()) {
|
||||||
s += it->second;
|
s += it->second;
|
||||||
|
@ -79,7 +79,7 @@ void append_utf8_character(string &str, uint32 ch) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
|
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source) {
|
||||||
uint32 a = ptr[0];
|
uint32 a = ptr[0];
|
||||||
if ((a & 0x80) == 0) {
|
if ((a & 0x80) == 0) {
|
||||||
if (code) {
|
if (code) {
|
||||||
@ -102,7 +102,7 @@ const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
|
|||||||
}
|
}
|
||||||
return ptr + 4;
|
return ptr + 4;
|
||||||
}
|
}
|
||||||
UNREACHABLE();
|
LOG(FATAL) << a << " " << source;
|
||||||
if (code) {
|
if (code) {
|
||||||
*code = 0;
|
*code = 0;
|
||||||
}
|
}
|
||||||
@ -115,7 +115,7 @@ string utf8_to_lower(Slice str) {
|
|||||||
auto end = str.uend();
|
auto end = str.uend();
|
||||||
while (pos != end) {
|
while (pos != end) {
|
||||||
uint32 code;
|
uint32 code;
|
||||||
pos = next_utf8_unsafe(pos, &code);
|
pos = next_utf8_unsafe(pos, &code, "utf8_to_lower");
|
||||||
append_utf8_character(result, unicode_to_lower(code));
|
append_utf8_character(result, unicode_to_lower(code));
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
|
@ -40,7 +40,7 @@ inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
|
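/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code if code is not nullptr; source is a call-site label written to the fatal log if the character can't be decoded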
|
||||||
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
|
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source);
|
||||||
|
|
||||||
/// truncates UTF-8 string to the given length in Unicode characters
|
/// truncates UTF-8 string to the given length in Unicode characters
|
||||||
template <class T>
|
template <class T>
|
||||||
|
Loading…
Reference in New Issue
Block a user