2018-12-31 20:04:05 +01:00
|
|
|
//
|
2022-12-31 22:28:08 +01:00
|
|
|
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2023
|
2018-12-31 20:04:05 +01:00
|
|
|
//
|
|
|
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
|
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
//
|
|
|
|
#pragma once
|
|
|
|
|
2018-12-27 20:24:44 +01:00
|
|
|
#include "td/utils/common.h"
|
2018-12-31 20:04:05 +01:00
|
|
|
#include "td/utils/Slice.h"
|
|
|
|
|
|
|
|
namespace td {
|
|
|
|
|
|
|
|
/// checks UTF-8 string for correctness
|
|
|
|
/// only declared in this header; NOTE(review): presumably returns true when str is well-formed UTF-8 - confirm
/// against the definition
bool check_utf8(CSlice str);
|
|
|
|
|
|
|
|
/// tells whether a code unit begins a UTF-8 character, i.e. is anything except a 10xxxxxx continuation byte
inline bool is_utf8_character_first_code_unit(unsigned char c) {
  // the two most significant bits of a continuation byte are exactly 1 and 0
  const bool is_continuation = (c >> 6) == 0x2;
  return !is_continuation;
}
|
|
|
|
|
|
|
|
/// returns length of UTF-8 string in characters
|
|
|
|
inline size_t utf8_length(Slice str) {
|
|
|
|
size_t result = 0;
|
|
|
|
for (auto c : str) {
|
|
|
|
result += is_utf8_character_first_code_unit(c);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2019-10-07 02:41:04 +02:00
|
|
|
/// returns length of UTF-8 string in UTF-16 code units
|
2022-09-22 11:08:34 +02:00
|
|
|
/// only declared in this header; NOTE(review): presumably characters that need a surrogate pair in UTF-16 count as
/// two code units - confirm against the definition
size_t utf8_utf16_length(Slice str);
|
2019-10-07 02:41:04 +02:00
|
|
|
|
2018-12-31 20:04:05 +01:00
|
|
|
/// appends a Unicode character using UTF-8 encoding; str only has to provide push_back(char);
/// ch is not range-checked: every value above 0xffff is written with the 4-byte pattern
template <class T>
void append_utf8_character(T &str, uint32 ch) {
  // appends one continuation byte 10xxxxxx carrying the six bits of ch that start at bit `shift`
  const auto push_continuation = [&str, ch](int shift) {
    str.push_back(static_cast<char>(0x80 | ((ch >> shift) & 0x3f)));
  };

  if (ch <= 0x7f) {
    // 1 byte: 0xxxxxxx
    str.push_back(static_cast<char>(ch));
    return;
  }
  if (ch <= 0x7ff) {
    // 2 bytes: 110xxxxx 10xxxxxx
    str.push_back(static_cast<char>(0xc0 | (ch >> 6)));  // implementation-defined
    push_continuation(0);
    return;
  }
  if (ch <= 0xffff) {
    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    str.push_back(static_cast<char>(0xe0 | (ch >> 12)));  // implementation-defined
    push_continuation(6);
    push_continuation(0);
    return;
  }
  // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  str.push_back(static_cast<char>(0xf0 | (ch >> 18)));  // implementation-defined
  push_continuation(12);
  push_continuation(6);
  push_continuation(0);
}
|
2018-12-31 20:04:05 +01:00
|
|
|
|
|
|
|
/// moves pointer one UTF-8 character back
|
|
|
|
inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
|
|
|
|
while (!is_utf8_character_first_code_unit(*--ptr)) {
|
|
|
|
// pass
|
|
|
|
}
|
|
|
|
return ptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
|
2022-08-19 15:37:44 +02:00
|
|
|
/// only declared in this header; NOTE(review): the _unsafe suffix suggests that neither bounds nor validity of the
/// input are checked and that code must be non-null - confirm against the definition
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code);
|
2018-12-31 20:04:05 +01:00
|
|
|
|
|
|
|
/// truncates UTF-8 string to the given length in Unicode characters
|
|
|
|
template <class T>
|
|
|
|
T utf8_truncate(T str, size_t length) {
|
|
|
|
if (str.size() > length) {
|
|
|
|
for (size_t i = 0; i < str.size(); i++) {
|
|
|
|
if (is_utf8_character_first_code_unit(static_cast<unsigned char>(str[i]))) {
|
|
|
|
if (length == 0) {
|
|
|
|
return str.substr(0, i);
|
|
|
|
} else {
|
|
|
|
length--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// truncates UTF-8 string to the given length given in UTF-16 code units
|
2022-09-22 11:08:34 +02:00
|
|
|
/// only declared in this header; NOTE(review): what happens when the limit falls in the middle of a character that
/// takes two UTF-16 code units can't be told from here - check the definition
Slice utf8_utf16_truncate(Slice str, size_t length);
|
2018-12-31 20:04:05 +01:00
|
|
|
|
|
|
|
/// returns the part of UTF-8 string that remains after skipping its first `offset` Unicode characters;
/// the result is empty if the string contains at most `offset` characters
template <class T>
T utf8_substr(T str, size_t offset) {
  if (offset != 0) {
    // size in code units of the prefix consisting of the first `offset` characters
    const auto prefix_size = utf8_truncate(str, offset).size();
    return str.substr(prefix_size);
  }
  return str;
}
|
|
|
|
|
|
|
|
/// returns at most `length` Unicode characters of UTF-8 string, starting from the character with index `offset`
template <class T>
T utf8_substr(T str, size_t offset, size_t length) {
  // skip the first `offset` characters, then cut the remainder down to `length` characters
  return utf8_truncate(utf8_substr(str, offset), length);
}
|
|
|
|
|
2022-09-22 11:08:34 +02:00
|
|
|
/// only declared in this header; NOTE(review): presumably returns the part of UTF-8 string that starts at the given
/// offset measured in UTF-16 code units - confirm against the definition
Slice utf8_utf16_substr(Slice str, size_t offset);
|
2018-12-31 20:04:05 +01:00
|
|
|
|
2022-09-22 11:08:34 +02:00
|
|
|
/// only declared in this header; NOTE(review): presumably offset and length are both measured in UTF-16 code units -
/// confirm against the definition
Slice utf8_utf16_substr(Slice str, size_t offset, size_t length);
|
2018-12-31 20:04:05 +01:00
|
|
|
|
|
|
|
/// Returns UTF-8 string converted to lower case.
|
|
|
|
/// Only declared in this header. NOTE(review): which case mapping is applied, and how malformed input is treated,
/// can't be told from here - check the definition.
string utf8_to_lower(Slice str);
|
|
|
|
|
2022-10-09 18:00:14 +02:00
|
|
|
/// Returns UTF-8 string split by words for search.
|
2022-09-27 13:16:09 +02:00
|
|
|
/// Only declared in this header. NOTE(review): the word separation rules and any normalization of the returned words
/// are not visible here - check the definition.
vector<string> utf8_get_search_words(Slice str);
|
|
|
|
|
|
|
|
/// Returns UTF-8 string prepared for search, leaving only digits and lowercased letters.
|
|
|
|
/// Only declared in this header. NOTE(review): how the removed characters are replaced (dropped or turned into
/// separators) is not visible here - check the definition.
string utf8_prepare_search_string(Slice str);
|
|
|
|
|
2022-05-31 14:08:16 +02:00
|
|
|
/// Returns valid UTF-8 representation of the string.
|
|
|
|
/// Only declared in this header. NOTE(review): how bytes that are not valid UTF-8 are represented in the result is
/// not visible here - check the definition.
string utf8_encode(CSlice data);
|
|
|
|
|
2018-12-31 20:04:05 +01:00
|
|
|
} // namespace td
|