197 lines
5.0 KiB
C++
197 lines
5.0 KiB
C++
//
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2024
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
#include "td/utils/utf8.h"
|
|
|
|
#include "td/utils/misc.h"
|
|
#include "td/utils/SliceBuilder.h"
|
|
#include "td/utils/unicode.h"
|
|
|
|
namespace td {
|
|
|
|
// Returns true if str is a well-formed UTF-8 byte sequence.
//
// The scan stops successfully only when an ASCII byte (< 0x80) has just been read from
// position str.size(), i.e. one byte past the reported contents.
// NOTE(review): this presumably relies on CSlice guaranteeing a terminating '\0' at
// data()[size()] - confirm against the CSlice contract.
// ASCII bytes at earlier positions (including embedded zero bytes) are accepted.
//
// Rejected inputs: stray continuation bytes, truncated sequences, overlong encodings,
// UTF-16 surrogates (U+D800..U+DFFF), code points above U+10FFFF and lead bytes of the
// form 11111xxx.
bool check_utf8(CSlice str) {
  const char *cursor = str.data();
  const char *const past_terminator = cursor + str.size() + 1;

  // Reads the byte under the cursor as an unsigned value and advances the cursor.
  auto next_byte = [&cursor]() -> uint32 {
    return static_cast<unsigned char>(*cursor++);
  };
  // A continuation byte has the bit pattern 10xxxxxx.
  auto is_continuation = [](uint32 byte) {
    return (byte & 0xc0) == 0x80;
  };

  while (true) {
    const uint32 lead = next_byte();
    if ((lead & 0x80) == 0) {
      // single-byte (ASCII) character; succeed once the byte at position size() was consumed
      if (cursor == past_terminator) {
        return true;
      }
      continue;
    }

    if ((lead & 0x40) == 0) {
      // 10xxxxxx can't start a character
      return false;
    }

    const uint32 second = next_byte();
    if (!is_continuation(second)) {
      return false;
    }
    if ((lead & 0x20) == 0) {
      // 110xxxxx: two-byte character; its payload must not fit into 7 bits
      if ((lead & 0x1e) == 0) {
        return false;
      }
      continue;
    }

    const uint32 third = next_byte();
    if (!is_continuation(third)) {
      return false;
    }
    if ((lead & 0x10) == 0) {
      // 1110xxxx: three-byte character; inspect the 5 most significant payload bits
      const uint32 high_bits = ((lead & 0x0f) << 6) | (second & 0x20);
      if (high_bits == 0 || high_bits == 0x360) {
        // overlong encoding or surrogates
        return false;
      }
      continue;
    }

    const uint32 fourth = next_byte();
    if (!is_continuation(fourth)) {
      return false;
    }
    if ((lead & 0x08) != 0) {
      // 11111xxx is never a valid lead byte
      return false;
    }

    // 11110xxx: four-byte character; inspect the 5 most significant payload bits
    const uint32 plane_bits = ((lead & 0x07) << 6) | (second & 0x30);
    if (plane_bits == 0 || plane_bits >= 0x110) {
      // overlong encoding or end of unicode
      return false;
    }
  }

  UNREACHABLE();
  return false;
}
|
|
|
|
const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code) {
|
|
uint32 a = ptr[0];
|
|
if ((a & 0x80) == 0) {
|
|
*code = a;
|
|
return ptr + 1;
|
|
} else if ((a & 0x20) == 0) {
|
|
*code = ((a & 0x1f) << 6) | (ptr[1] & 0x3f);
|
|
return ptr + 2;
|
|
} else if ((a & 0x10) == 0) {
|
|
*code = ((a & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f);
|
|
return ptr + 3;
|
|
} else if ((a & 0x08) == 0) {
|
|
*code = ((a & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f);
|
|
return ptr + 4;
|
|
}
|
|
UNREACHABLE();
|
|
*code = 0;
|
|
return ptr;
|
|
}
|
|
|
|
// Writes the UTF-8 encoding of code to the buffer at ptr and returns a pointer to the byte
// after the last written one.
//
// Between 1 and 4 bytes are written; the buffer size isn't checked, so the caller must
// provide enough space. Every code above 0xffff is written as 4 bytes without a range check.
unsigned char *append_utf8_character_unsafe(unsigned char *ptr, uint32 code) {
  // Builds a lead byte from the length marker and the most significant payload bits.
  auto lead_byte = [](uint32 marker, uint32 bits) {
    return static_cast<unsigned char>(marker | bits);
  };
  // Builds a continuation byte 10xxxxxx from the 6 lowest bits of the argument.
  auto tail_byte = [](uint32 bits) {
    return static_cast<unsigned char>(0x80 | (bits & 0x3f));
  };

  if (code <= 0x7f) {
    ptr[0] = static_cast<unsigned char>(code);
    return ptr + 1;
  }
  if (code <= 0x7ff) {
    ptr[0] = lead_byte(0xc0, code >> 6);
    ptr[1] = tail_byte(code);
    return ptr + 2;
  }
  if (code <= 0xffff) {
    ptr[0] = lead_byte(0xe0, code >> 12);
    ptr[1] = tail_byte(code >> 6);
    ptr[2] = tail_byte(code);
    return ptr + 3;
  }

  ptr[0] = lead_byte(0xf0, code >> 18);
  ptr[1] = tail_byte(code >> 12);
  ptr[2] = tail_byte(code >> 6);
  ptr[3] = tail_byte(code);
  return ptr + 4;
}
|
|
|
|
// Returns a copy of str in which every code point is replaced by unicode_to_lower() of it.
// The input is decoded with next_utf8_unsafe(), so it must already be well-formed UTF-8.
string utf8_to_lower(Slice str) {
  string lowered;
  const auto stop = str.uend();
  for (auto cursor = str.ubegin(); cursor != stop;) {
    uint32 code_point = 0;
    cursor = next_utf8_unsafe(cursor, &code_point);
    append_utf8_character(lowered, unicode_to_lower(code_point));
  }
  return lowered;
}
|
|
|
|
// Splits str into the list of words used for search.
//
// Every code point is first mapped through prepare_search_character(): a result of 0 means
// the code point is dropped, a result of ' ' ends the current word, and any other result
// is passed through remove_diacritics() and appended to the current word.
// The input is decoded with next_utf8_unsafe(), so it must already be well-formed UTF-8.
vector<string> utf8_get_search_words(Slice str) {
  vector<string> words;
  string current_word;
  bool has_current_word = false;

  // Moves the word collected so far, if any, into the result.
  auto flush_word = [&] {
    if (!has_current_word) {
      return;
    }
    words.push_back(std::move(current_word));
    current_word.clear();
    has_current_word = false;
  };

  const auto stop = str.uend();
  for (auto cursor = str.ubegin(); cursor != stop;) {
    uint32 code_point = 0;
    cursor = next_utf8_unsafe(cursor, &code_point);

    const uint32 prepared = prepare_search_character(code_point);
    if (prepared == 0) {
      // the character is ignored
      continue;
    }
    if (prepared != ' ') {
      has_current_word = true;
      append_utf8_character(current_word, remove_diacritics(prepared));
      continue;
    }

    // word separator
    flush_word();
  }

  flush_word();
  return words;
}
|
|
|
|
// Returns the search words of str, as produced by utf8_get_search_words(), combined into a
// single string by implode() with its default arguments.
string utf8_prepare_search_string(Slice str) {
  auto search_words = utf8_get_search_words(str);
  return implode(std::move(search_words));
}
|
|
|
|
// Returns data unchanged if it passes check_utf8(); otherwise returns the text
// "url_decode(<url_encode(data)>)", so that the result doesn't contain the raw invalid bytes.
string utf8_encode(CSlice data) {
  if (!check_utf8(data)) {
    return PSTRING() << "url_decode(" << url_encode(data) << ')';
  }
  return data.str();
}
|
|
|
|
// Returns the number of UTF-16 code units needed to represent the UTF-8 string str.
// Each character contributes one unit for its first byte and one more unit if that byte has
// the form 11110xxx, i.e. starts a 4-byte sequence, which is a surrogate pair in UTF-16.
size_t utf8_utf16_length(Slice str) {
  size_t code_units = 0;
  for (auto byte : str) {
    code_units += is_utf8_character_first_code_unit(byte);
    if ((byte & 0xf8) == 0xf0) {
      // 4 bytes in symbol => surrogate pair
      code_units++;
    }
  }
  return code_units;
}
|
|
|
|
// Returns the longest prefix of the UTF-8 string str which consists of whole characters and
// is at most length UTF-16 code units long.
//
// A character costs 1 code unit, or 2 code units if its first byte is >= 0xf0 (4-byte
// sequence, i.e. a surrogate pair in UTF-16). A character is kept only if its whole cost
// fits into the remaining budget; in particular, a surrogate pair is never split.
//
// Previously the budget was decremented twice for a surrogate pair even if only one code
// unit remained, which wrapped the unsigned counter around to a huge value and caused the
// whole string to be returned untruncated.
Slice utf8_utf16_truncate(Slice str, size_t length) {
  for (size_t i = 0; i < str.size(); i++) {
    auto c = static_cast<unsigned char>(str[i]);
    if (!is_utf8_character_first_code_unit(c)) {
      // continuation byte of an already accepted character
      continue;
    }

    // >= 4 bytes in symbol => surrogate pair
    const size_t character_length = c >= 0xf0 ? 2 : 1;
    if (length < character_length) {
      // the character doesn't fit; cut right before it
      return str.substr(0, i);
    }
    length -= character_length;
  }
  return str;
}
|
|
|
|
// Returns the suffix of the UTF-8 string str that remains after removing the prefix which
// utf8_utf16_truncate(str, offset) returns, i.e. skips offset UTF-16 code units.
Slice utf8_utf16_substr(Slice str, size_t offset) {
  if (offset != 0) {
    auto skipped_prefix = utf8_utf16_truncate(str, offset);
    return str.substr(skipped_prefix.size());
  }
  // nothing to skip
  return str;
}
|
|
|
|
// Returns the part of the UTF-8 string str obtained by first skipping offset UTF-16 code
// units and then truncating the remainder to length UTF-16 code units.
Slice utf8_utf16_substr(Slice str, size_t offset, size_t length) {
  auto tail = utf8_utf16_substr(str, offset);
  return utf8_utf16_truncate(tail, length);
}
|
|
|
|
} // namespace td
|