// // Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2019 // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // #include "td/utils/utf8.h" #include "td/utils/logging.h" // for UNREACHABLE #include "td/utils/unicode.h" namespace td { bool check_utf8(CSlice str) { const char *data = str.data(); const char *data_end = data + str.size(); do { unsigned int a = static_cast<unsigned char>(*data++); if ((a & 0x80) == 0) { if (data == data_end + 1) { return true; } continue; } #define ENSURE(condition) \ if (!(condition)) { \ return false; \ } ENSURE((a & 0x40) != 0); unsigned int b = static_cast<unsigned char>(*data++); ENSURE((b & 0xc0) == 0x80); if ((a & 0x20) == 0) { ENSURE((a & 0x1e) > 0); continue; } unsigned int c = static_cast<unsigned char>(*data++); ENSURE((c & 0xc0) == 0x80); if ((a & 0x10) == 0) { int x = (((a & 0x0f) << 6) | (b & 0x20)); ENSURE(x != 0 && x != 0x360); // surrogates continue; } unsigned int d = static_cast<unsigned char>(*data++); ENSURE((d & 0xc0) == 0x80); if ((a & 0x08) == 0) { int t = (((a & 0x07) << 6) | (b & 0x30)); ENSURE(0 < t && t < 0x110); // end of unicode continue; } return false; #undef ENSURE } while (true); UNREACHABLE(); return false; } void append_utf8_character(string &str, uint32 ch) { if (ch <= 0x7f) { str.push_back(static_cast<char>(ch)); } else if (ch <= 0x7ff) { str.push_back(static_cast<char>(0xc0 | (ch >> 6))); // implementation-defined str.push_back(static_cast<char>(0x80 | (ch & 0x3f))); } else if (ch <= 0xffff) { str.push_back(static_cast<char>(0xe0 | (ch >> 12))); // implementation-defined str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f))); str.push_back(static_cast<char>(0x80 | (ch & 0x3f))); } else { str.push_back(static_cast<char>(0xf0 | (ch >> 18))); // implementation-defined str.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3f))); str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3f))); str.push_back(static_cast<char>(0x80 | (ch & 0x3f))); } } const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source) { uint32 a = ptr[0]; if ((a & 0x80) == 0) { if (code) { *code = a; } return ptr + 1; } else if ((a & 0x20) == 0) { if (code) { *code = ((a & 0x1f) << 6) | (ptr[1] & 0x3f); } return ptr + 2; } else if ((a & 0x10) == 0) { if (code) { *code = ((a & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f); } return ptr + 3; } else if ((a & 0x08) == 0) { if (code) { *code = ((a & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f); } return ptr + 4; } LOG(FATAL) << a << " " << source; if (code) { *code = 0; } return ptr; } string utf8_to_lower(Slice str) { string result; auto pos = str.ubegin(); auto end = str.uend(); while (pos != end) { uint32 code; pos = next_utf8_unsafe(pos, &code, "utf8_to_lower"); append_utf8_character(result, unicode_to_lower(code)); } return result; } } // namespace td