Remove usage of codecvt: now conversions are safe and 2-1000x faster.

GitOrigin-RevId: a947a25cf9991dc39dc87b8766409e9b1e987009
This commit is contained in:
levlam 2018-05-18 18:55:40 +03:00
parent 18818fd4ff
commit 2691c7fabf
2 changed files with 116 additions and 29 deletions

View File

@ -10,44 +10,96 @@ char disable_linker_warning_about_empty_file_wstring_convert_cpp TD_UNUSED;
#if TD_PORT_WINDOWS
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#include "td/utils/utf8.h"
#include "td/utils/port/wstring_convert.h"
#include <codecvt>
#include <locale>
#include <utility>
#include <cwchar>
namespace td {
namespace detail {

// Thin wrapper around a facet type: derives publicly from FacetType, forwards
// every constructor argument to it and exposes a public defaulted destructor.
// NOTE(review): presumably needed because standard codecvt facets declare a
// protected destructor and therefore cannot be owned directly by
// std::wstring_convert - confirm.
template <class FacetType>
class UsableFacet : public FacetType {
 public:
  // Perfect-forwards all arguments to the wrapped facet's constructor.
  template <class... CtorArgs>
  explicit UsableFacet(CtorArgs &&... ctor_args) : FacetType(std::forward<CtorArgs>(ctor_args)...) {
  }

  // Public destructor, so the wrapper can be destroyed by its owner.
  ~UsableFacet() = default;
};

}  // namespace detail
// Converts a UTF-8 encoded slice into a std::wstring of UTF-16 code units.
//
// Returns Status::Error("Wrong encoding") when check_utf8 rejects the input;
// otherwise returns the converted string (empty for empty input).
//
// The decoder below trusts check_utf8: it reads continuation bytes without
// any further bounds or range checks.
// NOTE(review): code points above U+FFFF are emitted as surrogate pairs, which
// assumes a 16-bit wchar_t (this file is compiled under TD_PORT_WINDOWS) - confirm.
Result<std::wstring> to_wstring(CSlice slice) {
  if (!check_utf8(slice)) {
    return Status::Error("Wrong encoding");
  }

  // Every non-continuation byte starts a code point and yields one UTF-16
  // unit; a 4-byte lead byte (0xf0..0xf7) yields a surrogate pair, so it is
  // counted one extra time.
  size_t wstring_len = 0;
  for (auto c : slice) {
    wstring_len += ((c & 0xc0) != 0x80) + ((c & 0xf8) == 0xf0);
  }

  std::wstring result(wstring_len, static_cast<wchar_t>(0));
  if (wstring_len) {
    wchar_t *res = &result[0];
    for (size_t i = 0; i < slice.size();) {
      unsigned int a = static_cast<unsigned char>(slice[i++]);
      if (a >= 0x80) {
        unsigned int b = static_cast<unsigned char>(slice[i++]);
        if (a >= 0xe0) {
          unsigned int c = static_cast<unsigned char>(slice[i++]);
          if (a >= 0xf0) {
            // 4-byte sequence: subtract 0x10000 and split into high/low surrogates.
            unsigned int d = static_cast<unsigned char>(slice[i++]);
            unsigned int val = ((a & 0x07) << 18) + ((b & 0x3f) << 12) + ((c & 0x3f) << 6) + (d & 0x3f) - 0x10000;
            *res++ = static_cast<wchar_t>(0xD800 + (val >> 10));
            *res++ = static_cast<wchar_t>(0xDC00 + (val & 0x3ff));
          } else {
            // 3-byte sequence.
            *res++ = static_cast<wchar_t>(((a & 0x0f) << 12) + ((b & 0x3f) << 6) + (c & 0x3f));
          }
        } else {
          // 2-byte sequence.
          *res++ = static_cast<wchar_t>(((a & 0x1f) << 6) + (b & 0x3f));
        }
      } else {
        // ASCII byte.
        *res++ = static_cast<wchar_t>(a);
      }
    }
    // The precomputed length must match the number of units written.
    CHECK(res == &result[0] + wstring_len);
  }
  return result;
}
// Converts `size` UTF-16 code units starting at `begin` into a UTF-8 string.
//
// Returns Status::Error("Wrong encoding") when a surrogate code unit is not
// part of a well-formed pair (a high surrogate 0xD800..0xDBFF immediately
// followed by a low surrogate 0xDC00..0xDFFF); otherwise returns the encoded
// string (empty for size == 0).
Result<string> from_wstring(const wchar_t *begin, size_t size) {
  // Pass 1: validate surrogate pairs and compute the exact output length.
  size_t result_len = 0;
  for (size_t i = 0; i < size; i++) {
    unsigned int cur = begin[i];
    if ((cur & 0xF800) == 0xD800) {
      // A surrogate needs a partner inside the buffer. Checking i + 1 (not i)
      // keeps a trailing lone surrogate from reading begin[size].
      if (i + 1 < size) {
        unsigned int next = begin[++i];
        if ((next & 0xFC00) == 0xDC00 && (cur & 0x400) == 0) {
          result_len += 4;
          continue;
        }
      }

      return Status::Error("Wrong encoding");
    }
    result_len += 1 + (cur >= 0x80) + (cur >= 0x800);
  }

  // Pass 2: encode. The input was validated above, so every surrogate seen
  // here is a high surrogate followed by its low surrogate.
  std::string result(result_len, '\0');
  if (result_len) {
    char *res = &result[0];
    for (size_t i = 0; i < size; i++) {
      unsigned int cur = begin[i];
      // TODO conversion unsigned int -> signed char is implementation defined
      if (cur <= 0x7f) {
        *res++ = static_cast<char>(cur);
      } else if (cur <= 0x7ff) {
        *res++ = static_cast<char>(0xc0 | (cur >> 6));
        *res++ = static_cast<char>(0x80 | (cur & 0x3f));
      } else if ((cur & 0xF800) != 0xD800) {
        *res++ = static_cast<char>(0xe0 | (cur >> 12));
        *res++ = static_cast<char>(0x80 | ((cur >> 6) & 0x3f));
        *res++ = static_cast<char>(0x80 | (cur & 0x3f));
      } else {
        // Surrogate pair: rebuild the code point above U+FFFF.
        unsigned int next = begin[++i];
        unsigned int val = ((cur - 0xD800) << 10) + next - 0xDC00 + 0x10000;

        *res++ = static_cast<char>(0xf0 | (val >> 18));
        *res++ = static_cast<char>(0x80 | ((val >> 12) & 0x3f));
        *res++ = static_cast<char>(0x80 | ((val >> 6) & 0x3f));
        *res++ = static_cast<char>(0x80 | (val & 0x3f));
      }
    }
  }
  return result;
}
Result<string> from_wstring(const std::wstring &str) {
@ -55,7 +107,7 @@ Result<string> from_wstring(const std::wstring &str) {
}
// Converts a null-terminated wide string to UTF-8. The length is measured with
// std::wcslen and the work is delegated to the (pointer, size) overload, whose
// result (including any error) is returned unchanged.
Result<string> from_wstring(const wchar_t *begin) {
  return from_wstring(begin, std::wcslen(begin));
}
} // namespace td

View File

@ -19,6 +19,7 @@
#include "td/utils/Slice.h"
#include "td/utils/StringBuilder.h"
#include "td/utils/tests.h"
#include "td/utils/utf8.h"
#include <atomic>
#include <clocale>
@ -270,7 +271,7 @@ static void test_idn_to_ascii_one(string host, string result) {
TEST(Misc, idn_to_ascii) {
test_idn_to_ascii_one("::::::::::::::::::::::::::::::::::::::@/", "::::::::::::::::::::::::::::::::::::::@/");
test_idn_to_ascii_one("%30", "%30");
test_idn_to_ascii_one("", "");
test_idn_to_ascii_one("%30", "%30");
test_idn_to_ascii_one("127.0.0.1", "127.0.0.1");
test_idn_to_ascii_one("fe80::", "fe80::");
@ -297,4 +298,38 @@ TEST(Misc, idn_to_ascii) {
test_idn_to_ascii_one("win-2k12r2-addc.阿伯测阿伯测ad.hai.com", "win-2k12r2-addc.xn--ad-tl3ca3569aba8944eca.hai.com");
test_idn_to_ascii_one("✌️.ws", "xn--7bi.ws");
test_idn_to_ascii_one("", "xn--59h");
ASSERT_TRUE(idn_to_ascii("\xc0").is_error());
}
#if TD_WINDOWS
// Round-trip check: converts `str` with to_wstring, converts the result back
// with from_wstring and asserts that the original string is recovered.
static void test_to_wstring_one(string str) {
  auto wide = to_wstring(str).ok();
  auto round_trip = from_wstring(wide).ok();
  ASSERT_STREQ(str, round_trip);
}
// Exercises to_wstring/from_wstring: round trips of simple strings and of
// every code point outside the surrogate range, rejection of an invalid UTF-8
// byte, and rejection of malformed surrogate sequences.
TEST(Misc, to_wstring) {
  test_to_wstring_one("");
  for (int round = 0; round < 10; round++) {
    test_to_wstring_one("test");
    test_to_wstring_one("тест");
  }

  // One string holding every code point in [0, 0xD7FF] and [0xE000, 0x10FFFF].
  string all_code_points;
  for (uint32 code = 0; code < 0xD800; code++) {
    append_utf8_character(all_code_points, code);
  }
  for (uint32 code = 0xE000; code < 0x110000; code++) {
    append_utf8_character(all_code_points, code);
  }
  test_to_wstring_one(all_code_points);

  ASSERT_TRUE(to_wstring("\xc0").is_error());

  // The emoji converts to exactly two wide units.
  auto emoji = to_wstring("🏟").ok();
  ASSERT_TRUE(from_wstring(emoji).ok() == "🏟");
  ASSERT_TRUE(emoji.size() == 2);

  // emoji2 becomes {unit0, unit0}; emoji becomes {unit1, unit1}.
  auto emoji2 = emoji;
  emoji2[1] = emoji2[0];
  emoji[0] = emoji[1];
  ASSERT_TRUE(from_wstring(emoji).is_error());
  ASSERT_TRUE(from_wstring(emoji2).is_error());

  // emoji2 becomes {unit1, unit0}: the two units in swapped order.
  emoji2[0] = emoji[0];
  ASSERT_TRUE(from_wstring(emoji2).is_error());
}
#endif