From 2691c7fabf29a1102b5328b0d0d5fd129d8db78a Mon Sep 17 00:00:00 2001
From: levlam
Date: Fri, 18 May 2018 18:55:40 +0300
Subject: [PATCH] Remove usage of codecvt: now conversions are safe and 2-1000x faster.

GitOrigin-RevId: a947a25cf9991dc39dc87b8766409e9b1e987009
---
Reviewer notes (text placed between the three-dash marker and the first
"diff --git" line is neither part of the commit message nor of the patch):

* from_wstring(const wchar_t *, size_t): the length-computing pass guarded the
  surrogate look-ahead with "i < size". The enclosing loop condition already
  guarantees that, so a high surrogate in the last position caused a read of
  begin[size]. The guard is now "i + 1 < size"; an unpaired trailing surrogate
  is still rejected with "Wrong encoding", just without the out-of-bounds read.
  Because of this edit the post-image hash on the "index" line of
  wstring_convert.cpp no longer describes the resulting blob.
* The copy of this patch under review had its line breaks collapsed and every
  <...> token removed. Line structure was rebuilt so that all hunk headers and
  the diffstat below add up. Template arguments and header names were
  re-inserted from context. On added lines they follow from the surrounding
  code; on removed and context lines (the three dropped system includes, the
  UsableFacet / wstring_convert declarations, the Result<...> return types and
  the two system includes shown as context in misc.cpp) they are assumptions.
  TODO: confirm against the upstream tree before applying.
* The author's e-mail address was lost the same way and has not been restored.

 tdutils/td/utils/port/wstring_convert.cpp | 108 ++++++++++++++++------
 tdutils/test/misc.cpp                     |  37 +++++++-
 2 files changed, 116 insertions(+), 29 deletions(-)

diff --git a/tdutils/td/utils/port/wstring_convert.cpp b/tdutils/td/utils/port/wstring_convert.cpp
index 8dd83cc76..b997c79e4 100644
--- a/tdutils/td/utils/port/wstring_convert.cpp
+++ b/tdutils/td/utils/port/wstring_convert.cpp
@@ -10,44 +10,96 @@ char disable_linker_warning_about_empty_file_wstring_convert_cpp TD_UNUSED;
 
 #if TD_PORT_WINDOWS
 
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#include "td/utils/utf8.h"
 
-#include "td/utils/port/wstring_convert.h"
-
-#include <codecvt>
-#include <locale>
-#include <utility>
+#include <cwchar>
 
 namespace td {
 
-namespace detail {
-template <class Facet>
-class UsableFacet : public Facet {
- public:
-  template <class... Args>
-  explicit UsableFacet(Args &&... args) : Facet(std::forward<Args>(args)...) {
-  }
-  ~UsableFacet() = default;
-};
-}  // namespace detail
-
 Result<std::wstring> to_wstring(CSlice slice) {
-  // TODO(perf): optimize
-  std::wstring_convert<detail::UsableFacet<std::codecvt_utf8_utf16<wchar_t>>> converter;
-  auto res = converter.from_bytes(slice.begin(), slice.end());
-  if (converter.converted() != slice.size()) {
+  if (!check_utf8(slice)) {
     return Status::Error("Wrong encoding");
   }
-  return res;
+
+  size_t wstring_len = 0;
+  for (auto c : slice) {
+    wstring_len += ((c & 0xc0) != 0x80) + ((c & 0xf8) == 0xf0);
+  }
+
+  std::wstring result(wstring_len, static_cast<wchar_t>(0));
+  if (wstring_len) {
+    wchar_t *res = &result[0];
+    for (size_t i = 0; i < slice.size();) {
+      unsigned int a = static_cast<unsigned char>(slice[i++]);
+      if (a >= 0x80) {
+        unsigned int b = static_cast<unsigned char>(slice[i++]);
+        if (a >= 0xe0) {
+          unsigned int c = static_cast<unsigned char>(slice[i++]);
+          if (a >= 0xf0) {
+            unsigned int d = static_cast<unsigned char>(slice[i++]);
+            unsigned int val = ((a & 0x07) << 18) + ((b & 0x3f) << 12) + ((c & 0x3f) << 6) + (d & 0x3f) - 0x10000;
+            *res++ = static_cast<wchar_t>(0xD800 + (val >> 10));
+            *res++ = static_cast<wchar_t>(0xDC00 + (val & 0x3ff));
+          } else {
+            *res++ = static_cast<wchar_t>(((a & 0x0f) << 12) + ((b & 0x3f) << 6) + (c & 0x3f));
+          }
+        } else {
+          *res++ = static_cast<wchar_t>(((a & 0x1f) << 6) + (b & 0x3f));
+        }
+      } else {
+        *res++ = static_cast<wchar_t>(a);
+      }
+    }
+    CHECK(res == &result[0] + wstring_len);
+  }
+  return result;
 }
 
 Result<string> from_wstring(const wchar_t *begin, size_t size) {
-  std::wstring_convert<detail::UsableFacet<std::codecvt_utf8_utf16<wchar_t>>> converter;
-  auto res = converter.to_bytes(begin, begin + size);
-  if (converter.converted() != size) {
-    return Status::Error("Wrong encoding");
+  size_t result_len = 0;
+  for (size_t i = 0; i < size; i++) {
+    unsigned int cur = begin[i];
+    if ((cur & 0xF800) == 0xD800) {
+      if (i + 1 < size) {
+        unsigned int next = begin[++i];
+        if ((next & 0xFC00) == 0xDC00 && (cur & 0x400) == 0) {
+          result_len += 4;
+          continue;
+        }
+      }
+
+      return Status::Error("Wrong encoding");
+    }
+    result_len += 1 + (cur >= 0x80) + (cur >= 0x800);
   }
-  return res;
+
+  std::string result(result_len, '\0');
+  if (result_len) {
+    char *res = &result[0];
+    for (size_t i = 0; i < size; i++) {
+      unsigned int cur = begin[i];
+      // TODO conversion unsigned int -> signed char is implementation defined
+      if (cur <= 0x7f) {
+        *res++ = static_cast<char>(cur);
+      } else if (cur <= 0x7ff) {
+        *res++ = static_cast<char>(0xc0 | (cur >> 6));
+        *res++ = static_cast<char>(0x80 | (cur & 0x3f));
+      } else if ((cur & 0xF800) != 0xD800) {
+        *res++ = static_cast<char>(0xe0 | (cur >> 12));
+        *res++ = static_cast<char>(0x80 | ((cur >> 6) & 0x3f));
+        *res++ = static_cast<char>(0x80 | (cur & 0x3f));
+      } else {
+        unsigned int next = begin[++i];
+        unsigned int val = ((cur - 0xD800) << 10) + next - 0xDC00 + 0x10000;
+
+        *res++ = static_cast<char>(0xf0 | (val >> 18));
+        *res++ = static_cast<char>(0x80 | ((val >> 12) & 0x3f));
+        *res++ = static_cast<char>(0x80 | ((val >> 6) & 0x3f));
+        *res++ = static_cast<char>(0x80 | (val & 0x3f));
+      }
+    }
+  }
+  return result;
 }
 
 Result<string> from_wstring(const std::wstring &str) {
@@ -55,7 +107,7 @@ Result<string> from_wstring(const std::wstring &str) {
 }
 
 Result<string> from_wstring(const wchar_t *begin) {
-  return from_wstring(begin, wcslen(begin));
+  return from_wstring(begin, std::wcslen(begin));
 }
 
 }  // namespace td
diff --git a/tdutils/test/misc.cpp b/tdutils/test/misc.cpp
index 56cd5af09..408c519ac 100644
--- a/tdutils/test/misc.cpp
+++ b/tdutils/test/misc.cpp
@@ -19,6 +19,7 @@
 #include "td/utils/Slice.h"
 #include "td/utils/StringBuilder.h"
 #include "td/utils/tests.h"
+#include "td/utils/utf8.h"
 
 #include <atomic>
 #include <clocale>
@@ -270,7 +271,7 @@ static void test_idn_to_ascii_one(string host, string result) {
 
 TEST(Misc, idn_to_ascii) {
   test_idn_to_ascii_one("::::::::::::::::::::::::::::::::::::::@/", "::::::::::::::::::::::::::::::::::::::@/");
-  test_idn_to_ascii_one("%30", "%30");
+  test_idn_to_ascii_one("", "");
   test_idn_to_ascii_one("%30", "%30");
   test_idn_to_ascii_one("127.0.0.1", "127.0.0.1");
   test_idn_to_ascii_one("fe80::", "fe80::");
@@ -297,4 +298,38 @@ TEST(Misc, idn_to_ascii) {
   test_idn_to_ascii_one("win-2k12r2-addc.阿伯测阿伯测ad.hai.com", "win-2k12r2-addc.xn--ad-tl3ca3569aba8944eca.hai.com");
   test_idn_to_ascii_one("✌️.ws", "xn--7bi.ws");
   test_idn_to_ascii_one("⛧", "xn--59h");
+  ASSERT_TRUE(idn_to_ascii("\xc0").is_error());
 }
+
+#if TD_WINDOWS
+static void test_to_wstring_one(string str) {
+  ASSERT_STREQ(str, from_wstring(to_wstring(str).ok()).ok());
+}
+
+TEST(Misc, to_wstring) {
+  test_to_wstring_one("");
+  for (int i = 0; i < 10; i++) {
+    test_to_wstring_one("test");
+    test_to_wstring_one("тест");
+  }
+  string str;
+  for (uint32 i = 0; i <= 0xD7FF; i++) {
+    append_utf8_character(str, i);
+  }
+  for (uint32 i = 0xE000; i <= 0x10FFFF; i++) {
+    append_utf8_character(str, i);
+  }
+  test_to_wstring_one(str);
+  ASSERT_TRUE(to_wstring("\xc0").is_error());
+  auto emoji = to_wstring("🏟").ok();
+  ASSERT_TRUE(from_wstring(emoji).ok() == "🏟");
+  ASSERT_TRUE(emoji.size() == 2);
+  auto emoji2 = emoji;
+  emoji[0] = emoji[1];
+  emoji2[1] = emoji2[0];
+  ASSERT_TRUE(from_wstring(emoji).is_error());
+  ASSERT_TRUE(from_wstring(emoji2).is_error());
+  emoji2[0] = emoji[0];
+  ASSERT_TRUE(from_wstring(emoji2).is_error());
+}
+#endif