Remove usage of codecvt: now conversions are safe and 2-1000x faster.

GitOrigin-RevId: a947a25cf9991dc39dc87b8766409e9b1e987009
2018-05-18 18:55:40 +03:00 · 2018-05-18 18:55:40 +03:00 · 2691c7fabf
commit 2691c7fabf
parent 18818fd4ff
2 changed files with 116 additions and 29 deletions
--- a/tdutils/td/utils/port/wstring_convert.cpp
+++ b/tdutils/td/utils/port/wstring_convert.cpp
@ -10,44 +10,96 @@ char disable_linker_warning_about_empty_file_wstring_convert_cpp TD_UNUSED;

 #if TD_PORT_WINDOWS

-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#include "td/utils/utf8.h"

-#include "td/utils/port/wstring_convert.h"
-
-#include <codecvt>
-#include <locale>
-#include <utility>
+#include <cwchar>

 namespace td {

-namespace detail {
-template <class Facet>
-class UsableFacet : public Facet {
- public:
-  template <class... Args>
-  explicit UsableFacet(Args &&... args) : Facet(std::forward<Args>(args)...) {
-  }
-  ~UsableFacet() = default;
-};
-}  // namespace detail
-
 Result<std::wstring> to_wstring(CSlice slice) {  // decodes the bytes of slice as UTF-8 into 16-bit code units of a std::wstring
-  // TODO(perf): optimize
-  std::wstring_convert<detail::UsableFacet<std::codecvt_utf8_utf16<wchar_t>>> converter;
-  auto res = converter.from_bytes(slice.begin(), slice.end());
-  if (converter.converted() != slice.size()) {
+  if (!check_utf8(slice)) {  // presumably validates well-formed UTF-8 (verify in td/utils/utf8.h); the decoder below does no checks of its own
     return Status::Error("Wrong encoding");
   }
-  return res;
+
+  size_t wstring_len = 0;  // pass 1: exact number of output code units
+  for (auto c : slice) {
+    wstring_len += ((c & 0xc0) != 0x80) + ((c & 0xf8) == 0xf0);  // +1 per non-continuation byte, +1 more per 4-byte lead (needs a surrogate pair)
+  }
+
+  std::wstring result(wstring_len, static_cast<wchar_t>(0));
+  if (wstring_len) {
+    wchar_t *res = &result[0];
+    for (size_t i = 0; i < slice.size();) {  // pass 2: i is advanced inside the body, once per consumed byte
+      unsigned int a = static_cast<unsigned char>(slice[i++]);
+      if (a >= 0x80) {  // multi-byte sequence: at least one more byte follows
+        unsigned int b = static_cast<unsigned char>(slice[i++]);
+        if (a >= 0xe0) {  // 3- or 4-byte sequence
+          unsigned int c = static_cast<unsigned char>(slice[i++]);
+          if (a >= 0xf0) {  // 4-byte sequence: written as a high/low surrogate pair
+            unsigned int d = static_cast<unsigned char>(slice[i++]);
+            unsigned int val = ((a & 0x07) << 18) + ((b & 0x3f) << 12) + ((c & 0x3f) << 6) + (d & 0x3f) - 0x10000;  // code point minus 0x10000
+            *res++ = static_cast<wchar_t>(0xD800 + (val >> 10));  // high surrogate carries the bits above the low 10
+            *res++ = static_cast<wchar_t>(0xDC00 + (val & 0x3ff));  // low surrogate carries the low 10 bits
+          } else {
+            *res++ = static_cast<wchar_t>(((a & 0x0f) << 12) + ((b & 0x3f) << 6) + (c & 0x3f));  // 3-byte sequence
+          }
+        } else {
+          *res++ = static_cast<wchar_t>(((a & 0x1f) << 6) + (b & 0x3f));  // 2-byte sequence
+        }
+      } else {
+        *res++ = static_cast<wchar_t>(a);  // single byte below 0x80 maps to itself
+      }
+    }
+    CHECK(res == &result[0] + wstring_len);  // the write pointer must end exactly at the precomputed length
+  }
+  return result;
 }

 Result<string> from_wstring(const wchar_t *begin, size_t size) {
-  std::wstring_convert<detail::UsableFacet<std::codecvt_utf8_utf16<wchar_t>>> converter;
-  auto res = converter.to_bytes(begin, begin + size);
-  if (converter.converted() != size) {
-    return Status::Error("Wrong encoding");
+  size_t result_len = 0;
+  for (size_t i = 0; i < size; i++) {
+    unsigned int cur = begin[i];
+    if ((cur & 0xF800) == 0xD800) {
+      if (i < size) {
+        unsigned int next = begin[++i];
+        if ((next & 0xFC00) == 0xDC00 && (cur & 0x400) == 0) {
+          result_len += 4;
+          continue;
+        }
+      }
+
+      return Status::Error("Wrong encoding");
+    }
+    result_len += 1 + (cur >= 0x80) + (cur >= 0x800);
  }
-  return res;
+
+  std::string result(result_len, '\0');
+  if (result_len) {
+    char *res = &result[0];
+    for (size_t i = 0; i < size; i++) {
+      unsigned int cur = begin[i];
+      // TODO conversion unsigned int -> signed char is implementation defined
+      if (cur <= 0x7f) {
+        *res++ = static_cast<char>(cur);
+      } else if (cur <= 0x7ff) {
+        *res++ = static_cast<char>(0xc0 | (cur >> 6));
+        *res++ = static_cast<char>(0x80 | (cur & 0x3f));
+      } else if ((cur & 0xF800) != 0xD800) {
+        *res++ = static_cast<char>(0xe0 | (cur >> 12));
+        *res++ = static_cast<char>(0x80 | ((cur >> 6) & 0x3f));
+        *res++ = static_cast<char>(0x80 | (cur & 0x3f));
+      } else {
+        unsigned int next = begin[++i];
+        unsigned int val = ((cur - 0xD800) << 10) + next - 0xDC00 + 0x10000;
+
+        *res++ = static_cast<char>(0xf0 | (val >> 18));
+        *res++ = static_cast<char>(0x80 | ((val >> 12) & 0x3f));
+        *res++ = static_cast<char>(0x80 | ((val >> 6) & 0x3f));
+        *res++ = static_cast<char>(0x80 | (val & 0x3f));
+      }
+    }
+  }
+  return result;
 }

 Result<string> from_wstring(const std::wstring &str) {
@ -55,7 +107,7 @@ Result<string> from_wstring(const std::wstring &str) {
 }

 Result<string> from_wstring(const wchar_t *begin) {  // overload for a null-terminated wide string
-  return from_wstring(begin, wcslen(begin));
+  return from_wstring(begin, std::wcslen(begin));  // length = number of wide characters before the terminating null
 }

 }  // namespace td
--- a/tdutils/test/misc.cpp
+++ b/tdutils/test/misc.cpp
@ -19,6 +19,7 @@
 #include "td/utils/Slice.h"
 #include "td/utils/StringBuilder.h"
 #include "td/utils/tests.h"
+#include "td/utils/utf8.h"

 #include <atomic>
 #include <clocale>
@ -270,7 +271,7 @@ static void test_idn_to_ascii_one(string host, string result) {

 TEST(Misc, idn_to_ascii) {
  test_idn_to_ascii_one("::::::::::::::::::::::::::::::::::::::@/", "::::::::::::::::::::::::::::::::::::::@/");
-  test_idn_to_ascii_one("%30", "%30");
+  test_idn_to_ascii_one("", "");
  test_idn_to_ascii_one("%30", "%30");
  test_idn_to_ascii_one("127.0.0.1", "127.0.0.1");
  test_idn_to_ascii_one("fe80::", "fe80::");
@ -297,4 +298,38 @@ TEST(Misc, idn_to_ascii) {
  test_idn_to_ascii_one("win-2k12r2-addc.阿伯测阿伯测ad.hai.com", "win-2k12r2-addc.xn--ad-tl3ca3569aba8944eca.hai.com");
  test_idn_to_ascii_one("✌️.ws", "xn--7bi.ws");
  test_idn_to_ascii_one("⛧", "xn--59h");
+  ASSERT_TRUE(idn_to_ascii("\xc0").is_error());
 }
+
+#if TD_WINDOWS
+static void test_to_wstring_one(string str) {  // round-trip check: str -> to_wstring -> from_wstring must give back str
+  ASSERT_STREQ(str, from_wstring(to_wstring(str).ok()).ok());
+}
+
+TEST(Misc, to_wstring) {  // round-trip and error-path checks for to_wstring / from_wstring
+  test_to_wstring_one("");
+  for (int i = 0; i < 10; i++) {
+    test_to_wstring_one("test");
+    test_to_wstring_one("тест");
+  }
+  string str;
+  for (uint32 i = 0; i <= 0xD7FF; i++) {  // every value below the surrogate range
+    append_utf8_character(str, i);  // presumably appends the UTF-8 encoding of code point i; confirm in td/utils/utf8.h
+  }
+  for (uint32 i = 0xE000; i <= 0x10FFFF; i++) {  // every value above the surrogate range; 0xD800..0xDFFF is skipped
+    append_utf8_character(str, i);
+  }
+  test_to_wstring_one(str);
+  ASSERT_TRUE(to_wstring("\xc0").is_error());  // a lone 0xc0 byte must be rejected
+  auto emoji = to_wstring("🏟").ok();
+  ASSERT_TRUE(from_wstring(emoji).ok() == "🏟");
+  ASSERT_TRUE(emoji.size() == 2);  // this character is expected to occupy two wide chars
+  auto emoji2 = emoji;
+  emoji[0] = emoji[1];  // emoji is now {second, second}
+  emoji2[1] = emoji2[0];  // emoji2 is now {first, first}
+  ASSERT_TRUE(from_wstring(emoji).is_error());
+  ASSERT_TRUE(from_wstring(emoji2).is_error());
+  emoji2[0] = emoji[0];  // emoji2 is now {second, first}: the two units in swapped order
+  ASSERT_TRUE(from_wstring(emoji2).is_error());
+}
+#endif