From 0764948a8743125a6e25ace29b61897baeb555be Mon Sep 17 00:00:00 2001 From: levlam Date: Fri, 18 May 2018 23:38:11 +0300 Subject: [PATCH] Punicode support for non-Windows. GitOrigin-RevId: 6ef7dd1bb163bdf369a7ed10a50d598b50e2def6 --- tdutils/td/utils/port/IPAddress.cpp | 112 +++++++++++++++++++++++++++- tdutils/test/misc.cpp | 3 +- 2 files changed, 110 insertions(+), 5 deletions(-) diff --git a/tdutils/td/utils/port/IPAddress.cpp b/tdutils/td/utils/port/IPAddress.cpp index 3c5f07a0b..129680f4d 100644 --- a/tdutils/td/utils/port/IPAddress.cpp +++ b/tdutils/td/utils/port/IPAddress.cpp @@ -41,6 +41,89 @@ static bool is_ascii_host(Slice host) { return true; } +#if !TD_WINDOWS +static void punycode(string &result, Slice part) { + vector codes; + codes.reserve(utf8_length(part)); + uint32 processed = 0; + auto begin = part.ubegin(); + auto end = part.uend(); + while (begin != end) { + uint32 code; + begin = next_utf8_unsafe(begin, &code); + if (code <= 127u) { + result += to_lower(static_cast(code)); + processed++; + } + codes.push_back(code); + } + + if (processed > 0) { + result += '-'; + } + + uint32 n = 127; + uint32 delta = 0; + int bias = -72; + bool is_first = true; + while (processed < codes.size()) { + // choose lowest not processed code + uint32 next_n = 0x110000; + for (auto code : codes) { + if (code > n && code < next_n) { + next_n = code; + } + } + delta += (next_n - n - 1) * (processed + 1); + + for (auto code : codes) { + if (code < next_n) { + delta++; + } + + if (code == next_n) { + // found next symbol, encode delta + int left = static_cast(delta); + while (true) { + bias += 36; + auto t = clamp(bias, 1, 26); + if (left < t) { + result += static_cast(left + 'a'); + break; + } + + left -= t; + auto digit = t + left % (36 - t); + result += static_cast(digit < 26 ? digit + 'a' : digit - 26 + '0'); + left /= 36 - t; + } + processed++; + + // update bias + if (is_first) { + delta /= 700; + is_first = false; + } else { + delta /= 2; + } + delta += delta / processed; + + bias = 0; + while (delta > 35 * 13) { + delta /= 35; + bias -= 36; + } + bias -= static_cast(36 * delta / (delta + 38)); + delta = 0; + } + } + + delta++; + n = next_n; + } +} +#endif + Result idn_to_ascii(CSlice host) { if (is_ascii_host(host)) { return to_lower(host); @@ -49,10 +132,15 @@ Result idn_to_ascii(CSlice host) { return Status::Error("Host name must be encoded in UTF-8"); } + const int MAX_DNS_NAME_LENGTH = 255; + if (host.size() >= MAX_DNS_NAME_LENGTH * 4) { // upper bound, 4 characters per symbol + return Status::Error("Host name is too long"); + } + #if TD_WINDOWS TRY_RESULT(whost, to_wstring(host)); - wchar_t punycode[256]; - int result_length = IdnToAscii(IDN_ALLOW_UNASSIGNED, whost.c_str(), whost.size(), punycode, 255); + wchar_t punycode[MAX_DNS_NAME_LENGTH + 1]; + int result_length = IdnToAscii(IDN_ALLOW_UNASSIGNED, whost.c_str(), whost.size(), punycode, MAX_DNS_NAME_LENGTH); if (result_length == 0) { return Status::Error("Host can't be converted to ASCII"); } @@ -60,8 +148,24 @@ Result idn_to_ascii(CSlice host) { TRY_RESULT(idn_host, from_wstring(punycode, result_length)); return idn_host; #else - // TODO - return Status::Error("Internationalized Domain Names are not supported"); + auto parts = full_split(Slice(host), '.'); + bool is_first = true; + string result; + for (auto part : parts) { + if (!is_first) { + result += '.'; + } + if (is_ascii_host(part)) { + result.append(part.data(), part.size()); + } else { + // TODO nameprep should be applied first, but punycode is better than nothing. + // It is better to use libidn/ICU here if available + result += "xn--"; + punycode(result, part); + } + is_first = false; + } + return result; #endif } diff --git a/tdutils/test/misc.cpp b/tdutils/test/misc.cpp index a793331fe..36e38b295 100644 --- a/tdutils/test/misc.cpp +++ b/tdutils/test/misc.cpp @@ -296,7 +296,8 @@ TEST(Misc, idn_to_ascii) { test_idn_to_ascii_one("wіkіреdіа.org", "xn--wkd-8cdx9d7hbd.org"); test_idn_to_ascii_one("cnwin2k8中国.avol.com", "xn--cnwin2k8-sd0mx14e.avol.com"); test_idn_to_ascii_one("win-2k12r2-addc.阿伯测阿伯测ad.hai.com", "win-2k12r2-addc.xn--ad-tl3ca3569aba8944eca.hai.com"); - test_idn_to_ascii_one("✌️.ws", "xn--7bi.ws"); + test_idn_to_ascii_one("✌.ws", "xn--7bi.ws"); + // test_idn_to_ascii_one("✌️.ws", "xn--7bi.ws"); // needs nameprep to succeed test_idn_to_ascii_one("⛧", "xn--59h"); test_idn_to_ascii_one("--рф.рф", "xn-----mmcq.xn--p1ai"); ASSERT_TRUE(idn_to_ascii("\xc0").is_error());