From b6bfb3ddbf05e2d163e40c8375ae0b503ff9cec7 Mon Sep 17 00:00:00 2001
From: levlam
Date: Tue, 31 Jul 2018 02:49:42 +0300
Subject: [PATCH] Add transliteration implementation.

GitOrigin-RevId: 205a0fdac314625617dc54065d572f92f442cf57
---
 tdutils/CMakeLists.txt        |   2 +
 tdutils/td/utils/translit.cpp | 132 ++++++++++++++++++++++++++++++++++
 tdutils/td/utils/translit.h   |  18 +++++
 tdutils/test/misc.cpp         |  27 +++++++
 4 files changed, 179 insertions(+)
 create mode 100644 tdutils/td/utils/translit.cpp
 create mode 100644 tdutils/td/utils/translit.h

diff --git a/tdutils/CMakeLists.txt b/tdutils/CMakeLists.txt
index 9ecb02cd8..f69a4df3b 100644
--- a/tdutils/CMakeLists.txt
+++ b/tdutils/CMakeLists.txt
@@ -79,6 +79,7 @@ set(TDUTILS_SOURCE
   td/utils/Time.cpp
   td/utils/Timer.cpp
   td/utils/tl_parsers.cpp
+  td/utils/translit.cpp
   td/utils/unicode.cpp
   td/utils/utf8.cpp
 
@@ -183,6 +184,7 @@ set(TDUTILS_SOURCE
   td/utils/tl_helpers.h
   td/utils/tl_parsers.h
   td/utils/tl_storers.h
+  td/utils/translit.h
   td/utils/type_traits.h
   td/utils/unicode.h
   td/utils/utf8.h
diff --git a/tdutils/td/utils/translit.cpp b/tdutils/td/utils/translit.cpp
new file mode 100644
index 000000000..13f19f092
--- /dev/null
+++ b/tdutils/td/utils/translit.cpp
@@ -0,0 +1,132 @@
+//
+// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2018
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+#include "td/utils/translit.h"
+
+#include "td/utils/misc.h"
+#include "td/utils/utf8.h"
+
+#include <algorithm>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace td {
+
+// Single-character Latin -> Cyrillic rules, keyed by the code point of a lowercase ASCII letter.
+static const std::unordered_map<uint32, string> &get_en_to_ru_simple_rules() {
+  static const std::unordered_map<uint32, string> rules{
+      {'a', "а"}, {'b', "б"}, {'c', "к"}, {'d', "д"}, {'e', "е"}, {'f', "ф"}, {'g', "г"}, {'h', "х"}, {'i', "и"},
+      {'j', "й"}, {'k', "к"}, {'l', "л"}, {'m', "м"}, {'n', "н"}, {'o', "о"}, {'p', "п"}, {'q', "к"}, {'r', "р"},
+      {'s', "с"}, {'t', "т"}, {'u', "у"}, {'v', "в"}, {'w', "в"}, {'x', "кс"}, {'y', "и"}, {'z', "з"}};
+  return rules;
+}
+
+// Multi-character Latin -> Cyrillic rules. add_word_transliterations() applies the first rule that matches,
+// so a rule must be listed before any rule that is its proper prefix ("shch" before "sh").
+static const std::vector<std::pair<string, string>> &get_en_to_ru_complex_rules() {
+  static const std::vector<std::pair<string, string>> rules{
+      {"ch", "ч"}, {"ei", "ей"}, {"ey", "ей"}, {"ia", "ия"}, {"iy", "ий"}, {"jo", "е"},
+      {"ju", "ю"}, {"ja", "я"}, {"kh", "х"}, {"shch", "щ"}, {"sh", "ш"}, {"sch", "щ"},
+      {"ts", "ц"}, {"yo", "е"}, {"yu", "ю"}, {"ya", "я"}, {"zh", "ж"}};
+  return rules;
+}
+
+// Single-character Cyrillic -> Latin rules, keyed by Unicode code point (0x430 is "а", 0x451 is "ё").
+// The hard sign (0x44a) and the soft sign (0x44c) map to an empty string, i.e. they are dropped.
+static const std::unordered_map<uint32, string> &get_ru_to_en_simple_rules() {
+  static const std::unordered_map<uint32, string> rules{
+      {0x430, "a"}, {0x431, "b"}, {0x432, "v"}, {0x433, "g"}, {0x434, "d"}, {0x435, "e"}, {0x451, "e"},
+      {0x436, "zh"}, {0x437, "z"}, {0x438, "i"}, {0x439, "y"}, {0x43a, "k"}, {0x43b, "l"}, {0x43c, "m"},
+      {0x43d, "n"}, {0x43e, "o"}, {0x43f, "p"}, {0x440, "r"}, {0x441, "s"}, {0x442, "t"}, {0x443, "u"},
+      {0x444, "f"}, {0x445, "kh"}, {0x446, "ts"}, {0x447, "ch"}, {0x448, "sh"}, {0x449, "sch"}, {0x44a, ""},
+      {0x44b, "y"}, {0x44c, ""}, {0x44d, "e"}, {0x44e, "yu"}, {0x44f, "ya"}};
+  return rules;
+}
+
+// Multi-character Cyrillic -> Latin rules. The Latin digraphs "yo" and "jo" are listed here as well, so that
+// Latin input such as "artyom" also produces "artem".
+static const std::vector<std::pair<string, string>> &get_ru_to_en_complex_rules() {
+  static const std::vector<std::pair<string, string>> rules{
+      {"ий", "y"}, {"ия", "ia"}, {"кс", "x"}, {"yo", "e"}, {"jo", "e"}};
+  return rules;
+}
+
+// Decodes the code point starting at pos and appends its transliteration to s: the replacement from simple_rules
+// if there is one, the code point itself otherwise. Returns the position right after the decoded code point.
+static const unsigned char *append_simple_transliteration(string &s, const unsigned char *pos,
+                                                          const std::unordered_map<uint32, string> &simple_rules) {
+  uint32 code;
+  pos = next_utf8_unsafe(pos, &code);
+  auto it = simple_rules.find(code);
+  if (it != simple_rules.end()) {
+    s += it->second;
+  } else {
+    append_utf8_character(s, code);
+  }
+  return pos;
+}
+
+// Appends to result the transliterations of word obtained with one pair of rule tables:
+//  - the word rewritten with simple_rules only;
+//  - the word rewritten with the first matching rule of complex_rules at each position and with simple_rules
+//    where no complex rule matches;
+//  - a completion for every complex rule key that the remaining part of the word is a proper prefix of, if that
+//    rule is checked before a matching one: the text built so far followed by the rule's replacement.
+// Empty strings are not added; duplicates are possible. The word is decoded with next_utf8_unsafe(), so it is
+// presumably expected to be valid UTF-8 - TODO confirm against the callers.
+static void add_word_transliterations(vector<string> &result, Slice word,
+                                      const std::unordered_map<uint32, string> &simple_rules,
+                                      const std::vector<std::pair<string, string>> &complex_rules) {
+  const auto begin = word.ubegin();
+  const auto end = word.uend();
+
+  string s;
+  for (auto pos = begin; pos != end;) {
+    pos = append_simple_transliteration(s, pos, simple_rules);
+  }
+  if (!s.empty()) {
+    result.push_back(std::move(s));
+    s.clear();  // std::move leaves s in an unspecified state; make it empty again for the second pass
+  }
+
+  auto pos = begin;
+  while (pos != end) {
+    auto suffix = Slice(pos, end);
+    bool found = false;
+    for (const auto &rule : complex_rules) {
+      if (begins_with(suffix, rule.first)) {
+        found = true;
+        pos += rule.first.size();
+        s.append(rule.second);
+        break;
+      }
+      if (begins_with(rule.first, suffix)) {
+        // the rest of the word is an incomplete rule key; add the variant with the completed key
+        result.push_back(s + rule.second);
+      }
+    }
+    if (!found) {
+      pos = append_simple_transliteration(s, pos, simple_rules);
+    }
+  }
+  if (!s.empty()) {
+    result.push_back(std::move(s));
+  }
+}
+
+vector<string> get_word_transliterations(Slice word) {
+  vector<string> result;
+
+  add_word_transliterations(result, word, get_en_to_ru_simple_rules(), get_en_to_ru_complex_rules());
+  add_word_transliterations(result, word, get_ru_to_en_simple_rules(), get_ru_to_en_complex_rules());
+
+  std::sort(result.begin(), result.end());
+  result.erase(std::unique(result.begin(), result.end()), result.end());
+  return result;
+}
+
+}  // namespace td
diff --git a/tdutils/td/utils/translit.h b/tdutils/td/utils/translit.h
new file mode 100644
index 000000000..1a132fa1d
--- /dev/null
+++ b/tdutils/td/utils/translit.h
@@ -0,0 +1,18 @@
+//
+// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2018
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+#pragma once
+
+#include "td/utils/common.h"
+#include "td/utils/Slice.h"
+
+namespace td {
+
+// Returns the sorted list of distinct Latin -> Cyrillic and Cyrillic -> Latin transliterations of the word,
+// including completions of a partially typed last letter combination. Only lowercase letters are transliterated.
+vector<string> get_word_transliterations(Slice word);
+
+}  // namespace td
diff --git a/tdutils/test/misc.cpp b/tdutils/test/misc.cpp
index 9bc660acb..9287e4c03 100644
--- a/tdutils/test/misc.cpp
+++ b/tdutils/test/misc.cpp
@@ -20,6 +20,7 @@
 #include "td/utils/Slice.h"
 #include "td/utils/StringBuilder.h"
 #include "td/utils/tests.h"
+#include "td/utils/translit.h"
 #include "td/utils/utf8.h"
 
 #include
@@ -364,3 +365,29 @@ TEST(Misc, to_wstring) {
   ASSERT_TRUE(from_wstring(emoji2).is_error());
 }
 #endif
+
+// Checks that get_word_transliterations(word) returns exactly the given sorted list.
+static void test_translit(const string &word, const vector<string> &result) {
+  ASSERT_EQ(result, get_word_transliterations(word));
+}
+
+TEST(Misc, translit) {
+  test_translit("word", {"word", "ворд"});
+  test_translit("", {});
+  test_translit("ььььььььь", {"ььььььььь"});
+  test_translit("крыло", {"krylo", "крыло"});
+  test_translit("krylo", {"krylo", "крило"});
+  test_translit("crylo", {"crylo", "крило"});
+  test_translit("cheiia", {"cheiia", "кхеииа", "чейия"});
+  test_translit("cheii", {"cheii", "кхеии", "чейи", "чейий", "чейия"});
+  test_translit("s", {"s", "с", "ш", "щ"});
+  test_translit("sh", {"sh", "сх", "ш", "щ"});
+  test_translit("shch", {"shch", "схкх", "щ"});
+  test_translit("y", {"e", "y", "е", "и", "ю", "я"});
+  test_translit("j", {"e", "j", "е", "й", "ю", "я"});
+  test_translit("yo", {"e", "yo", "е", "ио"});
+  test_translit("artjom", {"artem", "artjom", "артем", "артйом"});
+  test_translit("artyom", {"artem", "artyom", "артем", "артиом"});
+  test_translit("arty", {"arte", "arty", "арте", "арти", "артю", "артя"});
+  test_translit("льи", {"li", "lia", "ly", "льи"});
+}