Add transliteration implementation.

GitOrigin-RevId: 205a0fdac314625617dc54065d572f92f442cf57
2018-07-31 02:49:42 +03:00 · 2018-07-31 02:49:42 +03:00 · b6bfb3ddbf
commit b6bfb3ddbf
parent 7706e8614b
4 changed files with 156 additions and 0 deletions
--- a/tdutils/CMakeLists.txt
+++ b/tdutils/CMakeLists.txt
@ -79,6 +79,7 @@ set(TDUTILS_SOURCE
  td/utils/Time.cpp
  td/utils/Timer.cpp
  td/utils/tl_parsers.cpp
+  td/utils/translit.cpp
  td/utils/unicode.cpp
  td/utils/utf8.cpp

@ -183,6 +184,7 @@ set(TDUTILS_SOURCE
  td/utils/tl_helpers.h
  td/utils/tl_parsers.h
  td/utils/tl_storers.h
+  td/utils/translit.h
  td/utils/type_traits.h
  td/utils/unicode.h
  td/utils/utf8.h
--- a/tdutils/td/utils/translit.cpp
+++ b/tdutils/td/utils/translit.cpp
@ -0,0 +1,114 @@
+//
+// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2018
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+#include "td/utils/translit.h"
+
+#include "td/utils/misc.h"
+#include "td/utils/utf8.h"
+
+#include <algorithm>
+#include <unordered_map>
+#include <utility>
+
+namespace td {
+
+// Returns the single-letter Latin -> Cyrillic table, keyed by the code point of the Latin letter.
+// Only lowercase ASCII letters have entries. The table is a function-local static, so it is
+// constructed once, on the first call.
+static const std::unordered_map<uint32, string> &get_en_to_ru_simple_rules() {
+  static const std::unordered_map<uint32, string> letter_table{
+      {'a', "а"}, {'b', "б"}, {'c', "к"}, {'d', "д"},
+      {'e', "е"}, {'f', "ф"}, {'g', "г"}, {'h', "х"},
+      {'i', "и"}, {'j', "й"}, {'k', "к"}, {'l', "л"},
+      {'m', "м"}, {'n', "н"}, {'o', "о"}, {'p', "п"},
+      {'q', "к"}, {'r', "р"}, {'s', "с"}, {'t', "т"},
+      {'u', "у"}, {'v', "в"}, {'w', "в"}, {'x', "кс"},
+      {'y', "и"}, {'z', "з"},
+  };
+  return letter_table;
+}
+
+// Returns the multi-letter Latin -> Cyrillic rules as (Latin key, Cyrillic value) pairs.
+//
+// Order matters: add_word_transliterations applies the first rule whose key matches at the
+// current position, so a key must be listed before any other key that is its prefix.
+// In particular "shch" has to precede "sh"; otherwise "shch" is always consumed as "sh" + "ch"
+// and the "shch" -> "щ" rule can never match in full.
+static const std::vector<std::pair<string, string>> &get_en_to_ru_complex_rules() {
+  static const std::vector<std::pair<string, string>> rules{
+      {"ch", "ч"}, {"ei", "ей"}, {"ey", "ей"}, {"ia", "ия"},  {"iy", "ий"}, {"jo", "е"},
+      {"ju", "ю"}, {"ja", "я"},  {"kh", "х"},  {"shch", "щ"}, {"sh", "ш"},  {"sch", "щ"},
+      {"ts", "ц"}, {"yo", "е"},  {"yu", "ю"},  {"ya", "я"},   {"zh", "ж"}};
+  return rules;
+}
+
+// Returns the single-letter Cyrillic -> Latin table, keyed by Unicode code point.
+// Only lowercase letters have entries; the hard and soft signs map to an empty string,
+// i.e. they are dropped from the transliteration.
+static const std::unordered_map<uint32, string> &get_ru_to_en_simple_rules() {
+  static const std::unordered_map<uint32, string> letter_table{
+      {0x430, "a"},    // а
+      {0x431, "b"},    // б
+      {0x432, "v"},    // в
+      {0x433, "g"},    // г
+      {0x434, "d"},    // д
+      {0x435, "e"},    // е
+      {0x451, "e"},    // ё
+      {0x436, "zh"},   // ж
+      {0x437, "z"},    // з
+      {0x438, "i"},    // и
+      {0x439, "y"},    // й
+      {0x43a, "k"},    // к
+      {0x43b, "l"},    // л
+      {0x43c, "m"},    // м
+      {0x43d, "n"},    // н
+      {0x43e, "o"},    // о
+      {0x43f, "p"},    // п
+      {0x440, "r"},    // р
+      {0x441, "s"},    // с
+      {0x442, "t"},    // т
+      {0x443, "u"},    // у
+      {0x444, "f"},    // ф
+      {0x445, "kh"},   // х
+      {0x446, "ts"},   // ц
+      {0x447, "ch"},   // ч
+      {0x448, "sh"},   // ш
+      {0x449, "sch"},  // щ
+      {0x44a, ""},     // ъ
+      {0x44b, "y"},    // ы
+      {0x44c, ""},     // ь
+      {0x44d, "e"},    // э
+      {0x44e, "yu"},   // ю
+      {0x44f, "ya"},   // я
+  };
+  return letter_table;
+}
+
+// Returns the multi-character rules used together with the Russian -> English letter table.
+// Note that the keys "yo" and "jo" are Latin digraphs, not Cyrillic; presumably they are listed
+// here so that Latin input such as "artyom" also yields "artem" (see the translit test) -
+// confirm before changing them.
+static const std::vector<std::pair<string, string>> &get_ru_to_en_complex_rules() {
+  static const std::vector<std::pair<string, string>> digraph_table{
+      {"ий", "y"},
+      {"ия", "ia"},
+      {"кс", "x"},
+      {"yo", "e"},
+      {"jo", "e"},
+  };
+  return digraph_table;
+}
+
+// Appends to `result` the transliterations of `word` obtained with one rule set.
+//
+// Two candidates are built:
+//  1. `word` with every code point replaced through `simple_rules`
+//     (code points without a rule are copied unchanged);
+//  2. `word` scanned left to right, where at each position the first entry of `complex_rules`
+//     whose key is a prefix of the remaining text replaces that key, and `simple_rules`
+//     is used for a single code point when no complex rule matches.
+// During the second scan, every complex rule whose key starts with the whole remaining text
+// (and which is examined before a full match is found at that position) additionally
+// contributes "text built so far + rule value"; this covers words that end in the middle
+// of a multi-letter key.
+//
+// Empty candidates are not added; duplicates are not removed here.
+// `word` is decoded with next_utf8_unsafe, so it is assumed to be valid UTF-8.
+//
+// The function is a file-local helper, hence internal linkage like the rule tables above.
+static void add_word_transliterations(vector<string> &result, Slice word,
+                                      const std::unordered_map<uint32, string> &simple_rules,
+                                      const std::vector<std::pair<string, string>> &complex_rules) {
+  // Translates one code point through simple_rules and appends the outcome to `out`.
+  auto append_code = [&simple_rules](string &out, uint32 code) {
+    auto it = simple_rules.find(code);
+    if (it != simple_rules.end()) {
+      out += it->second;
+    } else {
+      append_utf8_character(out, code);
+    }
+  };
+
+  const auto begin = word.ubegin();
+  const auto end = word.uend();
+
+  // Candidate 1: letter-by-letter replacement only.
+  string simple;
+  for (auto pos = begin; pos != end;) {
+    uint32 code;
+    pos = next_utf8_unsafe(pos, &code);
+    append_code(simple, code);
+  }
+  if (!simple.empty()) {
+    result.push_back(std::move(simple));
+  }
+
+  // Candidate 2: complex rules first, simple rules as a fallback.
+  string combined;
+  auto pos = begin;
+  while (pos != end) {
+    auto suffix = Slice(pos, end);
+    bool found = false;
+    for (auto &rule : complex_rules) {
+      if (begins_with(suffix, rule.first)) {
+        // Full match: consume the key and stop looking at further rules for this position.
+        found = true;
+        pos += rule.first.size();
+        combined.append(rule.second);
+        break;
+      }
+      if (begins_with(rule.first, suffix)) {
+        // The word ends inside this key: offer the completed key as an extra variant.
+        result.push_back(combined + rule.second);
+      }
+    }
+    if (found) {
+      continue;
+    }
+
+    uint32 code;
+    pos = next_utf8_unsafe(pos, &code);
+    append_code(combined, code);
+  }
+  if (!combined.empty()) {
+    result.push_back(std::move(combined));
+  }
+}
+
+// Returns the transliteration variants of `word`: the English -> Russian rule set is applied
+// first, then the Russian -> English one, and the collected variants are returned sorted
+// in ascending order with duplicates removed.
+vector<string> get_word_transliterations(Slice word) {
+  vector<string> variants;
+  add_word_transliterations(variants, word, get_en_to_ru_simple_rules(), get_en_to_ru_complex_rules());
+  add_word_transliterations(variants, word, get_ru_to_en_simple_rules(), get_ru_to_en_complex_rules());
+
+  // Both passes may produce the same string, so sort and drop adjacent duplicates.
+  std::sort(variants.begin(), variants.end());
+  auto unique_end = std::unique(variants.begin(), variants.end());
+  variants.erase(unique_end, variants.end());
+  return variants;
+}
+
+}  // namespace td
--- a/tdutils/td/utils/translit.h
+++ b/tdutils/td/utils/translit.h
@ -0,0 +1,16 @@
+//
+// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2018
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+#pragma once
+
+#include "td/utils/common.h"
+#include "td/utils/Slice.h"
+
+namespace td {
+
+vector<string> get_word_transliterations(Slice word);
+
+}  // namespace td
--- a/tdutils/test/misc.cpp
+++ b/tdutils/test/misc.cpp
@ -20,6 +20,7 @@
 #include "td/utils/Slice.h"
 #include "td/utils/StringBuilder.h"
 #include "td/utils/tests.h"
+#include "td/utils/translit.h"
 #include "td/utils/utf8.h"

 #include <atomic>
@ -364,3 +365,26 @@ TEST(Misc, to_wstring) {
  ASSERT_TRUE(from_wstring(emoji2).is_error());
 }
 #endif
+
+// Asserts that get_word_transliterations(word) returns exactly `result`.
+// Both arguments are only read, so they are taken by const reference to avoid copying
+// the word and the expected list on every call.
+static void test_translit(const string &word, const vector<string> &result) {
+  ASSERT_EQ(result, get_word_transliterations(word));
+}
+
+// Checks get_word_transliterations on Latin and Cyrillic inputs.
+// Each expected list is written in the sorted, duplicate-free order the function returns.
+TEST(Misc, translit) {
+  struct TranslitCase {
+    string word;
+    vector<string> expected;
+  };
+  const TranslitCase cases[] = {
+      {"word", {"word", "ворд"}},
+      {"", {}},
+      {"ььььььььь", {"ььььььььь"}},
+      {"крыло", {"krylo", "крыло"}},
+      {"krylo", {"krylo", "крило"}},
+      {"crylo", {"crylo", "крило"}},
+      {"cheiia", {"cheiia", "кхеииа", "чейия"}},
+      {"cheii", {"cheii", "кхеии", "чейи", "чейий", "чейия"}},
+      {"s", {"s", "с", "ш", "щ"}},
+      {"y", {"e", "y", "е", "и", "ю", "я"}},
+      {"j", {"e", "j", "е", "й", "ю", "я"}},
+      {"yo", {"e", "yo", "е", "ио"}},
+      {"artjom", {"artem", "artjom", "артем", "артйом"}},
+      {"artyom", {"artem", "artyom", "артем", "артиом"}},
+      {"arty", {"arte", "arty", "арте", "арти", "артю", "артя"}},
+      {"льи", {"li", "lia", "ly", "льи"}},
+  };
+  for (const auto &test_case : cases) {
+    test_translit(test_case.word, test_case.expected);
+  }
+}