From 4a6e0aea43d9e3a13e1954eac577c6d7ab251c3e Mon Sep 17 00:00:00 2001
From: levlam <levlam@telegram.org>
Date: Tue, 27 Sep 2022 14:16:09 +0300
Subject: [PATCH] Add utf8_prepare_search_string.

---
 tdutils/td/utils/Hints.cpp | 31 +------------------------------
 tdutils/td/utils/unicode.h |  2 +-
 tdutils/td/utils/utf8.cpp  | 36 ++++++++++++++++++++++++++++++++++++
 tdutils/td/utils/utf8.h    |  6 ++++++
 4 files changed, 44 insertions(+), 31 deletions(-)
diff --git a/tdutils/td/utils/Hints.cpp b/tdutils/td/utils/Hints.cpp
index 762281315..a7870527a 100644
--- a/tdutils/td/utils/Hints.cpp
+++ b/tdutils/td/utils/Hints.cpp
@@ -35,36 +35,7 @@ vector<string> Hints::fix_words(vector<string> words) {
 }
 
 vector<string> Hints::get_words(Slice name) {
-  bool in_word = false;
-  string word;
-  vector<string> words;
-  auto pos = name.ubegin();
-  auto end = name.uend();
-  while (pos != end) {
-    uint32 code;
-    pos = next_utf8_unsafe(pos, &code);
-
-    code = prepare_search_character(code);
-    if (code == 0) {
-      continue;
-    }
-    if (code == ' ') {
-      if (in_word) {
-        words.push_back(std::move(word));
-        word.clear();
-        in_word = false;
-      }
-    } else {
-      in_word = true;
-      code = remove_diacritics(code);
-      append_utf8_character(word, code);
-    }
-  }
-  if (in_word) {
-    words.push_back(std::move(word));
-  }
-
-  return fix_words(std::move(words));
+  return fix_words(utf8_get_search_words(name));
 }
 
 void Hints::add_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys) {
diff --git a/tdutils/td/utils/unicode.h b/tdutils/td/utils/unicode.h
index ad9f50d44..9012e4633 100644
--- a/tdutils/td/utils/unicode.h
+++ b/tdutils/td/utils/unicode.h
@@ -16,7 +16,7 @@ UnicodeSimpleCategory get_unicode_simple_category(uint32 code);
 
 /**
  * Prepares unicode character for search, leaving only digits and lowercased letters.
- * Return code of replacing character or 0 if the character should be skipped.
+ * Returns code of replacing character or 0 if the character should be skipped.
  */
 uint32 prepare_search_character(uint32 code);
 
diff --git a/tdutils/td/utils/utf8.cpp b/tdutils/td/utils/utf8.cpp
index a1e771d32..16c31e5b2 100644
--- a/tdutils/td/utils/utf8.cpp
+++ b/tdutils/td/utils/utf8.cpp
@@ -112,6 +112,42 @@ string utf8_to_lower(Slice str) {
   return result;
 }
 
+vector<string> utf8_get_search_words(Slice str) {  // splits str into words prepared for search
+  bool in_word = false;  // true while `word` holds characters of an unfinished word
+  string word;
+  vector<string> words;
+  auto pos = str.ubegin();
+  auto end = str.uend();
+  while (pos != end) {
+    uint32 code;
+    pos = next_utf8_unsafe(pos, &code);  // reads the next code point into `code`; presumably assumes valid UTF-8
+
+    code = prepare_search_character(code);
+    if (code == 0) {  // 0 means the character should be skipped
+      continue;
+    }
+    if (code == ' ') {  // ' ' acts as the word separator
+      if (in_word) {
+        words.push_back(std::move(word));
+        word.clear();  // reset the moved-from string before it is reused
+        in_word = false;
+      }
+    } else {
+      in_word = true;
+      code = remove_diacritics(code);
+      append_utf8_character(word, code);
+    }
+  }
+  if (in_word) {  // flush the word still being built when the input ends
+    words.push_back(std::move(word));
+  }
+  return words;
+}
+
+string utf8_prepare_search_string(Slice str) {
+  return implode(utf8_get_search_words(str));  // joins the search words back into one string (implode's default delimiter)
+}
+
 string utf8_encode(CSlice data) {
   if (check_utf8(data)) {
     return data.str();
diff --git a/tdutils/td/utils/utf8.h b/tdutils/td/utils/utf8.h
index 1247d9e7d..ff2b0ad1e 100644
--- a/tdutils/td/utils/utf8.h
+++ b/tdutils/td/utils/utf8.h
@@ -86,6 +86,12 @@ Slice utf8_utf16_substr(Slice str, size_t offset, size_t length);
 /// Returns UTF-8 string converted to lower case.
 string utf8_to_lower(Slice str);
 
+/// Returns UTF-8 string split into words for search.
+vector<string> utf8_get_search_words(Slice str);
+
+/// Returns UTF-8 string prepared for search, leaving only digits and lowercased letters.
+string utf8_prepare_search_string(Slice str);
+
 /// Returns valid UTF-8 representation of the string.
 string utf8_encode(CSlice data);