From 4a6e0aea43d9e3a13e1954eac577c6d7ab251c3e Mon Sep 17 00:00:00 2001 From: levlam Date: Tue, 27 Sep 2022 14:16:09 +0300 Subject: [PATCH] Add utf8_prepare_search_string. --- tdutils/td/utils/Hints.cpp | 31 +------------------------------ tdutils/td/utils/unicode.h | 2 +- tdutils/td/utils/utf8.cpp | 36 ++++++++++++++++++++++++++++++++++++ tdutils/td/utils/utf8.h | 6 ++++++ 4 files changed, 44 insertions(+), 31 deletions(-) diff --git a/tdutils/td/utils/Hints.cpp b/tdutils/td/utils/Hints.cpp index 762281315..a7870527a 100644 --- a/tdutils/td/utils/Hints.cpp +++ b/tdutils/td/utils/Hints.cpp @@ -35,36 +35,7 @@ vector Hints::fix_words(vector words) { } vector Hints::get_words(Slice name) { - bool in_word = false; - string word; - vector words; - auto pos = name.ubegin(); - auto end = name.uend(); - while (pos != end) { - uint32 code; - pos = next_utf8_unsafe(pos, &code); - - code = prepare_search_character(code); - if (code == 0) { - continue; - } - if (code == ' ') { - if (in_word) { - words.push_back(std::move(word)); - word.clear(); - in_word = false; - } - } else { - in_word = true; - code = remove_diacritics(code); - append_utf8_character(word, code); - } - } - if (in_word) { - words.push_back(std::move(word)); - } - - return fix_words(std::move(words)); + return fix_words(utf8_get_search_words(name)); } void Hints::add_word(const string &word, KeyT key, std::map> &word_to_keys) { diff --git a/tdutils/td/utils/unicode.h b/tdutils/td/utils/unicode.h index ad9f50d44..9012e4633 100644 --- a/tdutils/td/utils/unicode.h +++ b/tdutils/td/utils/unicode.h @@ -16,7 +16,7 @@ UnicodeSimpleCategory get_unicode_simple_category(uint32 code); /** * Prepares unicode character for search, leaving only digits and lowercased letters. - * Return code of replacing character or 0 if the character should be skipped. + * Returns code of replacing character or 0 if the character should be skipped. 
*/ uint32 prepare_search_character(uint32 code); diff --git a/tdutils/td/utils/utf8.cpp b/tdutils/td/utils/utf8.cpp index a1e771d32..16c31e5b2 100644 --- a/tdutils/td/utils/utf8.cpp +++ b/tdutils/td/utils/utf8.cpp @@ -112,6 +112,42 @@ string utf8_to_lower(Slice str) { return result; } +vector utf8_get_search_words(Slice str) { + bool in_word = false; + string word; + vector words; + auto pos = str.ubegin(); + auto end = str.uend(); + while (pos != end) { + uint32 code; + pos = next_utf8_unsafe(pos, &code); + + code = prepare_search_character(code); + if (code == 0) { + continue; + } + if (code == ' ') { + if (in_word) { + words.push_back(std::move(word)); + word.clear(); + in_word = false; + } + } else { + in_word = true; + code = remove_diacritics(code); + append_utf8_character(word, code); + } + } + if (in_word) { + words.push_back(std::move(word)); + } + return words; +} + +string utf8_prepare_search_string(Slice str) { + return implode(utf8_get_search_words(str)); +} + string utf8_encode(CSlice data) { if (check_utf8(data)) { return data.str(); diff --git a/tdutils/td/utils/utf8.h b/tdutils/td/utils/utf8.h index 1247d9e7d..ff2b0ad1e 100644 --- a/tdutils/td/utils/utf8.h +++ b/tdutils/td/utils/utf8.h @@ -86,6 +86,12 @@ Slice utf8_utf16_substr(Slice str, size_t offset, size_t length); /// Returns UTF-8 string converted to lower case. string utf8_to_lower(Slice str); +/// Returns UTF-8 string split into words for search. +vector utf8_get_search_words(Slice str); + +/// Returns UTF-8 string prepared for search, leaving only digits and lowercased letters. +string utf8_prepare_search_string(Slice str); + /// Returns valid UTF-8 representation of the string. string utf8_encode(CSlice data);