Add utf8_prepare_search_string.

This commit is contained in:
levlam 2022-09-27 14:16:09 +03:00
parent 0c21d0d596
commit 4a6e0aea43
4 changed files with 44 additions and 31 deletions

View File

@ -35,36 +35,7 @@ vector<string> Hints::fix_words(vector<string> words) {
}
/// Splits `name` into search words and passes them through fix_words().
///
/// The splitting itself (character normalization, separator handling and
/// diacritics removal) is delegated to utf8_get_search_words(), so that the
/// same word-extraction logic is shared with the other search helpers instead
/// of being duplicated here.
vector<string> Hints::get_words(Slice name) {
  return fix_words(utf8_get_search_words(name));
}
void Hints::add_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys) {

View File

@ -16,7 +16,7 @@ UnicodeSimpleCategory get_unicode_simple_category(uint32 code);
/**
* Prepares unicode character for search, leaving only digits and lowercased letters.
* Return code of replacing character or 0 if the character should be skipped.
* Returns code of replacing character or 0 if the character should be skipped.
*/
uint32 prepare_search_character(uint32 code);

View File

@ -112,6 +112,42 @@ string utf8_to_lower(Slice str) {
return result;
}
/// Splits a UTF-8 string into words for search.
///
/// Every decoded code point is passed through prepare_search_character();
/// a result of 0 means the character is skipped, a space ends the current
/// word, and anything else is appended to the current word after
/// remove_diacritics() has been applied to it.
vector<string> utf8_get_search_words(Slice str) {
  vector<string> result;
  string current;
  bool has_word = false;

  // Moves the word collected so far (if any) into the result and resets the state.
  auto flush_word = [&] {
    if (!has_word) {
      return;
    }
    result.push_back(std::move(current));
    current.clear();  // bring the moved-from string back to a known empty state
    has_word = false;
  };

  auto ptr = str.ubegin();
  const auto finish = str.uend();
  while (ptr != finish) {
    uint32 raw_code;
    ptr = next_utf8_unsafe(ptr, &raw_code);

    const uint32 code = prepare_search_character(raw_code);
    if (code == 0) {
      // the character must be skipped entirely; it doesn't even split words
      continue;
    }
    if (code == ' ') {
      // word separator
      flush_word();
      continue;
    }

    has_word = true;
    append_utf8_character(current, remove_diacritics(code));
  }

  // the string may end in the middle of a word
  flush_word();
  return result;
}
/// Returns the search words of the UTF-8 string (see utf8_get_search_words())
/// joined back into a single string by implode().
string utf8_prepare_search_string(Slice str) {
  const vector<string> search_words = utf8_get_search_words(str);
  return implode(search_words);
}
string utf8_encode(CSlice data) {
if (check_utf8(data)) {
return data.str();

View File

@ -86,6 +86,12 @@ Slice utf8_utf16_substr(Slice str, size_t offset, size_t length);
/// Returns UTF-8 string converted to lower case.
string utf8_to_lower(Slice str);
/// Returns the UTF-8 string split into words prepared for search.
vector<string> utf8_get_search_words(Slice str);
/// Returns UTF-8 string prepared for search, leaving only digits and lowercased letters.
string utf8_prepare_search_string(Slice str);
/// Returns valid UTF-8 representation of the string.
string utf8_encode(CSlice data);