From 5403d0bf30aa89f36c9a653503f24caffeaaffb3 Mon Sep 17 00:00:00 2001 From: levlam Date: Tue, 31 Jul 2018 14:13:19 +0300 Subject: [PATCH] Support transliterations in Hints search. GitOrigin-RevId: 391622f2e02defa39564a14c51e8a1b03b751b96 --- tdutils/td/utils/Hints.cpp | 107 +++++++++++++++++++++++----------- tdutils/td/utils/Hints.h | 9 +++ tdutils/td/utils/translit.cpp | 10 ++-- tdutils/td/utils/translit.h | 2 +- tdutils/test/misc.cpp | 6 +- 5 files changed, 93 insertions(+), 41 deletions(-) diff --git a/tdutils/td/utils/Hints.cpp b/tdutils/td/utils/Hints.cpp index 1e7449a6..68ea8ec9 100644 --- a/tdutils/td/utils/Hints.cpp +++ b/tdutils/td/utils/Hints.cpp @@ -9,6 +9,7 @@ #include "td/utils/logging.h" #include "td/utils/misc.h" #include "td/utils/Slice.h" +#include "td/utils/translit.h" #include "td/utils/unicode.h" #include "td/utils/utf8.h" @@ -16,6 +17,22 @@ namespace td { +vector Hints::fix_words(vector words) { + std::sort(words.begin(), words.end()); + + size_t new_words_size = 0; + for (size_t i = 0; i != words.size(); i++) { + if (i == words.size() - 1 || !begins_with(words[i + 1], words[i])) { + if (i != new_words_size) { + words[new_words_size] = std::move(words[i]); + } + new_words_size++; + } + } + words.resize(new_words_size); + return words; +} + vector Hints::get_words(Slice name) { bool in_word = false; string word; @@ -44,20 +61,27 @@ vector Hints::get_words(Slice name) { if (in_word) { words.push_back(std::move(word)); } - std::sort(words.begin(), words.end()); - size_t new_words_size = 0; - for (size_t i = 0; i != words.size(); i++) { - if (i == words.size() - 1 || !begins_with(words[i + 1], words[i])) { - if (i != new_words_size) { - words[new_words_size] = std::move(words[i]); - } - // LOG(ERROR) << "Get word " << words[new_words_size]; - new_words_size++; - } + return fix_words(std::move(words)); +} + +void Hints::add_word(const string &word, KeyT key, std::map> &word_to_keys) { + vector &keys = word_to_keys[word]; + CHECK(std::find(keys.begin(), keys.end(), key) == keys.end()); + keys.push_back(key); +} + +void Hints::delete_word(const string &word, KeyT key, std::map> &word_to_keys) { + vector &keys = word_to_keys[word]; + auto key_it = std::find(keys.begin(), keys.end(), key); + CHECK(key_it != keys.end()); + if (keys.size() == 1) { + word_to_keys.erase(word); + } else { + CHECK(keys.size() > 1); + *key_it = keys.back(); + keys.pop_back(); } - words.resize(new_words_size); - return words; } void Hints::add(KeyT key, Slice name) { @@ -67,19 +91,19 @@ void Hints::add(KeyT key, Slice name) { if (it->second == name) { return; } - auto old_words = get_words(it->second); - for (auto &old_word : old_words) { - vector &keys = word_to_keys_[old_word]; - auto key_it = std::find(keys.begin(), keys.end(), key); - CHECK(key_it != keys.end()); - if (keys.size() == 1) { - word_to_keys_.erase(old_word); - } else { - CHECK(keys.size() > 1); - *key_it = keys.back(); - keys.pop_back(); + vector old_transliterations; + for (auto &old_word : get_words(it->second)) { + delete_word(old_word, key, word_to_keys_); + + for (auto &w : get_word_transliterations(old_word, false)) { + if (w != old_word) { + old_transliterations.push_back(std::move(w)); + } } } + for (auto &word : fix_words(old_transliterations)) { + delete_word(word, key, translit_word_to_keys_); + } } if (name.empty()) { if (it != key_to_name_.end()) { @@ -88,12 +112,21 @@ void Hints::add(KeyT key, Slice name) { key_to_rating_.erase(key); return; } - auto words = get_words(name); - for (auto &word : words) { - vector &keys = word_to_keys_[word]; - CHECK(std::find(keys.begin(), keys.end(), key) == keys.end()); - keys.push_back(key); + + vector transliterations; + for (auto &word : get_words(name)) { + add_word(word, key, word_to_keys_); + + for (auto &w : get_word_transliterations(word, false)) { + if (w != word) { + transliterations.push_back(std::move(w)); + } + } } + for (auto &word : fix_words(transliterations)) { + add_word(word, key, translit_word_to_keys_); + } + key_to_name_[key] = name.str(); } @@ -102,14 +135,22 @@ void Hints::set_rating(KeyT key, RatingT rating) { key_to_rating_[key] = rating; } -vector Hints::search_word(const string &word) const { - // LOG(ERROR) << "Search word " << word; - vector results; - auto it = word_to_keys_.lower_bound(word); - while (it != word_to_keys_.end() && begins_with(it->first, word)) { +void Hints::add_search_results(vector &results, const string &word, + const std::map> &word_to_keys) { + LOG(DEBUG) << "Search for word " << word; + auto it = word_to_keys.lower_bound(word); + while (it != word_to_keys.end() && begins_with(it->first, word)) { results.insert(results.end(), it->second.begin(), it->second.end()); ++it; } +} + +vector Hints::search_word(const string &word) const { + vector results; + add_search_results(results, word, translit_word_to_keys_); + for (auto w : get_word_transliterations(word, true)) { + add_search_results(results, w, word_to_keys_); + } std::sort(results.begin(), results.end()); results.erase(std::unique(results.begin(), results.end()), results.end()); diff --git a/tdutils/td/utils/Hints.h b/tdutils/td/utils/Hints.h index 64589668..2855ba2b 100644 --- a/tdutils/td/utils/Hints.h +++ b/tdutils/td/utils/Hints.h @@ -43,11 +43,20 @@ class Hints { private: std::map> word_to_keys_; + std::map> translit_word_to_keys_; std::unordered_map key_to_name_; std::unordered_map key_to_rating_; + static void add_word(const string &word, KeyT key, std::map> &word_to_keys); + static void delete_word(const string &word, KeyT key, std::map> &word_to_keys); + + static vector fix_words(vector words); + static vector get_words(Slice name); + static void add_search_results(vector &results, const string &word, + const std::map> &word_to_keys); + vector search_word(const string &word) const; class CompareByRating { diff --git a/tdutils/td/utils/translit.cpp b/tdutils/td/utils/translit.cpp index 13f19f09..ebf18604 100644 --- a/tdutils/td/utils/translit.cpp +++ b/tdutils/td/utils/translit.cpp @@ -46,7 +46,7 @@ static const std::vector> &get_ru_to_en_complex_rules( return rules; } -void add_word_transliterations(vector &result, Slice word, +void add_word_transliterations(vector &result, Slice word, bool allow_partial, const std::unordered_map &simple_rules, const std::vector> &complex_rules) { string s; @@ -78,7 +78,7 @@ void add_word_transliterations(vector &result, Slice word, s.append(rule.second); break; } - if (begins_with(rule.first, suffix)) { + if (allow_partial && begins_with(rule.first, suffix)) { result.push_back(s + rule.second); } } @@ -100,11 +100,11 @@ void add_word_transliterations(vector &result, Slice word, } } -vector get_word_transliterations(Slice word) { +vector get_word_transliterations(Slice word, bool allow_partial) { vector result; - add_word_transliterations(result, word, get_en_to_ru_simple_rules(), get_en_to_ru_complex_rules()); - add_word_transliterations(result, word, get_ru_to_en_simple_rules(), get_ru_to_en_complex_rules()); + add_word_transliterations(result, word, allow_partial, get_en_to_ru_simple_rules(), get_en_to_ru_complex_rules()); + add_word_transliterations(result, word, allow_partial, get_ru_to_en_simple_rules(), get_ru_to_en_complex_rules()); std::sort(result.begin(), result.end()); result.erase(std::unique(result.begin(), result.end()), result.end()); diff --git a/tdutils/td/utils/translit.h b/tdutils/td/utils/translit.h index 1a132fa1..437b13be 100644 --- a/tdutils/td/utils/translit.h +++ b/tdutils/td/utils/translit.h @@ -11,6 +11,6 @@ namespace td { -vector get_word_transliterations(Slice word); +vector get_word_transliterations(Slice word, bool allow_partial); } // namespace td diff --git a/tdutils/test/misc.cpp b/tdutils/test/misc.cpp index 9287e4c0..32a29d87 100644 --- a/tdutils/test/misc.cpp +++ b/tdutils/test/misc.cpp @@ -366,8 +366,8 @@ TEST(Misc, to_wstring) { } #endif -static void test_translit(string word, vector result) { - ASSERT_EQ(result, get_word_transliterations(word)); +static void test_translit(string word, vector result, bool allow_partial = true) { + ASSERT_EQ(result, get_word_transliterations(word, allow_partial)); } TEST(Misc, translit) { @@ -387,4 +387,6 @@ TEST(Misc, translit) { test_translit("artyom", {"artem", "artyom", "артем", "артиом"}); test_translit("arty", {"arte", "arty", "арте", "арти", "артю", "артя"}); test_translit("льи", {"li", "lia", "ly", "льи"}); + test_translit("y", {"y", "и"}, false); + test_translit("yo", {"e", "yo", "е", "ио"}, false); }