Support transliterations in Hints search.
GitOrigin-RevId: 391622f2e02defa39564a14c51e8a1b03b751b96
This commit is contained in:
parent
b6bfb3ddbf
commit
5403d0bf30
@ -9,6 +9,7 @@
|
||||
#include "td/utils/logging.h"
|
||||
#include "td/utils/misc.h"
|
||||
#include "td/utils/Slice.h"
|
||||
#include "td/utils/translit.h"
|
||||
#include "td/utils/unicode.h"
|
||||
#include "td/utils/utf8.h"
|
||||
|
||||
@ -16,6 +17,22 @@
|
||||
|
||||
namespace td {
|
||||
|
||||
// Sorts the given words and removes every word that is a prefix of (or equal to)
// the word that follows it in sorted order. The surviving words are returned in
// sorted order.
vector<string> Hints::fix_words(vector<string> words) {
  std::sort(words.begin(), words.end());

  const size_t total = words.size();
  size_t kept = 0;  // number of words retained so far; also the next write position
  for (size_t pos = 0; pos != total; ++pos) {
    const bool has_next = pos + 1 != total;
    if (has_next && begins_with(words[pos + 1], words[pos])) {
      // the following word starts with this one, so this word is dropped
      continue;
    }
    if (kept != pos) {
      // compact in place; the guard avoids a self-move
      words[kept] = std::move(words[pos]);
    }
    ++kept;
  }

  words.resize(kept);
  return words;
}
|
||||
|
||||
vector<string> Hints::get_words(Slice name) {
|
||||
bool in_word = false;
|
||||
string word;
|
||||
@ -44,20 +61,27 @@ vector<string> Hints::get_words(Slice name) {
|
||||
if (in_word) {
|
||||
words.push_back(std::move(word));
|
||||
}
|
||||
std::sort(words.begin(), words.end());
|
||||
|
||||
size_t new_words_size = 0;
|
||||
for (size_t i = 0; i != words.size(); i++) {
|
||||
if (i == words.size() - 1 || !begins_with(words[i + 1], words[i])) {
|
||||
if (i != new_words_size) {
|
||||
words[new_words_size] = std::move(words[i]);
|
||||
}
|
||||
// LOG(ERROR) << "Get word " << words[new_words_size];
|
||||
new_words_size++;
|
||||
}
|
||||
return fix_words(std::move(words));
|
||||
}
|
||||
|
||||
// Registers `key` under `word` in the given word -> keys index.
// The key must not already be registered for that word; this is enforced with CHECK.
void Hints::add_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys) {
  // operator[] creates an empty key list for a word that is not in the index yet
  auto &keys = word_to_keys[word];
  CHECK(std::find(keys.begin(), keys.end(), key) == keys.end());
  keys.emplace_back(key);
}
|
||||
|
||||
void Hints::delete_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys) {
|
||||
vector<KeyT> &keys = word_to_keys[word];
|
||||
auto key_it = std::find(keys.begin(), keys.end(), key);
|
||||
CHECK(key_it != keys.end());
|
||||
if (keys.size() == 1) {
|
||||
word_to_keys.erase(word);
|
||||
} else {
|
||||
CHECK(keys.size() > 1);
|
||||
*key_it = keys.back();
|
||||
keys.pop_back();
|
||||
}
|
||||
words.resize(new_words_size);
|
||||
return words;
|
||||
}
|
||||
|
||||
void Hints::add(KeyT key, Slice name) {
|
||||
@ -67,19 +91,19 @@ void Hints::add(KeyT key, Slice name) {
|
||||
if (it->second == name) {
|
||||
return;
|
||||
}
|
||||
auto old_words = get_words(it->second);
|
||||
for (auto &old_word : old_words) {
|
||||
vector<KeyT> &keys = word_to_keys_[old_word];
|
||||
auto key_it = std::find(keys.begin(), keys.end(), key);
|
||||
CHECK(key_it != keys.end());
|
||||
if (keys.size() == 1) {
|
||||
word_to_keys_.erase(old_word);
|
||||
} else {
|
||||
CHECK(keys.size() > 1);
|
||||
*key_it = keys.back();
|
||||
keys.pop_back();
|
||||
vector<string> old_transliterations;
|
||||
for (auto &old_word : get_words(it->second)) {
|
||||
delete_word(old_word, key, word_to_keys_);
|
||||
|
||||
for (auto &w : get_word_transliterations(old_word, false)) {
|
||||
if (w != old_word) {
|
||||
old_transliterations.push_back(std::move(w));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto &word : fix_words(old_transliterations)) {
|
||||
delete_word(word, key, translit_word_to_keys_);
|
||||
}
|
||||
}
|
||||
if (name.empty()) {
|
||||
if (it != key_to_name_.end()) {
|
||||
@ -88,12 +112,21 @@ void Hints::add(KeyT key, Slice name) {
|
||||
key_to_rating_.erase(key);
|
||||
return;
|
||||
}
|
||||
auto words = get_words(name);
|
||||
for (auto &word : words) {
|
||||
vector<KeyT> &keys = word_to_keys_[word];
|
||||
CHECK(std::find(keys.begin(), keys.end(), key) == keys.end());
|
||||
keys.push_back(key);
|
||||
|
||||
vector<string> transliterations;
|
||||
for (auto &word : get_words(name)) {
|
||||
add_word(word, key, word_to_keys_);
|
||||
|
||||
for (auto &w : get_word_transliterations(word, false)) {
|
||||
if (w != word) {
|
||||
transliterations.push_back(std::move(w));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto &word : fix_words(transliterations)) {
|
||||
add_word(word, key, translit_word_to_keys_);
|
||||
}
|
||||
|
||||
key_to_name_[key] = name.str();
|
||||
}
|
||||
|
||||
@ -102,14 +135,22 @@ void Hints::set_rating(KeyT key, RatingT rating) {
|
||||
key_to_rating_[key] = rating;
|
||||
}
|
||||
|
||||
vector<Hints::KeyT> Hints::search_word(const string &word) const {
|
||||
// LOG(ERROR) << "Search word " << word;
|
||||
vector<KeyT> results;
|
||||
auto it = word_to_keys_.lower_bound(word);
|
||||
while (it != word_to_keys_.end() && begins_with(it->first, word)) {
|
||||
// Appends to `results` the keys of every entry in `word_to_keys` whose word starts with `word`.
// Nothing is deduplicated here: a key reachable through several matching words is appended
// once per match.
void Hints::add_search_results(vector<KeyT> &results, const string &word,
                               const std::map<string, vector<KeyT>> &word_to_keys) {
  LOG(DEBUG) << "Search for word " << word;

  // The map is ordered, so all words having `word` as a prefix form one contiguous run
  // beginning at lower_bound(word).
  for (auto entry = word_to_keys.lower_bound(word);
       entry != word_to_keys.end() && begins_with(entry->first, word); ++entry) {
    const auto &matched_keys = entry->second;
    results.insert(results.end(), matched_keys.begin(), matched_keys.end());
  }
}
|
||||
|
||||
vector<Hints::KeyT> Hints::search_word(const string &word) const {
|
||||
vector<KeyT> results;
|
||||
add_search_results(results, word, translit_word_to_keys_);
|
||||
for (auto w : get_word_transliterations(word, true)) {
|
||||
add_search_results(results, w, word_to_keys_);
|
||||
}
|
||||
|
||||
std::sort(results.begin(), results.end());
|
||||
results.erase(std::unique(results.begin(), results.end()), results.end());
|
||||
|
@ -43,11 +43,20 @@ class Hints {
|
||||
|
||||
private:
|
||||
std::map<string, vector<KeyT>> word_to_keys_;
|
||||
std::map<string, vector<KeyT>> translit_word_to_keys_;
|
||||
std::unordered_map<KeyT, string> key_to_name_;
|
||||
std::unordered_map<KeyT, RatingT> key_to_rating_;
|
||||
|
||||
static void add_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys);
|
||||
static void delete_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys);
|
||||
|
||||
static vector<string> fix_words(vector<string> words);
|
||||
|
||||
static vector<string> get_words(Slice name);
|
||||
|
||||
static void add_search_results(vector<KeyT> &results, const string &word,
|
||||
const std::map<string, vector<KeyT>> &word_to_keys);
|
||||
|
||||
vector<KeyT> search_word(const string &word) const;
|
||||
|
||||
class CompareByRating {
|
||||
|
@ -46,7 +46,7 @@ static const std::vector<std::pair<string, string>> &get_ru_to_en_complex_rules(
|
||||
return rules;
|
||||
}
|
||||
|
||||
void add_word_transliterations(vector<string> &result, Slice word,
|
||||
void add_word_transliterations(vector<string> &result, Slice word, bool allow_partial,
|
||||
const std::unordered_map<uint32, string> &simple_rules,
|
||||
const std::vector<std::pair<string, string>> &complex_rules) {
|
||||
string s;
|
||||
@ -78,7 +78,7 @@ void add_word_transliterations(vector<string> &result, Slice word,
|
||||
s.append(rule.second);
|
||||
break;
|
||||
}
|
||||
if (begins_with(rule.first, suffix)) {
|
||||
if (allow_partial && begins_with(rule.first, suffix)) {
|
||||
result.push_back(s + rule.second);
|
||||
}
|
||||
}
|
||||
@ -100,11 +100,11 @@ void add_word_transliterations(vector<string> &result, Slice word,
|
||||
}
|
||||
}
|
||||
|
||||
vector<string> get_word_transliterations(Slice word) {
|
||||
vector<string> get_word_transliterations(Slice word, bool allow_partial) {
|
||||
vector<string> result;
|
||||
|
||||
add_word_transliterations(result, word, get_en_to_ru_simple_rules(), get_en_to_ru_complex_rules());
|
||||
add_word_transliterations(result, word, get_ru_to_en_simple_rules(), get_ru_to_en_complex_rules());
|
||||
add_word_transliterations(result, word, allow_partial, get_en_to_ru_simple_rules(), get_en_to_ru_complex_rules());
|
||||
add_word_transliterations(result, word, allow_partial, get_ru_to_en_simple_rules(), get_ru_to_en_complex_rules());
|
||||
|
||||
std::sort(result.begin(), result.end());
|
||||
result.erase(std::unique(result.begin(), result.end()), result.end());
|
||||
|
@ -11,6 +11,6 @@
|
||||
|
||||
namespace td {
|
||||
|
||||
vector<string> get_word_transliterations(Slice word);
|
||||
vector<string> get_word_transliterations(Slice word, bool allow_partial);
|
||||
|
||||
} // namespace td
|
||||
|
@ -366,8 +366,8 @@ TEST(Misc, to_wstring) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static void test_translit(string word, vector<string> result) {
|
||||
ASSERT_EQ(result, get_word_transliterations(word));
|
||||
static void test_translit(string word, vector<string> result, bool allow_partial = true) {
|
||||
ASSERT_EQ(result, get_word_transliterations(word, allow_partial));
|
||||
}
|
||||
|
||||
TEST(Misc, translit) {
|
||||
@ -387,4 +387,6 @@ TEST(Misc, translit) {
|
||||
test_translit("artyom", {"artem", "artyom", "артем", "артиом"});
|
||||
test_translit("arty", {"arte", "arty", "арте", "арти", "артю", "артя"});
|
||||
test_translit("льи", {"li", "lia", "ly", "льи"});
|
||||
test_translit("y", {"y", "и"}, false);
|
||||
test_translit("yo", {"e", "yo", "е", "ио"}, false);
|
||||
}
|
||||
|
Reference in New Issue
Block a user