Support transliterations in Hints search.

GitOrigin-RevId: 391622f2e02defa39564a14c51e8a1b03b751b96
This commit is contained in:
levlam 2018-07-31 14:13:19 +03:00
parent b6bfb3ddbf
commit 5403d0bf30
5 changed files with 93 additions and 41 deletions

View File

@ -9,6 +9,7 @@
#include "td/utils/logging.h"
#include "td/utils/misc.h"
#include "td/utils/Slice.h"
#include "td/utils/translit.h"
#include "td/utils/unicode.h"
#include "td/utils/utf8.h"
@ -16,6 +17,22 @@
namespace td {
// Normalizes a word list: sorts it and drops every word that is a prefix of
// (or equal to) the word that follows it in sorted order. Because the list is
// sorted, a word that is a prefix of any later word is also a prefix of its
// immediate successor, so checking only the neighbour is sufficient. The
// result is sorted and contains no duplicates and no word that is a prefix of
// another retained word.
vector<string> Hints::fix_words(vector<string> words) {
  std::sort(words.begin(), words.end());

  const size_t total = words.size();
  size_t kept = 0;  // number of words retained so far; also the next write slot
  for (size_t pos = 0; pos < total; pos++) {
    const bool has_next = pos + 1 != total;
    if (has_next && begins_with(words[pos + 1], words[pos])) {
      // Covered by the next word (prefix or duplicate): skip it.
      continue;
    }
    if (kept != pos) {
      // Compact in place; slots before pos are never read again.
      words[kept] = std::move(words[pos]);
    }
    kept++;
  }

  words.resize(kept);
  return words;
}
vector<string> Hints::get_words(Slice name) {
bool in_word = false;
string word;
@ -44,20 +61,27 @@ vector<string> Hints::get_words(Slice name) {
if (in_word) {
words.push_back(std::move(word));
}
std::sort(words.begin(), words.end());
size_t new_words_size = 0;
for (size_t i = 0; i != words.size(); i++) {
if (i == words.size() - 1 || !begins_with(words[i + 1], words[i])) {
if (i != new_words_size) {
words[new_words_size] = std::move(words[i]);
}
// LOG(ERROR) << "Get word " << words[new_words_size];
new_words_size++;
}
return fix_words(std::move(words));
}
// Registers `key` under `word` in the given word -> keys index.
//
// word          the index entry to extend
// key           the key to append to that entry's key list
// word_to_keys  the index to modify; passed explicitly so the same helper can
//               serve more than one index map
void Hints::add_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys) {
// std::map::operator[] creates an empty key list if `word` is not present yet.
vector<KeyT> &keys = word_to_keys[word];
// Invariant: a key is stored at most once per word (linear scan of the list).
// NOTE(review): CHECK is a project macro; presumably it aborts when the condition is false - confirm.
CHECK(std::find(keys.begin(), keys.end(), key) == keys.end());
keys.push_back(key);
}
void Hints::delete_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys) {
vector<KeyT> &keys = word_to_keys[word];
auto key_it = std::find(keys.begin(), keys.end(), key);
CHECK(key_it != keys.end());
if (keys.size() == 1) {
word_to_keys.erase(word);
} else {
CHECK(keys.size() > 1);
*key_it = keys.back();
keys.pop_back();
}
words.resize(new_words_size);
return words;
}
void Hints::add(KeyT key, Slice name) {
@ -67,19 +91,19 @@ void Hints::add(KeyT key, Slice name) {
if (it->second == name) {
return;
}
auto old_words = get_words(it->second);
for (auto &old_word : old_words) {
vector<KeyT> &keys = word_to_keys_[old_word];
auto key_it = std::find(keys.begin(), keys.end(), key);
CHECK(key_it != keys.end());
if (keys.size() == 1) {
word_to_keys_.erase(old_word);
} else {
CHECK(keys.size() > 1);
*key_it = keys.back();
keys.pop_back();
vector<string> old_transliterations;
for (auto &old_word : get_words(it->second)) {
delete_word(old_word, key, word_to_keys_);
for (auto &w : get_word_transliterations(old_word, false)) {
if (w != old_word) {
old_transliterations.push_back(std::move(w));
}
}
}
for (auto &word : fix_words(old_transliterations)) {
delete_word(word, key, translit_word_to_keys_);
}
}
if (name.empty()) {
if (it != key_to_name_.end()) {
@ -88,12 +112,21 @@ void Hints::add(KeyT key, Slice name) {
key_to_rating_.erase(key);
return;
}
auto words = get_words(name);
for (auto &word : words) {
vector<KeyT> &keys = word_to_keys_[word];
CHECK(std::find(keys.begin(), keys.end(), key) == keys.end());
keys.push_back(key);
vector<string> transliterations;
for (auto &word : get_words(name)) {
add_word(word, key, word_to_keys_);
for (auto &w : get_word_transliterations(word, false)) {
if (w != word) {
transliterations.push_back(std::move(w));
}
}
}
for (auto &word : fix_words(transliterations)) {
add_word(word, key, translit_word_to_keys_);
}
key_to_name_[key] = name.str();
}
@ -102,14 +135,22 @@ void Hints::set_rating(KeyT key, RatingT rating) {
key_to_rating_[key] = rating;
}
vector<Hints::KeyT> Hints::search_word(const string &word) const {
// LOG(ERROR) << "Search word " << word;
vector<KeyT> results;
auto it = word_to_keys_.lower_bound(word);
while (it != word_to_keys_.end() && begins_with(it->first, word)) {
// Appends to `results` the keys of every entry in `word_to_keys` whose word
// starts with `word`. Keys are appended as stored, in map order; the output
// is not sorted or deduplicated here.
void Hints::add_search_results(vector<KeyT> &results, const string &word,
                               const std::map<string, vector<KeyT>> &word_to_keys) {
  LOG(DEBUG) << "Search for word " << word;
  // All words sharing the prefix form one contiguous run in the ordered map,
  // beginning at the first entry that is not less than `word`.
  for (auto entry = word_to_keys.lower_bound(word); entry != word_to_keys.end(); ++entry) {
    if (!begins_with(entry->first, word)) {
      break;  // left the run of words with this prefix
    }
    const auto &matched_keys = entry->second;
    results.insert(results.end(), matched_keys.begin(), matched_keys.end());
  }
}
vector<Hints::KeyT> Hints::search_word(const string &word) const {
vector<KeyT> results;
add_search_results(results, word, translit_word_to_keys_);
for (auto w : get_word_transliterations(word, true)) {
add_search_results(results, w, word_to_keys_);
}
std::sort(results.begin(), results.end());
results.erase(std::unique(results.begin(), results.end()), results.end());

View File

@ -43,11 +43,20 @@ class Hints {
private:
std::map<string, vector<KeyT>> word_to_keys_;
std::map<string, vector<KeyT>> translit_word_to_keys_;
std::unordered_map<KeyT, string> key_to_name_;
std::unordered_map<KeyT, RatingT> key_to_rating_;
static void add_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys);
static void delete_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys);
static vector<string> fix_words(vector<string> words);
static vector<string> get_words(Slice name);
static void add_search_results(vector<KeyT> &results, const string &word,
const std::map<string, vector<KeyT>> &word_to_keys);
vector<KeyT> search_word(const string &word) const;
class CompareByRating {

View File

@ -46,7 +46,7 @@ static const std::vector<std::pair<string, string>> &get_ru_to_en_complex_rules(
return rules;
}
void add_word_transliterations(vector<string> &result, Slice word,
void add_word_transliterations(vector<string> &result, Slice word, bool allow_partial,
const std::unordered_map<uint32, string> &simple_rules,
const std::vector<std::pair<string, string>> &complex_rules) {
string s;
@ -78,7 +78,7 @@ void add_word_transliterations(vector<string> &result, Slice word,
s.append(rule.second);
break;
}
if (begins_with(rule.first, suffix)) {
if (allow_partial && begins_with(rule.first, suffix)) {
result.push_back(s + rule.second);
}
}
@ -100,11 +100,11 @@ void add_word_transliterations(vector<string> &result, Slice word,
}
}
vector<string> get_word_transliterations(Slice word) {
vector<string> get_word_transliterations(Slice word, bool allow_partial) {
vector<string> result;
add_word_transliterations(result, word, get_en_to_ru_simple_rules(), get_en_to_ru_complex_rules());
add_word_transliterations(result, word, get_ru_to_en_simple_rules(), get_ru_to_en_complex_rules());
add_word_transliterations(result, word, allow_partial, get_en_to_ru_simple_rules(), get_en_to_ru_complex_rules());
add_word_transliterations(result, word, allow_partial, get_ru_to_en_simple_rules(), get_ru_to_en_complex_rules());
std::sort(result.begin(), result.end());
result.erase(std::unique(result.begin(), result.end()), result.end());

View File

@ -11,6 +11,6 @@
namespace td {
vector<string> get_word_transliterations(Slice word);
vector<string> get_word_transliterations(Slice word, bool allow_partial);
} // namespace td

View File

@ -366,8 +366,8 @@ TEST(Misc, to_wstring) {
}
#endif
static void test_translit(string word, vector<string> result) {
ASSERT_EQ(result, get_word_transliterations(word));
static void test_translit(string word, vector<string> result, bool allow_partial = true) {
ASSERT_EQ(result, get_word_transliterations(word, allow_partial));
}
TEST(Misc, translit) {
@ -387,4 +387,6 @@ TEST(Misc, translit) {
test_translit("artyom", {"artem", "artyom", "артем", "артиом"});
test_translit("arty", {"arte", "arty", "арте", "арти", "артю", "артя"});
test_translit("льи", {"li", "lia", "ly", "льи"});
test_translit("y", {"y", "и"}, false);
test_translit("yo", {"e", "yo", "е", "ио"}, false);
}