2018-12-31 20:04:05 +01:00
|
|
|
//
|
2022-01-01 01:35:39 +01:00
|
|
|
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2022
|
2018-12-31 20:04:05 +01:00
|
|
|
//
|
|
|
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
|
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
//
|
|
|
|
#include "td/utils/Hints.h"
|
|
|
|
|
2021-01-01 13:59:53 +01:00
|
|
|
#include "td/utils/algorithm.h"
|
2018-12-31 20:04:05 +01:00
|
|
|
#include "td/utils/logging.h"
|
|
|
|
#include "td/utils/misc.h"
|
|
|
|
#include "td/utils/Slice.h"
|
2018-07-31 13:13:19 +02:00
|
|
|
#include "td/utils/translit.h"
|
2018-12-31 20:04:05 +01:00
|
|
|
#include "td/utils/unicode.h"
|
|
|
|
#include "td/utils/utf8.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
|
|
|
|
namespace td {
|
|
|
|
|
2018-07-31 13:13:19 +02:00
|
|
|
// Sorts the words and drops every word that is a prefix of the word that
// follows it in sorted order (exact duplicates are dropped the same way).
// Returns the remaining words, still sorted.
vector<string> Hints::fix_words(vector<string> words) {
  std::sort(words.begin(), words.end());

  const size_t total = words.size();
  size_t kept = 0;  // number of surviving words, compacted at the front
  for (size_t pos = 0; pos < total; ++pos) {
    const bool has_next = pos + 1 != total;
    if (has_next && begins_with(words[pos + 1], words[pos])) {
      // words[pos] is a prefix of its successor, so it is not kept
      continue;
    }
    if (kept != pos) {
      words[kept] = std::move(words[pos]);
    }
    ++kept;
  }

  words.resize(kept);
  return words;
}
|
|
|
|
|
2022-08-19 15:37:44 +02:00
|
|
|
// Splits `name` into search words.
//
// Every decoded code point is passed through prepare_search_character():
// a result of 0 is skipped, a result of ' ' ends the current word, and any
// other result is passed through remove_diacritics() and appended to the
// current word. The collected words are then passed through fix_words().
vector<string> Hints::get_words(Slice name) {
  vector<string> result;
  string current;
  bool has_current = false;

  // Moves the word being built into the result and starts a new one.
  const auto flush_current = [&] {
    result.push_back(std::move(current));
    current.clear();
    has_current = false;
  };

  auto cursor = name.ubegin();
  const auto last = name.uend();
  while (cursor != last) {
    uint32 code;
    cursor = next_utf8_unsafe(cursor, &code);

    code = prepare_search_character(code);
    if (code == 0) {
      continue;
    }
    if (code != ' ') {
      has_current = true;
      append_utf8_character(current, remove_diacritics(code));
      continue;
    }
    // separator: finish the word, if there is one
    if (has_current) {
      flush_current();
    }
  }
  if (has_current) {
    flush_current();
  }

  return fix_words(std::move(result));
}
|
|
|
|
|
|
|
|
void Hints::add_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys) {
|
|
|
|
vector<KeyT> &keys = word_to_keys[word];
|
2019-10-22 01:12:58 +02:00
|
|
|
CHECK(!td::contains(keys, key));
|
2018-07-31 13:13:19 +02:00
|
|
|
keys.push_back(key);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Removes `key` from the key list stored for `word` in `word_to_keys`.
// The word must be present in the map and its list must contain the key.
// When the key was the only one in the list, the word's entry is erased.
// The order of the remaining keys is not preserved.
void Hints::delete_word(const string &word, KeyT key, std::map<string, vector<KeyT>> &word_to_keys) {
  // Look the word up once; unlike operator[], find() never inserts an empty
  // entry for an unknown word, and the iterator is reused for the erase below.
  auto word_it = word_to_keys.find(word);
  CHECK(word_it != word_to_keys.end());

  vector<KeyT> &keys = word_it->second;
  auto key_it = std::find(keys.begin(), keys.end(), key);
  CHECK(key_it != keys.end());

  if (keys.size() == 1) {
    word_to_keys.erase(word_it);
  } else {
    // overwrite the removed key with the last one and shrink the list
    *key_it = keys.back();
    keys.pop_back();
  }
}
|
|
|
|
|
|
|
|
// Associates `name` with `key`, replacing any previous name of the key.
//
// - If the key already has exactly this name, nothing is done.
// - Otherwise the words of the old name are removed from word_to_keys_ and
//   their transliterations from translit_word_to_keys_.
// - An empty `name` removes the key from key_to_name_ and key_to_rating_.
// - A non-empty `name` has its words added to word_to_keys_, their
//   transliterations (other than the word itself) added to
//   translit_word_to_keys_, and the name stored in key_to_name_.
void Hints::add(KeyT key, Slice name) {
  // LOG(ERROR) << "Add " << key << ": " << name;
  auto it = key_to_name_.find(key);
  if (it != key_to_name_.end()) {
    if (it->second == name) {
      return;
    }

    // unregister everything that was indexed for the old name
    vector<string> old_transliterations;
    for (auto &old_word : get_words(it->second)) {
      delete_word(old_word, key, word_to_keys_);

      for (auto &w : get_word_transliterations(old_word, false)) {
        if (w != old_word) {
          old_transliterations.push_back(std::move(w));
        }
      }
    }

    // fix_words takes its argument by value; the vector is not used again,
    // so move it instead of copying every string
    for (auto &word : fix_words(std::move(old_transliterations))) {
      delete_word(word, key, translit_word_to_keys_);
    }
  }

  if (name.empty()) {
    if (it != key_to_name_.end()) {
      key_to_name_.erase(it);
    }
    key_to_rating_.erase(key);
    return;
  }

  vector<string> transliterations;
  for (auto &word : get_words(name)) {
    add_word(word, key, word_to_keys_);

    for (auto &w : get_word_transliterations(word, false)) {
      if (w != word) {
        transliterations.push_back(std::move(w));
      }
    }
  }

  // same as above: hand the vector over instead of copying it
  for (auto &word : fix_words(std::move(transliterations))) {
    add_word(word, key, translit_word_to_keys_);
  }

  key_to_name_[key] = name.str();
}
|
|
|
|
|
|
|
|
// Stores `rating` for `key` in key_to_rating_, overwriting any rating that
// was stored for the key before.
void Hints::set_rating(KeyT key, RatingT rating) {
  // LOG(ERROR) << "Set rating " << key << ": " << rating;
  auto &stored_rating = key_to_rating_[key];
  stored_rating = rating;
}
|
|
|
|
|
2018-07-31 13:13:19 +02:00
|
|
|
// Appends to `results` the keys of every entry of `word_to_keys` whose word
// begins with `word`. Keys are appended as stored; duplicates are not
// removed here.
void Hints::add_search_results(vector<KeyT> &results, const string &word,
                               const std::map<string, vector<KeyT>> &word_to_keys) {
  LOG(DEBUG) << "Search for word " << word;
  // the map is ordered, so scan forward from the first word not less than
  // `word` and stop at the first word that does not begin with it
  for (auto it = word_to_keys.lower_bound(word); it != word_to_keys.end(); ++it) {
    if (!begins_with(it->first, word)) {
      break;
    }
    const auto &keys = it->second;
    results.insert(results.end(), keys.begin(), keys.end());
  }
}
|
|
|
|
|
|
|
|
// Collects the keys matching a single query word: prefix matches of the word
// itself among the stored transliterations, plus prefix matches of each of
// the word's transliterations among the stored words.
vector<Hints::KeyT> Hints::search_word(const string &word) const {
  vector<KeyT> found;
  add_search_results(found, word, translit_word_to_keys_);

  const auto variants = get_word_transliterations(word, true);
  for (const auto &variant : variants) {
    add_search_results(found, variant, word_to_keys_);
  }

  // NOTE(review): td::unique presumably sorts and deduplicates the keys;
  // the intersection in search() compares keys as if they were sorted
  td::unique(found);
  return found;
}
|
|
|
|
|
|
|
|
// Finds the keys matching every word of `query`.
//
// Returns a pair of (total number of matching keys, up to `limit` of those
// keys ordered by CompareByRating). A negative `limit` returns the number of
// known keys and an empty list. When the query contains no words, all known
// keys match if `return_all_for_empty_query` is set, and none otherwise.
std::pair<size_t, vector<Hints::KeyT>> Hints::search(Slice query, int32 limit, bool return_all_for_empty_query) const {
  // LOG(ERROR) << "Search " << query;
  vector<KeyT> results;

  if (limit < 0) {
    return {key_to_name_.size(), std::move(results)};
  }

  const auto words = get_words(query);
  if (words.empty()) {
    if (return_all_for_empty_query) {
      results.reserve(key_to_name_.size());
      for (const auto &entry : key_to_name_) {
        results.push_back(entry.first);
      }
    }
  } else {
    results = search_word(words[0]);

    for (size_t word_pos = 1; word_pos < words.size(); ++word_pos) {
      const vector<KeyT> keys = search_word(words[word_pos]);

      // intersect the two lists in place, writing matches to the front
      auto out = results.begin();
      auto lhs = results.begin();
      auto rhs = keys.begin();
      while (lhs != results.end() && rhs != keys.end()) {
        if (*lhs < *rhs) {
          ++lhs;
        } else if (*lhs > *rhs) {
          ++rhs;
        } else {
          *out = *lhs;
          ++out;
          ++lhs;
          ++rhs;
        }
      }
      results.erase(out, results.end());
    }
  }

  const auto total_size = results.size();
  const auto max_count = static_cast<size_t>(limit);
  if (total_size >= max_count) {
    // only the best `limit` keys are needed
    std::partial_sort(results.begin(), results.begin() + limit, results.end(), CompareByRating(key_to_rating_));
    results.resize(max_count);
  } else {
    std::sort(results.begin(), results.end(), CompareByRating(key_to_rating_));
  }

  return {total_size, std::move(results)};
}
|
|
|
|
|
|
|
|
// Returns true if a name is currently stored for `key`.
bool Hints::has_key(KeyT key) const {
  const auto matches = key_to_name_.count(key);
  return matches != 0;
}
|
|
|
|
|
|
|
|
// Returns the name stored for `key`, or an empty string if the key is unknown.
string Hints::key_to_string(KeyT key) const {
  const auto it = key_to_name_.find(key);
  if (it != key_to_name_.end()) {
    return it->second;
  }
  return string();
}
|
|
|
|
|
|
|
|
// Runs search() with an empty query and return_all_for_empty_query enabled,
// so every known key is a candidate for the result.
std::pair<size_t, vector<Hints::KeyT>> Hints::search_empty(int32 limit) const {
  const bool return_all_for_empty_query = true;
  return search(Slice(), limit, return_all_for_empty_query);
}
|
|
|
|
|
|
|
|
size_t Hints::size() const {
|
|
|
|
return key_to_name_.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace td
|