// tdlight/tdutils/td/utils/emoji.cpp
//
// 236 lines
// 13 KiB
// C++

//
// Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2024
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
#include "td/utils/emoji.h"
#include "td/utils/base64.h"
#include "td/utils/FlatHashSet.h"
#include "td/utils/Gzip.h"
namespace td {
// Maximum length, in bytes, allowed for a single entry of the built-in emoji table.
// is_emoji_element enforces it with a CHECK while building the table and uses it to
// reject over-long inputs early.
static constexpr size_t MAX_EMOJI_LENGTH = 28;
// Returns true if str is exactly one entry of the built-in emoji table, or one entry followed by
// a single trailing variation selector-16 (U+FE0F, UTF-8 bytes EF B8 8F).
//
// The table is a function-local static built by the lambda below. With TD_HAVE_ZLIB it is unpacked
// from the embedded blob; without it the table is empty, so every lookup fails and the function
// returns false for any input.
static bool is_emoji_element(Slice str) {
// The length limit is captured by value so that the table entries can be validated against it.
static const FlatHashSet<Slice, SliceHash> emojis = [max_emoji_length = MAX_EMOJI_LENGTH] {
#if TD_HAVE_ZLIB
// Packed emoji list: base64url text that is gzip-decoded right after the literal.
// The literal is data, not prose - do not reflow or edit it by hand.
Slice packed_emojis(
"eJxtmlly40iWRbdCs_rrr56H3WVmzBHE4A6AxCCFqiodcBFOKShSComawqyWglpAl1lvoP0wcdPK2vrDnY-mC8e75_"
"kkRUzLdjEtu8WUbWIbYwuxbWPbxXZYTOuX2F4XU13GVi2m5mNsn2L7HNuX2JaxJbGlsWWx5bGZ2GxsRWw_FlMbP9s_xRbHb-"
"P4bRyzjWN2b2N7F1t8tovjd_Hn3T62-N7uLrbH2KL2LGrO4vvO4vvOLhfTeXzneXzmPI57Hp87j3mdrxfT1_j5dRUbccz_"
"a3z3Nnrbni2mq5jXVbX428VPb2P3Jovd-5TO0Fm6gq6kQ_d-Rbemq-kaujZ2H3-mO0Xf6PZ0B7pbuju673QPdEe6x9h9yul42_InOkZZ_"
"kL3ho6slu_"
"o3tN9oPtI94nuM90XuiVdQkf2S3wsGXl5GhkfS3ws8bHExxIfS3ws8bHEx5LsY9Vjd0Z3TveV7oLuj3RXdNd0uFzu6G7o8LvE7xK_S_wu7-"
"nwu8TvEr_LJ7pnuhe6V7ofsUuwn2A_wX6C_QT7CfYT7CfYT7CfYD_BfoL9BPsJ9hPsJ9hPsJ9gP8F-gv0E-wn2E-wn2E-wn2A_wX6C_"
"QT7CfYT7CfYT7Cf_Inuz3S_0jm6nm6g83SXdBu6kS7QbelAl4AuAV0CugR0CegS0CWgS0CXMFUS-CXwS-CXwC-BXwK_"
"BHQJ6FLQpaBLQZeCLgVdCroUdCnoUtCloEtBl4IuBV0KuhR0KehS0KWgS0GXgi7Ffor9FPsp9lPsp9hPsZ9iP8V-iv0U-yn2U-yn2E-xn2I_"
"xX6K_RT7KfZT7KfYT7GfYj_Ffor9FPsp9lPsp0yfFAYpDDIYZDDIYJDBIINBBoMMBhkMMhhkMMhgkOE3w2-G3wy_GX4z_"
"Gb4zfCb4TfDb4bfDL8ZfjP8ZvjN8JvhN8Nvht8Mvxl-M_"
"xmGMxwlOEow1GGowxHGY4yHGU4ynGU4yjHUY6jHEc5jnIc5TjKcZTjKMdRTlVzqppT1Zyq5rjMcZnjMsdljsucBZGzIHIWRM6CyFkQOQsiZ0Hk"
"LIicBZGzIHIWRM6CyCGUQyiHUA6hHEI5hHII5RDKIZRDKIdQDqEcQjmEcgjlEMohlDMjcmZEzozIAZYzI3JmRM6MyOGXwy-HXw6_HH45_"
"HL4GdAZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGag"
"ZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGaiZEzVmnQWdZdZZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-"
"Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-"
"Fn4WfhZ-Fn4WfhZ-Fn4FfAr4FfAr4BfAb8CfgX8CvgV8CvgV8CvgF8BvwJ-BfwK-BXwK-BXwK-AXwG_An4F_"
"Ar4FfAr4FfAr4BfAb8CfgX8CvgV8CvgV8CvgF8BvwJ-BfwK-BXwK-BXwK-AXwG_An4F_"
"Ar4FfAr4FfAr4BfAb8CfgX8CvgV8CvgV8CvgF8BvwJqJdRKqJVQK6FWQq2EWgm1Emol1EqolVAroVZCrYRaCbUSaiXUSqiVUCuhVkKthFoJtRJ"
"qJdRKqJVQK6FWQq2EWgm1Emol1EqolVAroVZCrYRaCbUSaiXUSqiVUCuhVkKthFoJtRJqJdRKqJVQK6FWQq2EWgm1Emol1EqolVAroVZCrWTWV"
"divsF9hv8J-hfMK5xXOK5xXOK9wXuG8wnmF8wrnFc4rnFc4r3Be4bzCeYXzCucVziucVzivcF7hvMJ5RX4r3rHiHSska9Jdk-"
"6adNcskjXlril3Tblryl1T7ppy15S7ptw15a4pd025a8pd47fGb43fGr815a4xXWO6xnSN6RrTNQnVJFRjusZ0jeka0zWma0zXmK4xXWO6xnSN"
"6RrTNaZrHNWYrjFdY7qm3DXlril3Tblryl1T7ppy15S7ptw15a4pd025a8pdU-"
"6acteUu6bcNeWuwVnDr4ZfDb8afjX8Gvg18Gvg18CvgV8DvwZ-Dfwa-DXwa-DXwK-BXwO_Bn4N_"
"FrGaxmvZbyW8VrGaxmvZbyW8VrGaxmvZbyW8VrGaxmvZbz2NB71aKlHSz1a6tFSj5Z6tNSjpR4t9WipR0s9WurRUo-"
"WerTUo6UeLfVoqUdLPVrq0VKPlnq01KOlHi31aKlHSz1a6tFSj5Z6tNSjpR4t9WipR0s9WurRUo-"
"WerTUo6UeLfVoqUdLPVrq0VKPlnq01KOlHh38Ovh18Ovg18Gvg18HoQ4kHUg6kHSA6ADRAaLDdIfpDtMdpjuy78i-"
"I7WO1DpS60itI7WO1DpS60itI7ULWF3A6gJWF7C6gNUFrC5gdQGrC1hdwOoCVhe87QIujnQdBXUU1FFQR_"
"aO7B3ZOwrqKKjDh8OHw4ejoI6COgrqKKjDm8Obw5vDmyNJR5KOJB1JOpJ0JOlI0pGkI0lHko4kHUk6kDgK6iioo6DulDgFdRTUUVAHNQc1BzUH"
"NQc1BzUHNQcwRy0dtXTUsqeWPbXsqWVPLXtq2VPLnmXQswx6lkHPMuhZBj3Ueqj1UOuh1kOth1oPtR5qPdR6qPVQ66HWQ62HWg-"
"1Hmo91Hqo9VDrodZDrYdaD7Ueaj3Ueqj1UOuh1kOth1oPtR5qPdR6qPVQ66HWQ62HWg-"
"1Hmo91Hqo9VDrodZDrYdaD7WeudaDrgddD7oedAPoBtANoBtAN4BuAN3ANjLAb4DfAL8BfgP8BvgN8BvgN8BvgN8AvwF-A_wG-A3wG-A3wG-"
"A3wC_AX4D_Ab4DfAb4DfAb4DfAL8BfgP8BvgN8BvgN8BvgN8AvwF-A_wG-A3wG-A3wG-A3wC_AX4D_"
"Ab4DfAb4DfAb4DfAL8BfgP8BvgN8BvgN8DPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_"
"Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw2_"
"DeBvG2zDehvE2jLdhvA3jbRhvw3gbxtsw3obxNow3UoCRAowUYKQAIwUYKcBIAUYKMFKAEbojdEfojtAdoTtCd4TuCN0RuiN0R-"
"iO0B2hO0J3hO4I3RG6I3RH6I7QHaE7QneE7gjdEbojdEfojtAdoTtCd4TuCN0RGiM0RmiM0BihMUJjhMYIjREaIzRGaIzQGE80oDsCNsAlwCXA"
"JcAlwCXAJTDhAhMugCSAJIAkgCSAJIAkgCSAJIAkgCSAJIAkYD9gP2A_YD9gP2A_YD9gP2A_4C3gLeAt4C3gLeAt4C3gLTws_rJZ_"
"OV6Mf30vJh-_rSY3v45tuNiel_GVsW2im0dWx1bE9smtnExLS8XU5bFdhVbfD77FttNbPvYHmKLY2SPiyn-ljeton4VYvu-mNY_"
"xfYUW3xfvOZO8ZY7xUvuFO-4U7ziTvHSOsWb6hSvolO8dU7xqjnFS-YU75hTvFhO8SI5xYvjFO-"
"NU7wOTvE2OMXL4BSvfFO87E3NRWzxmSbqmz62-"
"EwTc23iO5vXxRQvXFO8ak3xpjXFi9YU71lTvGFN8W41xavVFK9SU7wrTfEKNMUb0NR9jC36jFeTqStii_67-LOOn93Gdh9bzCNeNaaz-P6zqD-"
"LHM-WsSWxxWfP4jvP4jvP4vvOoqezP8V2iC0-fx49n3-ILeZ77hbT1_izeDJP8WCetu9iex_bh8Vff9rF9rL46-d1bMyMd5T4HSV-xwJ_"
"xzz8yOz7ePqrMhNlyfRYMj2WTI8l02PJ9FiyOpasjiWrY8nqWJ7-"
"onr6oyHzMGUeprwoZQqmTMGUtZiyFjO21IwtNWNLzZjhGRM5YyJnDJAxQMZEzhglY5SMUTLWdsbazhgvO43HBM2Ym9npT1iYMac_Epx-"
"j2Mrr9jKKxZuhfOKJyqmdMVjFdlXPFuxXCuW64qtaMVjK9Jdke6KdFcswxVcVtBYAXHFOlmdfhcD4polvGYJr1mua5brmuW6Jvs12a_"
"Jfg3nNWtxzVBr8luT2vp06-S9Ha_sINSxB3Ss447HOtZxx7MdhepIqKMyHQa7w-IP05tfF3_474eMz384dfOXfzx185d_OnXzl38-"
"dfOXfzl185d_PXXzl387dfOXfz9185f_OHXzl_88dfOX_zp185e_"
"XXwYYrtUsFEwKggKtgquFdwo2Cu4VfBdwb2CBwVHBY8KnhW8KGDifPB8UeAVbBSMCoKCrYIrBdcKvim4UbBXcFBwq-"
"C7gnsFDwqOCp4UPCt4VXBK_lLJXwrvpZK_"
"VM6XyvlSOV8qZ4KdghsFewUHBbcK7hTcK3hU8KTgWcGLglcFp5w3orpRYhuh2yifjdLY6O0bPT7K8ijLowYcNeAop6NSHYV3FN5RyQfRCEojKI"
"2gNILSCBpwqzS2mjZbkd8qn61KsFViWyW21Uu3Ir_Vu7YivxXwrWbL9u_f_qDgqOBRwbOCE_"
"kr2bnSK670iisNeKVxrjTOtfBey9e1fF0r52sNeK0BrwXqWjlf6xXXyvla7_qmAb9pnG96_Ju876TZieFODHdiuNPjO6Wx-_"
"vH7xU8K3hVcJpRNyrljUp5I-83esWNGN5owBvZuZGdG6EjeFJwetder9hr5L2o7mVwr9myl9O9nO719r3I72V5L8t7odvL-14l2CvnvXLeK-"
"e9ct4r571AEbwo-N3FidhBdg6yc5CLg1wc5OIghgclf1CqB6V6UIYH5XPQu27l9E4vvdO77vSuO73rTsTuROxOL737-3EOCu4VPCg4KnhWcPL-"
"XW-_19vv5eJej98r-Xs9_qCnCLyCSwUbBaOCrYIrBdcKvinYKbhRsFdwUHCr4F7Bg4KjgicFLwpeFZzIH5X8UTkflfNR5I_K-"
"aicj0r1qFSPSvWoVI9K9ahUj0r1qAyPyvAomEcl9qjEHvX2R438qJEfZflRvh71-JMef5KvJ5XgSQM-ifyTBnxScZ_l_VmveJHTV43zKhc_"
"9K4fyvDHbCe7_Z8__uxj-3NsfWzX8_cf__eHh9h-je32__vhfWzbWfSD34JO_wj-2-fz_Pkyf77Onz_4DeQ33enzef58mT9f588f_Jeg33Snz-"
"f582X-fJ0_0X2ZdV9m3ZdZ92XWfZl1y1m3nHXLWbecdctZl8y6ZNYlsy6Zdcn8D_vpu_lf_U_Bs4IXBa8KTuR-0X8R-EX_TeAX_VeBX_"
"TfBX6R-I3EbyR-I_Ebid9I_FbitxK_lfitxG8l_iDxB4k_SPxB4g8Sf5b4s8SfJf4s8WeJv0j8ReIvEn-R-"
"IvES4mXEi8lXkq8nMVG6IzQGaEzQmeEzgidETojdEbojNCZ9xK_l_i9xO8lfi-x0BmhM0JnhM4Infko8UeJP0r8UeKPEn-S-JPEnyT-"
"JPEniVUUo6IYFcWoKEZFMSqKUVGMimJUFKOiGBXFqChGRTEqivm9KInEicSJxInEWikmlTiVOJU4lTiVOJM4kziTOJM4kziXOJc4lziXOJd4kH"
"iQeJB4kHiQ2EvsJfYSe4m9xJcSX0p8KfGlxJcSbyTeSLyReCPxRuIgcZA4SBwkDhJvJd5KvJV4K_FW4iuJryS-kvhK4iuJryW-"
"lvha4muJryXeSbyTeCfxTuKdxDcS30h8I_GNxDcS7yXeS7yXeC_"
"xXuKDxAeJDxIfJD5IfCvxrcS3Et9KfCvxncR3Et9JfCfxncTfJf4u8XeJv0v8XeJ7ie8lvpf4XuJ7iR8kfpD4QeIHiR8kfpb4WeJniZ8lfp7F9"
"udZfAqeFbwoeFVwEmt_ttqfrfZnq_3Zan-22p-t9mer_"
"dlqf7ban61OWKsT1uqEtTphrU5Yq83cajO32sytNnOrzdxqM7fazK02c6vN3Gozt9qRrHYkqx3Jakey2pGskdhIbCQ2EhuJR4lHiUeJR4nHWVx"
"pilaaopWmaKUpWmmKVpqilaZopSlaaYpWmqLVo8SPEj9K_"
"Cjx4yxe5bP4FDwreFHwquAkriSuJK4kriSuJF5JvJJ4JfFK4tUsbjQ3Gs2NRnOj0dxoNDcazY1Gc6PR3Gg0NxrNjUZzo9HcaDQ3Gs2NRnOj0Qn"
"b6IRtdMI2OmEbnbCNTthGJ2yjE7bRCdvohG10wjY6YRudsI1O2EYnbKMTttEJ2-"
"iEbXTCNjphG83nRvO50XxuNJ8bzef211l8Cp4VvCh4VXASa4q2mqKtpmirKdpqiraaoq2maKsp2mqKtpqirXbRVrtoq1201S7aahftfprFp-"
"BZwYuCVwUnsYrSqSiditKpKJ2K4iR2EjuJncTud7E4O3F24uzE2YmzqyWuJa4lriWuJW4kbiRuJG4kbiRuJW4lbiVuJW4l7iTuJO4k7iTuJD6T"
"-EziM4nPJD6T-Fzic4nPJT6X-FzirxJ_lfirxF8l_irxhcQXEl9IfCHxhcS6BDpdAp0ugU6XQKdLoNtJvJN4J_"
"FO4p3Eusk43WScbjJONxmnm4zTTcbpJuN0k3G6yTjdZJxuMk43GaebjNNNxukm47QGndag0xp0WoNOa9BpDTqtQac16LQGndag0xp0WoNOa9Bp"
"DTqtQaebjNNNxukm43STcbrJON1knG4yTjcZp5uM003GHSU-SnyU-CjxUeIXiV8kfpH4ReIXiV8lfpX4VeJXiV9ncS-DvQz2MtjLYC-"
"DgzgP4jyI8yDOgzgP4jyI8yDOgzgP4jwI3SB0g9ANQjcI3SB0g9ANQjcI3SB0w5PETxI_"
"Sfwk8dMs9jqtvE4rr9PK67TyOq28Tiuv08rrtPI6rbxOK69d1GsX9dpFvXZRr13U6_bldfvyun153b68bl_"
"eSmwlthJbia3EhcSFxIXEhcSFxKXEpcSlxKXEpcS6I3ndkbzuSF53JK87ktcdyeuO5HVH8rojed2R_"
"FritcRridcSryXWAeR1AHkdQF4HkNcB5HUAeR1AXgeQ1wHkdQB5HUBeB5DXAeR1AHkdQF4HkNcB5HUAeR1AXgeQ1wHkdQB5HUBeB5DXAeR1AHk"
"dQF4HkNcB5HUABf2eEvR7StDvKUG_pwT9nhL0J7igP8EF_Qku6E9wQX-"
"CC7q4Bl1cgy6uQRfXoItr2Em8k3gn8U7incQ6gIIOoKADKOgACjqAgg6goAMo6AAKOoCCDqCgAyjoAAo6gIIOoKADKOgACjqAgg6goAMo6AAK2"
"hiDNsagjTFoYwzaGIM2xqCNMWhjDNoYgzbGoP05aH8O2p-D9ueg_TloFw3aRYN20aBdNPy2i_4vsAI1eQ");
// The decoded text is kept in a static string while the set stores Slices built from pointers
// into it - presumably Slice does not own its bytes, so the buffer must outlive this lambda.
static string all_emojis_str = gzdecode(base64url_decode(packed_emojis).ok()).as_slice().str();
// Expected number of distinct entries in the decoded list; verified after parsing.
constexpr size_t EMOJI_COUNT = 2326;
#else
// Without zlib the blob cannot be unpacked: the table stays empty.
string all_emojis_str;
constexpr size_t EMOJI_COUNT = 0;
#endif
FlatHashSet<Slice, SliceHash> all_emojis;
all_emojis.reserve(EMOJI_COUNT);
// The decoded text is a list of entries, each terminated by a single space.
for (size_t i = 0; i < all_emojis_str.size(); i++) {
// Every entry must be at least two bytes long: neither of its first two bytes may be a separator.
CHECK(all_emojis_str[i] != ' ');
CHECK(all_emojis_str[i + 1] != ' ');
// Scan forward to the space that terminates the entry starting at i.
size_t j = i + 2;
while (j < all_emojis_str.size() && all_emojis_str[j] != ' ') {
j++;
}
// The terminating space is mandatory, including after the last entry.
CHECK(j < all_emojis_str.size());
all_emojis.insert(Slice(&all_emojis_str[i], &all_emojis_str[j]));
// No entry may be longer than the captured MAX_EMOJI_LENGTH.
CHECK(j - i <= max_emoji_length);
// Jump to the separator; the loop increment then moves to the first byte of the next entry.
i = j;
}
// The number of distinct entries must match the expected count for this build configuration.
CHECK(all_emojis.size() == EMOJI_COUNT);
return all_emojis;
}();
auto len = str.size();
// Longest possible match is a table entry (<= MAX_EMOJI_LENGTH bytes) plus a 3-byte selector.
if (len > MAX_EMOJI_LENGTH + 3) {
return false;
}
// Exact match against the table.
if (emojis.count(str) != 0) {
return true;
}
// Otherwise the only accepted form is "<entry>" + U+FE0F (EF B8 8F) with a non-empty entry part.
if (len <= 3 || str[len - 3] != '\xEF' || str[len - 2] != '\xB8' || str[len - 1] != '\x8F') {
return false;
}
// Reject two consecutive trailing selectors: only one may be stripped.
if (len >= 6 && str[len - 6] == '\xEF' && str[len - 5] == '\xB8' && str[len - 4] == '\x8F') {
return false;
}
// Retry the lookup without the trailing selector.
return emojis.count(str.substr(0, len - 3)) != 0;
}
// Returns true if str consists of one or more emoji elements (as accepted by is_emoji_element)
// joined by zero-width joiners (U+200D, UTF-8 bytes E2 80 8D).
// A joiner is only treated as a separator when at least one byte follows it.
bool is_emoji(Slice str) {
  // Fast path: a joiner starts with byte 0xE2. If that byte does not occur within the first
  // MAX_EMOJI_LENGTH + 4 bytes, the string is checked as a single element; is_emoji_element
  // itself rejects anything longer than MAX_EMOJI_LENGTH + 3 bytes.
  const size_t first_candidate = str.substr(0, MAX_EMOJI_LENGTH + 4).find('\xE2');
  if (first_candidate == Slice::npos) {
    return is_emoji_element(str);
  }

  const size_t size = str.size();
  size_t element_begin = 0;
  size_t pos = first_candidate;
  while (pos + 3 < size) {
    const bool is_joiner = str[pos] == '\xE2' && str[pos + 1] == '\x80' && str[pos + 2] == '\x8D';
    if (!is_joiner) {
      pos++;
      continue;
    }

    // zero-width joiner \u200D: everything since the previous joiner must be a valid element
    const Slice element = str.substr(element_begin, pos - element_begin);
    if (!is_emoji_element(element)) {
      return false;
    }

    // continue right after the 3-byte joiner
    element_begin = pos + 3;
    pos = element_begin;
  }

  // the remaining tail is the last element
  return is_emoji_element(str.substr(element_begin));
}
// Inspects the last four bytes of emoji and reports the trailing Fitzpatrick skin-tone modifier.
// The modifiers U+1F3FB..U+1F3FF are encoded as F0 9F 8F BB..BF; the result is 2 for U+1F3FB
// up to 6 for U+1F3FF, and 0 if the string does not end with such a modifier.
int get_fitzpatrick_modifier(Slice emoji) {
  const size_t size = emoji.size();
  if (size < 4) {
    return 0;
  }

  // the first three bytes are shared by all five modifiers
  const bool has_modifier_prefix =
      emoji[size - 4] == '\xF0' && emoji[size - 3] == '\x9F' && emoji[size - 2] == '\x8F';
  if (!has_modifier_prefix) {
    return 0;
  }

  // the last byte selects the concrete modifier
  const auto last_byte = static_cast<unsigned char>(emoji.back());
  const bool is_modifier = 0xBB <= last_byte && last_byte <= 0xBF;
  return is_modifier ? (last_byte - 0xBB) + 2 : 0;
}
// Strips every trailing Fitzpatrick modifier (4 bytes each) from emoji and returns the
// shortened Slice; the input is returned unchanged if it does not end with a modifier.
Slice remove_fitzpatrick_modifier(Slice emoji) {
  for (;;) {
    if (get_fitzpatrick_modifier(emoji) == 0) {
      return emoji;
    }
    // drop the 4-byte modifier and look again, in case several of them are stacked
    emoji.remove_suffix(4);
  }
}
// Copying counterpart of remove_emoji_modifiers_in_place: returns a new string with the
// modifiers removed and leaves the input untouched.
string remove_emoji_modifiers(Slice emoji, bool remove_selectors) {
  auto stripped = emoji.str();
  remove_emoji_modifiers_in_place(stripped, remove_selectors);
  return stripped;
}
// Removes gender signs (ZWJ + female/male sign), Fitzpatrick modifiers and, if remove_selectors
// is true, variation selector-16 from emoji, compacting the string in place.
// If nothing would remain after the removal, the string is left unchanged.
void remove_emoji_modifiers_in_place(string &emoji, bool remove_selectors) {
  static const Slice modifiers[] = {u8"\uFE0F" /* variation selector-16 */,
                                    u8"\u200D\u2640" /* zero width joiner + female sign */,
                                    u8"\u200D\u2642" /* zero width joiner + male sign */,
                                    u8"\U0001F3FB" /* emoji modifier fitzpatrick type-1-2 */,
                                    u8"\U0001F3FC" /* emoji modifier fitzpatrick type-3 */,
                                    u8"\U0001F3FD" /* emoji modifier fitzpatrick type-4 */,
                                    u8"\U0001F3FE" /* emoji modifier fitzpatrick type-5 */,
                                    u8"\U0001F3FF" /* emoji modifier fitzpatrick type-6 */};
  const size_t modifier_count = sizeof(modifiers) / sizeof(*modifiers);
  // the variation selector is the first table entry, so it is skipped when it must be kept
  const size_t first_modifier = remove_selectors ? 0 : 1;

  const size_t size = emoji.size();
  size_t write_pos = 0;
  size_t read_pos = 0;
  while (read_pos < size) {
    // length of the modifier found at read_pos; 0 if there is none (all modifiers are non-empty)
    size_t matched_length = 0;
    for (size_t index = first_modifier; index < modifier_count && matched_length == 0; index++) {
      const Slice &modifier = modifiers[index];
      const auto length = modifier.size();
      if (read_pos + length <= size && Slice(&emoji[read_pos], length) == modifier) {
        matched_length = length;
      }
    }

    if (matched_length != 0) {
      // skip the modifier
      read_pos += matched_length;
    } else {
      // keep the byte, moving it down over the removed ones
      emoji[write_pos] = emoji[read_pos];
      write_pos++;
      read_pos++;
    }
  }

  // never produce an empty string: with nothing kept, nothing was overwritten either
  if (write_pos != 0) {
    emoji.resize(write_pos);
  }
}
// Returns emoji with every variation selector-16 (U+FE0F, UTF-8 bytes EF B8 8F) removed.
// Strings that are not recognized by is_emoji are returned as is.
string remove_emoji_selectors(Slice emoji) {
  if (!is_emoji(emoji)) {
    return emoji.str();
  }

  const size_t size = emoji.size();
  string result;
  size_t pos = 0;
  while (pos < size) {
    const bool is_selector =
        pos + 3 <= size && emoji[pos] == '\xEF' && emoji[pos + 1] == '\xB8' && emoji[pos + 2] == '\x8F';
    if (is_selector) {
      // skip \uFE0F
      pos += 3;
      continue;
    }
    result.push_back(emoji[pos]);
    pos++;
  }

  // removing selectors from a valid emoji must yield a valid emoji again
  CHECK(is_emoji(result));
  return result;
}
} // namespace td