// // Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2023 // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // #include "td/utils/emoji.h" #include "td/utils/base64.h" #include "td/utils/FlatHashSet.h" #include "td/utils/Gzip.h" namespace td { static constexpr size_t MAX_EMOJI_LENGTH = 28; static bool is_emoji_element(Slice str) { static const FlatHashSet<Slice, SliceHash> emojis = [max_emoji_length = MAX_EMOJI_LENGTH] { #if TD_HAVE_ZLIB Slice packed_emojis( "eJxtmlly40iWRbdCs_rrr56H3WVmzBHE4A6AxCCFqiodcBFOKShSComawqyWglpAl1lvoP0wcdPK2vrDnY-mC8e75_" "kkRUzLdjEtu8WUbWIbYwuxbWPbxXZYTOuX2F4XU13GVi2m5mNsn2L7HNuX2JaxJbGlsWWx5bGZ2GxsRWw_FlMbP9s_xRbHb-" "P4bRyzjWN2b2N7F1t8tovjd_Hn3T62-N7uLrbH2KL2LGrO4vvO4vvOLhfTeXzneXzmPI57Hp87j3mdrxfT1_j5dRUbccz_" "a3z3Nnrbni2mq5jXVbX428VPb2P3Jovd-5TO0Fm6gq6kQ_d-Rbemq-kaujZ2H3-mO0Xf6PZ0B7pbuju673QPdEe6x9h9yul42_InOkZZ_" "kL3ho6slu_" "o3tN9oPtI94nuM90XuiVdQkf2S3wsGXl5GhkfS3ws8bHExxIfS3ws8bHEx5LsY9Vjd0Z3TveV7oLuj3RXdNd0uFzu6G7o8LvE7xK_S_wu7-" "nwu8TvEr_LJ7pnuhe6V7ofsUuwn2A_wX6C_QT7CfYT7CfYT7CfYD_BfoL9BPsJ9hPsJ9hPsJ9gP8F-gv0E-wn2E-wn2E-wn2A_wX6C_" "QT7CfYT7CfYT7Cf_Inuz3S_0jm6nm6g83SXdBu6kS7QbelAl4AuAV0CugR0CegS0CWgS0CXMFUS-CXwS-CXwC-BXwK_" "BHQJ6FLQpaBLQZeCLgVdCroUdCnoUtCloEtBl4IuBV0KuhR0KehS0KWgS0GXgi7Ffor9FPsp9lPsp9hPsZ9iP8V-iv0U-yn2U-yn2E-xn2I_" "xX6K_RT7KfZT7KfYT7GfYj_Ffor9FPsp9lPsp0yfFAYpDDIYZDDIYJDBIINBBoMMBhkMMhhkMMhgkOE3w2-G3wy_GX4z_" "Gb4zfCb4TfDb4bfDL8ZfjP8ZvjN8JvhN8Nvht8Mvxl-M_" "xmGMxwlOEow1GGowxHGY4yHGU4ynGU4yjHUY6jHEc5jnIc5TjKcZTjKMdRTlVzqppT1Zyq5rjMcZnjMsdljsucBZGzIHIWRM6CyFkQOQsiZ0Hk" "LIicBZGzIHIWRM6CyCGUQyiHUA6hHEI5hHII5RDKIZRDKIdQDqEcQjmEcgjlEMohlDMjcmZEzozIAZYzI3JmRM6MyOGXwy-HXw6_HH45_" "HL4GdAZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGag" "ZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGagZqBmoGaiZEzVmnQWdZdZZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-" "Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-Fn4WfhZ-" "Fn4WfhZ-Fn4WfhZ-Fn4FfAr4FfAr4BfAb8CfgX8CvgV8CvgV8CvgF8BvwJ-BfwK-BXwK-BXwK-AXwG_An4F_" "Ar4FfAr4FfAr4BfAb8CfgX8CvgV8CvgV8CvgF8BvwJ-BfwK-BXwK-BXwK-AXwG_An4F_" "Ar4FfAr4FfAr4BfAb8CfgX8CvgV8CvgV8CvgF8BvwJqJdRKqJVQK6FWQq2EWgm1Emol1EqolVAroVZCrYRaCbUSaiXUSqiVUCuhVkKthFoJtRJ" "qJdRKqJVQK6FWQq2EWgm1Emol1EqolVAroVZCrYRaCbUSaiXUSqiVUCuhVkKthFoJtRJqJdRKqJVQK6FWQq2EWgm1Emol1EqolVAroVZCrWTWV" "divsF9hv8J-hfMK5xXOK5xXOK9wXuG8wnmF8wrnFc4rnFc4r3Be4bzCeYXzCucVziucVzivcF7hvMJ5RX4r3rHiHSska9Jdk-" "6adNcskjXlril3Tblryl1T7ppy15S7ptw15a4pd025a8pd47fGb43fGr815a4xXWO6xnSN6RrTNQnVJFRjusZ0jeka0zWma0zXmK4xXWO6xnSN" "6RrTNaZrHNWYrjFdY7qm3DXlril3Tblryl1T7ppy15S7ptw15a4pd025a8pdU-" "6acteUu6bcNeWuwVnDr4ZfDb8afjX8Gvg18Gvg18CvgV8DvwZ-Dfwa-DXwa-DXwK-BXwO_Bn4N_" "FrGaxmvZbyW8VrGaxmvZbyW8VrGaxmvZbyW8VrGaxmvZbz2NB71aKlHSz1a6tFSj5Z6tNSjpR4t9WipR0s9WurRUo-" "WerTUo6UeLfVoqUdLPVrq0VKPlnq01KOlHi31aKlHSz1a6tFSj5Z6tNSjpR4t9WipR0s9WurRUo-" "WerTUo6UeLfVoqUdLPVrq0VKPlnq01KOlHh38Ovh18Ovg18Gvg18HoQ4kHUg6kHSA6ADRAaLDdIfpDtMdpjuy78i-" "I7WO1DpS60itI7WO1DpS60itI7ULWF3A6gJWF7C6gNUFrC5gdQGrC1hdwOoCVhe87QIujnQdBXUU1FFQR_" "aO7B3ZOwrqKKjDh8OHw4ejoI6COgrqKKjDm8Obw5vDmyNJR5KOJB1JOpJ0JOlI0pGkI0lHko4kHUk6kDgK6iioo6DulDgFdRTUUVAHNQc1BzUH" "NQc1BzUHNQcwRy0dtXTUsqeWPbXsqWVPLXtq2VPLnmXQswx6lkHPMuhZBj3Ueqj1UOuh1kOth1oPtR5qPdR6qPVQ66HWQ62HWg-" "1Hmo91Hqo9VDrodZDrYdaD7Ueaj3Ueqj1UOuh1kOth1oPtR5qPdR6qPVQ66HWQ62HWg-" "1Hmo91Hqo9VDrodZDrYdaD7WeudaDrgddD7oedAPoBtANoBtAN4BuAN3ANjLAb4DfAL8BfgP8BvgN8BvgN8BvgN8AvwF-A_wG-A3wG-A3wG-" "A3wC_AX4D_Ab4DfAb4DfAb4DfAL8BfgP8BvgN8BvgN8BvgN8AvwF-A_wG-A3wG-A3wG-A3wC_AX4D_" "Ab4DfAb4DfAb4DfAL8BfgP8BvgN8BvgN8DPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_" "Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw8_Dz8PPw2_" "DeBvG2zDehvE2jLdhvA3jbRhvw3gbxtsw3obxNow3UoCRAowUYKQAIwUYKcBIAUYKMFKAEbojdEfojtAdoTtCd4TuCN0RuiN0R-" "iO0B2hO0J3hO4I3RG6I3RH6I7QHaE7QneE7gjdEbojdEfojtAdoTtCd4TuCN0RGiM0RmiM0BihMUJjhMYIjREaIzRGaIzQGE80oDsCNsAlwCXA" "JcAlwCXAJTDhAhMugCSAJIAkgCSAJIAkgCSAJIAkgCSAJIAkYD9gP2A_YD9gP2A_YD9gP2A_4C3gLeAt4C3gLeAt4C3gLTws_rJZ_" "OV6Mf30vJh-_rSY3v45tuNiel_GVsW2im0dWx1bE9smtnExLS8XU5bFdhVbfD77FttNbPvYHmKLY2SPiyn-ljeton4VYvu-mNY_" "xfYUW3xfvOZO8ZY7xUvuFO-4U7ziTvHSOsWb6hSvolO8dU7xqjnFS-YU75hTvFhO8SI5xYvjFO-" "NU7wOTvE2OMXL4BSvfFO87E3NRWzxmSbqmz62-" "EwTc23iO5vXxRQvXFO8ak3xpjXFi9YU71lTvGFN8W41xavVFK9SU7wrTfEKNMUb0NR9jC36jFeTqStii_67-LOOn93Gdh9bzCNeNaaz-P6zqD-" "LHM-WsSWxxWfP4jvP4jvP4vvOoqezP8V2iC0-fx49n3-ILeZ77hbT1_izeDJP8WCetu9iex_bh8Vff9rF9rL46-d1bMyMd5T4HSV-xwJ_" "xzz8yOz7ePqrMhNlyfRYMj2WTI8l02PJ9FiyOpasjiWrY8nqWJ7-" "onr6oyHzMGUeprwoZQqmTMGUtZiyFjO21IwtNWNLzZjhGRM5YyJnDJAxQMZEzhglY5SMUTLWdsbazhgvO43HBM2Ym9npT1iYMac_Epx-" "j2Mrr9jKKxZuhfOKJyqmdMVjFdlXPFuxXCuW64qtaMVjK9Jdke6KdFcswxVcVtBYAXHFOlmdfhcD4polvGYJr1mua5brmuW6Jvs12a_" "Jfg3nNWtxzVBr8luT2vp06-S9Ha_sINSxB3Ss447HOtZxx7MdhepIqKMyHQa7w-IP05tfF3_474eMz384dfOXfzx185d_OnXzl38-" "dfOXfzl185d_PXXzl387dfOXfz9185f_OHXzl_88dfOX_zp185e_" "XXwYYrtUsFEwKggKtgquFdwo2Cu4VfBdwb2CBwVHBY8KnhW8KGDifPB8UeAVbBSMCoKCrYIrBdcKvim4UbBXcFBwq-" "C7gnsFDwqOCp4UPCt4VXBK_lLJXwrvpZK_" "VM6XyvlSOV8qZ4KdghsFewUHBbcK7hTcK3hU8KTgWcGLglcFp5w3orpRYhuh2yifjdLY6O0bPT7K8ijLowYcNeAop6NSHYV3FN5RyQfRCEojKI" "2gNILSCBpwqzS2mjZbkd8qn61KsFViWyW21Uu3Ir_Vu7YivxXwrWbL9u_f_qDgqOBRwbOCE_" "kr2bnSK670iisNeKVxrjTOtfBey9e1fF0r52sNeK0BrwXqWjlf6xXXyvla7_qmAb9pnG96_Ju876TZieFODHdiuNPjO6Wx-_" "vH7xU8K3hVcJpRNyrljUp5I-83esWNGN5owBvZuZGdG6EjeFJwetder9hr5L2o7mVwr9myl9O9nO719r3I72V5L8t7odvL-14l2CvnvXLeK-" "e9ct4r571AEbwo-N3FidhBdg6yc5CLg1wc5OIghgclf1CqB6V6UIYH5XPQu27l9E4vvdO77vSuO73rTsTuROxOL737-3EOCu4VPCg4KnhWcPL-" "XW-_19vv5eJej98r-Xs9_qCnCLyCSwUbBaOCrYIrBdcKvinYKbhRsFdwUHCr4F7Bg4KjgicFLwpeFZzIH5X8UTkflfNR5I_K-" "aicj0r1qFSPSvWoVI9K9ahUj0r1qAyPyvAomEcl9qjEHvX2R438qJEfZflRvh71-JMef5KvJ5XgSQM-ifyTBnxScZ_l_VmveJHTV43zKhc_" "9K4fyvDHbCe7_Z8__uxj-3NsfWzX8_cf__eHh9h-je32__vhfWzbWfSD34JO_wj-2-fz_Pkyf77Onz_4DeQ33enzef58mT9f588f_Jeg33Snz-" "f582X-fJ0_0X2ZdV9m3ZdZ92XWfZl1y1m3nHXLWbecdctZl8y6ZNYlsy6Zdcn8D_vpu_lf_U_Bs4IXBa8KTuR-0X8R-EX_TeAX_VeBX_" "TfBX6R-I3EbyR-I_Ebid9I_FbitxK_lfitxG8l_iDxB4k_SPxB4g8Sf5b4s8SfJf4s8WeJv0j8ReIvEn-R-" "IvES4mXEi8lXkq8nMVG6IzQGaEzQmeEzgidETojdEbojNCZ9xK_l_i9xO8lfi-x0BmhM0JnhM4Infko8UeJP0r8UeKPEn-S-JPEnyT-" "JPEniVUUo6IYFcWoKEZFMSqKUVGMimJUFKOiGBXFqChGRTEqivm9KInEicSJxInEWikmlTiVOJU4lTiVOJM4kziTOJM4kziXOJc4lziXOJd4kH" "iQeJB4kHiQ2EvsJfYSe4m9xJcSX0p8KfGlxJcSbyTeSLyReCPxRuIgcZA4SBwkDhJvJd5KvJV4K_FW4iuJryS-kvhK4iuJryW-" "lvha4muJryXeSbyTeCfxTuKdxDcS30h8I_GNxDcS7yXeS7yXeC_" "xXuKDxAeJDxIfJD5IfCvxrcS3Et9KfCvxncR3Et9JfCfxncTfJf4u8XeJv0v8XeJ7ie8lvpf4XuJ7iR8kfpD4QeIHiR8kfpb4WeJniZ8lfp7F9" "udZfAqeFbwoeFVwEmt_ttqfrfZnq_3Zan-22p-t9mer_" "dlqf7ban61OWKsT1uqEtTphrU5Yq83cajO32sytNnOrzdxqM7fazK02c6vN3Gozt9qRrHYkqx3Jakey2pGskdhIbCQ2EhuJR4lHiUeJR4nHWVx" "pilaaopWmaKUpWmmKVpqilaZopSlaaYpWmqLVo8SPEj9K_" "Cjx4yxe5bP4FDwreFHwquAkriSuJK4kriSuJF5JvJJ4JfFK4tUsbjQ3Gs2NRnOj0dxoNDcazY1Gc6PR3Gg0NxrNjUZzo9HcaDQ3Gs2NRnOj0Qn" "b6IRtdMI2OmEbnbCNTthGJ2yjE7bRCdvohG10wjY6YRudsI1O2EYnbKMTttEJ2-" "iEbXTCNjphG83nRvO50XxuNJ8bzef211l8Cp4VvCh4VXASa4q2mqKtpmirKdpqiraaoq2maKsp2mqKtpqirXbRVrtoq1201S7aahftfprFp-" "BZwYuCVwUnsYrSqSiditKpKJ2K4iR2EjuJncTud7E4O3F24uzE2YmzqyWuJa4lriWuJW4kbiRuJG4kbiRuJW4lbiVuJW4l7iTuJO4k7iTuJD6T" "-EziM4nPJD6T-Fzic4nPJT6X-FzirxJ_lfirxF8l_irxhcQXEl9IfCHxhcS6BDpdAp0ugU6XQKdLoNtJvJN4J_" "FO4p3Eusk43WScbjJONxmnm4zTTcbpJuN0k3G6yTjdZJxuMk43GaebjNNNxukm47QGndag0xp0WoNOa9BpDTqtQac16LQGndag0xp0WoNOa9Bp" "DTqtQaebjNNNxukm43STcbrJON1knG4yTjcZp5uM003GHSU-SnyU-CjxUeIXiV8kfpH4ReIXiV8lfpX4VeJXiV9ncS-DvQz2MtjLYC-" "DgzgP4jyI8yDOgzgP4jyI8yDOgzgP4jwI3SB0g9ANQjcI3SB0g9ANQjcI3SB0w5PETxI_" "Sfwk8dMs9jqtvE4rr9PK67TyOq28Tiuv08rrtPI6rbxOK69d1GsX9dpFvXZRr13U6_bldfvyun153b68bl_" "eSmwlthJbia3EhcSFxIXEhcSFxKXEpcSlxKXEpcS6I3ndkbzuSF53JK87ktcdyeuO5HVH8rojed2R_" "FritcRridcSryXWAeR1AHkdQF4HkNcB5HUAeR1AXgeQ1wHkdQB5HUBeB5DXAeR1AHkdQF4HkNcB5HUAeR1AXgeQ1wHkdQB5HUBeB5DXAeR1AHk" "dQF4HkNcB5HUABf2eEvR7StDvKUG_pwT9nhL0J7igP8EF_Qku6E9wQX-" "CC7q4Bl1cgy6uQRfXoItr2Em8k3gn8U7incQ6gIIOoKADKOgACjqAgg6goAMo6AAKOoCCDqCgAyjoAAo6gIIOoKADKOgACjqAgg6goAMo6AAK2" "hiDNsagjTFoYwzaGIM2xqCNMWhjDNoYgzbGoP05aH8O2p-D9ueg_TloFw3aRYN20aBdNPy2i_4vsAI1eQ"); static string all_emojis_str = gzdecode(base64url_decode(packed_emojis).ok()).as_slice().str(); constexpr size_t EMOJI_COUNT = 2326; #else string all_emojis_str; constexpr size_t EMOJI_COUNT = 0; #endif FlatHashSet<Slice, SliceHash> all_emojis; all_emojis.reserve(EMOJI_COUNT); for (size_t i = 0; i < all_emojis_str.size(); i++) { CHECK(all_emojis_str[i] != ' '); CHECK(all_emojis_str[i + 1] != ' '); size_t j = i + 2; while (j < all_emojis_str.size() && all_emojis_str[j] != ' ') { j++; } CHECK(j < all_emojis_str.size()); all_emojis.insert(Slice(&all_emojis_str[i], &all_emojis_str[j])); CHECK(j - i <= max_emoji_length); i = j; } CHECK(all_emojis.size() == EMOJI_COUNT); return all_emojis; }(); auto len = str.size(); if (len > MAX_EMOJI_LENGTH + 3) { return false; } if (emojis.count(str) != 0) { return true; } if (len <= 3 || str[len - 3] != '\xEF' || str[len - 2] != '\xB8' || str[len - 1] != '\x8F') { return false; } if (len >= 6 && str[len - 6] == '\xEF' && str[len - 5] == '\xB8' && str[len - 4] == '\x8F') { return false; } return emojis.count(str.substr(0, len - 3)) != 0; } bool is_emoji(Slice str) { size_t i = str.substr(0, MAX_EMOJI_LENGTH + 4).find('\xE2'); if (i == td::Slice::npos) { return is_emoji_element(str); } size_t start_pos = 0; for (; i + 3 < str.size(); i++) { if (str[i] == '\xE2' && str[i + 1] == '\x80' && str[i + 2] == '\x8D') { // zero-width joiner \u200D if (!is_emoji_element(str.substr(start_pos, i - start_pos))) { return false; } start_pos = i + 3; i += 2; } } return is_emoji_element(str.substr(start_pos)); } int get_fitzpatrick_modifier(Slice emoji) { if (emoji.size() < 4 || emoji[emoji.size() - 4] != '\xF0' || emoji[emoji.size() - 3] != '\x9F' || emoji[emoji.size() - 2] != '\x8F') { return 0; } auto c = static_cast<unsigned char>(emoji.back()); if (c < 0xBB || c > 0xBF) { return 0; } return (c - 0xBB) + 2; } Slice remove_fitzpatrick_modifier(Slice emoji) { while (get_fitzpatrick_modifier(emoji) != 0) { emoji.remove_suffix(4); } return emoji; } string remove_emoji_modifiers(Slice emoji, bool remove_selectors) { string result = emoji.str(); remove_emoji_modifiers_in_place(result, remove_selectors); return result; } void remove_emoji_modifiers_in_place(string &emoji, bool remove_selectors) { static const Slice modifiers[] = {u8"\uFE0F" /* variation selector-16 */, u8"\u200D\u2640" /* zero width joiner + female sign */, u8"\u200D\u2642" /* zero width joiner + male sign */, u8"\U0001F3FB" /* emoji modifier fitzpatrick type-1-2 */, u8"\U0001F3FC" /* emoji modifier fitzpatrick type-3 */, u8"\U0001F3FD" /* emoji modifier fitzpatrick type-4 */, u8"\U0001F3FE" /* emoji modifier fitzpatrick type-5 */, u8"\U0001F3FF" /* emoji modifier fitzpatrick type-6 */}; const size_t start_index = remove_selectors ? 0 : 1; size_t j = 0; for (size_t i = 0; i < emoji.size();) { bool is_found = false; for (size_t k = start_index; k < sizeof(modifiers) / sizeof(*modifiers); k++) { auto length = modifiers[k].size(); if (i + length <= emoji.size() && Slice(&emoji[i], length) == modifiers[k]) { // skip the modifier i += length; is_found = true; break; } } if (!is_found) { emoji[j++] = emoji[i++]; } } if (j != 0) { emoji.resize(j); } } string remove_emoji_selectors(Slice emoji) { if (!is_emoji(emoji)) { return emoji.str(); } string str; for (size_t i = 0; i < emoji.size(); i++) { if (i + 3 <= emoji.size() && emoji[i] == '\xEF' && emoji[i + 1] == '\xB8' && emoji[i + 2] == '\x8F') { // skip \uFE0F i += 2; } else { str += emoji[i]; } } CHECK(is_emoji(str)); return str; } } // namespace td