diff --git a/td/telegram/MessageEntity.cpp b/td/telegram/MessageEntity.cpp
index 490c398bc..27bb5f460 100644
--- a/td/telegram/MessageEntity.cpp
+++ b/td/telegram/MessageEntity.cpp
@@ -434,6 +434,65 @@ static vector<Slice> match_cashtags(Slice str) {
   return result;
 }
 
+// Returns every maximal run of digits and ':' in str whose first ':' has a digit on
+// both sides and which is neither preceded nor followed by a word character.
+// The returned slices are only candidates; find_media_timestamps validates the
+// number and the size of their parts.
+static vector<Slice> match_media_timestamps(Slice str) {
+  vector<Slice> result;
+  const unsigned char *begin = str.ubegin();
+  const unsigned char *end = str.uend();
+  const unsigned char *ptr = begin;
+
+  while (true) {
+    ptr = static_cast<const unsigned char *>(std::memchr(ptr, ':', narrow_cast<int32>(end - ptr)));
+    if (ptr == nullptr) {
+      break;
+    }
+
+    // extend the match from the found ':' in both directions over digits and colons
+    auto media_timestamp_begin = ptr;
+    while (media_timestamp_begin != begin &&
+           (media_timestamp_begin[-1] == ':' || is_digit(media_timestamp_begin[-1]))) {
+      media_timestamp_begin--;
+    }
+    auto media_timestamp_end = ptr;
+    while (media_timestamp_end + 1 != end && (media_timestamp_end[1] == ':' || is_digit(media_timestamp_end[1]))) {
+      media_timestamp_end++;
+    }
+    // make the end pointer exclusive
+    media_timestamp_end++;
+
+    if (media_timestamp_begin != ptr && media_timestamp_end != ptr + 1 && is_digit(ptr[1])) {
+      // the search continues after the run regardless of whether the run is accepted
+      ptr = media_timestamp_end;
+
+      // the run must not be adjacent to a word character on either side
+      if (media_timestamp_begin != begin) {
+        uint32 prev;
+        next_utf8_unsafe(prev_utf8_unsafe(media_timestamp_begin), &prev, "match_media_timestamps 1");
+
+        if (is_word_character(prev)) {
+          continue;
+        }
+      }
+      if (media_timestamp_end != end) {
+        uint32 next;
+        next_utf8_unsafe(media_timestamp_end, &next, "match_media_timestamps 2");
+
+        if (is_word_character(next)) {
+          continue;
+        }
+      }
+
+      result.emplace_back(media_timestamp_begin, media_timestamp_end);
+    } else {
+      ptr = media_timestamp_end;
+    }
+  }
+  return result;
+}
+
 static vector<Slice> match_bank_card_numbers(Slice str) {
   vector<Slice> result;
   const unsigned char *begin = str.ubegin();
@@ -1251,6 +1310,46 @@ vector<std::pair<Slice, bool>> find_urls(Slice str) {
   return result;
 }
 
+// Finds media timestamps of the form "minutes:seconds" (1-4 minute digits) and
+// "hours:minutes:seconds" (1-2 hour digits, 1-2 minute digits, minutes below 60); seconds are always
+// 2 digits below 60. Returns pairs of the matched slice and the timestamp converted to seconds.
+vector<std::pair<Slice, int32>> find_media_timestamps(Slice str) {
+  vector<std::pair<Slice, int32>> result;
+  for (auto media_timestamp : match_media_timestamps(str)) {
+    vector<Slice> parts = full_split(media_timestamp, ':');
+    CHECK(parts.size() >= 2);
+    // at most 3 parts are allowed, and the seconds part must consist of exactly 2 characters
+    if (parts.size() > 3 || parts.back().size() != 2) {
+      continue;
+    }
+    auto seconds = to_integer<int32>(parts.back());
+    if (seconds >= 60) {
+      continue;
+    }
+    if (parts.size() == 2) {
+      if (parts[0].size() > 4 || parts[0].empty()) {
+        continue;
+      }
+
+      auto minutes = to_integer<int32>(parts[0]);
+      result.emplace_back(media_timestamp, minutes * 60 + seconds);
+      continue;
+    } else {
+      if (parts[0].size() > 2 || parts[1].size() > 2 || parts[0].empty() || parts[1].empty()) {
+        continue;
+      }
+
+      auto minutes = to_integer<int32>(parts[1]);
+      if (minutes >= 60) {
+        continue;
+      }
+      auto hours = to_integer<int32>(parts[0]);
+      result.emplace_back(media_timestamp, hours * 3600 + minutes * 60 + seconds);
+    }
+  }
+  return result;
+}
+
 static int32 text_length(Slice text) {
   return narrow_cast<int32>(utf8_utf16_length(text));
 }
@@ -1540,6 +1639,24 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands) {
   return entities;
 }
 
+// Creates a MediaTimestamp entity for every media timestamp found in text, passing the
+// timestamp in seconds as a decimal string argument of the entity.
+static vector<MessageEntity> find_media_timestamp_entities(Slice text) {
+  vector<MessageEntity> entities;
+
+  auto new_entities = find_media_timestamps(text);
+  for (auto &entity : new_entities) {
+    auto offset = narrow_cast<int32>(entity.first.begin() - text.begin());
+    auto length = narrow_cast<int32>(entity.first.size());
+    entities.emplace_back(MessageEntity::Type::MediaTimestamp, offset, length, to_string(entity.second));
+  }
+
+  // offset and length above are in bytes; presumably fix_entity_offsets converts them to UTF-16 units (confirm)
+  fix_entity_offsets(text, entities);
+
+  return entities;
+}
+
 static vector<MessageEntity> merge_entities(vector<MessageEntity> old_entities, vector<MessageEntity> new_entities) {
   if (new_entities.empty()) {
     return old_entities;
@@ -3892,6 +4009,9 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
   if (!skip_new_entities) {
     merge_new_entities(entities, find_entities(text, skip_bot_commands));
   }
+  if (!skip_media_timestamps) {
+    merge_new_entities(entities, find_media_timestamp_entities(text));
+  }
 
   // new whitespace-only entities could be added after splitting of entities
   remove_invalid_entities(text, entities);
diff --git a/td/telegram/MessageEntity.h b/td/telegram/MessageEntity.h
index 3374935fb..9dd5257ba 100644
--- a/td/telegram/MessageEntity.h
+++ b/td/telegram/MessageEntity.h
@@ -146,7 +146,8 @@ vector<Slice> find_cashtags(Slice str);
 vector<Slice> find_bank_card_numbers(Slice str);
 vector<Slice> find_tg_urls(Slice str);
 bool is_email_address(Slice str);
-vector<std::pair<Slice, bool>> find_urls(Slice str);  // slice + is_email_address
+vector<std::pair<Slice, bool>> find_urls(Slice str);               // slice + is_email_address
+vector<std::pair<Slice, int32>> find_media_timestamps(Slice str);  // slice + media_timestamp
 
 string get_first_url(Slice text, const vector<MessageEntity> &entities);
 
diff --git a/test/message_entities.cpp b/test/message_entities.cpp
index 964e47725..4e8935aca 100644
--- a/test/message_entities.cpp
+++ b/test/message_entities.cpp
@@ -172,6 +172,48 @@ TEST(MessageEntities, cashtag) {
   check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
 }
 
+// Fails fatally unless find_media_timestamps(str) returns exactly the expected (slice, seconds) pairs.
+static void check_media_timestamp(const td::string &str, const td::vector<std::pair<td::Slice, td::int32>> &expected) {
+  auto result = td::find_media_timestamps(str);
+  if (result != expected) {
+    LOG(FATAL) << td::tag("text", str) << td::tag("got", td::format::as_array(result))
+               << td::tag("expected", td::format::as_array(expected));
+  }
+}
+
+TEST(MessageEntities, media_timestamp) {
+  check_media_timestamp("", {});
+  check_media_timestamp(":", {});
+  check_media_timestamp(":1", {});
+  check_media_timestamp("a:1", {});
+  check_media_timestamp("01", {});
+  check_media_timestamp("01:", {});
+  check_media_timestamp("01::", {});
+  check_media_timestamp("01::", {});
+  check_media_timestamp("a1:1a", {});
+  check_media_timestamp("a1::01a", {});
+  check_media_timestamp("2001:db8::8a2e:f70:13a4", {});
+  check_media_timestamp("0:00", {{"0:00", 0}});
+  check_media_timestamp("+0:00", {{"0:00", 0}});
+  check_media_timestamp("0:00+", {{"0:00", 0}});
+  check_media_timestamp("a0:00", {});
+  check_media_timestamp("0:00a", {});
+  check_media_timestamp("б0:00", {});
+  check_media_timestamp("0:00б", {});
+  check_media_timestamp("_0:00", {});
+  check_media_timestamp("0:00_", {});
+  check_media_timestamp("00:00:00:00", {});
+  check_media_timestamp("1:1:01 1:1:1", {{"1:1:01", 3661}});
+  check_media_timestamp("0:0:00 00:00 000:00 0000:00 00000:00 00:00:00 000:00:00 00:000:00 00:00:000",
+                        {{"0:0:00", 0}, {"00:00", 0}, {"000:00", 0}, {"0000:00", 0}, {"00:00:00", 0}});
+  check_media_timestamp("00:0:00 0:00:00 00::00 :00:00 00:00: 00:00:0 00:00:", {{"00:0:00", 0}, {"0:00:00", 0}});
+  check_media_timestamp("1:1:59 1:1:-1 1:1:60", {{"1:1:59", 3719}});
+  check_media_timestamp("1:59:00 1:-1:00 1:60:00", {{"1:59:00", 7140}, {"1:00", 60}});
+  check_media_timestamp("59:59 60:00", {{"59:59", 3599}, {"60:00", 3600}});
+  check_media_timestamp("9999:59 99:59:59 99:60:59", {{"9999:59", 599999}, {"99:59:59", 360000 - 1}});
+  check_media_timestamp("2001:db8::8a2e:f70:13a4", {});
+}
+
 static void check_bank_card_number(const td::string &str, const td::vector<td::string> &expected) {
   auto result_slice = td::find_bank_card_numbers(str);
   td::vector<td::string> result;