Implement find_media_timestamps.

This commit is contained in:
levlam 2021-07-28 08:30:22 +03:00
parent bdbf4db9ff
commit 14cf908017
3 changed files with 148 additions and 1 deletions

View File

@ -434,6 +434,57 @@ static vector<Slice> match_cashtags(Slice str) {
return result; return result;
} }
// Collects candidate media timestamps from str.
//
// A candidate is a maximal run of ASCII digits and ':' characters around a colon.
// It is reported only when:
//   - the run does not start at the colon that was found (something precedes it in the run),
//   - the character right after that colon is a digit,
//   - the code points immediately before and after the run (if any) are not word
//     characters according to is_word_character.
// Each returned Slice spans [run_begin, run_end) inside str. No validation of the
// number of components or of their values is done here.
static vector<Slice> match_media_timestamps(Slice str) {
  vector<Slice> result;
  const unsigned char *begin = str.ubegin();
  const unsigned char *end = str.uend();
  const unsigned char *ptr = begin;

  // Characters which may be part of a timestamp run.
  auto is_run_character = [](unsigned char c) {
    return c == ':' || is_digit(c);
  };

  for (;;) {
    auto colon = static_cast<const unsigned char *>(std::memchr(ptr, ':', narrow_cast<int32>(end - ptr)));
    if (colon == nullptr) {
      return result;
    }

    // Grow the run to the left of the colon ...
    auto run_begin = colon;
    while (run_begin != begin && is_run_character(run_begin[-1])) {
      --run_begin;
    }
    // ... and to the right of it; run_end is one past the last run character.
    auto run_end = colon + 1;
    while (run_end != end && is_run_character(*run_end)) {
      ++run_end;
    }

    // Whatever happens with this run, scanning resumes right after it.
    ptr = run_end;

    // colon[1] is read only when the run extends past the colon, so it is in bounds.
    if (run_begin == colon || run_end == colon + 1 || !is_digit(colon[1])) {
      continue;
    }

    if (run_begin != begin) {
      uint32 prev;
      next_utf8_unsafe(prev_utf8_unsafe(run_begin), &prev, "match_media_timestamps 1");
      if (is_word_character(prev)) {
        continue;
      }
    }
    if (run_end != end) {
      uint32 next;
      next_utf8_unsafe(run_end, &next, "match_media_timestamps 2");
      if (is_word_character(next)) {
        continue;
      }
    }

    result.emplace_back(run_begin, run_end);
  }
}
static vector<Slice> match_bank_card_numbers(Slice str) { static vector<Slice> match_bank_card_numbers(Slice str) {
vector<Slice> result; vector<Slice> result;
const unsigned char *begin = str.ubegin(); const unsigned char *begin = str.ubegin();
@ -1251,6 +1302,42 @@ vector<std::pair<Slice, bool>> find_urls(Slice str) {
return result; return result;
} }
// Finds media timestamps in str and returns each matched slice together with
// its value in seconds.
//
// Candidates come from match_media_timestamps and are split on ':'. Accepted forms:
//   - two components: 1-4 characters of minutes, then exactly 2 characters of
//     seconds below 60; value is minutes * 60 + seconds (minutes are not capped at 59);
//   - three components: 1-2 characters of hours, 1-2 characters of minutes below 60,
//     then exactly 2 characters of seconds below 60;
//     value is hours * 3600 + minutes * 60 + seconds.
// Everything else (more than three components, empty components, out-of-range
// values) is skipped.
vector<std::pair<Slice, int32>> find_media_timestamps(Slice str) {
  vector<std::pair<Slice, int32>> result;
  for (auto candidate : match_media_timestamps(str)) {
    vector<Slice> parts = full_split(candidate, ':');
    CHECK(parts.size() >= 2);

    // The last component is always the seconds and must be written with two characters.
    if (parts.size() > 3 || parts.back().size() != 2) {
      continue;
    }
    auto seconds = to_integer<int32>(parts.back());
    if (seconds >= 60) {
      continue;
    }

    if (parts.size() != 2) {
      // hours:minutes:seconds
      const Slice &hours_part = parts[0];
      const Slice &minutes_part = parts[1];
      if (hours_part.size() > 2 || minutes_part.size() > 2 || hours_part.empty() || minutes_part.empty()) {
        continue;
      }
      auto minutes = to_integer<int32>(minutes_part);
      if (minutes >= 60) {
        continue;
      }
      auto hours = to_integer<int32>(hours_part);
      result.emplace_back(candidate, hours * 3600 + minutes * 60 + seconds);
      continue;
    }

    // minutes:seconds
    const Slice &minutes_part = parts[0];
    if (minutes_part.size() > 4 || minutes_part.empty()) {
      continue;
    }
    auto minutes = to_integer<int32>(minutes_part);
    result.emplace_back(candidate, minutes * 60 + seconds);
  }
  return result;
}
static int32 text_length(Slice text) { static int32 text_length(Slice text) {
return narrow_cast<int32>(utf8_utf16_length(text)); return narrow_cast<int32>(utf8_utf16_length(text));
} }
@ -1540,6 +1627,21 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands) {
return entities; return entities;
} }
// Builds MediaTimestamp entities for every timestamp reported by
// find_media_timestamps(text). Each entity is created from the byte offset of
// the match relative to text.begin(), its byte length, and the timestamp value
// rendered with to_string; fix_entity_offsets is then applied to the whole list.
static vector<MessageEntity> find_media_timestamp_entities(Slice text) {
  vector<MessageEntity> entities;
  for (auto &timestamp : find_media_timestamps(text)) {
    int32 offset = narrow_cast<int32>(timestamp.first.begin() - text.begin());
    int32 length = narrow_cast<int32>(timestamp.first.size());
    entities.emplace_back(MessageEntity::Type::MediaTimestamp, offset, length, to_string(timestamp.second));
  }
  fix_entity_offsets(text, entities);
  return entities;
}
static vector<MessageEntity> merge_entities(vector<MessageEntity> old_entities, vector<MessageEntity> new_entities) { static vector<MessageEntity> merge_entities(vector<MessageEntity> old_entities, vector<MessageEntity> new_entities) {
if (new_entities.empty()) { if (new_entities.empty()) {
return old_entities; return old_entities;
@ -3892,6 +3994,9 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
if (!skip_new_entities) { if (!skip_new_entities) {
merge_new_entities(entities, find_entities(text, skip_bot_commands)); merge_new_entities(entities, find_entities(text, skip_bot_commands));
} }
if (!skip_media_timestamps) {
merge_new_entities(entities, find_media_timestamp_entities(text));
}
// new whitespace-only entities could be added after splitting of entities // new whitespace-only entities could be added after splitting of entities
remove_invalid_entities(text, entities); remove_invalid_entities(text, entities);

View File

@ -146,7 +146,8 @@ vector<Slice> find_cashtags(Slice str);
vector<Slice> find_bank_card_numbers(Slice str); vector<Slice> find_bank_card_numbers(Slice str);
vector<Slice> find_tg_urls(Slice str); vector<Slice> find_tg_urls(Slice str);
bool is_email_address(Slice str); bool is_email_address(Slice str);
vector<std::pair<Slice, bool>> find_urls(Slice str); // slice + is_email_address vector<std::pair<Slice, bool>> find_urls(Slice str); // slice + is_email_address
vector<std::pair<Slice, int32>> find_media_timestamps(Slice str); // slice + media_timestamp
string get_first_url(Slice text, const vector<MessageEntity> &entities); string get_first_url(Slice text, const vector<MessageEntity> &entities);

View File

@ -172,6 +172,47 @@ TEST(MessageEntities, cashtag) {
check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"}); check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
} }
// Test helper: runs td::find_media_timestamps on str and logs a FATAL message
// containing the input, the actual result and the expected result when they differ.
static void check_media_timestamp(const td::string &str, const td::vector<std::pair<td::Slice, td::int32>> &expected) {
  auto actual = td::find_media_timestamps(str);
  if (actual == expected) {
    return;
  }
  LOG(FATAL) << td::tag("text", str) << td::tag("got", td::format::as_array(actual))
             << td::tag("expected", td::format::as_array(expected));
}
// Exercises td::find_media_timestamps through check_media_timestamp: every call
// gives an input text and the exact list of (matched text, seconds) pairs expected.
TEST(MessageEntities, media_timestamp) {
// Inputs for which no timestamp is expected: empty text, lone or dangling colons,
// single-digit seconds, empty components, letters glued to the digits, and an
// IPv6-looking string.
check_media_timestamp("", {});
check_media_timestamp(":", {});
check_media_timestamp(":1", {});
check_media_timestamp("a:1", {});
check_media_timestamp("01", {});
check_media_timestamp("01:", {});
check_media_timestamp("01::", {});
check_media_timestamp("01::", {});
check_media_timestamp("a1:1a", {});
check_media_timestamp("a1::01a", {});
check_media_timestamp("2001:db8::8a2e:f70:13a4", {});
// Neighbouring characters: '+' on either side still yields a match, while an
// adjacent ASCII letter, Cyrillic letter or underscore yields none.
check_media_timestamp("0:00", {{"0:00", 0}});
check_media_timestamp("+0:00", {{"0:00", 0}});
check_media_timestamp("0:00+", {{"0:00", 0}});
check_media_timestamp("a0:00", {});
check_media_timestamp("0:00a", {});
check_media_timestamp("б0:00", {});
check_media_timestamp("0:00б", {});
check_media_timestamp("_0:00", {});
check_media_timestamp("0:00_", {});
// Number of components and width of each component.
check_media_timestamp("00:00:00:00", {});
check_media_timestamp("1:1:01 1:1:1", {{"1:1:01", 3661}});
check_media_timestamp("0:0:00 00:00 000:00 0000:00 00000:00 00:00:00 000:00:00 00:000:00 00:00:000",
{{"0:0:00", 0}, {"00:00", 0}, {"000:00", 0}, {"0000:00", 0}, {"00:00:00", 0}});
check_media_timestamp("00:0:00 0:00:00 00::00 :00:00 00:00: 00:00:0 00:00:", {{"00:0:00", 0}, {"0:00:00", 0}});
// Value limits: seconds, and minutes of the three-component form, must be below 60;
// in "1:-1:00" only the trailing "1:00" is matched.
check_media_timestamp("1:1:59 1:1:-1 1:1:60", {{"1:1:59", 3719}});
check_media_timestamp("1:59:00 1:-1:00 1:60:00", {{"1:59:00", 7140}, {"1:00", 60}});
// In the two-component form the minutes may be 60 or more.
check_media_timestamp("59:59 60:00", {{"59:59", 3599}, {"60:00", 3600}});
// Largest accepted values for both forms.
check_media_timestamp("9999:59 99:59:59 99:60:59", {{"9999:59", 599999}, {"99:59:59", 360000 - 1}});
check_media_timestamp("2001:db8::8a2e:f70:13a4", {});
}
static void check_bank_card_number(const td::string &str, const td::vector<td::string> &expected) { static void check_bank_card_number(const td::string &str, const td::vector<td::string> &expected) {
auto result_slice = td::find_bank_card_numbers(str); auto result_slice = td::find_bank_card_numbers(str);
td::vector<td::string> result; td::vector<td::string> result;