Implement find_media_timestamps.
This commit is contained in:
parent
bdbf4db9ff
commit
14cf908017
@ -434,6 +434,57 @@ static vector<Slice> match_cashtags(Slice str) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static vector<Slice> match_media_timestamps(Slice str) {
|
||||||
|
vector<Slice> result;
|
||||||
|
const unsigned char *begin = str.ubegin();
|
||||||
|
const unsigned char *end = str.uend();
|
||||||
|
const unsigned char *ptr = begin;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
ptr = static_cast<const unsigned char *>(std::memchr(ptr, ':', narrow_cast<int32>(end - ptr)));
|
||||||
|
if (ptr == nullptr) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto media_timestamp_begin = ptr;
|
||||||
|
while (media_timestamp_begin != begin &&
|
||||||
|
(media_timestamp_begin[-1] == ':' || is_digit(media_timestamp_begin[-1]))) {
|
||||||
|
media_timestamp_begin--;
|
||||||
|
}
|
||||||
|
auto media_timestamp_end = ptr;
|
||||||
|
while (media_timestamp_end + 1 != end && (media_timestamp_end[1] == ':' || is_digit(media_timestamp_end[1]))) {
|
||||||
|
media_timestamp_end++;
|
||||||
|
}
|
||||||
|
media_timestamp_end++;
|
||||||
|
|
||||||
|
if (media_timestamp_begin != ptr && media_timestamp_end != ptr + 1 && is_digit(ptr[1])) {
|
||||||
|
ptr = media_timestamp_end;
|
||||||
|
|
||||||
|
if (media_timestamp_begin != begin) {
|
||||||
|
uint32 prev;
|
||||||
|
next_utf8_unsafe(prev_utf8_unsafe(media_timestamp_begin), &prev, "match_media_timestamps 1");
|
||||||
|
|
||||||
|
if (is_word_character(prev)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (media_timestamp_end != end) {
|
||||||
|
uint32 next;
|
||||||
|
next_utf8_unsafe(media_timestamp_end, &next, "match_media_timestamps 2");
|
||||||
|
|
||||||
|
if (is_word_character(next)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result.emplace_back(media_timestamp_begin, media_timestamp_end);
|
||||||
|
} else {
|
||||||
|
ptr = media_timestamp_end;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
static vector<Slice> match_bank_card_numbers(Slice str) {
|
static vector<Slice> match_bank_card_numbers(Slice str) {
|
||||||
vector<Slice> result;
|
vector<Slice> result;
|
||||||
const unsigned char *begin = str.ubegin();
|
const unsigned char *begin = str.ubegin();
|
||||||
@ -1251,6 +1302,42 @@ vector<std::pair<Slice, bool>> find_urls(Slice str) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vector<std::pair<Slice, int32>> find_media_timestamps(Slice str) {
|
||||||
|
vector<std::pair<Slice, int32>> result;
|
||||||
|
for (auto media_timestamp : match_media_timestamps(str)) {
|
||||||
|
vector<Slice> parts = full_split(media_timestamp, ':');
|
||||||
|
CHECK(parts.size() >= 2);
|
||||||
|
if (parts.size() > 3 || parts.back().size() != 2) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto seconds = to_integer<int32>(parts.back());
|
||||||
|
if (seconds >= 60) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (parts.size() == 2) {
|
||||||
|
if (parts[0].size() > 4 || parts[0].empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto minutes = to_integer<int32>(parts[0]);
|
||||||
|
result.emplace_back(media_timestamp, minutes * 60 + seconds);
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
if (parts[0].size() > 2 || parts[1].size() > 2 || parts[0].empty() || parts[1].empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto minutes = to_integer<int32>(parts[1]);
|
||||||
|
if (minutes >= 60) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto hours = to_integer<int32>(parts[0]);
|
||||||
|
result.emplace_back(media_timestamp, hours * 3600 + minutes * 60 + seconds);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
static int32 text_length(Slice text) {
|
static int32 text_length(Slice text) {
|
||||||
return narrow_cast<int32>(utf8_utf16_length(text));
|
return narrow_cast<int32>(utf8_utf16_length(text));
|
||||||
}
|
}
|
||||||
@ -1540,6 +1627,21 @@ vector<MessageEntity> find_entities(Slice text, bool skip_bot_commands) {
|
|||||||
return entities;
|
return entities;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static vector<MessageEntity> find_media_timestamp_entities(Slice text) {
|
||||||
|
vector<MessageEntity> entities;
|
||||||
|
|
||||||
|
auto new_entities = find_media_timestamps(text);
|
||||||
|
for (auto &entity : new_entities) {
|
||||||
|
auto offset = narrow_cast<int32>(entity.first.begin() - text.begin());
|
||||||
|
auto length = narrow_cast<int32>(entity.first.size());
|
||||||
|
entities.emplace_back(MessageEntity::Type::MediaTimestamp, offset, length, to_string(entity.second));
|
||||||
|
}
|
||||||
|
|
||||||
|
fix_entity_offsets(text, entities);
|
||||||
|
|
||||||
|
return entities;
|
||||||
|
}
|
||||||
|
|
||||||
static vector<MessageEntity> merge_entities(vector<MessageEntity> old_entities, vector<MessageEntity> new_entities) {
|
static vector<MessageEntity> merge_entities(vector<MessageEntity> old_entities, vector<MessageEntity> new_entities) {
|
||||||
if (new_entities.empty()) {
|
if (new_entities.empty()) {
|
||||||
return old_entities;
|
return old_entities;
|
||||||
@ -3892,6 +3994,9 @@ Status fix_formatted_text(string &text, vector<MessageEntity> &entities, bool al
|
|||||||
if (!skip_new_entities) {
|
if (!skip_new_entities) {
|
||||||
merge_new_entities(entities, find_entities(text, skip_bot_commands));
|
merge_new_entities(entities, find_entities(text, skip_bot_commands));
|
||||||
}
|
}
|
||||||
|
if (!skip_media_timestamps) {
|
||||||
|
merge_new_entities(entities, find_media_timestamp_entities(text));
|
||||||
|
}
|
||||||
|
|
||||||
// new whitespace-only entities could be added after splitting of entities
|
// new whitespace-only entities could be added after splitting of entities
|
||||||
remove_invalid_entities(text, entities);
|
remove_invalid_entities(text, entities);
|
||||||
|
@ -146,7 +146,8 @@ vector<Slice> find_cashtags(Slice str);
|
|||||||
vector<Slice> find_bank_card_numbers(Slice str);
|
vector<Slice> find_bank_card_numbers(Slice str);
|
||||||
vector<Slice> find_tg_urls(Slice str);
|
vector<Slice> find_tg_urls(Slice str);
|
||||||
bool is_email_address(Slice str);
|
bool is_email_address(Slice str);
|
||||||
vector<std::pair<Slice, bool>> find_urls(Slice str); // slice + is_email_address
|
vector<std::pair<Slice, bool>> find_urls(Slice str); // slice + is_email_address
|
||||||
|
vector<std::pair<Slice, int32>> find_media_timestamps(Slice str); // slice + media_timestamp
|
||||||
|
|
||||||
string get_first_url(Slice text, const vector<MessageEntity> &entities);
|
string get_first_url(Slice text, const vector<MessageEntity> &entities);
|
||||||
|
|
||||||
|
@ -172,6 +172,47 @@ TEST(MessageEntities, cashtag) {
|
|||||||
check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
|
check_cashtag(u8"\u2122$ABC\u2122", {"$ABC"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void check_media_timestamp(const td::string &str, const td::vector<std::pair<td::Slice, td::int32>> &expected) {
|
||||||
|
auto result = td::find_media_timestamps(str);
|
||||||
|
if (result != expected) {
|
||||||
|
LOG(FATAL) << td::tag("text", str) << td::tag("got", td::format::as_array(result))
|
||||||
|
<< td::tag("expected", td::format::as_array(expected));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(MessageEntities, media_timestamp) {
|
||||||
|
check_media_timestamp("", {});
|
||||||
|
check_media_timestamp(":", {});
|
||||||
|
check_media_timestamp(":1", {});
|
||||||
|
check_media_timestamp("a:1", {});
|
||||||
|
check_media_timestamp("01", {});
|
||||||
|
check_media_timestamp("01:", {});
|
||||||
|
check_media_timestamp("01::", {});
|
||||||
|
check_media_timestamp("01::", {});
|
||||||
|
check_media_timestamp("a1:1a", {});
|
||||||
|
check_media_timestamp("a1::01a", {});
|
||||||
|
check_media_timestamp("2001:db8::8a2e:f70:13a4", {});
|
||||||
|
check_media_timestamp("0:00", {{"0:00", 0}});
|
||||||
|
check_media_timestamp("+0:00", {{"0:00", 0}});
|
||||||
|
check_media_timestamp("0:00+", {{"0:00", 0}});
|
||||||
|
check_media_timestamp("a0:00", {});
|
||||||
|
check_media_timestamp("0:00a", {});
|
||||||
|
check_media_timestamp("б0:00", {});
|
||||||
|
check_media_timestamp("0:00б", {});
|
||||||
|
check_media_timestamp("_0:00", {});
|
||||||
|
check_media_timestamp("0:00_", {});
|
||||||
|
check_media_timestamp("00:00:00:00", {});
|
||||||
|
check_media_timestamp("1:1:01 1:1:1", {{"1:1:01", 3661}});
|
||||||
|
check_media_timestamp("0:0:00 00:00 000:00 0000:00 00000:00 00:00:00 000:00:00 00:000:00 00:00:000",
|
||||||
|
{{"0:0:00", 0}, {"00:00", 0}, {"000:00", 0}, {"0000:00", 0}, {"00:00:00", 0}});
|
||||||
|
check_media_timestamp("00:0:00 0:00:00 00::00 :00:00 00:00: 00:00:0 00:00:", {{"00:0:00", 0}, {"0:00:00", 0}});
|
||||||
|
check_media_timestamp("1:1:59 1:1:-1 1:1:60", {{"1:1:59", 3719}});
|
||||||
|
check_media_timestamp("1:59:00 1:-1:00 1:60:00", {{"1:59:00", 7140}, {"1:00", 60}});
|
||||||
|
check_media_timestamp("59:59 60:00", {{"59:59", 3599}, {"60:00", 3600}});
|
||||||
|
check_media_timestamp("9999:59 99:59:59 99:60:59", {{"9999:59", 599999}, {"99:59:59", 360000 - 1}});
|
||||||
|
check_media_timestamp("2001:db8::8a2e:f70:13a4", {});
|
||||||
|
}
|
||||||
|
|
||||||
static void check_bank_card_number(const td::string &str, const td::vector<td::string> &expected) {
|
static void check_bank_card_number(const td::string &str, const td::vector<td::string> &expected) {
|
||||||
auto result_slice = td::find_bank_card_numbers(str);
|
auto result_slice = td::find_bank_card_numbers(str);
|
||||||
td::vector<td::string> result;
|
td::vector<td::string> result;
|
||||||
|
Loading…
Reference in New Issue
Block a user