Support speech recognition for video notes.

This commit is contained in:
levlam 2022-10-20 21:31:00 +03:00
parent c23dceddf3
commit 14b80ecd6f
5 changed files with 144 additions and 6 deletions

View File

@ -321,8 +321,8 @@ video duration:int32 width:int32 height:int32 file_name:string mime_type:string
//@description Describes a video note. The video must be equal in width and height, cropped to a circle, and stored in MPEG4 format @duration Duration of the video, in seconds; as defined by the sender
//@length Video width and height; as defined by the sender @minithumbnail Video minithumbnail; may be null
//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @video File containing the video
videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail video:file = VideoNote;
//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @speech_recognition_result Result of speech recognition in the video note; may be null @video File containing the video
videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail speech_recognition_result:SpeechRecognitionResult video:file = VideoNote;
//@description Describes a voice note. The voice note must be encoded with the Opus codec, and stored inside an OGG container. Voice notes can have only a single audio channel
//@duration Duration of the voice note, in seconds; as defined by the sender @waveform A waveform representation of the voice note in 5-bit format
@ -5108,12 +5108,12 @@ getMessageLinkInfo url:string = MessageLinkInfo;
//@to_language_code A two-letter ISO 639-1 language code of the language to which the message is translated
translateText text:string from_language_code:string to_language_code:string = Text;
//@description Recognizes speech in a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the voice note is too long to be recognized
//@description Recognizes speech in a video note or a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the media duration is too long to be recognized
//@chat_id Identifier of the chat to which the message belongs
//@message_id Identifier of the message
recognizeSpeech chat_id:int53 message_id:int53 = Ok;
//@description Rates recognized speech in a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good
//@description Rates recognized speech in a video note or a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good
rateSpeechRecognition chat_id:int53 message_id:int53 is_good:Bool = Ok;

View File

@ -6188,6 +6188,8 @@ void update_used_hashtags(Td *td, const MessageContent *content) {
void recognize_message_content_speech(Td *td, const MessageContent *content, FullMessageId full_message_id,
Promise<Unit> &&promise) {
switch (content->get_type()) {
case MessageContentType::VideoNote:
return td->video_notes_manager_->recognize_speech(full_message_id, std::move(promise));
case MessageContentType::VoiceNote:
return td->voice_notes_manager_->recognize_speech(full_message_id, std::move(promise));
default:
@ -6198,6 +6200,8 @@ void recognize_message_content_speech(Td *td, const MessageContent *content, Ful
void rate_message_content_speech_recognition(Td *td, const MessageContent *content, FullMessageId full_message_id,
bool is_good, Promise<Unit> &&promise) {
switch (content->get_type()) {
case MessageContentType::VideoNote:
return td->video_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise));
case MessageContentType::VoiceNote:
return td->voice_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise));
default:

View File

@ -9,12 +9,14 @@
#include "td/telegram/AuthManager.h"
#include "td/telegram/files/FileManager.h"
#include "td/telegram/Global.h"
#include "td/telegram/MessagesManager.h"
#include "td/telegram/OptionManager.h"
#include "td/telegram/PhotoFormat.h"
#include "td/telegram/secret_api.h"
#include "td/telegram/Td.h"
#include "td/telegram/td_api.h"
#include "td/telegram/telegram_api.h"
#include "td/telegram/UpdatesManager.h"
#include "td/actor/actor.h"
@ -46,10 +48,13 @@ tl_object_ptr<td_api::videoNote> VideoNotesManager::get_video_note_object(FileId
}
auto video_note = get_video_note(file_id);
auto speech_recognition_result = video_note->transcription_info == nullptr
? nullptr
: video_note->transcription_info->get_speech_recognition_result_object();
return make_tl_object<td_api::videoNote>(
video_note->duration, video_note->dimensions.width, get_minithumbnail_object(video_note->minithumbnail),
get_thumbnail_object(td_->file_manager_.get(), video_note->thumbnail, PhotoFormat::Jpeg),
td_->file_manager_->get_file_object(file_id));
std::move(speech_recognition_result), td_->file_manager_->get_file_object(file_id));
}
FileId VideoNotesManager::on_get_video_note(unique_ptr<VideoNote> new_video_note, bool replace) {
@ -78,10 +83,17 @@ FileId VideoNotesManager::on_get_video_note(unique_ptr<VideoNote> new_video_note
}
v->thumbnail = std::move(new_video_note->thumbnail);
}
if (TranscriptionInfo::update_from(v->transcription_info, std::move(new_video_note->transcription_info))) {
on_video_note_transcription_completed(file_id);
}
}
return file_id;
}
// Returns the mutable VideoNote entry that video_notes_ holds for file_id,
// exactly as reported by video_notes_.get_pointer().
VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) {
  VideoNote *video_note = video_notes_.get_pointer(file_id);
  return video_note;
}
// Read-only lookup of the VideoNote entry that video_notes_ holds for file_id,
// exactly as reported by video_notes_.get_pointer().
const VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) const {
  const VideoNote *video_note = video_notes_.get_pointer(file_id);
  return video_note;
}
@ -103,9 +115,14 @@ FileId VideoNotesManager::dup_video_note(FileId new_id, FileId old_id) {
CHECK(old_video_note != nullptr);
auto &new_video_note = video_notes_[new_id];
CHECK(new_video_note == nullptr);
new_video_note = make_unique<VideoNote>(*old_video_note);
new_video_note = make_unique<VideoNote>();
new_video_note->file_id = new_id;
new_video_note->duration = old_video_note->duration;
new_video_note->dimensions = old_video_note->dimensions;
new_video_note->minithumbnail = old_video_note->minithumbnail;
new_video_note->thumbnail = old_video_note->thumbnail;
new_video_note->thumbnail.file_id = td_->file_manager_->dup_file_id(new_video_note->thumbnail.file_id);
new_video_note->transcription_info = TranscriptionInfo::copy_if_transcribed(old_video_note->transcription_info);
return new_id;
}
@ -173,6 +190,97 @@ void VideoNotesManager::unregister_video_note(FileId video_note_file_id, FullMes
CHECK(is_deleted);
}
// Requests speech recognition for the video note attached to the message full_message_id.
// The message must be present in message_video_notes_ and its file must have a VideoNote
// entry; both conditions are enforced with CHECK.
// The promise is handed over to TranscriptionInfo::recognize_speech.
void VideoNotesManager::recognize_speech(FullMessageId full_message_id, Promise<Unit> &&promise) {
  auto it = message_video_notes_.find(full_message_id);
  CHECK(it != message_video_notes_.end());

  auto file_id = it->second;
  auto video_note = get_video_note(file_id);
  CHECK(video_note != nullptr);

  // The transcription state is created on first use.
  auto &transcription_info = video_note->transcription_info;
  if (transcription_info == nullptr) {
    transcription_info = make_unique<TranscriptionInfo>();
  }

  // The result is routed back to this actor and flagged as the initial update.
  using UpdateResult = Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>>;
  auto self_id = actor_id(this);
  auto on_update = [self_id, file_id](UpdateResult r_update) {
    send_closure(self_id, &VideoNotesManager::on_transcribed_audio_update, file_id, true, std::move(r_update));
  };

  // A true result triggers a notification for every message that uses this video note.
  auto is_changed =
      transcription_info->recognize_speech(td_, full_message_id, std::move(promise), std::move(on_update));
  if (is_changed) {
    on_video_note_transcription_updated(file_id);
  }
}
// Handles the outcome of a transcription request for the video note file_id.
// is_initial is true when called from the handler installed by recognize_speech and
// false when called from the subscription installed below.
// r_update carries either an error or an updateTranscribedAudio object.
void VideoNotesManager::on_transcribed_audio_update(
    FileId file_id, bool is_initial, Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
  if (G()->close_flag()) {
    return;
  }

  auto video_note = get_video_note(file_id);
  CHECK(video_note != nullptr);
  CHECK(video_note->transcription_info != nullptr);
  auto &transcription_info = video_note->transcription_info;

  // Failure: record a copy of the error, notify messages, then fail the pending promises.
  if (r_update.is_error()) {
    auto failed_promises = transcription_info->on_failed_transcription(r_update.error().clone());
    on_video_note_transcription_updated(file_id);
    fail_promises(failed_promises, r_update.move_as_error());
    return;
  }

  auto update = r_update.move_as_ok();
  // Read the identifier before text_ is moved out of the update.
  auto transcription_id = update->transcription_id_;

  // Final result: store the text, notify messages, then complete the pending promises.
  if (!update->pending_) {
    auto completed_promises = transcription_info->on_final_transcription(std::move(update->text_), transcription_id);
    on_video_note_transcription_completed(file_id);
    set_promises(completed_promises);
    return;
  }

  // Partial result: notify messages only if on_partial_transcription reports a change.
  auto is_changed = transcription_info->on_partial_transcription(std::move(update->text_), transcription_id);
  if (is_changed) {
    on_video_note_transcription_updated(file_id);
  }
  if (!is_initial) {
    return;
  }

  // Only the initial pending update subscribes to further updates for this transcription.
  using UpdateResult = Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>>;
  auto self_id = actor_id(this);
  td_->updates_manager_->subscribe_to_transcribed_audio_updates(
      transcription_id, [self_id, file_id](UpdateResult r_update) {
        send_closure(self_id, &VideoNotesManager::on_transcribed_audio_update, file_id, false, std::move(r_update));
      });
}
// Calls MessagesManager::on_external_update_message_content for every message that
// video_note_messages_ lists for file_id; does nothing when there is no entry.
void VideoNotesManager::on_video_note_transcription_updated(FileId file_id) {
  auto messages_it = video_note_messages_.find(file_id);
  if (messages_it == video_note_messages_.end()) {
    return;
  }

  const auto &message_ids = messages_it->second;
  for (const auto &message_id : message_ids) {
    td_->messages_manager_->on_external_update_message_content(message_id);
  }
}
// Calls MessagesManager::on_update_message_content for every message that
// video_note_messages_ lists for file_id; does nothing when there is no entry.
void VideoNotesManager::on_video_note_transcription_completed(FileId file_id) {
  auto messages_it = video_note_messages_.find(file_id);
  if (messages_it == video_note_messages_.end()) {
    return;
  }

  const auto &message_ids = messages_it->second;
  for (const auto &message_id : message_ids) {
    td_->messages_manager_->on_update_message_content(message_id);
  }
}
// Forwards a rating (is_good) of the recognized speech for the video note attached to
// the message full_message_id. The message must be present in message_video_notes_ and
// its file must have a VideoNote entry; both conditions are enforced with CHECK.
void VideoNotesManager::rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise<Unit> &&promise) {
  auto it = message_video_notes_.find(full_message_id);
  CHECK(it != message_video_notes_.end());

  auto video_note = get_video_note(it->second);
  CHECK(video_note != nullptr);

  auto &transcription_info = video_note->transcription_info;
  if (transcription_info != nullptr) {
    transcription_info->rate_speech_recognition(td_, full_message_id, is_good, std::move(promise));
    return;
  }

  // No transcription state exists for this video note, so the request completes immediately.
  promise.set_value(Unit());
}
SecretInputMedia VideoNotesManager::get_secret_input_media(FileId video_note_file_id,
tl_object_ptr<telegram_api::InputEncryptedFile> input_file,
BufferSlice thumbnail, int32 layer) const {

View File

@ -13,6 +13,7 @@
#include "td/telegram/SecretInputMedia.h"
#include "td/telegram/td_api.h"
#include "td/telegram/telegram_api.h"
#include "td/telegram/TranscriptionInfo.h"
#include "td/actor/actor.h"
@ -46,6 +47,10 @@ class VideoNotesManager final : public Actor {
void unregister_video_note(FileId video_note_file_id, FullMessageId full_message_id, const char *source);
void recognize_speech(FullMessageId full_message_id, Promise<Unit> &&promise);
void rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise<Unit> &&promise);
tl_object_ptr<telegram_api::InputMedia> get_input_media(FileId file_id,
tl_object_ptr<telegram_api::InputFile> input_file,
tl_object_ptr<telegram_api::InputFile> input_thumbnail) const;
@ -75,14 +80,24 @@ class VideoNotesManager final : public Actor {
Dimensions dimensions;
string minithumbnail;
PhotoSize thumbnail;
unique_ptr<TranscriptionInfo> transcription_info;
FileId file_id;
};
VideoNote *get_video_note(FileId file_id);
const VideoNote *get_video_note(FileId file_id) const;
FileId on_get_video_note(unique_ptr<VideoNote> new_video_note, bool replace);
void on_video_note_transcription_updated(FileId file_id);
void on_video_note_transcription_completed(FileId file_id);
void on_transcribed_audio_update(FileId file_id, bool is_initial,
Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update);
void tear_down() final;
Td *td_;

View File

@ -24,10 +24,12 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const
bool has_duration = video_note->duration != 0;
bool has_minithumbnail = !video_note->minithumbnail.empty();
bool has_thumbnail = video_note->thumbnail.file_id.is_valid();
bool is_transcribed = video_note->transcription_info != nullptr && video_note->transcription_info->is_transcribed();
BEGIN_STORE_FLAGS();
STORE_FLAG(has_duration);
STORE_FLAG(has_minithumbnail);
STORE_FLAG(has_thumbnail);
STORE_FLAG(is_transcribed);
END_STORE_FLAGS();
if (has_duration) {
store(video_note->duration, storer);
@ -39,6 +41,9 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const
if (has_thumbnail) {
store(video_note->thumbnail, storer);
}
if (is_transcribed) {
store(video_note->transcription_info, storer);
}
store(file_id, storer);
}
@ -48,16 +53,19 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) {
bool has_duration;
bool has_minithumbnail;
bool has_thumbnail;
bool is_transcribed;
if (parser.version() >= static_cast<int32>(Version::AddVideoNoteFlags)) {
BEGIN_PARSE_FLAGS();
PARSE_FLAG(has_duration);
PARSE_FLAG(has_minithumbnail);
PARSE_FLAG(has_thumbnail);
PARSE_FLAG(is_transcribed);
END_PARSE_FLAGS();
} else {
has_duration = true;
has_minithumbnail = parser.version() >= static_cast<int32>(Version::SupportMinithumbnails);
has_thumbnail = true;
is_transcribed = false;
}
if (has_duration) {
parse(video_note->duration, parser);
@ -69,6 +77,9 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) {
if (has_thumbnail) {
parse(video_note->thumbnail, parser);
}
if (is_transcribed) {
parse(video_note->transcription_info, parser);
}
parse(video_note->file_id, parser);
if (parser.get_error() != nullptr || !video_note->file_id.is_valid()) {
return FileId();