diff --git a/td/generate/scheme/td_api.tl b/td/generate/scheme/td_api.tl index 764c7dd3a..446e24f61 100644 --- a/td/generate/scheme/td_api.tl +++ b/td/generate/scheme/td_api.tl @@ -321,8 +321,8 @@ video duration:int32 width:int32 height:int32 file_name:string mime_type:string //@description Describes a video note. The video must be equal in width and height, cropped to a circle, and stored in MPEG4 format @duration Duration of the video, in seconds; as defined by the sender //@length Video width and height; as defined by the sender @minithumbnail Video minithumbnail; may be null -//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @video File containing the video -videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail video:file = VideoNote; +//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @speech_recognition_result Result of speech recognition in the video note; may be null @video File containing the video +videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail speech_recognition_result:SpeechRecognitionResult video:file = VideoNote; //@description Describes a voice note. The voice note must be encoded with the Opus codec, and stored inside an OGG container. Voice notes can have only a single audio channel //@duration Duration of the voice note, in seconds; as defined by the sender @waveform A waveform representation of the voice note in 5-bit format @@ -5108,12 +5108,12 @@ getMessageLinkInfo url:string = MessageLinkInfo; //@to_language_code A two-letter ISO 639-1 language code of the language to which the message is translated translateText text:string from_language_code:string to_language_code:string = Text; -//@description Recognizes speech in a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the voice note is too long to be recognized +//@description Recognizes speech in a video note or a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if media duration is too big to be recognized //@chat_id Identifier of the chat to which the message belongs //@message_id Identifier of the message recognizeSpeech chat_id:int53 message_id:int53 = Ok; -//@description Rates recognized speech in a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good +//@description Rates recognized speech in a video note or a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good rateSpeechRecognition chat_id:int53 message_id:int53 is_good:Bool = Ok; diff --git a/td/telegram/MessageContent.cpp b/td/telegram/MessageContent.cpp index ba0b0aebb..1649a0709 100644 --- a/td/telegram/MessageContent.cpp +++ b/td/telegram/MessageContent.cpp @@ -6188,6 +6188,8 @@ void update_used_hashtags(Td *td, const MessageContent *content) { void recognize_message_content_speech(Td *td, const MessageContent *content, FullMessageId full_message_id, Promise &&promise) { switch (content->get_type()) { + case MessageContentType::VideoNote: + return td->video_notes_manager_->recognize_speech(full_message_id, std::move(promise)); case MessageContentType::VoiceNote: return td->voice_notes_manager_->recognize_speech(full_message_id, std::move(promise)); default: @@ -6198,6 +6200,8 @@ void recognize_message_content_speech(Td *td, const MessageContent *content, Ful void rate_message_content_speech_recognition(Td *td, const MessageContent *content, FullMessageId full_message_id, bool is_good, Promise &&promise) { switch (content->get_type()) { + case MessageContentType::VideoNote: + return td->video_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise)); case MessageContentType::VoiceNote: return td->voice_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise)); default: diff --git a/td/telegram/VideoNotesManager.cpp b/td/telegram/VideoNotesManager.cpp index f08df77b8..cc176c3ee 100644 --- a/td/telegram/VideoNotesManager.cpp +++ b/td/telegram/VideoNotesManager.cpp @@ -9,12 +9,14 @@ #include "td/telegram/AuthManager.h" #include "td/telegram/files/FileManager.h" #include "td/telegram/Global.h" +#include "td/telegram/MessagesManager.h" #include "td/telegram/OptionManager.h" #include "td/telegram/PhotoFormat.h" #include "td/telegram/secret_api.h" #include "td/telegram/Td.h" #include "td/telegram/td_api.h" #include "td/telegram/telegram_api.h" +#include "td/telegram/UpdatesManager.h" #include "td/actor/actor.h" @@ -46,10 +48,13 @@ tl_object_ptr VideoNotesManager::get_video_note_object(FileId } auto video_note = get_video_note(file_id); + auto speech_recognition_result = video_note->transcription_info == nullptr + ? nullptr + : video_note->transcription_info->get_speech_recognition_result_object(); return make_tl_object( video_note->duration, video_note->dimensions.width, get_minithumbnail_object(video_note->minithumbnail), get_thumbnail_object(td_->file_manager_.get(), video_note->thumbnail, PhotoFormat::Jpeg), - td_->file_manager_->get_file_object(file_id)); + std::move(speech_recognition_result), td_->file_manager_->get_file_object(file_id)); } FileId VideoNotesManager::on_get_video_note(unique_ptr new_video_note, bool replace) { @@ -78,10 +83,17 @@ FileId VideoNotesManager::on_get_video_note(unique_ptr new_video_note } v->thumbnail = std::move(new_video_note->thumbnail); } + if (TranscriptionInfo::update_from(v->transcription_info, std::move(new_video_note->transcription_info))) { + on_video_note_transcription_completed(file_id); + } } return file_id; } +VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) { + return video_notes_.get_pointer(file_id); +} + const VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) const { return video_notes_.get_pointer(file_id); } @@ -103,9 +115,14 @@ FileId VideoNotesManager::dup_video_note(FileId new_id, FileId old_id) { CHECK(old_video_note != nullptr); auto &new_video_note = video_notes_[new_id]; CHECK(new_video_note == nullptr); - new_video_note = make_unique(*old_video_note); + new_video_note = make_unique(); new_video_note->file_id = new_id; + new_video_note->duration = old_video_note->duration; + new_video_note->dimensions = old_video_note->dimensions; + new_video_note->minithumbnail = old_video_note->minithumbnail; + new_video_note->thumbnail = old_video_note->thumbnail; new_video_note->thumbnail.file_id = td_->file_manager_->dup_file_id(new_video_note->thumbnail.file_id); + new_video_note->transcription_info = TranscriptionInfo::copy_if_transcribed(old_video_note->transcription_info); return new_id; } @@ -173,6 +190,97 @@ void VideoNotesManager::unregister_video_note(FileId video_note_file_id, FullMes CHECK(is_deleted); } +void VideoNotesManager::recognize_speech(FullMessageId full_message_id, Promise &&promise) { + auto it = message_video_notes_.find(full_message_id); + CHECK(it != message_video_notes_.end()); + + auto file_id = it->second; + auto video_note = get_video_note(file_id); + CHECK(video_note != nullptr); + if (video_note->transcription_info == nullptr) { + video_note->transcription_info = make_unique(); + } + + auto handler = [actor_id = actor_id(this), + file_id](Result> r_update) { + send_closure(actor_id, &VideoNotesManager::on_transcribed_audio_update, file_id, true, std::move(r_update)); + }; + if (video_note->transcription_info->recognize_speech(td_, full_message_id, std::move(promise), std::move(handler))) { + on_video_note_transcription_updated(file_id); + } +} + +void VideoNotesManager::on_transcribed_audio_update( + FileId file_id, bool is_initial, Result> r_update) { + if (G()->close_flag()) { + return; + } + + auto video_note = get_video_note(file_id); + CHECK(video_note != nullptr); + CHECK(video_note->transcription_info != nullptr); + + if (r_update.is_error()) { + auto promises = video_note->transcription_info->on_failed_transcription(r_update.error().clone()); + on_video_note_transcription_updated(file_id); + fail_promises(promises, r_update.move_as_error()); + return; + } + auto update = r_update.move_as_ok(); + auto transcription_id = update->transcription_id_; + if (!update->pending_) { + auto promises = video_note->transcription_info->on_final_transcription(std::move(update->text_), transcription_id); + on_video_note_transcription_completed(file_id); + set_promises(promises); + } else { + auto is_changed = + video_note->transcription_info->on_partial_transcription(std::move(update->text_), transcription_id); + if (is_changed) { + on_video_note_transcription_updated(file_id); + } + + if (is_initial) { + td_->updates_manager_->subscribe_to_transcribed_audio_updates( + transcription_id, [actor_id = actor_id(this), + file_id](Result> r_update) { + send_closure(actor_id, &VideoNotesManager::on_transcribed_audio_update, file_id, false, + std::move(r_update)); + }); + } + } +} + +void VideoNotesManager::on_video_note_transcription_updated(FileId file_id) { + auto it = video_note_messages_.find(file_id); + if (it != video_note_messages_.end()) { + for (const auto &full_message_id : it->second) { + td_->messages_manager_->on_external_update_message_content(full_message_id); + } + } +} + +void VideoNotesManager::on_video_note_transcription_completed(FileId file_id) { + auto it = video_note_messages_.find(file_id); + if (it != video_note_messages_.end()) { + for (const auto &full_message_id : it->second) { + td_->messages_manager_->on_update_message_content(full_message_id); + } + } +} + +void VideoNotesManager::rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise &&promise) { + auto it = message_video_notes_.find(full_message_id); + CHECK(it != message_video_notes_.end()); + + auto file_id = it->second; + auto video_note = get_video_note(file_id); + CHECK(video_note != nullptr); + if (video_note->transcription_info == nullptr) { + return promise.set_value(Unit()); + } + video_note->transcription_info->rate_speech_recognition(td_, full_message_id, is_good, std::move(promise)); +} + SecretInputMedia VideoNotesManager::get_secret_input_media(FileId video_note_file_id, tl_object_ptr input_file, BufferSlice thumbnail, int32 layer) const { diff --git a/td/telegram/VideoNotesManager.h b/td/telegram/VideoNotesManager.h index a3a56ce10..f97dbf1b7 100644 --- a/td/telegram/VideoNotesManager.h +++ b/td/telegram/VideoNotesManager.h @@ -13,6 +13,7 @@ #include "td/telegram/SecretInputMedia.h" #include "td/telegram/td_api.h" #include "td/telegram/telegram_api.h" +#include "td/telegram/TranscriptionInfo.h" #include "td/actor/actor.h" @@ -46,6 +47,10 @@ class VideoNotesManager final : public Actor { void unregister_video_note(FileId video_note_file_id, FullMessageId full_message_id, const char *source); + void recognize_speech(FullMessageId full_message_id, Promise &&promise); + + void rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise &&promise); + tl_object_ptr get_input_media(FileId file_id, tl_object_ptr input_file, tl_object_ptr input_thumbnail) const; @@ -75,14 +80,24 @@ class VideoNotesManager final : public Actor { Dimensions dimensions; string minithumbnail; PhotoSize thumbnail; + unique_ptr transcription_info; FileId file_id; }; + VideoNote *get_video_note(FileId file_id); + const VideoNote *get_video_note(FileId file_id) const; FileId on_get_video_note(unique_ptr new_video_note, bool replace); + void on_video_note_transcription_updated(FileId file_id); + + void on_video_note_transcription_completed(FileId file_id); + + void on_transcribed_audio_update(FileId file_id, bool is_initial, + Result> r_update); + void tear_down() final; Td *td_; diff --git a/td/telegram/VideoNotesManager.hpp b/td/telegram/VideoNotesManager.hpp index 819451037..cdfa23140 100644 --- a/td/telegram/VideoNotesManager.hpp +++ b/td/telegram/VideoNotesManager.hpp @@ -24,10 +24,12 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const bool has_duration = video_note->duration != 0; bool has_minithumbnail = !video_note->minithumbnail.empty(); bool has_thumbnail = video_note->thumbnail.file_id.is_valid(); + bool is_transcribed = video_note->transcription_info != nullptr && video_note->transcription_info->is_transcribed(); BEGIN_STORE_FLAGS(); STORE_FLAG(has_duration); STORE_FLAG(has_minithumbnail); STORE_FLAG(has_thumbnail); + STORE_FLAG(is_transcribed); END_STORE_FLAGS(); if (has_duration) { store(video_note->duration, storer); @@ -39,6 +41,9 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const if (has_thumbnail) { store(video_note->thumbnail, storer); } + if (is_transcribed) { + store(video_note->transcription_info, storer); + } store(file_id, storer); } @@ -48,16 +53,19 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) { bool has_duration; bool has_minithumbnail; bool has_thumbnail; + bool is_transcribed; if (parser.version() >= static_cast(Version::AddVideoNoteFlags)) { BEGIN_PARSE_FLAGS(); PARSE_FLAG(has_duration); PARSE_FLAG(has_minithumbnail); PARSE_FLAG(has_thumbnail); + PARSE_FLAG(is_transcribed); END_PARSE_FLAGS(); } else { has_duration = true; has_minithumbnail = parser.version() >= static_cast(Version::SupportMinithumbnails); has_thumbnail = true; + is_transcribed = false; } if (has_duration) { parse(video_note->duration, parser); @@ -69,6 +77,9 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) { if (has_thumbnail) { parse(video_note->thumbnail, parser); } + if (is_transcribed) { + parse(video_note->transcription_info, parser); + } parse(video_note->file_id, parser); if (parser.get_error() != nullptr || !video_note->file_id.is_valid()) { return FileId();