Support speech recognition for video notes.
This commit is contained in:
parent
c23dceddf3
commit
14b80ecd6f
@ -321,8 +321,8 @@ video duration:int32 width:int32 height:int32 file_name:string mime_type:string
|
||||
|
||||
//@description Describes a video note. The video must be equal in width and height, cropped to a circle, and stored in MPEG4 format @duration Duration of the video, in seconds; as defined by the sender
|
||||
//@length Video width and height; as defined by the sender @minithumbnail Video minithumbnail; may be null
|
||||
//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @video File containing the video
|
||||
videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail video:file = VideoNote;
|
||||
//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @speech_recognition_result Result of speech recognition in the video note; may be null @video File containing the video
|
||||
videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail speech_recognition_result:SpeechRecognitionResult video:file = VideoNote;
|
||||
|
||||
//@description Describes a voice note. The voice note must be encoded with the Opus codec, and stored inside an OGG container. Voice notes can have only a single audio channel
|
||||
//@duration Duration of the voice note, in seconds; as defined by the sender @waveform A waveform representation of the voice note in 5-bit format
|
||||
@ -5108,12 +5108,12 @@ getMessageLinkInfo url:string = MessageLinkInfo;
|
||||
//@to_language_code A two-letter ISO 639-1 language code of the language to which the message is translated
|
||||
translateText text:string from_language_code:string to_language_code:string = Text;
|
||||
|
||||
//@description Recognizes speech in a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the voice note is too long to be recognized
|
||||
//@description Recognizes speech in a video note or a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the media is too long to be recognized
|
||||
//@chat_id Identifier of the chat to which the message belongs
|
||||
//@message_id Identifier of the message
|
||||
recognizeSpeech chat_id:int53 message_id:int53 = Ok;
|
||||
|
||||
//@description Rates recognized speech in a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good
|
||||
//@description Rates recognized speech in a video note or a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good
|
||||
rateSpeechRecognition chat_id:int53 message_id:int53 is_good:Bool = Ok;
|
||||
|
||||
|
||||
|
@ -6188,6 +6188,8 @@ void update_used_hashtags(Td *td, const MessageContent *content) {
|
||||
void recognize_message_content_speech(Td *td, const MessageContent *content, FullMessageId full_message_id,
|
||||
Promise<Unit> &&promise) {
|
||||
switch (content->get_type()) {
|
||||
case MessageContentType::VideoNote:
|
||||
return td->video_notes_manager_->recognize_speech(full_message_id, std::move(promise));
|
||||
case MessageContentType::VoiceNote:
|
||||
return td->voice_notes_manager_->recognize_speech(full_message_id, std::move(promise));
|
||||
default:
|
||||
@ -6198,6 +6200,8 @@ void recognize_message_content_speech(Td *td, const MessageContent *content, Ful
|
||||
void rate_message_content_speech_recognition(Td *td, const MessageContent *content, FullMessageId full_message_id,
|
||||
bool is_good, Promise<Unit> &&promise) {
|
||||
switch (content->get_type()) {
|
||||
case MessageContentType::VideoNote:
|
||||
return td->video_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise));
|
||||
case MessageContentType::VoiceNote:
|
||||
return td->voice_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise));
|
||||
default:
|
||||
|
@ -9,12 +9,14 @@
|
||||
#include "td/telegram/AuthManager.h"
|
||||
#include "td/telegram/files/FileManager.h"
|
||||
#include "td/telegram/Global.h"
|
||||
#include "td/telegram/MessagesManager.h"
|
||||
#include "td/telegram/OptionManager.h"
|
||||
#include "td/telegram/PhotoFormat.h"
|
||||
#include "td/telegram/secret_api.h"
|
||||
#include "td/telegram/Td.h"
|
||||
#include "td/telegram/td_api.h"
|
||||
#include "td/telegram/telegram_api.h"
|
||||
#include "td/telegram/UpdatesManager.h"
|
||||
|
||||
#include "td/actor/actor.h"
|
||||
|
||||
@ -46,10 +48,13 @@ tl_object_ptr<td_api::videoNote> VideoNotesManager::get_video_note_object(FileId
|
||||
}
|
||||
|
||||
auto video_note = get_video_note(file_id);
|
||||
auto speech_recognition_result = video_note->transcription_info == nullptr
|
||||
? nullptr
|
||||
: video_note->transcription_info->get_speech_recognition_result_object();
|
||||
return make_tl_object<td_api::videoNote>(
|
||||
video_note->duration, video_note->dimensions.width, get_minithumbnail_object(video_note->minithumbnail),
|
||||
get_thumbnail_object(td_->file_manager_.get(), video_note->thumbnail, PhotoFormat::Jpeg),
|
||||
td_->file_manager_->get_file_object(file_id));
|
||||
std::move(speech_recognition_result), td_->file_manager_->get_file_object(file_id));
|
||||
}
|
||||
|
||||
FileId VideoNotesManager::on_get_video_note(unique_ptr<VideoNote> new_video_note, bool replace) {
|
||||
@ -78,10 +83,17 @@ FileId VideoNotesManager::on_get_video_note(unique_ptr<VideoNote> new_video_note
|
||||
}
|
||||
v->thumbnail = std::move(new_video_note->thumbnail);
|
||||
}
|
||||
if (TranscriptionInfo::update_from(v->transcription_info, std::move(new_video_note->transcription_info))) {
|
||||
on_video_note_transcription_completed(file_id);
|
||||
}
|
||||
}
|
||||
return file_id;
|
||||
}
|
||||
|
||||
// Returns a mutable pointer to the video note stored in video_notes_ under file_id,
// exactly as reported by video_notes_.get_pointer().
VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) {
  VideoNote *stored_video_note = video_notes_.get_pointer(file_id);
  return stored_video_note;
}
|
||||
|
||||
// Read-only counterpart of get_video_note(): returns a const pointer to the video note
// stored in video_notes_ under file_id, exactly as reported by video_notes_.get_pointer().
const VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) const {
  const VideoNote *stored_video_note = video_notes_.get_pointer(file_id);
  return stored_video_note;
}
|
||||
@ -103,9 +115,14 @@ FileId VideoNotesManager::dup_video_note(FileId new_id, FileId old_id) {
|
||||
CHECK(old_video_note != nullptr);
|
||||
auto &new_video_note = video_notes_[new_id];
|
||||
CHECK(new_video_note == nullptr);
|
||||
new_video_note = make_unique<VideoNote>(*old_video_note);
|
||||
new_video_note = make_unique<VideoNote>();
|
||||
new_video_note->file_id = new_id;
|
||||
new_video_note->duration = old_video_note->duration;
|
||||
new_video_note->dimensions = old_video_note->dimensions;
|
||||
new_video_note->minithumbnail = old_video_note->minithumbnail;
|
||||
new_video_note->thumbnail = old_video_note->thumbnail;
|
||||
new_video_note->thumbnail.file_id = td_->file_manager_->dup_file_id(new_video_note->thumbnail.file_id);
|
||||
new_video_note->transcription_info = TranscriptionInfo::copy_if_transcribed(old_video_note->transcription_info);
|
||||
return new_id;
|
||||
}
|
||||
|
||||
@ -173,6 +190,97 @@ void VideoNotesManager::unregister_video_note(FileId video_note_file_id, FullMes
|
||||
CHECK(is_deleted);
|
||||
}
|
||||
|
||||
// Starts speech recognition for the video note attached to the given message.
//
// The message must already be present in message_video_notes_ and its file must have a
// VideoNote entry; both conditions are enforced with CHECK. A TranscriptionInfo object is
// created on first use. The request itself is delegated to TranscriptionInfo::recognize_speech,
// which receives the caller's promise together with a callback that forwards the resulting
// updateTranscribedAudio (or error) to on_transcribed_audio_update with is_initial == true.
// When the delegate returns true, all messages containing the video note are notified through
// on_video_note_transcription_updated.
void VideoNotesManager::recognize_speech(FullMessageId full_message_id, Promise<Unit> &&promise) {
  auto message_it = message_video_notes_.find(full_message_id);
  CHECK(message_it != message_video_notes_.end());

  auto file_id = message_it->second;
  auto *note = get_video_note(file_id);
  CHECK(note != nullptr);

  auto &transcription_info = note->transcription_info;
  if (transcription_info == nullptr) {
    transcription_info = make_unique<TranscriptionInfo>();
  }

  // The callback is routed through the actor instead of capturing `this`.
  auto on_update = [actor_id = actor_id(this),
                    file_id](Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
    send_closure(actor_id, &VideoNotesManager::on_transcribed_audio_update, file_id, true, std::move(r_update));
  };
  if (transcription_info->recognize_speech(td_, full_message_id, std::move(promise), std::move(on_update))) {
    on_video_note_transcription_updated(file_id);
  }
}
|
||||
|
||||
// Processes the outcome of a speech recognition request for the video note file_id.
//
// file_id    - file of the video note being transcribed; it must have a VideoNote entry with
//              a non-null transcription_info (CHECK-enforced).
// is_initial - true when invoked from the callback installed by recognize_speech, false when
//              invoked from the subscription created below.
// r_update   - the received updateTranscribedAudio, or an error.
//
// Nothing is done when G()->close_flag() is set. Otherwise one of three paths is taken:
//  * error: on_failed_transcription is called, messages are notified via
//    on_video_note_transcription_updated, and the returned promises are failed with the error;
//  * update with pending_ == false: on_final_transcription is called, messages are notified via
//    on_video_note_transcription_completed, and the returned promises are fulfilled;
//  * update with pending_ == true: on_partial_transcription is called, messages are notified
//    via on_video_note_transcription_updated if it returned true, and, for the initial update
//    only, a subscription for further updates with the same transcription_id is registered.
void VideoNotesManager::on_transcribed_audio_update(
    FileId file_id, bool is_initial, Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
  if (G()->close_flag()) {
    return;
  }

  auto *note = get_video_note(file_id);
  CHECK(note != nullptr);
  CHECK(note->transcription_info != nullptr);
  auto &transcription_info = note->transcription_info;

  if (r_update.is_error()) {
    // The error is cloned for TranscriptionInfo, because the original is moved into the promises afterwards.
    auto failed_promises = transcription_info->on_failed_transcription(r_update.error().clone());
    on_video_note_transcription_updated(file_id);
    fail_promises(failed_promises, r_update.move_as_error());
    return;
  }

  auto update = r_update.move_as_ok();
  auto transcription_id = update->transcription_id_;

  if (!update->pending_) {
    auto completed_promises = transcription_info->on_final_transcription(std::move(update->text_), transcription_id);
    on_video_note_transcription_completed(file_id);
    set_promises(completed_promises);
    return;
  }

  if (transcription_info->on_partial_transcription(std::move(update->text_), transcription_id)) {
    on_video_note_transcription_updated(file_id);
  }

  if (!is_initial) {
    return;
  }
  // Subsequent updates for this transcription are delivered with is_initial == false.
  td_->updates_manager_->subscribe_to_transcribed_audio_updates(
      transcription_id, [actor_id = actor_id(this),
                         file_id](Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
        send_closure(actor_id, &VideoNotesManager::on_transcribed_audio_update, file_id, false,
                     std::move(r_update));
      });
}
|
||||
|
||||
// Calls MessagesManager::on_external_update_message_content for every message recorded in
// video_note_messages_ for the given video note file. Does nothing if the file has no entry.
void VideoNotesManager::on_video_note_transcription_updated(FileId file_id) {
  auto messages_it = video_note_messages_.find(file_id);
  if (messages_it == video_note_messages_.end()) {
    return;
  }
  for (const auto &full_message_id : messages_it->second) {
    td_->messages_manager_->on_external_update_message_content(full_message_id);
  }
}
|
||||
|
||||
// Calls MessagesManager::on_update_message_content for every message recorded in
// video_note_messages_ for the given video note file. Does nothing if the file has no entry.
void VideoNotesManager::on_video_note_transcription_completed(FileId file_id) {
  auto messages_it = video_note_messages_.find(file_id);
  if (messages_it == video_note_messages_.end()) {
    return;
  }
  for (const auto &full_message_id : messages_it->second) {
    td_->messages_manager_->on_update_message_content(full_message_id);
  }
}
|
||||
|
||||
// Rates the speech recognition of the video note attached to the given message.
//
// The message must already be present in message_video_notes_ and its file must have a
// VideoNote entry; both conditions are enforced with CHECK. If the video note has no
// transcription_info, the promise is fulfilled immediately; otherwise the request is
// delegated to TranscriptionInfo::rate_speech_recognition together with the promise.
void VideoNotesManager::rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise<Unit> &&promise) {
  auto message_it = message_video_notes_.find(full_message_id);
  CHECK(message_it != message_video_notes_.end());

  auto file_id = message_it->second;
  auto *note = get_video_note(file_id);
  CHECK(note != nullptr);

  const auto &transcription_info = note->transcription_info;
  if (transcription_info != nullptr) {
    transcription_info->rate_speech_recognition(td_, full_message_id, is_good, std::move(promise));
    return;
  }
  promise.set_value(Unit());
}
|
||||
|
||||
SecretInputMedia VideoNotesManager::get_secret_input_media(FileId video_note_file_id,
|
||||
tl_object_ptr<telegram_api::InputEncryptedFile> input_file,
|
||||
BufferSlice thumbnail, int32 layer) const {
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "td/telegram/SecretInputMedia.h"
|
||||
#include "td/telegram/td_api.h"
|
||||
#include "td/telegram/telegram_api.h"
|
||||
#include "td/telegram/TranscriptionInfo.h"
|
||||
|
||||
#include "td/actor/actor.h"
|
||||
|
||||
@ -46,6 +47,10 @@ class VideoNotesManager final : public Actor {
|
||||
|
||||
void unregister_video_note(FileId video_note_file_id, FullMessageId full_message_id, const char *source);
|
||||
|
||||
void recognize_speech(FullMessageId full_message_id, Promise<Unit> &&promise);
|
||||
|
||||
void rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise<Unit> &&promise);
|
||||
|
||||
tl_object_ptr<telegram_api::InputMedia> get_input_media(FileId file_id,
|
||||
tl_object_ptr<telegram_api::InputFile> input_file,
|
||||
tl_object_ptr<telegram_api::InputFile> input_thumbnail) const;
|
||||
@ -75,14 +80,24 @@ class VideoNotesManager final : public Actor {
|
||||
Dimensions dimensions;
|
||||
string minithumbnail;
|
||||
PhotoSize thumbnail;
|
||||
unique_ptr<TranscriptionInfo> transcription_info;
|
||||
|
||||
FileId file_id;
|
||||
};
|
||||
|
||||
VideoNote *get_video_note(FileId file_id);
|
||||
|
||||
const VideoNote *get_video_note(FileId file_id) const;
|
||||
|
||||
FileId on_get_video_note(unique_ptr<VideoNote> new_video_note, bool replace);
|
||||
|
||||
void on_video_note_transcription_updated(FileId file_id);
|
||||
|
||||
void on_video_note_transcription_completed(FileId file_id);
|
||||
|
||||
void on_transcribed_audio_update(FileId file_id, bool is_initial,
|
||||
Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update);
|
||||
|
||||
void tear_down() final;
|
||||
|
||||
Td *td_;
|
||||
|
@ -24,10 +24,12 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const
|
||||
bool has_duration = video_note->duration != 0;
|
||||
bool has_minithumbnail = !video_note->minithumbnail.empty();
|
||||
bool has_thumbnail = video_note->thumbnail.file_id.is_valid();
|
||||
bool is_transcribed = video_note->transcription_info != nullptr && video_note->transcription_info->is_transcribed();
|
||||
BEGIN_STORE_FLAGS();
|
||||
STORE_FLAG(has_duration);
|
||||
STORE_FLAG(has_minithumbnail);
|
||||
STORE_FLAG(has_thumbnail);
|
||||
STORE_FLAG(is_transcribed);
|
||||
END_STORE_FLAGS();
|
||||
if (has_duration) {
|
||||
store(video_note->duration, storer);
|
||||
@ -39,6 +41,9 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const
|
||||
if (has_thumbnail) {
|
||||
store(video_note->thumbnail, storer);
|
||||
}
|
||||
if (is_transcribed) {
|
||||
store(video_note->transcription_info, storer);
|
||||
}
|
||||
store(file_id, storer);
|
||||
}
|
||||
|
||||
@ -48,16 +53,19 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) {
|
||||
bool has_duration;
|
||||
bool has_minithumbnail;
|
||||
bool has_thumbnail;
|
||||
bool is_transcribed;
|
||||
if (parser.version() >= static_cast<int32>(Version::AddVideoNoteFlags)) {
|
||||
BEGIN_PARSE_FLAGS();
|
||||
PARSE_FLAG(has_duration);
|
||||
PARSE_FLAG(has_minithumbnail);
|
||||
PARSE_FLAG(has_thumbnail);
|
||||
PARSE_FLAG(is_transcribed);
|
||||
END_PARSE_FLAGS();
|
||||
} else {
|
||||
has_duration = true;
|
||||
has_minithumbnail = parser.version() >= static_cast<int32>(Version::SupportMinithumbnails);
|
||||
has_thumbnail = true;
|
||||
is_transcribed = false;
|
||||
}
|
||||
if (has_duration) {
|
||||
parse(video_note->duration, parser);
|
||||
@ -69,6 +77,9 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) {
|
||||
if (has_thumbnail) {
|
||||
parse(video_note->thumbnail, parser);
|
||||
}
|
||||
if (is_transcribed) {
|
||||
parse(video_note->transcription_info, parser);
|
||||
}
|
||||
parse(video_note->file_id, parser);
|
||||
if (parser.get_error() != nullptr || !video_note->file_id.is_valid()) {
|
||||
return FileId();
|
||||
|
Loading…
Reference in New Issue
Block a user