Support speech recognition for video notes.

2022-10-20 21:31:00 +03:00 · 2022-10-20 21:31:00 +03:00 · 14b80ecd6f
commit 14b80ecd6f
parent c23dceddf3
5 changed files with 144 additions and 6 deletions
--- a/td/generate/scheme/td_api.tl
+++ b/td/generate/scheme/td_api.tl
@ -321,8 +321,8 @@ video duration:int32 width:int32 height:int32 file_name:string mime_type:string

 //@description Describes a video note. The video must be equal in width and height, cropped to a circle, and stored in MPEG4 format @duration Duration of the video, in seconds; as defined by the sender
 //@length Video width and height; as defined by the sender @minithumbnail Video minithumbnail; may be null
-//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @video File containing the video
-videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail video:file = VideoNote;
+//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @speech_recognition_result Result of speech recognition in the video note; may be null @video File containing the video
+videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail speech_recognition_result:SpeechRecognitionResult video:file = VideoNote;

 //@description Describes a voice note. The voice note must be encoded with the Opus codec, and stored inside an OGG container. Voice notes can have only a single audio channel
 //@duration Duration of the voice note, in seconds; as defined by the sender @waveform A waveform representation of the voice note in 5-bit format
@ -5108,12 +5108,12 @@ getMessageLinkInfo url:string = MessageLinkInfo;
 //@to_language_code A two-letter ISO 639-1 language code of the language to which the message is translated
 translateText text:string from_language_code:string to_language_code:string = Text;

-//@description Recognizes speech in a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the voice note is too long to be recognized
+//@description Recognizes speech in a video note or a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the media is too long to be recognized
 //@chat_id Identifier of the chat to which the message belongs
 //@message_id Identifier of the message
 recognizeSpeech chat_id:int53 message_id:int53 = Ok;

-//@description Rates recognized speech in a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good
+//@description Rates recognized speech in a video note or a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good
 rateSpeechRecognition chat_id:int53 message_id:int53 is_good:Bool = Ok;


--- a/td/telegram/MessageContent.cpp
+++ b/td/telegram/MessageContent.cpp
@ -6188,6 +6188,8 @@ void update_used_hashtags(Td *td, const MessageContent *content) {
 void recognize_message_content_speech(Td *td, const MessageContent *content, FullMessageId full_message_id,
                                      Promise<Unit> &&promise) {
  switch (content->get_type()) {
+    case MessageContentType::VideoNote:
+      return td->video_notes_manager_->recognize_speech(full_message_id, std::move(promise));
    case MessageContentType::VoiceNote:
      return td->voice_notes_manager_->recognize_speech(full_message_id, std::move(promise));
    default:
@ -6198,6 +6200,8 @@ void recognize_message_content_speech(Td *td, const MessageContent *content, Ful
 void rate_message_content_speech_recognition(Td *td, const MessageContent *content, FullMessageId full_message_id,
                                             bool is_good, Promise<Unit> &&promise) {
  switch (content->get_type()) {
+    case MessageContentType::VideoNote:
+      return td->video_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise));
    case MessageContentType::VoiceNote:
      return td->voice_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise));
    default:
--- a/td/telegram/VideoNotesManager.cpp
+++ b/td/telegram/VideoNotesManager.cpp
@ -9,12 +9,14 @@
 #include "td/telegram/AuthManager.h"
 #include "td/telegram/files/FileManager.h"
 #include "td/telegram/Global.h"
+#include "td/telegram/MessagesManager.h"
 #include "td/telegram/OptionManager.h"
 #include "td/telegram/PhotoFormat.h"
 #include "td/telegram/secret_api.h"
 #include "td/telegram/Td.h"
 #include "td/telegram/td_api.h"
 #include "td/telegram/telegram_api.h"
+#include "td/telegram/UpdatesManager.h"

 #include "td/actor/actor.h"

@ -46,10 +48,13 @@ tl_object_ptr<td_api::videoNote> VideoNotesManager::get_video_note_object(FileId
  }

  auto video_note = get_video_note(file_id);
+  auto speech_recognition_result = video_note->transcription_info == nullptr
+                                       ? nullptr
+                                       : video_note->transcription_info->get_speech_recognition_result_object();
  return make_tl_object<td_api::videoNote>(
      video_note->duration, video_note->dimensions.width, get_minithumbnail_object(video_note->minithumbnail),
      get_thumbnail_object(td_->file_manager_.get(), video_note->thumbnail, PhotoFormat::Jpeg),
-      td_->file_manager_->get_file_object(file_id));
+      std::move(speech_recognition_result), td_->file_manager_->get_file_object(file_id));
 }

 FileId VideoNotesManager::on_get_video_note(unique_ptr<VideoNote> new_video_note, bool replace) {
@ -78,10 +83,17 @@ FileId VideoNotesManager::on_get_video_note(unique_ptr<VideoNote> new_video_note
      }
      v->thumbnail = std::move(new_video_note->thumbnail);
    }
+    if (TranscriptionInfo::update_from(v->transcription_info, std::move(new_video_note->transcription_info))) {
+      on_video_note_transcription_completed(file_id);
+    }
  }
  return file_id;
 }

+VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) {  // mutable lookup by file_id
+  return video_notes_.get_pointer(file_id);  // callers in this patch CHECK the result against nullptr
+}
+
 const VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) const {
  return video_notes_.get_pointer(file_id);
 }
@ -103,9 +115,14 @@ FileId VideoNotesManager::dup_video_note(FileId new_id, FileId old_id) {
  CHECK(old_video_note != nullptr);
  auto &new_video_note = video_notes_[new_id];
  CHECK(new_video_note == nullptr);
-  new_video_note = make_unique<VideoNote>(*old_video_note);
+  new_video_note = make_unique<VideoNote>();
  new_video_note->file_id = new_id;
+  new_video_note->duration = old_video_note->duration;
+  new_video_note->dimensions = old_video_note->dimensions;
+  new_video_note->minithumbnail = old_video_note->minithumbnail;
+  new_video_note->thumbnail = old_video_note->thumbnail;
  new_video_note->thumbnail.file_id = td_->file_manager_->dup_file_id(new_video_note->thumbnail.file_id);
+  new_video_note->transcription_info = TranscriptionInfo::copy_if_transcribed(old_video_note->transcription_info);
  return new_id;
 }

@ -173,6 +190,97 @@ void VideoNotesManager::unregister_video_note(FileId video_note_file_id, FullMes
  CHECK(is_deleted);
 }

+void VideoNotesManager::recognize_speech(FullMessageId full_message_id, Promise<Unit> &&promise) {  // starts recognition
+  auto it = message_video_notes_.find(full_message_id);  // maps the message to its video note file
+  CHECK(it != message_video_notes_.end());  // the message is required to have an entry in message_video_notes_
+
+  auto file_id = it->second;
+  auto video_note = get_video_note(file_id);
+  CHECK(video_note != nullptr);
+  if (video_note->transcription_info == nullptr) {  // create the transcription state on first use
+    video_note->transcription_info = make_unique<TranscriptionInfo>();
+  }
+
+  auto handler = [actor_id = actor_id(this),  // forwards the result to on_transcribed_audio_update
+                  file_id](Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
+    send_closure(actor_id, &VideoNotesManager::on_transcribed_audio_update, file_id, true, std::move(r_update));
+  };  // the literal true above is the is_initial argument
+  if (video_note->transcription_info->recognize_speech(td_, full_message_id, std::move(promise), std::move(handler))) {
+    on_video_note_transcription_updated(file_id);  // true was returned: notify the messages using this note
+  }
+}
+
+void VideoNotesManager::on_transcribed_audio_update(  // handles one transcription result or error for a video note
+    FileId file_id, bool is_initial, Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
+  if (G()->close_flag()) {  // the result is ignored once the close flag is set
+    return;
+  }
+
+  auto video_note = get_video_note(file_id);
+  CHECK(video_note != nullptr);
+  CHECK(video_note->transcription_info != nullptr);  // recognize_speech creates it before installing the handler
+
+  if (r_update.is_error()) {  // failure: record it, notify messages, then fail the returned promises
+    auto promises = video_note->transcription_info->on_failed_transcription(r_update.error().clone());
+    on_video_note_transcription_updated(file_id);
+    fail_promises(promises, r_update.move_as_error());  // the error was cloned above, so it can be moved out here
+    return;
+  }
+  auto update = r_update.move_as_ok();
+  auto transcription_id = update->transcription_id_;  // copied out before update->text_ is moved below
+  if (!update->pending_) {  // not pending: treated as the final transcription
+    auto promises = video_note->transcription_info->on_final_transcription(std::move(update->text_), transcription_id);
+    on_video_note_transcription_completed(file_id);
+    set_promises(promises);
+  } else {  // pending: treated as a partial transcription
+    auto is_changed =
+        video_note->transcription_info->on_partial_transcription(std::move(update->text_), transcription_id);
+    if (is_changed) {  // notify messages only when on_partial_transcription reports a change
+      on_video_note_transcription_updated(file_id);
+    }
+
+    if (is_initial) {  // subscribe only from the first response; the subscription callback passes is_initial = false
+      td_->updates_manager_->subscribe_to_transcribed_audio_updates(
+          transcription_id, [actor_id = actor_id(this),
+                             file_id](Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
+            send_closure(actor_id, &VideoNotesManager::on_transcribed_audio_update, file_id, false,
+                         std::move(r_update));
+          });
+    }
+  }
+}
+
+void VideoNotesManager::on_video_note_transcription_updated(FileId file_id) {  // notifies messages using this note
+  auto it = video_note_messages_.find(file_id);  // maps the file to the messages registered for it
+  if (it != video_note_messages_.end()) {  // no entry for the file: nothing to notify
+    for (const auto &full_message_id : it->second) {
+      td_->messages_manager_->on_external_update_message_content(full_message_id);  // "external" variant is used here
+    }
+  }
+}
+
+void VideoNotesManager::on_video_note_transcription_completed(FileId file_id) {  // notifies messages using this note
+  auto it = video_note_messages_.find(file_id);  // maps the file to the messages registered for it
+  if (it != video_note_messages_.end()) {  // no entry for the file: nothing to notify
+    for (const auto &full_message_id : it->second) {
+      td_->messages_manager_->on_update_message_content(full_message_id);  // differs from the "_updated" sibling's call
+    }
+  }
+}
+
+void VideoNotesManager::rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise<Unit> &&promise) {
+  auto it = message_video_notes_.find(full_message_id);  // maps the message to its video note file
+  CHECK(it != message_video_notes_.end());  // the message is required to have an entry in message_video_notes_
+
+  auto file_id = it->second;
+  auto video_note = get_video_note(file_id);
+  CHECK(video_note != nullptr);
+  if (video_note->transcription_info == nullptr) {  // no transcription state for this note: nothing to rate
+    return promise.set_value(Unit());  // the promise is completed successfully without any rating call
+  }
+  video_note->transcription_info->rate_speech_recognition(td_, full_message_id, is_good, std::move(promise));
+}
+
 SecretInputMedia VideoNotesManager::get_secret_input_media(FileId video_note_file_id,
                                                           tl_object_ptr<telegram_api::InputEncryptedFile> input_file,
                                                           BufferSlice thumbnail, int32 layer) const {
--- a/td/telegram/VideoNotesManager.h
+++ b/td/telegram/VideoNotesManager.h
@ -13,6 +13,7 @@
 #include "td/telegram/SecretInputMedia.h"
 #include "td/telegram/td_api.h"
 #include "td/telegram/telegram_api.h"
+#include "td/telegram/TranscriptionInfo.h"

 #include "td/actor/actor.h"

@ -46,6 +47,10 @@ class VideoNotesManager final : public Actor {

  void unregister_video_note(FileId video_note_file_id, FullMessageId full_message_id, const char *source);

+  void recognize_speech(FullMessageId full_message_id, Promise<Unit> &&promise);
+
+  void rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise<Unit> &&promise);
+
  tl_object_ptr<telegram_api::InputMedia> get_input_media(FileId file_id,
                                                          tl_object_ptr<telegram_api::InputFile> input_file,
                                                          tl_object_ptr<telegram_api::InputFile> input_thumbnail) const;
@ -75,14 +80,24 @@ class VideoNotesManager final : public Actor {
    Dimensions dimensions;
    string minithumbnail;
    PhotoSize thumbnail;
+    unique_ptr<TranscriptionInfo> transcription_info;

    FileId file_id;
  };

+  VideoNote *get_video_note(FileId file_id);
+
  const VideoNote *get_video_note(FileId file_id) const;

  FileId on_get_video_note(unique_ptr<VideoNote> new_video_note, bool replace);

+  void on_video_note_transcription_updated(FileId file_id);
+
+  void on_video_note_transcription_completed(FileId file_id);
+
+  void on_transcribed_audio_update(FileId file_id, bool is_initial,
+                                   Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update);
+
  void tear_down() final;

  Td *td_;
--- a/td/telegram/VideoNotesManager.hpp
+++ b/td/telegram/VideoNotesManager.hpp
@ -24,10 +24,12 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const
  bool has_duration = video_note->duration != 0;
  bool has_minithumbnail = !video_note->minithumbnail.empty();
  bool has_thumbnail = video_note->thumbnail.file_id.is_valid();
+  bool is_transcribed = video_note->transcription_info != nullptr && video_note->transcription_info->is_transcribed();
  BEGIN_STORE_FLAGS();
  STORE_FLAG(has_duration);
  STORE_FLAG(has_minithumbnail);
  STORE_FLAG(has_thumbnail);
+  STORE_FLAG(is_transcribed);
  END_STORE_FLAGS();
  if (has_duration) {
    store(video_note->duration, storer);
@ -39,6 +41,9 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const
  if (has_thumbnail) {
    store(video_note->thumbnail, storer);
  }
+  if (is_transcribed) {
+    store(video_note->transcription_info, storer);
+  }
  store(file_id, storer);
 }

@ -48,16 +53,19 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) {
  bool has_duration;
  bool has_minithumbnail;
  bool has_thumbnail;
+  bool is_transcribed;
  if (parser.version() >= static_cast<int32>(Version::AddVideoNoteFlags)) {
    BEGIN_PARSE_FLAGS();
    PARSE_FLAG(has_duration);
    PARSE_FLAG(has_minithumbnail);
    PARSE_FLAG(has_thumbnail);
+    PARSE_FLAG(is_transcribed);
    END_PARSE_FLAGS();
  } else {
    has_duration = true;
    has_minithumbnail = parser.version() >= static_cast<int32>(Version::SupportMinithumbnails);
    has_thumbnail = true;
+    is_transcribed = false;
  }
  if (has_duration) {
    parse(video_note->duration, parser);
@ -69,6 +77,9 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) {
  if (has_thumbnail) {
    parse(video_note->thumbnail, parser);
  }
+  if (is_transcribed) {
+    parse(video_note->transcription_info, parser);
+  }
  parse(video_note->file_id, parser);
  if (parser.get_error() != nullptr || !video_note->file_id.is_valid()) {
    return FileId();