diff --git a/td/generate/scheme/td_api.tl b/td/generate/scheme/td_api.tl
index 764c7dd3a..446e24f61 100644
--- a/td/generate/scheme/td_api.tl
+++ b/td/generate/scheme/td_api.tl
@@ -321,8 +321,8 @@ video duration:int32 width:int32 height:int32 file_name:string mime_type:string
 
 //@description Describes a video note. The video must be equal in width and height, cropped to a circle, and stored in MPEG4 format @duration Duration of the video, in seconds; as defined by the sender
 //@length Video width and height; as defined by the sender @minithumbnail Video minithumbnail; may be null
-//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @video File containing the video
-videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail video:file = VideoNote;
+//@thumbnail Video thumbnail in JPEG format; as defined by the sender; may be null @speech_recognition_result Result of speech recognition in the video note; may be null @video File containing the video
+videoNote duration:int32 length:int32 minithumbnail:minithumbnail thumbnail:thumbnail speech_recognition_result:SpeechRecognitionResult video:file = VideoNote;
 
 //@description Describes a voice note. The voice note must be encoded with the Opus codec, and stored inside an OGG container. Voice notes can have only a single audio channel
 //@duration Duration of the voice note, in seconds; as defined by the sender @waveform A waveform representation of the voice note in 5-bit format
@@ -5108,12 +5108,12 @@ getMessageLinkInfo url:string = MessageLinkInfo;
 //@to_language_code A two-letter ISO 639-1 language code of the language to which the message is translated
 translateText text:string from_language_code:string to_language_code:string = Text;
 
-//@description Recognizes speech in a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the voice note is too long to be recognized
+//@description Recognizes speech in a video note or a voice note message. The message must be successfully sent and must not be scheduled. May return an error with a message "MSG_VOICE_TOO_LONG" if the media duration is too long to be recognized
 //@chat_id Identifier of the chat to which the message belongs
 //@message_id Identifier of the message
 recognizeSpeech chat_id:int53 message_id:int53 = Ok;
 
-//@description Rates recognized speech in a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good
+//@description Rates recognized speech in a video note or a voice note message @chat_id Identifier of the chat to which the message belongs @message_id Identifier of the message @is_good Pass true if the speech recognition is good
 rateSpeechRecognition chat_id:int53 message_id:int53 is_good:Bool = Ok;
 
 
diff --git a/td/telegram/MessageContent.cpp b/td/telegram/MessageContent.cpp
index ba0b0aebb..1649a0709 100644
--- a/td/telegram/MessageContent.cpp
+++ b/td/telegram/MessageContent.cpp
@@ -6188,6 +6188,8 @@ void update_used_hashtags(Td *td, const MessageContent *content) {
 void recognize_message_content_speech(Td *td, const MessageContent *content, FullMessageId full_message_id,
                                       Promise<Unit> &&promise) {
   switch (content->get_type()) {
+    case MessageContentType::VideoNote:
+      return td->video_notes_manager_->recognize_speech(full_message_id, std::move(promise));
     case MessageContentType::VoiceNote:
       return td->voice_notes_manager_->recognize_speech(full_message_id, std::move(promise));
     default:
@@ -6198,6 +6200,8 @@ void recognize_message_content_speech(Td *td, const MessageContent *content, Ful
 void rate_message_content_speech_recognition(Td *td, const MessageContent *content, FullMessageId full_message_id,
                                              bool is_good, Promise<Unit> &&promise) {
   switch (content->get_type()) {
+    case MessageContentType::VideoNote:
+      return td->video_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise));
     case MessageContentType::VoiceNote:
       return td->voice_notes_manager_->rate_speech_recognition(full_message_id, is_good, std::move(promise));
     default:
diff --git a/td/telegram/VideoNotesManager.cpp b/td/telegram/VideoNotesManager.cpp
index f08df77b8..cc176c3ee 100644
--- a/td/telegram/VideoNotesManager.cpp
+++ b/td/telegram/VideoNotesManager.cpp
@@ -9,12 +9,14 @@
 #include "td/telegram/AuthManager.h"
 #include "td/telegram/files/FileManager.h"
 #include "td/telegram/Global.h"
+#include "td/telegram/MessagesManager.h"
 #include "td/telegram/OptionManager.h"
 #include "td/telegram/PhotoFormat.h"
 #include "td/telegram/secret_api.h"
 #include "td/telegram/Td.h"
 #include "td/telegram/td_api.h"
 #include "td/telegram/telegram_api.h"
+#include "td/telegram/UpdatesManager.h"
 
 #include "td/actor/actor.h"
 
@@ -46,10 +48,13 @@ tl_object_ptr<td_api::videoNote> VideoNotesManager::get_video_note_object(FileId
   }
 
   auto video_note = get_video_note(file_id);
+  auto speech_recognition_result = video_note->transcription_info == nullptr
+                                       ? nullptr
+                                       : video_note->transcription_info->get_speech_recognition_result_object();
   return make_tl_object<td_api::videoNote>(
       video_note->duration, video_note->dimensions.width, get_minithumbnail_object(video_note->minithumbnail),
       get_thumbnail_object(td_->file_manager_.get(), video_note->thumbnail, PhotoFormat::Jpeg),
-      td_->file_manager_->get_file_object(file_id));
+      std::move(speech_recognition_result), td_->file_manager_->get_file_object(file_id));
 }
 
 FileId VideoNotesManager::on_get_video_note(unique_ptr<VideoNote> new_video_note, bool replace) {
@@ -78,10 +83,17 @@ FileId VideoNotesManager::on_get_video_note(unique_ptr<VideoNote> new_video_note
       }
       v->thumbnail = std::move(new_video_note->thumbnail);
     }
+    if (TranscriptionInfo::update_from(v->transcription_info, std::move(new_video_note->transcription_info))) {
+      on_video_note_transcription_completed(file_id);
+    }
   }
   return file_id;
 }
 
+VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) {
+  return video_notes_.get_pointer(file_id);  // non-const overload; lets callers modify transcription_info
+}
+
 const VideoNotesManager::VideoNote *VideoNotesManager::get_video_note(FileId file_id) const {
   return video_notes_.get_pointer(file_id);
 }
@@ -103,9 +115,14 @@ FileId VideoNotesManager::dup_video_note(FileId new_id, FileId old_id) {
   CHECK(old_video_note != nullptr);
   auto &new_video_note = video_notes_[new_id];
   CHECK(new_video_note == nullptr);
-  new_video_note = make_unique<VideoNote>(*old_video_note);
+  new_video_note = make_unique<VideoNote>();
   new_video_note->file_id = new_id;
+  new_video_note->duration = old_video_note->duration;
+  new_video_note->dimensions = old_video_note->dimensions;
+  new_video_note->minithumbnail = old_video_note->minithumbnail;
+  new_video_note->thumbnail = old_video_note->thumbnail;
   new_video_note->thumbnail.file_id = td_->file_manager_->dup_file_id(new_video_note->thumbnail.file_id);
+  new_video_note->transcription_info = TranscriptionInfo::copy_if_transcribed(old_video_note->transcription_info);
   return new_id;
 }
 
@@ -173,6 +190,97 @@ void VideoNotesManager::unregister_video_note(FileId video_note_file_id, FullMes
   CHECK(is_deleted);
 }
 
+void VideoNotesManager::recognize_speech(FullMessageId full_message_id, Promise<Unit> &&promise) {
+  auto it = message_video_notes_.find(full_message_id);
+  CHECK(it != message_video_notes_.end());
+  // message_video_notes_ maps the message to its video note file; a missing entry fails the CHECK above
+  auto file_id = it->second;
+  auto video_note = get_video_note(file_id);
+  CHECK(video_note != nullptr);
+  if (video_note->transcription_info == nullptr) {
+    video_note->transcription_info = make_unique<TranscriptionInfo>();  // allocated on demand
+  }
+  // the handler forwards the result to on_transcribed_audio_update with is_initial == true
+  auto handler = [actor_id = actor_id(this),
+                  file_id](Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
+    send_closure(actor_id, &VideoNotesManager::on_transcribed_audio_update, file_id, true, std::move(r_update));
+  };
+  if (video_note->transcription_info->recognize_speech(td_, full_message_id, std::move(promise), std::move(handler))) {
+    on_video_note_transcription_updated(file_id);  // NOTE(review): presumably true means the state changed
+  }
+}
+
+void VideoNotesManager::on_transcribed_audio_update(
+    FileId file_id, bool is_initial, Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
+  if (G()->close_flag()) {
+    return;  // the update is dropped; pending promises are neither set nor failed here
+  }
+  // both the video note and its transcription_info are required to exist; recognize_speech creates the latter
+  auto video_note = get_video_note(file_id);
+  CHECK(video_note != nullptr);
+  CHECK(video_note->transcription_info != nullptr);
+  // on error: hand the error to transcription_info, notify the messages, then fail the returned promises
+  if (r_update.is_error()) {
+    auto promises = video_note->transcription_info->on_failed_transcription(r_update.error().clone());
+    on_video_note_transcription_updated(file_id);
+    fail_promises(promises, r_update.move_as_error());
+    return;
+  }
+  auto update = r_update.move_as_ok();
+  auto transcription_id = update->transcription_id_;
+  if (!update->pending_) {  // not pending: the text is passed on as the final transcription
+    auto promises = video_note->transcription_info->on_final_transcription(std::move(update->text_), transcription_id);
+    on_video_note_transcription_completed(file_id);
+    set_promises(promises);
+  } else {  // still pending: the text is passed on as a partial transcription
+    auto is_changed =
+        video_note->transcription_info->on_partial_transcription(std::move(update->text_), transcription_id);
+    if (is_changed) {
+      on_video_note_transcription_updated(file_id);
+    }
+    // only the initial response subscribes; subscription results come back here with is_initial == false
+    if (is_initial) {
+      td_->updates_manager_->subscribe_to_transcribed_audio_updates(
+          transcription_id, [actor_id = actor_id(this),
+                             file_id](Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update) {
+            send_closure(actor_id, &VideoNotesManager::on_transcribed_audio_update, file_id, false,
+                         std::move(r_update));
+          });
+    }
+  }
+}
+
+void VideoNotesManager::on_video_note_transcription_updated(FileId file_id) {
+  auto it = video_note_messages_.find(file_id);  // video_note_messages_: file -> messages holding this video note
+  if (it != video_note_messages_.end()) {  // nothing to notify when no message is recorded for the file
+    for (const auto &full_message_id : it->second) {
+      td_->messages_manager_->on_external_update_message_content(full_message_id);
+    }
+  }
+}
+
+void VideoNotesManager::on_video_note_transcription_completed(FileId file_id) {
+  auto it = video_note_messages_.find(file_id);  // same lookup as on_video_note_transcription_updated
+  if (it != video_note_messages_.end()) {  // but notifies via on_update_message_content, not the external variant
+    for (const auto &full_message_id : it->second) {
+      td_->messages_manager_->on_update_message_content(full_message_id);
+    }
+  }
+}
+
+void VideoNotesManager::rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise<Unit> &&promise) {
+  auto it = message_video_notes_.find(full_message_id);
+  CHECK(it != message_video_notes_.end());
+  // message_video_notes_ maps the message to its video note file; a missing entry fails the CHECK above
+  auto file_id = it->second;
+  auto video_note = get_video_note(file_id);
+  CHECK(video_note != nullptr);
+  if (video_note->transcription_info == nullptr) {
+    return promise.set_value(Unit());  // no transcription_info, so there is nothing to rate; the promise succeeds
+  }
+  video_note->transcription_info->rate_speech_recognition(td_, full_message_id, is_good, std::move(promise));
+}
+
 SecretInputMedia VideoNotesManager::get_secret_input_media(FileId video_note_file_id,
                                                            tl_object_ptr<telegram_api::InputEncryptedFile> input_file,
                                                            BufferSlice thumbnail, int32 layer) const {
diff --git a/td/telegram/VideoNotesManager.h b/td/telegram/VideoNotesManager.h
index a3a56ce10..f97dbf1b7 100644
--- a/td/telegram/VideoNotesManager.h
+++ b/td/telegram/VideoNotesManager.h
@@ -13,6 +13,7 @@
 #include "td/telegram/SecretInputMedia.h"
 #include "td/telegram/td_api.h"
 #include "td/telegram/telegram_api.h"
+#include "td/telegram/TranscriptionInfo.h"
 
 #include "td/actor/actor.h"
 
@@ -46,6 +47,10 @@ class VideoNotesManager final : public Actor {
 
   void unregister_video_note(FileId video_note_file_id, FullMessageId full_message_id, const char *source);
 
+  void recognize_speech(FullMessageId full_message_id, Promise<Unit> &&promise);
+
+  void rate_speech_recognition(FullMessageId full_message_id, bool is_good, Promise<Unit> &&promise);
+
   tl_object_ptr<telegram_api::InputMedia> get_input_media(FileId file_id,
                                                           tl_object_ptr<telegram_api::InputFile> input_file,
                                                           tl_object_ptr<telegram_api::InputFile> input_thumbnail) const;
@@ -75,14 +80,24 @@ class VideoNotesManager final : public Actor {
     Dimensions dimensions;
     string minithumbnail;
     PhotoSize thumbnail;
+    unique_ptr<TranscriptionInfo> transcription_info;
 
     FileId file_id;
   };
 
+  VideoNote *get_video_note(FileId file_id);
+
   const VideoNote *get_video_note(FileId file_id) const;
 
   FileId on_get_video_note(unique_ptr<VideoNote> new_video_note, bool replace);
 
+  void on_video_note_transcription_updated(FileId file_id);
+
+  void on_video_note_transcription_completed(FileId file_id);
+
+  void on_transcribed_audio_update(FileId file_id, bool is_initial,
+                                   Result<telegram_api::object_ptr<telegram_api::updateTranscribedAudio>> r_update);
+
   void tear_down() final;
 
   Td *td_;
diff --git a/td/telegram/VideoNotesManager.hpp b/td/telegram/VideoNotesManager.hpp
index 819451037..cdfa23140 100644
--- a/td/telegram/VideoNotesManager.hpp
+++ b/td/telegram/VideoNotesManager.hpp
@@ -24,10 +24,12 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const
   bool has_duration = video_note->duration != 0;
   bool has_minithumbnail = !video_note->minithumbnail.empty();
   bool has_thumbnail = video_note->thumbnail.file_id.is_valid();
+  bool is_transcribed = video_note->transcription_info != nullptr && video_note->transcription_info->is_transcribed();
   BEGIN_STORE_FLAGS();
   STORE_FLAG(has_duration);
   STORE_FLAG(has_minithumbnail);
   STORE_FLAG(has_thumbnail);
+  STORE_FLAG(is_transcribed);
   END_STORE_FLAGS();
   if (has_duration) {
     store(video_note->duration, storer);
@@ -39,6 +41,9 @@ void VideoNotesManager::store_video_note(FileId file_id, StorerT &storer) const
   if (has_thumbnail) {
     store(video_note->thumbnail, storer);
   }
+  if (is_transcribed) {
+    store(video_note->transcription_info, storer);
+  }
   store(file_id, storer);
 }
 
@@ -48,16 +53,19 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) {
   bool has_duration;
   bool has_minithumbnail;
   bool has_thumbnail;
+  bool is_transcribed;
   if (parser.version() >= static_cast<int32>(Version::AddVideoNoteFlags)) {
     BEGIN_PARSE_FLAGS();
     PARSE_FLAG(has_duration);
     PARSE_FLAG(has_minithumbnail);
     PARSE_FLAG(has_thumbnail);
+    PARSE_FLAG(is_transcribed);
     END_PARSE_FLAGS();
   } else {
     has_duration = true;
     has_minithumbnail = parser.version() >= static_cast<int32>(Version::SupportMinithumbnails);
     has_thumbnail = true;
+    is_transcribed = false;
   }
   if (has_duration) {
     parse(video_note->duration, parser);
@@ -69,6 +77,9 @@ FileId VideoNotesManager::parse_video_note(ParserT &parser) {
   if (has_thumbnail) {
     parse(video_note->thumbnail, parser);
   }
+  if (is_transcribed) {
+    parse(video_note->transcription_info, parser);
+  }
   parse(video_note->file_id, parser);
   if (parser.get_error() != nullptr || !video_note->file_id.is_valid()) {
     return FileId();