From eef1e9f44ff14c5e65b759bb1eafa3946cdaf719 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 22 May 2024 17:17:10 -0500 Subject: [PATCH] [ie/tiktok] Fix subtitles extraction (#9961) Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 56 ++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index c96fa50388..7772dd1f28 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -212,7 +212,31 @@ def _extract_aweme_app(self, aweme_id): raise ExtractorError('Unable to find video in feed', video_id=aweme_id) return self._parse_aweme_video_app(aweme_detail) - def _get_subtitles(self, aweme_detail, aweme_id): + def _extract_web_data_and_status(self, url, video_id, fatal=True): + webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or '' + video_data, status = {}, None + + if universal_data := self._get_universal_data(webpage, video_id): + self.write_debug('Found universal data for rehydration') + status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0 + video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})) + + elif sigi_data := self._get_sigi_state(webpage, video_id): + self.write_debug('Found sigi state data') + status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 + video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) + + elif next_data := self._search_nextjs_data(webpage, video_id, default={}): + self.write_debug('Found next.js data') + status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 + video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) + + elif fatal: + raise ExtractorError('Unable to extract webpage video data') + + return video_data, status + + def _get_subtitles(self, aweme_detail, aweme_id, user_url): # TODO: Extract text positioning info subtitles = {} # aweme/detail endpoint subs @@ -243,9 +267,10 @@ def _get_subtitles(self, aweme_detail, aweme_id): }) # webpage subs if not subtitles: - for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', ...), expected_type=dict): - if not caption.get('Url'): - continue + if user_url: # only _parse_aweme_video_app needs to extract the webpage here + aweme_detail, _ = self._extract_web_data_and_status( + f'{user_url}/video/{aweme_id}', aweme_id, fatal=False) + for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])): subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({ 'ext': remove_start(caption.get('Format'), 'web'), 'url': caption['Url'], @@ -412,7 +437,7 @@ def extract_addr(addr, add_meta={}): 'album': str_or_none(music_info.get('album')) or None, 'artists': re.split(r'(?:, | & )', music_author) if music_author else None, 'formats': formats, - 'subtitles': self.extract_subtitles(aweme_detail, aweme_id), + 'subtitles': self.extract_subtitles(aweme_detail, aweme_id, user_url), 'thumbnails': thumbnails, 'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000), 'availability': self._availability( @@ -554,6 +579,7 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id): 'channel_id': channel_id, 'uploader_url': user_url, 'formats': formats, + 'subtitles': self.extract_subtitles(aweme_detail, video_id, None), 'thumbnails': thumbnails, 'http_headers': { 'Referer': webpage_url, @@ -839,25 +865,7 @@ def _real_extract(self, url): self.report_warning(f'{e}; trying with webpage') url = self._create_url(user_id, video_id) - webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}) - - if universal_data := self._get_universal_data(webpage, video_id): - self.write_debug('Found universal data for rehydration') - status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0 - video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})) - - elif sigi_data := self._get_sigi_state(webpage, video_id): - self.write_debug('Found sigi state data') - status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 - video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) - - elif next_data := self._search_nextjs_data(webpage, video_id, default={}): - self.write_debug('Found next.js data') - status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 - video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) - - else: - raise ExtractorError('Unable to extract webpage video data') + video_data, status = self._extract_web_data_and_status(url, video_id) if video_data and status == 0: return self._parse_aweme_video_web(video_data, url, video_id)