From f1ab9a3d9387e80b53d5c0b3be1a485f2739a48f Mon Sep 17 00:00:00 2001 From: Deukhoofd Date: Fri, 10 May 2024 18:40:46 +0200 Subject: [PATCH] Fixes for several pages, support for subtitles --- yt_dlp/extractor/beacon.py | 46 ++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/beacon.py b/yt_dlp/extractor/beacon.py index d25d9cfa9..00bfe6ca1 100644 --- a/yt_dlp/extractor/beacon.py +++ b/yt_dlp/extractor/beacon.py @@ -33,26 +33,53 @@ def _real_extract(self, url): state = traverse_obj(json_data, ('props', 'pageProps', '__APOLLO_STATE__')) content_data = None - image_data = None for key, value in state.items(): - if key.startswith('Content'): + # We can be given many different content objects, we want the one where the slug matches the video ID. + if key.startswith('Content') and traverse_obj(value, ('slug')) == video_id: content_data = value - if key.startswith('Image'): - image_data = value + break + # If the user is not authenticated, and this video is not public, the content will be hidden. In this case show an error to the user. if content_data is None: - raise ExtractorError('Failed to find content data', expected=True) + raise ExtractorError('Failed to find content data. Either the given content is not a video, or it requires authentication', expected=True) if content_data['contentVideo'] is None: raise ExtractorError('Failed to find content video. Either the given content is not a video, or it requires authentication', expected=True) - m3u8_url = traverse_obj(content_data, ('contentVideo', 'video', 'video')) + # Apollo GraphQL quirk, works with references. We grab the thumbnail reference so we + thumbnail_ref = traverse_obj(content_data, ('thumbnail', '__ref')) + image_data = None + if thumbnail_ref is not None: + image_data = traverse_obj(state, (thumbnail_ref)) + + # Prefer landscape thumbnail + thumbnail_url = traverse_obj(image_data, ('sizes', 'landscape', 'url')) + # If not found, try for square thumbnail + if thumbnail_url is None: + thumbnail_url = traverse_obj(image_data, ('sizes', 'square', 'url')) + # Otherwise, fall back to any other, if one exists + if thumbnail_url is None: + thumbnail_url = traverse_obj(image_data, ('sizes', ..., 'url')) + + video_data = traverse_obj(content_data, ('contentVideo', 'video')) + m3u8_url = traverse_obj(video_data, 'video') if m3u8_url is None: raise ExtractorError('Failed to find video data', expected=True) - thumbnail_url = traverse_obj(image_data, ('sizes', 'landscape', 'url')) - if thumbnail_url is None: - thumbnail_url = traverse_obj(image_data, ('sizes', 'square', 'url')) + # Beacon puts additional JSON in stringified form in the videoData. This data contains information about subtitles, and + # as such we parse this, and extract these subtitles. + additional_video_data_string = traverse_obj(video_data, 'videoData') + additional_video_data = self._parse_json(additional_video_data_string, video_id) + tracks_arr = traverse_obj(additional_video_data, ('playlist', ..., 'tracks')) + subtitles = {} + if tracks_arr is not None: + for tracks in tracks_arr: + for track in tracks: + if traverse_obj(track, 'kind') == 'captions': + file = track['file'] + language = track['language'].lower() + subs = {language: [{'url': file}]} + self._merge_subtitles(subs, target=subtitles) title = traverse_obj(content_data, 'title') description = traverse_obj(content_data, 'description') @@ -67,4 +94,5 @@ def _real_extract(self, url): 'timestamp': parse_iso8601(publishedAt), 'description': description, 'thumbnail': thumbnail_url, + 'subtitles': subtitles, }