mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-06-29 17:06:27 +02:00
Fixes for several pages, support for subtitles
This commit is contained in:
parent
ef0542bbd6
commit
f1ab9a3d93
|
@ -33,26 +33,53 @@ def _real_extract(self, url):
|
||||||
state = traverse_obj(json_data, ('props', 'pageProps', '__APOLLO_STATE__'))
|
state = traverse_obj(json_data, ('props', 'pageProps', '__APOLLO_STATE__'))
|
||||||
|
|
||||||
content_data = None
|
content_data = None
|
||||||
image_data = None
|
|
||||||
for key, value in state.items():
|
for key, value in state.items():
|
||||||
if key.startswith('Content'):
|
# We can be given many different content objects; we want the one whose slug matches the video ID.
|
||||||
|
if key.startswith('Content') and traverse_obj(value, ('slug')) == video_id:
|
||||||
content_data = value
|
content_data = value
|
||||||
if key.startswith('Image'):
|
break
|
||||||
image_data = value
|
|
||||||
|
|
||||||
|
# If the user is not authenticated, and this video is not public, the content will be hidden. In this case show an error to the user.
|
||||||
if content_data is None:
|
if content_data is None:
|
||||||
raise ExtractorError('Failed to find content data', expected=True)
|
raise ExtractorError('Failed to find content data. Either the given content is not a video, or it requires authentication', expected=True)
|
||||||
if content_data['contentVideo'] is None:
|
if content_data['contentVideo'] is None:
|
||||||
raise ExtractorError('Failed to find content video. Either the given content is not a video, or it requires authentication', expected=True)
|
raise ExtractorError('Failed to find content video. Either the given content is not a video, or it requires authentication', expected=True)
|
||||||
|
|
||||||
m3u8_url = traverse_obj(content_data, ('contentVideo', 'video', 'video'))
|
# Apollo GraphQL quirk: objects are stored as references. We grab the thumbnail reference so we can look up the image data in the state.
|
||||||
|
thumbnail_ref = traverse_obj(content_data, ('thumbnail', '__ref'))
|
||||||
|
image_data = None
|
||||||
|
if thumbnail_ref is not None:
|
||||||
|
image_data = traverse_obj(state, (thumbnail_ref))
|
||||||
|
|
||||||
|
# Prefer landscape thumbnail
|
||||||
|
thumbnail_url = traverse_obj(image_data, ('sizes', 'landscape', 'url'))
|
||||||
|
# If not found, try for square thumbnail
|
||||||
|
if thumbnail_url is None:
|
||||||
|
thumbnail_url = traverse_obj(image_data, ('sizes', 'square', 'url'))
|
||||||
|
# Otherwise, fall back to any other, if one exists
|
||||||
|
if thumbnail_url is None:
|
||||||
|
thumbnail_url = traverse_obj(image_data, ('sizes', ..., 'url'))
|
||||||
|
|
||||||
|
video_data = traverse_obj(content_data, ('contentVideo', 'video'))
|
||||||
|
m3u8_url = traverse_obj(video_data, 'video')
|
||||||
|
|
||||||
if m3u8_url is None:
|
if m3u8_url is None:
|
||||||
raise ExtractorError('Failed to find video data', expected=True)
|
raise ExtractorError('Failed to find video data', expected=True)
|
||||||
|
|
||||||
thumbnail_url = traverse_obj(image_data, ('sizes', 'landscape', 'url'))
|
# Beacon puts additional JSON in stringified form in the videoData. This data contains information about subtitles, and
|
||||||
if thumbnail_url is None:
|
# as such we parse this, and extract these subtitles.
|
||||||
thumbnail_url = traverse_obj(image_data, ('sizes', 'square', 'url'))
|
additional_video_data_string = traverse_obj(video_data, 'videoData')
|
||||||
|
additional_video_data = self._parse_json(additional_video_data_string, video_id)
|
||||||
|
tracks_arr = traverse_obj(additional_video_data, ('playlist', ..., 'tracks'))
|
||||||
|
subtitles = {}
|
||||||
|
if tracks_arr is not None:
|
||||||
|
for tracks in tracks_arr:
|
||||||
|
for track in tracks:
|
||||||
|
if traverse_obj(track, 'kind') == 'captions':
|
||||||
|
file = track['file']
|
||||||
|
language = track['language'].lower()
|
||||||
|
subs = {language: [{'url': file}]}
|
||||||
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
|
|
||||||
title = traverse_obj(content_data, 'title')
|
title = traverse_obj(content_data, 'title')
|
||||||
description = traverse_obj(content_data, 'description')
|
description = traverse_obj(content_data, 'description')
|
||||||
|
@ -67,4 +94,5 @@ def _real_extract(self, url):
|
||||||
'timestamp': parse_iso8601(publishedAt),
|
'timestamp': parse_iso8601(publishedAt),
|
||||||
'description': description,
|
'description': description,
|
||||||
'thumbnail': thumbnail_url,
|
'thumbnail': thumbnail_url,
|
||||||
|
'subtitles': subtitles,
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user