From f1ab9a3d9387e80b53d5c0b3be1a485f2739a48f Mon Sep 17 00:00:00 2001
From: Deukhoofd <Deukhoofd@gmail.com>
Date: Fri, 10 May 2024 18:40:46 +0200
Subject: [PATCH] Fixes for several pages, support for subtitles

---
 yt_dlp/extractor/beacon.py | 46 ++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/yt_dlp/extractor/beacon.py b/yt_dlp/extractor/beacon.py
index d25d9cfa9..00bfe6ca1 100644
--- a/yt_dlp/extractor/beacon.py
+++ b/yt_dlp/extractor/beacon.py
@@ -33,26 +33,53 @@ def _real_extract(self, url):
         state = traverse_obj(json_data, ('props', 'pageProps', '__APOLLO_STATE__'))
 
         content_data = None
-        image_data = None
         for key, value in state.items():
-            if key.startswith('Content'):
+            # The state can contain many different content objects; we want the one whose slug matches the video ID.
+            if key.startswith('Content') and traverse_obj(value, ('slug')) == video_id:
                 content_data = value
-            if key.startswith('Image'):
-                image_data = value
+                break
 
+        # If the user is not authenticated, and this video is not public, the content will be hidden. In this case show an error to the user.
         if content_data is None:
-            raise ExtractorError('Failed to find content data', expected=True)
+            raise ExtractorError('Failed to find content data. Either the given content is not a video, or it requires authentication', expected=True)
         if content_data['contentVideo'] is None:
             raise ExtractorError('Failed to find content video. Either the given content is not a video, or it requires authentication', expected=True)
 
-        m3u8_url = traverse_obj(content_data, ('contentVideo', 'video', 'video'))
+        # Apollo GraphQL quirk: objects are linked by reference. We grab the thumbnail's '__ref' so we can look up the image object in the state.
+        thumbnail_ref = traverse_obj(content_data, ('thumbnail', '__ref'))
+        image_data = None
+        if thumbnail_ref is not None:
+            image_data = traverse_obj(state, (thumbnail_ref))
+
+        # Prefer landscape thumbnail
+        thumbnail_url = traverse_obj(image_data, ('sizes', 'landscape', 'url'))
+        # If not found, try for square thumbnail
+        if thumbnail_url is None:
+            thumbnail_url = traverse_obj(image_data, ('sizes', 'square', 'url'))
+        # Otherwise, fall back to any other, if one exists
+        if thumbnail_url is None:
+            thumbnail_url = traverse_obj(image_data, ('sizes', ..., 'url'))
+
+        video_data = traverse_obj(content_data, ('contentVideo', 'video'))
+        m3u8_url = traverse_obj(video_data, 'video')
 
         if m3u8_url is None:
             raise ExtractorError('Failed to find video data', expected=True)
 
-        thumbnail_url = traverse_obj(image_data, ('sizes', 'landscape', 'url'))
-        if thumbnail_url is None:
-            thumbnail_url = traverse_obj(image_data, ('sizes', 'square', 'url'))
+        # Beacon stores additional JSON, in stringified form, in the 'videoData' field. This JSON describes the subtitle
+        # tracks, so we parse it and extract the caption tracks from it.
+        additional_video_data_string = traverse_obj(video_data, 'videoData')
+        additional_video_data = self._parse_json(additional_video_data_string, video_id)
+        tracks_arr = traverse_obj(additional_video_data, ('playlist', ..., 'tracks'))
+        subtitles = {}
+        if tracks_arr is not None:
+            for tracks in tracks_arr:
+                for track in tracks:
+                    if traverse_obj(track, 'kind') == 'captions':
+                        file = track['file']
+                        language = track['language'].lower()
+                        subs = {language: [{'url': file}]}
+                        self._merge_subtitles(subs, target=subtitles)
 
         title = traverse_obj(content_data, 'title')
         description = traverse_obj(content_data, 'description')
@@ -67,4 +94,5 @@ def _real_extract(self, url):
             'timestamp': parse_iso8601(publishedAt),
             'description': description,
             'thumbnail': thumbnail_url,
+            'subtitles': subtitles,
         }