Compare commits

...

17 Commits

Author SHA1 Message Date
danilovsergei
f135f9371e
Merge 523c7e1d44 into 5dbac313ae 2024-06-16 16:29:53 +05:30
bashonly
5dbac313ae [ie/generic] Add key_query extractor-arg
Authored by: bashonly
2024-06-15 18:38:02 -05:00
bashonly
ca8885edd9 [fd/hls] Apply extra_param_to_key_url from info dict
Authored by: bashonly
2024-06-15 18:38:02 -05:00
c-basalt
4093eb1fcc
[ie/khanacademy] Fix extractors (#9136)
Closes #8775
Authored by: c-basalt
2024-06-15 21:51:27 +02:00
bashonly
a0d9967f68
[ie/youtube:tab] Fix channel metadata extraction (#10071)
Closes #9893, Closes #10090
Authored by: bashonly, shoxie007

Co-authored-by: shoxie007 <74592022+shoxie007@users.noreply.github.com>
2024-06-13 23:22:30 +00:00
bashonly
ea88129784
[ie/tiktok] Detect and raise when login is required (#10124)
Authored by: bashonly
2024-06-13 23:16:43 +00:00
garret1317
b8e2a5e0e1
[ie/NHKRadiru] Fix extractor (#10106)
Closes #10105
Authored by: garret1317
2024-06-13 23:08:40 +00:00
bashonly
e53e56b735
[ie/soundcloud] Fix download format extraction (#10125)
Authored by: bashonly
2024-06-13 23:01:19 +00:00
JSubelj
92a1c4abae
[ie/rtvslo.si:show] Add extractor (#8418)
Authored by: JSubelj, seproDev

Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com>
2024-06-14 00:51:12 +02:00
bashonly
3690c2f598
[ie/francetv] Detect and raise errors for DRM (#10165)
Closes #10163
Authored by: bashonly
2024-06-13 22:44:20 +00:00
bashonly
081708d607
[ie/francetv] Fix extractor (#10177)
Closes #10175
Authored by: bashonly
2024-06-13 22:31:13 +00:00
bashonly
d7d861811c
[ie/tubitv:series] Fix extractor (#10116)
Closes #8563
Authored by: bashonly
2024-06-13 21:59:17 +00:00
bashonly
46c1b7cfec
[build] Cache dependencies for macos job (#10088)
Authored by: bashonly
2024-06-13 21:13:08 +00:00
sergeidanilov
523c7e1d44 Fixed comments and added more supported formats 2023-11-12 23:40:55 -08:00
sergeidanilov
fc46f24dc5 fix pull request lint comments
cleaned up implementation and comments
2023-10-25 23:18:49 -07:00
danilovsergei
c29caf1739
Merge branch 'yt-dlp:master' into split-chapters-metadata-m4a-fix 2023-10-25 23:04:21 -07:00
danilovsergei
bb9ec9a24d
Fix https://github.com/yt-dlp/yt-dlp/issues/8363 by manually adding metadata during ffmpeg split chapters execution 2023-10-16 22:35:04 -07:00
20 changed files with 615 additions and 266 deletions

View File

@@ -237,27 +237,43 @@ jobs:
macos:
needs: process
if: inputs.macos
permissions:
contents: read
actions: write # For cleaning up cache
runs-on: macos-12
steps:
- uses: actions/checkout@v4
# NB: Building universal2 does not work with python from actions/setup-python
- name: Restore cached requirements
id: restore-cache
uses: actions/cache/restore@v4
env:
SEGMENT_DOWNLOAD_TIMEOUT_MINS: 1
with:
path: |
~/yt-dlp-build-venv
key: cache-reqs-${{ github.job }}
- name: Install Requirements
run: |
brew install coreutils
python3 devscripts/install_deps.py --user -o --include build
python3 -m venv ~/yt-dlp-build-venv
source ~/yt-dlp-build-venv/bin/activate
python3 devscripts/install_deps.py -o --include build
python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt
# We need to ignore wheels otherwise we break universal2 builds
python3 -m pip install -U --user --no-binary :all: -r requirements.txt
python3 -m pip install -U --no-binary :all: -r requirements.txt
# We need to fuse our own universal2 wheels for curl_cffi
python3 -m pip install -U --user delocate
python3 -m pip install -U delocate
mkdir curl_cffi_whls curl_cffi_universal2
python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt
for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do
python3 -m pip download \
--only-binary=:all: \
--platform "${platform}" \
--pre -d curl_cffi_whls \
-d curl_cffi_whls \
-r requirements.txt
done
( # Overwrite x86_64-only libs with fat/universal2 libs or else Pyinstaller will do the opposite
@@ -274,9 +290,10 @@ jobs:
)
python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/curl_cffi*.whl -w curl_cffi_universal2
python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/cffi*.whl -w curl_cffi_universal2
cd curl_cffi_universal2
for wheel in ./*cffi*.whl; do mv -n -- "${wheel}" "${wheel/x86_64/universal2}"; done
python3 -m pip install -U --user ./*cffi*.whl
for wheel in curl_cffi_universal2/*cffi*.whl; do
mv -n -- "${wheel}" "${wheel/x86_64/universal2}"
done
python3 -m pip install --force-reinstall -U curl_cffi_universal2/*cffi*.whl
- name: Prepare
run: |
@@ -284,6 +301,7 @@ jobs:
python3 devscripts/make_lazy_extractors.py
- name: Build
run: |
source ~/yt-dlp-build-venv/bin/activate
python3 -m bundle.pyinstaller --target-architecture universal2 --onedir
(cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .)
python3 -m bundle.pyinstaller --target-architecture universal2
@@ -307,6 +325,24 @@ jobs:
dist/yt-dlp_macos.zip
compression-level: 0
- name: Cleanup cache
if: steps.restore-cache.outputs.cache-hit == 'true'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
cache_key: cache-reqs-${{ github.job }}
repository: ${{ github.repository }}
branch: ${{ github.ref }}
run: |
gh extension install actions/gh-actions-cache
gh actions-cache delete "${cache_key}" -R "${repository}" -B "${branch}" --confirm
- name: Cache requirements
uses: actions/cache/save@v4
with:
path: |
~/yt-dlp-build-venv
key: cache-reqs-${{ github.job }}
macos_legacy:
needs: process
if: inputs.macos_legacy

View File

@@ -24,6 +24,7 @@ jobs:
source: master
permissions:
contents: write
packages: write
packages: write # For package cache
actions: write # For cleaning up cache
id-token: write # mandatory for trusted publishing
secrets: inherit

View File

@@ -37,6 +37,7 @@ jobs:
source: nightly
permissions:
contents: write
packages: write
packages: write # For package cache
actions: write # For cleaning up cache
id-token: write # mandatory for trusted publishing
secrets: inherit

View File

@@ -228,7 +228,8 @@ jobs:
origin: ${{ needs.prepare.outputs.target_repo }}
permissions:
contents: read
packages: write # For package cache
packages: write # For package cache
actions: write # For cleaning up cache
secrets:
GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }}

View File

@@ -1779,8 +1779,9 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.)
* `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off
#### generic
* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg
* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Note that if the stream has an HLS AES-128 key, then the query parameters will be passed to the key URI as well, unless the `key_query` extractor-arg is passed, or unless an external key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg
* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE`
* `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg
* `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist
* `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live`
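As a usage sketch (not part of this diff): these extractor-args can also be passed through the Python API, roughly equivalent to `--extractor-args "generic:key_query=token=abc123"` on the command line. The URL and token value here are hypothetical.

import yt_dlp

# key_query applies to HLS AES-128 key URIs handled by the generic extractor
ydl_opts = {
    'extractor_args': {'generic': {'key_query': ['token=abc123']}},
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/encrypted/stream.m3u8'])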

View File

@@ -581,8 +581,9 @@ class YoutubeDL:
'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data',
'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies',
'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options',
'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url',
'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version',
'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
}
_deprecated_multivalue_fields = {
'album_artist': 'album_artists',

View File

@@ -108,7 +108,7 @@ def supports(cls, info_dict):
return all((
not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES,
'+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES,
not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url'),
not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url', 'extra_param_to_key_url'),
all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')),
))
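This guard keeps formats carrying any of these fields on the native downloader. A small sketch of the check against a hypothetical info dict:

from yt_dlp.utils import traverse_obj

info_dict = {'protocol': 'm3u8_native', 'extra_param_to_key_url': 'auth=xyz'}
# traverse_obj returns the first non-empty match across the given paths,
# so the presence of any of the three fields makes supports() fail
print(not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url',
                       'extra_param_to_key_url'))  # False -> external downloader rejected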

View File

@@ -160,10 +160,12 @@ def is_ad_fragment_end(s):
extra_state = ctx.setdefault('extra_state', {})
format_index = info_dict.get('format_index')
extra_query = None
extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
if extra_param_to_segment_url:
extra_query = urllib.parse.parse_qs(extra_param_to_segment_url)
extra_segment_query = None
if extra_param_to_segment_url := info_dict.get('extra_param_to_segment_url'):
extra_segment_query = urllib.parse.parse_qs(extra_param_to_segment_url)
extra_key_query = None
if extra_param_to_key_url := info_dict.get('extra_param_to_key_url'):
extra_key_query = urllib.parse.parse_qs(extra_param_to_key_url)
i = 0
media_sequence = 0
decrypt_info = {'METHOD': 'NONE'}
@@ -190,8 +192,8 @@ def is_ad_fragment_end(s):
if frag_index <= ctx['fragment_index']:
continue
frag_url = urljoin(man_url, line)
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
if extra_segment_query:
frag_url = update_url_query(frag_url, extra_segment_query)
fragments.append({
'frag_index': frag_index,
@@ -212,8 +214,8 @@ def is_ad_fragment_end(s):
frag_index += 1
map_info = parse_m3u8_attributes(line[11:])
frag_url = urljoin(man_url, map_info.get('URI'))
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
if extra_segment_query:
frag_url = update_url_query(frag_url, extra_segment_query)
if map_info.get('BYTERANGE'):
splitted_byte_range = map_info.get('BYTERANGE').split('@')
@@ -244,8 +246,10 @@ def is_ad_fragment_end(s):
decrypt_info['KEY'] = external_aes_key
else:
decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI'])
if extra_query:
decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
if extra_key_query or extra_segment_query:
# Fall back to extra_segment_query for the key URI, for backwards compat
decrypt_info['URI'] = update_url_query(
decrypt_info['URI'], extra_key_query or extra_segment_query)
if decrypt_url != decrypt_info['URI']:
decrypt_info['KEY'] = None
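A standalone sketch of this fallback using only the standard library; yt-dlp's update_url_query merges a query dict into a URL in essentially this way, and the URLs/token here are hypothetical:

import urllib.parse

def update_url_query(url, query):
    parsed = urllib.parse.urlparse(url)
    qs = urllib.parse.parse_qs(parsed.query)
    qs.update(query)
    return parsed._replace(query=urllib.parse.urlencode(qs, doseq=True)).geturl()

extra_key_query = None  # no extra_param_to_key_url in the info dict
extra_segment_query = urllib.parse.parse_qs('token=abc')
key_uri = 'https://example.com/aes.key?kid=1'
if extra_key_query or extra_segment_query:
    # the key URI falls back to the segment query for backwards compat
    key_uri = update_url_query(key_uri, extra_key_query or extra_segment_query)
print(key_uri)  # https://example.com/aes.key?kid=1&token=abc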

View File

@@ -1755,7 +1755,10 @@
RTVETelevisionIE,
)
from .rtvs import RTVSIE
from .rtvslo import RTVSLOIE
from .rtvslo import (
RTVSLOIE,
RTVSLOShowIE,
)
from .rudovideo import RudoVideoIE
from .rule34video import Rule34VideoIE
from .rumble import (

View File

@@ -234,7 +234,14 @@ class InfoExtractor:
'maybe' if the format may have DRM and has to be tested before download.
* extra_param_to_segment_url A query string to append to each
fragment's URL, or to update each existing query string
with. Only applied by the native HLS/DASH downloaders.
with. If it is an HLS stream with an AES-128 decryption key,
the query parameters will be passed to the key URI as well,
unless an `extra_param_to_key_url` is given,
or unless an external key URI is provided via `hls_aes`.
Only applied by the native HLS/DASH downloaders.
* extra_param_to_key_url A query string to append to the URL
of the format's HLS AES-128 decryption key.
Only applied by the native HLS downloader.
* hls_aes A dictionary of HLS AES-128 decryption information
used by the native HLS downloader to override the
values in the media playlist when an '#EXT-X-KEY' tag
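A hypothetical info-dict fragment showing the two fields side by side; per the docstring above, only the native downloaders consume them:

info = {
    'url': 'https://example.com/media.m3u8',
    'protocol': 'm3u8_native',
    'extra_param_to_segment_url': 'token=abc',  # appended to every fragment URL
    'extra_param_to_key_url': 'auth=xyz',       # appended to the AES-128 key URI instead
}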

View File

@@ -5,6 +5,7 @@
from .dailymotion import DailymotionIE
from ..networking import HEADRequest
from ..utils import (
clean_html,
determine_ext,
filter_dict,
format_field,
@@ -33,6 +34,7 @@ class FranceTVIE(InfoExtractor):
_GEO_BYPASS = False
_TESTS = [{
# tokenized url is in dinfo['video']['token']
'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1',
'info_dict': {
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
@@ -44,6 +46,19 @@ class FranceTVIE(InfoExtractor):
'upload_date': '20170813',
},
'params': {'skip_download': 'm3u8'},
}, {
# tokenized url is in dinfo['video']['token']['akamai']
'url': 'francetv:c5bda21d-2c6f-4470-8849-3d8327adb2ba',
'info_dict': {
'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba',
'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus',
'timestamp': 1514118300,
'duration': 2880,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20171224',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'francetv:162311093',
'only_matching': True,
@@ -68,6 +83,7 @@ class FranceTVIE(InfoExtractor):
def _extract_video(self, video_id, hostname=None):
is_live = None
videos = []
drm_formats = False
title = None
subtitle = None
episode_number = None
@@ -85,13 +101,12 @@ def _extract_video(self, video_id, hostname=None):
'device_type': device_type,
'browser': browser,
'domain': hostname,
}), fatal=False)
}), fatal=False, expected_status=422) # 422 json gives detailed error code/message
if not dinfo:
continue
video = traverse_obj(dinfo, ('video', {dict}))
if video:
if video := traverse_obj(dinfo, ('video', {dict})):
videos.append(video)
if duration is None:
duration = video.get('duration')
@@ -99,9 +114,19 @@ def _extract_video(self, video_id, hostname=None):
is_live = video.get('is_live')
if spritesheets is None:
spritesheets = video.get('spritesheets')
elif code := traverse_obj(dinfo, ('code', {int})):
if code == 2009:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
elif code in (2015, 2017):
# 2015: L'accès à cette vidéo est impossible. ("Access to this video is impossible"; DRM-only)
# 2017: Cette vidéo n'est pas disponible depuis le site web mobile ("This video is not available from the mobile website"; b/c DRM)
drm_formats = True
continue
self.report_warning(
f'{self.IE_NAME} said: {code} "{clean_html(dinfo.get("message"))}"')
continue
meta = traverse_obj(dinfo, ('meta', {dict}))
if meta:
if meta := traverse_obj(dinfo, ('meta', {dict})):
if title is None:
title = meta.get('title')
# meta['pre_title'] contains season and episode number for series in format "S<ID> E<ID>"
@@ -114,12 +139,15 @@ def _extract_video(self, video_id, hostname=None):
if timestamp is None:
timestamp = parse_iso8601(meta.get('broadcasted_at'))
if not videos and drm_formats:
self.report_drm(video_id)
formats, subtitles, video_url = [], {}, None
for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])):
video_url = video['url']
format_id = video.get('format')
if token_url := url_or_none(video.get('token')):
if token_url := traverse_obj(video, ('token', (None, 'akamai'), {url_or_none}, any)):
tokenized_url = traverse_obj(self._download_json(
token_url, video_id, f'Downloading signed {format_id} manifest URL',
fatal=False, query={
@@ -225,13 +253,13 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
_TESTS = [{
'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
'info_dict': {
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba',
'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus',
'timestamp': 1502623500,
'duration': 2580,
'timestamp': 1514118300,
'duration': 2880,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20170813',
'upload_date': '20171224',
},
'params': {
'skip_download': True,

View File

@@ -2167,7 +2167,15 @@ def _extra_manifest_info(self, info, manifest_url):
urllib.parse.urlparse(fragment_query).query or fragment_query
or urllib.parse.urlparse(manifest_url).query or None)
hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None
key_query = self._configuration_arg('key_query', [None], casesense=True)[0]
if key_query is not None:
info['extra_param_to_key_url'] = (
urllib.parse.urlparse(key_query).query or key_query
or urllib.parse.urlparse(manifest_url).query or None)
def hex_or_none(value):
return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None
info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
}) or None
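A sketch of the normalization applied to key_query above (fragment_query gets the same treatment): a value that itself parses as a URL contributes its query string, and an empty value falls back to the manifest URL's own query. Example values are hypothetical.

import urllib.parse

def normalize_query(value, manifest_url):
    return (urllib.parse.urlparse(value).query or value
            or urllib.parse.urlparse(manifest_url).query or None)

print(normalize_query('a=1', 'https://example.com/x.m3u8?b=2'))    # a=1
print(normalize_query('https://other.example/?a=1',
                      'https://example.com/x.m3u8?b=2'))           # a=1
print(normalize_query('', 'https://example.com/x.m3u8?b=2'))       # b=2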

View File

@@ -3,43 +3,52 @@
from .common import InfoExtractor
from ..utils import (
int_or_none,
make_archive_id,
parse_iso8601,
try_get,
str_or_none,
traverse_obj,
url_or_none,
urljoin,
)
class KhanAcademyBaseIE(InfoExtractor):
_VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
_PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70'
def _parse_video(self, video):
return {
'_type': 'url_transparent',
'url': video['youtubeId'],
'id': video.get('slug'),
'title': video.get('title'),
'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
'duration': int_or_none(video.get('duration')),
'description': video.get('description'),
'id': video['youtubeId'],
'ie_key': 'Youtube',
**traverse_obj(video, {
'display_id': ('id', {str_or_none}),
'title': ('translatedTitle', {str}),
'thumbnail': ('thumbnailUrls', ..., 'url', {url_or_none}),
'duration': ('duration', {int_or_none}),
'description': ('description', {str}),
}, get_all=False),
}
def _real_extract(self, url):
display_id = self._match_id(url)
content = self._download_json(
'https://www.khanacademy.org/api/internal/graphql/FetchContentData',
display_id, query={
'https://www.khanacademy.org/api/internal/graphql/ContentForPath', display_id,
query={
'fastly_cacheable': 'persist_until_publish',
'hash': '4134764944',
'lang': 'en',
'pcv': self._PUBLISHED_CONTENT_VERSION,
'hash': '1242644265',
'variables': json.dumps({
'path': display_id,
'queryParams': 'lang=en',
'isModal': False,
'followRedirects': True,
'countryCode': 'US',
'kaLocale': 'en',
'clientPublishedContentVersion': self._PUBLISHED_CONTENT_VERSION,
}),
})['data']['contentJson']
return self._parse_component_props(self._parse_json(content, display_id)['componentProps'])
'lang': 'en',
})['data']['contentRoute']['listedPathData']
return self._parse_component_props(content, display_id)
class KhanAcademyIE(KhanAcademyBaseIE):
@@ -47,64 +56,98 @@ class KhanAcademyIE(KhanAcademyBaseIE):
_VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
_TEST = {
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
'md5': '1d5c2e70fa6aa29c38eca419f12515ce',
'info_dict': {
'id': 'FlIG3TvQCBQ',
'ext': 'mp4',
'title': 'The one-time pad',
'description': 'The perfect cipher',
'display_id': '716378217',
'duration': 176,
'uploader': 'Brit Cruise',
'uploader_id': 'khanacademy',
'uploader': 'Khan Academy',
'uploader_id': '@khanacademy',
'uploader_url': 'https://www.youtube.com/@khanacademy',
'upload_date': '20120411',
'timestamp': 1334170113,
'license': 'cc-by-nc-sa',
'live_status': 'not_live',
'channel': 'Khan Academy',
'channel_id': 'UC4a-Gbdw7vOaccHmFo40b9g',
'channel_url': 'https://www.youtube.com/channel/UC4a-Gbdw7vOaccHmFo40b9g',
'channel_is_verified': True,
'playable_in_embed': True,
'categories': ['Education'],
'creators': ['Brit Cruise'],
'tags': [],
'age_limit': 0,
'availability': 'public',
'comment_count': int,
'channel_follower_count': int,
'thumbnail': str,
'view_count': int,
'like_count': int,
'heatmap': list,
},
'add_ie': ['Youtube'],
}
def _parse_component_props(self, component_props):
video = component_props['tutorialPageData']['contentModel']
info = self._parse_video(video)
author_names = video.get('authorNames')
info.update({
'uploader': ', '.join(author_names) if author_names else None,
'timestamp': parse_iso8601(video.get('dateAdded')),
'license': video.get('kaUserLicense'),
})
return info
def _parse_component_props(self, component_props, display_id):
video = component_props['content']
return {
**self._parse_video(video),
**traverse_obj(video, {
'creators': ('authorNames', ..., {str}),
'timestamp': ('dateAdded', {parse_iso8601}),
'license': ('kaUserLicense', {str}),
}),
}
class KhanAcademyUnitIE(KhanAcademyBaseIE):
IE_NAME = 'khanacademy:unit'
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
_TEST = {
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('1,2', '')) + '/?(?:[?#&]|$)'
_TESTS = [{
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
'info_dict': {
'id': 'cryptography',
'id': 'x48c910b6',
'title': 'Cryptography',
'description': 'How have humans protected their secret messages through history? What has changed today?',
'display_id': 'computing/computer-science/cryptography',
'_old_archive_ids': ['khanacademyunit cryptography'],
},
'playlist_mincount': 31,
}
}, {
'url': 'https://www.khanacademy.org/computing/computer-science',
'info_dict': {
'id': 'x301707a0',
'title': 'Computer science theory',
'description': 'md5:4b472a4646e6cf6ec4ccb52c4062f8ba',
'display_id': 'computing/computer-science',
'_old_archive_ids': ['khanacademyunit computer-science'],
},
'playlist_mincount': 50,
}]
def _parse_component_props(self, component_props):
curation = component_props['curation']
def _parse_component_props(self, component_props, display_id):
course = component_props['course']
selected_unit = traverse_obj(course, (
'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course
entries = []
tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
for tutorial_number, tutorial in enumerate(tutorials, 1):
chapter_info = {
'chapter': tutorial.get('title'),
'chapter_number': tutorial_number,
'chapter_id': tutorial.get('id'),
}
for content_item in (tutorial.get('contentItems') or []):
if content_item.get('kind') == 'Video':
info = self._parse_video(content_item)
info.update(chapter_info)
entries.append(info)
def build_entry(entry):
return self.url_result(urljoin(
'https://www.khanacademy.org', entry['canonicalUrl']),
KhanAcademyIE, title=entry.get('translatedTitle'))
entries = traverse_obj(selected_unit, (
(('unitChildren', ...), None), 'allOrderedChildren', ..., 'curatedChildren',
lambda _, v: v['contentKind'] == 'Video' and v['canonicalUrl'], {build_entry}))
return self.playlist_result(
entries, curation.get('unit'), curation.get('title'),
curation.get('description'))
entries,
display_id=display_id,
**traverse_obj(selected_unit, {
'id': ('id', {str}),
'title': ('translatedTitle', {str}),
'description': ('translatedDescription', {str}),
'_old_archive_ids': ('slug', {str}, {lambda x: [make_archive_id(self, x)] if x else None}),
}))

View File

@@ -4,6 +4,7 @@
from ..utils import (
ExtractorError,
clean_html,
filter_dict,
get_element_by_class,
int_or_none,
join_nonempty,
@@ -590,21 +591,22 @@ class NhkRadiruIE(InfoExtractor):
IE_DESC = 'NHK らじる (Radiru/Rajiru)'
_VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
_TESTS = [{
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210',
'skip': 'Episode expired on 2024-02-24',
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239',
'skip': 'Episode expired on 2024-06-09',
'info_dict': {
'title': 'ジャズ・トゥナイト シリーズJAZZジャイアンツ 56 ジョニー・ホッジス',
'id': '0449_01_3926210',
'title': 'ジャズ・トゥナイト ジャズ「Night and Day」特集',
'id': '0449_01_4003239',
'ext': 'm4a',
'uploader': 'NHK FM 東京',
'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc',
'series': 'ジャズ・トゥナイト',
'uploader': 'NHK-FM',
'channel': 'NHK-FM',
'channel': 'NHK FM 東京',
'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
'release_date': '20240217',
'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811',
'timestamp': 1708185600,
'release_timestamp': 1708178400,
'upload_date': '20240217',
'upload_date': '20240601',
'series_id': '0449_01',
'release_date': '20240601',
'timestamp': 1717257600,
'release_timestamp': 1717250400,
},
}, {
# playlist, airs every weekday so it should _hopefully_ be okay forever
@@ -613,71 +615,145 @@ class NhkRadiruIE(InfoExtractor):
'id': '0458_01',
'title': 'ベストオブクラシック',
'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
'channel': 'NHK-FM',
'uploader': 'NHK-FM',
'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
'series_id': '0458_01',
'uploader': 'NHK FM',
'channel': 'NHK FM',
'series': 'ベストオブクラシック',
},
'playlist_mincount': 3,
}, {
# one with letters in the id
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
'note': 'Expires on 2024-03-31',
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688',
'note': 'Expires on 2025-03-31',
'info_dict': {
'id': 'F300_06_3738470',
'id': 'F683_01_3910688',
'ext': 'm4a',
'title': '有島武郎「一房のぶどう」',
'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より',
'channel': 'NHKラジオ第1、NHK-FM',
'uploader': 'NHKラジオ第1、NHK-FM',
'timestamp': 1635757200,
'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
'release_date': '20161207',
'series': 'らじる文庫 by ラジオ深夜便 ',
'release_timestamp': 1481126700,
'upload_date': '20211101',
'title': '夏目漱石「文鳥」第1回',
'series': '【らじる文庫】夏目漱石「文鳥」全4回',
'series_id': 'F683_01',
'description': '朗読:浅井理アナウンサー',
'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg',
'upload_date': '20240106',
'release_date': '20240106',
'uploader': 'NHK R1',
'release_timestamp': 1704511800,
'channel': 'NHK R1',
'timestamp': 1704512700,
},
'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'],
'expected_warnings': ['Unable to download JSON metadata',
'Failed to get extended metadata. API returned Error 1: Invalid parameters'],
}, {
# news
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
'skip': 'Expires on 2023-04-17',
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173',
'info_dict': {
'id': 'F261_01_3855109',
'id': 'F261_01_4012173',
'ext': 'm4a',
'channel': 'NHKラジオ第1',
'uploader': 'NHKラジオ第1',
'timestamp': 1681635900,
'release_date': '20230416',
'series': 'NHKラジオニュース',
'title': '後6時のNHKニュース',
'title': '午前0時のNHKニュース',
'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
'upload_date': '20230416',
'release_timestamp': 1681635600,
'release_timestamp': 1718290800,
'release_date': '20240613',
'timestamp': 1718291400,
'upload_date': '20240613',
},
}, {
# fallback when extended metadata fails
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298',
'skip': 'Expires on 2024-06-07',
'info_dict': {
'id': '2834_01_4009298',
'title': 'まち☆キラ!開成町特集',
'ext': 'm4a',
'release_date': '20240531',
'upload_date': '20240531',
'series': 'はま☆キラ!',
'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg',
'channel': 'NHK R1,FM',
'description': '',
'timestamp': 1717123800,
'uploader': 'NHK R1,FM',
'release_timestamp': 1717120800,
'series_id': '2834_01',
},
'expected_warnings': ['Failed to get extended metadata. API returned empty list.'],
}]
_API_URL_TMPL = None
def _extract_extended_description(self, episode_id, episode):
service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')}))
aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str}))
def _extract_extended_metadata(self, episode_id, aa_vinfo):
service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')}))
detail_url = try_call(
lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3))
lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3]))
if not detail_url:
return
return {}
full_meta = traverse_obj(
self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False),
('list', service, 0, {dict})) or {}
return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta)
response = self._download_json(
detail_url, episode_id, 'Downloading extended metadata',
'Failed to download extended metadata', fatal=False, expected_status=400)
if not response:
return {}
def _extract_episode_info(self, headline, programme_id, series_meta):
if error := traverse_obj(response, ('error', {dict})):
self.report_warning(
'Failed to get extended metadata. API returned '
f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}')
return {}
full_meta = traverse_obj(response, ('list', service, 0, {dict}))
if not full_meta:
self.report_warning('Failed to get extended metadata. API returned empty list.')
return {}
station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None
thumbnails = [{
'id': str(id_),
'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1,
**traverse_obj(thumb, {
'url': 'url',
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
}),
} for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))]
return filter_dict({
'channel': station,
'uploader': station,
'description': join_nonempty(
'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta),
'thumbnails': thumbnails,
**traverse_obj(full_meta, {
'title': ('title', {str}),
'timestamp': ('end_time', {unified_timestamp}),
'release_timestamp': ('start_time', {unified_timestamp}),
}),
})
def _extract_episode_info(self, episode, programme_id, series_meta):
episode_id = f'{programme_id}_{episode["id"]}'
aa_vinfo = traverse_obj(episode, ('aa_contents_id', {lambda x: x.split(';')}))
extended_metadata = self._extract_extended_metadata(episode_id, aa_vinfo)
fallback_start_time, _, fallback_end_time = traverse_obj(
aa_vinfo, (4, {str}, {lambda x: (x or '').partition('_')}))
return {
**series_meta,
'id': episode_id,
'formats': self._extract_m3u8_formats(episode.get('stream_url'), episode_id, fatal=False),
'container': 'm4a_dash', # force fixup, AAC-only HLS
'was_live': True,
'title': episode.get('program_title'),
'description': episode.get('program_sub_title'), # fallback
'timestamp': unified_timestamp(fallback_end_time),
'release_timestamp': unified_timestamp(fallback_start_time),
**extended_metadata,
}
def _extract_news_info(self, headline, programme_id, series_meta):
episode_id = f'{programme_id}_{headline["headline_id"]}'
episode = traverse_obj(headline, ('file_list', 0, {dict}))
description = self._extract_extended_description(episode_id, episode)
if not description:
self.report_warning('Failed to get extended description, falling back to summary')
description = traverse_obj(episode, ('file_title_sub', {str}))
return {
**series_meta,
@@ -687,9 +763,9 @@ def _extract_episode_info(self, headline, programme_id, series_meta):
'was_live': True,
'series': series_meta.get('title'),
'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
'description': description,
**traverse_obj(episode, {
'title': 'file_title',
'title': ('file_title', {str}),
'description': ('file_title_sub', {str}),
'timestamp': ('open_time', {unified_timestamp}),
'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
}),
@@ -706,32 +782,58 @@ def _real_extract(self, url):
site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
programme_id = f'{site_id}_{corner_id}'
if site_id == 'F261':
json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
else:
json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'
if site_id == 'F261': # XXX: News programmes use old API (for now?)
meta = self._download_json(
'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main']
series_meta = traverse_obj(meta, {
'title': ('program_name', {str}),
'channel': ('media_name', {str}),
'uploader': ('media_name', {str}),
'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
}, get_all=False)
meta = self._download_json(json_url, programme_id)['main']
if headline_id:
headline = traverse_obj(
meta, ('detail_list', lambda _, v: v['headline_id'] == headline_id, any))
if not headline:
raise ExtractorError('Content not found; it has most likely expired', expected=True)
return self._extract_news_info(headline, programme_id, series_meta)
series_meta = traverse_obj(meta, {
'title': 'program_name',
'channel': 'media_name',
'uploader': 'media_name',
'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
}, get_all=False)
def news_entries():
for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
yield self._extract_news_info(headline, programme_id, series_meta)
return self.playlist_result(
news_entries(), programme_id, description=meta.get('site_detail'), **series_meta)
meta = self._download_json(
'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series', programme_id, query={
'site_id': site_id,
'corner_site_id': corner_id,
})
fallback_station = join_nonempty('NHK', traverse_obj(meta, ('radio_broadcast', {str})), delim=' ')
series_meta = {
'series': join_nonempty('title', 'corner_name', delim=' ', from_dict=meta),
'series_id': programme_id,
'thumbnail': traverse_obj(meta, ('thumbnail_url', {url_or_none})),
'channel': fallback_station,
'uploader': fallback_station,
}
if headline_id:
return self._extract_episode_info(
traverse_obj(meta, (
'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
programme_id, series_meta)
episode = traverse_obj(meta, ('episodes', lambda _, v: v['id'] == int(headline_id), any))
if not episode:
raise ExtractorError('Content not found; it has most likely expired', expected=True)
return self._extract_episode_info(episode, programme_id, series_meta)
def entries():
for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
yield self._extract_episode_info(headline, programme_id, series_meta)
for episode in traverse_obj(meta, ('episodes', ..., {dict})):
yield self._extract_episode_info(episode, programme_id, series_meta)
return self.playlist_result(
entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)
entries(), programme_id, title=series_meta.get('series'),
description=meta.get('series_description'), **series_meta)
class NhkRadioNewsPageIE(InfoExtractor):

View File

@@ -1,3 +1,5 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -6,6 +8,7 @@
traverse_obj,
unified_timestamp,
url_or_none,
urljoin,
)
@@ -21,75 +24,73 @@ class RTVSLOIE(InfoExtractor):
_API_BASE = 'https://api.rtvslo.si/ava/{}/{}?client_id=82013fb3a531d5414f478747c1aca622'
SUB_LANGS_MAP = {'Slovenski': 'sl'}
_TESTS = [
{
'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv',
'info_dict': {
'id': '174842550',
'ext': 'mp4',
'release_timestamp': 1643140032,
'upload_date': '20220125',
'series': 'Dnevnik',
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg',
'description': 'md5:76a18692757aeb8f0f51221106277dd2',
'timestamp': 1643137046,
'title': 'Dnevnik',
'series_id': '92',
'release_date': '20220125',
'duration': 1789,
},
}, {
'url': 'https://365.rtvslo.si/arhiv/utrip/174843754',
'info_dict': {
'id': '174843754',
'ext': 'mp4',
'series_id': '94',
'release_date': '20220129',
'timestamp': 1643484455,
'title': 'Utrip',
'duration': 813,
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg',
'description': 'md5:77f2892630c7b17bb7a5bb84319020c9',
'release_timestamp': 1643485825,
'upload_date': '20220129',
'series': 'Utrip',
},
}, {
'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609',
'info_dict': {
'id': '174844609',
'ext': 'mp3',
'series_id': '106615841',
'title': 'Il giornale della sera',
'duration': 1328,
'series': 'Il giornale della sera',
'timestamp': 1643743800,
'release_timestamp': 1643745424,
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg',
'upload_date': '20220201',
'tbr': 128000,
'release_date': '20220201',
},
}, {
'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750',
'info_dict': {
'id': '148350750',
'ext': 'mp4',
'title': 'Prvi šolski dan, mozaična oddaja za mlade',
'series': 'Razred zase',
'series_id': '148185730',
'duration': 1481,
'upload_date': '20121019',
'timestamp': 1350672122,
'release_date': '20121019',
'release_timestamp': 1350672122,
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg',
},
}, {
'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550',
'only_matching': True,
_TESTS = [{
'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv',
'info_dict': {
'id': '174842550',
'ext': 'mp4',
'release_timestamp': 1643140032,
'upload_date': '20220125',
'series': 'Dnevnik',
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg',
'description': 'md5:76a18692757aeb8f0f51221106277dd2',
'timestamp': 1643137046,
'title': 'Dnevnik',
'series_id': '92',
'release_date': '20220125',
'duration': 1789,
},
]
}, {
'url': 'https://365.rtvslo.si/arhiv/utrip/174843754',
'info_dict': {
'id': '174843754',
'ext': 'mp4',
'series_id': '94',
'release_date': '20220129',
'timestamp': 1643484455,
'title': 'Utrip',
'duration': 813,
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg',
'description': 'md5:77f2892630c7b17bb7a5bb84319020c9',
'release_timestamp': 1643485825,
'upload_date': '20220129',
'series': 'Utrip',
},
}, {
'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609',
'info_dict': {
'id': '174844609',
'ext': 'mp3',
'series_id': '106615841',
'title': 'Il giornale della sera',
'duration': 1328,
'series': 'Il giornale della sera',
'timestamp': 1643743800,
'release_timestamp': 1643745424,
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg',
'upload_date': '20220201',
'tbr': 128000,
'release_date': '20220201',
},
}, {
'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750',
'info_dict': {
'id': '148350750',
'ext': 'mp4',
'title': 'Prvi šolski dan, mozaična oddaja za mlade',
'series': 'Razred zase',
'series_id': '148185730',
'duration': 1481,
'upload_date': '20121019',
'timestamp': 1350672122,
'release_date': '20121019',
'release_timestamp': 1350672122,
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg',
},
}, {
'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550',
'only_matching': True,
}]
def _real_extract(self, url):
v_id = self._match_id(url)
@@ -164,3 +165,26 @@ def _real_extract(self, url):
'series': meta.get('showName'),
'series_id': meta.get('showId'),
}
class RTVSLOShowIE(InfoExtractor):
IE_NAME = 'rtvslo.si:show'
_VALID_URL = r'https?://(?:365|4d)\.rtvslo\.si/oddaja/[^/?#&]+/(?P<id>\d+)'
_TESTS = [{
'url': 'https://365.rtvslo.si/oddaja/ekipa-bled/173250997',
'info_dict': {
'id': '173250997',
'title': 'Ekipa Bled',
},
'playlist_count': 18,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
return self.playlist_from_matches(
re.findall(r'<a [^>]*\bhref="(/arhiv/[^"]+)"', webpage),
playlist_id, self._html_extract_title(webpage),
getter=lambda x: urljoin('https://365.rtvslo.si', x), ie=RTVSLOIE)

View File

@@ -95,7 +95,7 @@ def _update_client_id(self):
return
raise ExtractorError('Unable to extract client id')
def _download_json(self, *args, **kwargs):
def _call_api(self, *args, **kwargs):
non_fatal = kwargs.get('fatal') is False
if non_fatal:
del kwargs['fatal']
@@ -104,7 +104,7 @@ def _download_json(self, *args, **kwargs):
query['client_id'] = self._CLIENT_ID
kwargs['query'] = query
try:
return super()._download_json(*args, **kwargs)
return self._download_json(*args, **kwargs)
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
self._store_client_id(None)
@@ -163,7 +163,7 @@ def genNumBlock():
'user_agent': self._USER_AGENT
}
response = self._download_json(
response = self._call_api(
self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
None, note='Verifying login token...', fatal=False,
data=json.dumps(payload).encode())
@@ -217,12 +217,26 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f
query['secret_token'] = secret_token
if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'):
download_url = update_url_query(
self._API_V2_BASE + 'tracks/' + track_id + '/download', query)
redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri')
if redirect_url:
try:
# Do not use _call_api(); HTTP Error codes have different meanings for this request
download_data = self._download_json(
f'{self._API_V2_BASE}tracks/{track_id}/download', track_id,
'Downloading original download format info JSON', query=query, headers=self._HEADERS)
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
self.report_warning(
'Original download format is only available '
f'for registered users. {self._login_hint()}')
elif isinstance(e.cause, HTTPError) and e.cause.status == 403:
self.write_debug('Original download format is not available for this client')
else:
self.report_warning(e.msg)
download_data = None
if redirect_url := traverse_obj(download_data, ('redirectUri', {url_or_none})):
urlh = self._request_webpage(
HEADRequest(redirect_url), track_id, 'Checking for original download format', fatal=False)
HEADRequest(redirect_url), track_id, 'Checking original download format availability',
'Original download format is not available', fatal=False)
if urlh:
format_url = urlh.url
format_urls.add(format_url)
@@ -303,7 +317,7 @@ def add_format(f, protocol, is_preview=False):
stream = None
for retry in self.RetryManager(fatal=False):
try:
stream = self._download_json(
stream = self._call_api(
format_url, track_id, f'Downloading {identifier} format info JSON',
query=query, headers=self._HEADERS)
except ExtractorError as e:
@@ -630,7 +644,7 @@ def _real_extract(self, url):
resolve_title += f'/{token}'
info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
info = self._download_json(
info = self._call_api(
info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)
return self._extract_info_dict(info, full_title, token)
@@ -641,7 +655,7 @@ def _extract_set(self, playlist, token=None):
playlist_id = str(playlist['id'])
tracks = playlist.get('tracks') or []
if not all(t.get('permalink_url') for t in tracks) and token:
tracks = self._download_json(
tracks = self._call_api(
self._API_V2_BASE + 'tracks', playlist_id,
'Downloading tracks', query={
'ids': ','.join([str(t['id']) for t in tracks]),
@@ -699,7 +713,7 @@ def _real_extract(self, url):
if token:
full_title += '/' + token
info = self._download_json(self._resolv_url(
info = self._call_api(self._resolv_url(
self._BASE_URL + full_title), full_title, headers=self._HEADERS)
if 'errors' in info:
@@ -730,7 +744,7 @@ def _entries(self, url, playlist_id):
for i in itertools.count():
for retry in self.RetryManager():
try:
response = self._download_json(
response = self._call_api(
url, playlist_id, query=query, headers=self._HEADERS,
note=f'Downloading track page {i + 1}')
break
@@ -838,7 +852,7 @@ def _real_extract(self, url):
mobj = self._match_valid_url(url)
uploader = mobj.group('user')
user = self._download_json(
user = self._call_api(
self._resolv_url(self._BASE_URL + uploader),
uploader, 'Downloading user info', headers=self._HEADERS)
@@ -864,7 +878,7 @@ class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE):
def _real_extract(self, url):
user_id = self._match_id(url)
user = self._download_json(
user = self._call_api(
self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS)
return self._extract_playlist(
@@ -886,7 +900,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
def _real_extract(self, url):
track_name = self._match_id(url)
track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS)
track = self._call_api(self._resolv_url(url), track_name, headers=self._HEADERS)
track_id = self._search_regex(
r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
@@ -930,7 +944,7 @@ class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE):
def _real_extract(self, url):
slug, relation = self._match_valid_url(url).group('slug', 'relation')
track = self._download_json(
track = self._call_api(
self._resolv_url(self._BASE_URL + slug),
slug, 'Downloading track info', headers=self._HEADERS)
@@ -965,7 +979,7 @@ def _real_extract(self, url):
if token:
query['secret_token'] = token
data = self._download_json(
data = self._call_api(
self._API_V2_BASE + 'playlists/' + playlist_id,
playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)
@@ -1000,7 +1014,7 @@ def _get_collection(self, endpoint, collection_id, **query):
next_url = update_url_query(self._API_V2_BASE + endpoint, query)
for i in itertools.count(1):
response = self._download_json(
response = self._call_api(
next_url, collection_id, f'Downloading page {i}',
'Unable to download API page', headers=self._HEADERS)

View File

@@ -213,8 +213,19 @@ def _extract_aweme_app(self, aweme_id):
return self._parse_aweme_video_app(aweme_detail)
def _extract_web_data_and_status(self, url, video_id, fatal=True):
webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or ''
video_data, status = {}, None
video_data, status = {}, -1
res = self._download_webpage_handle(url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'})
if res is False:
return video_data, status
webpage, urlh = res
if urllib.parse.urlparse(urlh.url).path == '/login':
message = 'TikTok is requiring login for access to this content'
if fatal:
self.raise_login_required(message)
self.report_warning(f'{message}. {self._login_hint()}')
return video_data, status
if universal_data := self._get_universal_data(webpage, video_id):
self.write_debug('Found universal data for rehydration')
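The new check reduces to inspecting the path of the final URL after redirects; a standalone sketch with a hypothetical URL:

import urllib.parse

final_url = 'https://www.tiktok.com/login?redirect_url=https%3A%2F%2Fwww.tiktok.com%2F%40user'
# a redirect landing on /login means the content is login-gated
if urllib.parse.urlparse(final_url).path == '/login':
    print('TikTok is requiring login for access to this content')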

View File

@@ -13,6 +13,7 @@
class TubiTvIE(InfoExtractor):
IE_NAME = 'tubitv'
_VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?P<type>video|movies|tv-shows)/(?P<id>\d+)'
_LOGIN_URL = 'http://tubitv.com/login'
_NETRC_MACHINE = 'tubitv'
@@ -148,30 +149,54 @@ def _real_extract(self, url):
class TubiTvShowIE(InfoExtractor):
_WORKING = False
_VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/[0-9]+/(?P<show_name>[^/?#]+)'
IE_NAME = 'tubitv:series'
_VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/\d+/(?P<show_name>[^/?#]+)(?:/season-(?P<season>\d+))?'
_TESTS = [{
'url': 'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true',
'playlist_mincount': 390,
'playlist_mincount': 389,
'info_dict': {
'id': 'the-joy-of-painting-with-bob-ross',
},
}, {
'url': 'https://tubitv.com/series/2311/the-saddle-club/season-1',
'playlist_count': 26,
'info_dict': {
'id': 'the-saddle-club-season-1',
},
}, {
'url': 'https://tubitv.com/series/2311/the-saddle-club/season-3',
'playlist_count': 19,
'info_dict': {
'id': 'the-saddle-club-season-3',
},
}, {
'url': 'https://tubitv.com/series/2311/the-saddle-club/',
'playlist_mincount': 71,
'info_dict': {
'id': 'the-saddle-club',
},
}]
def _entries(self, show_url, show_name):
show_webpage = self._download_webpage(show_url, show_name)
def _entries(self, show_url, playlist_id, selected_season):
webpage = self._download_webpage(show_url, playlist_id)
show_json = self._parse_json(self._search_regex(
r'window\.__data\s*=\s*({[^<]+});\s*</script>',
show_webpage, 'data'), show_name, transform_source=js_to_json)['video']
data = self._search_json(
r'window\.__data\s*=', webpage, 'data', playlist_id,
transform_source=js_to_json)['video']
for episode_id in show_json['fullContentById']:
if traverse_obj(show_json, ('byId', episode_id, 'type')) == 's':
continue
yield self.url_result(
f'https://tubitv.com/tv-shows/{episode_id}/',
ie=TubiTvIE.ie_key(), video_id=episode_id)
# v['number'] is already a decimal string, but stringify to protect against API changes
path = [lambda _, v: str(v['number']) == selected_season] if selected_season else [..., {dict}]
for season in traverse_obj(data, ('byId', lambda _, v: v['type'] == 's', 'seasons', *path)):
season_number = int_or_none(season.get('number'))
for episode in traverse_obj(season, ('episodes', lambda _, v: v['id'])):
episode_id = episode['id']
yield self.url_result(
f'https://tubitv.com/tv-shows/{episode_id}/', TubiTvIE, episode_id,
season_number=season_number, episode_number=int_or_none(episode.get('num')))
def _real_extract(self, url):
show_name = self._match_valid_url(url).group('show_name')
return self.playlist_result(self._entries(url, show_name), playlist_id=show_name)
playlist_id, selected_season = self._match_valid_url(url).group('show_name', 'season')
if selected_season:
playlist_id = f'{playlist_id}-season-{selected_season}'
return self.playlist_result(self._entries(url, playlist_id, selected_season), playlist_id)
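A standalone illustration of the season-selection path above, run against hypothetical API data:

from yt_dlp.utils import traverse_obj

data = {'byId': {'x': {'type': 's', 'seasons': [
    {'number': '1', 'episodes': [{'id': '100'}]},
    {'number': '3', 'episodes': [{'id': '300'}]},
]}}}
selected_season = '3'
path = [lambda _, v: str(v['number']) == selected_season] if selected_season else [..., {dict}]
print(traverse_obj(data, ('byId', lambda _, v: v['type'] == 's', 'seasons', *path)))
# -> [{'number': '3', 'episodes': [{'id': '300'}]}]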

View File

@@ -885,14 +885,14 @@ def _get_count(self, data, *path_list):
return count
@staticmethod
def _extract_thumbnails(data, *path_list):
def _extract_thumbnails(data, *path_list, final_key='thumbnails'):
"""
Extract thumbnails from thumbnails dict
@param path_list: path list to level that contains 'thumbnails' key
"""
thumbnails = []
for path in path_list or [()]:
for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)):
for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)):
thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url:
continue
@@ -5124,6 +5124,10 @@ def _extract_metadata_from_tabs(self, item_id, data):
else:
metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict)
# pageHeaderViewModel slow rollout began April 2024
page_header_view_model = traverse_obj(data, (
'header', 'pageHeaderRenderer', 'content', 'pageHeaderViewModel', {dict}))
# We can get the uncropped banner/avatar by replacing the crop params with '=s0'
# See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714
def _get_uncropped(url):
@@ -5139,8 +5143,10 @@ def _get_uncropped(url):
'preference': 1,
})
channel_banners = self._extract_thumbnails(
data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner')))
channel_banners = (
self._extract_thumbnails(data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner')))
or self._extract_thumbnails(
page_header_view_model, ('banner', 'imageBannerViewModel', 'image'), final_key='sources'))
for banner in channel_banners:
banner['preference'] = -10
@@ -5167,7 +5173,11 @@ def _get_uncropped(url):
or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag'))
or info['id']),
'availability': self._extract_availability(data),
'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
'channel_follower_count': (
self._get_count(data, ('header', ..., 'subscriberCountText'))
or traverse_obj(page_header_view_model, (
'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts',
lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))),
'description': try_get(metadata_renderer, lambda x: x.get('description', '')),
'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str}))
or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))),
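As an aside on the '=s0' comment above, a sketch of that trick on a hypothetical thumbnail URL (the exact substitution used by _get_uncropped may differ):

import re

url = 'https://yt3.googleusercontent.com/abc123=s176-c-k-c0x00ffffff-no-rj'
# replacing the sizing/crop params with '=s0' requests the uncropped original
print(re.sub(r'=s\d+[^/?#]*$', '=s0', url))
# -> https://yt3.googleusercontent.com/abc123=s0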

View File

@@ -1048,6 +1048,34 @@ def _ffmpeg_args_for_chapter(self, number, chapter, info):
['-ss', str(chapter['start_time']),
'-t', str(chapter['end_time'] - chapter['start_time'])])
# Extends opts with chapter-specific metadata for the supported formats.
#
# Tested and supported on opus, m4a, webm and mp4.
def _set_metadata_arg(self, opts, ext, key, value):
if ext == 'opus':
# An opus file requires a stream to keep title, artist etc. metadata;
# FFmpegMetadataPP has already set the metadata and created that stream.
# Further metadata updates must be set on the stream (-metadata:s),
# since a plain -metadata would do nothing here.
opts.extend(['-metadata:s', f'{key}={value}'])
elif ext in ['m4a', 'webm', 'mp4']:
opts.extend(['-metadata', f'{key}={value}'])
# FFmpeg copies the parent file's metadata about all chapters into every split m4a file.
# This is incorrect, since each file must contain only a single chapter after the split;
# such behavior confuses players into thinking multiple chapters are present.
def _set_out_opts(self, ext, chapter_title, track_number):
out_opts = [*self.stream_copy_opts()]
out_opts.extend(['-map_metadata', '0'])
# exclude chapter metadata but keep everything else
out_opts.extend(['-map_chapters', '-1'])
# replace the global title with the chapter-specific title in split files
if chapter_title:
self._set_metadata_arg(out_opts, ext, 'title', chapter_title)
self._set_metadata_arg(out_opts, ext, 'track', track_number)
return out_opts
@PostProcessor._restrict_to(images=False)
def run(self, info):
self._fixup_chapters(info)
@@ -1062,7 +1090,8 @@ def run(self, info):
self.to_screen(f'Splitting video by chapters; {len(chapters)} chapters found')
for idx, chapter in enumerate(chapters):
destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())])
out_file_opts = self._set_out_opts(info['ext'], chapter.get('title', ''), str(idx + 1))
self.real_run_ffmpeg([(in_file, opts)], [(destination, out_file_opts)])
if in_file != info['filepath']:
self._delete_downloaded_files(in_file, msg=None)
return [], info
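A runnable restatement of the per-chapter option building above, for illustration; stream_copy_opts() is replaced here by its usual ['-c', 'copy']:

def build_out_opts(ext, chapter_title, track_number):
    opts = ['-c', 'copy', '-map_metadata', '0', '-map_chapters', '-1']
    # opus keeps its tags on the stream, so the per-stream form is required
    meta = '-metadata:s' if ext == 'opus' else '-metadata'
    if chapter_title:
        opts += [meta, f'title={chapter_title}']
    opts += [meta, f'track={track_number}']
    return opts

print(build_out_opts('opus', 'Intro', '1'))
# ['-c', 'copy', '-map_metadata', '0', '-map_chapters', '-1',
#  '-metadata:s', 'title=Intro', '-metadata:s', 'track=1']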