Compare commits

...

17 Commits

Author SHA1 Message Date
danilovsergei
f135f9371e
Merge 523c7e1d44 into 5dbac313ae 2024-06-16 16:29:53 +05:30
bashonly
5dbac313ae [ie/generic] Add key_query extractor-arg
Authored by: bashonly
2024-06-15 18:38:02 -05:00
bashonly
ca8885edd9 [fd/hls] Apply extra_param_to_key_url from info dict
Authored by: bashonly
2024-06-15 18:38:02 -05:00
c-basalt
4093eb1fcc
[ie/khanacademy] Fix extractors (#9136)
Closes #8775
Authored by: c-basalt
2024-06-15 21:51:27 +02:00
bashonly
a0d9967f68
[ie/youtube:tab] Fix channel metadata extraction (#10071)
Closes #9893, Closes #10090
Authored by: bashonly, shoxie007

Co-authored-by: shoxie007 <74592022+shoxie007@users.noreply.github.com>
2024-06-13 23:22:30 +00:00
bashonly
ea88129784
[ie/tiktok] Detect and raise when login is required (#10124)
Authored by: bashonly
2024-06-13 23:16:43 +00:00
garret1317
b8e2a5e0e1
[ie/NHKRadiru] Fix extractor (#10106)
Closes #10105
Authored by: garret1317
2024-06-13 23:08:40 +00:00
bashonly
e53e56b735
[ie/soundcloud] Fix download format extraction (#10125)
Authored by: bashonly
2024-06-13 23:01:19 +00:00
JSubelj
92a1c4abae
[ie/rtvslo.si:show] Add extractor (#8418)
Authored by: JSubelj, seproDev

Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com>
2024-06-14 00:51:12 +02:00
bashonly
3690c2f598
[ie/francetv] Detect and raise errors for DRM (#10165)
Closes #10163
Authored by: bashonly
2024-06-13 22:44:20 +00:00
bashonly
081708d607
[ie/francetv] Fix extractor (#10177)
Closes #10175
Authored by: bashonly
2024-06-13 22:31:13 +00:00
bashonly
d7d861811c
[ie/tubitv:series] Fix extractor (#10116)
Closes #8563
Authored by: bashonly
2024-06-13 21:59:17 +00:00
bashonly
46c1b7cfec
[build] Cache dependencies for macos job (#10088)
Authored by: bashonly
2024-06-13 21:13:08 +00:00
sergeidanilov
523c7e1d44 Fixed comments and added more supported formats 2023-11-12 23:40:55 -08:00
sergeidanilov
fc46f24dc5 fix pull request lint comments
cleaned up implementation and comments
2023-10-25 23:18:49 -07:00
danilovsergei
c29caf1739
Merge branch 'yt-dlp:master' into split-chapters-metadata-m4a-fix 2023-10-25 23:04:21 -07:00
danilovsergei
bb9ec9a24d
Fix https://github.com/yt-dlp/yt-dlp/issues/8363 by manually adding metadata during ffmpeg split chapters execution 2023-10-16 22:35:04 -07:00
20 changed files with 615 additions and 266 deletions

View File

@@ -237,27 +237,43 @@ jobs:
   macos:
     needs: process
     if: inputs.macos
+    permissions:
+      contents: read
+      actions: write  # For cleaning up cache
     runs-on: macos-12

     steps:
       - uses: actions/checkout@v4
       # NB: Building universal2 does not work with python from actions/setup-python
+      - name: Restore cached requirements
+        id: restore-cache
+        uses: actions/cache/restore@v4
+        env:
+          SEGMENT_DOWNLOAD_TIMEOUT_MINS: 1
+        with:
+          path: |
+            ~/yt-dlp-build-venv
+          key: cache-reqs-${{ github.job }}
       - name: Install Requirements
         run: |
           brew install coreutils
-          python3 devscripts/install_deps.py --user -o --include build
+          python3 -m venv ~/yt-dlp-build-venv
+          source ~/yt-dlp-build-venv/bin/activate
+          python3 devscripts/install_deps.py -o --include build
           python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt
           # We need to ignore wheels otherwise we break universal2 builds
-          python3 -m pip install -U --user --no-binary :all: -r requirements.txt
+          python3 -m pip install -U --no-binary :all: -r requirements.txt
           # We need to fuse our own universal2 wheels for curl_cffi
-          python3 -m pip install -U --user delocate
+          python3 -m pip install -U delocate
           mkdir curl_cffi_whls curl_cffi_universal2
           python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt
           for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do
             python3 -m pip download \
               --only-binary=:all: \
               --platform "${platform}" \
-              --pre -d curl_cffi_whls \
+              -d curl_cffi_whls \
               -r requirements.txt
           done
           ( # Overwrite x86_64-only libs with fat/universal2 libs or else Pyinstaller will do the opposite
@@ -274,9 +290,10 @@ jobs:
           )
           python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/curl_cffi*.whl -w curl_cffi_universal2
           python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/cffi*.whl -w curl_cffi_universal2
-          cd curl_cffi_universal2
-          for wheel in ./*cffi*.whl; do mv -n -- "${wheel}" "${wheel/x86_64/universal2}"; done
-          python3 -m pip install -U --user ./*cffi*.whl
+          for wheel in curl_cffi_universal2/*cffi*.whl; do
+            mv -n -- "${wheel}" "${wheel/x86_64/universal2}"
+          done
+          python3 -m pip install --force-reinstall -U curl_cffi_universal2/*cffi*.whl

       - name: Prepare
         run: |
@@ -284,6 +301,7 @@ jobs:
           python3 devscripts/make_lazy_extractors.py
       - name: Build
         run: |
+          source ~/yt-dlp-build-venv/bin/activate
          python3 -m bundle.pyinstaller --target-architecture universal2 --onedir
          (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .)
          python3 -m bundle.pyinstaller --target-architecture universal2
@@ -307,6 +325,24 @@ jobs:
            dist/yt-dlp_macos.zip
          compression-level: 0

+      - name: Cleanup cache
+        if: steps.restore-cache.outputs.cache-hit == 'true'
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          cache_key: cache-reqs-${{ github.job }}
+          repository: ${{ github.repository }}
+          branch: ${{ github.ref }}
+        run: |
+          gh extension install actions/gh-actions-cache
+          gh actions-cache delete "${cache_key}" -R "${repository}" -B "${branch}" --confirm
+
+      - name: Cache requirements
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            ~/yt-dlp-build-venv
+          key: cache-reqs-${{ github.job }}
+
   macos_legacy:
     needs: process
     if: inputs.macos_legacy

View File

@@ -24,6 +24,7 @@ jobs:
     source: master
     permissions:
       contents: write
-      packages: write
+      packages: write # For package cache
+      actions: write # For cleaning up cache
       id-token: write # mandatory for trusted publishing
     secrets: inherit

View File

@@ -37,6 +37,7 @@ jobs:
     source: nightly
     permissions:
       contents: write
-      packages: write
+      packages: write # For package cache
+      actions: write # For cleaning up cache
       id-token: write # mandatory for trusted publishing
     secrets: inherit

View File

@@ -229,6 +229,7 @@ jobs:
     permissions:
       contents: read
       packages: write # For package cache
+      actions: write # For cleaning up cache
     secrets:
       GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }}

View File

@@ -1779,8 +1779,9 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.)
 * `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off

 #### generic
-* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg
+* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Note that if the stream has an HLS AES-128 key, then the query parameters will be passed to the key URI as well, unless the `key_query` extractor-arg is passed, or unless an external key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg
 * `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE`
+* `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg
 * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist
 * `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live`
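For context on how these `generic` extractor-args are consumed, here is a minimal sketch of passing `key_query` through yt-dlp's Python embedding API; the manifest URL and token value are placeholders, and the CLI equivalent would be `--extractor-args "generic:key_query=token=SAMPLE"`.

import yt_dlp

# Placeholder manifest URL and token; key_query applies the given query string
# to the HLS AES-128 key URI, as described in the README entries above.
ydl_opts = {
    'extractor_args': {
        'generic': {
            'key_query': ['token=SAMPLE'],
        },
    },
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/stream.m3u8'])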

View File

@@ -581,8 +581,9 @@ class YoutubeDL:
         'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
         'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data',
         'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies',
-        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options',
-        'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
+        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url',
+        'hls_aes', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version',
+        'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time',
     }
     _deprecated_multivalue_fields = {
         'album_artist': 'album_artists',

View File

@@ -108,7 +108,7 @@ def supports(cls, info_dict):
         return all((
             not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES,
             '+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES,
-            not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url'),
+            not traverse_obj(info_dict, ('hls_aes', ...), 'extra_param_to_segment_url', 'extra_param_to_key_url'),
             all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')),
         ))

View File

@@ -160,10 +160,12 @@ def is_ad_fragment_end(s):
         extra_state = ctx.setdefault('extra_state', {})

         format_index = info_dict.get('format_index')
-        extra_query = None
-        extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
-        if extra_param_to_segment_url:
-            extra_query = urllib.parse.parse_qs(extra_param_to_segment_url)
+        extra_segment_query = None
+        if extra_param_to_segment_url := info_dict.get('extra_param_to_segment_url'):
+            extra_segment_query = urllib.parse.parse_qs(extra_param_to_segment_url)
+        extra_key_query = None
+        if extra_param_to_key_url := info_dict.get('extra_param_to_key_url'):
+            extra_key_query = urllib.parse.parse_qs(extra_param_to_key_url)
         i = 0
         media_sequence = 0
         decrypt_info = {'METHOD': 'NONE'}
@@ -190,8 +192,8 @@ def is_ad_fragment_end(s):
                     if frag_index <= ctx['fragment_index']:
                         continue
                     frag_url = urljoin(man_url, line)
-                    if extra_query:
-                        frag_url = update_url_query(frag_url, extra_query)
+                    if extra_segment_query:
+                        frag_url = update_url_query(frag_url, extra_segment_query)

                     fragments.append({
                         'frag_index': frag_index,
@@ -212,8 +214,8 @@ def is_ad_fragment_end(s):
                     frag_index += 1
                     map_info = parse_m3u8_attributes(line[11:])
                     frag_url = urljoin(man_url, map_info.get('URI'))
-                    if extra_query:
-                        frag_url = update_url_query(frag_url, extra_query)
+                    if extra_segment_query:
+                        frag_url = update_url_query(frag_url, extra_segment_query)

                     if map_info.get('BYTERANGE'):
                         splitted_byte_range = map_info.get('BYTERANGE').split('@')
@@ -244,8 +246,10 @@ def is_ad_fragment_end(s):
                         decrypt_info['KEY'] = external_aes_key
                     else:
                         decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI'])
-                        if extra_query:
-                            decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
+                        if extra_key_query or extra_segment_query:
+                            # Fall back to extra_segment_query to key for backwards compat
+                            decrypt_info['URI'] = update_url_query(
+                                decrypt_info['URI'], extra_key_query or extra_segment_query)
                     if decrypt_url != decrypt_info['URI']:
                         decrypt_info['KEY'] = None
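As a standalone illustration of the fallback implemented above (the key URI receives `extra_param_to_key_url` when present, otherwise `extra_param_to_segment_url` for backwards compatibility), here is a hedged sketch using only the standard library; `update_url_query` below is a simplified stand-in for the yt_dlp.utils helper of the same name, and the URLs and query strings are invented.

import urllib.parse

def update_url_query(url, query):
    # Merge `query` (a parse_qs-style dict) into the URL's existing query string
    parts = urllib.parse.urlparse(url)
    merged = urllib.parse.parse_qs(parts.query)
    merged.update(query)
    return parts._replace(query=urllib.parse.urlencode(merged, doseq=True)).geturl()

extra_segment_query = urllib.parse.parse_qs('token=seg123')  # from extra_param_to_segment_url
extra_key_query = None                                       # from extra_param_to_key_url, if any

key_uri = 'https://example.com/aes128.key'
if extra_key_query or extra_segment_query:
    key_uri = update_url_query(key_uri, extra_key_query or extra_segment_query)
print(key_uri)  # https://example.com/aes128.key?token=seg123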

View File

@@ -1755,7 +1755,10 @@
     RTVETelevisionIE,
 )
 from .rtvs import RTVSIE
-from .rtvslo import RTVSLOIE
+from .rtvslo import (
+    RTVSLOIE,
+    RTVSLOShowIE,
+)
 from .rudovideo import RudoVideoIE
 from .rule34video import Rule34VideoIE
 from .rumble import (

View File

@@ -234,7 +234,14 @@ class InfoExtractor:
                                 'maybe' if the format may have DRM and has to be tested before download.
                     * extra_param_to_segment_url  A query string to append to each
                                 fragment's URL, or to update each existing query string
-                                with. Only applied by the native HLS/DASH downloaders.
+                                with. If it is an HLS stream with an AES-128 decryption key,
+                                the query paramaters will be passed to the key URI as well,
+                                unless there is an `extra_param_to_key_url` given,
+                                or unless an external key URI is provided via `hls_aes`.
+                                Only applied by the native HLS/DASH downloaders.
+                    * extra_param_to_key_url  A query string to append to the URL
+                                of the format's HLS AES-128 decryption key.
+                                Only applied by the native HLS downloader.
                     * hls_aes   A dictionary of HLS AES-128 decryption information
                                 used by the native HLS downloader to override the
                                 values in the media playlist when an '#EXT-X-KEY' tag
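For illustration only, a hypothetical format dict carrying both of the fields documented above; the URL and query values are made up, and only the native downloaders honour them as described.

format_info = {
    'url': 'https://example.com/master.m3u8',       # hypothetical HLS manifest
    'protocol': 'm3u8_native',
    'extra_param_to_segment_url': 'token=seg123',   # appended to every fragment URL
    'extra_param_to_key_url': 'keytoken=key456',    # appended only to the AES-128 key URI
}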

View File

@@ -5,6 +5,7 @@
 from .dailymotion import DailymotionIE
 from ..networking import HEADRequest
 from ..utils import (
+    clean_html,
     determine_ext,
     filter_dict,
     format_field,
@@ -33,6 +34,7 @@ class FranceTVIE(InfoExtractor):
     _GEO_BYPASS = False

     _TESTS = [{
+        # tokenized url is in dinfo['video']['token']
         'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1',
         'info_dict': {
             'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
@@ -44,6 +46,19 @@ class FranceTVIE(InfoExtractor):
             'upload_date': '20170813',
         },
         'params': {'skip_download': 'm3u8'},
+    }, {
+        # tokenized url is in dinfo['video']['token']['akamai']
+        'url': 'francetv:c5bda21d-2c6f-4470-8849-3d8327adb2ba',
+        'info_dict': {
+            'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba',
+            'ext': 'mp4',
+            'title': '13h15, le dimanche... - Les mystères de Jésus',
+            'timestamp': 1514118300,
+            'duration': 2880,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20171224',
+        },
+        'params': {'skip_download': 'm3u8'},
     }, {
         'url': 'francetv:162311093',
         'only_matching': True,
@@ -68,6 +83,7 @@ class FranceTVIE(InfoExtractor):
     def _extract_video(self, video_id, hostname=None):
         is_live = None
         videos = []
+        drm_formats = False
         title = None
         subtitle = None
         episode_number = None
@@ -85,13 +101,12 @@ def _extract_video(self, video_id, hostname=None):
                     'device_type': device_type,
                     'browser': browser,
                     'domain': hostname,
-                }), fatal=False)
+                }), fatal=False, expected_status=422)  # 422 json gives detailed error code/message

             if not dinfo:
                 continue

-            video = traverse_obj(dinfo, ('video', {dict}))
-            if video:
+            if video := traverse_obj(dinfo, ('video', {dict})):
                 videos.append(video)
                 if duration is None:
                     duration = video.get('duration')
@@ -99,9 +114,19 @@ def _extract_video(self, video_id, hostname=None):
                     is_live = video.get('is_live')
                 if spritesheets is None:
                     spritesheets = video.get('spritesheets')
+            elif code := traverse_obj(dinfo, ('code', {int})):
+                if code == 2009:
+                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                elif code in (2015, 2017):
+                    # 2015: L'accès à cette vidéo est impossible. (DRM-only)
+                    # 2017: Cette vidéo n'est pas disponible depuis le site web mobile (b/c DRM)
+                    drm_formats = True
+                    continue
+                self.report_warning(
+                    f'{self.IE_NAME} said: {code} "{clean_html(dinfo.get("message"))}"')
+                continue

-            meta = traverse_obj(dinfo, ('meta', {dict}))
-            if meta:
+            if meta := traverse_obj(dinfo, ('meta', {dict})):
                 if title is None:
                     title = meta.get('title')
                 # meta['pre_title'] contains season and episode number for series in format "S<ID> E<ID>"
@@ -114,12 +139,15 @@ def _extract_video(self, video_id, hostname=None):
                 if timestamp is None:
                     timestamp = parse_iso8601(meta.get('broadcasted_at'))

+        if not videos and drm_formats:
+            self.report_drm(video_id)
+
         formats, subtitles, video_url = [], {}, None
         for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])):
             video_url = video['url']
             format_id = video.get('format')

-            if token_url := url_or_none(video.get('token')):
+            if token_url := traverse_obj(video, ('token', (None, 'akamai'), {url_or_none}, any)):
                 tokenized_url = traverse_obj(self._download_json(
                     token_url, video_id, f'Downloading signed {format_id} manifest URL',
                     fatal=False, query={
@@ -225,13 +253,13 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
     _TESTS = [{
         'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
         'info_dict': {
-            'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
+            'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba',
             'ext': 'mp4',
             'title': '13h15, le dimanche... - Les mystères de Jésus',
-            'timestamp': 1502623500,
-            'duration': 2580,
+            'timestamp': 1514118300,
+            'duration': 2880,
             'thumbnail': r're:^https?://.*\.jpg$',
-            'upload_date': '20170813',
+            'upload_date': '20171224',
         },
         'params': {
             'skip_download': True,

View File

@@ -2167,7 +2167,15 @@ def _extra_manifest_info(self, info, manifest_url):
                 urllib.parse.urlparse(fragment_query).query or fragment_query
                 or urllib.parse.urlparse(manifest_url).query or None)

-        hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None
+        key_query = self._configuration_arg('key_query', [None], casesense=True)[0]
+        if key_query is not None:
+            info['extra_param_to_key_url'] = (
+                urllib.parse.urlparse(key_query).query or key_query
+                or urllib.parse.urlparse(manifest_url).query or None)
+
+        def hex_or_none(value):
+            return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None
+
         info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
             'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
         }) or None

View File

@@ -3,43 +3,52 @@
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    make_archive_id,
     parse_iso8601,
-    try_get,
+    str_or_none,
+    traverse_obj,
+    url_or_none,
+    urljoin,
 )


 class KhanAcademyBaseIE(InfoExtractor):
     _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'

+    _PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70'
+
     def _parse_video(self, video):
         return {
             '_type': 'url_transparent',
             'url': video['youtubeId'],
-            'id': video.get('slug'),
-            'title': video.get('title'),
-            'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
-            'duration': int_or_none(video.get('duration')),
-            'description': video.get('description'),
+            'id': video['youtubeId'],
             'ie_key': 'Youtube',
+            **traverse_obj(video, {
+                'display_id': ('id', {str_or_none}),
+                'title': ('translatedTitle', {str}),
+                'thumbnail': ('thumbnailUrls', ..., 'url', {url_or_none}),
+                'duration': ('duration', {int_or_none}),
+                'description': ('description', {str}),
+            }, get_all=False),
         }

     def _real_extract(self, url):
         display_id = self._match_id(url)
         content = self._download_json(
-            'https://www.khanacademy.org/api/internal/graphql/FetchContentData',
-            display_id, query={
+            'https://www.khanacademy.org/api/internal/graphql/ContentForPath', display_id,
+            query={
                 'fastly_cacheable': 'persist_until_publish',
-                'hash': '4134764944',
-                'lang': 'en',
+                'pcv': self._PUBLISHED_CONTENT_VERSION,
+                'hash': '1242644265',
                 'variables': json.dumps({
                     'path': display_id,
-                    'queryParams': 'lang=en',
-                    'isModal': False,
-                    'followRedirects': True,
                     'countryCode': 'US',
+                    'kaLocale': 'en',
+                    'clientPublishedContentVersion': self._PUBLISHED_CONTENT_VERSION,
                 }),
-            })['data']['contentJson']
-        return self._parse_component_props(self._parse_json(content, display_id)['componentProps'])
+                'lang': 'en',
+            })['data']['contentRoute']['listedPathData']
+        return self._parse_component_props(content, display_id)


 class KhanAcademyIE(KhanAcademyBaseIE):
@@ -47,64 +56,98 @@ class KhanAcademyIE(KhanAcademyBaseIE):
     _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
     _TEST = {
         'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
-        'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
+        'md5': '1d5c2e70fa6aa29c38eca419f12515ce',
         'info_dict': {
             'id': 'FlIG3TvQCBQ',
             'ext': 'mp4',
             'title': 'The one-time pad',
             'description': 'The perfect cipher',
+            'display_id': '716378217',
             'duration': 176,
-            'uploader': 'Brit Cruise',
-            'uploader_id': 'khanacademy',
+            'uploader': 'Khan Academy',
+            'uploader_id': '@khanacademy',
+            'uploader_url': 'https://www.youtube.com/@khanacademy',
             'upload_date': '20120411',
             'timestamp': 1334170113,
             'license': 'cc-by-nc-sa',
+            'live_status': 'not_live',
+            'channel': 'Khan Academy',
+            'channel_id': 'UC4a-Gbdw7vOaccHmFo40b9g',
+            'channel_url': 'https://www.youtube.com/channel/UC4a-Gbdw7vOaccHmFo40b9g',
+            'channel_is_verified': True,
+            'playable_in_embed': True,
+            'categories': ['Education'],
+            'creators': ['Brit Cruise'],
+            'tags': [],
+            'age_limit': 0,
+            'availability': 'public',
+            'comment_count': int,
+            'channel_follower_count': int,
+            'thumbnail': str,
+            'view_count': int,
+            'like_count': int,
+            'heatmap': list,
         },
         'add_ie': ['Youtube'],
     }

-    def _parse_component_props(self, component_props):
-        video = component_props['tutorialPageData']['contentModel']
-        info = self._parse_video(video)
-        author_names = video.get('authorNames')
-        info.update({
-            'uploader': ', '.join(author_names) if author_names else None,
-            'timestamp': parse_iso8601(video.get('dateAdded')),
-            'license': video.get('kaUserLicense'),
-        })
-        return info
+    def _parse_component_props(self, component_props, display_id):
+        video = component_props['content']
+        return {
+            **self._parse_video(video),
+            **traverse_obj(video, {
+                'creators': ('authorNames', ..., {str}),
+                'timestamp': ('dateAdded', {parse_iso8601}),
+                'license': ('kaUserLicense', {str}),
+            }),
+        }


 class KhanAcademyUnitIE(KhanAcademyBaseIE):
     IE_NAME = 'khanacademy:unit'
-    _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
-    _TEST = {
+    _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('1,2', '')) + '/?(?:[?#&]|$)'
+    _TESTS = [{
         'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
         'info_dict': {
-            'id': 'cryptography',
+            'id': 'x48c910b6',
             'title': 'Cryptography',
             'description': 'How have humans protected their secret messages through history? What has changed today?',
+            'display_id': 'computing/computer-science/cryptography',
+            '_old_archive_ids': ['khanacademyunit cryptography'],
         },
         'playlist_mincount': 31,
-    }
+    }, {
+        'url': 'https://www.khanacademy.org/computing/computer-science',
+        'info_dict': {
+            'id': 'x301707a0',
+            'title': 'Computer science theory',
+            'description': 'md5:4b472a4646e6cf6ec4ccb52c4062f8ba',
+            'display_id': 'computing/computer-science',
+            '_old_archive_ids': ['khanacademyunit computer-science'],
+        },
+        'playlist_mincount': 50,
+    }]

-    def _parse_component_props(self, component_props):
-        curation = component_props['curation']
-
-        entries = []
-        tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
-        for tutorial_number, tutorial in enumerate(tutorials, 1):
-            chapter_info = {
-                'chapter': tutorial.get('title'),
-                'chapter_number': tutorial_number,
-                'chapter_id': tutorial.get('id'),
-            }
-            for content_item in (tutorial.get('contentItems') or []):
-                if content_item.get('kind') == 'Video':
-                    info = self._parse_video(content_item)
-                    info.update(chapter_info)
-                    entries.append(info)
+    def _parse_component_props(self, component_props, display_id):
+        course = component_props['course']
+        selected_unit = traverse_obj(course, (
+            'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course
+
+        def build_entry(entry):
+            return self.url_result(urljoin(
+                'https://www.khanacademy.org', entry['canonicalUrl']),
+                KhanAcademyIE, title=entry.get('translatedTitle'))
+
+        entries = traverse_obj(selected_unit, (
+            (('unitChildren', ...), None), 'allOrderedChildren', ..., 'curatedChildren',
+            lambda _, v: v['contentKind'] == 'Video' and v['canonicalUrl'], {build_entry}))

         return self.playlist_result(
-            entries, curation.get('unit'), curation.get('title'),
-            curation.get('description'))
+            entries,
+            display_id=display_id,
+            **traverse_obj(selected_unit, {
+                'id': ('id', {str}),
+                'title': ('translatedTitle', {str}),
+                'description': ('translatedDescription', {str}),
+                '_old_archive_ids': ('slug', {str}, {lambda x: [make_archive_id(self, x)] if x else None}),
+            }))

View File

@@ -4,6 +4,7 @@
 from ..utils import (
     ExtractorError,
     clean_html,
+    filter_dict,
     get_element_by_class,
     int_or_none,
     join_nonempty,
@@ -590,21 +591,22 @@ class NhkRadiruIE(InfoExtractor):
     IE_DESC = 'NHK らじる (Radiru/Rajiru)'
     _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
     _TESTS = [{
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210',
-        'skip': 'Episode expired on 2024-02-24',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239',
+        'skip': 'Episode expired on 2024-06-09',
         'info_dict': {
-            'title': 'ジャズ・トゥナイト シリーズJAZZジャイアンツ 56 ジョニー・ホッジス',
-            'id': '0449_01_3926210',
+            'title': 'ジャズ・トゥナイト ジャズ「Night and Day」特集',
+            'id': '0449_01_4003239',
             'ext': 'm4a',
+            'uploader': 'NHK FM 東京',
+            'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc',
             'series': 'ジャズ・トゥナイト',
-            'uploader': 'NHK-FM',
-            'channel': 'NHK-FM',
+            'channel': 'NHK FM 東京',
             'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
-            'release_date': '20240217',
-            'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811',
-            'timestamp': 1708185600,
-            'release_timestamp': 1708178400,
-            'upload_date': '20240217',
+            'upload_date': '20240601',
+            'series_id': '0449_01',
+            'release_date': '20240601',
+            'timestamp': 1717257600,
+            'release_timestamp': 1717250400,
         },
     }, {
         # playlist, airs every weekday so it should _hopefully_ be okay forever
@@ -613,71 +615,145 @@ class NhkRadiruIE(InfoExtractor):
             'id': '0458_01',
             'title': 'ベストオブクラシック',
             'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
-            'channel': 'NHK-FM',
-            'uploader': 'NHK-FM',
             'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
+            'series_id': '0458_01',
+            'uploader': 'NHK FM',
+            'channel': 'NHK FM',
+            'series': 'ベストオブクラシック',
         },
         'playlist_mincount': 3,
     }, {
         # one with letters in the id
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
-        'note': 'Expires on 2024-03-31',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688',
+        'note': 'Expires on 2025-03-31',
         'info_dict': {
-            'id': 'F300_06_3738470',
+            'id': 'F683_01_3910688',
             'ext': 'm4a',
-            'title': '有島武郎「一房のぶどう」',
-            'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より',
-            'channel': 'NHKラジオ第1、NHK-FM',
-            'uploader': 'NHKラジオ第1、NHK-FM',
-            'timestamp': 1635757200,
-            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
-            'release_date': '20161207',
-            'series': 'らじる文庫 by ラジオ深夜便 ',
-            'release_timestamp': 1481126700,
-            'upload_date': '20211101',
+            'title': '夏目漱石「文鳥」第1回',
+            'series': '【らじる文庫】夏目漱石「文鳥」全4回',
+            'series_id': 'F683_01',
+            'description': '朗読:浅井理アナウンサー',
+            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg',
+            'upload_date': '20240106',
+            'release_date': '20240106',
+            'uploader': 'NHK R1',
+            'release_timestamp': 1704511800,
+            'channel': 'NHK R1',
+            'timestamp': 1704512700,
         },
-        'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'],
+        'expected_warnings': ['Unable to download JSON metadata',
+                              'Failed to get extended metadata. API returned Error 1: Invalid parameters'],
     }, {
         # news
-        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
-        'skip': 'Expires on 2023-04-17',
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173',
         'info_dict': {
-            'id': 'F261_01_3855109',
+            'id': 'F261_01_4012173',
             'ext': 'm4a',
             'channel': 'NHKラジオ第1',
             'uploader': 'NHKラジオ第1',
-            'timestamp': 1681635900,
-            'release_date': '20230416',
             'series': 'NHKラジオニュース',
-            'title': '後6時のNHKニュース',
+            'title': '午前時のNHKニュース',
             'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
-            'upload_date': '20230416',
-            'release_timestamp': 1681635600,
+            'release_timestamp': 1718290800,
+            'release_date': '20240613',
+            'timestamp': 1718291400,
+            'upload_date': '20240613',
         },
+    }, {
+        # fallback when extended metadata fails
+        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298',
+        'skip': 'Expires on 2024-06-07',
+        'info_dict': {
+            'id': '2834_01_4009298',
+            'title': 'まち☆キラ!開成町特集',
+            'ext': 'm4a',
+            'release_date': '20240531',
+            'upload_date': '20240531',
+            'series': 'はま☆キラ!',
+            'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg',
+            'channel': 'NHK R1,FM',
+            'description': '',
+            'timestamp': 1717123800,
+            'uploader': 'NHK R1,FM',
+            'release_timestamp': 1717120800,
+            'series_id': '2834_01',
+        },
+        'expected_warnings': ['Failed to get extended metadata. API returned empty list.'],
     }]

     _API_URL_TMPL = None

-    def _extract_extended_description(self, episode_id, episode):
-        service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')}))
-        aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str}))
+    def _extract_extended_metadata(self, episode_id, aa_vinfo):
+        service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')}))
         detail_url = try_call(
-            lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3))
+            lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3]))
         if not detail_url:
-            return
+            return {}

-        full_meta = traverse_obj(
-            self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False),
-            ('list', service, 0, {dict})) or {}
-        return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta)
+        response = self._download_json(
+            detail_url, episode_id, 'Downloading extended metadata',
+            'Failed to download extended metadata', fatal=False, expected_status=400)
+        if not response:
+            return {}
+
+        if error := traverse_obj(response, ('error', {dict})):
+            self.report_warning(
+                'Failed to get extended metadata. API returned '
+                f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}')
+            return {}
+
+        full_meta = traverse_obj(response, ('list', service, 0, {dict}))
+        if not full_meta:
+            self.report_warning('Failed to get extended metadata. API returned empty list.')
+            return {}
+
+        station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None
+
+        thumbnails = [{
+            'id': str(id_),
+            'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1,
+            **traverse_obj(thumb, {
+                'url': 'url',
+                'width': ('width', {int_or_none}),
+                'height': ('height', {int_or_none}),
+            }),
+        } for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))]
+
+        return filter_dict({
+            'channel': station,
+            'uploader': station,
+            'description': join_nonempty(
+                'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta),
+            'thumbnails': thumbnails,
+            **traverse_obj(full_meta, {
+                'title': ('title', {str}),
+                'timestamp': ('end_time', {unified_timestamp}),
+                'release_timestamp': ('start_time', {unified_timestamp}),
+            }),
+        })
+
+    def _extract_episode_info(self, episode, programme_id, series_meta):
+        episode_id = f'{programme_id}_{episode["id"]}'
+        aa_vinfo = traverse_obj(episode, ('aa_contents_id', {lambda x: x.split(';')}))
+        extended_metadata = self._extract_extended_metadata(episode_id, aa_vinfo)
+        fallback_start_time, _, fallback_end_time = traverse_obj(
+            aa_vinfo, (4, {str}, {lambda x: (x or '').partition('_')}))
+
+        return {
+            **series_meta,
+            'id': episode_id,
+            'formats': self._extract_m3u8_formats(episode.get('stream_url'), episode_id, fatal=False),
+            'container': 'm4a_dash',  # force fixup, AAC-only HLS
+            'was_live': True,
+            'title': episode.get('program_title'),
+            'description': episode.get('program_sub_title'),  # fallback
+            'timestamp': unified_timestamp(fallback_end_time),
+            'release_timestamp': unified_timestamp(fallback_start_time),
+            **extended_metadata,
+        }

-    def _extract_episode_info(self, headline, programme_id, series_meta):
+    def _extract_news_info(self, headline, programme_id, series_meta):
         episode_id = f'{programme_id}_{headline["headline_id"]}'
         episode = traverse_obj(headline, ('file_list', 0, {dict}))
-        description = self._extract_extended_description(episode_id, episode)
-        if not description:
-            self.report_warning('Failed to get extended description, falling back to summary')
-            description = traverse_obj(episode, ('file_title_sub', {str}))

         return {
             **series_meta,
@@ -687,9 +763,9 @@ def _extract_episode_info(self, headline, programme_id, series_meta):
             'was_live': True,
             'series': series_meta.get('title'),
             'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
-            'description': description,
             **traverse_obj(episode, {
-                'title': 'file_title',
+                'title': ('file_title', {str}),
+                'description': ('file_title_sub', {str}),
                 'timestamp': ('open_time', {unified_timestamp}),
                 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
             }),
@@ -706,32 +782,58 @@ def _real_extract(self, url):
         site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
         programme_id = f'{site_id}_{corner_id}'

-        if site_id == 'F261':
-            json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
-        else:
-            json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'
-        meta = self._download_json(json_url, programme_id)['main']
+        if site_id == 'F261':  # XXX: News programmes use old API (for now?)
+            meta = self._download_json(
+                'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main']

-        series_meta = traverse_obj(meta, {
-            'title': 'program_name',
-            'channel': 'media_name',
-            'uploader': 'media_name',
-            'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
-        }, get_all=False)
+            series_meta = traverse_obj(meta, {
+                'title': ('program_name', {str}),
+                'channel': ('media_name', {str}),
+                'uploader': ('media_name', {str}),
+                'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
+            }, get_all=False)

-        if headline_id:
-            return self._extract_episode_info(
-                traverse_obj(meta, (
-                    'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False),
-                programme_id, series_meta)
+            if headline_id:
+                headline = traverse_obj(
+                    meta, ('detail_list', lambda _, v: v['headline_id'] == headline_id, any))
+                if not headline:
+                    raise ExtractorError('Content not found; it has most likely expired', expected=True)
+                return self._extract_news_info(headline, programme_id, series_meta)

-        def entries():
-            for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
-                yield self._extract_episode_info(headline, programme_id, series_meta)
+            def news_entries():
+                for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
+                    yield self._extract_news_info(headline, programme_id, series_meta)

-        return self.playlist_result(
-            entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)
+            return self.playlist_result(
+                news_entries(), programme_id, description=meta.get('site_detail'), **series_meta)
+
+        meta = self._download_json(
+            'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series', programme_id, query={
+                'site_id': site_id,
+                'corner_site_id': corner_id,
+            })
+
+        fallback_station = join_nonempty('NHK', traverse_obj(meta, ('radio_broadcast', {str})), delim=' ')
+        series_meta = {
+            'series': join_nonempty('title', 'corner_name', delim=' ', from_dict=meta),
+            'series_id': programme_id,
+            'thumbnail': traverse_obj(meta, ('thumbnail_url', {url_or_none})),
+            'channel': fallback_station,
+            'uploader': fallback_station,
+        }
+
+        if headline_id:
+            episode = traverse_obj(meta, ('episodes', lambda _, v: v['id'] == int(headline_id), any))
+            if not episode:
+                raise ExtractorError('Content not found; it has most likely expired', expected=True)
+            return self._extract_episode_info(episode, programme_id, series_meta)
+
+        def entries():
+            for episode in traverse_obj(meta, ('episodes', ..., {dict})):
+                yield self._extract_episode_info(episode, programme_id, series_meta)
+
+        return self.playlist_result(
+            entries(), programme_id, title=series_meta.get('series'),
+            description=meta.get('series_description'), **series_meta)


 class NhkRadioNewsPageIE(InfoExtractor):

View File

@@ -1,3 +1,5 @@
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
@@ -6,6 +8,7 @@
     traverse_obj,
     unified_timestamp,
     url_or_none,
+    urljoin,
 )

@@ -21,8 +24,7 @@ class RTVSLOIE(InfoExtractor):
     _API_BASE = 'https://api.rtvslo.si/ava/{}/{}?client_id=82013fb3a531d5414f478747c1aca622'
     SUB_LANGS_MAP = {'Slovenski': 'sl'}

-    _TESTS = [
-        {
+    _TESTS = [{
         'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv',
         'info_dict': {
             'id': '174842550',
@@ -88,8 +90,7 @@ class RTVSLOIE(InfoExtractor):
     }, {
         'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550',
         'only_matching': True,
-        },
-    ]
+    }]

     def _real_extract(self, url):
         v_id = self._match_id(url)
@@ -164,3 +165,26 @@ def _real_extract(self, url):
             'series': meta.get('showName'),
             'series_id': meta.get('showId'),
         }
+
+
+class RTVSLOShowIE(InfoExtractor):
+    IE_NAME = 'rtvslo.si:show'
+    _VALID_URL = r'https?://(?:365|4d)\.rtvslo.si/oddaja/[^/?#&]+/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://365.rtvslo.si/oddaja/ekipa-bled/173250997',
+        'info_dict': {
+            'id': '173250997',
+            'title': 'Ekipa Bled',
+        },
+        'playlist_count': 18,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+        return self.playlist_from_matches(
+            re.findall(r'<a [^>]*\bhref="(/arhiv/[^"]+)"', webpage),
+            playlist_id, self._html_extract_title(webpage),
+            getter=lambda x: urljoin('https://365.rtvslo.si', x), ie=RTVSLOIE)

View File

@@ -95,7 +95,7 @@ def _update_client_id(self):
                 return
         raise ExtractorError('Unable to extract client id')

-    def _download_json(self, *args, **kwargs):
+    def _call_api(self, *args, **kwargs):
         non_fatal = kwargs.get('fatal') is False
         if non_fatal:
             del kwargs['fatal']
@@ -104,7 +104,7 @@ def _download_json(self, *args, **kwargs):
             query['client_id'] = self._CLIENT_ID
             kwargs['query'] = query
             try:
-                return super()._download_json(*args, **kwargs)
+                return self._download_json(*args, **kwargs)
             except ExtractorError as e:
                 if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
                     self._store_client_id(None)
@@ -163,7 +163,7 @@ def genNumBlock():
             'user_agent': self._USER_AGENT
         }

-        response = self._download_json(
+        response = self._call_api(
             self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
             None, note='Verifying login token...', fatal=False,
             data=json.dumps(payload).encode())
@@ -217,12 +217,26 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f
             query['secret_token'] = secret_token

         if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'):
-            download_url = update_url_query(
-                self._API_V2_BASE + 'tracks/' + track_id + '/download', query)
-            redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri')
-            if redirect_url:
+            try:
+                # Do not use _call_api(); HTTP Error codes have different meanings for this request
+                download_data = self._download_json(
+                    f'{self._API_V2_BASE}tracks/{track_id}/download', track_id,
+                    'Downloading original download format info JSON', query=query, headers=self._HEADERS)
+            except ExtractorError as e:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                    self.report_warning(
+                        'Original download format is only available '
+                        f'for registered users. {self._login_hint()}')
+                elif isinstance(e.cause, HTTPError) and e.cause.status == 403:
+                    self.write_debug('Original download format is not available for this client')
+                else:
+                    self.report_warning(e.msg)
+                download_data = None
+
+            if redirect_url := traverse_obj(download_data, ('redirectUri', {url_or_none})):
                 urlh = self._request_webpage(
-                    HEADRequest(redirect_url), track_id, 'Checking for original download format', fatal=False)
+                    HEADRequest(redirect_url), track_id, 'Checking original download format availability',
+                    'Original download format is not available', fatal=False)
                 if urlh:
                     format_url = urlh.url
                     format_urls.add(format_url)
@@ -303,7 +317,7 @@ def add_format(f, protocol, is_preview=False):
             stream = None
             for retry in self.RetryManager(fatal=False):
                 try:
-                    stream = self._download_json(
+                    stream = self._call_api(
                         format_url, track_id, f'Downloading {identifier} format info JSON',
                         query=query, headers=self._HEADERS)
                 except ExtractorError as e:
@@ -630,7 +644,7 @@ def _real_extract(self, url):
             resolve_title += f'/{token}'
         info_json_url = self._resolv_url(self._BASE_URL + resolve_title)

-        info = self._download_json(
+        info = self._call_api(
             info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)

         return self._extract_info_dict(info, full_title, token)
@@ -641,7 +655,7 @@ def _extract_set(self, playlist, token=None):
         playlist_id = str(playlist['id'])
         tracks = playlist.get('tracks') or []
         if not all(t.get('permalink_url') for t in tracks) and token:
-            tracks = self._download_json(
+            tracks = self._call_api(
                 self._API_V2_BASE + 'tracks', playlist_id,
                 'Downloading tracks', query={
                     'ids': ','.join([str(t['id']) for t in tracks]),
@@ -699,7 +713,7 @@ def _real_extract(self, url):
         if token:
             full_title += '/' + token

-        info = self._download_json(self._resolv_url(
+        info = self._call_api(self._resolv_url(
             self._BASE_URL + full_title), full_title, headers=self._HEADERS)

         if 'errors' in info:
@@ -730,7 +744,7 @@ def _entries(self, url, playlist_id):
         for i in itertools.count():
             for retry in self.RetryManager():
                 try:
-                    response = self._download_json(
+                    response = self._call_api(
                         url, playlist_id, query=query, headers=self._HEADERS,
                         note=f'Downloading track page {i + 1}')
                     break
@@ -838,7 +852,7 @@ def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         uploader = mobj.group('user')

-        user = self._download_json(
+        user = self._call_api(
             self._resolv_url(self._BASE_URL + uploader),
             uploader, 'Downloading user info', headers=self._HEADERS)
@@ -864,7 +878,7 @@ class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE):
     def _real_extract(self, url):
         user_id = self._match_id(url)
-        user = self._download_json(
+        user = self._call_api(
             self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS)

         return self._extract_playlist(
@@ -886,7 +900,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
     def _real_extract(self, url):
         track_name = self._match_id(url)

-        track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS)
+        track = self._call_api(self._resolv_url(url), track_name, headers=self._HEADERS)
         track_id = self._search_regex(
             r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
@@ -930,7 +944,7 @@ class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE):
     def _real_extract(self, url):
         slug, relation = self._match_valid_url(url).group('slug', 'relation')

-        track = self._download_json(
+        track = self._call_api(
             self._resolv_url(self._BASE_URL + slug),
             slug, 'Downloading track info', headers=self._HEADERS)
@@ -965,7 +979,7 @@ def _real_extract(self, url):
         if token:
             query['secret_token'] = token

-        data = self._download_json(
+        data = self._call_api(
             self._API_V2_BASE + 'playlists/' + playlist_id,
             playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)
@@ -1000,7 +1014,7 @@ def _get_collection(self, endpoint, collection_id, **query):
         next_url = update_url_query(self._API_V2_BASE + endpoint, query)

         for i in itertools.count(1):
-            response = self._download_json(
+            response = self._call_api(
                 next_url, collection_id, f'Downloading page {i}',
                 'Unable to download API page', headers=self._HEADERS)

View File

@@ -213,8 +213,19 @@ def _extract_aweme_app(self, aweme_id):
         return self._parse_aweme_video_app(aweme_detail)

     def _extract_web_data_and_status(self, url, video_id, fatal=True):
-        webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or ''
-        video_data, status = {}, None
+        video_data, status = {}, -1
+
+        res = self._download_webpage_handle(url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'})
+        if res is False:
+            return video_data, status
+
+        webpage, urlh = res
+        if urllib.parse.urlparse(urlh.url).path == '/login':
+            message = 'TikTok is requiring login for access to this content'
+            if fatal:
+                self.raise_login_required(message)
+            self.report_warning(f'{message}. {self._login_hint()}')
+            return video_data, status

         if universal_data := self._get_universal_data(webpage, video_id):
             self.write_debug('Found universal data for rehydration')

View File

@@ -13,6 +13,7 @@
 class TubiTvIE(InfoExtractor):
+    IE_NAME = 'tubitv'
     _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?P<type>video|movies|tv-shows)/(?P<id>\d+)'
     _LOGIN_URL = 'http://tubitv.com/login'
     _NETRC_MACHINE = 'tubitv'
@@ -148,30 +149,54 @@ def _real_extract(self, url):
 class TubiTvShowIE(InfoExtractor):
-    _WORKING = False
-    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/[0-9]+/(?P<show_name>[^/?#]+)'
+    IE_NAME = 'tubitv:series'
+    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/series/\d+/(?P<show_name>[^/?#]+)(?:/season-(?P<season>\d+))?'
     _TESTS = [{
         'url': 'https://tubitv.com/series/3936/the-joy-of-painting-with-bob-ross?start=true',
-        'playlist_mincount': 390,
+        'playlist_mincount': 389,
         'info_dict': {
             'id': 'the-joy-of-painting-with-bob-ross',
         },
+    }, {
+        'url': 'https://tubitv.com/series/2311/the-saddle-club/season-1',
+        'playlist_count': 26,
+        'info_dict': {
+            'id': 'the-saddle-club-season-1',
+        },
+    }, {
+        'url': 'https://tubitv.com/series/2311/the-saddle-club/season-3',
+        'playlist_count': 19,
+        'info_dict': {
+            'id': 'the-saddle-club-season-3',
+        },
+    }, {
+        'url': 'https://tubitv.com/series/2311/the-saddle-club/',
+        'playlist_mincount': 71,
+        'info_dict': {
+            'id': 'the-saddle-club',
+        },
     }]
 
-    def _entries(self, show_url, show_name):
-        show_webpage = self._download_webpage(show_url, show_name)
-        show_json = self._parse_json(self._search_regex(
-            r'window\.__data\s*=\s*({[^<]+});\s*</script>',
-            show_webpage, 'data'), show_name, transform_source=js_to_json)['video']
-        for episode_id in show_json['fullContentById']:
-            if traverse_obj(show_json, ('byId', episode_id, 'type')) == 's':
-                continue
-            yield self.url_result(
-                f'https://tubitv.com/tv-shows/{episode_id}/',
-                ie=TubiTvIE.ie_key(), video_id=episode_id)
+    def _entries(self, show_url, playlist_id, selected_season):
+        webpage = self._download_webpage(show_url, playlist_id)
+        data = self._search_json(
+            r'window\.__data\s*=', webpage, 'data', playlist_id,
+            transform_source=js_to_json)['video']
+        # v['number'] is already a decimal string, but stringify to protect against API changes
+        path = [lambda _, v: str(v['number']) == selected_season] if selected_season else [..., {dict}]
+        for season in traverse_obj(data, ('byId', lambda _, v: v['type'] == 's', 'seasons', *path)):
+            season_number = int_or_none(season.get('number'))
+            for episode in traverse_obj(season, ('episodes', lambda _, v: v['id'])):
+                episode_id = episode['id']
+                yield self.url_result(
+                    f'https://tubitv.com/tv-shows/{episode_id}/', TubiTvIE, episode_id,
+                    season_number=season_number, episode_number=int_or_none(episode.get('num')))
 
     def _real_extract(self, url):
-        show_name = self._match_valid_url(url).group('show_name')
-        return self.playlist_result(self._entries(url, show_name), playlist_id=show_name)
+        playlist_id, selected_season = self._match_valid_url(url).group('show_name', 'season')
+        if selected_season:
+            playlist_id = f'{playlist_id}-season-{selected_season}'
+        return self.playlist_result(self._entries(url, playlist_id, selected_season), playlist_id)
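The rewritten tubitv:series _entries leans on traverse_obj to pick either the single season requested via /season-N or every season found in the window.__data blob. Roughly the same selection in plain Python, over a made-up, heavily trimmed data shape (real responses carry many more fields):

# Hypothetical minimal shape of data['byId'] after js_to_json, for illustration only
data = {
    'byId': {
        '2311': {'type': 's', 'seasons': [
            {'number': '1', 'episodes': [{'id': '100001', 'num': '1'}, {'id': '100002', 'num': '2'}]},
            {'number': '3', 'episodes': [{'id': '300001', 'num': '1'}]},
        ]},
    },
}

def iter_episodes(data, selected_season=None):
    # Mirrors the traverse_obj path in the diff: filter series entries, then seasons, then episodes
    for entry in data['byId'].values():
        if entry.get('type') != 's':
            continue
        for season in entry.get('seasons') or []:
            if selected_season and str(season.get('number')) != selected_season:
                continue
            for episode in season.get('episodes') or []:
                if episode.get('id'):
                    yield season.get('number'), episode['id']

print(list(iter_episodes(data, selected_season='3')))  # [('3', '300001')]

The following hunks are from the YouTube tab extractor.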
@@ -885,14 +885,14 @@ def _get_count(self, data, *path_list):
         return count
 
     @staticmethod
-    def _extract_thumbnails(data, *path_list):
+    def _extract_thumbnails(data, *path_list, final_key='thumbnails'):
         """
         Extract thumbnails from thumbnails dict
         @param path_list: path list to level that contains 'thumbnails' key
         """
         thumbnails = []
         for path in path_list or [()]:
-            for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)):
+            for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)):
                 thumbnail_url = url_or_none(thumbnail.get('url'))
                 if not thumbnail_url:
                     continue
@@ -5124,6 +5124,10 @@ def _extract_metadata_from_tabs(self, item_id, data):
         else:
             metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict)
 
+        # pageHeaderViewModel slow rollout began April 2024
+        page_header_view_model = traverse_obj(data, (
+            'header', 'pageHeaderRenderer', 'content', 'pageHeaderViewModel', {dict}))
+
         # We can get the uncropped banner/avatar by replacing the crop params with '=s0'
         # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714
         def _get_uncropped(url):
@@ -5139,8 +5143,10 @@ def _get_uncropped(url):
                 'preference': 1,
             })
 
-        channel_banners = self._extract_thumbnails(
-            data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner')))
+        channel_banners = (
+            self._extract_thumbnails(data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner')))
+            or self._extract_thumbnails(
+                page_header_view_model, ('banner', 'imageBannerViewModel', 'image'), final_key='sources'))
         for banner in channel_banners:
             banner['preference'] = -10
@@ -5167,7 +5173,11 @@ def _get_uncropped(url):
                       or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag'))
                       or info['id']),
            'availability': self._extract_availability(data),
-            'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
+            'channel_follower_count': (
+                self._get_count(data, ('header', ..., 'subscriberCountText'))
+                or traverse_obj(page_header_view_model, (
+                    'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts',
+                    lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))),
            'description': try_get(metadata_renderer, lambda x: x.get('description', '')),
            'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str}))
                     or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))),
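For channels already migrated to the new header, the follower count now has to be dug out of the pageHeaderViewModel metadata rows and parsed with parse_count. A rough illustration against a made-up, trimmed fragment (the field values are hypothetical; only the structure mirrors the path used above):

from yt_dlp.utils import parse_count
from yt_dlp.utils.traversal import traverse_obj

# Hypothetical, heavily trimmed pageHeaderViewModel fragment
page_header_view_model = {
    'metadata': {'contentMetadataViewModel': {'metadataRows': [
        {'metadataParts': [{'text': {'content': '@SomeChannel'}}]},
        {'metadataParts': [
            {'text': {'content': '1.23M subscribers'}},
            {'text': {'content': '456 videos'}},
        ]},
    ]}},
}

follower_count = traverse_obj(page_header_view_model, (
    'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts',
    lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))
print(follower_count)  # 1230000

The remaining hunks belong to the split-chapters FFmpeg post-processor.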
@@ -1048,6 +1048,34 @@ def _ffmpeg_args_for_chapter(self, number, chapter, info):
             ['-ss', str(chapter['start_time']),
              '-t', str(chapter['end_time'] - chapter['start_time'])])
 
+    # Extends opts with chapter-specific metadata for the supported formats.
+    # Tested and supported on opus, m4a, webm and mp4.
+    def _set_metadata_arg(self, opts, ext, key, value):
+        if ext == 'opus':
+            # An opus file requires a stream to keep title, artist, etc. metadata.
+            # FFmpegMetadataPP has already set the metadata and created that stream,
+            # so further metadata updates must be set on the stream:
+            # a plain -metadata does nothing here and needs to be -metadata:s
+            opts.extend(['-metadata:s', f'{key}={value}'])
+        elif ext in ['m4a', 'webm', 'mp4']:
+            opts.extend(['-metadata', f'{key}={value}'])
+
+    # FFmpeg copies the chapter metadata of the parent file into every split m4a file.
+    # This is incorrect, since each file must contain only a single chapter after the split,
+    # and it confuses players into thinking multiple chapters are present.
+    def _set_out_opts(self, ext, chapter_title, track_number):
+        out_opts = [*self.stream_copy_opts()]
+        out_opts.extend(['-map_metadata', '0'])
+        # Exclude the chapters metadata but keep everything else
+        out_opts.extend(['-map_chapters', '-1'])
+        # Replace the global title with the chapter-specific title in split files
+        if chapter_title:
+            self._set_metadata_arg(out_opts, ext, 'title', chapter_title)
+        self._set_metadata_arg(out_opts, ext, 'track', track_number)
+        return out_opts
+
     @PostProcessor._restrict_to(images=False)
     def run(self, info):
         self._fixup_chapters(info)
@@ -1062,7 +1090,8 @@ def run(self, info):
         self.to_screen(f'Splitting video by chapters; {len(chapters)} chapters found')
         for idx, chapter in enumerate(chapters):
             destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
-            self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())])
+            out_file_opts = self._set_out_opts(info['ext'], chapter.get('title', ''), str(idx + 1))
+            self.real_run_ffmpeg([(in_file, opts)], [(destination, out_file_opts)])
         if in_file != info['filepath']:
             self._delete_downloaded_files(in_file, msg=None)
         return [], info
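The net effect of the split-chapters change is that every chapter's output file now gets its own metadata options instead of the bare stream-copy options. A standalone sketch of the option-building logic (not the yt-dlp classes themselves; the stream-copy options are abbreviated to ['-c', 'copy'] and extensions other than opus are treated like m4a, both simplifications for brevity):

# Simplified illustration of the options assembled per chapter
def set_metadata_arg(opts, ext, key, value):
    flag = '-metadata:s' if ext == 'opus' else '-metadata'  # opus metadata lives on the stream
    opts.extend([flag, f'{key}={value}'])

def set_out_opts(ext, chapter_title, track_number):
    out_opts = ['-c', 'copy', '-map_metadata', '0', '-map_chapters', '-1']
    if chapter_title:
        set_metadata_arg(out_opts, ext, 'title', chapter_title)
    set_metadata_arg(out_opts, ext, 'track', track_number)
    return out_opts

print(set_out_opts('m4a', 'Intro', '1'))
# ['-c', 'copy', '-map_metadata', '0', '-map_chapters', '-1', '-metadata', 'title=Intro', '-metadata', 'track=1']
print(set_out_opts('opus', 'Intro', '1'))
# ['-c', 'copy', '-map_metadata', '0', '-map_chapters', '-1', '-metadata:s', 'title=Intro', '-metadata:s', 'track=1']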