Compare commits

...

13 Commits

Author SHA1 Message Date
coletdjnz
52f7e99237
Merge 4cf46e29c1 into 8e15177b41 2024-05-17 21:31:52 +05:30
Justin Keogh
8e15177b41
[ie/youtube] Fix comments extraction (#9775)
Closes #9358
Authored by: jakeogh, minamotorin, shoxie007, bbilly1

Co-authored-by: minamotorin <76122224+minamotorin@users.noreply.github.com>
Co-authored-by: shoxie007 <74592022+shoxie007@users.noreply.github.com>
Co-authored-by: Simon <35427372+bbilly1@users.noreply.github.com>
2024-05-17 14:37:30 +00:00
Roeniss Moon
dd9ad97b1f
[cookies] Add --cookies-from-browser support for Whale (#9649)
Closes #9307
Authored by: roeniss
2024-05-17 14:33:12 +00:00
minamotorin
61b17437dc
[ie] Add POST data hash to --write-pages filenames (#9879)
Closes #9773
Authored by: minamotorin
2024-05-17 14:28:36 +00:00
kylegustavo
7975ddf245
[ie/bbc] Fix and extend extraction (#9705)
Closes #9701
Authored by: kylegustavo, dirkf, pukkandan
2024-05-17 06:20:13 +00:00
pukkandan
4cf46e29c1
Update yt_dlp/extractor/youtube.py
Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2024-05-12 23:18:48 +05:30
coletdjnz
b69031b221
Update video timestamp tests 2024-05-11 11:49:25 +12:00
coletdjnz
919824f299
comment 2024-05-04 22:05:57 +12:00
coletdjnz
481ada3437
Do not extract timestamp if timezone not available 2024-05-04 22:05:18 +12:00
coletdjnz
a1af9ffe27
Do not extract timestamp on 3.8 if timezone is not present 2024-05-04 21:46:31 +12:00
coletdjnz
a7e0a8452b
linter 2024-05-04 21:20:42 +12:00
coletdjnz
b229dcf1b1
add test for override 2024-05-04 21:18:53 +12:00
coletdjnz
17bcceda78
[ie/youtube] Extract upload date timestamp if available 2024-05-04 21:13:01 +12:00
7 changed files with 473 additions and 185 deletions

View File

@ -666,7 +666,7 @@ ## Filesystem Options:
The name of the browser to load cookies
from. Currently supported browsers are:
brave, chrome, chromium, edge, firefox,
opera, safari, vivaldi. Optionally, the
opera, safari, vivaldi, whale. Optionally, the
KEYRING used for decrypting Chromium cookies
on Linux, the name/path of the PROFILE to
load cookies from, and the CONTAINER name

View File

@ -5,6 +5,7 @@
import sys
import unittest
import warnings
import datetime as dt
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@ -27,6 +28,7 @@
ExtractorError,
InAdvancePagedList,
LazyList,
NO_DEFAULT,
OnDemandPagedList,
Popen,
age_restricted,
@ -768,6 +770,11 @@ def test_encode_compat_str(self):
def test_parse_iso8601(self):
self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00'), 1395641066)
self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=dt.timedelta(hours=-7)), 1395641066)
self.assertEqual(parse_iso8601('2014-03-23T23:04:26', timezone=NO_DEFAULT), None)
# default does not override timezone in date_str
self.assertEqual(parse_iso8601('2014-03-23T23:04:26-07:00', timezone=dt.timedelta(hours=-10)), 1395641066)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266)

View File

@ -46,7 +46,7 @@
from .utils._utils import _YDLLogger
from .utils.networking import normalize_url
CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'}
SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}
@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'),
'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'),
'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'),
'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'),
}[browser_name]
elif sys.platform == 'darwin':
@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(appdata, 'Microsoft Edge'),
'opera': os.path.join(appdata, 'com.operasoftware.Opera'),
'vivaldi': os.path.join(appdata, 'Vivaldi'),
'whale': os.path.join(appdata, 'Naver/Whale'),
}[browser_name]
else:
@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': os.path.join(config, 'microsoft-edge'),
'opera': os.path.join(config, 'opera'),
'vivaldi': os.path.join(config, 'vivaldi'),
'whale': os.path.join(config, 'naver-whale'),
}[browser_name]
# Linux keyring names can be determined by snooping on dbus while opening the browser in KDE:
@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name):
'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium',
'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium',
'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome',
'whale': 'Whale',
}[browser_name]
browsers_without_profiles = {'opera'}

View File

@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': {
'id': 'world-europe-32668511',
'title': 'Russia stages massive WW2 parade',
'title': 'Russia stages massive WW2 parade despite Western boycott',
'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
},
'playlist_count': 2,
@ -623,6 +623,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
'title': 'BUGGER',
'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
},
'playlist_count': 18,
}, {
@ -631,14 +632,14 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': 'p02mprgb',
'ext': 'mp4',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
'description': 'md5:2868290467291b37feda7863f7a83f54',
'title': 'Germanwings crash site aerial video',
'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
'duration': 47,
'timestamp': 1427219242,
'upload_date': '20150324',
'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
@ -656,21 +657,24 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
},
'params': {
'skip_download': True,
}
},
'skip': 'now SIMORGH_DATA with no video',
}, {
# single video embedded with data-playable containing XML playlists (regional section)
'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
'info_dict': {
'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'id': '39275083',
'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'ext': 'mp4',
'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
'timestamp': 1434713142,
'upload_date': '20150619',
'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
},
'params': {
'skip_download': True,
}
},
}, {
# single video from video playlist embedded with vxp-playlist-data JSON
'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@ -683,22 +687,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
},
'params': {
'skip_download': True,
}
},
'skip': '404 Not Found',
}, {
# single video story with digitalData
# single video story with __PWA_PRELOADED_STATE__
'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
'info_dict': {
'id': 'p02q6gc4',
'ext': 'flv',
'title': 'Sri Lanka’s spicy secret',
'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
'timestamp': 1437674293,
'upload_date': '20150723',
'ext': 'mp4',
'title': 'Tasting the spice of life in Jaffna',
'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
'timestamp': 1646058397,
'upload_date': '20220228',
'duration': 255,
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# single video story without digitalData
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@ -710,12 +713,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'timestamp': 1415867444,
'upload_date': '20141113',
},
'params': {
# rtmp download
'skip_download': True,
}
'skip': 'redirects to TopGear home page',
}, {
# single video embedded with Morph
# TODO: replacement test page
'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
'info_dict': {
'id': 'p041vhd0',
@ -726,27 +727,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'BBC Sport',
'uploader_id': 'bbc_sport',
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Georestricted to UK',
'skip': 'Video no longer in page',
}, {
# single video with playlist.sxml URL in playlist param
# single video in __INITIAL_DATA__
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
'title': 'Ronaldo to Man Utd, Arsenal to spend?',
'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
'timestamp': 1437750175,
'upload_date': '20150724',
'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
'duration': 140,
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
# article with multiple videos embedded with playlist.sxml in playlist param
# article with multiple videos embedded with Morph.setPayload
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': '34475836',
@ -754,6 +750,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
},
'playlist_count': 3,
}, {
# Testing noplaylist
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': 'p034ppnv',
'ext': 'mp4',
'title': 'All you need to know about Jurgen Klopp',
'timestamp': 1444335081,
'upload_date': '20151008',
'duration': 122.0,
'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
},
'params': {
'noplaylist': True,
},
}, {
# school report article with single video
'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@ -762,6 +773,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'title': 'School which breaks down barriers in Jerusalem',
},
'playlist_count': 1,
'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
}, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
@ -778,18 +790,33 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1437785037,
'upload_date': '20150725',
'duration': 105,
},
}, {
# video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': {
'id': 'p0b71qth',
'id': 'p0b779gc',
'ext': 'mp4',
'title': 'Why France is making this woman a national hero',
'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1638230731,
'upload_date': '20211130',
'timestamp': 1638215626,
'upload_date': '20211129',
'duration': 125,
},
}, {
# video with script id __NEXT_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/uk-68546268',
'info_dict': {
'id': 'p0hj0lq7',
'ext': 'mp4',
'title': 'Nasser Hospital doctor describes his treatment by IDF',
'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1710188248,
'upload_date': '20240311',
'duration': 104,
},
}, {
# single video article embedded with data-media-vpid
@ -817,6 +844,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'Radio 3',
'uploader_id': 'bbc_radio_three',
},
'skip': '404 Not Found',
}, {
'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
'info_dict': {
@ -824,6 +852,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'ext': 'mp4',
'title': 'md5:2fabf12a726603193a2879a055f72514',
'description': 'Learn English words and phrases from this story',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
},
'add_ie': [BBCCoUkIE.ie_key()],
}, {
@ -832,28 +861,30 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': 'p07c6sb9',
'ext': 'mp4',
'title': 'How positive thinking is harming your happiness',
'alt_title': 'The downsides of positive thinking',
'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
'title': 'The downsides of positive thinking',
'description': 'The downsides of positive thinking',
'duration': 235,
'thumbnail': r're:https?://.+/p07c9dsr.jpg',
'upload_date': '20190604',
'categories': ['Psychology'],
'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
'upload_date': '20220223',
'timestamp': 1645632746,
},
}, {
# BBC Sounds
'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
'info_dict': {
'id': 'm001q789',
'id': 'p0hrw4nr',
'ext': 'mp4',
'title': 'The Night Tracks Mix - Music for the darkling hour',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
'chapters': 'count:8',
'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
'uploader': 'Radio 3',
'duration': 1800,
'uploader_id': 'bbc_radio_three',
},
'title': 'Are our coastlines being washed away?',
'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
'timestamp': 1713556800,
'upload_date': '20240419',
'duration': 1588,
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
'uploader': 'World Service',
'uploader_id': 'bbc_world_service',
'series': 'CrowdScience',
'chapters': [],
}
}, { # onion routes
'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
'only_matching': True,
@ -1008,8 +1039,7 @@ def _real_extract(self, url):
webpage, 'group id', default=None)
if group_id:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id,
ie=BBCCoUkIE.ie_key())
f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
@ -1069,83 +1099,133 @@ def _real_extract(self, url):
}
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# There are several setPayload calls may be present but the video
# seems to be always related to the first one
morph_payload = self._parse_json(
self._search_regex(
r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
webpage, 'morph payload', default='{}'),
playlist_id, fatal=False)
# Several setPayload calls may be present but the video(s)
# should be in one that mentions leadMedia or videoData
morph_payload = self._search_json(
r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
default={})
if morph_payload:
components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
for component in components:
if not isinstance(component, dict):
continue
lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
if not lead_media:
continue
identifiers = lead_media.get('identifiers')
if not identifiers or not isinstance(identifiers, dict):
continue
programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
for lead_media in traverse_obj(morph_payload, (
'body', 'components', ..., 'props', 'leadMedia', {dict})):
programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
if not programme_id:
continue
title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id)
description = lead_media.get('summary')
uploader = lead_media.get('masterBrand')
uploader_id = lead_media.get('mid')
duration = None
duration_d = lead_media.get('duration')
if isinstance(duration_d, dict):
duration = parse_duration(dict_get(
duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
return {
'id': programme_id,
'title': title,
'description': description,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'title': lead_media.get('title') or self._og_search_title(webpage),
**traverse_obj(lead_media, {
'description': ('summary', {str}),
'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
'uploader': ('masterBrand', {str}),
'uploader_id': ('mid', {str}),
}),
'formats': formats,
'subtitles': subtitles,
}
body = self._parse_json(traverse_obj(morph_payload, (
'body', 'content', 'article', 'body')), playlist_id, fatal=False)
for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
if video_data.get('vpid'):
video_id = video_data['vpid']
formats, subtitles = self._download_media_selector(video_id)
entry = {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
else:
video_id = video_data['pid']
entry = self.url_result(
f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
video_id, url_transparent=True)
entry.update({
'timestamp': traverse_obj(morph_payload, (
'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601})
),
**traverse_obj(video_data, {
'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
'title': (('title', 'caption'), {str}, any),
'duration': ('duration', {parse_duration}),
}),
})
if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
return entry
entries.append(entry)
if entries:
playlist_title = traverse_obj(morph_payload, (
'body', 'content', 'article', 'headline', {str})) or playlist_title
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
preload_state = self._parse_json(self._search_regex(
r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
'preload state', default='{}'), playlist_id, fatal=False)
if preload_state:
current_programme = preload_state.get('programmes', {}).get('current') or {}
programme_id = current_programme.get('id')
if current_programme and programme_id and current_programme.get('type') == 'playable_item':
title = current_programme.get('titles', {}).get('tertiary') or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
synopses = current_programme.get('synopses') or {}
network = current_programme.get('network') or {}
duration = int_or_none(
current_programme.get('duration', {}).get('value'))
thumbnail = None
image_url = current_programme.get('image_url')
if image_url:
thumbnail = image_url.replace('{recipe}', 'raw')
# various PRELOADED_STATE JSON
preload_state = self._search_json(
r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
'preload state', playlist_id, transform_source=js_to_json, default={})
# PRELOADED_STATE with current programmme
current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
programme_id = traverse_obj(current_programme, ('id', {str}))
if programme_id and current_programme.get('type') == 'playable_item':
title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
return {
'id': programme_id,
'title': title,
'formats': formats,
**traverse_obj(current_programme, {
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
'duration': ('duration', 'value', {int_or_none}),
'uploader': ('network', 'short_title', {str}),
'uploader_id': ('network', 'id', {str}),
'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
'series': ('titles', 'primary', {str}),
}),
'subtitles': subtitles,
'chapters': traverse_obj(preload_state, (
'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
})
),
}
# PWA_PRELOADED_STATE with article video asset
asset_id = traverse_obj(preload_state, (
'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
'assetVideo', 0, {str}, any))
if asset_id:
video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
if video_id:
article = traverse_obj(preload_state, (
'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
def image_url(image_id):
return traverse_obj(preload_state, (
'entities', 'images', image_id, 'url',
{lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
formats, subtitles = self._download_media_selector(video_id)
return {
'id': programme_id,
'title': title,
'description': dict_get(synopses, ('long', 'medium', 'short')),
'thumbnail': thumbnail,
'duration': duration,
'uploader': network.get('short_title'),
'uploader_id': network.get('id'),
'id': video_id,
**traverse_obj(preload_state, ('entities', 'videos', asset_id, {
'title': ('title', {str}),
'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
'thumbnail': (0, {image_url}),
'duration': ('duration', {int_or_none}),
})),
'formats': formats,
'subtitles': subtitles,
'chapters': traverse_obj(preload_state, (
'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
})) or None,
'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
}
else:
return self.url_result(
f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
asset_id, playlist_title, display_id=playlist_id,
description=playlist_description)
bbc3_config = self._parse_json(
self._search_regex(
@ -1191,6 +1271,28 @@ def _real_extract(self, url):
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def parse_model(model):
"""Extract single video from model structure"""
item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
if not item_id:
return
formats, subtitles = self._download_media_selector(item_id)
return {
'id': item_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
'duration': ('versions', 0, 'duration', {int}),
'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
})
}
def is_type(*types):
return lambda _, v: v['type'] in types
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None)
@ -1202,6 +1304,19 @@ def _real_extract(self, url):
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
for video_data in traverse_obj(initial_data, (
'stores', 'article', 'articleBodyContent', is_type('video'))):
model = traverse_obj(video_data, (
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
entry = parse_model(model)
if entry:
entries.append(entry)
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def parse_media(media):
if not media:
return
@ -1234,27 +1349,90 @@ def parse_media(media):
'subtitles': subtitles,
'timestamp': item_time,
'description': strip_or_none(item_desc),
'duration': int_or_none(item.get('duration')),
})
for resp in (initial_data.get('data') or {}).values():
name = resp.get('name')
for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
name = resp['name']
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
for block in (try_get(resp,
(lambda x: x['data']['blocks'],
lambda x: x['data']['content']['model']['blocks'],),
list) or []):
if block.get('type') not in ['media', 'video']:
continue
parse_media(block.get('model'))
for block in traverse_obj(resp, (
'data', (None, ('content', 'model')), 'blocks',
is_type('media', 'video'), 'model', {dict})):
parse_media(block)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# extract from SIMORGH_DATA hydration JSON
simorgh_data = self._search_json(
r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
'simorgh data', playlist_id, default={})
if simorgh_data:
done = False
for video_data in traverse_obj(simorgh_data, (
'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
model = traverse_obj(video_data, (
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
if video_data['type'] == 'video':
entry = parse_model(model)
else: # legacyMedia: no duration, subtitles
block_id, entry = traverse_obj(model, ('blockId', {str})), None
media_data = traverse_obj(simorgh_data, (
'pageData', 'promo', 'media',
{lambda x: x if x['id'] == block_id else None}))
formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
'url': ('url', {url_or_none}),
'ext': ('format', {str}),
'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
}))
if formats:
entry = {
'id': block_id,
'display_id': playlist_id,
'formats': formats,
'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
}),
}
done = True
if entry:
entries.append(entry)
if done:
break
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
re.findall(pattern, webpage))))
# US accessed article with single embedded video (e.g.
# https://www.bbc.com/news/uk-68546268)
next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
('props', 'pageProps', 'page'))
model = traverse_obj(next_data, (
..., 'contents', is_type('video'),
'model', 'blocks', is_type('media'),
'model', 'blocks', is_type('mediaMetadata'),
'model', {dict}, any))
if model and (entry := parse_model(model)):
if not entry.get('timestamp'):
entry['timestamp'] = traverse_obj(next_data, (
..., 'contents', is_type('timestamp'), 'model',
'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
entries.append(entry)
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# Multiple video article (e.g.
# http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX

View File

@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
if urlh is False:
assert not fatal
return False
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
encoding=encoding, data=data)
return (content, urlh)
@staticmethod
@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
def _request_dump_filename(self, url, video_id):
basen = f'{video_id}_{url}'
def _request_dump_filename(self, url, video_id, data=None):
if data is not None:
data = hashlib.md5(data).hexdigest()
basen = join_nonempty(video_id, data, url, delim='_')
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@ -1028,16 +1031,18 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
except LookupError:
return webpage_bytes.decode('utf-8', 'replace')
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
url_or_request = self._create_request(url_or_request, data)
if self.get_param('dump_intermediate_pages', False):
self.to_screen('Dumping request to ' + urlh.url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
filename = self._request_dump_filename(urlh.url, video_id)
filename = self._request_dump_filename(urlh.url, video_id, url_or_request.data)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@ -1098,7 +1103,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
filename = self._request_dump_filename(url_or_request.url, video_id)
filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:

View File

@ -1325,6 +1325,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
'uploader_id': '@PhilippHagemeister',
'heatmap': 'count:100',
'timestamp': 1349198244,
}
},
{
@ -1368,6 +1369,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
'uploader_id': '@PhilippHagemeister',
'heatmap': 'count:100',
'timestamp': 1349198244,
},
'params': {
'skip_download': True,
@ -1454,6 +1456,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'comment_count': int,
'channel_is_verified': True,
'heatmap': 'count:100',
'timestamp': 1401991663,
},
},
{
@ -1513,6 +1516,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Projekt Melody',
'uploader_url': 'https://www.youtube.com/@ProjektMelody',
'uploader_id': '@ProjektMelody',
'timestamp': 1577508724,
},
},
{
@ -1618,6 +1622,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@Olympics',
'uploader_id': '@Olympics',
'channel_is_verified': True,
'timestamp': 1440707674,
},
'params': {
'skip_download': 'requires avconv',
@ -1651,6 +1656,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'ĺ­«á„‹á„…',
'uploader_url': 'https://www.youtube.com/@AllenMeow',
'uploader_id': '@AllenMeow',
'timestamp': 1299776999,
},
},
# url_encoded_fmt_stream_map is empty string
@ -1794,6 +1800,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
}],
'params': {'skip_download': True},
'skip': 'Not multifeed anymore',
},
{
# Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
@ -1902,6 +1909,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'The Berkman Klein Center for Internet & Society',
'uploader_id': '@BKCHarvard',
'uploader_url': 'https://www.youtube.com/@BKCHarvard',
'timestamp': 1422422076,
},
'params': {
'skip_download': True,
@ -1937,6 +1945,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@BernieSanders',
'channel_is_verified': True,
'heatmap': 'count:100',
'timestamp': 1447987198,
},
'params': {
'skip_download': True,
@ -2000,6 +2009,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@Vsauce',
'comment_count': int,
'channel_is_verified': True,
'timestamp': 1484761047,
},
'params': {
'skip_download': True,
@ -2155,6 +2165,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'l\'Or Vert asbl',
'uploader_url': 'https://www.youtube.com/@ElevageOrVert',
'uploader_id': '@ElevageOrVert',
'timestamp': 1497343210,
},
'params': {
'skip_download': True,
@ -2193,6 +2204,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@Csharp-video-tutorialsBlogspot',
'channel_is_verified': True,
'heatmap': 'count:100',
'timestamp': 1377976349,
},
'params': {
'skip_download': True,
@ -2275,6 +2287,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@CBSMornings',
'comment_count': int,
'channel_is_verified': True,
'timestamp': 1405513526,
}
},
{
@ -2292,7 +2305,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'view_count': int,
'channel': 'Walk around Japan',
'tags': ['Ueno Tokyo', 'Okachimachi Tokyo', 'Ameyoko Street', 'Tokyo attraction', 'Travel in Tokyo'],
'thumbnail': 'https://i.ytimg.com/vi_webp/cBvYw8_A0vQ/hqdefault.webp',
'thumbnail': 'https://i.ytimg.com/vi/cBvYw8_A0vQ/hqdefault.jpg',
'age_limit': 0,
'availability': 'public',
'channel_url': 'https://www.youtube.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
@ -2302,6 +2315,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Walk around Japan',
'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124',
'uploader_id': '@walkaroundjapan7124',
'timestamp': 1605884416,
},
'params': {
'skip_download': True,
@ -2396,6 +2410,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'comment_count': int,
'channel_is_verified': True,
'heatmap': 'count:100',
'timestamp': 1395685455,
}, 'params': {'format': 'mhtml', 'skip_download': True}
}, {
# Ensure video upload_date is in UTC timezone (video was uploaded 1641170939)
@ -2425,9 +2440,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@LeonNguyen',
'uploader_id': '@LeonNguyen',
'heatmap': 'count:100',
'timestamp': 1641170939,
}
}, {
# Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date
# todo: remove compat opt? no longer works
'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4',
'info_dict': {
'id': '2NUZ8W2llS4',
@ -2487,38 +2504,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'comment_count': int,
'channel_is_verified': True,
'heatmap': 'count:100',
'timestamp': 1641172509,
}
},
{ # continuous livestream. Microformat upload date should be preferred.
# Upload date was 2021-06-19 (not UTC), while stream start is 2021-11-27
'url': 'https://www.youtube.com/watch?v=kgx4WGK0oNU',
{ # continuous livestream.
# Upload date was 2022-07-12T05:12:29-07:00, while stream start is 2022-07-12T15:59:30+00:00
'url': 'https://www.youtube.com/watch?v=jfKfPfyJRdk',
'info_dict': {
'id': 'kgx4WGK0oNU',
'title': r're:jazz\/lofi hip hop radio🌱chill beats to relax\/study to \[LIVE 24\/7\] \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'id': 'jfKfPfyJRdk',
'ext': 'mp4',
'channel_id': 'UC84whx2xxsiA1gXHXXqKGOA',
'availability': 'public',
'age_limit': 0,
'release_timestamp': 1637975704,
'upload_date': '20210619',
'channel_url': 'https://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
'live_status': 'is_live',
'thumbnail': 'https://i.ytimg.com/vi/kgx4WGK0oNU/maxresdefault.jpg',
'channel': 'Abao in Tokyo',
'channel_follower_count': int,
'release_date': '20211127',
'tags': 'count:39',
'categories': ['People & Blogs'],
'channel_id': 'UCSJ4gkVC6NrvII8umztf0Ow',
'like_count': int,
'view_count': int,
'playable_in_embed': True,
'description': 'md5:2ef1d002cad520f65825346e2084e49d',
'uploader': 'Lofi Girl',
'categories': ['Music'],
'concurrent_view_count': int,
'uploader': 'Abao in Tokyo',
'uploader_url': 'https://www.youtube.com/@abaointokyo',
'uploader_id': '@abaointokyo',
'playable_in_embed': True,
'timestamp': 1657627949,
'release_date': '20220712',
'channel_url': 'https://www.youtube.com/channel/UCSJ4gkVC6NrvII8umztf0Ow',
'description': 'md5:13a6f76df898f5674f9127139f3df6f7',
'age_limit': 0,
'thumbnail': 'https://i.ytimg.com/vi/jfKfPfyJRdk/maxresdefault.jpg',
'release_timestamp': 1657641570,
'uploader_url': 'https://www.youtube.com/@LofiGirl',
'channel_follower_count': int,
'channel_is_verified': True,
'title': r're:^lofi hip hop radio đź“š - beats to relax/study to',
'view_count': int,
'live_status': 'is_live',
'tags': 'count:32',
'channel': 'Lofi Girl',
'availability': 'public',
'upload_date': '20220712',
'uploader_id': '@LofiGirl',
},
'params': {'skip_download': True}
'params': {'skip_download': True},
}, {
'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA',
'info_dict': {
@ -2544,6 +2564,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@lesmiscore',
'uploader': 'Lesmiscore',
'uploader_url': 'https://www.youtube.com/@lesmiscore',
'timestamp': 1648005313,
}
}, {
# Prefer primary title+description language metadata by default
@ -2571,6 +2592,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@coletdjnz',
'uploader_id': '@coletdjnz',
'uploader': 'cole-dlp-test-acc',
'timestamp': 1662677394,
},
'params': {'skip_download': True}
}, {
@ -2584,7 +2606,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': 5,
'live_status': 'not_live',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'upload_date': '20220728',
'upload_date': '20220729',
'view_count': int,
'categories': ['People & Blogs'],
'thumbnail': r're:^https?://.*\.jpg',
@ -2597,6 +2619,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@coletdjnz',
'uploader_id': '@coletdjnz',
'uploader': 'cole-dlp-test-acc',
'timestamp': 1659073275,
'like_count': int,
},
'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}},
'expected_warnings': [r'Preferring "fr" translated fields'],
@ -2662,6 +2686,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Projekt Melody',
'uploader_id': '@ProjektMelody',
'uploader_url': 'https://www.youtube.com/@ProjektMelody',
'timestamp': 1577508724,
},
'params': {'extractor_args': {'youtube': {'player_client': ['tv_embedded']}}, 'format': '251-drc'},
},
@ -2696,6 +2721,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': '@sana_natori',
'channel_is_verified': True,
'heatmap': 'count:100',
'timestamp': 1671798112,
},
},
{
@ -2765,6 +2791,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 'https://www.youtube.com/@ChristopherSykesDocumentaries',
'uploader_id': '@ChristopherSykesDocumentaries',
'heatmap': 'count:100',
'timestamp': 1211825920,
},
'params': {
'skip_download': True,
@ -3317,7 +3344,36 @@ def _extract_heatmap(self, data):
'value': ('intensityScoreNormalized', {float_or_none}),
})) or None
def _extract_comment(self, comment_renderer, parent=None):
def _extract_comment(self, entities, parent=None):
comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))):
return
toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
return {
'id': comment_id,
'parent': parent or 'root',
**traverse_obj(comment_entity_payload, {
'text': ('properties', 'content', 'content', {str}),
'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
'author_id': ('author', 'channelId', {self.ucid_or_none}),
'author': ('author', 'displayName', {str}),
'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
'author_is_uploader': ('author', 'isCreator', {bool}),
'author_is_verified': ('author', 'isVerified', {bool}),
'author_url': ('author', 'channelCommand', 'innertubeCommand', (
('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url')
), {lambda x: urljoin('https://www.youtube.com', x)}),
}, get_all=False),
'is_favorited': (None if toolbar_entity_payload is None else
toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'),
'_time_text': time_text, # FIXME: non-standard, but we need a way of showing that it is an estimate.
'timestamp': self._parse_time_text(time_text),
}
def _extract_comment_old(self, comment_renderer, parent=None):
comment_id = comment_renderer.get('commentId')
if not comment_id:
return
@ -3398,21 +3454,39 @@ def extract_header(contents):
break
return _continuation
def extract_thread(contents):
def extract_thread(contents, entity_payloads):
if not parent:
tracker['current_page_thread'] = 0
for content in contents:
if not parent and tracker['total_parent_comments'] >= max_parents:
yield
comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})
comment = self._extract_comment(comment_renderer, parent)
# old comment format
if not entity_payloads:
comment_renderer = get_first(
(comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
expected_type=dict, default={})
comment = self._extract_comment_old(comment_renderer, parent)
# new comment format
else:
view_model = (
traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
or traverse_obj(content, ('commentViewModel', {dict})))
comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
if not comment_keys:
continue
entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
comment = self._extract_comment(entities, parent)
if comment:
comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None
if not comment:
continue
comment_id = comment['id']
if comment.get('is_pinned'):
tracker['pinned_comment_ids'].add(comment_id)
# Sometimes YouTube may break and give us infinite looping comments.
@ -3505,7 +3579,7 @@ def extract_thread(contents):
check_get_keys = None
if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
check_get_keys = [[*continuation_items_path, ..., (
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
try:
response = self._extract_response(
item_id=None, query=continuation,
@ -3529,6 +3603,7 @@ def extract_thread(contents):
raise
is_forced_continuation = False
continuation = None
mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
if is_first_continuation:
continuation = extract_header(continuation_items)
@ -3537,7 +3612,7 @@ def extract_thread(contents):
break
continue
for entry in extract_thread(continuation_items):
for entry in extract_thread(continuation_items, mutations):
if not entry:
return
yield entry
@ -4562,19 +4637,35 @@ def process_language(container, base_url, lang_code, sub_name, query):
'uploader_id': channel_handle,
'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
})
# We only want timestamp IF it has time precision AND a timezone
# Currently the uploadDate in microformats appears to be in US/Pacific timezone.
timestamp = (
parse_iso8601(get_first(microformats, 'uploadDate'), timezone=NO_DEFAULT)
or parse_iso8601(search_meta('uploadDate'), timezone=NO_DEFAULT)
)
upload_date = (
dt.datetime.fromtimestamp(timestamp, dt.timezone.utc).strftime('%Y%m%d') if timestamp else
(
unified_strdate(get_first(microformats, 'uploadDate'))
or unified_strdate(search_meta('uploadDate'))
))
# In the case we cannot get the timestamp:
# The upload date for scheduled, live and past live streams / premieres in microformats
# may be different from the stream date. Although not in UTC, we will prefer it in this case.
# See: https://github.com/yt-dlp/yt-dlp/pull/2223#issuecomment-1008485139
upload_date = (
unified_strdate(get_first(microformats, 'uploadDate'))
or unified_strdate(search_meta('uploadDate')))
if not upload_date or (
live_status in ('not_live', None)
not timestamp
and live_status in ('not_live', None)
and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', [])
):
# this should be in UTC, as configured in the cookie/client context
upload_date = strftime_or_none(
self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date
info['upload_date'] = upload_date
info['timestamp'] = timestamp
if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'):
# Newly uploaded videos' HLS formats are potentially problematic and need to be checked

View File

@ -1134,7 +1134,7 @@ def is_path_like(f):
return isinstance(f, (str, bytes, os.PathLike))
def extract_timezone(date_str):
def extract_timezone(date_str, default=None):
m = re.search(
r'''(?x)
^.{8,}? # >=8 char non-TZ prefix, if present
@ -1146,21 +1146,25 @@ def extract_timezone(date_str):
(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
$)
''', date_str)
timezone = None
if not m:
m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
if timezone is not None:
date_str = date_str[:-len(m.group('tz'))]
timezone = dt.timedelta(hours=timezone or 0)
timezone = dt.timedelta(hours=timezone)
else:
date_str = date_str[:-len(m.group('tz'))]
if not m.group('sign'):
timezone = dt.timedelta()
else:
if m.group('sign'):
sign = 1 if m.group('sign') == '+' else -1
timezone = dt.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
if timezone is None and default is not NO_DEFAULT:
timezone = default or dt.timedelta()
return timezone, date_str
@ -1172,10 +1176,9 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
date_str = re.sub(r'\.[0-9]+', '', date_str)
if timezone is None:
timezone, date_str = extract_timezone(date_str)
timezone, date_str = extract_timezone(date_str, timezone)
with contextlib.suppress(ValueError):
with contextlib.suppress(ValueError, TypeError):
date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
dt_ = dt.datetime.strptime(date_str, date_format) - timezone
return calendar.timegm(dt_.timetuple())