mirror of https://github.com/yt-dlp/yt-dlp.git
synced 2024-06-01 19:08:14 +02:00

Compare commits: 19 commits (6c7277dcbe ... 3462aff04f)

SHA1: 3462aff04f, 8e15177b41, dd9ad97b1f, 61b17437dc, 7975ddf245,
6c8ede8188, 3046c17822, dd986a4149, c3fccc58cf, 28d5051546, a1ff1d4272,
0a3c5aceb5, 9c3b227db8, ea2a085397, 44bb6c2056, fe4a15ff75, 5d51ddbbfc,
c9d8184fe6, c574be85f1
@@ -666,7 +666,7 @@ ## Filesystem Options:
                                     The name of the browser to load cookies
                                     from. Currently supported browsers are:
                                     brave, chrome, chromium, edge, firefox,
-                                    opera, safari, vivaldi. Optionally, the
+                                    opera, safari, vivaldi, whale. Optionally, the
                                     KEYRING used for decrypting Chromium cookies
                                     on Linux, the name/path of the PROFILE to
                                     load cookies from, and the CONTAINER name
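With this change, whale becomes a valid value for --cookies-from-browser. A minimal sketch of the equivalent Python API call (the URL is illustrative):

    import yt_dlp

    # (browser_name, profile, keyring, container) - only the browser name is required
    ydl_opts = {'cookiesfrombrowser': ('whale', None, None, None)}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://example.com/some-video'])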
@@ -874,32 +874,33 @@ def test_format_note(self):
         }), r'^30fps$')
 
     def test_postprocessors(self):
-        filename = 'post-processor-testfile.mp4'
-        audiofile = filename + '.mp3'
+        filename = 'post-processor-testfile'
+        video_file = filename + '.mp4'
+        audio_file = filename + '.mp3'
 
         class SimplePP(PostProcessor):
             def run(self, info):
-                with open(audiofile, 'w') as f:
+                with open(audio_file, 'w') as f:
                     f.write('EXAMPLE')
                 return [info['filepath']], info
 
         def run_pp(params, PP):
-            with open(filename, 'w') as f:
+            with open(video_file, 'w') as f:
                 f.write('EXAMPLE')
             ydl = YoutubeDL(params)
             ydl.add_post_processor(PP())
-            ydl.post_process(filename, {'filepath': filename})
+            ydl.post_process(video_file, {'filepath': video_file})
 
-        run_pp({'keepvideo': True}, SimplePP)
-        self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename)
-        self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)
-        os.unlink(filename)
-        os.unlink(audiofile)
+        run_pp({'keepvideo': True, 'outtmpl': filename}, SimplePP)
+        self.assertTrue(os.path.exists(video_file), '%s doesn\'t exist' % video_file)
+        self.assertTrue(os.path.exists(audio_file), '%s doesn\'t exist' % audio_file)
+        os.unlink(video_file)
+        os.unlink(audio_file)
 
-        run_pp({'keepvideo': False}, SimplePP)
-        self.assertFalse(os.path.exists(filename), '%s exists' % filename)
-        self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)
-        os.unlink(audiofile)
+        run_pp({'keepvideo': False, 'outtmpl': filename}, SimplePP)
+        self.assertFalse(os.path.exists(video_file), '%s exists' % video_file)
+        self.assertTrue(os.path.exists(audio_file), '%s doesn\'t exist' % audio_file)
+        os.unlink(audio_file)
 
         class ModifierPP(PostProcessor):
             def run(self, info):

@@ -907,9 +908,9 @@ def run(self, info):
                 f.write('MODIFIED')
                 return [], info
 
-        run_pp({'keepvideo': False}, ModifierPP)
-        self.assertTrue(os.path.exists(filename), '%s doesn\'t exist' % filename)
-        os.unlink(filename)
+        run_pp({'keepvideo': False, 'outtmpl': filename}, ModifierPP)
+        self.assertTrue(os.path.exists(video_file), '%s doesn\'t exist' % video_file)
+        os.unlink(video_file)
 
     def test_match_filter(self):
         first = {
@@ -3219,7 +3219,6 @@ def replace_info_dict(new_info):
         # info_dict['_filename'] needs to be set for backward compatibility
         info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
         temp_filename = self.prepare_filename(info_dict, 'temp')
-        files_to_move = {}
 
         # Forced printings
         self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))

@@ -3247,13 +3246,11 @@ def check_max_downloads():
             sub_files = self._write_subtitles(info_dict, temp_filename)
             if sub_files is None:
                 return
-            files_to_move.update(dict(sub_files))
 
             thumb_files = self._write_thumbnails(
                 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail'))
             if thumb_files is None:
                 return
-            files_to_move.update(dict(thumb_files))
 
         infofn = self.prepare_filename(info_dict, 'infojson')
         _infojson_written = self._write_info_json('video', info_dict, infofn)
@@ -3327,13 +3324,12 @@ def _write_link_file(link_type):
                    for link_type, should_write in write_links.items()):
             return
 
-        new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
+        new_info, _ = self.pre_process(info_dict, 'before_dl')
         replace_info_dict(new_info)
 
         if self.params.get('skip_download'):
             info_dict['filepath'] = temp_filename
             info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
-            info_dict['__files_to_move'] = files_to_move
             replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
             info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
         else:

@@ -3447,9 +3443,6 @@ def correct_ext(filename, ext=new_ext):
                             info_dict['__files_to_merge'] = downloaded
                             # Even if there were no downloads, it is being merged only now
                             info_dict['__real_download'] = True
-                        else:
-                            for file in downloaded:
-                                files_to_move[file] = None
                 else:
                     # Just a single file
                     dl_filename = existing_video_file(full_filename, temp_filename)

@@ -3463,7 +3456,6 @@ def correct_ext(filename, ext=new_ext):
 
                     dl_filename = dl_filename or temp_filename
-                    info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
 
             except network_exceptions as err:
                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                 return

@@ -3534,7 +3526,7 @@ def ffmpeg_fixup(cndn, msg, cls):
 
             fixup()
             try:
-                replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
+                replace_info_dict(self.post_process(dl_filename, info_dict))
             except PostProcessingError as err:
                 self.report_error('Postprocessing: %s' % str(err))
                 return
@@ -3655,8 +3647,6 @@ def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None):
                 os.remove(filename)
             except OSError:
                 self.report_warning(f'Unable to delete file {filename}')
-            if filename in info.get('__files_to_move', []):  # NB: Delete even if None
-                del info['__files_to_move'][filename]
 
     @staticmethod
     def post_extract(info_dict):

@@ -3673,8 +3663,7 @@ def actual_post_extract(info_dict):
 
     def run_pp(self, pp, infodict):
         files_to_delete = []
-        if '__files_to_move' not in infodict:
-            infodict['__files_to_move'] = {}
 
         try:
             files_to_delete, infodict = pp.run(infodict)
         except PostProcessingError as e:

@@ -3686,10 +3675,7 @@ def run_pp(self, pp, infodict):
 
         if not files_to_delete:
             return infodict
-        if self.params.get('keepvideo', False):
-            for f in files_to_delete:
-                infodict['__files_to_move'].setdefault(f, '')
-        else:
+        if not self.params.get('keepvideo', False):
             self._delete_downloaded_files(
                 *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
         return infodict
@@ -3702,23 +3688,27 @@ def run_all_pps(self, key, info, *, additional_pps=None):
         return info
 
     def pre_process(self, ie_info, key='pre_process', files_to_move=None):
+        if files_to_move is not None:
+            self.report_warning('[pre_process] "files_to_move" is deprecated and may be removed in a future version')
+
         info = dict(ie_info)
-        info['__files_to_move'] = files_to_move or {}
         try:
             info = self.run_all_pps(key, info)
         except PostProcessingError as err:
             msg = f'Preprocessing: {err}'
             info.setdefault('__pending_error', msg)
             self.report_error(msg, is_error=False)
-        return info, info.pop('__files_to_move', None)
+        return info, files_to_move
 
     def post_process(self, filename, info, files_to_move=None):
         """Run all the postprocessors on the given file."""
+        if files_to_move is not None:
+            self.report_warning('[post_process] "files_to_move" is deprecated and may be removed in a future version')
+
         info['filepath'] = filename
-        info['__files_to_move'] = files_to_move or {}
         info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors'))
         info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
-        del info['__files_to_move']
+        info.pop('__multiple_thumbnails', None)
         return self.run_all_pps('after_move', info)
 
     def _make_archive_id(self, info_dict):
@@ -4305,10 +4295,11 @@ def _write_subtitles(self, info_dict, filename):
             sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
             sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext'))
             existing_sub = self.existing_file((sub_filename_final, sub_filename))
 
             if existing_sub:
                 self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present')
                 sub_info['filepath'] = existing_sub
-                ret.append((existing_sub, sub_filename_final))
+                ret.append(existing_sub)
                 continue
 
             self.to_screen(f'[info] Writing video subtitles to: {sub_filename}')

@@ -4319,7 +4310,7 @@ def _write_subtitles(self, info_dict, filename):
                     with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
                         subfile.write(sub_info['data'])
                     sub_info['filepath'] = sub_filename
-                    ret.append((sub_filename, sub_filename_final))
+                    ret.append(sub_filename)
                     continue
                 except OSError:
                     self.report_error(f'Cannot write video subtitles file {sub_filename}')

@@ -4330,7 +4321,7 @@ def _write_subtitles(self, info_dict, filename):
                 sub_copy.setdefault('http_headers', info_dict.get('http_headers'))
                 self.dl(sub_filename, sub_copy, subtitle=True)
                 sub_info['filepath'] = sub_filename
-                ret.append((sub_filename, sub_filename_final))
+                ret.append(sub_filename)
             except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
                 msg = f'Unable to download video subtitles for {sub_lang!r}: {err}'
                 if self.params.get('ignoreerrors') is not True:  # False or 'only_download'

@@ -4350,6 +4341,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
             self.to_screen(f'[info] There are no {label} thumbnails to download')
             return ret
         multiple = write_all and len(thumbnails) > 1
+        info_dict['__multiple_thumbnails'] = multiple
 
         if thumb_filename_base is None:
             thumb_filename_base = filename

@@ -4371,7 +4363,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
                 self.to_screen('[info] %s is already present' % (
                     thumb_display_id if multiple else f'{label} thumbnail').capitalize())
                 t['filepath'] = existing_thumb
-                ret.append((existing_thumb, thumb_filename_final))
+                ret.append(existing_thumb)
             else:
                 self.to_screen(f'[info] Downloading {thumb_display_id} ...')
                 try:

@@ -4379,7 +4371,7 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
                         self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
                         with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                             shutil.copyfileobj(uf, thumbf)
-                        ret.append((thumb_filename, thumb_filename_final))
+                        ret.append(thumb_filename)
                         t['filepath'] = thumb_filename
                     except network_exceptions as err:
                         if isinstance(err, HTTPError) and err.status == 404:

@@ -4389,4 +4381,5 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
                 thumbnails.pop(idx)
                 if ret and not write_all:
                     break
 
         return ret
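The net effect on both helpers' return value: they now return bare paths and record each file's location on the individual subtitle/thumbnail dict, leaving the final destination to MoveFilesAfterDownloadPP. Schematically (paths invented):

    # before: list of (current location, intended final location) pairs
    ret_before = [('video.en.vtt', 'Title [id].en.vtt')]
    # after: list of current locations only; 'filepath' is set on each sub_info/thumbnail dict
    ret_after = ['video.en.vtt']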
@@ -46,7 +46,7 @@
 from .utils._utils import _YDLLogger
 from .utils.networking import normalize_url
 
-CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
+CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'}
 SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}

@@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'),
             'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'),
             'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'),
+            'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'),
         }[browser_name]
 
     elif sys.platform == 'darwin':

@@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(appdata, 'Microsoft Edge'),
             'opera': os.path.join(appdata, 'com.operasoftware.Opera'),
             'vivaldi': os.path.join(appdata, 'Vivaldi'),
+            'whale': os.path.join(appdata, 'Naver/Whale'),
         }[browser_name]
 
     else:

@@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(config, 'microsoft-edge'),
             'opera': os.path.join(config, 'opera'),
             'vivaldi': os.path.join(config, 'vivaldi'),
+            'whale': os.path.join(config, 'naver-whale'),
         }[browser_name]
 
     # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE:

@@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name):
         'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium',
         'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium',
         'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome',
+        'whale': 'Whale',
     }[browser_name]
 
     browsers_without_profiles = {'opera'}
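For orientation, a standalone sketch of where the new 'whale' entries point the cookie extractor on each platform (it mirrors the dicts above; XDG_CONFIG_HOME handling and keyring lookup are omitted):

    import os
    import sys

    def whale_user_data_dir():
        # Per-platform Naver Whale profile location, as added in this commit
        if sys.platform in ('win32', 'cygwin'):
            return os.path.join(os.environ['LOCALAPPDATA'], R'Naver\Naver Whale\User Data')
        if sys.platform == 'darwin':
            return os.path.expanduser('~/Library/Application Support/Naver/Whale')
        return os.path.expanduser('~/.config/naver-whale')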
@@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'url': 'http://www.bbc.com/news/world-europe-32668511',
         'info_dict': {
             'id': 'world-europe-32668511',
-            'title': 'Russia stages massive WW2 parade',
+            'title': 'Russia stages massive WW2 parade despite Western boycott',
             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
         },
         'playlist_count': 2,
@@ -623,6 +623,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'info_dict': {
             'id': '3662a707-0af9-3149-963f-47bea720b460',
             'title': 'BUGGER',
+            'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
         },
         'playlist_count': 18,
     }, {
@@ -631,14 +632,14 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'info_dict': {
             'id': 'p02mprgb',
             'ext': 'mp4',
-            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
-            'description': 'md5:2868290467291b37feda7863f7a83f54',
+            'title': 'Germanwings crash site aerial video',
+            'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
             'duration': 47,
             'timestamp': 1427219242,
             'upload_date': '20150324',
+            'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
         },
         'params': {
             # rtmp download
             'skip_download': True,
         }
     }, {
@@ -656,21 +657,24 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'skip': 'now SIMORGH_DATA with no video',
     }, {
         # single video embedded with data-playable containing XML playlists (regional section)
         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
         'info_dict': {
-            'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
+            'id': '39275083',
+            'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
             'ext': 'mp4',
             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
-            'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
+            'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
             'timestamp': 1434713142,
             'upload_date': '20150619',
+            'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
         },
         'params': {
             'skip_download': True,
-        }
+        },
     }, {
         # single video from video playlist embedded with vxp-playlist-data JSON
         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@@ -683,22 +687,21 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'skip': '404 Not Found',
     }, {
-        # single video story with digitalData
+        # single video story with __PWA_PRELOADED_STATE__
         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
         'info_dict': {
             'id': 'p02q6gc4',
-            'ext': 'flv',
-            'title': 'Sri Lanka’s spicy secret',
-            'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
-            'timestamp': 1437674293,
-            'upload_date': '20150723',
+            'ext': 'mp4',
+            'title': 'Tasting the spice of life in Jaffna',
+            'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
+            'timestamp': 1646058397,
+            'upload_date': '20220228',
+            'duration': 255,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
     }, {
         # single video story without digitalData
         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@@ -710,12 +713,10 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'timestamp': 1415867444,
             'upload_date': '20141113',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
+        'skip': 'redirects to TopGear home page',
     }, {
         # single video embedded with Morph
+        # TODO: replacement test page
         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
         'info_dict': {
             'id': 'p041vhd0',
@@ -726,27 +727,22 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'uploader': 'BBC Sport',
             'uploader_id': 'bbc_sport',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'skip': 'Georestricted to UK',
+        'skip': 'Video no longer in page',
     }, {
-        # single video with playlist.sxml URL in playlist param
+        # single video in __INITIAL_DATA__
        'url': 'http://www.bbc.com/sport/0/football/33653409',
         'info_dict': {
             'id': 'p02xycnp',
             'ext': 'mp4',
-            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
-            'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
+            'title': 'Ronaldo to Man Utd, Arsenal to spend?',
+            'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
             'timestamp': 1437750175,
             'upload_date': '20150724',
+            'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
+            'duration': 140,
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
     }, {
-        # article with multiple videos embedded with playlist.sxml in playlist param
+        # article with multiple videos embedded with Morph.setPayload
         'url': 'http://www.bbc.com/sport/0/football/34475836',
         'info_dict': {
             'id': '34475836',
@@ -754,6 +750,21 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
         },
         'playlist_count': 3,
+    }, {
+        # Testing noplaylist
+        'url': 'http://www.bbc.com/sport/0/football/34475836',
+        'info_dict': {
+            'id': 'p034ppnv',
+            'ext': 'mp4',
+            'title': 'All you need to know about Jurgen Klopp',
+            'timestamp': 1444335081,
+            'upload_date': '20151008',
+            'duration': 122.0,
+            'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
+        },
+        'params': {
+            'noplaylist': True,
+        },
     }, {
         # school report article with single video
         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@@ -762,6 +773,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'title': 'School which breaks down barriers in Jerusalem',
         },
         'playlist_count': 1,
+        'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
     }, {
         # single video with playlist URL from weather section
         'url': 'http://www.bbc.com/weather/features/33601775',
@@ -778,18 +790,33 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'thumbnail': r're:https?://.+/.+\.jpg',
             'timestamp': 1437785037,
             'upload_date': '20150725',
+            'duration': 105,
         },
     }, {
         # video with window.__INITIAL_DATA__ and value as JSON string
         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
         'info_dict': {
-            'id': 'p0b71qth',
+            'id': 'p0b779gc',
             'ext': 'mp4',
             'title': 'Why France is making this woman a national hero',
-            'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+            'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
             'thumbnail': r're:https?://.+/.+\.jpg',
-            'timestamp': 1638230731,
-            'upload_date': '20211130',
+            'timestamp': 1638215626,
+            'upload_date': '20211129',
+            'duration': 125,
         },
+    }, {
+        # video with script id __NEXT_DATA__ and value as JSON string
+        'url': 'https://www.bbc.com/news/uk-68546268',
+        'info_dict': {
+            'id': 'p0hj0lq7',
+            'ext': 'mp4',
+            'title': 'Nasser Hospital doctor describes his treatment by IDF',
+            'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1710188248,
+            'upload_date': '20240311',
+            'duration': 104,
+        },
     }, {
         # single video article embedded with data-media-vpid
@@ -817,6 +844,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'uploader': 'Radio 3',
             'uploader_id': 'bbc_radio_three',
         },
+        'skip': '404 Not Found',
     }, {
         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
         'info_dict': {
@@ -824,6 +852,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'ext': 'mp4',
             'title': 'md5:2fabf12a726603193a2879a055f72514',
             'description': 'Learn English words and phrases from this story',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
         },
         'add_ie': [BBCCoUkIE.ie_key()],
     }, {
@@ -832,28 +861,30 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'info_dict': {
             'id': 'p07c6sb9',
             'ext': 'mp4',
-            'title': 'How positive thinking is harming your happiness',
-            'alt_title': 'The downsides of positive thinking',
-            'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+            'title': 'The downsides of positive thinking',
+            'description': 'The downsides of positive thinking',
             'duration': 235,
-            'thumbnail': r're:https?://.+/p07c9dsr.jpg',
-            'upload_date': '20190604',
-            'categories': ['Psychology'],
+            'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
+            'upload_date': '20220223',
+            'timestamp': 1645632746,
         },
     }, {
         # BBC Sounds
-        'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
+        'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
         'info_dict': {
-            'id': 'm001q789',
+            'id': 'p0hrw4nr',
             'ext': 'mp4',
-            'title': 'The Night Tracks Mix - Music for the darkling hour',
-            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
-            'chapters': 'count:8',
-            'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
-            'uploader': 'Radio 3',
-            'duration': 1800,
-            'uploader_id': 'bbc_radio_three',
-        },
+            'title': 'Are our coastlines being washed away?',
+            'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
+            'timestamp': 1713556800,
+            'upload_date': '20240419',
+            'duration': 1588,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
+            'uploader': 'World Service',
+            'uploader_id': 'bbc_world_service',
+            'series': 'CrowdScience',
+            'chapters': [],
+        }
     }, {  # onion routes
         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
         'only_matching': True,
@@ -1008,8 +1039,7 @@ def _real_extract(self, url):
             webpage, 'group id', default=None)
         if group_id:
             return self.url_result(
-                'https://www.bbc.co.uk/programmes/%s' % group_id,
-                ie=BBCCoUkIE.ie_key())
+                f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
 
         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
         programme_id = self._search_regex(
@@ -1069,83 +1099,133 @@ def _real_extract(self, url):
             }
 
-        # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
-        # There are several setPayload calls may be present but the video
-        # seems to be always related to the first one
-        morph_payload = self._parse_json(
-            self._search_regex(
-                r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
-                webpage, 'morph payload', default='{}'),
-            playlist_id, fatal=False)
+        # Several setPayload calls may be present but the video(s)
+        # should be in one that mentions leadMedia or videoData
+        morph_payload = self._search_json(
+            r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
+            contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
+            default={})
         if morph_payload:
-            components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
-            for component in components:
-                if not isinstance(component, dict):
-                    continue
-                lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
-                if not lead_media:
-                    continue
-                identifiers = lead_media.get('identifiers')
-                if not identifiers or not isinstance(identifiers, dict):
-                    continue
-                programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
+            for lead_media in traverse_obj(morph_payload, (
+                    'body', 'components', ..., 'props', 'leadMedia', {dict})):
+                programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
                 if not programme_id:
                     continue
-                title = lead_media.get('title') or self._og_search_title(webpage)
                 formats, subtitles = self._download_media_selector(programme_id)
-                description = lead_media.get('summary')
-                uploader = lead_media.get('masterBrand')
-                uploader_id = lead_media.get('mid')
-                duration = None
-                duration_d = lead_media.get('duration')
-                if isinstance(duration_d, dict):
-                    duration = parse_duration(dict_get(
-                        duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
                 return {
                     'id': programme_id,
-                    'title': title,
-                    'description': description,
-                    'duration': duration,
-                    'uploader': uploader,
-                    'uploader_id': uploader_id,
+                    'title': lead_media.get('title') or self._og_search_title(webpage),
+                    **traverse_obj(lead_media, {
+                        'description': ('summary', {str}),
+                        'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
+                        'uploader': ('masterBrand', {str}),
+                        'uploader_id': ('mid', {str}),
+                    }),
                     'formats': formats,
                     'subtitles': subtitles,
                 }
+            body = self._parse_json(traverse_obj(morph_payload, (
+                'body', 'content', 'article', 'body')), playlist_id, fatal=False)
+            for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
+                if video_data.get('vpid'):
+                    video_id = video_data['vpid']
+                    formats, subtitles = self._download_media_selector(video_id)
+                    entry = {
+                        'id': video_id,
+                        'formats': formats,
+                        'subtitles': subtitles,
+                    }
+                else:
+                    video_id = video_data['pid']
+                    entry = self.url_result(
+                        f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
+                        video_id, url_transparent=True)
+                entry.update({
+                    'timestamp': traverse_obj(morph_payload, (
+                        'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601})
+                    ),
+                    **traverse_obj(video_data, {
+                        'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
+                        'title': (('title', 'caption'), {str}, any),
+                        'duration': ('duration', {parse_duration}),
+                    }),
+                })
+                if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
+                    return entry
+                entries.append(entry)
+            if entries:
+                playlist_title = traverse_obj(morph_payload, (
+                    'body', 'content', 'article', 'headline', {str})) or playlist_title
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
 
-        preload_state = self._parse_json(self._search_regex(
-            r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
-            'preload state', default='{}'), playlist_id, fatal=False)
-        if preload_state:
-            current_programme = preload_state.get('programmes', {}).get('current') or {}
-            programme_id = current_programme.get('id')
-            if current_programme and programme_id and current_programme.get('type') == 'playable_item':
-                title = current_programme.get('titles', {}).get('tertiary') or playlist_title
-                formats, subtitles = self._download_media_selector(programme_id)
-                synopses = current_programme.get('synopses') or {}
-                network = current_programme.get('network') or {}
-                duration = int_or_none(
-                    current_programme.get('duration', {}).get('value'))
-                thumbnail = None
-                image_url = current_programme.get('image_url')
-                if image_url:
-                    thumbnail = image_url.replace('{recipe}', 'raw')
-                return {
-                    'id': programme_id,
-                    'title': title,
-                    'description': dict_get(synopses, ('long', 'medium', 'short')),
-                    'thumbnail': thumbnail,
-                    'duration': duration,
-                    'uploader': network.get('short_title'),
-                    'uploader_id': network.get('id'),
-                    'formats': formats,
-                    'subtitles': subtitles,
-                }
+        # various PRELOADED_STATE JSON
+        preload_state = self._search_json(
+            r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
+            'preload state', playlist_id, transform_source=js_to_json, default={})
+        # PRELOADED_STATE with current programmme
+        current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
+        programme_id = traverse_obj(current_programme, ('id', {str}))
+        if programme_id and current_programme.get('type') == 'playable_item':
+            title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
+            formats, subtitles = self._download_media_selector(programme_id)
+            return {
+                'id': programme_id,
+                'title': title,
+                'formats': formats,
+                **traverse_obj(current_programme, {
+                    'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                    'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
+                    'duration': ('duration', 'value', {int_or_none}),
+                    'uploader': ('network', 'short_title', {str}),
+                    'uploader_id': ('network', 'id', {str}),
+                    'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
+                    'series': ('titles', 'primary', {str}),
+                }),
+                'subtitles': subtitles,
+                'chapters': traverse_obj(preload_state, (
+                    'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
+                        'title': ('titles', {lambda x: join_nonempty(
+                            'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                        'start_time': ('offset', 'start', {float_or_none}),
+                        'end_time': ('offset', 'end', {float_or_none}),
+                    })
+                ),
+            }
+
+        # PWA_PRELOADED_STATE with article video asset
+        asset_id = traverse_obj(preload_state, (
+            'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
+            'assetVideo', 0, {str}, any))
+        if asset_id:
+            video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
+            if video_id:
+                article = traverse_obj(preload_state, (
+                    'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
+
+                def image_url(image_id):
+                    return traverse_obj(preload_state, (
+                        'entities', 'images', image_id, 'url',
+                        {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
+
+                formats, subtitles = self._download_media_selector(video_id)
+                return {
+                    'id': video_id,
+                    **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
+                        'title': ('title', {str}),
+                        'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
+                        'thumbnail': (0, {image_url}),
+                        'duration': ('duration', {int_or_none}),
+                    })),
+                    'formats': formats,
+                    'subtitles': subtitles,
+                    'chapters': traverse_obj(preload_state, (
+                        'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
+                            'title': ('titles', {lambda x: join_nonempty(
+                                'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                            'start_time': ('offset', 'start', {float_or_none}),
+                            'end_time': ('offset', 'end', {float_or_none}),
+                        })) or None,
+                    'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
+                }
+            else:
+                return self.url_result(
+                    f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
+                    asset_id, playlist_title, display_id=playlist_id,
+                    description=playlist_description)
 
         bbc3_config = self._parse_json(
             self._search_regex(
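For readers unfamiliar with traverse_obj, which this rewrite leans on heavily, a self-contained sketch of the path syntax (the payload is made up):

    from yt_dlp.utils import traverse_obj

    payload = {'body': {'components': [
        {'props': {'leadMedia': {'identifiers': {'vpid': 'p0123456'}}}},
        {'props': {}},  # components without leadMedia are skipped
    ]}}

    # `...` branches over all list items; `{dict}` keeps only dict values
    for lead_media in traverse_obj(payload, ('body', 'components', ..., 'props', 'leadMedia', {dict})):
        # a tuple of keys tries alternatives; `any` collapses the branches to the first hit
        print(traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any)))
    # -> p0123456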
@@ -1191,6 +1271,28 @@ def _real_extract(self, url):
             return self.playlist_result(
                 entries, playlist_id, playlist_title, playlist_description)
 
+        def parse_model(model):
+            """Extract single video from model structure"""
+            item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
+            if not item_id:
+                return
+            formats, subtitles = self._download_media_selector(item_id)
+            return {
+                'id': item_id,
+                'formats': formats,
+                'subtitles': subtitles,
+                **traverse_obj(model, {
+                    'title': ('title', {str}),
+                    'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+                    'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
+                    'duration': ('versions', 0, 'duration', {int}),
+                    'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
+                })
+            }
+
+        def is_type(*types):
+            return lambda _, v: v['type'] in types
+
         initial_data = self._search_regex(
             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
             'quoted preload state', default=None)
@@ -1202,6 +1304,19 @@ def _real_extract(self, url):
             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
         initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
         if initial_data:
+            for video_data in traverse_obj(initial_data, (
+                    'stores', 'article', 'articleBodyContent', is_type('video'))):
+                model = traverse_obj(video_data, (
+                    'model', 'blocks', is_type('aresMedia'),
+                    'model', 'blocks', is_type('aresMediaMetadata'),
+                    'model', {dict}, any))
+                entry = parse_model(model)
+                if entry:
+                    entries.append(entry)
+            if entries:
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
             def parse_media(media):
                 if not media:
                     return
@@ -1234,27 +1349,90 @@ def parse_media(media):
                         'subtitles': subtitles,
                         'timestamp': item_time,
                         'description': strip_or_none(item_desc),
+                        'duration': int_or_none(item.get('duration')),
                     })
-            for resp in (initial_data.get('data') or {}).values():
-                name = resp.get('name')
+
+            for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
+                name = resp['name']
                 if name == 'media-experience':
                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
                 elif name == 'article':
-                    for block in (try_get(resp,
-                                          (lambda x: x['data']['blocks'],
-                                           lambda x: x['data']['content']['model']['blocks'],),
-                                          list) or []):
-                        if block.get('type') not in ['media', 'video']:
-                            continue
-                        parse_media(block.get('model'))
+                    for block in traverse_obj(resp, (
+                            'data', (None, ('content', 'model')), 'blocks',
+                            is_type('media', 'video'), 'model', {dict})):
+                        parse_media(block)
             return self.playlist_result(
                 entries, playlist_id, playlist_title, playlist_description)
 
+        # extract from SIMORGH_DATA hydration JSON
+        simorgh_data = self._search_json(
+            r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
+            'simorgh data', playlist_id, default={})
+        if simorgh_data:
+            done = False
+            for video_data in traverse_obj(simorgh_data, (
+                    'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
+                model = traverse_obj(video_data, (
+                    'model', 'blocks', is_type('aresMedia'),
+                    'model', 'blocks', is_type('aresMediaMetadata'),
+                    'model', {dict}, any))
+                if video_data['type'] == 'video':
+                    entry = parse_model(model)
+                else:  # legacyMedia: no duration, subtitles
+                    block_id, entry = traverse_obj(model, ('blockId', {str})), None
+                    media_data = traverse_obj(simorgh_data, (
+                        'pageData', 'promo', 'media',
+                        {lambda x: x if x['id'] == block_id else None}))
+                    formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
+                        'url': ('url', {url_or_none}),
+                        'ext': ('format', {str}),
+                        'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
+                    }))
+                    if formats:
+                        entry = {
+                            'id': block_id,
+                            'display_id': playlist_id,
+                            'formats': formats,
+                            'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
+                            **traverse_obj(model, {
+                                'title': ('title', {str}),
+                                'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+                                'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                                'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
+                            }),
+                        }
+                    done = True
+                if entry:
+                    entries.append(entry)
+                if done:
+                    break
+            if entries:
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
         def extract_all(pattern):
             return list(filter(None, map(
                 lambda s: self._parse_json(s, playlist_id, fatal=False),
                 re.findall(pattern, webpage))))
 
+        # US accessed article with single embedded video (e.g.
+        # https://www.bbc.com/news/uk-68546268)
+        next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
+                                 ('props', 'pageProps', 'page'))
+        model = traverse_obj(next_data, (
+            ..., 'contents', is_type('video'),
+            'model', 'blocks', is_type('media'),
+            'model', 'blocks', is_type('mediaMetadata'),
+            'model', {dict}, any))
+        if model and (entry := parse_model(model)):
+            if not entry.get('timestamp'):
+                entry['timestamp'] = traverse_obj(next_data, (
+                    ..., 'contents', is_type('timestamp'), 'model',
+                    'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
+            entries.append(entry)
+            return self.playlist_result(
+                entries, playlist_id, playlist_title, playlist_description)
+
         # Multiple video article (e.g.
         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
@@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
         if urlh is False:
             assert not fatal
             return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
+                                             encoding=encoding, data=data)
         return (content, urlh)
 
     @staticmethod
@@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                 expected=True)
 
-    def _request_dump_filename(self, url, video_id):
-        basen = f'{video_id}_{url}'
+    def _request_dump_filename(self, url, video_id, data=None):
+        if data is not None:
+            data = hashlib.md5(data).hexdigest()
+        basen = join_nonempty(video_id, data, url, delim='_')
         trim_length = self.get_param('trim_file_name') or 240
         if len(basen) > trim_length:
             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
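A quick illustration of the new dump-file naming (values invented): POST payloads are hashed into the name, so the same URL fetched with different bodies no longer collides:

    import hashlib
    from yt_dlp.utils import join_nonempty

    video_id, url, data = 'abc123', 'https://example.com/api', b'{"page": 2}'
    digest = hashlib.md5(data).hexdigest() if data is not None else None
    print(join_nonempty(video_id, digest, url, delim='_'))
    # abc123_<md5 of body>_https://example.com/api; with data=None the digest segment is dropped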
@@ -1028,16 +1031,18 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
         except LookupError:
             return webpage_bytes.decode('utf-8', 'replace')
 
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
+                              prefix=None, encoding=None, data=None):
         webpage_bytes = urlh.read()
         if prefix is not None:
             webpage_bytes = prefix + webpage_bytes
+        url_or_request = self._create_request(url_or_request, data)
         if self.get_param('dump_intermediate_pages', False):
             self.to_screen('Dumping request to ' + urlh.url)
             dump = base64.b64encode(webpage_bytes).decode('ascii')
             self._downloader.to_screen(dump)
         if self.get_param('write_pages'):
-            filename = self._request_dump_filename(urlh.url, video_id)
+            filename = self._request_dump_filename(urlh.url, video_id, url_or_request.data)
             self.to_screen(f'Saving request to {filename}')
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)
@@ -1098,7 +1103,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
                       impersonate=None, require_impersonation=False):
             if self.get_param('load_pages'):
                 url_or_request = self._create_request(url_or_request, data, headers, query)
-                filename = self._request_dump_filename(url_or_request.url, video_id)
+                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                 self.to_screen(f'Loading request from {filename}')
                 try:
                     with open(filename, 'rb') as dumpf:
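Together the two call sites keep --write-pages and --load-pages symmetric for POST requests. A hedged round-trip sketch via the Python API (the URL is illustrative):

    import yt_dlp

    # Dump every fetched page next to the output files ...
    with yt_dlp.YoutubeDL({'write_pages': True}) as ydl:
        ydl.download(['https://example.com/some-video'])

    # ... then replay the extraction offline from those dumps
    with yt_dlp.YoutubeDL({'load_pages': True}) as ydl:
        ydl.download(['https://example.com/some-video'])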
@@ -3317,7 +3317,36 @@ def _extract_heatmap(self, data):
             'value': ('intensityScoreNormalized', {float_or_none}),
         })) or None
 
-    def _extract_comment(self, comment_renderer, parent=None):
+    def _extract_comment(self, entities, parent=None):
+        comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
+        if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))):
+            return
+
+        toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
+        time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
+
+        return {
+            'id': comment_id,
+            'parent': parent or 'root',
+            **traverse_obj(comment_entity_payload, {
+                'text': ('properties', 'content', 'content', {str}),
+                'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
+                'author_id': ('author', 'channelId', {self.ucid_or_none}),
+                'author': ('author', 'displayName', {str}),
+                'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
+                'author_is_uploader': ('author', 'isCreator', {bool}),
+                'author_is_verified': ('author', 'isVerified', {bool}),
+                'author_url': ('author', 'channelCommand', 'innertubeCommand', (
+                    ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url')
+                ), {lambda x: urljoin('https://www.youtube.com', x)}),
+            }, get_all=False),
+            'is_favorited': (None if toolbar_entity_payload is None else
+                             toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'),
+            '_time_text': time_text,  # FIXME: non-standard, but we need a way of showing that it is an estimate.
+            'timestamp': self._parse_time_text(time_text),
+        }
+
+    def _extract_comment_old(self, comment_renderer, parent=None):
         comment_id = comment_renderer.get('commentId')
         if not comment_id:
             return
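The rewritten _extract_comment consumes the batch of entity mutations that accompany a comment view model, rather than the old commentRenderer. A toy illustration of the expected shape (all values fabricated):

    entities = [
        {'entityKey': 'k1', 'payload': {'commentEntityPayload': {
            'properties': {'commentId': 'UgwXYZ', 'publishedTime': '2 days ago',
                           'content': {'content': 'Nice video!'}},
            'author': {'displayName': '@someone', 'channelId': 'UC0000000000000000000000'},
        }}},
        {'entityKey': 'k2', 'payload': {'engagementToolbarStateEntityPayload': {
            'heartState': 'TOOLBAR_HEART_STATE_HEARTED'}}},
    ]

The caller (see extract_thread below) selects this pair by matching each mutation's entityKey against the view model's commentKey/toolbarStateKey.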
@@ -3398,21 +3427,39 @@ def extract_header(contents):
                     break
             return _continuation
 
-        def extract_thread(contents):
+        def extract_thread(contents, entity_payloads):
             if not parent:
                 tracker['current_page_thread'] = 0
             for content in contents:
                 if not parent and tracker['total_parent_comments'] >= max_parents:
                     yield
                 comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
-                comment_renderer = get_first(
-                    (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
-                    expected_type=dict, default={})
 
-                comment = self._extract_comment(comment_renderer, parent)
+                # old comment format
+                if not entity_payloads:
+                    comment_renderer = get_first(
+                        (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
+                        expected_type=dict, default={})
+
+                    comment = self._extract_comment_old(comment_renderer, parent)
+
+                # new comment format
+                else:
+                    view_model = (
+                        traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
+                        or traverse_obj(content, ('commentViewModel', {dict})))
+                    comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
+                    if not comment_keys:
+                        continue
+                    entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
+                    comment = self._extract_comment(entities, parent)
+                    if comment:
+                        comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None
+
                 if not comment:
                     continue
                 comment_id = comment['id']
 
                 if comment.get('is_pinned'):
                     tracker['pinned_comment_ids'].add(comment_id)
                 # Sometimes YouTube may break and give us infinite looping comments.
@@ -3505,7 +3552,7 @@ def extract_thread(contents):
                 check_get_keys = None
                 if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
                     check_get_keys = [[*continuation_items_path, ..., (
-                        'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
+                        'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
                 try:
                     response = self._extract_response(
                         item_id=None, query=continuation,
@@ -3529,6 +3576,7 @@ def extract_thread(contents):
                     raise
                 is_forced_continuation = False
                 continuation = None
+                mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
                 for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
                     if is_first_continuation:
                         continuation = extract_header(continuation_items)
@@ -3537,7 +3585,7 @@ def extract_thread(contents):
                         break
                     continue
 
-                for entry in extract_thread(continuation_items):
+                for entry in extract_thread(continuation_items, mutations):
                     if not entry:
                         return
                     yield entry
@@ -224,4 +224,8 @@ def run(self, info):
             thumbnail_filename if converted or not self._already_have_thumbnail else None,
             original_thumbnail if converted and not self._already_have_thumbnail else None,
             info=info)
+
+        if not self._already_have_thumbnail:
+            info['thumbnails'][idx].pop('filepath', None)
+
         return [], info
@@ -662,6 +662,10 @@ def run(self, info):
         self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
         os.replace(temp_filename, filename)
 
+        if not self._already_have_subtitle:
+            for _, subtitle in subtitles.items():
+                subtitle.pop('filepath', None)
+
         files_to_delete = [] if self._already_have_subtitle else sub_filenames
         return files_to_delete, info
@@ -698,6 +702,7 @@ def run(self, info):
         infojson_filename = info.get('infojson_filename')
         options.extend(self._get_infojson_opts(info, infojson_filename))
         if not infojson_filename:
+            info.pop('infojson_filename', None)
             files_to_delete.append(info.get('infojson_filename'))
         elif self._add_infojson is True:
             self.to_screen('The info-json can only be attached to mkv/mka files')
@@ -1016,9 +1021,6 @@ def run(self, info):
                 'filepath': new_file,
             }
 
-            info['__files_to_move'][new_file] = replace_extension(
-                info['__files_to_move'][sub['filepath']], new_ext)
-
         return sub_filenames, info
@@ -1083,16 +1085,15 @@ def is_webp(cls, path):
         return imghdr.what(path) == 'webp'
 
     def fixup_webp(self, info, idx=-1):
-        thumbnail_filename = info['thumbnails'][idx]['filepath']
+        thumbnail = info['thumbnails'][idx]
+        thumbnail_filename = thumbnail['filepath']
         _, thumbnail_ext = os.path.splitext(thumbnail_filename)
         if thumbnail_ext:
             if thumbnail_ext.lower() != '.webp' and imghdr.what(thumbnail_filename) == 'webp':
                 self.to_screen('Correcting thumbnail "%s" extension to webp' % thumbnail_filename)
                 webp_filename = replace_extension(thumbnail_filename, 'webp')
                 os.replace(thumbnail_filename, webp_filename)
-                info['thumbnails'][idx]['filepath'] = webp_filename
-                info['__files_to_move'][webp_filename] = replace_extension(
-                    info['__files_to_move'].pop(thumbnail_filename), 'webp')
+                thumbnail['filepath'] = webp_filename
 
     @staticmethod
     def _options(target_ext):
@@ -1130,8 +1131,6 @@ def run(self, info):
                 continue
             thumbnail_dict['filepath'] = self.convert_thumbnail(original_thumbnail, target_ext)
             files_to_delete.append(original_thumbnail)
-            info['__files_to_move'][thumbnail_dict['filepath']] = replace_extension(
-                info['__files_to_move'][original_thumbnail], target_ext)
 
         if not has_thumbnail:
             self.to_screen('There aren\'t any thumbnails to convert')
@@ -1,16 +1,22 @@
 import os
+from pathlib import Path
 
 from .common import PostProcessor
 from ..compat import shutil
 from ..utils import (
     PostProcessingError,
-    decodeFilename,
-    encodeFilename,
     make_dir,
+    replace_extension
 )
 
 
 class MoveFilesAfterDownloadPP(PostProcessor):
+    # Map of the keys that contain moveable files and the 'type' of the file
+    # for generating the output filename
+    CHILD_KEYS = {
+        'thumbnails': 'thumbnail',
+        'requested_subtitles': 'subtitle'
+    }
 
     def __init__(self, downloader=None, downloaded=True):
         PostProcessor.__init__(self, downloader)
@@ -20,34 +26,79 @@ def __init__(self, downloader=None, downloaded=True):
     def pp_key(cls):
         return 'MoveFiles'
 
+    def move_file_and_write_to_info(self, info_dict, relevant_dict=None, output_file_type=None):
+        relevant_dict = relevant_dict or info_dict
+        if 'filepath' not in relevant_dict:
+            return
+
+        output_file_type = output_file_type or ''
+        current_filepath, final_filepath = self.determine_filepath(info_dict, relevant_dict, output_file_type)
+        move_result = self.move_file(info_dict, current_filepath, final_filepath)
+
+        if move_result:
+            relevant_dict['filepath'] = move_result
+        else:
+            del relevant_dict['filepath']
+
+    def determine_filepath(self, info_dict, relevant_dict, output_file_type):
+        current_filepath = relevant_dict['filepath']
+        prepared_filepath = self._downloader.prepare_filename(info_dict, output_file_type)
+
+        if (output_file_type == 'thumbnail' and info_dict['__multiple_thumbnails']) or output_file_type == 'subtitle':
+            desired_extension = ''.join(Path(current_filepath).suffixes[-2:])
+        else:
+            desired_extension = Path(current_filepath).suffix
+
+        return current_filepath, replace_extension(prepared_filepath, desired_extension)
+
+    def move_file(self, info_dict, current_filepath, final_filepath):
+        if not current_filepath or not final_filepath:
+            return
+
+        dl_parent_folder = os.path.split(info_dict['filepath'])[0]
+        finaldir = info_dict.get('__finaldir', os.path.abspath(dl_parent_folder))
+
+        if not os.path.isabs(current_filepath):
+            current_filepath = os.path.join(finaldir, current_filepath)
+
+        if not os.path.isabs(final_filepath):
+            final_filepath = os.path.join(finaldir, final_filepath)
+
+        if current_filepath == final_filepath:
+            return final_filepath
+
+        if not os.path.exists(current_filepath):
+            self.report_warning('File "%s" cannot be found' % current_filepath)
+            return
+
+        if os.path.exists(final_filepath):
+            if self.get_param('overwrites', True):
+                self.report_warning('Replacing existing file "%s"' % final_filepath)
+                os.remove(final_filepath)
+            else:
+                self.report_warning(
+                    'Cannot move file "%s" out of temporary directory since "%s" already exists. '
+                    % (current_filepath, final_filepath))
+                return
+
+        make_dir(final_filepath, PostProcessingError)
+        self.to_screen(f'Moving file "{current_filepath}" to "{final_filepath}"')
+        shutil.move(current_filepath, final_filepath)  # os.rename cannot move between volumes
+
+        return final_filepath
+
     def run(self, info):
-        dl_path, dl_name = os.path.split(encodeFilename(info['filepath']))
-        finaldir = info.get('__finaldir', dl_path)
-        finalpath = os.path.join(finaldir, dl_name)
         if self._downloaded:
-            info['__files_to_move'][info['filepath']] = decodeFilename(finalpath)
+            # This represents the main media file (using the 'filepath' key)
+            self.move_file_and_write_to_info(info)
 
-        make_newfilename = lambda old: decodeFilename(os.path.join(finaldir, os.path.basename(encodeFilename(old))))
-        for oldfile, newfile in info['__files_to_move'].items():
-            if not newfile:
-                newfile = make_newfilename(oldfile)
-            if os.path.abspath(encodeFilename(oldfile)) == os.path.abspath(encodeFilename(newfile)):
+        for key, output_file_type in self.CHILD_KEYS.items():
+            if key not in info:
                 continue
-            if not os.path.exists(encodeFilename(oldfile)):
-                self.report_warning('File "%s" cannot be found' % oldfile)
-                continue
-            if os.path.exists(encodeFilename(newfile)):
-                if self.get_param('overwrites', True):
-                    self.report_warning('Replacing existing file "%s"' % newfile)
-                    os.remove(encodeFilename(newfile))
-                else:
-                    self.report_warning(
-                        'Cannot move file "%s" out of temporary directory since "%s" already exists. '
-                        % (oldfile, newfile))
-                    continue
-            make_dir(newfile, PostProcessingError)
-            self.to_screen(f'Moving file "{oldfile}" to "{newfile}"')
-            shutil.move(oldfile, newfile)  # os.rename cannot move between volumes
 
-        info['filepath'] = finalpath
+            if isinstance(info[key], list) or isinstance(info[key], dict):
+                iterable = info[key].values() if isinstance(info[key], dict) else info[key]
+
+                for file_dict in iterable:
+                    self.move_file_and_write_to_info(info, file_dict, output_file_type)
+
         return [], info
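determine_filepath keys on double extensions for subtitles and multi-thumbnail downloads; a standalone illustration of the pathlib logic (filenames invented):

    from pathlib import Path

    # Subtitles carry a language tag, so the last *two* suffixes must survive the move
    print(''.join(Path('clip.en.vtt').suffixes[-2:]))  # .en.vtt
    # Plain media files keep only the final suffix
    print(Path('clip.f137.mp4').suffix)  # .mp4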
@@ -2092,7 +2092,9 @@ def prepend_extension(filename, ext, expected_real_ext=None):
 
 def replace_extension(filename, ext, expected_real_ext=None):
     name, real_ext = os.path.splitext(filename)
-    return '{}.{}'.format(
+    ext = ext if ext.startswith('.') else '.' + ext
+
+    return '{}{}'.format(
         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
         ext)
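The rewritten utility now accepts extensions with or without a leading dot; the old format string would have produced a double dot for multi-part values. Behaviour as I read it (a copy of the patched function, for illustration):

    import os

    def replace_extension(filename, ext, expected_real_ext=None):
        name, real_ext = os.path.splitext(filename)
        ext = ext if ext.startswith('.') else '.' + ext
        return '{}{}'.format(
            name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
            ext)

    print(replace_extension('video.webm', 'mp4'))      # video.mp4 (unchanged)
    print(replace_extension('video.webm', '.en.vtt'))  # video.en.vtt (previously video..en.vtt)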