diff --git a/README.md b/README.md index 37da789cf6..cdd57b024c 100644 --- a/README.md +++ b/README.md @@ -666,7 +666,7 @@ ## Filesystem Options: The name of the browser to load cookies from. Currently supported browsers are: brave, chrome, chromium, edge, firefox, - opera, safari, vivaldi. Optionally, the + opera, safari, vivaldi, whale. Optionally, the KEYRING used for decrypting Chromium cookies on Linux, the name/path of the PROFILE to load cookies from, and the CONTAINER name @@ -1760,7 +1760,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. 
By default, `ios,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. The `android` clients will always be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -1813,8 +1813,8 @@ #### tiktok * `app_name`: Default app name to use with mobile API calls, e.g. `trill` * `app_version`: Default app version to use with mobile API calls - should be set along with `manifest_app_version`, e.g. `34.1.2` * `manifest_app_version`: Default numeric app version to use with mobile API calls, e.g. `2023401020` -* `aid`: Default app ID to use with API calls, e.g. `1180` -* `app_info`: One or more app info strings in the format of `<iid>/[app_name]/[app_version]/[manifest_app_version]/[aid]`, where `iid` is the unique app install ID. `iid` is the only required value; all other values and their `/` separators can be omitted, e.g. `tiktok:app_info=1234567890123456789` or `tiktok:app_info=123,456/trill///1180,789//34.0.1/340001` +* `aid`: Default app ID to use with mobile API calls, e.g. `1180` +* `app_info`: Enable mobile API extraction with one or more app info strings in the format of `<iid>/[app_name]/[app_version]/[manifest_app_version]/[aid]`, where `iid` is the unique app install ID. 
`iid` is the only required value; all other values and their `/` separators can be omitted, e.g. `tiktok:app_info=1234567890123456789` or `tiktok:app_info=123,456/trill///1180,789//34.0.1/340001` #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index f1918df7c0..a3f6f0c964 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -3071,7 +3071,7 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions): f = formats[-1] self.report_warning( 'No subtitle format found matching "%s" for language %s, ' - 'using %s' % (formats_query, lang, f['ext'])) + 'using %s. Use --list-subs for a list of available subtitles' % (formats_query, lang, f['ext'])) subs[lang] = f return subs diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 7b8d215f03..815897d5a5 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -46,7 +46,7 @@ from .utils._utils import _YDLLogger from .utils.networking import normalize_url -CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} +CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name): 'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'), 'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'), 'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'), + 'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'), }[browser_name] elif sys.platform == 'darwin': @@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name): 'edge': os.path.join(appdata, 'Microsoft Edge'), 'opera': os.path.join(appdata, 'com.operasoftware.Opera'), 'vivaldi': os.path.join(appdata, 'Vivaldi'), + 'whale': os.path.join(appdata, 'Naver/Whale'), }[browser_name] 
else: @@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name): 'edge': os.path.join(config, 'microsoft-edge'), 'opera': os.path.join(config, 'opera'), 'vivaldi': os.path.join(config, 'vivaldi'), + 'whale': os.path.join(config, 'naver-whale'), }[browser_name] # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE: @@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name): 'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium', 'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium', 'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome', + 'whale': 'Whale', }[browser_name] browsers_without_profiles = {'opera'} @@ -347,6 +351,11 @@ def _process_chrome_cookie(decryptor, host_key, name, value, encrypted_value, pa if value is None: return is_encrypted, None + # In chrome, session cookies have expires_utc set to 0 + # In our cookie-store, cookies that do not expire should have expires set to None + if not expires_utc: + expires_utc = None + return is_encrypted, http.cookiejar.Cookie( version=0, name=name, value=value, port=None, port_specified=False, domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 1f095c932a..cf408b6828 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -288,7 +288,6 @@ from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE -from .cableav import CableAVIE from .callin import CallinIE from .caltrans import CaltransIE from .cam4 import CAM4IE @@ -548,7 +547,6 @@ EggheadLessonIE, ) from .eighttracks import EightTracksIE -from .einthusan import EinthusanIE from .eitb import EitbIE from .elementorembed import ElementorEmbedIE from .elonet import ElonetIE @@ -861,10 +859,6 @@ ) from .ixigua import IxiguaIE from .izlesene import IzleseneIE -from .jable import ( - 
JableIE, - JablePlaylistIE, -) from .jamendo import ( JamendoIE, JamendoAlbumIE, @@ -1499,7 +1493,6 @@ ) from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE -from .porn91 import Porn91IE from .pornbox import PornboxIE from .pornflip import PornFlipIE from .pornhub import ( @@ -2377,7 +2370,6 @@ ) from .xanimu import XanimuIE from .xboxclips import XboxClipsIE -from .xfileshare import XFileShareIE from .xhamster import ( XHamsterIE, XHamsterEmbedIE, @@ -2432,8 +2424,6 @@ YouNowMomentIE, ) from .youporn import YouPornIE -from .yourporn import YourPornIE -from .yourupload import YourUploadIE from .zaiko import ( ZaikoIE, ZaikoETicketIE, diff --git a/yt_dlp/extractor/alura.py b/yt_dlp/extractor/alura.py index bfe066bc68..b785c62c32 100644 --- a/yt_dlp/extractor/alura.py +++ b/yt_dlp/extractor/alura.py @@ -39,7 +39,7 @@ class AluraIE(InfoExtractor): def _real_extract(self, url): - course, video_id = self._match_valid_url(url) + course, video_id = self._match_valid_url(url).group('course_name', 'id') video_url = self._VIDEO_URL % (course, video_id) video_dict = self._download_json(video_url, video_id, 'Searching for videos') @@ -52,7 +52,7 @@ def _real_extract(self, url): formats = [] for video_obj in video_dict: - video_url_m3u8 = video_obj.get('link') + video_url_m3u8 = video_obj.get('mp4') video_format = self._extract_m3u8_formats( video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 015af9e1d6..f6b58b361f 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade', + 'title': 'Russia stages massive WW2 parade despite Western boycott', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 
'playlist_count': 2, @@ -623,6 +623,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', 'title': 'BUGGER', + 'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$', }, 'playlist_count': 18, }, { @@ -631,14 +632,14 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': 'p02mprgb', 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'md5:2868290467291b37feda7863f7a83f54', + 'title': 'Germanwings crash site aerial video', + 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$', 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', + 'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg', }, 'params': { - # rtmp download 'skip_download': True, } }, { @@ -656,21 +657,24 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + 'skip': 'now SIMORGH_DATA with no video', }, { # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'info_dict': { - 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', + 'id': '39275083', + 'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', - 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', + 'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', 'timestamp': 1434713142, 'upload_date': '20150619', + 'thumbnail': 
'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg', }, 'params': { 'skip_download': True, - } + }, }, { # single video from video playlist embedded with vxp-playlist-data JSON 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', @@ -683,22 +687,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + 'skip': '404 Not Found', }, { - # single video story with digitalData + # single video story with __PWA_PRELOADED_STATE__ 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', 'info_dict': { 'id': 'p02q6gc4', - 'ext': 'flv', - 'title': 'Sri Lanka’s spicy secret', - 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', - 'timestamp': 1437674293, - 'upload_date': '20150723', + 'ext': 'mp4', + 'title': 'Tasting the spice of life in Jaffna', + 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$', + 'timestamp': 1646058397, + 'upload_date': '20220228', + 'duration': 255, + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg', }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { # single video story without digitalData 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', @@ -710,12 +713,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'timestamp': 1415867444, 'upload_date': '20141113', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'skip': 'redirects to TopGear home page', }, { # single video embedded with Morph + # TODO: replacement test page 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', 'info_dict': { 'id': 'p041vhd0', @@ -726,27 +727,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'uploader': 'BBC Sport', 'uploader_id': 
'bbc_sport', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Georestricted to UK', + 'skip': 'Video no longer in page', }, { - # single video with playlist.sxml URL in playlist param + # single video in __INITIAL_DATA__ 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', + 'title': 'Ronaldo to Man Utd, Arsenal to spend?', + 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$', + 'timestamp': 1437750175, + 'upload_date': '20150724', + 'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', 'duration': 140, }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { - # article with multiple videos embedded with playlist.sxml in playlist param + # article with multiple videos embedded with Morph.setPayload 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': '34475836', @@ -754,6 +750,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', }, 'playlist_count': 3, + }, { + # Testing noplaylist + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': 'p034ppnv', + 'ext': 'mp4', + 'title': 'All you need to know about Jurgen Klopp', + 'timestamp': 1444335081, + 'upload_date': '20151008', + 'duration': 122.0, + 'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg', + }, + 'params': { + 'noplaylist': True, + }, }, { # school report article with single video 'url': 'http://www.bbc.co.uk/schoolreport/35744779', @@ -762,6 +773,7 @@ 
class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'title': 'School which breaks down barriers in Jerusalem', }, 'playlist_count': 1, + 'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt', }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', @@ -778,18 +790,33 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1437785037, 'upload_date': '20150725', + 'duration': 105, }, }, { # video with window.__INITIAL_DATA__ and value as JSON string 'url': 'https://www.bbc.com/news/av/world-europe-59468682', 'info_dict': { - 'id': 'p0b71qth', + 'id': 'p0b779gc', 'ext': 'mp4', 'title': 'Why France is making this woman a national hero', - 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.', 'thumbnail': r're:https?://.+/.+\.jpg', - 'timestamp': 1638230731, - 'upload_date': '20211130', + 'timestamp': 1638215626, + 'upload_date': '20211129', + 'duration': 125, + }, + }, { + # video with script id __NEXT_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/uk-68546268', + 'info_dict': { + 'id': 'p0hj0lq7', + 'ext': 'mp4', + 'title': 'Nasser Hospital doctor describes his treatment by IDF', + 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1710188248, + 'upload_date': '20240311', + 'duration': 104, }, }, { # single video article embedded with data-media-vpid @@ -817,6 +844,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'uploader': 'Radio 3', 'uploader_id': 'bbc_radio_three', }, + 'skip': '404 Not Found', }, { 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', 'info_dict': { @@ -824,6 +852,7 @@ 
class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'ext': 'mp4', 'title': 'md5:2fabf12a726603193a2879a055f72514', 'description': 'Learn English words and phrases from this story', + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg', }, 'add_ie': [BBCCoUkIE.ie_key()], }, { @@ -832,28 +861,30 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': 'p07c6sb9', 'ext': 'mp4', - 'title': 'How positive thinking is harming your happiness', - 'alt_title': 'The downsides of positive thinking', - 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', + 'title': 'The downsides of positive thinking', + 'description': 'The downsides of positive thinking', 'duration': 235, - 'thumbnail': r're:https?://.+/p07c9dsr.jpg', - 'upload_date': '20190604', - 'categories': ['Psychology'], + 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)', + 'upload_date': '20220223', + 'timestamp': 1645632746, }, }, { # BBC Sounds - 'url': 'https://www.bbc.co.uk/sounds/play/m001q78b', + 'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx', 'info_dict': { - 'id': 'm001q789', + 'id': 'p0hrw4nr', 'ext': 'mp4', - 'title': 'The Night Tracks Mix - Music for the darkling hour', - 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg', - 'chapters': 'count:8', - 'description': 'md5:815fb51cbdaa270040aab8145b3f1d67', - 'uploader': 'Radio 3', - 'duration': 1800, - 'uploader_id': 'bbc_radio_three', - }, + 'title': 'Are our coastlines being washed away?', + 'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$', + 'timestamp': 1713556800, + 'upload_date': '20240419', + 'duration': 1588, + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg', + 'uploader': 'World Service', + 'uploader_id': 'bbc_world_service', + 'series': 'CrowdScience', + 'chapters': [], + } }, { # onion routes 'url': 
'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', 'only_matching': True, @@ -1008,8 +1039,7 @@ def _real_extract(self, url): webpage, 'group id', default=None) if group_id: return self.url_result( - 'https://www.bbc.co.uk/programmes/%s' % group_id, - ie=BBCCoUkIE.ie_key()) + f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( @@ -1069,83 +1099,133 @@ def _real_extract(self, url): } # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) - # There are several setPayload calls may be present but the video - # seems to be always related to the first one - morph_payload = self._parse_json( - self._search_regex( - r'Morph\.setPayload\([^,]+,\s*({.+?})\);', - webpage, 'morph payload', default='{}'), - playlist_id, fatal=False) + # Several setPayload calls may be present but the video(s) + # should be in one that mentions leadMedia or videoData + morph_payload = self._search_json( + r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id, + contains_pattern=r'{(?s:(?:(?!).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}', + default={}) if morph_payload: - components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] - for component in components: - if not isinstance(component, dict): - continue - lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) - if not lead_media: - continue - identifiers = lead_media.get('identifiers') - if not identifiers or not isinstance(identifiers, dict): - continue - programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + for lead_media in traverse_obj(morph_payload, ( + 'body', 'components', ..., 'props', 'leadMedia', {dict})): + programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any)) if not programme_id: continue - 
title = lead_media.get('title') or self._og_search_title(webpage) formats, subtitles = self._download_media_selector(programme_id) - description = lead_media.get('summary') - uploader = lead_media.get('masterBrand') - uploader_id = lead_media.get('mid') - duration = None - duration_d = lead_media.get('duration') - if isinstance(duration_d, dict): - duration = parse_duration(dict_get( - duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) return { 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'title': lead_media.get('title') or self._og_search_title(webpage), + **traverse_obj(lead_media, { + 'description': ('summary', {str}), + 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}), + 'uploader': ('masterBrand', {str}), + 'uploader_id': ('mid', {str}), + }), 'formats': formats, 'subtitles': subtitles, } + body = self._parse_json(traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'body')), playlist_id, fatal=False) + for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')): + if video_data.get('vpid'): + video_id = video_data['vpid'] + formats, subtitles = self._download_media_selector(video_id) + entry = { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + else: + video_id = video_data['pid'] + entry = self.url_result( + f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE, + video_id, url_transparent=True) + entry.update({ + 'timestamp': traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}) + ), + **traverse_obj(video_data, { + 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any), + 'title': (('title', 'caption'), {str}, any), + 'duration': ('duration', {parse_duration}), + }), + }) + if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id): + return entry + 
entries.append(entry) + if entries: + playlist_title = traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'headline', {str})) or playlist_title + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) - preload_state = self._parse_json(self._search_regex( - r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) - if preload_state: - current_programme = preload_state.get('programmes', {}).get('current') or {} - programme_id = current_programme.get('id') - if current_programme and programme_id and current_programme.get('type') == 'playable_item': - title = current_programme.get('titles', {}).get('tertiary') or playlist_title - formats, subtitles = self._download_media_selector(programme_id) - synopses = current_programme.get('synopses') or {} - network = current_programme.get('network') or {} - duration = int_or_none( - current_programme.get('duration', {}).get('value')) - thumbnail = None - image_url = current_programme.get('image_url') - if image_url: - thumbnail = image_url.replace('{recipe}', 'raw') + # various PRELOADED_STATE JSON + preload_state = self._search_json( + r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage, + 'preload state', playlist_id, transform_source=js_to_json, default={}) + # PRELOADED_STATE with current programme + current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict})) + programme_id = traverse_obj(current_programme, ('id', {str})) + if programme_id and current_programme.get('type') == 'playable_item': + title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + return { + 'id': programme_id, + 'title': title, + 'formats': formats, + **traverse_obj(current_programme, { + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), + 'thumbnail': ('image_url', {lambda u: 
url_or_none(u.replace('{recipe}', 'raw'))}), + 'duration': ('duration', 'value', {int_or_none}), + 'uploader': ('network', 'short_title', {str}), + 'uploader_id': ('network', 'id', {str}), + 'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any), + 'series': ('titles', 'primary', {str}), + }), + 'subtitles': subtitles, + 'chapters': traverse_obj(preload_state, ( + 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), { + 'title': ('titles', {lambda x: join_nonempty( + 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), + 'start_time': ('offset', 'start', {float_or_none}), + 'end_time': ('offset', 'end', {float_or_none}), + }) + ), + } + + # PWA_PRELOADED_STATE with article video asset + asset_id = traverse_obj(preload_state, ( + 'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id, + 'assetVideo', 0, {str}, any)) + if asset_id: + video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str})) + if video_id: + article = traverse_obj(preload_state, ( + 'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any)) + + def image_url(image_id): + return traverse_obj(preload_state, ( + 'entities', 'images', image_id, 'url', + {lambda u: url_or_none(u.replace('$recipe', 'raw'))})) + + formats, subtitles = self._download_media_selector(video_id) return { - 'id': programme_id, - 'title': title, - 'description': dict_get(synopses, ('long', 'medium', 'short')), - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': network.get('short_title'), - 'uploader_id': network.get('id'), + 'id': video_id, + **traverse_obj(preload_state, ('entities', 'videos', asset_id, { + 'title': ('title', {str}), + 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any), + 'thumbnail': (0, {image_url}), + 'duration': ('duration', {int_or_none}), + })), 'formats': formats, 'subtitles': subtitles, - 'chapters': traverse_obj(preload_state, ( - 'tracklist', 
'tracks', lambda _, v: float_or_none(v['offset']['start']), { - 'title': ('titles', {lambda x: join_nonempty( - 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), - 'start_time': ('offset', 'start', {float_or_none}), - 'end_time': ('offset', 'end', {float_or_none}), - })) or None, + 'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})), } + else: + return self.url_result( + f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE, + asset_id, playlist_title, display_id=playlist_id, + description=playlist_description) bbc3_config = self._parse_json( self._search_regex( @@ -1191,6 +1271,28 @@ def _real_extract(self, url): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + def parse_model(model): + """Extract single video from model structure""" + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) + if not item_id: + return + formats, subtitles = self._download_media_selector(item_id) + return { + 'id': item_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any), + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}), + }) + } + + def is_type(*types): + return lambda _, v: v['type'] in types + initial_data = self._search_regex( r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, 'quoted preload state', default=None) @@ -1202,6 +1304,19 @@ def _real_extract(self, url): initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: + for video_data in traverse_obj(initial_data, ( + 'stores', 'article', 'articleBodyContent', is_type('video'))): + model = 
traverse_obj(video_data, ( + 'model', 'blocks', is_type('aresMedia'), + 'model', 'blocks', is_type('aresMediaMetadata'), + 'model', {dict}, any)) + entry = parse_model(model) + if entry: + entries.append(entry) + if entries: + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def parse_media(media): if not media: return @@ -1234,27 +1349,90 @@ def parse_media(media): 'subtitles': subtitles, 'timestamp': item_time, 'description': strip_or_none(item_desc), + 'duration': int_or_none(item.get('duration')), }) - for resp in (initial_data.get('data') or {}).values(): - name = resp.get('name') + + for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])): + name = resp['name'] if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, - (lambda x: x['data']['blocks'], - lambda x: x['data']['content']['model']['blocks'],), - list) or []): - if block.get('type') not in ['media', 'video']: - continue - parse_media(block.get('model')) + for block in traverse_obj(resp, ( + 'data', (None, ('content', 'model')), 'blocks', + is_type('media', 'video'), 'model', {dict})): + parse_media(block) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + # extract from SIMORGH_DATA hydration JSON + simorgh_data = self._search_json( + r'window\s*\.\s*SIMORGH_DATA\s*=', webpage, + 'simorgh data', playlist_id, default={}) + if simorgh_data: + done = False + for video_data in traverse_obj(simorgh_data, ( + 'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))): + model = traverse_obj(video_data, ( + 'model', 'blocks', is_type('aresMedia'), + 'model', 'blocks', is_type('aresMediaMetadata'), + 'model', {dict}, any)) + if video_data['type'] == 'video': + entry = parse_model(model) + else: # legacyMedia: no duration, subtitles + block_id, entry = traverse_obj(model, ('blockId', 
{str})), None + media_data = traverse_obj(simorgh_data, ( + 'pageData', 'promo', 'media', + {lambda x: x if x['id'] == block_id else None})) + formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), { + 'url': ('url', {url_or_none}), + 'ext': ('format', {str}), + 'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + })) + if formats: + entry = { + 'id': block_id, + 'display_id': playlist_id, + 'formats': formats, + 'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})), + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), + 'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}), + }), + } + done = True + if entry: + entries.append(entry) + if done: + break + if entries: + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) + # US accessed article with single embedded video (e.g. + # https://www.bbc.com/news/uk-68546268) + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), + ('props', 'pageProps', 'page')) + model = traverse_obj(next_data, ( + ..., 'contents', is_type('video'), + 'model', 'blocks', is_type('media'), + 'model', 'blocks', is_type('mediaMetadata'), + 'model', {dict}, any)) + if model and (entry := parse_model(model)): + if not entry.get('timestamp'): + entry['timestamp'] = traverse_obj(next_data, ( + ..., 'contents', is_type('timestamp'), 'model', + 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) + entries.append(entry) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + # Multiple video article (e.g. 
# http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index df34700033..b38c90b1d1 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1045,7 +1045,8 @@ def fetch_page(page_idx): try: response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search', - playlist_id, note=f'Downloading page {page_idx}', query=query) + playlist_id, note=f'Downloading page {page_idx}', query=query, + headers={'referer': url}) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError( diff --git a/yt_dlp/extractor/cableav.py b/yt_dlp/extractor/cableav.py deleted file mode 100644 index 4a221414ea..0000000000 --- a/yt_dlp/extractor/cableav.py +++ /dev/null @@ -1,32 +0,0 @@ -from .common import InfoExtractor - - -class CableAVIE(InfoExtractor): - _VALID_URL = r'https?://cableav\.tv/(?P[a-zA-Z0-9]+)' - _TESTS = [{ - 'url': 'https://cableav.tv/lS4iR9lWjN8/', - 'md5': '7e3fe5e49d61c4233b7f5b0f69b15e18', - 'info_dict': { - 'id': 'lS4iR9lWjN8', - 'ext': 'mp4', - 'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV', - 'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = self._og_search_video_url(webpage, secure=False) - - formats = self._extract_m3u8_formats(video_url, video_id, 'mp4') - - return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'formats': formats, - } diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 90b4d082e2..0a5a524c16 100644 --- a/yt_dlp/extractor/cda.py +++ 
b/yt_dlp/extractor/cda.py @@ -16,7 +16,6 @@ merge_dicts, multipart_encode, parse_duration, - random_birthday, traverse_obj, try_call, try_get, @@ -63,38 +62,57 @@ class CDAIE(InfoExtractor): 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'crash404', - 'view_count': int, 'average_rating': float, 'duration': 137, 'age_limit': 0, + 'upload_date': '20160220', + 'timestamp': 1455968218, } }, { - # Age-restricted - 'url': 'http://www.cda.pl/video/1273454c4', + # Age-restricted with vfilm redirection + 'url': 'https://www.cda.pl/video/8753244c4', + 'md5': 'd8eeb83d63611289507010d3df3bb8b3', 'info_dict': { - 'id': '1273454c4', + 'id': '8753244c4', 'ext': 'mp4', - 'title': 'Bronson (2008) napisy HD 1080p', - 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?', + 'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e', 'height': 1080, - 'uploader': 'boniek61', + 'uploader': 'arhn eu', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 5554, + 'duration': 991, 'age_limit': 18, - 'view_count': int, 'average_rating': float, - }, + 'timestamp': 1633888264, + 'upload_date': '20211010', + } + }, { + # Age-restricted without vfilm redirection + 'url': 'https://www.cda.pl/video/17028157b8', + 'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992', + 'info_dict': { + 'id': '17028157b8', + 'ext': 'mp4', + 'title': 'STENDUPY MICHAŁ OGIŃSKI', + 'description': 'md5:5851f3272bfc31f762d616040a1d609a', + 'height': 480, + 'uploader': 'oginski', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 18855, + 'age_limit': 18, + 'average_rating': float, + 'timestamp': 1699705901, + 'upload_date': '20231111', + } }, { 'url': 'http://ebd.cda.pl/0x0/5749950c', 'only_matching': True, }] def _download_age_confirm_page(self, url, video_id, *args, **kwargs): - form_data = random_birthday('rok', 'miesiac', 'dzien') - form_data.update({'return': url, 'module': 'video', 
'module_id': video_id}) - data, content_type = multipart_encode(form_data) + data, content_type = multipart_encode({'age_confirm': ''}) return self._download_webpage( - urljoin(url, '/a/validatebirth'), video_id, *args, + url, video_id, *args, data=data, headers={ 'Referer': url, 'Content-Type': content_type, @@ -164,7 +182,7 @@ def _real_extract(self, url): if 'Authorization' in self._API_HEADERS: return self._api_extract(video_id) else: - return self._web_extract(video_id, url) + return self._web_extract(video_id) def _api_extract(self, video_id): meta = self._download_json( @@ -197,9 +215,9 @@ def _api_extract(self, video_id): 'view_count': meta.get('views'), } - def _web_extract(self, video_id, url): + def _web_extract(self, video_id): self._set_cookie('cda.pl', 'cda.player', 'html5') - webpage = self._download_webpage( + webpage, urlh = self._download_webpage_handle( f'{self._BASE_URL}/video/{video_id}/vfilm', video_id) if 'Ten film jest dostępny dla użytkowników premium' in webpage: @@ -209,10 +227,10 @@ def _web_extract(self, video_id, url): self.raise_geo_restricted() need_confirm_age = False - if self._html_search_regex(r'(]+action="[^"]*/a/validatebirth[^"]*")', + if self._html_search_regex(r'(]+name="[^"]*age_confirm[^"]*")', webpage, 'birthday validate form', default=None): webpage = self._download_age_confirm_page( - url, video_id, note='Confirming age') + urlh.url, video_id, note='Confirming age') need_confirm_age = True formats = [] @@ -222,9 +240,6 @@ def _web_extract(self, video_id, url): (?:<\1[^>]*>[^<]*|(?!)(?:.|\n))*? 
<(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P[^<]+) ''', webpage, 'uploader', default=None, group='uploader') - view_count = self._search_regex( - r'Odsłony:(?:\s| )*([0-9]+)', webpage, - 'view_count', default=None) average_rating = self._search_regex( (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P[0-9.]+)', r']+\bclass=["\']rating["\'][^>]*>(?P[0-9.]+)'), webpage, 'rating', fatal=False, @@ -235,7 +250,6 @@ def _web_extract(self, video_id, url): 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'uploader': uploader, - 'view_count': int_or_none(view_count), 'average_rating': float_or_none(average_rating), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index bebbc6b43f..e232aa883a 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote= if urlh is False: assert not fatal return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data) return (content, urlh) @staticmethod @@ -1005,8 +1006,10 @@ def __check_blocked(self, content): 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', expected=True) - def _request_dump_filename(self, url, video_id): - basen = f'{video_id}_{url}' + def _request_dump_filename(self, url, video_id, data=None): + if data is not None: + data = hashlib.md5(data).hexdigest() + basen = join_nonempty(video_id, data, url, delim='_') trim_length = self.get_param('trim_file_name') or 240 if len(basen) > trim_length: h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() @@ -1028,16 +1031,18 @@ def __decode_webpage(self, webpage_bytes, encoding, headers): except LookupError: return 
webpage_bytes.decode('utf-8', 'replace') - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, + prefix=None, encoding=None, data=None): webpage_bytes = urlh.read() if prefix is not None: webpage_bytes = prefix + webpage_bytes + url_or_request = self._create_request(url_or_request, data) if self.get_param('dump_intermediate_pages', False): self.to_screen('Dumping request to ' + urlh.url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self.get_param('write_pages'): - filename = self._request_dump_filename(urlh.url, video_id) + filename = self._request_dump_filename(urlh.url, video_id, url_or_request.data) self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -1098,7 +1103,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote, impersonate=None, require_impersonation=False): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) - filename = self._request_dump_filename(url_or_request.url, video_id) + filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data) self.to_screen(f'Loading request from {filename}') try: with open(filename, 'rb') as dumpf: diff --git a/yt_dlp/extractor/einthusan.py b/yt_dlp/extractor/einthusan.py deleted file mode 100644 index 53bc2535d0..0000000000 --- a/yt_dlp/extractor/einthusan.py +++ /dev/null @@ -1,105 +0,0 @@ -import json - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_str, - compat_urlparse, -) -from ..utils import ( - extract_attributes, - ExtractorError, - get_elements_by_class, - urlencode_postdata, -) - - -class EinthusanIE(InfoExtractor): - _VALID_URL = 
r'https?://(?Peinthusan\.(?:tv|com|ca))/movie/watch/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://einthusan.tv/movie/watch/9097/', - 'md5': 'ff0f7f2065031b8a2cf13a933731c035', - 'info_dict': { - 'id': '9097', - 'ext': 'mp4', - 'title': 'Ae Dil Hai Mushkil', - 'description': 'md5:33ef934c82a671a94652a9b4e54d931b', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'url': 'https://einthusan.tv/movie/watch/51MZ/?lang=hindi', - 'only_matching': True, - }, { - 'url': 'https://einthusan.com/movie/watch/9097/', - 'only_matching': True, - }, { - 'url': 'https://einthusan.ca/movie/watch/4E9n/?lang=hindi', - 'only_matching': True, - }] - - # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js - def _decrypt(self, encrypted_data, video_id): - return self._parse_json(compat_b64decode(( - encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1] - )).decode('utf-8'), video_id) - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - host = mobj.group('host') - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex(r'

([^<]+)

', webpage, 'title') - - player_params = extract_attributes(self._search_regex( - r'(]+id="UIVideoPlayer"[^>]+>)', webpage, 'player parameters')) - - page_id = self._html_search_regex( - ']+data-pageid="([^"]+)"', webpage, 'page ID') - video_data = self._download_json( - 'https://%s/ajax/movie/watch/%s/' % (host, video_id), video_id, - data=urlencode_postdata({ - 'xEvent': 'UIVideoPlayer.PingOutcome', - 'xJson': json.dumps({ - 'EJOutcomes': player_params['data-ejpingables'], - 'NativeHLS': False - }), - 'arcVersion': 3, - 'appVersion': 59, - 'gorilla.csrf.Token': page_id, - }))['Data'] - - if isinstance(video_data, compat_str) and video_data.startswith('/ratelimited/'): - raise ExtractorError( - 'Download rate reached. Please try again later.', expected=True) - - ej_links = self._decrypt(video_data['EJLinks'], video_id) - - formats = [] - - m3u8_url = ej_links.get('HLSLink') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native')) - - mp4_url = ej_links.get('MP4Link') - if mp4_url: - formats.append({ - 'url': mp4_url, - }) - - description = get_elements_by_class('synopsis', webpage)[0] - thumbnail = self._html_search_regex( - r''']+src=(["'])(?P(?!\1).+?/moviecovers/(?!\1).+?)\1''', - webpage, 'thumbnail url', fatal=False, group='url') - if thumbnail is not None: - thumbnail = compat_urlparse.urljoin(url, thumbnail) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - } diff --git a/yt_dlp/extractor/eplus.py b/yt_dlp/extractor/eplus.py index 88a8d5a949..d2ad5b441e 100644 --- a/yt_dlp/extractor/eplus.py +++ b/yt_dlp/extractor/eplus.py @@ -16,13 +16,31 @@ class EplusIbIE(InfoExtractor): _VALID_URL = [r'https?://live\.eplus\.jp/ex/player\?ib=(?P(?:\w|%2B|%2F){86}%3D%3D)', r'https?://live\.eplus\.jp/(?Psample|\d+)'] _TESTS = [{ - 'url': 
'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D', + 'url': 'https://live.eplus.jp/ex/player?ib=41K6Wzbr3PlcMD%2FOKHFlC%2FcZCe2Eaw7FK%2BpJS1ooUHki8d0vGSy2mYqxillQBe1dSnOxU%2B8%2FzXKls4XPBSb3vw%3D%3D', 'info_dict': { - 'id': '354502-0001-002', - 'title': 'LoveLive!Series Presents COUNTDOWN LoveLive! 2021→2022~LIVE with a smile!~【Streaming+(配信)】', + 'id': '335699-0001-006', + 'title': '少女☆歌劇 レヴュースタァライト -The LIVE 青嵐- BLUE GLITTER <定点映像配信>【Streaming+(配信)】', 'live_status': 'was_live', - 'release_date': '20211231', - 'release_timestamp': 1640952000, + 'release_date': '20201221', + 'release_timestamp': 1608544800, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'This event may not be accessible', + 'No video formats found', + 'Requested format is not available', + ], + }, { + 'url': 'https://live.eplus.jp/ex/player?ib=6QSsQdyRAwOFZrEHWlhRm7vocgV%2FO0YzBZ%2BaBEBg1XR%2FmbLn0R%2F048dUoAY038%2F%2F92MJ73BsoAtvUpbV6RLtDQ%3D%3D&show_id=2371511', + 'info_dict': { + 'id': '348021-0054-001', + 'title': 'ラブライブ!スーパースター!! Liella! First LoveLive! 
Tour ~Starlines~【東京/DAY.1】', + 'live_status': 'was_live', + 'release_date': '20220115', + 'release_timestamp': 1642233600, 'description': str, }, 'params': { @@ -124,6 +142,10 @@ def _real_extract(self, url): if data_json.get('drm_mode') == 'ON': self.report_drm(video_id) + if data_json.get('is_pass_ticket') == 'YES': + raise ExtractorError( + 'This URL is for a pass ticket instead of a player page', expected=True) + delivery_status = data_json.get('delivery_status') archive_mode = data_json.get('archive_mode') release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400) diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 191a4361a2..29dfc8ae95 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -94,13 +94,14 @@ def get_item(type_, preference): class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://multimedia\.europarl\.europa\.eu/[^/#?]+/ - (?:(?!video)[^/#?]+/[\w-]+_)(?P[\w-]+) + https?://multimedia\.europarl\.europa\.eu/ + (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P[\w-]+) ''' _TESTS = [{ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', + 'display_id': '20220914-0900-PLENARY', 'ext': 'mp4', 'title': 'Plenary session', 'release_timestamp': 1663139069, @@ -125,6 +126,7 @@ class EuroParlWebstreamIE(InfoExtractor): 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', 'info_dict': { 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7', + 'display_id': '20230301-1130-COMMITTEE-CULT', 'ext': 'mp4', 'release_date': '20230301', 'title': 'Committee on Culture and Education', @@ -142,6 +144,19 @@ class EuroParlWebstreamIE(InfoExtractor): 'live_status': 'is_live', }, 'skip': 'Not live anymore' + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20240320-1345-SPECIAL-PRESSER', + 
'info_dict': { + 'id': 'c1f11567-5b52-470a-f3e1-08dc3c216ace', + 'display_id': '20240320-1345-SPECIAL-PRESSER', + 'ext': 'mp4', + 'release_date': '20240320', + 'title': 'md5:7c6c814cac55dea5e2d87bf8d3db2234', + 'release_timestamp': 1710939767, + } + }, { + 'url': 'https://multimedia.europarl.europa.eu/webstreaming/briefing-for-media-on-2024-european-elections_20240429-1000-SPECIAL-OTHER', + 'only_matching': True, }] def _real_extract(self, url): @@ -166,6 +181,7 @@ def _real_extract(self, url): return { 'id': json_info['id'], + 'display_id': display_id, 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False), 'formats': formats, 'subtitles': subtitles, diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 06658dd479..c19192cfac 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -1,9 +1,11 @@ import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_parse_qs from ..utils import ( ExtractorError, + bug_reports_message, determine_ext, extract_attributes, get_element_by_class, @@ -38,6 +40,17 @@ class GoogleDriveIE(InfoExtractor): 'duration': 45, 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', } + }, { + # has itag 50 which is not in YoutubeIE._formats (royalty Free music from 1922) + 'url': 'https://drive.google.com/uc?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', + 'md5': '322db8d63dd19788c04050a4bba67073', + 'info_dict': { + 'id': '1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', + 'ext': 'mp3', + 'title': 'My Buddy - Henry Burr - Gus Kahn - Walter Donaldson.mp3', + 'duration': 184, + 'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', + }, }, { # video can't be watched anonymously due to view count limit reached, # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) @@ -58,22 +71,8 @@ class GoogleDriveIE(InfoExtractor): 'only_matching': True, 
}] _FORMATS_EXT = { - '5': 'flv', - '6': 'flv', - '13': '3gp', - '17': '3gp', - '18': 'mp4', - '22': 'mp4', - '34': 'flv', - '35': 'flv', - '36': '3gp', - '37': 'mp4', - '38': 'mp4', - '43': 'webm', - '44': 'webm', - '45': 'webm', - '46': 'webm', - '59': 'mp4', + **{k: v['ext'] for k, v in YoutubeIE._formats.items() if v.get('ext')}, + '50': 'm4a', } _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' _CAPTIONS_ENTRY_TAG = { @@ -194,10 +193,13 @@ def get_value(key): if len(fmt_stream_split) < 2: continue format_id, format_url = fmt_stream_split[:2] + ext = self._FORMATS_EXT.get(format_id) + if not ext: + self.report_warning(f'Unknown format {format_id}{bug_reports_message()}') f = { 'url': lowercase_escape(format_url), 'format_id': format_id, - 'ext': self._FORMATS_EXT[format_id], + 'ext': ext, } resolution = resolutions.get(format_id) if resolution: diff --git a/yt_dlp/extractor/hytale.py b/yt_dlp/extractor/hytale.py index 0f4dcc309b..e8cd21a648 100644 --- a/yt_dlp/extractor/hytale.py +++ b/yt_dlp/extractor/hytale.py @@ -1,7 +1,8 @@ import re +from .cloudflarestream import CloudflareStreamIE from .common import InfoExtractor -from ..utils import traverse_obj +from ..utils.traversal import traverse_obj class HytaleIE(InfoExtractor): @@ -49,7 +50,7 @@ def _real_extract(self, url): entries = [ self.url_result( f'https://cloudflarestream.com/{video_hash}/manifest/video.mpd?parentOrigin=https%3A%2F%2Fhytale.com', - title=self._titles.get(video_hash), url_transparent=True) + CloudflareStreamIE, title=self._titles.get(video_hash), url_transparent=True) for video_hash in re.findall( r'[\w-]+)' - _TESTS = [{ - 'url': 'https://jable.tv/videos/pppd-812/', - 'md5': 'f1537283a9bc073c31ff86ca35d9b2a6', - 'info_dict': { - 'id': 'pppd-812', - 'ext': 'mp4', - 'title': 'PPPD-812 只要表現好巨乳女教師吉根柚莉愛就獎勵學生們在白虎穴內射出精液', - 'description': 'md5:5b6d4199a854f62c5e56e26ccad19967', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - 'like_count': int, - 'view_count': int, - }, - 
}, { - 'url': 'https://jable.tv/videos/apak-220/', - 'md5': '71f9239d69ced58ab74a816908847cc1', - 'info_dict': { - 'id': 'apak-220', - 'ext': 'mp4', - 'title': 'md5:5c3861b7cf80112a6e2b70bccf170824', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - 'like_count': int, - 'view_count': int, - 'upload_date': '20220319', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - formats = self._extract_m3u8_formats( - self._search_regex(r'var\s+hlsUrl\s*=\s*\'([^\']+)', webpage, 'hls_url'), video_id, 'mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage, default=''), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'formats': formats, - 'age_limit': 18, - 'upload_date': unified_strdate(self._search_regex( - r'class="inactive-color">\D+\s+(\d{4}-\d+-\d+)', webpage, 'upload_date', default=None)), - 'view_count': int_or_none(self._search_regex( - r'#icon-eye">\n*([\d ]+)', - webpage, 'view_count', default='').replace(' ', '')), - 'like_count': int_or_none(self._search_regex( - r'#icon-heart">(\d+)', webpage, 'link_count', default=None)), - } - - -class JablePlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?jable\.tv/(?:categories|models|tags)/(?P[\w-]+)' - _TESTS = [{ - 'url': 'https://jable.tv/models/kaede-karen/', - 'info_dict': { - 'id': 'kaede-karen', - 'title': '楓カレン', - }, - 'playlist_count': 34, - }, { - 'url': 'https://jable.tv/categories/roleplay/', - 'only_matching': True, - }, { - 'url': 'https://jable.tv/tags/girl/', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - - def page_func(page_num): - return [ - self.url_result(player_url, JableIE) - for player_url in orderedSet(re.findall( - r'href="(https://jable.tv/videos/[\w-]+/?)"', - 
self._download_webpage(url, playlist_id, query={ - 'mode': 'async', - 'from': page_num + 1, - 'function': 'get_block', - 'block_id': 'list_videos_common_videos_list', - }, note=f'Downloading page {page_num + 1}')))] - - return self.playlist_result( - InAdvancePagedList(page_func, int_or_none(self._search_regex( - r'from:(\d+)">[^<]+\s*»', webpage, 'last page number', default=1)), 24), - playlist_id, self._search_regex( - r'

([^<]+)', webpage, 'playlist title', default=None)) diff --git a/yt_dlp/extractor/nfb.py b/yt_dlp/extractor/nfb.py index 6f78728253..968c9728b0 100644 --- a/yt_dlp/extractor/nfb.py +++ b/yt_dlp/extractor/nfb.py @@ -5,7 +5,6 @@ merge_dicts, parse_count, url_or_none, - urljoin, ) from ..utils.traversal import traverse_obj @@ -16,8 +15,7 @@ class NFBBaseIE(InfoExtractor): def _extract_ep_data(self, webpage, video_id, fatal=False): return self._search_json( - r'const\s+episodesData\s*=', webpage, 'episode data', video_id, - contains_pattern=r'\[\s*{(?s:.+)}\s*\]', fatal=fatal) or [] + r'episodesData\s*:', webpage, 'episode data', video_id, fatal=fatal) or {} def _extract_ep_info(self, data, video_id, slug=None): info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], { @@ -224,18 +222,14 @@ def _real_extract(self, url): # type_ can change from film to serie(s) after redirect; new slug may have episode number type_, slug = self._match_valid_url(urlh.url).group('type', 'id') - embed_url = urljoin(f'https://www.{site}.ca', self._html_search_regex( - r'<[^>]+\bid=["\']player-iframe["\'][^>]*\bsrc=["\']([^"\']+)', webpage, 'embed url')) - video_id = self._match_id(embed_url) # embed url has unique slug - player = self._download_webpage(embed_url, video_id, 'Downloading player page') - if 'MESSAGE_GEOBLOCKED' in player: - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + player_data = self._search_json( + r'window\.PLAYER_OPTIONS\[[^\]]+\]\s*=', webpage, 'player data', slug) + video_id = self._match_id(player_data['overlay']['url']) # overlay url always has unique slug formats, subtitles = self._extract_m3u8_formats_and_subtitles( - self._html_search_regex(r'source:\s*\'([^\']+)', player, 'm3u8 url'), - video_id, 'mp4', m3u8_id='hls') + player_data['source'], video_id, 'mp4', m3u8_id='hls') - if dv_source := self._html_search_regex(r'dvSource:\s*\'([^\']+)', player, 'dv', default=None): + if dv_source := url_or_none(player_data.get('dvSource')): fmts, 
subs = self._extract_m3u8_formats_and_subtitles( dv_source, video_id, 'mp4', m3u8_id='dv', preference=-2, fatal=False) for fmt in fmts: @@ -246,17 +240,16 @@ def _real_extract(self, url): info = { 'id': video_id, 'title': self._html_search_regex( - r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*]*>\s*([^<]+?)\s*

', + r'["\']nfb_version_title["\']\s*:\s*["\']([^"\']+)', webpage, 'title', default=None), 'description': self._html_search_regex( r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*]*>\s*([^<]+)', webpage, 'description', default=None), - 'thumbnail': self._html_search_regex( - r'poster:\s*\'([^\']+)', player, 'thumbnail', default=None), + 'thumbnail': url_or_none(player_data.get('poster')), 'uploader': self._html_search_regex( - r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', webpage, 'uploader', default=None), + r'<[^>]+\bitemprop=["\']director["\'][^>]*>([^<]+)', webpage, 'uploader', default=None), 'release_year': int_or_none(self._html_search_regex( - r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)', + r'["\']nfb_version_year["\']\s*:\s*["\']([^"\']+)', webpage, 'release_year', default=None)), } if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id) diff --git a/yt_dlp/extractor/porn91.py b/yt_dlp/extractor/porn91.py deleted file mode 100644 index 7d16a16319..0000000000 --- a/yt_dlp/extractor/porn91.py +++ /dev/null @@ -1,95 +0,0 @@ -import urllib.parse -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - remove_end, - unified_strdate, - ExtractorError, -) - - -class Porn91IE(InfoExtractor): - IE_NAME = '91porn' - _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/view_video.php\?([^#]+&)?viewkey=(?P\w+)' - - _TESTS = [{ - 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', - 'md5': 'd869db281402e0ef4ddef3c38b866f86', - 'info_dict': { - 'id': '7e42283b4f5ab36da134', - 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', - 'description': 'md5:1ff241f579b07ae936a54e810ad2e891', - 'ext': 'mp4', - 'duration': 431, - 'upload_date': '20150520', - 'comment_count': int, - 'view_count': int, - 'age_limit': 18, - } - }, { - 'url': 'https://91porn.com/view_video.php?viewkey=7ef0cf3d362c699ab91c', - 'md5': 'f8fd50540468a6d795378cd778b40226', - 'info_dict': { - 'id': 
'7ef0cf3d362c699ab91c', - 'title': '真实空乘,冲上云霄第二部', - 'description': 'md5:618bf9652cafcc66cd277bd96789baea', - 'ext': 'mp4', - 'duration': 248, - 'upload_date': '20221119', - 'comment_count': int, - 'view_count': int, - 'age_limit': 18, - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - self._set_cookie('91porn.com', 'language', 'cn_CN') - - webpage = self._download_webpage( - 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) - - if '视频不存在,可能已经被删除或者被举报为不良内容!' in webpage: - raise ExtractorError('91 Porn says: Video does not exist', expected=True) - - daily_limit = self._search_regex( - r'作为游客,你每天只可观看([\d]+)个视频', webpage, 'exceeded daily limit', default=None, fatal=False) - if daily_limit: - raise ExtractorError(f'91 Porn says: Daily limit {daily_limit} videos exceeded', expected=True) - - video_link_url = self._search_regex( - r'document\.write\(\s*strencode2\s*\(\s*((?:"[^"]+")|(?:\'[^\']+\'))', webpage, 'video link') - video_link_url = self._search_regex( - r'src=["\']([^"\']+)["\']', urllib.parse.unquote(video_link_url), 'unquoted video link') - - formats, subtitles = self._get_formats_and_subtitle(video_link_url, video_id) - - return { - 'id': video_id, - 'title': remove_end(self._html_extract_title(webpage).replace('\n', ''), 'Chinese homemade video').strip(), - 'formats': formats, - 'subtitles': subtitles, - 'upload_date': unified_strdate(self._search_regex( - r'(\d{4}-\d{2}-\d{2})
', webpage, 'upload_date', fatal=False)), - 'description': self._html_search_regex( - r'\s*([^<]+)', webpage, 'description', fatal=False), - 'duration': parse_duration(self._search_regex( - r'时长:\s*]*>\s*(\d+(?::\d+){1,2})', webpage, 'duration', fatal=False)), - 'comment_count': int_or_none(self._search_regex( - r'留言:\s*]*>\s*(\d+)\s*
', webpage, 'comment count', fatal=False)), - 'view_count': int_or_none(self._search_regex( - r'热度:\s*]*>\s*(\d+)\s*', webpage, 'view count', fatal=False)), - 'age_limit': 18, - } - - def _get_formats_and_subtitle(self, video_link_url, video_id): - ext = determine_ext(video_link_url) - if ext == 'm3u8': - formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_link_url, video_id, ext='mp4') - else: - formats = [{'url': video_link_url, 'ext': ext}] - subtitles = {} - - return formats, subtitles diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 29a3e43cc1..d94f28ceb1 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -97,7 +97,7 @@ def is_logged(webpage): login_form = self._hidden_inputs(login_page) login_form.update({ - 'username': username, + 'email': username, 'password': password, }) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 3d965dd452..2fb41ba794 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -45,19 +45,18 @@ class TikTokBaseIE(InfoExtractor): # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0 'aid': '0', } - _KNOWN_APP_INFO = [ - '7351144126450059040', - '7351149742343391009', - '7351153174894626592', - ] _APP_INFO_POOL = None _APP_INFO = None _APP_USER_AGENT = None + @property + def _KNOWN_APP_INFO(self): + return self._configuration_arg('app_info', ie_key=TikTokIE) + @property def _API_HOSTNAME(self): return self._configuration_arg( - 'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0] + 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0] def _get_next_app_info(self): if self._APP_INFO_POOL is None: @@ -66,13 +65,10 @@ def _get_next_app_info(self): for key, default in self._APP_INFO_DEFAULTS.items() if key != 'iid' } - app_info_list = ( - self._configuration_arg('app_info', ie_key=TikTokIE) - or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO))) 
self._APP_INFO_POOL = [ {**defaults, **dict( (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v - )} for app_info in app_info_list + )} for app_info in self._KNOWN_APP_INFO ] if not self._APP_INFO_POOL: @@ -757,11 +753,13 @@ class TikTokIE(TikTokBaseIE): def _real_extract(self, url): video_id, user_id = self._match_valid_url(url).group('id', 'user_id') - try: - return self._extract_aweme_app(video_id) - except ExtractorError as e: - e.expected = True - self.report_warning(f'{e}; trying with webpage') + + if self._KNOWN_APP_INFO: + try: + return self._extract_aweme_app(video_id) + except ExtractorError as e: + e.expected = True + self.report_warning(f'{e}; trying with webpage') url = self._create_url(user_id, video_id) webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}) diff --git a/yt_dlp/extractor/tv5mondeplus.py b/yt_dlp/extractor/tv5mondeplus.py index a445fae853..52ff230f2a 100644 --- a/yt_dlp/extractor/tv5mondeplus.py +++ b/yt_dlp/extractor/tv5mondeplus.py @@ -2,85 +2,88 @@ from .common import InfoExtractor from ..utils import ( + clean_html, determine_ext, extract_attributes, + get_element_by_class, + get_element_html_by_class, int_or_none, - parse_duration, - traverse_obj, - try_get, url_or_none, ) +from ..utils.traversal import traverse_obj class TV5MondePlusIE(InfoExtractor): - IE_DESC = 'TV5MONDE+' - _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P[^/?#]+)' + IE_NAME = 'TV5MONDE' + _VALID_URL = r'https?://(?:www\.)?tv5monde\.com/tv/video/(?P[^/?#]+)' _TESTS = [{ - # movie - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/les-novices', - 'md5': 'c86f60bf8b75436455b1b205f9745955', + # documentary + 'url': 'https://www.tv5monde.com/tv/video/65931-baudouin-l-heritage-d-un-roi-baudouin-l-heritage-d-un-roi', + 'md5': 'd2a708902d3df230a357c99701aece05', 'info_dict': { - 'id': 'ZX0ipMyFQq_6D4BA7b', - 'display_id': 'les-novices', + 'id': 
'3FPa7JMu21_6D4BA7b', + 'display_id': '65931-baudouin-l-heritage-d-un-roi-baudouin-l-heritage-d-un-roi', 'ext': 'mp4', - 'title': 'Les novices', - 'description': 'md5:2e7c33ba3ad48dabfcc2a956b88bde2b', - 'upload_date': '20230821', - 'thumbnail': 'https://revoir.tv5monde.com/uploads/media/video_thumbnail/0738/60/01e952b7ccf36b7c6007ec9131588954ab651de9.jpeg', - 'duration': 5177, - 'episode': 'Les novices', + 'title': "Baudouin, l'héritage d'un roi", + 'thumbnail': 'https://psi.tv5monde.com/upsilon-images/960x540/6f/baudouin-f49c6b0e.jpg', + 'duration': 4842, + 'upload_date': '20240130', + 'timestamp': 1706641242, + 'episode': "BAUDOUIN, L'HERITAGE D'UN ROI", + 'description': 'md5:78125c74a5cac06d7743a2d09126edad', + 'series': "Baudouin, l'héritage d'un roi", }, }, { # series episode - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/opj-les-dents-de-la-terre-2', + 'url': 'https://www.tv5monde.com/tv/video/52952-toute-la-vie-mardi-23-mars-2021', + 'md5': 'f5e09637cadd55639c05874e22eb56bf', 'info_dict': { - 'id': 'wJ0eeEPozr_6D4BA7b', - 'display_id': 'opj-les-dents-de-la-terre-2', + 'id': 'obRRZ8m6g9_6D4BA7b', + 'display_id': '52952-toute-la-vie-mardi-23-mars-2021', 'ext': 'mp4', - 'title': "OPJ - Les dents de la Terre (2)", - 'description': 'md5:288f87fd68d993f814e66e60e5302d9d', - 'upload_date': '20230823', - 'series': 'OPJ', - 'episode': 'Les dents de la Terre (2)', - 'duration': 2877, - 'thumbnail': 'https://dl-revoir.tv5monde.com/images/1a/5753448.jpg' + 'title': 'Toute la vie', + 'description': 'md5:a824a2e1dfd94cf45fa379a1fb43ce65', + 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/5880553.jpg', + 'duration': 2526, + 'upload_date': '20230721', + 'timestamp': 1689971646, + 'series': 'Toute la vie', + 'episode': 'Mardi 23 mars 2021', }, }, { # movie - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/ceux-qui-travaillent', - 'md5': '32fa0cde16a4480d1251502a66856d5f', + 'url': 
'https://www.tv5monde.com/tv/video/8771-ce-fleuve-qui-nous-charrie-ce-fleuve-qui-nous-charrie-p001-ce-fleuve-qui-nous-charrie', + 'md5': '87cefc34e10a6bf4f7823cccd7b36eb2', 'info_dict': { - 'id': 'dc57a011-ec4b-4648-2a9a-4f03f8352ed3', - 'display_id': 'ceux-qui-travaillent', + 'id': 'DOcfvdLKXL_6D4BA7b', + 'display_id': '8771-ce-fleuve-qui-nous-charrie-ce-fleuve-qui-nous-charrie-p001-ce-fleuve-qui-nous-charrie', 'ext': 'mp4', - 'title': 'Ceux qui travaillent', - 'description': 'md5:570e8bb688036ace873b2d50d24c026d', - 'upload_date': '20210819', + 'title': 'Ce fleuve qui nous charrie', + 'description': 'md5:62ba3f875343c7fc4082bdfbbc1be992', + 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/5476617.jpg', + 'duration': 5300, + 'upload_date': '20210822', + 'timestamp': 1629594105, + 'episode': 'CE FLEUVE QUI NOUS CHARRIE-P001-CE FLEUVE QUI NOUS CHARRIE', + 'series': 'Ce fleuve qui nous charrie', }, - 'skip': 'no longer available', }, { - # series episode - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/vestiaires-caro-actrice', + # news + 'url': 'https://www.tv5monde.com/tv/video/70402-tv5monde-le-journal-edition-du-08-05-24-11h', + 'md5': 'c62977d6d10754a2ecebba70ad370479', 'info_dict': { - 'id': '9e9d599e-23af-6915-843e-ecbf62e97925', - 'display_id': 'vestiaires-caro-actrice', + 'id': 'LgQFrOCNsc_6D4BA7b', + 'display_id': '70402-tv5monde-le-journal-edition-du-08-05-24-11h', 'ext': 'mp4', - 'title': "Vestiaires - Caro actrice", - 'description': 'md5:db15d2e1976641e08377f942778058ea', - 'upload_date': '20210819', - 'series': "Vestiaires", - 'episode': 'Caro actrice', + 'title': 'TV5MONDE, le journal', + 'description': 'md5:777dc209eaa4423b678477c36b0b04a8', + 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/6184105.jpg', + 'duration': 854, + 'upload_date': '20240508', + 'timestamp': 1715159640, + 'series': 'TV5MONDE, le journal', + 'episode': 'EDITION DU 08/05/24 - 11H', }, - 'params': { - 'skip_download': True, - }, - 'skip': 
'no longer available', - }, { - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver', - 'only_matching': True, - }, { - 'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30', - 'only_matching': True, }] _GEO_BYPASS = False @@ -98,7 +101,6 @@ def _real_extract(self, url): if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: self.raise_geo_restricted(countries=['FR']) - title = episode = self._html_search_regex(r'

([^<]+)', webpage, 'title') vpl_data = extract_attributes(self._search_regex( r'(<[^>]+class="video_player_loader"[^>]+>)', webpage, 'video player loader')) @@ -147,26 +149,7 @@ def process_video_files(v): process_video_files(video_files) metadata = self._parse_json( - vpl_data['data-metadata'], display_id) - duration = (int_or_none(try_get(metadata, lambda x: x['content']['duration'])) - or parse_duration(self._html_search_meta('duration', webpage))) - - description = self._html_search_regex( - r'(?s)]+class=["\']episode-texte[^>]+>(.+?)', webpage, - 'description', fatal=False) - - series = self._html_search_regex( - r']+class=["\']episode-emission[^>]+>([^<]+)', webpage, - 'series', default=None) - - if series and series != title: - title = '%s - %s' % (series, title) - - upload_date = self._search_regex( - r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})', - webpage, 'upload date', default=None) - if upload_date: - upload_date = upload_date.replace('_', '') + vpl_data.get('data-metadata') or '{}', display_id, fatal=False) if not video_id: video_id = self._search_regex( @@ -175,16 +158,20 @@ def process_video_files(v): default=display_id) return { + **traverse_obj(metadata, ('content', { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'episode': ('title', {str}), + 'series': ('series', {str}), + 'timestamp': ('publishDate_ts', {int_or_none}), + 'duration': ('duration', {int_or_none}), + })), 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': vpl_data.get('data-image'), - 'duration': duration, - 'upload_date': upload_date, + 'title': clean_html(get_element_by_class('main-title', webpage)), + 'description': clean_html(get_element_by_class('text', get_element_html_by_class('ep-summary', webpage) or '')), + 'thumbnail': url_or_none(vpl_data.get('data-image')), 'formats': formats, 'subtitles': self._extract_subtitles(self._parse_json( traverse_obj(vpl_data, ('data-captions', {str}), 
default='{}'), display_id, fatal=False)), - 'series': series, - 'episode': episode, } diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index ecc865655d..df7f816bd3 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -36,7 +36,7 @@ class TwitterBaseIE(InfoExtractor): _NETRC_MACHINE = 'twitter' _API_BASE = 'https://api.twitter.com/1.1/' _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' - _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:(?:twitter|x)\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE' _flow_token = None @@ -1191,6 +1191,31 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 0, '_old_archive_ids': ['twitter 1724884212803834154'], }, + }, { + # x.com + 'url': 'https://x.com/historyinmemes/status/1790637656616943991', + 'md5': 'daca3952ba0defe2cfafb1276d4c1ea5', + 'info_dict': { + 'id': '1790637589910654976', + 'ext': 'mp4', + 'title': 'Historic Vids - One of the most intense moments in history', + 'description': 'One of the most intense moments in history https://t.co/Zgzhvix8ES', + 'display_id': '1790637656616943991', + 'uploader': 'Historic Vids', + 'uploader_id': 'historyinmemes', + 'uploader_url': 'https://twitter.com/historyinmemes', + 'channel_id': '855481986290524160', + 'upload_date': '20240515', + 'timestamp': 1715756260.0, + 'duration': 15.488, + 'tags': [], + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', + 'age_limit': 0, + '_old_archive_ids': ['twitter 1790637656616943991'], + } }, { # onion route 'url': 
'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index 4316c31d2b..1e2d118aa6 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -173,6 +173,20 @@ class KnownPiracyIE(UnsupportedInfoExtractor): r'filemoon\.sx', r'hentai\.animestigma\.com', r'thisav\.com', + r'gounlimited\.to', + r'highstream\.tv', + r'uqload\.com', + r'vedbam\.xyz', + r'vadbam\.net', + r'vidlo\.us', + r'wolfstream\.tv', + r'xvideosharing\.com', + r'(?:\w+\.)?viidshar\.com', + r'sxyprn\.com', + r'jable\.tv', + r'91porn\.com', + r'einthusan\.(?:tv|com|ca)', + r'yourupload\.com', ) _TESTS = [{ diff --git a/yt_dlp/extractor/xfileshare.py b/yt_dlp/extractor/xfileshare.py deleted file mode 100644 index 08c6d6c7c0..0000000000 --- a/yt_dlp/extractor/xfileshare.py +++ /dev/null @@ -1,198 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - decode_packed_codes, - determine_ext, - int_or_none, - js_to_json, - urlencode_postdata, -) - - -# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58 -def aa_decode(aa_code): - symbol_table = [ - ('7', '((゚ー゚) + (o^_^o))'), - ('6', '((o^_^o) +(o^_^o))'), - ('5', '((゚ー゚) + (゚Θ゚))'), - ('2', '((o^_^o) - (゚Θ゚))'), - ('4', '(゚ー゚)'), - ('3', '(o^_^o)'), - ('1', '(゚Θ゚)'), - ('0', '(c^_^o)'), - ] - delim = '(゚Д゚)[゚ε゚]+' - ret = '' - for aa_char in aa_code.split(delim): - for val, pat in symbol_table: - aa_char = aa_char.replace(pat, val) - aa_char = aa_char.replace('+ ', '') - m = re.match(r'^\d+', aa_char) - if m: - ret += chr(int(m.group(0), 8)) - else: - m = re.match(r'^u([\da-f]+)', aa_char) - if m: - ret += chr(int(m.group(1), 16)) - return ret - - -class XFileShareIE(InfoExtractor): - _SITES = ( - (r'aparat\.cam', 'Aparat'), - (r'clipwatching\.com', 'ClipWatching'), - (r'gounlimited\.to', 'GoUnlimited'), - (r'govid\.me', 'GoVid'), - 
(r'holavid\.com', 'HolaVid'), - (r'streamty\.com', 'Streamty'), - (r'thevideobee\.to', 'TheVideoBee'), - (r'uqload\.com', 'Uqload'), - (r'vidbom\.com', 'VidBom'), - (r'vidlo\.us', 'vidlo'), - (r'vidlocker\.xyz', 'VidLocker'), - (r'vidshare\.tv', 'VidShare'), - (r'vup\.to', 'VUp'), - (r'wolfstream\.tv', 'WolfStream'), - (r'xvideosharing\.com', 'XVideoSharing'), - ) - - IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) - _VALID_URL = (r'https?://(?:www\.)?(?P%s)/(?:embed-)?(?P[0-9a-zA-Z]+)' - % '|'.join(site for site in list(zip(*_SITES))[0])) - _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])] - - _FILE_NOT_FOUND_REGEXES = ( - r'>(?:404 - )?File Not Found<', - r'>The file was removed by administrator<', - ) - - _TESTS = [{ - 'url': 'https://uqload.com/dltx1wztngdz', - 'md5': '3cfbb65e4c90e93d7b37bcb65a595557', - 'info_dict': { - 'id': 'dltx1wztngdz', - 'ext': 'mp4', - 'title': 'Rick Astley Never Gonna Give You mp4', - 'thumbnail': r're:https://.*\.jpg' - } - }, { - 'url': 'http://xvideosharing.com/fq65f94nd2ve', - 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', - 'info_dict': { - 'id': 'fq65f94nd2ve', - 'ext': 'mp4', - 'title': 'sample', - 'thumbnail': r're:http://.*\.jpg', - }, - }, { - 'url': 'https://aparat.cam/n4d6dh0wvlpr', - 'only_matching': True, - }, { - 'url': 'https://wolfstream.tv/nthme29v9u2x', - 'only_matching': True, - }] - - def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() - - url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) - webpage = self._download_webpage(url, video_id) - - if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - fields = self._hidden_inputs(webpage) - - if fields.get('op') == 'download1': - countdown = int_or_none(self._search_regex( - 
r'(?:[Ww]ait)?\s*(\d+)\s*(?:seconds?)?', - webpage, 'countdown', default=None)) - if countdown: - self._sleep(countdown, video_id) - - webpage = self._download_webpage( - url, video_id, 'Downloading video page', - data=urlencode_postdata(fields), headers={ - 'Referer': url, - 'Content-type': 'application/x-www-form-urlencoded', - }) - - title = (self._search_regex( - (r'style="z-index: [0-9]+;">([^<]+)', - r'([^<]+)', - r'h4-fine[^>]*>([^<]+)<', - r'>Watch (.+)[ <]', - r'

([^<]+)

', - r'

]*>([^<]+)<', # streamin.to - r'title\s*:\s*"([^"]+)"'), # govid.me - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) or video_id).strip() - - for regex, func in ( - (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes), - (r'(゚.+)', aa_decode)): - obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) - if obf_code: - webpage = webpage.replace(obf_code, func(obf_code)) - - formats = [] - - jwplayer_data = self._search_regex( - [ - r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', - r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', - ], webpage, - 'jwplayer data', default=None) - if jwplayer_data: - jwplayer_data = self._parse_json( - jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) - if jwplayer_data: - formats = self._parse_jwplayer_data( - jwplayer_data, video_id, False, - m3u8_id='hls', mpd_id='dash')['formats'] - - if not formats: - urls = [] - for regex in ( - r'(?:file|src)\s*:\s*(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', - r'file_link\s*=\s*(["\'])(?Phttp(?:(?!\1).)+)\1', - r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp(?:(?!\2).)+)\2\)', - r']+src=(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): - for mobj in re.finditer(regex, webpage): - video_url = mobj.group('url') - if video_url not in urls: - urls.append(video_url) - - sources = self._search_regex( - r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) - if sources: - urls.extend(self._parse_json(sources, video_id)) - - formats = [] - for video_url in urls: - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': video_url, - 'format_id': 'sd', - }) - - thumbnail = self._search_regex( - [ - r']+poster="([^"]+)"', - r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', - ], webpage, 'thumbnail', default=None) - - return { - 
'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'http_headers': {'Referer': url} - } diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index 59eef8490f..a489033abc 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -173,8 +173,41 @@ def _real_extract(self, url): class XVideosQuickiesIE(InfoExtractor): IE_NAME = 'xvideos:quickies' - _VALID_URL = r'https?://(?P(?:[^/]+\.)?xvideos2?\.com)/amateur-channels/[^#]+#quickies/a/(?P\d+)' + _VALID_URL = r'https?://(?P(?:[^/?#]+\.)?xvideos2?\.com)/(?:profiles/|amateur-channels/)?[^/?#]+#quickies/a/(?P\w+)' _TESTS = [{ + 'url': 'https://www.xvideos.com/lili_love#quickies/a/ipdtikh1a4c', + 'md5': 'f9e4f518ff1de14b99a400bbd0fc5ee0', + 'info_dict': { + 'id': 'ipdtikh1a4c', + 'ext': 'mp4', + 'title': 'Mexican chichóna putisima', + 'age_limit': 18, + 'duration': 81, + 'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg', + } + }, { + 'url': 'https://www.xvideos.com/profiles/lili_love#quickies/a/ipphaob6fd1', + 'md5': '5340938aac6b46e19ebdd1d84535862e', + 'info_dict': { + 'id': 'ipphaob6fd1', + 'ext': 'mp4', + 'title': 'Puta chichona mexicana squirting', + 'age_limit': 18, + 'duration': 56, + 'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg', + } + }, { + 'url': 'https://www.xvideos.com/amateur-channels/lili_love#quickies/a/hfmffmd7661', + 'md5': '92428518bbabcb4c513e55922e022491', + 'info_dict': { + 'id': 'hfmffmd7661', + 'ext': 'mp4', + 'title': 'Chichona mexican slut', + 'age_limit': 18, + 'duration': 9, + 'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg', + } + }, { 'url': 'https://www.xvideos.com/amateur-channels/wifeluna#quickies/a/47258683', 'md5': '16e322a93282667f1963915568f782c1', 'info_dict': { @@ -189,4 +222,4 @@ class XVideosQuickiesIE(InfoExtractor): def _real_extract(self, url): domain, id_ = self._match_valid_url(url).group('domain', 'id') - return self.url_result(f'https://{domain}/video{id_}/_', 
XVideosIE, id_) + return self.url_result(f'https://{domain}/video{"" if id_.isdecimal() else "."}{id_}/_', XVideosIE, id_) diff --git a/yt_dlp/extractor/yourporn.py b/yt_dlp/extractor/yourporn.py deleted file mode 100644 index 38f42a991c..0000000000 --- a/yt_dlp/extractor/yourporn.py +++ /dev/null @@ -1,65 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - parse_duration, - urljoin, -) - - -class YourPornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sxyprn\.com/post/(?P[^/?#&.]+)' - _TESTS = [{ - 'url': 'https://sxyprn.com/post/57ffcb2e1179b.html', - 'md5': '6f8682b6464033d87acaa7a8ff0c092e', - 'info_dict': { - 'id': '57ffcb2e1179b', - 'ext': 'mp4', - 'title': 'md5:c9f43630bd968267672651ba905a7d35', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 165, - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://sxyprn.com/post/57ffcb2e1179b.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - parts = self._parse_json( - self._search_regex( - r'data-vnfo=(["\'])(?P{.+?})\1', webpage, 'data info', - group='data'), - video_id)[video_id].split('/') - - num = 0 - for c in parts[6] + parts[7]: - if c.isnumeric(): - num += int(c) - parts[5] = compat_str(int(parts[5]) - num) - parts[1] += '8' - video_url = urljoin(url, '/'.join(parts)) - - title = (self._search_regex( - r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', - default=None) or self._og_search_description(webpage)).strip() - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'duration\s*:\s*<[^>]+>([\d:]+)', webpage, 'duration', - default=None)) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'age_limit': 18, - 'ext': 'mp4', - } diff --git a/yt_dlp/extractor/yourupload.py 
b/yt_dlp/extractor/yourupload.py deleted file mode 100644 index def63293aa..0000000000 --- a/yt_dlp/extractor/yourupload.py +++ /dev/null @@ -1,43 +0,0 @@ -from .common import InfoExtractor -from ..utils import urljoin - - -class YourUploadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:yourupload\.com/(?:watch|embed)|embed\.yourupload\.com)/(?P[A-Za-z0-9]+)' - _TESTS = [{ - 'url': 'http://yourupload.com/watch/14i14h', - 'md5': '5e2c63385454c557f97c4c4131a393cd', - 'info_dict': { - 'id': '14i14h', - 'ext': 'mp4', - 'title': 'BigBuckBunny_320x180.mp4', - 'thumbnail': r're:^https?://.*\.jpe?g', - } - }, { - 'url': 'http://www.yourupload.com/embed/14i14h', - 'only_matching': True, - }, { - 'url': 'http://embed.yourupload.com/14i14h', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - embed_url = 'http://www.yourupload.com/embed/%s' % video_id - - webpage = self._download_webpage(embed_url, video_id) - - title = self._og_search_title(webpage) - video_url = urljoin(embed_url, self._og_search_video_url(webpage)) - thumbnail = self._og_search_thumbnail(webpage, default=None) - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - 'thumbnail': thumbnail, - 'http_headers': { - 'Referer': embed_url, - }, - } diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e553fff9f1..e676c5cde2 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -240,6 +240,16 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 85 }, + # This client has pre-merged video+audio 720p/1080p streams + 'mediaconnect': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'MEDIA_CONNECT_FRONTEND', + 'clientVersion': '0.1', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 95 + }, } @@ -1171,7 +1181,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'/(?P[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', 
r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) - _formats = { + _formats = { # NB: Used in YoutubeWebArchiveIE and GoogleDriveIE '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, @@ -2343,6 +2353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'format': '17', # 3gp format available on android 'extractor_args': {'youtube': {'player_client': ['android']}}, }, + 'skip': 'android client broken', }, { # Skip download of additional client configs (remix client config in this case) @@ -2720,7 +2731,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'heatmap': 'count:100', }, 'params': { - 'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}}, + 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}}, }, }, ] @@ -3307,7 +3318,36 @@ def _extract_heatmap(self, data): 'value': ('intensityScoreNormalized', {float_or_none}), })) or None - def _extract_comment(self, comment_renderer, parent=None): + def _extract_comment(self, entities, parent=None): + comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict})) + if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))): + return + + toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict})) + time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or '' + + return { + 'id': comment_id, + 'parent': parent or 'root', + **traverse_obj(comment_entity_payload, { + 'text': ('properties', 'content', 'content', {str}), + 'like_count': ('toolbar', 'likeCountA11y', {parse_count}), + 'author_id': ('author', 'channelId', {self.ucid_or_none}), + 'author': ('author', 'displayName', {str}), + 'author_thumbnail': ('author', 'avatarThumbnailUrl', 
{url_or_none}), + 'author_is_uploader': ('author', 'isCreator', {bool}), + 'author_is_verified': ('author', 'isVerified', {bool}), + 'author_url': ('author', 'channelCommand', 'innertubeCommand', ( + ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url') + ), {lambda x: urljoin('https://www.youtube.com', x)}), + }, get_all=False), + 'is_favorited': (None if toolbar_entity_payload is None else + toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'), + '_time_text': time_text, # FIXME: non-standard, but we need a way of showing that it is an estimate. + 'timestamp': self._parse_time_text(time_text), + } + + def _extract_comment_old(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') if not comment_id: return @@ -3388,21 +3428,39 @@ def extract_header(contents): break return _continuation - def extract_thread(contents): + def extract_thread(contents, entity_payloads): if not parent: tracker['current_page_thread'] = 0 for content in contents: if not parent and tracker['total_parent_comments'] >= max_parents: yield comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer']) - comment_renderer = get_first( - (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]], - expected_type=dict, default={}) - comment = self._extract_comment(comment_renderer, parent) + # old comment format + if not entity_payloads: + comment_renderer = get_first( + (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]], + expected_type=dict, default={}) + + comment = self._extract_comment_old(comment_renderer, parent) + + # new comment format + else: + view_model = ( + traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict})) + or traverse_obj(content, ('commentViewModel', {dict}))) + comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str})) + if not comment_keys: + 
continue + entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys) + comment = self._extract_comment(entities, parent) + if comment: + comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None + if not comment: continue comment_id = comment['id'] + if comment.get('is_pinned'): tracker['pinned_comment_ids'].add(comment_id) # Sometimes YouTube may break and give us infinite looping comments. @@ -3495,7 +3553,7 @@ def extract_thread(contents): check_get_keys = None if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0): check_get_keys = [[*continuation_items_path, ..., ( - 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]] + 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]] try: response = self._extract_response( item_id=None, query=continuation, @@ -3519,6 +3577,7 @@ def extract_thread(contents): raise is_forced_continuation = False continuation = None + mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict})) for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]): if is_first_continuation: continuation = extract_header(continuation_items) @@ -3527,7 +3586,7 @@ def extract_thread(contents): break continue - for entry in extract_thread(continuation_items): + for entry in extract_thread(continuation_items, mutations): if not entry: return yield entry @@ -3604,8 +3663,6 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, yt_query = { 'videoId': video_id, } - if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'): - yt_query['params'] = 'CgIIAQ==' pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0] if pp_arg: @@ -3621,19 +3678,24 @@ def _extract_player_response(self, client, 
video_id, master_ytcfg, player_ytcfg, def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - default = ['ios', 'android', 'web'] + android_clients = [] + default = ['ios', 'web'] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) for client in self._configuration_arg('player_client'): - if client in allowed_clients: - requested_clients.append(client) - elif client == 'default': + if client == 'default': requested_clients.extend(default) elif client == 'all': requested_clients.extend(allowed_clients) - else: + elif client not in allowed_clients: self.report_warning(f'Skipping unsupported client {client}') + elif client.startswith('android'): + android_clients.append(client) + else: + requested_clients.append(client) + # Force deprioritization of broken Android clients for format de-duplication + requested_clients.extend(android_clients) if not requested_clients: requested_clients = default @@ -3852,6 +3914,14 @@ def build_fragments(f): f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) + # Android client formats are broken due to integrity check enforcement + # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554 + is_broken = client_name and client_name.startswith(short_client_name('android')) + if is_broken: + self.report_warning( + f'{video_id}: Android client formats are broken and may yield HTTP Error 403. 
' + 'They will be deprioritized', only_once=True) + name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 dct = { @@ -3864,7 +3934,7 @@ def build_fragments(f): name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), - throttled and 'THROTTLED', is_damaged and 'DAMAGED', + throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN', (self.get_param('verbose') or all_formats) and client_name, delim=', '), # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 @@ -3882,8 +3952,8 @@ def build_fragments(f): 'language': join_nonempty(audio_track.get('id', '').split('.')[0], 'desc' if language_preference < -1 else '') or None, 'language_preference': language_preference, - # Strictly de-prioritize damaged and 3gp formats - 'preference': -10 if is_damaged else -2 if itag == '17' else None, + # Strictly de-prioritize broken, damaged and 3gp formats + 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')