diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index d8abf0b5d..b24dff995 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -472,6 +472,7 @@
 from .dailywire import (
     DailyWireIE,
     DailyWirePodcastIE,
+    DailyWireShowIE,
 )
 from .damtomo import (
     DamtomoRecordIE,
diff --git a/yt_dlp/extractor/dailywire.py b/yt_dlp/extractor/dailywire.py
index 3d5bb257c..7b18f97b9 100644
--- a/yt_dlp/extractor/dailywire.py
+++ b/yt_dlp/extractor/dailywire.py
@@ -1,28 +1,76 @@
+import itertools
+import json
+
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     determine_ext,
     float_or_none,
     join_nonempty,
+    parse_iso8601,
     traverse_obj,
     url_or_none,
 )
 
 
 class DailyWireBaseIE(InfoExtractor):
-    _JSON_PATH = {
-        'episode': ('props', 'pageProps', 'episodeData', 'episode'),
-        'videos': ('props', 'pageProps', 'videoData', 'video'),
-        'podcasts': ('props', 'pageProps', 'episode'),
+    _GRAPHQL_API = 'https://v2server.dailywire.com/app/graphql'
+    _GRAPHQL_QUERIES = {
+        'getClipBySlug': 'query getClipBySlug($slug:String!){clip(where:{slug:$slug}){id,name,slug,description,image,show{id,name,slug},thumbnail,duration,createdBy{firstName,lastName},createdAt,videoURL}}',
+        'getEpisodeBySlug': 'query getEpisodeBySlug($slug:String!){episode(where:{slug:$slug}){id,title,slug,description,createdAt,image,show{id,name,slug},segments{audio,video,duration,},createdBy{firstName,lastName}}}',
+        'getPodcastEpisodes': 'query getPodcastEpisodes($where: PodcastEpisodeWhereInput, $orderBy: PodcastEpisodeOrderBy, $skip: Int, $first: Int) {listPodcastEpisode(where: $where, orderBy: $orderBy, skip: $skip, first: $first) {...ResPodcastEpisode}}, fragment ResPodcastEpisode on getPodcastEpisodeRes {id,title,description,slug,thumbnail,createdAt,audio,duration,podcast {id,name,slug,author {firstName,lastName}},season {id,name,slug}}',
+        'getSeasonEpisodes': 'query getSeasonEpisodes($where:getSeasonEpisodesInput!,$first:Int,$skip:Int){getSeasonEpisodes(where:$where,first:$first,skip:$skip){episode{slug}}}',
+        'getShowBySlug': 'query getShowBySlug($slug:String!){show(where:{slug:$slug}){id,name,description,image,seasons(orderBy:weight_DESC){id,name,slug}}}',
+        'getVideoBySlug': 'query getVideoBySlug($slug:String!){video(where:{slug:$slug}){id,name,slug,description,image,thumbnail,videoURL,duration,createdBy{firstName,lastName},createdAt}}',
+    }
+    _GRAPHQL_VIDEO_QUERIES = {
+        'clips': 'getClipBySlug',
+        'episode': 'getEpisodeBySlug',
+        'videos': 'getVideoBySlug',
+    }
+    _GRAPHQL_JSON_PATH = {
+        'getClipBySlug': ('data', 'clip'),
+        'getEpisodeBySlug': ('data', 'episode'),
+        'getPodcastEpisodes': ('data', 'listPodcastEpisode'),
+        'getSeasonEpisodes': ('data', 'getSeasonEpisodes', ..., 'episode', 'slug'),
+        'getShowBySlug': ('data', 'show'),
+        'getVideoBySlug': ('data', 'video'),
+    }
+    _API_HEADERS = {
+        'Apollographql-Client-Name': 'DW_WEBSITE',
+        'Content-Type': 'application/json',
+        'Origin': 'https://www.dailywire.com',
+        'Referer': 'https://www.dailywire.com/',
     }
 
-    def _get_json(self, url):
-        sites_type, slug = self._match_valid_url(url).group('sites_type', 'id')
-        json_data = self._search_nextjs_data(self._download_webpage(url, slug), slug)
-        return slug, traverse_obj(json_data, self._JSON_PATH[sites_type])
+    def _real_initialize(self):
+        if access_token := self._get_cookies('https://www.dailywire.com').get('accessToken'):
+            # Rebind on the instance: mutating the shared class-level dict would
+            # leak the token to every other extractor instance that uses it
+            self._API_HEADERS = {**self._API_HEADERS, 'Authorization': f'Bearer {access_token.value}'}
+
+    def _call_api(self, slug, query, variables, message='Downloading JSON from GraphQL API'):
+        """POST the named GraphQL query and return the part of the response selected by _GRAPHQL_JSON_PATH"""
+        json_data = self._download_json(
+            self._GRAPHQL_API, slug, message, data=json.dumps(
+                {'query': self._GRAPHQL_QUERIES[query], 'variables': variables}).encode(),
+            headers=self._API_HEADERS)
+
+        return traverse_obj(json_data, self._GRAPHQL_JSON_PATH.get(query, ()))
+
+    def _paginate(self, slug, query, where):
+        """Yield successive pages (10 items requested each) until the API returns an empty page"""
+        for i in itertools.count(0):
+            page = self._call_api(
+                slug, query, {'where': where, 'first': 10, 'skip': i * 10},
+                message=f'Downloading page {i + 1}')
+            if not page:
+                break
+            yield page
 
 
 class DailyWireIE(DailyWireBaseIE):
-    _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>episode|videos)/(?P<id>[\w-]+)'
+    _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>episode|videos|clips)/(?P<id>[\w-]+)'
     _TESTS = [{
         'url': 'https://www.dailywire.com/episode/1-fauci',
         'info_dict': {
@@ -32,32 +80,80 @@ class DailyWireIE(DailyWireBaseIE):
             'title': '1. Fauci',
             'description': 'md5:9df630347ef85081b7e97dd30bc22853',
             'thumbnail': 'https://daily-wire-production.imgix.net/episodes/ckzsl50xnqpy30850in3v4bu7/ckzsl50xnqpy30850in3v4bu7-1648237399554.jpg',
-            'creator': 'Caroline Roberts',
             'series_id': 'ckzplm0a097fn0826r2vc3j7h',
             'series': 'China: The Enemy Within',
-        },
-    }, {
-        'url': 'https://www.dailywire.com/episode/ep-124-bill-maher',
-        'info_dict': {
-            'id': 'cl0ngbaalplc80894sfdo9edf',
-            'ext': 'mp3',
-            'display_id': 'ep-124-bill-maher',
-            'title': 'Ep. 124 - Bill Maher',
-            'thumbnail': 'https://daily-wire-production.imgix.net/episodes/cl0ngbaalplc80894sfdo9edf/cl0ngbaalplc80894sfdo9edf-1647065568518.jpg',
-            'creator': 'Caroline Roberts',
-            'description': 'md5:adb0de584bcfa9c41374999d9e324e98',
-            'series_id': 'cjzvep7270hp00786l9hwccob',
-            'series': 'The Sunday Special',
+            'upload_date': '20220218',
+            'creators': ['Caroline Roberts'],
+            'timestamp': 1645182003,
         },
     }, {
         'url': 'https://www.dailywire.com/videos/the-hyperions',
         'only_matching': True,
+    }, {
+        'skip': 'premium only',
+        'url': 'https://www.dailywire.com/episode/ep-3-avery-s-niece-new',
+        'info_dict': {
+            'id': 'clm8geguv3qku0870ewvcu0ed',
+            'display_id': 'ep-3-avery-s-niece-new',
+            'title': 'Ep 3 - Avery’s Niece',
+            'description': 'md5:861ab336bd2bab2abebc25a1479a42e0',
+            'thumbnail': 'https://daily-wire-production.imgix.net/episodes/clm8geguv3qku0870ewvcu0ed/clm8geguv3qku0870ewvcu0ed-1694047935734.png',
+            'series_id': 'clim20ue5f8160838ecz7ba8q',
+            'ext': 'mp4',
+            'subtitles': {'en-US': [{'ext': 'vtt'}]},
+            'timestamp': 1694062826,
+            'series': 'Convicting a Murderer',
+            'creators': ['Scott Bowler '],
+            'upload_date': '20230907',
+        },
+    }, {
+        'skip': 'premium only',
+        'url': 'https://www.dailywire.com/clips/the-making-of-run-hide-fight',
+        'info_dict': {
+            'id': 'ckjutyd6810dd0806ivcq2526',
+            'display_id': 'the-making-of-run-hide-fight',
+            'title': 'The Making of Run Hide Fight',
+            'description': 'md5:085297d753b73ad87bdd8b050cc10d2c',
+            'thumbnail': 'https://image.media.dailywire.com/K7OqsPwWH5c9hpWT68CHeZ4vRUtoz5Le/thumbnail.png',
+            'duration': 916.790889,
+            'creators': ['Paul Snyder'],
+            'upload_date': '20210113',
+            'timestamp': 1610506443,
+            'ext': 'mp4',
+        },
+    }, {
+        'skip': 'premium only',
+        'url': 'https://www.dailywire.com/videos/choosing-death-the-legacy-of-roe',
+        'info_dict': {
+            'id': 'cl3260dva6pjr097819zw506s',
+            'display_id': 'choosing-death-the-legacy-of-roe',
+            'title': 'Choosing Death [The Legacy of Roe]',
+            'description': 'md5:b07597f0ef32130365427a05fd1ccd25',
+            'duration': 2618.0738,
+            'timestamp': 1652308821,
+            'upload_date': '20220511',
+            'thumbnail': 'https://image.media.dailywire.com/FBgIBgmq635VuqTgWKjcGviEjJ2vJ02Zz/thumbnail.png',
+            'subtitles': {'en-US': [{'ext': 'vtt'}]},
+            'ext': 'mp4',
+        },
     }]
 
     def _real_extract(self, url):
-        slug, episode_info = self._get_json(url)
-        urls = traverse_obj(
-            episode_info, (('segments', 'videoUrl'), ..., ('video', 'audio')), expected_type=url_or_none)
+        sites_type, slug = self._match_valid_url(url).group('sites_type', 'id')
+        episode_data = self._call_api(slug, self._GRAPHQL_VIDEO_QUERIES[sites_type], {'slug': slug})
+
+        if not episode_data:
+            raise ExtractorError('video not found', expected=True)
+
+        urls = traverse_obj(episode_data,
+                            (('segments', 'clips'), ..., ('video', 'audio'))
+                            ) or [episode_data.get('videoURL')]
+
+        # A literal 'Access Denied' in place of a media URL is treated as a sign that login is required
+        if 'Access Denied' in urls:
+            self.report_warning(f'It looks like {slug} requires a login. Try passing cookies and try again.')
+
+        urls = [u for u in map(url_or_none, urls) if u]
 
         formats, subtitles = [], {}
         for url in urls:
@@ -68,24 +164,27 @@ def _real_extract(self, url):
             formats.extend(format_)
             self._merge_subtitles(subs_, target=subtitles)
         return {
-            'id': episode_info['id'],
+            'id': episode_data.get('id'),
             'display_id': slug,
-            'title': traverse_obj(episode_info, 'title', 'name'),
-            'description': episode_info.get('description'),
-            'creator': join_nonempty(('createdBy', 'firstName'), ('createdBy', 'lastName'), from_dict=episode_info, delim=' '),
-            'duration': float_or_none(episode_info.get('duration')),
-            'is_live': episode_info.get('isLive'),
-            'thumbnail': traverse_obj(episode_info, 'thumbnail', 'image', expected_type=url_or_none),
+            'title': traverse_obj(episode_data, 'title', 'name'),
+            'description': episode_data.get('description'),
+            'creator': join_nonempty(('createdBy', 'firstName'), ('createdBy', 'lastName'),
+                                     from_dict=episode_data, delim=' '),
+            'duration': float_or_none(episode_data.get('duration')),
+            'timestamp': parse_iso8601(episode_data.get('createdAt')),
+            'is_live': episode_data.get('isLive'),
+            'thumbnail': traverse_obj(episode_data, 'thumbnail', 'image', expected_type=url_or_none),
             'formats': formats,
             'subtitles': subtitles,
-            'series_id': traverse_obj(episode_info, ('show', 'id')),
-            'series': traverse_obj(episode_info, ('show', 'name')),
+            'series_id': traverse_obj(episode_data, ('show', 'id')),
+            'series': traverse_obj(episode_data, ('show', 'name')),
         }
 
 
 class DailyWirePodcastIE(DailyWireBaseIE):
-    _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>podcasts)/(?P<podcaster>[\w-]+/(?P<id>[\w-]+))'
+    _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>podcasts)/(?P<podcaster>[\w-]+)/?(?P<id>[\w-]+)?'
     _TESTS = [{
+        'note': 'serves shorter ad-free stream with paid cookies',
         'url': 'https://www.dailywire.com/podcasts/morning-wire/get-ready-for-recession-6-15-22',
         'info_dict': {
             'id': 'cl4f01d0w8pbe0a98ydd0cfn1',
@@ -93,21 +192,119 @@ class DailyWirePodcastIE(DailyWireBaseIE):
             'display_id': 'get-ready-for-recession-6-15-22',
             'title': 'Get Ready for Recession | 6.15.22',
             'description': 'md5:c4afbadda4e1c38a4496f6d62be55634',
-            'thumbnail': 'https://daily-wire-production.imgix.net/podcasts/ckx4otgd71jm508699tzb6hf4-1639506575562.jpg',
+            'thumbnail': 'https://daily-wire-production.imgix.net/podcasts/ckx4otgd71jm508699tzb6hf4-1667859984424.jpg',
             'duration': 900.117667,
+            'timestamp': 1655261631,
+            'season_id': 'morning-wire-morning-wire-podcast-season',
+            'series_id': 'morning-wire',
+            'creators': ['Georgia Howe'],
+            'season': '2022',
+            'series': 'Morning Wire',
+            'upload_date': '20220615',
         },
+    }, {
+        'url': 'https://www.dailywire.com/podcasts/enough',
+        'info_dict': {
+            'id': 'ckx4kvm8710i80869lvuu1b8z',
+            'title': 'Enough',
+            'display_id': 'enough',
+        },
+        'playlist_mincount': 7,
     }]
 
     def _real_extract(self, url):
-        slug, episode_info = self._get_json(url)
-        audio_id = traverse_obj(episode_info, 'audioMuxPlaybackId', 'VUsAipTrBVSgzw73SpC2DAJD401TYYwEp')
+        podcaster, slug = self._match_valid_url(url).group('podcaster', 'id')
 
-        return {
-            'id': episode_info['id'],
-            'url': f'https://stream.media.dailywire.com/{audio_id}/audio.m4a',
-            'display_id': slug,
-            'title': episode_info.get('title'),
-            'duration': float_or_none(episode_info.get('duration')),
-            'thumbnail': episode_info.get('thumbnail'),
-            'description': episode_info.get('description'),
-        }
+        def _extract_pod_ep_info(episode_data):
+            return {
+                'id': episode_data.get('id'),
+                'url': episode_data.get('audio'),
+                'display_id': episode_data.get('slug'),
+                'title': episode_data.get('title'),
+                'duration': float_or_none(episode_data.get('duration')),
+                'timestamp': parse_iso8601(episode_data.get('createdAt')),
+                'thumbnail': episode_data.get('thumbnail'),
+                'description': episode_data.get('description'),
+                'creator': join_nonempty(('podcast', 'author', 'firstName'),
+                                         ('podcast', 'author', 'lastName'),
+                                         from_dict=episode_data, delim=' '),
+                'season': traverse_obj(episode_data, ('season', 'name')),
+                'season_id': traverse_obj(episode_data, ('season', 'slug')),
+                'series': traverse_obj(episode_data, ('podcast', 'name')),
+                'series_id': traverse_obj(episode_data, ('podcast', 'slug')),
+            }
+
+        if slug:
+            episodes = self._call_api(slug, 'getPodcastEpisodes', {'where': {'slug': slug}})
+            if episode_data := traverse_obj(episodes, ..., get_all=False):
+                return _extract_pod_ep_info(episode_data)
+        else:
+            episodes = [
+                episode for page in
+                self._paginate(podcaster, 'getPodcastEpisodes', {'podcast': {'slug': podcaster}})
+                for episode in page
+            ]
+
+            if episodes:
+                # Fall back to an empty dict so the .get() calls below are always safe
+                podcast_data = traverse_obj(episodes, (..., 'podcast'), get_all=False) or {}
+                return self.playlist_result(
+                    [_extract_pod_ep_info(e) for e in episodes],
+                    podcast_data.get('id'), podcast_data.get('name'), podcast_data.get('description'),
+                    display_id=podcast_data.get('slug'), thumbnail=podcast_data.get('coverImage'))
+
+        raise ExtractorError('Podcast not found', expected=True)
+
+
+class DailyWireShowIE(DailyWireBaseIE):
+    _VALID_URL = r'https?://(?:www\.)dailywire(?:\.com)/(?P<sites_type>show)/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'skip': 'premium only',
+        'url': 'https://www.dailywire.com/show/apollo-11-what-we-saw',
+        'playlist_mincount': 28,
+        'info_dict': {
+            'id': 'ckixsvamonvl40862ysxve50i',
+            'thumbnail': 'https://daily-wire-production.imgix.net/shows/ckixsvamonvl40862ysxve50i-1679082975554.jpg',
+            'title': 'What We Saw',
+            'description': 'md5:98d2a7d5cc8175494a4ca611058ed440',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'cltf80tk79fxi0942c7h394b5',
+                'season_id': 'what-we-saw-season-3-an-empire-of-terror-season',
+                'ext': 'mp4',
+                'display_id': 'season-3-an-empire-of-terror',
+                'series_id': 'ckixsvamonvl40862ysxve50i',
+                'title': 'Season 3: An Empire of Terror',
+                'description': 'What We Saw: An Empire of Terror premieres on March 6, 2024.',
+                'creators': ['Scott Bowler '],
+                'upload_date': '20240306',
+                'timestamp': 1709704832,
+                'thumbnail': 'https://daily-wire-production.imgix.net/episodes/cltf80tk79fxi0942c7h394b5/cltf80tk79fxi0942c7h394b5-1709694601671.png',
+                'series': 'What We Saw',
+            }}]
+    }]
+
+    def _real_extract(self, url):
+        slug = self._match_valid_url(url).group('id')
+
+        show_data = self._call_api(slug, 'getShowBySlug', {'slug': slug})
+        if not show_data:
+            raise ExtractorError('Show not found', expected=True)
+
+        for season_data in show_data.get('seasons', []):
+            season_data['episodes'] = [
+                episode for page in
+                self._paginate(season_data.get('slug'), 'getSeasonEpisodes', {'season': {'id': season_data.get('id')}})
+                for episode in page
+            ]
+
+        return self.playlist_result(
+            [self.url_result(f'https://www.dailywire.com/episode/{episode_slug}',
+                             season_id=season_data.get('slug'), season=season_data.get('name'), url_transparent=True)
+             for season_data in show_data.get('seasons', []) for episode_slug in season_data['episodes']],
+            show_data.get('id'), show_data.get('name'), show_data.get('description'),
+            thumbnail=show_data.get('image'))