From e2ef128e32f0bdf84c85db3f96725550c2ce6485 Mon Sep 17 00:00:00 2001 From: df Date: Thu, 19 Aug 2021 21:10:51 +0100 Subject: [PATCH] Rewrite Megaphone extractor with episode, playlist support --- youtube_dl/extractor/extractors.py | 7 +- youtube_dl/extractor/megaphone.py | 286 +++++++++++++++++++++++++++-- 2 files changed, 281 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6e8fc3961..58af174de 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -651,7 +651,12 @@ from .mediasite import ( MediasiteNamedCatalogIE, ) from .medici import MediciIE -from .megaphone import MegaphoneIE +from .megaphone import ( + MegaphoneIE, + MegaphoneEpisodeIE, + MegaphonePlaylistIE, + MegaphoneChannelIE, +) from .meipai import MeipaiIE from .melonvod import MelonVODIE from .meta import METAIE diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py index 5bafa6cf4..b678de1ff 100644 --- a/youtube_dl/extractor/megaphone.py +++ b/youtube_dl/extractor/megaphone.py @@ -4,28 +4,49 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + clean_html, + dict_get, + get_element_by_class, + js_to_json, + parse_duration, + parse_iso8601, + str_or_none, + try_get, +) +from ..compat import ( + compat_etree_Element, + compat_etree_fromstring, + compat_str, + compat_xpath, +) class MegaphoneIE(InfoExtractor): IE_NAME = 'megaphone.fm' IE_DESC = 'megaphone.fm embedded players' - _VALID_URL = r'https://player\.megaphone\.fm/(?P[A-Z0-9]+)' - _TEST = { - 'url': 'https://player.megaphone.fm/GLT9749789991?"', + _PLAYER_URL_TEMPL = 'https://player.megaphone.fm/%s' + _VALID_URL_TEMPL = _PLAYER_URL_TEMPL.replace('.', r'\.') + _VALID_URL = _VALID_URL_TEMPL % r'(?P[A-Z0-9]+)' + _JSON_URL_TEMPL = _PLAYER_URL_TEMPL % 'playlist/episode/%s' + _TESTS = [{ + 'url': 'https://player.megaphone.fm/GLT9749789991', 'md5': '4816a0de523eb3e972dc0dda2c191f96', 'info_dict': { 'id': 'GLT9749789991', 'ext': 'mp3', 'title': '#97 What Kind Of Idiot Gets Phished?', - 'thumbnail': r're:^https://.*\.png.*$', - 'duration': 1776.26375, - 'author': 'Reply All', + 'thumbnail': r're:^https://.*\.png(?:\?.+)?$', + 'duration': 2013.36, + 'uploader': 'Reply All', + 'upload_date': '20170518', + 'timestamp': 1495101600, + 'description': 'md5:8fc2ba1da0efb099ef928df127358a90', }, - } + }] - def _real_extract(self, url): - video_id = self._match_id(url) + def _old_real_extract(self, url, video_id): + """version for pages before React-ification""" webpage = self._download_webpage(url, video_id) title = self._og_search_property('audio:title', webpage) @@ -45,11 +66,254 @@ class MegaphoneIE(InfoExtractor): 'thumbnail': thumbnail, 'title': title, 'author': author, - 'duration': episode_data['duration'], + 'duration': episode_data.get('duration'), 'formats': formats, } + def _real_extract(self, url): + video_id = self._match_id(url) + + episode_json = self._download_json(self._JSON_URL_TEMPL % video_id, video_id, fatal=False) + if episode_json is False: + # probably, no pages match the old structure, but try anyway + return self._old_real_extract(url, video_id) + entries = [] + for e in try_get(episode_json, lambda x: x['episodes'], list) or []: + title = try_get(e, lambda x: x['title'], compat_str) + if not title: + continue + video_url = dict_get(e, ('episodeUrlHRef', 'audioURL')) + if not video_url: + continue + entry = { + 'id': e.get('UID') or video_id, + 'title': title, + 'description': clean_html(e.get('summary')), + 'alt_title': e.get('subtitle'), + 'formats': [{'url': video_url}], + 'thumbnail': e.get('imageUrl'), + 'duration': parse_duration(e.get('duration')), + 'timestamp': parse_iso8601(e.get('pubDate')), + } + uploader = episode_json.get('podcastTitle') + if uploader: + entry['uploader'] = uploader + entry['author'] = uploader + entries.append(entry) + if entries: + if len(entries) == 1: + return entries[0] + return self.playlist_result(entries, playlist_id=video_id, playlist_title=episode_json.get('podcastTitle')) + @classmethod def _extract_urls(cls, webpage): return [m[0] for m in re.findall( r']*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)] + + +class MegaphoneEpisodeIE(MegaphoneIE): + IE_NAME = 'megaphone.fm:episode' + IE_DESC = 'megaphone.fm episode' + _VALID_URL_TEMPL = r'https://playlist\.megaphone\.fm/?%s' + _VALID_URL = _VALID_URL_TEMPL % r'\?e=(?P[A-Z0-9]+)' + _JSON_URL_TEMPL = MegaphoneIE._PLAYER_URL_TEMPL % 'playlist/episode/%s' + _TESTS = [{ + 'url': 'https://playlist.megaphone.fm/?e=PAN7405681599', + 'md5': '7fa866c3af93caac7e13a579c183f6ab', + 'info_dict': { + 'id': 'PAN7405681599', + 'ext': 'mp3', + 'title': 'Nirvana - Nevermind: 30 Years Later with Danny Goldberg', + 'thumbnail': r're:^https://.*\.jpe?g(?:\?.+)?$', + 'duration': 3576.94, + 'uploader': 'Cobras & Fire: Comedy / Rock Talk Show', + 'upload_date': '20210810', + 'timestamp': 1628578800, + 'description': 'md5:8f5623a8b22d3be4420c4570d0e36b69', + }, + }] + + +class MegaphonePlaylistIE(MegaphoneEpisodeIE): + IE_NAME = 'megaphone.fm:playlist' + IE_DESC = 'megaphone.fm playlist' + _VALID_URL = MegaphoneEpisodeIE._VALID_URL_TEMPL % r'\?p=(?P[A-Z0-9]+)' + _JSON_URL_TEMPL = MegaphoneIE._PLAYER_URL_TEMPL % 'playlist/%s' + _TESTS = [{ + 'url': 'https://playlist.megaphone.fm/?p=DEM6640968282', + 'info_dict': { + 'id': 'DEM6640968282', + 'title': 'Lightbulb Productions', + }, + 'playlist_mincount': 6, + }, { + 'url': 'https://playlist.megaphone.fm/?p=DEM6640968282', + 'md5': '71fbb6616c75aa2cc972e978683dffd4', + 'info_dict': { + 'id': 'DEM6640968282', + 'ext': 'mp3', + 'title': 'Open Source World', + 'thumbnail': r're:^https://.*\.jpe?g(?:\?.+)?$', + 'duration': 754.38, + 'uploader': 'Lightbulb Productions', + 'upload_date': '20200602', + 'timestamp': 1591070400, + 'description': 'md5:a06a5a078c0d98bb023626615fb1432d', + }, + 'params': { + 'noplaylist': True, + }, + }] + + def _real_extract(self, url): + entries = super(MegaphonePlaylistIE, self)._real_extract(url) + if entries: + noplaylist = self._downloader.params.get('noplaylist') + if noplaylist: + self.to_screen('Downloading just the first episode because of --no-playlist') + return entries['entries'][0] + return entries + + +import xml +import sys + +if sys.version_info[0] >= 3: + compat_XMLParser = xml.etree.ElementTree.XMLParser +else: + from ..compat import ( + _XML, + _TreeBuilder, + _element_factory, + _etree_iter, + ) + + class compat_XMLParser(xml.etree.ElementTree.XMLParser): + def _fixtext(self, text): + return text + + def compat_etree_fromstring(text): + doc = _XML( + text.encode('utf-8'), + parser=compat_XMLParser( + target=_TreeBuilder(element_factory=_element_factory), + encoding='utf-8')) + for el in _etree_iter(doc): + if el.text is not None and isinstance(el.text, bytes): + el.text = el.text.decode('utf-8') + return doc + + +class MegaphoneChannelIE(MegaphoneIE): + IE_NAME = 'megaphone.fm:channel' + IE_DESC = 'megaphone.fm channel' + _VALID_URL = r'https://cms\.megaphone\.fm/channel/(?P[A-Z0-9]+)(?:\?selected=(?P[A-Z0-9]+))?' + _TESTS = [{ + 'url': 'https://cms.megaphone.fm/channel/ADL3707263633', + 'info_dict': { + 'id': 'ADL3707263633', + 'title': 'Pax Britannica', + 'description': 'md5:7b4002330ffe4abcb81d97ab9b56fede', + }, + 'playlist_mincount': 98, + }, { + 'url': 'https://cms.megaphone.fm/channel/ADL3707263633?selected=ADL9449136081', + 'md5': '42901d1112c059a8de374046e0b1ed25', + 'info_dict': { + 'id': 'ADL9449136081', + 'title': '02.23 - Nolumus Leges Angliae Mutari', + 'description': 'md5:d35989ec81de7199b3020bc919ab7c0d', + 'ext': 'mp3', + 'thumbnail': r're:^https://.*\.png(?:\?.+)?$', + 'duration': 2470.09, + 'extractor_key': 'Megaphone', + 'upload_date': '20210711', + 'timestamp': 1625961600, + 'uploader': 'Pax Britannica', + }, + }, { + 'url': 'https://cms.megaphone.fm/channel/ADL3707263633', + 'md5': '81156c760235d45a9133a9ea9ccbb7d0', + 'info_dict': { + 'id': 'ADL9716153485', + 'title': '02.24 - Give Unto Caesar His Due', + 'description': 'The First English Civil War begins.', + 'ext': 'mp3', + 'duration': 1860, + }, + 'params': { + 'noplaylist': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url).groupdict() + video_id = mobj['id'] + # If clip is selected, dl that instead + clip_id = mobj.get('clip_id') + if clip_id: + return self.url_result(self._PLAYER_URL_TEMPL % clip_id, ie='Megaphone', video_id=clip_id) + webpage = self._download_webpage(url, video_id) + # Useful information is split between a JS JSON.parse() call and + # a
]*?class\s*=\s*("|\')public-ep-list\1[^>]*>.*?(?P]*?id\s*=.+?
\s*)\s*', + webpage, 'episode list', default=None, group='ep_list') + if ep_list: + # Although the page itself isn't well-formed as XML, the + #