def _node_text(node, path):
    """Return the text of the first element matching *path* below *node*.

    Returns None when no such element exists, instead of raising
    AttributeError the way ``node.find(path).text`` would.
    """
    element = node.find(path)
    if element is None:
        return None
    return element.text


def extract_channel_from_xml_url(ie, channel_id, xml_url):
    """Download one channel listing document and summarise its contents.

    ie         -- extractor instance; only its ``_download_xml`` method is used
    channel_id -- id reported in progress messages and echoed in the result
    xml_url    -- URL of the listing XML to fetch

    Returns a dict with the keys ``id``, ``title``, ``description`` and
    ``assets``.  ``assets`` holds one ``{'id': ..., 'type': ...}`` dict per
    ``teasers/teaser`` element, in document order.  Any field whose element
    is missing from the document is reported as None.
    """
    doc = ie._download_xml(
        xml_url, channel_id,
        note='Downloading channel info',
        errnote='Failed to download channel info')

    # Emit exactly one entry per teaser, even for incomplete ones: callers
    # that page through listings compare the number of entries against the
    # requested page size to detect the last page.
    assets = []
    for teaser in doc.findall('.//teasers/teaser'):
        assets.append({
            'id': _node_text(teaser, './details/assetId'),
            'type': _node_text(teaser, './type'),
        })

    return {
        'id': channel_id,
        'title': _node_text(doc, './/information/title'),
        'description': _node_text(doc, './/information/detail'),
        'assets': assets,
    }
class ZDFChannelIE(ZDFIE):
    """Playlist extractor for ZDF Mediathek channel overview pages.

    Matches URLs containing ``kanaluebersicht/<numeric id>`` and returns a
    playlist made of every video listed in the channel, including videos
    found in the topics ('thema' assets) the channel links to.
    """
    _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*kanaluebersicht/)(?P<id>[0-9]+)'

    _TEST = {
        'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic',
        'info_dict': {
            'id': '1586442',
            'title': 'Titanic',
            'description': 'md5:444c048cfe3fdc2561be7de4bcbf1d04',
        },
        'playlist_count': 3,
    }

    # The server rejects higher values
    _CHUNK_LENGTH = 50

    def _load_chunks(self, channel_id, chunk_length):
        """Yield listing pages for *channel_id*, *chunk_length* assets at a time.

        Stops after the first page holding fewer than *chunk_length* assets.
        """
        offset = 0
        while True:
            url = ('http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s'
                   % (offset, chunk_length, channel_id))
            result = extract_channel_from_xml_url(self, channel_id, url)
            yield result
            if len(result['assets']) < chunk_length:
                return
            offset += chunk_length

    def _load_channel(self, channel_id, visited=None):
        """Return the first listing page of *channel_id* plus a 'video_ids' list.

        'video_ids' holds the channel's own videos first, followed by the
        videos of each linked topic in listing order.  *visited* carries the
        ids already expanded so that topics referencing each other cannot
        recurse forever.
        """
        if visited is None:
            visited = set()
        visited.add(channel_id)

        chunks = list(self._load_chunks(channel_id, self._CHUNK_LENGTH))

        video_ids = []
        topic_ids = []
        for chunk in chunks:
            for asset in chunk['assets']:
                asset_id = asset['id']
                if not asset_id:
                    # Nothing can be fetched for an asset without an id
                    continue
                if asset['type'] == 'video':
                    video_ids.append(asset_id)
                elif asset['type'] == 'thema':
                    topic_ids.append(asset_id)

        # Plain list.extend instead of reduce(): reduce is not a builtin on
        # Python 3, and repeated list concatenation copies the list each time.
        for topic_id in topic_ids:
            if topic_id in visited:
                continue
            video_ids.extend(self._load_channel(topic_id, visited)['video_ids'])

        # The first page carries the channel's own title and description
        result = chunks[0]
        result['video_ids'] = video_ids
        return result

    def _extract_channel(self, channel_id):
        """Build the playlist result dict for *channel_id*.

        Every video is resolved through _extract_video (inherited from
        ZDFIE) while the playlist is being built.
        """
        channel = self._load_channel(channel_id)
        return {
            '_type': 'playlist',
            'id': channel['id'],
            'title': channel['title'],
            'description': channel['description'],
            'entries': [self._extract_video(video_id)
                        for video_id in channel['video_ids']],
        }

    def _real_extract(self, url):
        return self._extract_channel(self._match_id(url))