yt-dlp/youtube_dl/extractor/arte.py

import re
import json
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    find_xpath_attr,
    unified_strdate,
)

class ArteTvIE(InfoExtractor):
    """
    There are three sources of video in arte.tv: www.arte.tv/guide,
    videos.arte.tv and liveweb.arte.tv; the extraction process is
    different for each one.
    The videos expire in 7 days, so we can't add tests.
    """
    # URL patterns handled by this extractor. suitable() and _real_extract()
    # try them with re.match, so each one is anchored at the start of the url.
    # NOTE(review): most dots in the host names and in ".html" are not escaped,
    # so they match any character, not only a literal dot.

    # Guide pages: www.arte.tv/guide/<lang>/[sendungen/|emissions/]<id>/<name>
    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
    # videos.arte.tv pages: the id is captured between a "-" and ".html"
    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
    # liveweb.arte.tv pages: /<lang>/<subpage>/<name>
    _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
    # Ending of a live stream page address ("index-<digits>.html"); used with
    # re.search in _real_extract to reject live streams.
    _LIVE_URL = r'index-[0-9]+\.html$'

    # presumably the name under which the extractor is listed -- confirm
    # against InfoExtractor
    IE_NAME = u'arte.tv'

    @classmethod
    def suitable(cls, url):
        return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))

    # TODO implement Live Stream
    # from ..utils import compat_urllib_parse
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path',   u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url',    u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def _real_extract(self, url):
        mobj = re.match(self._EMISSION_URL, url)
        if mobj is not None:
            lang = mobj.group('lang')
            # This is not a real id, it can be for example AJT for the news
            # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
            video_id = mobj.group('id')
            return self._extract_emission(url, video_id, lang)

        mobj = re.match(self._VIDEOS_URL, url)
        if mobj is not None:
            id = mobj.group('id')
            lang = mobj.group('lang')
            return self._extract_video(url, id, lang)

        mobj = re.match(self._LIVEWEB_URL, url)
        if mobj is not None:
            name = mobj.group('name')
            lang = mobj.group('lang')
            return self._extract_liveweb(url, name, lang)

        if re.search(self._LIVE_URL, video_id) is not None:
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

    def _extract_emission(self, url, video_id, lang):
        """Extract from www.arte.tv/guide

        Downloads the page at url, follows the arte_vp_url attribute found
        in it to the player JSON and selects the best format whose
        versionCode corresponds to lang ('fr' or 'de').

        Returns a dict with the keys id, title, description, upload_date,
        thumbnail, ext and url, plus play_path when the chosen format is
        served over rtmp.

        Raises ExtractorError if lang is not supported or if no format is
        available in that language.
        """
        webpage = self._download_webpage(url, video_id)
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')

        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
        player_info = json.loads(json_info)['videoJsonPlayer']

        info_dict = {
            'id': player_info['VID'],
            'title': player_info['VTI'],
            # The description is optional in the JSON
            'description': player_info.get('VDE'),
            # Only the part of VDA before the first space is passed on
            'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
            'thumbnail': player_info['programImage'],
            'ext': 'flv',
        }

        # Letter used inside versionCode for the language of the url
        lang_letters = {'fr': 'F', 'de': 'A'}
        if lang not in lang_letters:
            raise ExtractorError(u'Unsupported language: %s' % lang)
        letter = lang_letters[lang]
        lang_regexes = [r'VO?%s' % letter, r'VO?.-ST%s' % letter]

        def _match_lang(f):
            # Return true if that format is in the language of the url
            return any(re.match(r, f['versionCode']) for r in lang_regexes)

        def _preference(f):
            # Videos without subtitles in the same language come last (they
            # are preferred); within each group the order is by height.
            no_same_lang_subs = re.match(r'VO(F|A)-STM\1', f['versionCode']) is None
            return (no_same_lang_subs, int(f['height']))

        # Some formats may not be in the same language as the url
        candidates = [f for f in player_info['VSR'].values() if _match_lang(f)]
        if not candidates:
            raise ExtractorError(u'No format available for language %s' % lang)
        # sorted() is stable, so one sort on the tuple key yields the same
        # order as sorting by height first and by subtitles afterwards.
        format_info = sorted(candidates, key=_preference)[-1]

        if format_info['mediaType'] == u'rtmp':
            info_dict['url'] = format_info['streamer']
            info_dict['play_path'] = 'mp4:' + format_info['url']
        else:
            info_dict['url'] = format_info['url']

        return info_dict

    def _extract_video(self, url, video_id, lang):
        """Extract from videos.arte.tv

        Derives the address of the player XML from url, looks up the video
        element whose lang attribute equals lang, downloads the
        configuration XML it refers to and picks the best quality url.

        Returns a dict with the keys id, title, thumbnail, url and ext.

        Raises ExtractorError if no configuration exists for lang or if the
        configuration lists no video url.
        """
        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
        ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
        ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
        config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
        # assumes find_xpath_attr returns None when nothing matches -- TODO confirm
        if config_node is None:
            raise ExtractorError(u'Unable to find the video configuration for language %s' % lang)
        config_xml_url = config_node.attrib['ref']
        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')

        video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
        if not video_urls:
            raise ExtractorError(u'Unable to find any video url')

        def _key(m):
            # 'hd' entries rank above every other quality
            return 2 if m.group('quality') == 'hd' else 1
        # We pick the best quality; the sort is stable, so among entries of
        # equal rank the one listed last wins.
        video_url = sorted(video_urls, key=_key)[-1].group('url')

        title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
        thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
                                            config_xml, 'thumbnail')
        return {'id': video_id,
                'title': title,
                'thumbnail': thumbnail,
                'url': video_url,
                'ext': 'flv',
                }

    def _extract_liveweb(self, url, name, lang):
        """Extract form http://liveweb.arte.tv/"""
        webpage = self._download_webpage(url, name)
        video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
        config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
                                            video_id, u'Downloading information')
        config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
        event_doc = config_doc.find('event')
        url_node = event_doc.find('video').find('urlHd')
        if url_node is None:
            url_node = video_doc.find('urlSd')

        return {'id': video_id,
                'title': event_doc.find('name%s' % lang.capitalize()).text,
                'url': url_node.text.replace('MP4', 'mp4'),
                'ext': 'flv',
                'thumbnail': self._og_search_thumbnail(webpage),
                }
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00			`import re`
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00			`import json`
[arte] Fix language selection (Fixes #988) 2013-07-04 18:06:47 +02:00			`import xml.etree.ElementTree`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00
			`from .common import InfoExtractor`
			`from ..utils import (`
			`ExtractorError,`
[arte] Fix on 2.6 2013-07-11 16:12:16 +02:00			`find_xpath_attr,`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00			`unified_strdate,`
			`)`

			`class ArteTvIE(InfoExtractor):`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00			`"""`
			`There are two sources of video in arte.tv: videos.arte.tv and`
			`www.arte.tv/guide, the extraction process is different for each one.`
			`The videos expire in 7 days, so we can't add tests.`
			`"""`
ArteTVIE: extract the video with the correct language Some urls from the French version of the page could download the German version. Also instead of extracting the json url from the webpage, build it to skip the download 2013-07-02 17:34:40 +02:00			`_EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr\|de)/(?:(?:sendungen\|emissions)/)?(?P<id>.?)/(?P<name>.?)(\?.*)?'`
[arte] Fix language selection (Fixes #988) 2013-07-04 18:06:47 +02:00			`_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr\|de)/.-(?P<id>.?).html'`
[arte] add support for downloading from http://liveweb.arte.tv (fixes #1014) 2013-08-03 19:07:04 +02:00			`_LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr\|de)/(?P<subpage>.+?)/(?P<name>.+)'`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00			`_LIVE_URL = r'index-[0-9]+\.html$'`

			`IE_NAME = u'arte.tv'`

ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00			`@classmethod`
			`def suitable(cls, url):`
[arte] add support for downloading from http://liveweb.arte.tv (fixes #1014) 2013-08-03 19:07:04 +02:00			`return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00
[arte] Mark dead code as such 2013-06-23 20:26:35 +02:00			`# TODO implement Live Stream`
Remove dead code 2013-07-08 02:13:50 +02:00			`# from ..utils import compat_urllib_parse`
[arte] Mark dead code as such 2013-06-23 20:26:35 +02:00			`# def extractLiveStream(self, url):`
			`# video_lang = url.split('/')[-4]`
			`# info = self.grep_webpage(`
			`# url,`
			`# r'src="(.?/videothek_js.?\.js)',`
			`# 0,`
			`# [`
			`# (1, 'url', u'Invalid URL: %s' % url)`
			`# ]`
			`# )`
			`# http_host = url.split('/')[2]`
			`# next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))`
			`# info = self.grep_webpage(`
			`# next_url,`
			`# r'(s_artestras_scst_geoFRDE_' + video_lang + '.?)\'.?' +`
			`# '(http://.?\.swf).?' +`
			`# '(rtmp://.*?)\'',`
			`# re.DOTALL,`
			`# [`
			`# (1, 'path', u'could not extract video path: %s' % url),`
			`# (2, 'player', u'could not extract video player: %s' % url),`
			`# (3, 'url', u'could not extract video url: %s' % url)`
			`# ]`
			`# )`
			`# video_url = u'%s/%s' % (info.get('url'), info.get('path'))`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00
			`def _real_extract(self, url):`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00			`mobj = re.match(self._EMISSION_URL, url)`
			`if mobj is not None:`
ArteTVIE: extract the video with the correct language Some urls from the French version of the page could download the German version. Also instead of extracting the json url from the webpage, build it to skip the download 2013-07-02 17:34:40 +02:00			`lang = mobj.group('lang')`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00			`# This is not a real id, it can be for example AJT for the news`
			`# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal`
			`video_id = mobj.group('id')`
ArteTVIE: extract the video with the correct language Some urls from the French version of the page could download the German version. Also instead of extracting the json url from the webpage, build it to skip the download 2013-07-02 17:34:40 +02:00			`return self._extract_emission(url, video_id, lang)`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00
			`mobj = re.match(self._VIDEOS_URL, url)`
			`if mobj is not None:`
			`id = mobj.group('id')`
[arte] Fix language selection (Fixes #988) 2013-07-04 18:06:47 +02:00			`lang = mobj.group('lang')`
			`return self._extract_video(url, id, lang)`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00
[arte] add support for downloading from http://liveweb.arte.tv (fixes #1014) 2013-08-03 19:07:04 +02:00			`mobj = re.match(self._LIVEWEB_URL, url)`
			`if mobj is not None:`
			`name = mobj.group('name')`
			`lang = mobj.group('lang')`
			`return self._extract_liveweb(url, name, lang)`

Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00			`if re.search(self._LIVE_URL, video_id) is not None:`
[arte] Mark dead code as such 2013-06-23 20:26:35 +02:00			`raise ExtractorError(u'Arte live streams are not yet supported, sorry')`
			`# self.extractLiveStream(url)`
			`# return`
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00
ArteTVIE: extract the video with the correct language Some urls from the French version of the page could download the German version. Also instead of extracting the json url from the webpage, build it to skip the download 2013-07-02 17:34:40 +02:00			`def _extract_emission(self, url, video_id, lang):`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00			`"""Extract from www.arte.tv/guide"""`
[arte] Always look for the JSON URL (Fixes #1002) 2013-07-08 01:28:19 +02:00			`webpage = self._download_webpage(url, video_id)`
			`json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')`
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00
			`json_info = self._download_webpage(json_url, video_id, 'Downloading info json')`
			`self.report_extraction(video_id)`
			`info = json.loads(json_info)`
			`player_info = info['videoJsonPlayer']`

			`info_dict = {'id': player_info['VID'],`
			`'title': player_info['VTI'],`
[arte] Prefer vídeos without subtitles in the same language (fixes #1173) and fix crash when there's no description 2013-08-03 17:32:29 +02:00			`'description': player_info.get('VDE'),`
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00			`'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),`
			`'thumbnail': player_info['programImage'],`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00			`'ext': 'flv',`
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00			`}`

			`formats = player_info['VSR'].values()`
ArteTVIE: extract the video with the correct language Some urls from the French version of the page could download the German version. Also instead of extracting the json url from the webpage, build it to skip the download 2013-07-02 17:34:40 +02:00			`def _match_lang(f):`
			`# Return true if that format is in the language of the url`
			`if lang == 'fr':`
			`l = 'F'`
			`elif lang == 'de':`
			`l = 'A'`
Fix regex error when only subtitled video is available on arte. 2013-08-01 11:48:17 +02:00			`regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]`
ArteTVIE: extract the video with the correct language Some urls from the French version of the page could download the German version. Also instead of extracting the json url from the webpage, build it to skip the download 2013-07-02 17:34:40 +02:00			`return any(re.match(r, f['versionCode']) for r in regexes)`
			`# Some formats may not be in the same language as the url`
			`formats = filter(_match_lang, formats)`
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00			`# We order the formats by quality`
			`formats = sorted(formats, key=lambda f: int(f['height']))`
[arte] Prefer vídeos without subtitles in the same language (fixes #1173) and fix crash when there's no description 2013-08-03 17:32:29 +02:00			`# Prefer videos without subtitles in the same language`
			`formats = sorted(formats, key=lambda f: re.match(r'VO(F\|A)-STM\1', f['versionCode']) is None)`
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00			`# Pick the best quality`
			`format_info = formats[-1]`
			`if format_info['mediaType'] == u'rtmp':`
			`info_dict['url'] = format_info['streamer']`
			`info_dict['play_path'] = 'mp4:' + format_info['url']`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00			`else:`
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00			`info_dict['url'] = format_info['url']`
Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00
ArteTvIE: rewrite the extract process to support the new site (fixes #875) The video can be downloaded with rtmp or http, but the best quality format seems to always use rtmp. Deleted the old methods. 2013-06-27 00:09:51 +02:00			`return info_dict`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00
[arte] Fix language selection (Fixes #988) 2013-07-04 18:06:47 +02:00			`def _extract_video(self, url, video_id, lang):`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00			`"""Extract from videos.arte.tv"""`
[arte] Fix language selection (Fixes #988) 2013-07-04 18:06:47 +02:00			`ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')`
			`ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')`
			`ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')`
			`ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)`
[arte] Fix on 2.6 2013-07-11 16:12:16 +02:00			`config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)`
[arte] Fix language selection (Fixes #988) 2013-07-04 18:06:47 +02:00			`config_xml_url = config_node.attrib['ref']`
			`config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')`
ArteTvIE: support videos from videos.arte.tv Each source of videos have a different extraction process, they are in different methods of the extractor. Changed the extension of videos from mp4 to flv. 2013-06-30 13:38:22 +02:00
			`video_urls = list(re.finditer(r'<url quality="(?P<quality>.?)">(?P<url>.?)</url>', config_xml))`
			`def _key(m):`
			`quality = m.group('quality')`
			`if quality == 'hd':`
			`return 2`
			`else:`
			`return 1`
			`# We pick the best quality`
			`video_urls = sorted(video_urls, key=_key)`
			`video_url = list(video_urls)[-1].group('url')`

			`title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')`
			`thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',`
			`config_xml, 'thumbnail')`
			`return {'id': video_id,`
			`'title': title,`
			`'thumbnail': thumbnail,`
			`'url': video_url,`
			`'ext': 'flv',`
			`}`
[arte] add support for downloading from http://liveweb.arte.tv (fixes #1014) 2013-08-03 19:07:04 +02:00
			`def _extract_liveweb(self, url, name, lang):`
			`"""Extract form http://liveweb.arte.tv/"""`
			`webpage = self._download_webpage(url, name)`
			`video_id = self._search_regex(r'eventId=(\d+?)("\|&)', webpage, u'event id')`
			`config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,`
			`video_id, u'Downloading information')`
			`config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))`
			`event_doc = config_doc.find('event')`
			`url_node = event_doc.find('video').find('urlHd')`
			`if url_node is None:`
			`url_node = video_doc.find('urlSd')`

			`return {'id': video_id,`
			`'title': event_doc.find('name%s' % lang.capitalize()).text,`
			`'url': url_node.text.replace('MP4', 'mp4'),`
			`'ext': 'flv',`
			`'thumbnail': self._og_search_thumbnail(webpage),`
			`}`