# youtube-dl/youtube_dl/extractor/nfl.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlparse,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    remove_end,
)


class NFLIE(InfoExtractor):
    """Extractor for videos hosted on nfl.com and the NFL team sites.

    The extraction is a three step process:

    1. download the web page and locate the player config URL and the
       real content id (for article pages the id in the URL is the id of
       the article, not of the embedded video);
    2. download the player config JSON, which provides the template of
       the per-video metadata URL and the list of CDNs;
    3. download the video metadata JSON and build one format per
       (CDN, bitrate) combination.
    """

    IE_NAME = 'nfl.com'
    _VALID_URL = r'''(?x)
                    https?://
                        (?P<host>
                            (?:www\.)?
                            (?:
                                (?:
                                    nfl|
                                    buffalobills|
                                    miamidolphins|
                                    patriots|
                                    newyorkjets|
                                    baltimoreravens|
                                    bengals|
                                    clevelandbrowns|
                                    steelers|
                                    houstontexans|
                                    colts|
                                    jaguars|
                                    titansonline|
                                    denverbroncos|
                                    kcchiefs|
                                    raiders|
                                    chargers|
                                    dallascowboys|
                                    giants|
                                    philadelphiaeagles|
                                    redskins|
                                    chicagobears|
                                    detroitlions|
                                    packers|
                                    vikings|
                                    atlantafalcons|
                                    panthers|
                                    neworleanssaints|
                                    buccaneers|
                                    azcardinals|
                                    stlouisrams|
                                    49ers|
                                    seahawks
                                )\.com|
                                .+?\.clubs\.nfl\.com
                            )
                        )/
                        (?:.+?/)*
                        (?P<id>[^/#?&]+)
                    '''
    _TESTS = [{
        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
        'md5': '394ef771ddcd1354f665b471d78ec4c6',
        'info_dict': {
            'id': '0ap3000000398478',
            'ext': 'mp4',
            'title': 'Week 3: Redskins vs. Eagles highlights',
            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
            'upload_date': '20140921',
            'timestamp': 1411337580,
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
        'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
        'info_dict': {
            'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
            'ext': 'mp4',
            'title': 'LIVE: Post Game vs. Browns',
            'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
            'upload_date': '20131229',
            'timestamp': 1388354455,
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',
        'info_dict': {
            'id': '0ap3000000467607',
            'ext': 'mp4',
            'title': 'Frustrations flare on the field',
            'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',
            'timestamp': 1422850320,
            'upload_date': '20150202',
        },
    }, {
        'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette',
        'md5': '4c319e2f625ffd0b481b4382c6fc124c',
        'info_dict': {
            'id': 'n-238346',
            'ext': 'mp4',
            'title': '10 Days at Gillette',
            'description': 'md5:8cd9cd48fac16de596eadc0b24add951',
            'timestamp': 1442618809,
            'upload_date': '20150918',
        },
    }, {
        # lowercase data-contentid
        'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7',
        'info_dict': {
            'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2',
            'ext': 'mp4',
            'title': 'Tomlin looks ahead to Ravens on a short week',
            'description': 'md5:32f3f7b139f43913181d5cbb24ecad75',
            'timestamp': 1443459651,
            'upload_date': '20150928',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
        'only_matching': True,
    }, {
        'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a',
        'only_matching': True,
    }]

    @staticmethod
    def prepend_host(host, url):
        """Return ``url`` as an absolute URL, resolving it against ``host``.

        * ``http://`` and ``https://`` URLs are returned unchanged.
        * Protocol-relative URLs (``//other.host/path``) already name
          their own host, so they only get the ``http:`` scheme.
        * Anything else is treated as a path on ``host``; a leading
          slash is added when it is missing.
        """
        # Test for a real scheme separator rather than a bare 'http'
        # prefix, so that a relative path which merely starts with
        # "http" is still resolved against the host.
        if re.match(r'https?://', url):
            return url
        if url.startswith('//'):
            return 'http:{0}'.format(url)
        if not url.startswith('/'):
            url = '/{0}'.format(url)
        return 'http://{0}{1}'.format(host, url)

    @staticmethod
    def format_from_stream(stream, protocol, host, path_prefix='',
                           preference=0, note=None):
        """Build a format dict for one bitrate entry served from one CDN.

        ``stream`` is a dict whose ``path`` and ``rate`` keys are read.
        The URL is assembled as ``protocol://host/path_prefix + path``,
        so ``host`` must not end with a slash and a non-empty
        ``path_prefix`` must end with one.  ``rate`` is divided by 1000
        to obtain ``vbr``.  ``preference`` and ``note`` are copied into
        the result as ``preference`` and ``format_note``.
        """
        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
            protocol=protocol,
            host=host,
            prefix=path_prefix,
            path=stream.get('path'),
        )
        return {
            'url': url,
            'vbr': int_or_none(stream.get('rate', 0), 1000),
            'preference': preference,
            'format_note': note,
        }

    def _real_extract(self, url):
        """Extract the info dict for the video embedded in the page at ``url``.

        Raises ExtractorError when no usable CDN information is
        available.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id, host = mobj.group('id'), mobj.group('host')

        webpage = self._download_webpage(url, video_id)

        # Pages that do not announce a player config use the default one.
        config_url = NFLIE.prepend_host(host, self._search_regex(
            r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1',
            webpage, 'config URL', default='static/content/static/config/video/config.json',
            group='config'))
        # For articles, the id in the url is not the video id
        video_id = self._search_regex(
            r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>(?:(?!\1).)+)\1',
            webpage, 'video id', default=video_id, group='id')
        config = self._download_json(config_url, video_id, 'Downloading player config')
        # The template contains an ``{id}`` placeholder that is filled in
        # only after it has been made absolute.
        url_template = NFLIE.prepend_host(host, config['contentURLTemplate'])
        video_data = self._download_json(
            url_template.format(id=video_id), video_id)

        formats = []
        # ``or`` instead of a .get() default: the keys may be present
        # with an explicit null value.
        cdn_data = video_data.get('cdnData') or {}
        streams = cdn_data.get('bitrateInfo') or []
        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
            # The streams live on the host named by the metadata itself.
            parts = compat_urllib_parse_urlparse(cdn_data.get('uri') or '')
            if not (parts.scheme and parts.netloc):
                raise ExtractorError('Failed to get CDN data', expected=True)
            for stream in streams:
                formats.append(NFLIE.format_from_stream(
                    stream, parts.scheme, parts.netloc))
        else:
            cdns = config.get('cdns')
            if not cdns:
                raise ExtractorError('Failed to get CDN data', expected=True)

            for name, cdn in cdns.items():
                # LimeLight streams don't seem to work
                if cdn.get('name') == 'LIMELIGHT':
                    continue

                protocol = cdn.get('protocol')
                cdn_host = remove_end(cdn.get('host', ''), '/')
                if not (protocol and cdn_host):
                    continue

                prefix = cdn.get('pathprefix', '')
                if prefix and not prefix.endswith('/'):
                    prefix = '%s/' % prefix

                # Progressive downloads are preferred, RTMP is a last resort.
                preference = 0
                if protocol == 'rtmp':
                    preference = -2
                elif 'prog' in name.lower():
                    preference = 1

                for stream in streams:
                    formats.append(NFLIE.format_from_stream(
                        stream, protocol, cdn_host, prefix, preference, name))

        self._sort_formats(formats)

        # Pick the largest thumbnail that is available.
        image_paths = video_data.get('imagePaths') or {}
        thumbnail = next(
            (image_paths[q] for q in ('xl', 'l', 'm', 's', 'xs')
             if image_paths.get(q)),
            None)

        return {
            'id': video_id,
            'title': video_data.get('headline'),
            'formats': formats,
            'description': video_data.get('caption'),
            'duration': video_data.get('duration'),
            'thumbnail': thumbnail,
            'timestamp': int_or_none(video_data.get('posted'), 1000),
        }
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`import re`

			`from .common import InfoExtractor`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 12:24:42 +01:00			`from ..compat import (`
			`compat_urllib_parse_urlparse,`
			`)`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00			`from ..utils import (`
			`ExtractorError,`
			`int_or_none,`
			`remove_end,`
			`)`


			`class NFLIE(InfoExtractor):`
			`IE_NAME = 'nfl.com'`
[nfl] Add team domains (#6907) 2015-09-20 19:12:40 +02:00			`_VALID_URL = r'''(?x)`
			`https?://`
			`(?P<host>`
			`(?:www\.)?`
			`(?:`
			`(?:`
			`nfl\|`
			`buffalobills\|`
			`miamidolphins\|`
			`patriots\|`
			`newyorkjets\|`
			`baltimoreravens\|`
			`bengals\|`
			`clevelandbrowns\|`
			`steelers\|`
			`houstontexans\|`
			`colts\|`
			`jaguars\|`
			`titansonline\|`
			`denverbroncos\|`
			`kcchiefs\|`
			`raiders\|`
			`chargers\|`
			`dallascowboys\|`
			`giants\|`
			`philadelphiaeagles\|`
			`redskins\|`
			`chicagobears\|`
			`detroitlions\|`
			`packers\|`
			`vikings\|`
			`atlantafalcons\|`
			`panthers\|`
			`neworleanssaints\|`
			`buccaneers\|`
			`azcardinals\|`
			`stlouisrams\|`
			`49ers\|`
			`seahawks`
			`)\.com\|`
			`.+?\.clubs\.nfl\.com`
			`)`
			`)/`
			`(?:.+?/)*`
[nfl] Add support for URLs without id (Closes #6907) 2015-09-20 19:45:01 +02:00			`(?P<id>[^/#?&]+)`
[nfl] Add team domains (#6907) 2015-09-20 19:12:40 +02:00			`'''`
			`_TESTS = [{`
			`'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',`
			`'md5': '394ef771ddcd1354f665b471d78ec4c6',`
			`'info_dict': {`
			`'id': '0ap3000000398478',`
			`'ext': 'mp4',`
			`'title': 'Week 3: Redskins vs. Eagles highlights',`
			`'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',`
			`'upload_date': '20140921',`
			`'timestamp': 1411337580,`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 13:08:07 +01:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[nfl] Relax _VALID_URL (fixes #5940) 2015-06-10 08:15:20 +02:00			`}`
[nfl] Add team domains (#6907) 2015-09-20 19:12:40 +02:00			`}, {`
			`'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',`
			`'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',`
			`'info_dict': {`
			`'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',`
			`'ext': 'mp4',`
			`'title': 'LIVE: Post Game vs. Browns',`
			`'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',`
			`'upload_date': '20131229',`
			`'timestamp': 1388354455,`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 13:08:07 +01:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[nfl] Add team domains (#6907) 2015-09-20 19:12:40 +02:00			`}`
			`}, {`
			`'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',`
			`'info_dict': {`
			`'id': '0ap3000000467607',`
			`'ext': 'mp4',`
			`'title': 'Frustrations flare on the field',`
			`'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',`
			`'timestamp': 1422850320,`
			`'upload_date': '20150202',`
			`},`
[nfl] Add support for URLs without id (Closes #6907) 2015-09-20 19:45:01 +02:00			`}, {`
			`'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette',`
			`'md5': '4c319e2f625ffd0b481b4382c6fc124c',`
			`'info_dict': {`
			`'id': 'n-238346',`
			`'ext': 'mp4',`
			`'title': '10 Days at Gillette',`
			`'description': 'md5:8cd9cd48fac16de596eadc0b24add951',`
			`'timestamp': 1442618809,`
			`'upload_date': '20150918',`
			`},`
[nfl] Add test for #7012 2015-09-30 16:06:21 +02:00			`}, {`
			`# lowercase data-contentid`
			`'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7',`
			`'info_dict': {`
			`'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2',`
			`'ext': 'mp4',`
			`'title': 'Tomlin looks ahead to Ravens on a short week',`
			`'description': 'md5:32f3f7b139f43913181d5cbb24ecad75',`
			`'timestamp': 1443459651,`
			`'upload_date': '20150928',`
			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
[nfl] Add team domains (#6907) 2015-09-20 19:12:40 +02:00			`}, {`
			`'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',`
			`'only_matching': True,`
			`}, {`
			`'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a',`
			`'only_matching': True,`
			`}]`
[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00
			`@staticmethod`
			`def prepend_host(host, url):`
			`if not url.startswith('http'):`
			`if not url.startswith('/'):`
			`url = '/%s' % url`
			`url = 'http://{0:}{1:}'.format(host, url)`
			`return url`

			`@staticmethod`
			`def format_from_stream(stream, protocol, host, path_prefix='',`
			`preference=0, note=None):`
			`url = '{protocol:}://{host:}/{prefix:}{path:}'.format(`
			`protocol=protocol,`
			`host=host,`
			`prefix=path_prefix,`
			`path=stream.get('path'),`
			`)`
			`return {`
			`'url': url,`
			`'vbr': int_or_none(stream.get('rate', 0), 1000),`
			`'preference': preference,`
			`'format_note': note,`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00			`}`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00			`video_id, host = mobj.group('id'), mobj.group('host')`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00
[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00			`webpage = self._download_webpage(url, video_id)`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00
[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00			`config_url = NFLIE.prepend_host(host, self._search_regex(`
[nfl] Add support for URLs without id (Closes #6907) 2015-09-20 19:45:01 +02:00			`r'(?:(?:config\|configURL)\s:\s\|<nflcs:avplayer[^>]+data-config\s=\s)(["\'])(?P<config>.+?)\1',`
			`webpage, 'config URL', default='static/content/static/config/video/config.json',`
			`group='config'))`
[nfl] Add support for articles pages (fixes #4848) 2015-02-02 23:13:28 +01:00			`# For articles, the id in the url is not the video id`
			`video_id = self._search_regex(`
Improve some id regexes 2016-09-14 18:03:26 +02:00			`r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s=\s\|content[Ii]d\s:\s)(["\'])(?P<id>(?:(?!\1).)+)\1',`
[nfl] Add support for URLs without id (Closes #6907) 2015-09-20 19:45:01 +02:00			`webpage, 'video id', default=video_id, group='id')`
			`config = self._download_json(config_url, video_id, 'Downloading player config')`
[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00			`url_template = NFLIE.prepend_host(`
			`host, '{contentURLTemplate:}'.format(**config))`
			`video_data = self._download_json(`
			`url_template.format(id=video_id), video_id)`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00
			`formats = []`
[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00			`cdn_data = video_data.get('cdnData', {})`
			`streams = cdn_data.get('bitrateInfo', [])`
			`if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':`
[nfl] Use compatible urlparse 2014-09-30 19:01:37 +02:00			`parts = compat_urllib_parse_urlparse(cdn_data.get('uri'))`
[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00			`protocol, host = parts.scheme, parts.netloc`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00			`for stream in streams:`
[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00			`formats.append(`
			`NFLIE.format_from_stream(stream, protocol, host))`
			`else:`
			`cdns = config.get('cdns')`
			`if not cdns:`
			`raise ExtractorError('Failed to get CDN data', expected=True)`

			`for name, cdn in cdns.items():`
			`# LimeLight streams don't seem to work`
			`if cdn.get('name') == 'LIMELIGHT':`
			`continue`

			`protocol = cdn.get('protocol')`
			`host = remove_end(cdn.get('host', ''), '/')`
			`if not (protocol and host):`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00			`continue`

[nfl] Support team micro-sites (fixes #3831) 2014-09-28 20:48:26 +02:00			`prefix = cdn.get('pathprefix', '')`
			`if prefix and not prefix.endswith('/'):`
			`prefix = '%s/' % prefix`

			`preference = 0`
			`if protocol == 'rtmp':`
			`preference = -2`
			`elif 'prog' in name.lower():`
			`preference = 1`

			`for stream in streams:`
			`formats.append(`
			`NFLIE.format_from_stream(stream, protocol, host,`
			`prefix, preference, name))`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00
			`self._sort_formats(formats)`

			`thumbnail = None`
			`for q in ('xl', 'l', 'm', 's', 'xs'):`
			`thumbnail = video_data.get('imagePaths', {}).get(q)`
			`if thumbnail:`
			`break`

			`return {`
			`'id': video_id,`
[nfl] Prefer progressive downloads 2014-09-28 18:25:28 +02:00			`'title': video_data.get('headline'),`
[nfl] Add new extractor. (Closes #3815) 2014-09-22 23:28:19 +02:00			`'formats': formats,`
			`'description': video_data.get('caption'),`
			`'duration': video_data.get('duration'),`
			`'thumbnail': thumbnail,`
			`'timestamp': int_or_none(video_data.get('posted'), 1000),`
			`}`