youtube-dl/youtube_dl/extractor/bloomberg.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor


class BloombergIE(InfoExtractor):
    """Information extractor for videos embedded in bloomberg.com pages.

    The page is scanned for a video id, which is then resolved through the
    ``/api/embed`` endpoint into a list of streams. Streams whose muxing
    format is ``TS`` are expanded as HLS (m3u8) formats; every other stream
    is expanded as HDS (f4m) formats.
    """

    _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'

    _TESTS = [{
        'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
        # The md5 checksum changes
        'info_dict': {
            'id': 'qurhIVlJSB6hzkVi229d8g',
            'ext': 'flv',
            'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
            'description': 'md5:a8ba0302912d03d246979735c17d2761',
        },
        'params': {
            'format': 'best[format_id^=hds]',
        },
    }, {
        # video ID in BPlayer(...)
        'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/',
        'info_dict': {
            'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74',
            'ext': 'flv',
            'title': 'Meet the Real-Life Tech Wizards of Middle Earth',
            'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.',
        },
        'params': {
            'format': 'best[format_id^=hds]',
        },
    }, {
        # data-bmmrid=
        'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money',
        'only_matching': True,
    }, {
        'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
        'only_matching': True,
    }, {
        'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video referenced by a bloomberg.com page.

        :param url: page URL matching ``_VALID_URL``.
        :return: info dict with the keys ``id``, ``title``, ``formats``,
            ``description`` and ``thumbnail``.
        """
        # The last path component of the URL is only a display name used
        # while downloading the page; the real video id is inside the page.
        name = self._match_id(url)
        webpage = self._download_webpage(url, name)

        # Try the known inline markers first: a quoted "bmmrId" key, a
        # bare videoId key, or a data-bmmrid attribute. default=None keeps
        # this lookup non-fatal so the BPlayer fallback below can run.
        video_id = self._search_regex(
            (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
             r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
             r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'),
            webpage, 'id', group='id', default=None)
        if not video_id:
            # Fallback: the id is a field of the JSON object passed as the
            # second argument of BPlayer(null, {...});
            bplayer_data = self._parse_json(self._search_regex(
                r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name)
            video_id = bplayer_data['id']

        # Drop the trailing ": Video" from the OpenGraph title, if present.
        title = re.sub(r': Video$', '', self._og_search_title(webpage))

        embed_info = self._download_json(
            'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)

        formats = []
        for stream in embed_info['streams']:
            stream_url = stream.get('url')
            if not stream_url:
                continue
            # Read muxing_format with .get(), like the url above, so that a
            # stream entry lacking the field does not abort the whole
            # extraction with a KeyError; such a stream takes the non-TS
            # (f4m) branch, which is itself non-fatal.
            if stream.get('muxing_format') == 'TS':
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
            else:
                formats.extend(self._extract_f4m_formats(
                    stream_url, video_id, f4m_id='hds', fatal=False))
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
        }
-												[bloomberg] Support BPlayer() players (closes #10187)

											
										
										
											2016-07-31 08:46:54 +02:00
+								# coding: utf-8
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											2014-03-29 11:55:12 +01:00
+								from __future__ import unicode_literals
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											2013-09-16 19:39:39 +02:00
+								import re
 								from .common import InfoExtractor
 								class BloombergIE(InfoExtractor):
-												[bloomberg] Relax _VALID_URL even more (Closes #7685)

											
										
										
											2015-11-28 17:39:36 +01:00
+								    _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											2013-09-16 19:39:39 +02:00
-												[bloomberg] Relax _VALID_URL (Closes #7546)

											
										
										
											2015-11-19 17:55:06 +01:00
+								    _TESTS = [{
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											2015-04-03 15:01:17 +02:00
+								        'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											2014-07-28 15:25:56 +02:00
+								        # The md5 checksum changes
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											2014-03-29 11:55:12 +01:00
+								        'info_dict': {
 								            'id': 'qurhIVlJSB6hzkVi229d8g',
 								            'ext': 'flv',
 								            'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											2015-04-03 15:01:17 +02:00
+								            'description': 'md5:a8ba0302912d03d246979735c17d2761',
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											2013-09-16 19:39:39 +02:00
+								        },
-												[bloomberg] Fix test_Bloomberg

In this test case, sometimes HLS is the best format while sometimes HDS
is. To prevent occasional test failures, force HDS to be the best
format. In the past, testing against HDS formats caused the same error
as #9214, which was fixed when #9377 landed.

											
										
										
											2016-05-12 14:05:43 +02:00
+								        'params': {
 								            'format': 'best[format_id^=hds]',
 								        },
-												[bloomberg] Support BPlayer() players (closes #10187)

											
										
										
											2016-07-31 08:46:54 +02:00
+								    }, {
 								        # video ID in BPlayer(...)
 								        'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/',
 								        'info_dict': {
 								            'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74',
 								            'ext': 'flv',
 								            'title': 'Meet the Real-Life Tech Wizards of Middle Earth',
 								            'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.',
 								        },
 								        'params': {
 								            'format': 'best[format_id^=hds]',
 								        },
-												[bloomberg] Add another video id regex (closes #12062)

											
										
										
											2017-02-10 16:16:20 +01:00
+								    }, {
 								        # data-bmmrid=
 								        'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money',
 								        'only_matching': True,
-												[bloomberg] Relax _VALID_URL (Closes #7546)

											
										
										
											2015-11-19 17:55:06 +01:00
+								    }, {
 								        'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
 								        'only_matching': True,
-												[bloomberg] Relax _VALID_URL even more (Closes #7685)

											
										
										
											2015-11-28 17:39:36 +01:00
+								    }, {
 								        'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
 								        'only_matching': True,
-												[bloomberg] Relax _VALID_URL (Closes #7546)

											
										
										
											2015-11-19 17:55:06 +01:00
+								    }]
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											2013-09-16 19:39:39 +02:00
 								    def _real_extract(self, url):
-												[bloomberg] Modernize

											
										
										
											2015-02-24 11:08:00 +01:00
+								        name = self._match_id(url)
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											2013-09-16 19:39:39 +02:00
+								        webpage = self._download_webpage(url, name)
-												[bloomberg] Improve video id regex

											
										
										
											2015-11-28 17:41:39 +01:00
+								        video_id = self._search_regex(
-												[bloomberg] Add another video id regex (closes #12062)

											
										
										
											2017-02-10 16:16:20 +01:00
+								            (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
 								             r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
 								             r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'),
 								            webpage, 'id', group='id', default=None)
-												[bloomberg] Support BPlayer() players (closes #10187)

											
										
										
											2016-07-31 08:46:54 +02:00
+								        if not video_id:
 								            bplayer_data = self._parse_json(self._search_regex(
 								                r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name)
 								            video_id = bplayer_data['id']
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											2014-03-29 11:55:12 +01:00
+								        title = re.sub(': Video$', '', self._og_search_title(webpage))
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											2015-04-03 15:01:17 +02:00
+								        embed_info = self._download_json(
 								            'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
 								        formats = []
 								        for stream in embed_info['streams']:
-												[bloomberg] Improve formats extraction

											
										
										
											2015-11-28 17:45:19 +01:00
+								            stream_url = stream.get('url')
 								            if not stream_url:
 								                continue
-												[bloomberg] Modernize

											
										
										
											2015-11-28 17:40:29 +01:00
+								            if stream['muxing_format'] == 'TS':
-												Simplify formats accumulation for f4m/m3u8/smil formats

Now all _extract_*_formats routines return a list

											
										
										
											2015-12-28 19:58:24 +01:00
+								                formats.extend(self._extract_m3u8_formats(
 								                    stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											2015-04-03 15:01:17 +02:00
+								            else:
-												Simplify formats accumulation for f4m/m3u8/smil formats

Now all _extract_*_formats routines return a list

											
										
										
											2015-12-28 19:58:24 +01:00
+								                formats.extend(self._extract_f4m_formats(
 								                    stream_url, video_id, f4m_id='hds', fatal=False))
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											2015-04-03 15:01:17 +02:00
+								        self._sort_formats(formats)
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											2014-03-29 11:55:12 +01:00
+								        return {
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											2015-04-03 15:01:17 +02:00
+								            'id': video_id,
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											2014-03-29 11:55:12 +01:00
+								            'title': title,
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											2015-04-03 15:01:17 +02:00
+								            'formats': formats,
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											2014-03-29 11:55:12 +01:00
+								            'description': self._og_search_description(webpage),
 								            'thumbnail': self._og_search_thumbnail(webpage),
 								        }