# youtube-dl/youtube_dl/extractor/ted.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor

from ..compat import compat_str
from ..utils import int_or_none


class TEDIE(InfoExtractor):
    '''Extractor for ted.com talk, playlist and watch pages.

    URLs on the embed / embed-ssl hosts are matched as well; _real_extract
    turns them into the equivalent www.ted.com URL and hands that back for
    extraction.
    '''
    IE_NAME = 'ted'
    # Verbose regex. The named groups are read back by _real_extract:
    #   proto, type, urlmain  - scheme, host prefix (www or embed[-ssl]) and
    #                           everything from ".ted.com/" onwards
    #   type_playlist, type_talk, type_watch
    #                         - which kind of page the URL points to
    #   name                  - the slug following the page kind (and the
    #                           optional /lang/ part)
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    # Sample URLs paired with the metadata expected from extracting them
    # (presumably consumed by the project's download test suite).
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            'thumbnail': 're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # YouTube video
        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': 'aFBIPO-P7LM',
            'ext': 'mp4',
            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
            'uploader': 'TEDx Talks',
            'uploader_id': 'TEDxTalks',
            'upload_date': '20111216',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra format fields keyed by the format id found in a talk's
    # 'nativeDownloads' mapping; _talk_info merges the matching entry into
    # each format it builds from that mapping.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
                                       webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        '''Return a playlist result with one url entry per talk of the playlist page'''

        page = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        page_info = self._extract_info(page)
        playlist_meta = page_info['playlist']

        # Every talk is delegated back to this extractor through its talk URL
        entries = []
        for talk in page_info['talks']:
            talk_url = 'http://www.ted.com/talks/' + talk['slug']
            entries.append(self.url_result(talk_url, self.ie_key()))

        return self.playlist_result(
            entries,
            playlist_id=compat_str(playlist_meta['id']),
            playlist_title=playlist_meta['title'])

    def _talk_info(self, url, video_name):
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external:
            service = external['service']
            self.to_screen('Found video from %s' % service)
            ext_url = None
            if service.lower() == 'youtube':
                ext_url = external.get('code')
            return {
                '_type': 'url',
                'url': ext_url or external['uri'],
            }

        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
        if formats:
            for f in formats:
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)

        for format_id, resources in talk_info['resources'].items():
            if format_id == 'h264':
                for resource in resources:
                    bitrate = int_or_none(resource.get('bitrate'))
                    formats.append({
                        'url': resource['file'],
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                    })
            elif format_id == 'rtmp':
                streamer = talk_info.get('streamer')
                if not streamer:
                    continue
                for resource in resources:
                    formats.append({
                        'format_id': '%s-%s' % (format_id, resource.get('name')),
                        'url': streamer,
                        'play_path': resource['file'],
                        'ext': 'flv',
                        'width': int_or_none(resource.get('width')),
                        'height': int_or_none(resource.get('height')),
                        'tbr': int_or_none(resource.get('bitrate')),
                    })
            elif format_id == 'hls':
                hls_formats = self._extract_m3u8_formats(
                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
                for f in hls_formats:
                    if f.get('format_id') == 'hls-meta':
                        continue
                    if not f.get('height'):
                        f['vcodec'] = 'none'
                    else:
                        f['acodec'] = 'none'
                formats.extend(hls_formats)

        audio_download = talk_info.get('audioDownload')
        if audio_download:
            formats.append({
                'url': audio_download,
                'format_id': 'audio',
                'vcodec': 'none',
                'preference': -0.5,
            })

        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])

        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
            thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': self._get_subtitles(video_id, talk_info),
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for l in languages:
                sub_lang_list[l] = [
                    {
                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
                        'ext': ext,
                    }
                    for ext in ['ted', 'srt']
                ]
            return sub_lang_list
        else:
            return {}

    def _watch_info(self, url, name):
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
[ted] Use unicode_literals 2014-01-17 03:52:17 +01:00			`from __future__ import unicode_literals`

Move TED IE into its own file 2013-06-23 21:55:53 +02:00			`import json`
			`import re`

Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 2015-02-15 18:03:41 +01:00			`from .common import InfoExtractor`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00
[ted] Extract all formats (Closes #5397) 2015-04-10 19:36:28 +02:00			`from ..compat import compat_str`
			`from ..utils import int_or_none`
[ted] fixed error in case of no subtitles present I created a test, but I leave it commented since TED videos get new subtitles frequently. 2013-11-05 12:00:13 +01:00
[ted] Use unicode_literals 2014-01-17 03:52:17 +01:00
Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 2015-02-15 18:03:41 +01:00			`class TEDIE(InfoExtractor):`
[ted] Clarify IE_NAME 2015-04-20 17:42:42 +02:00			`IE_NAME = 'ted'`
[ted] Simplify embed code (#2587) 2014-03-20 16:33:23 +01:00			`_VALID_URL = r'''(?x)`
			`(?P<proto>https?://)`
[ted] Add support for embed-ssl.ted.com embedded videos 2015-01-05 13:11:13 +01:00			`(?P<type>www\|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`(`
			`(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist`
			`\|`
			`((?P<type_talk>talks)) # We have a simple talk`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00			`\|`
			`(?P<type_watch>watch)/[^/]+/[^/]+`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`)`
			`(/lang/(.*?))? # The url may contain the language`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00			`/(?P<name>[\w-]+) # Here goes the name and then ".html"`
[ted] Simplify embed code (#2587) 2014-03-20 16:33:23 +01:00			`.*)$`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`'''`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00			`_TESTS = [{`
[ted] Use unicode_literals 2014-01-17 03:52:17 +01:00			`'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',`
[ted] Update test md5 2014-06-12 15:33:53 +02:00			`'md5': 'fc94ac279feebbce69f21c0c6ee82810',`
[ted] Use unicode_literals 2014-01-17 03:52:17 +01:00			`'info_dict': {`
[ted] Remove unused import and modernize test 2014-03-05 14:27:45 +01:00			`'id': '102',`
			`'ext': 'mp4',`
[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`'title': 'The illusion of consciousness',`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`'description': ('Philosopher Dan Dennett makes a compelling '`
PEP8: applied even more rules 2014-11-23 21:39:15 +01:00			`'argument that not only don\'t we understand our own '`
			`'consciousness, but that half the time our brains are '`
			`'actively fooling us.'),`
[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`'uploader': 'Dan Dennett',`
[ted] Add width and height (Fixes #2716) 2014-04-07 13:07:07 +02:00			`'width': 854,`
[ted] Extract duration (closes #4155) 2014-11-12 09:30:57 +01:00			`'duration': 1308,`
Move tests to the IE definitions 2013-06-27 20:46:46 +02:00			`}`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00			`}, {`
			`'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',`
			`'md5': '226f4fb9c62380d11b7995efa4c87994',`
			`'info_dict': {`
			`'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',`
			`'ext': 'mp4',`
			`'title': 'Vishal Sikka: The beauty and power of algorithms',`
			`'thumbnail': 're:^https?://.+\.jpg',`
			`'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',`
			`}`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 15:23:12 +02:00			`}, {`
			`'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',`
			`'info_dict': {`
			`'id': '1972',`
[ted] Update test 2014-04-22 14:49:41 +02:00			`'ext': 'mp4',`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 15:23:12 +02:00			`'title': 'Be passionate. Be courageous. Be your best.',`
			`'uploader': 'Gabby Giffords and Mark Kelly',`
[ted] Update test 2014-04-22 14:49:41 +02:00			`'description': 'md5:5174aed4d0f16021b704120360f72b92',`
[ted] Extract duration (closes #4155) 2014-11-12 09:30:57 +01:00			`'duration': 1128,`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 15:23:12 +02:00			`},`
Move playlist tests to extractors. From now on, test_download will run these tests. That means we benefit not only from the networking setup in there, but also from the other tests (for example test_all_urls to find problems with _VALID_URLs). 2014-08-28 00:58:24 +02:00			`}, {`
			`'url': 'http://www.ted.com/playlists/who_are_the_hackers',`
			`'info_dict': {`
			`'id': '10',`
			`'title': 'Who are the hackers?',`
			`},`
			`'playlist_mincount': 6,`
[ted] Add support for external videos (fixes #3948) 2014-10-15 12:24:11 +02:00			`}, {`
			`# contains a youtube video`
			`'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',`
			`'add_ie': ['Youtube'],`
			`'info_dict': {`
			`'id': '_ZG8HBuDjgc',`
			`'ext': 'mp4',`
			`'title': 'Douglas Adams: Parrots the Universe and Everything',`
			`'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',`
			`'uploader': 'University of California Television (UCTV)',`
			`'uploader_id': 'UCtelevision',`
			`'upload_date': '20080522',`
			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
[ted] Improve external video handling and add test 2015-02-20 19:14:38 +01:00			`}, {`
			`# YouTube video`
			`'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',`
			`'add_ie': ['Youtube'],`
			`'info_dict': {`
			`'id': 'aFBIPO-P7LM',`
			`'ext': 'mp4',`
			`'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',`
			`'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',`
			`'uploader': 'TEDx Talks',`
			`'uploader_id': 'TEDxTalks',`
			`'upload_date': '20111216',`
			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00			`}]`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00
[ted] Add width and height (Fixes #2716) 2014-04-07 13:07:07 +02:00			`_NATIVE_FORMATS = {`
			`'low': {'preference': 1, 'width': 320, 'height': 180},`
			`'medium': {'preference': 2, 'width': 512, 'height': 288},`
			`'high': {'preference': 3, 'width': 854, 'height': 480},`
[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`}`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`def _extract_info(self, webpage):`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',`
PEP8: applied even more rules 2014-11-23 21:39:15 +01:00			`webpage, 'info json')`
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`return json.loads(info_json)`

Move TED IE into its own file 2013-06-23 21:55:53 +02:00			`def _real_extract(self, url):`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`m = re.match(self._VALID_URL, url, re.VERBOSE)`
[ted] Add support for embed-ssl.ted.com embedded videos 2015-01-05 13:11:13 +01:00			`if m.group('type').startswith('embed'):`
[ted] Simplify embed code (#2587) 2014-03-20 16:33:23 +01:00			`desktop_url = m.group('proto') + 'www' + m.group('urlmain')`
			`return self.url_result(desktop_url, 'TED')`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`name = m.group('name')`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00			`if m.group('type_talk'):`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`return self._talk_info(url, name)`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00			`elif m.group('type_watch'):`
			`return self._watch_info(url, name)`
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`else:`
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`return self._playlist_videos_info(url, name)`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`def _playlist_videos_info(self, url, name):`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00			`'''Returns the videos of the playlist'''`
[ted] Fix playlists (Fixes #1770) 2013-11-15 14:33:51 +01:00
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`webpage = self._download_webpage(url, name,`
PEP8: applied even more rules 2014-11-23 21:39:15 +01:00			`'Downloading playlist webpage')`
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`info = self._extract_info(webpage)`
			`playlist_info = info['playlist']`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00
[ted] Fix playlists (Fixes #1770) 2013-11-15 14:33:51 +01:00			`playlist_entries = [`
[ted] Remove superfluous u prefixes 2014-04-21 12:34:32 +02:00			`self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())`
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`for talk in info['talks']`
[ted] Fix playlists (Fixes #1770) 2013-11-15 14:33:51 +01:00			`]`
			`return self.playlist_result(`
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`playlist_entries,`
			`playlist_id=compat_str(playlist_info['id']),`
			`playlist_title=playlist_info['title'])`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00
[ted] Style fixes 2014-03-05 13:27:26 +01:00			`def _talk_info(self, url, video_name):`
			`webpage = self._download_webpage(url, video_name)`
Move TED IE into its own file 2013-06-23 21:55:53 +02:00			`self.report_extraction(video_name)`
[ted] Added support for subtitle download 2013-11-02 19:48:39 +01:00
[ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00			`talk_info = self._extract_info(webpage)['talks'][0]`
[ted] Added support for subtitle download 2013-11-02 19:48:39 +01:00
[ted] Improve external video handling and add test 2015-02-20 19:14:38 +01:00			`external = talk_info.get('external')`
			`if external:`
			`service = external['service']`
			`self.to_screen('Found video from %s' % service)`
			`ext_url = None`
			`if service.lower() == 'youtube':`
			`ext_url = external.get('code')`
[ted] Add support for external videos (fixes #3948) 2014-10-15 12:24:11 +02:00			`return {`
			`'_type': 'url',`
[ted] Improve external video handling and add test 2015-02-20 19:14:38 +01:00			`'url': ext_url or external['uri'],`
[ted] Add support for external videos (fixes #3948) 2014-10-15 12:24:11 +02:00			`}`

[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`formats = [{`
			`'url': format_url,`
			`'format_id': format_id,`
			`'format': format_id,`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 15:23:12 +02:00			`} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]`
			`if formats:`
			`for f in formats:`
			`finfo = self._NATIVE_FORMATS.get(f['format_id'])`
			`if finfo:`
			`f.update(finfo)`
[ted] Extract all formats (Closes #5397) 2015-04-10 19:36:28 +02:00
			`for format_id, resources in talk_info['resources'].items():`
			`if format_id == 'h264':`
			`for resource in resources:`
			`bitrate = int_or_none(resource.get('bitrate'))`
			`formats.append({`
			`'url': resource['file'],`
			`'format_id': '%s-%sk' % (format_id, bitrate),`
			`'tbr': bitrate,`
			`})`
			`elif format_id == 'rtmp':`
			`streamer = talk_info.get('streamer')`
			`if not streamer:`
			`continue`
			`for resource in resources:`
			`formats.append({`
			`'format_id': '%s-%s' % (format_id, resource.get('name')),`
			`'url': streamer,`
			`'play_path': resource['file'],`
			`'ext': 'flv',`
			`'width': int_or_none(resource.get('width')),`
			`'height': int_or_none(resource.get('height')),`
			`'tbr': int_or_none(resource.get('bitrate')),`
			`})`
			`elif format_id == 'hls':`
[ted] Clarify audio/video-only formats 2015-04-20 17:42:20 +02:00			`hls_formats = self._extract_m3u8_formats(`
			`resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)`
			`for f in hls_formats:`
[ted] Skip hls quality selection format 2015-04-20 18:04:42 +02:00			`if f.get('format_id') == 'hls-meta':`
			`continue`
[ted] Fix hls audio/video-only formats 2015-04-20 18:01:02 +02:00			`if not f.get('height'):`
			`f['vcodec'] = 'none'`
			`else:`
			`f['acodec'] = 'none'`
[ted] Clarify audio/video-only formats 2015-04-20 17:42:20 +02:00			`formats.extend(hls_formats)`
[ted] Extract all formats (Closes #5397) 2015-04-10 19:36:28 +02:00
			`audio_download = talk_info.get('audioDownload')`
			`if audio_download:`
			`formats.append({`
			`'url': audio_download,`
			`'format_id': 'audio',`
[ted] Clarify audio/video-only formats 2015-04-20 17:42:20 +02:00			`'vcodec': 'none',`
[ted] Lower preference for direct audio since it's mono 2015-04-20 18:04:17 +02:00			`'preference': -0.5,`
[ted] Extract all formats (Closes #5397) 2015-04-10 19:36:28 +02:00			`})`

[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`self._sort_formats(formats)`

[ted] Remove unused import and modernize test 2014-03-05 14:27:45 +01:00			`video_id = compat_str(talk_info['id'])`
[ted] Added support for subtitle download 2013-11-02 19:48:39 +01:00
[ted] Add 'http://' to the thumbnail url if it's missing 2014-03-16 11:24:11 +01:00			`thumbnail = talk_info['thumb']`
			`if not thumbnail.startswith('http'):`
			`thumbnail = 'http://' + thumbnail`
[ted] simplify 2013-11-15 14:06:38 +01:00			`return {`
[ted] Added support for subtitle download 2013-11-02 19:48:39 +01:00			`'id': video_id,`
[generic] Fix testcases 2014-09-29 05:12:57 +02:00			`'title': talk_info['title'].strip(),`
[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`'uploader': talk_info['speaker'],`
[ted] Add 'http://' to the thumbnail url if it's missing 2014-03-16 11:24:11 +01:00			`'thumbnail': thumbnail,`
[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`'description': self._og_search_description(webpage),`
[ted] Always extract the subtitles The required info is already in the webpage 2015-02-21 22:33:11 +01:00			`'subtitles': self._get_subtitles(video_id, talk_info),`
[ted] Prepare #980 merge 2013-10-04 10:32:34 +02:00			`'formats': formats,`
[ted] Extract duration (closes #4155) 2014-11-12 09:30:57 +01:00			`'duration': talk_info.get('duration'),`
[ted] Prepare #980 merge 2013-10-04 10:32:34 +02:00			`}`

Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 2015-02-15 18:03:41 +01:00			`def _get_subtitles(self, video_id, talk_info):`
[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]`
			`if languages:`
			`sub_lang_list = {}`
			`for l in languages:`
Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 2015-02-15 18:03:41 +01:00			`sub_lang_list[l] = [`
			`{`
			`'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),`
			`'ext': ext,`
			`}`
			`for ext in ['ted', 'srt']`
			`]`
[ted] Fix video extraction The site has been redesigned 2014-03-04 21:47:01 +01:00			`return sub_lang_list`
			`else:`
			`return {}`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00
			`def _watch_info(self, url, name):`
			`webpage = self._download_webpage(url, name)`

			`config_json = self._html_search_regex(`
[ted] Fix type_watch links extraction 2014-12-03 16:17:11 +01:00			`r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',`
			`webpage, 'config')`
			`config = json.loads(config_json)['config']`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00			`video_url = config['video']['url']`
			`thumbnail = config.get('image', {}).get('url')`

			`title = self._html_search_regex(`
			`r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')`
			`description = self._html_search_regex(`
[ted] Extend search for description 2014-04-21 12:37:16 +02:00			`[`
			`r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',`
			`r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',`
			`],`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00			`webpage, 'description', fatal=False)`

			`return {`
			`'id': name,`
			`'url': video_url,`
			`'title': title,`
			`'thumbnail': thumbnail,`
			`'description': description,`
			`}`