youtube-dl/youtube_dl/extractor/viewlift.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    determine_ext,
    int_or_none,
    js_to_json,
    parse_duration,
)


class ViewLiftBaseIE(InfoExtractor):
    """Common base for the ViewLift extractors; holds the shared host pattern."""

    # Regex alternation (non-capturing only) of the hostnames the ViewLift
    # extractors accept. It is meant to be interpolated into larger URL
    # patterns, wrapped in a group by the caller.
    _DOMAINS_REGEX = (
        r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview'
        r'|monumentalsportsnetwork|vayafilm)\.com'
        r'|kesari\.tv'
    )


class ViewLiftEmbedIE(ViewLiftBaseIE):
    """Extractor for ViewLift ``/embed/player?filmId=<uuid>`` player pages.

    The player page embeds a JavaScript ``sources: [...]`` array describing
    the available streams; this extractor turns that array into formats.
    """

    _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX
    _TESTS = [{
        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
        'md5': '2924e9215c6eff7a55ed35b72276bd93',
        'info_dict': {
            'id': '74849a00-85a9-11e1-9660-123139220831',
            'ext': 'mp4',
            'title': '#whilewewatch',
        }
    }, {
        # invalid labels, 360p is better than 480p
        'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',
        'md5': '882fca19b9eb27ef865efeeaed376a48',
        'info_dict': {
            'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
            'ext': 'mp4',
            'title': 'Life in Limbo',
        }
    }, {
        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_url(webpage):
        """Return the URL of the first ViewLift player iframe in ``webpage``.

        The accepted host prefixes (``www.`` / ``embed.`` / none) mirror
        ``_VALID_URL`` so that every iframe this extractor can handle is
        also detected. Returns ``None`` when no such iframe is present.
        """
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:(?:www|embed)\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX,
            webpage)
        return mobj.group('url') if mobj else None

    def _real_extract(self, url):
        """Build the info dict (id, title, formats) for an embed player URL.

        Raises ExtractorError (expected) when the page reports that the film
        is not playable in the viewer's area.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        if '>This film is not playable in your area.<' in webpage:
            raise ExtractorError(
                'Film %s is not playable in your area.' % video_id, expected=True)

        sources = self._parse_json(js_to_json(self._search_regex(
            r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id)

        formats = []
        # Becomes True as soon as a bitrate could be parsed from any
        # progressive source URL; decides the sort order below.
        has_bitrate = False
        for source in sources:
            file_ = source.get('file')
            if not file_:
                continue
            type_ = source.get('type')
            ext = determine_ext(file_)
            format_id = source.get('label') or ext
            if all(v in ('m3u8', 'hls') for v in (type_, ext)):
                # A broken manifest must not discard the progressive
                # sources, so the HLS download is best-effort.
                formats.extend(self._extract_m3u8_formats(
                    file_, video_id, 'mp4', m3u8_id='hls', fatal=False))
                continue
            # The bitrate is only available from the file name, either as
            # "<n>kbps" or as the trailing number of "_<w>x<h>_<n>.<ext>".
            bitrate = int_or_none(self._search_regex(
                [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % re.escape(ext)],
                file_, 'bitrate', default=None))
            if bitrate:
                has_bitrate = True
            # Labels such as "360p" double as the height hint.
            height = int_or_none(self._search_regex(
                r'^(\d+)[pP]$', format_id, 'height', default=None))
            formats.append({
                'url': file_,
                'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')),
                'tbr': bitrate,
                'height': height,
            })
        # Without any parsed bitrate fall back to ordering by the label-derived
        # height first; otherwise use the default sort order.
        field_preference = None if has_bitrate else ('height', 'tbr', 'format_id')
        self._sort_formats(formats, field_preference)

        title = self._search_regex(
            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
            webpage, 'title')

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
        }


class ViewLiftIE(ViewLiftBaseIE):
    """Extractor for ViewLift film, show and video pages.

    It collects the metadata from the page and delegates the actual stream
    extraction to the embed player extractor through a ``url_transparent``
    result.
    """

    _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)/(?:films/title|show|(?:news/)?videos?)/(?P<id>[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX
    _TESTS = [{
        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
        'md5': '19844f897b35af219773fd63bdec2942',
        'info_dict': {
            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
            'display_id': 'lost_for_life',
            'ext': 'mp4',
            'title': 'Lost for Life',
            'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 4489,
            'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
        }
    }, {
        'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
        'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
        'info_dict': {
            'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
            'display_id': 'the_world_cut_project/india',
            'ext': 'mp4',
            'title': 'India',
            'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 979,
            'categories': ['Documentary', 'Sports', 'Politics']
        }
    }, {
        # Film is not playable in your area.
        'url': 'http://www.snagfilms.com/films/title/inside_mecca',
        'only_matching': True,
    }, {
        # Film is not available.
        'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
        'only_matching': True,
    }, {
        'url': 'http://www.winnersview.com/videos/the-good-son',
        'only_matching': True,
    }, {
        'url': 'http://www.kesari.tv/news/video/1461919076414',
        'only_matching': True,
    }, {
        # Was once Kaltura embed
        'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a ``url_transparent`` result pointing at the embed player.

        Metadata is taken from the ``Snag.page.data`` JSON blob when it holds
        an entry for the film, and scraped from the HTML otherwise.

        Raises ExtractorError (expected) when the page reports that the film
        is not available.
        """
        domain, display_id = re.match(self._VALID_URL, url).groups()

        webpage = self._download_webpage(url, display_id)

        if ">Sorry, the Film you're looking for is not available.<" in webpage:
            raise ExtractorError(
                'Film %s is not available.' % display_id, expected=True)

        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')

        # The JSON blob is optional: defaulting to an empty list lets pages
        # without it reach the HTML fallback instead of failing outright.
        snag = self._parse_json(
            self._search_regex(
                r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag',
                default='[]'),
            display_id)

        for item in snag:
            # Entries may carry null or non-dict values; skip anything that
            # is not shaped like {'data': {'film': {...}}}.
            data = item.get('data') if isinstance(item, dict) else None
            film = data.get('film') if isinstance(data, dict) else None
            if not isinstance(film, dict) or film.get('id') != film_id:
                continue
            title = film['title']
            description = clean_html(film.get('synopsis'))
            thumbnail = film.get('image')
            duration = int_or_none(film.get('duration') or film.get('runtime'))
            categories = [
                category['title'] for category in film.get('categories') or []
                if isinstance(category, dict) and category.get('title')]
            break
        else:
            # No JSON entry matched the film id: scrape the page markup.
            title = self._search_regex(
                r'itemprop="title">([^<]+)<', webpage, 'title')
            description = self._html_search_regex(
                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
                webpage, 'description', default=None) or self._og_search_description(webpage)
            thumbnail = self._og_search_thumbnail(webpage)
            duration = parse_duration(self._search_regex(
                r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
                webpage, 'duration', fatal=False))
            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)

        return {
            '_type': 'url_transparent',
            'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id),
            'id': film_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'categories': categories,
            'ie_key': 'ViewLiftEmbed',
        }
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`from __future__ import unicode_literals`

			`import re`

[snagfilms] Add new extractor 2015-06-26 19:25:43 +02:00			`from .common import InfoExtractor`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`from ..utils import (`
[snagfilms:embed] Capture geolocation restriction error 2015-06-27 14:50:26 +02:00			`ExtractorError,`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`clean_html,`
			`determine_ext,`
			`int_or_none,`
			`js_to_json,`
			`parse_duration,`
			`)`
[snagfilms] Add new extractor 2015-06-26 19:25:43 +02:00
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`class ViewLiftBaseIE(InfoExtractor):`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 13:08:07 +01:00			`_DOMAINS_REGEX = r'(?:snagfilms\|snagxtreme\|funnyforfree\|kiddovid\|winnersview\|monumentalsportsnetwork\|vayafilm)\.com\|kesari\.tv'`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00

			`class ViewLiftEmbedIE(ViewLiftBaseIE):`
			`_VALID_URL = r'https?://(?:(?:www\|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX`
add support for embed links 2015-06-27 01:13:14 +02:00			`_TESTS = [{`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',`
			`'md5': '2924e9215c6eff7a55ed35b72276bd93',`
			`'info_dict': {`
add support for embed links 2015-06-27 01:13:14 +02:00			`'id': '74849a00-85a9-11e1-9660-123139220831',`
			`'ext': 'mp4',`
			`'title': '#whilewewatch',`
			`}`
[snagfilms] Improve m3u8 extraction (Closes #6309) 2015-07-21 19:54:31 +02:00			`}, {`
			`# invalid labels, 360p is better that 480p`
			`'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',`
			`'md5': '882fca19b9eb27ef865efeeaed376a48',`
			`'info_dict': {`
			`'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',`
			`'ext': 'mp4',`
			`'title': 'Life in Limbo',`
			`}`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`}, {`
			`'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',`
			`'only_matching': True,`
add support for embed links 2015-06-27 01:13:14 +02:00			`}]`
[snagfilms] Add new extractor 2015-06-26 19:25:43 +02:00
[snagfilms] Add routine for generic embeds extractions 2015-06-27 14:25:50 +02:00			`@staticmethod`
			`def _extract_url(webpage):`
			`mobj = re.search(`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX,`
[snagfilms:embed] Capture geolocation restriction error 2015-06-27 14:50:26 +02:00			`webpage)`
[snagfilms] Add routine for generic embeds extractions 2015-06-27 14:25:50 +02:00			`if mobj:`
			`return mobj.group('url')`

convert tabs to 4 spaces identation 2015-06-26 22:50:27 +02:00			`def _real_extract(self, url):`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`video_id = self._match_id(url)`

			`webpage = self._download_webpage(url, video_id)`
[snagfilms] Add new extractor 2015-06-26 19:25:43 +02:00
[snagfilms:embed] Capture geolocation restriction error 2015-06-27 14:50:26 +02:00			`if '>This film is not playable in your area.<' in webpage:`
			`raise ExtractorError(`
[snagfilms] Capture not available error 2015-06-27 14:54:08 +02:00			`'Film %s is not playable in your area.' % video_id, expected=True)`
[snagfilms:embed] Capture geolocation restriction error 2015-06-27 14:50:26 +02:00
convert tabs to 4 spaces identation 2015-06-26 22:50:27 +02:00			`formats = []`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`has_bitrate = False`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`for source in self._parse_json(js_to_json(self._search_regex(`
			`r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id):`
			`file_ = source.get('file')`
			`if not file_:`
			`continue`
			`type_ = source.get('type')`
			`ext = determine_ext(file_)`
[snagfilms] Improve m3u8 extraction (Closes #6309) 2015-07-21 19:54:31 +02:00			`format_id = source.get('label') or ext`
improve coding style 2017-04-12 21:38:43 +02:00			`if all(v in ('m3u8', 'hls') for v in (type_, ext)):`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`formats.extend(self._extract_m3u8_formats(`
			`file_, video_id, 'mp4', m3u8_id='hls'))`
convert tabs to 4 spaces identation 2015-06-26 22:50:27 +02:00			`else:`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`bitrate = int_or_none(self._search_regex(`
[snagfilms] Improve m3u8 extraction (Closes #6309) 2015-07-21 19:54:31 +02:00			`[r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],`
			`file_, 'bitrate', default=None))`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`if not has_bitrate and bitrate:`
			`has_bitrate = True`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`height = int_or_none(self._search_regex(`
			`r'^(\d+)[pP]$', format_id, 'height', default=None))`
			`formats.append({`
			`'url': file_,`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')),`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`'tbr': bitrate,`
			`'height': height,`
			`})`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`field_preference = None if has_bitrate else ('height', 'tbr', 'format_id')`
			`self._sort_formats(formats, field_preference)`
[snagfilms] Add new extractor 2015-06-26 19:25:43 +02:00
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`title = self._search_regex(`
			`[r"title\s:\s'([^']+)'", r'<title>([^<]+)</title>'],`
			`webpage, 'title')`

convert tabs to 4 spaces identation 2015-06-26 22:50:27 +02:00			`return {`
			`'id': video_id,`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`'title': title,`
			`'formats': formats,`
			`}`


[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`class ViewLiftIE(ViewLiftBaseIE):`
			`_VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)/(?:films/title\|show\|(?:news/)?videos?)/(?P<id>[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX`
[snagfilms] Add support for shows 2015-06-27 14:40:01 +02:00			`_TESTS = [{`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`'url': 'http://www.snagfilms.com/films/title/lost_for_life',`
			`'md5': '19844f897b35af219773fd63bdec2942',`
			`'info_dict': {`
			`'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',`
			`'display_id': 'lost_for_life',`
			`'ext': 'mp4',`
			`'title': 'Lost for Life',`
			`'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 13:08:07 +01:00			`'thumbnail': r're:^https?://.*\.jpg',`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`'duration': 4489,`
			`'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']`
			`}`
[snagfilms] Add support for shows 2015-06-27 14:40:01 +02:00			`}, {`
			`'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',`
			`'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',`
			`'info_dict': {`
			`'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',`
			`'display_id': 'the_world_cut_project/india',`
			`'ext': 'mp4',`
			`'title': 'India',`
			`'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 13:08:07 +01:00			`'thumbnail': r're:^https?://.*\.jpg',`
[snagfilms] Add support for shows 2015-06-27 14:40:01 +02:00			`'duration': 979,`
			`'categories': ['Documentary', 'Sports', 'Politics']`
			`}`
[snagfilms] More tests 2015-06-27 14:57:01 +02:00			`}, {`
			`# Film is not playable in your area.`
			`'url': 'http://www.snagfilms.com/films/title/inside_mecca',`
			`'only_matching': True,`
			`}, {`
			`# Film is not available.`
			`'url': 'http://www.snagfilms.com/show/augie_alone/flirting',`
			`'only_matching': True,`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`}, {`
			`'url': 'http://www.winnersview.com/videos/the-good-son',`
			`'only_matching': True,`
			`}, {`
			`'url': 'http://www.kesari.tv/news/video/1461919076414',`
			`'only_matching': True,`
[generic,viewlift] Move a test case to the specialized extractor 2016-06-01 13:18:01 +02:00			`}, {`
			`# Was once Kaltura embed`
			`'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15',`
			`'only_matching': True,`
[snagfilms] Add support for shows 2015-06-27 14:40:01 +02:00			`}]`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00
			`def _real_extract(self, url):`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`domain, display_id = re.match(self._VALID_URL, url).groups()`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00
			`webpage = self._download_webpage(url, display_id)`

[snagfilms] Capture not available error 2015-06-27 14:54:08 +02:00			`if ">Sorry, the Film you're looking for is not available.<" in webpage:`
			`raise ExtractorError(`
			`'Film %s is not available.' % display_id, expected=True)`

[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')`

			`snag = self._parse_json(`
			`self._search_regex(`
Fix "invalid escape sequences" error on Python 3.6 2017-01-02 13:08:07 +01:00			`r'Snag\.page\.data\s=\s(\[.+?\]);', webpage, 'snag'),`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`display_id)`

			`for item in snag:`
			`if item.get('data', {}).get('film', {}).get('id') == film_id:`
			`data = item['data']['film']`
			`title = data['title']`
			`description = clean_html(data.get('synopsis'))`
			`thumbnail = data.get('image')`
			`duration = int_or_none(data.get('duration') or data.get('runtime'))`
			`categories = [`
			`category['title'] for category in data.get('categories', [])`
			`if category.get('title')]`
			`break`
			`else:`
			`title = self._search_regex(`
			`r'itemprop="title">([^<]+)<', webpage, 'title')`
			`description = self._html_search_regex(`
			`r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',`
			`webpage, 'description', default=None) or self._og_search_description(webpage)`
			`thumbnail = self._og_search_thumbnail(webpage)`
			`duration = parse_duration(self._search_regex(`
			`r'<span itemprop="duration" class="film-duration strong">([^<]+)<',`
			`webpage, 'duration', fatal=False))`
			`categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)`

			`return {`
			`'_type': 'url_transparent',`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id),`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`'id': film_id,`
convert tabs to 4 spaces identation 2015-06-26 22:50:27 +02:00			`'display_id': display_id,`
			`'title': title,`
			`'description': description,`
add support for embed links 2015-06-27 01:13:14 +02:00			`'thumbnail': thumbnail,`
[snagfilms] Improve and simplify 2015-06-27 14:20:42 +02:00			`'duration': duration,`
			`'categories': categories,`
[viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting 2016-04-29 12:14:42 +02:00			`'ie_key': 'ViewLiftEmbed',`
convert tabs to 4 spaces identation 2015-06-26 22:50:27 +02:00			`}`