youtube-dl/youtube_dl/extractor/youjizz.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    int_or_none,
    parse_duration,
    url_or_none,
)


class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com video pages and embed URLs."""

    # Two URL shapes are accepted: '<slug>-<id>.html' pages (named group
    # 'id') and 'embed/<id>' player URLs (named group 'embed_id').
    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'
    _TESTS = [{
        'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
        'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',
        'info_dict': {
            'id': '2189178',
            'ext': 'mp4',
            'title': 'Zeichentrick 1',
            'age_limit': 18,
            'duration': 2874,
        }
    }, {
        'url': 'http://www.youjizz.com/videos/-2189178.html',
        'only_matching': True,
    }, {
        'url': 'https://www.youjizz.com/videos/embed/31991001',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the metadata and media formats for a single video URL.

        The formats come from the ``encodings`` JavaScript array embedded in
        the page; when that yields nothing, the first HTML5 media entry found
        in the page is used instead.

        Returns a dict carrying 'id', 'title', 'age_limit', 'duration' and
        'uploader', plus either a 'formats' list or whatever the HTML5 media
        entry provides.
        """
        mobj = re.match(self._VALID_URL, url)
        # Only one of the two named groups is set, depending on the URL shape.
        video_id = mobj.group('id') or mobj.group('embed_id')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'title')

        formats = []

        # The JSON is parsed non-fatally, so a malformed array must not leave
        # us iterating over a non-list; fall back to "no encodings" instead.
        encodings = self._parse_json(
            self._search_regex(
                r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
                default='[]'),
            video_id, fatal=False) or []
        for encoding in encodings:
            if not isinstance(encoding, dict):
                continue
            format_url = url_or_none(encoding.get('filename'))
            if not format_url:
                continue
            if determine_ext(format_url) == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                continue

            format_id = encoding.get('name') or encoding.get('quality')
            height = None
            if format_id:
                # The label may be missing or non-textual (e.g. a numeric
                # quality); normalise it before running the height regex so
                # the regex search never receives a non-string.
                format_id = '%s' % format_id
                height = int_or_none(self._search_regex(
                    r'^(\d+)[pP]', format_id, 'height', default=None))
            formats.append({
                'url': format_url,
                'format_id': format_id,
                'height': height,
            })

        if formats:
            info_dict = {
                'formats': formats,
            }
        else:
            # YouJizz's HTML5 player has invalid HTML
            webpage = webpage.replace('"controls', '" controls')
            # NOTE(review): indexing assumes at least one media entry was
            # found in the page - confirm that an IndexError here is an
            # acceptable failure mode.
            info_dict = self._parse_html5_media_entries(
                url, webpage, video_id)[0]

        # Both fields are optional: a missing match yields None rather than
        # aborting the extraction.
        duration = parse_duration(self._search_regex(
            r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',
            default=None))
        uploader = self._search_regex(
            r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',
            default=None)

        info_dict.update({
            'id': video_id,
            'title': title,
            'age_limit': self._rta_search(webpage),
            'duration': duration,
            'uploader': uploader,
        })

        return info_dict
[youjizz] Simplify and use unicode_literals 2014-01-29 16:44:21 +01:00			`from __future__ import unicode_literals`

[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`import re`

[youjizz] move into own file 2013-06-23 22:14:22 +02:00			`from .common import InfoExtractor`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`from ..utils import (`
			`determine_ext,`
			`int_or_none,`
			`parse_duration,`
Improve URL extraction 2018-07-21 14:08:28 +02:00			`url_or_none,`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`)`
[youjizz] move into own file 2013-06-23 22:14:22 +02:00

			`class YouJizzIE(InfoExtractor):`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`_VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'`
[youjizz] Relax _VALID_URL (Closes #10131) 2016-07-20 17:41:13 +02:00			`_TESTS = [{`
[youjizz] Simplify and use unicode_literals 2014-01-29 16:44:21 +01:00			`'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',`
[youjizz] Simplify and use unicode_literals 2014-01-29 16:44:21 +01:00			`'info_dict': {`
[youjizz] Modernize (#4131) 2014-11-12 15:19:23 +01:00			`'id': '2189178',`
[youjizz] Fix extraction. The site has moved to HTML5 Closes #10437 2016-09-03 12:37:36 +02:00			`'ext': 'mp4',`
[refactor] Single quotes consistency 2016-02-14 10:37:17 +01:00			`'title': 'Zeichentrick 1',`
			`'age_limit': 18,`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`'duration': 2874,`
Move tests to the IE definitions 2013-06-27 20:46:46 +02:00			`}`
[youjizz] Relax _VALID_URL (Closes #10131) 2016-07-20 17:41:13 +02:00			`}, {`
			`'url': 'http://www.youjizz.com/videos/-2189178.html',`
			`'only_matching': True,`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`}, {`
			`'url': 'https://www.youjizz.com/videos/embed/31991001',`
			`'only_matching': True,`
[youjizz] Relax _VALID_URL (Closes #10131) 2016-07-20 17:41:13 +02:00			`}]`
[youjizz] move into own file 2013-06-23 22:14:22 +02:00
			`def _real_extract(self, url):`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`mobj = re.match(self._VALID_URL, url)`
			`video_id = mobj.group('id') or mobj.group('embed_id')`

[youjizz] move into own file 2013-06-23 22:14:22 +02:00			`webpage = self._download_webpage(url, video_id)`
[youjizz] Modernize (#4131) 2014-11-12 15:19:23 +01:00
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`title = self._html_search_regex(`
			`r'<title>(.+?)</title>', webpage, 'title')`

			`formats = []`

			`encodings = self._parse_json(`
			`self._search_regex(`
			`r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',`
			`default='[]'),`
			`video_id, fatal=False)`
			`for encoding in encodings:`
			`if not isinstance(encoding, dict):`
			`continue`
Improve URL extraction 2018-07-21 14:08:28 +02:00			`format_url = url_or_none(encoding.get('filename'))`
			`if not format_url:`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`continue`
			`if determine_ext(format_url) == 'm3u8':`
			`formats.extend(self._extract_m3u8_formats(`
			`format_url, video_id, 'mp4', entry_protocol='m3u8_native',`
			`m3u8_id='hls', fatal=False))`
			`else:`
			`format_id = encoding.get('name') or encoding.get('quality')`
			`height = int_or_none(self._search_regex(`
			`r'^(\d+)[pP]', format_id, 'height', default=None))`
			`formats.append({`
			`'url': format_url,`
			`'format_id': format_id,`
			`'height': height,`
			`})`

			`if formats:`
			`info_dict = {`
			`'formats': formats,`
			`}`
			`else:`
			`# YouJizz's HTML5 player has invalid HTML`
			`webpage = webpage.replace('"controls', '" controls')`
			`info_dict = self._parse_html5_media_entries(`
			`url, webpage, video_id)[0]`

			`duration = parse_duration(self._search_regex(`
			`r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',`
			`default=None))`
			`uploader = self._search_regex(`
			`r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',`
			`default=None)`
[youjizz] move into own file 2013-06-23 22:14:22 +02:00
[youjizz] Fix extraction. The site has moved to HTML5 Closes #10437 2016-09-03 12:37:36 +02:00			`info_dict.update({`
[youjizz] Simplify and use unicode_literals 2014-01-29 16:44:21 +01:00			`'id': video_id,`
[youjizz] Fix extraction (closes #13744) 2017-07-30 10:48:22 +02:00			`'title': title,`
			`'age_limit': self._rta_search(webpage),`
			`'duration': duration,`
			`'uploader': uploader,`
[youjizz] Fix extraction. The site has moved to HTML5 Closes #10437 2016-09-03 12:37:36 +02:00			`})`

			`return info_dict`