youtube-dl/youtube_dl/extractor/xhamster.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    unified_strdate,
    str_to_int,
    int_or_none,
    parse_duration,
)


class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster

    Handles movie pages of the form
    ``http(s)://[<subdomain>.]xhamster.com/movies/<id>/<seo>.html`` with an
    optional query string (e.g. ``?hd``).
    """
    # The scheme is captured so that the page is re-requested over the same
    # protocol the user supplied; both http and https links are accepted.
    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
    _TESTS = [
        {
            'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
            'info_dict': {
                'id': '1509445',
                'ext': 'mp4',
                'title': 'FemaleAgent Shy beauty takes the bait',
                'upload_date': '20121014',
                'uploader_id': 'Ruseful2011',
                'duration': 893,
                'age_limit': 18,
            }
        },
        {
            'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
            'info_dict': {
                'id': '2221348',
                'ext': 'mp4',
                'title': 'Britney Spears  Sexy Booty',
                'upload_date': '20130914',
                'uploader_id': 'jojo747400',
                'duration': 200,
                'age_limit': 18,
            }
        }
    ]

    def _real_extract(self, url):
        """Extract metadata and the available formats for one movie page.

        The page is always requested from the bare ``xhamster.com`` host
        without any query string. If that page is not flagged as HD, a second
        request with ``?hd`` appended is made and, when that page is flagged
        as HD, its media URL is added as a higher-preference format.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``upload_date``, ``uploader_id``, ``thumbnail``, ``duration``,
        ``view_count``, ``like_count``, ``dislike_count``, ``comment_count``,
        ``age_limit`` and ``formats``.

        Raises ExtractorError when no media URL can be found in a page from
        which one is required.
        """
        def extract_video_url(webpage):
            # The media URL is taken from the file="..." attribute of the
            # <video> tag.
            mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
            if mp4 is None:
                raise ExtractorError('Unable to extract media URL')
            return mp4.group(1)

        def is_hd(webpage):
            # Presence of the HD icon markup is used as the HD indicator.
            return '<div class=\'icon iconHD\'' in webpage

        mobj = re.match(self._VALID_URL, url)

        proto = mobj.group('proto')
        video_id = mobj.group('id')
        seo = mobj.group('seo')
        # Normalised page URL: subdomain and query string are dropped.
        page_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
        webpage = self._download_webpage(page_url, video_id)

        title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')

        # Only a few videos have a description
        mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
        description = mobj.group(1) if mobj else None

        upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',
                                              webpage, 'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
                                              webpage, 'uploader id', default='anonymous')

        thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)

        duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
                                                          webpage, 'duration', fatal=False))

        view_count = self._html_search_regex(r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)
        if view_count:
            view_count = str_to_int(view_count)

        # Like and dislike counts come from the same hint attribute; both
        # stay None when it is absent.
        mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)
        if mobj:
            like_count = mobj.group('likecount')
            dislike_count = mobj.group('dislikecount')
        else:
            like_count = dislike_count = None

        # A missing comment counter is reported as 0 (existing behaviour).
        mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
        comment_count = mobj.group('commentcount') if mobj else 0

        age_limit = self._rta_search(webpage)

        hd = is_hd(webpage)

        video_url = extract_video_url(webpage)
        formats = [{
            'url': video_url,
            'format_id': 'hd' if hd else 'sd',
            'preference': 1,
        }]

        if not hd:
            # Prefer the canonical URL advertised by the page, but fall back
            # to the URL just fetched so that a missing <link rel="canonical">
            # does not abort the extraction.
            hd_base_url = self._search_regex(
                r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url',
                default=page_url)
            webpage = self._download_webpage(hd_base_url + '?hd', video_id, note='Downloading HD webpage')
            if is_hd(webpage):
                video_url = extract_video_url(webpage)
                formats.append({
                    'url': video_url,
                    'format_id': 'hd',
                    'preference': 2,
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': int_or_none(like_count),
            'dislike_count': int_or_none(dislike_count),
            'comment_count': int_or_none(comment_count),
            'age_limit': age_limit,
            'formats': formats,
        }
[xhamster] Use unicode_literals 2014-01-23 03:52:59 +01:00			`from __future__ import unicode_literals`

[xhamster] Move into own file 2013-06-23 22:32:44 +02:00			`import re`

			`from .common import InfoExtractor`
			`from ..utils import (`
			`ExtractorError,`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`unified_strdate,`
			`str_to_int,`
			`int_or_none,`
			`parse_duration,`
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00			`)`


			`class XHamsterIE(InfoExtractor):`
			`"""Information Extractor for xHamster"""`
changed _VALID_URL to allow for country specific prefixes 2014-08-22 22:17:07 +02:00			`_VALID_URL = r'http://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`_TESTS = [`
			`{`
			`'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',`
			`'info_dict': {`
			`'id': '1509445',`
			`'ext': 'mp4',`
			`'title': 'FemaleAgent Shy beauty takes the bait',`
			`'upload_date': '20121014',`
			`'uploader_id': 'Ruseful2011',`
			`'duration': 893,`
			`'age_limit': 18,`
			`}`
			`},`
			`{`
			`'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',`
			`'info_dict': {`
			`'id': '2221348',`
			`'ext': 'mp4',`
			`'title': 'Britney Spears Sexy Booty',`
			`'upload_date': '20130914',`
			`'uploader_id': 'jojo747400',`
			`'duration': 200,`
			`'age_limit': 18,`
			`}`
XHamsterIE: Fix support for new HD video url format and add test (closes PR #1443) 2013-09-17 06:24:20 +02:00			`}`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`]`
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00
PEP8 applied 2014-11-23 20:41:03 +01:00			`def _real_extract(self, url):`
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`def extract_video_url(webpage):`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`mp4 = re.search(r'<video\s+.?file="([^"]+)".?>', webpage)`
[xhamster] Add support for hd video Signed-off-by: Philipp Hagemeister <phihag@phihag.de> 2014-01-23 03:51:09 +01:00			`if mp4 is None:`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`raise ExtractorError('Unable to extract media URL')`
[xhamster] Add support for hd video Signed-off-by: Philipp Hagemeister <phihag@phihag.de> 2014-01-23 03:51:09 +01:00			`else:`
			`return mp4.group(1)`

[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`def is_hd(webpage):`
[xhamster] Futher simplification 2014-01-23 04:04:35 +01:00			`return '<div class=\'icon iconHD\'' in webpage`
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00			`mobj = re.match(self._VALID_URL, url)`

			`video_id = mobj.group('id')`
XHamsterIE: Fix support for new HD video url format and add test (closes PR #1443) 2013-09-17 06:24:20 +02:00			`seo = mobj.group('seo')`
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)`
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00			`webpage = self._download_webpage(mrss_url, video_id)`

[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')`
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00
XHamsterIE: Add video description 2013-08-23 16:40:20 +02:00			`# Only a few videos have an description`
[xhamster] Futher simplification 2014-01-23 04:04:35 +01:00			`mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`description = mobj.group(1) if mobj else None`
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',`
PEP8: applied even more rules 2014-11-23 21:39:15 +01:00			`webpage, 'upload date', fatal=False)`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`if upload_date:`
			`upload_date = unified_strdate(upload_date)`
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',`
PEP8: applied even more rules 2014-11-23 21:39:15 +01:00			`webpage, 'uploader id', default='anonymous')`
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`thumbnail = self._html_search_regex(r'<video\s+.?poster="([^"]+)".?>', webpage, 'thumbnail', fatal=False)`

			`duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',`
PEP8: applied even more rules 2014-11-23 21:39:15 +01:00			`webpage, 'duration', fatal=False))`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00
			`view_count = self._html_search_regex(r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)`
			`if view_count:`
			`view_count = str_to_int(view_count)`

			`mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)`
			`(like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)`

			`mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)`
			`comment_count = mobj.group('commentcount') if mobj else 0`
[xhamster] Move into own file 2013-06-23 22:32:44 +02:00
[xhamster] Add support for age_limit (Instead of #1627) 2013-10-19 21:09:48 +02:00			`age_limit = self._rta_search(webpage)`

[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`hd = is_hd(webpage)`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00
[xhamster] Add support for hd video Signed-off-by: Philipp Hagemeister <phihag@phihag.de> 2014-01-23 03:51:09 +01:00			`video_url = extract_video_url(webpage)`
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`formats = [{`
			`'url': video_url,`
			`'format_id': 'hd' if hd else 'sd',`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`'preference': 1,`
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`}]`
[xhamster] Add support for hd video Signed-off-by: Philipp Hagemeister <phihag@phihag.de> 2014-01-23 03:51:09 +01:00
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`if not hd:`
[XHamsterIE] Make hd video search more robust 2014-02-26 10:01:44 +01:00			`mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')`
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`if is_hd(webpage):`
			`video_url = extract_video_url(webpage)`
			`formats.append({`
			`'url': video_url,`
			`'format_id': 'hd',`
[xhamster] Futher simplification 2014-01-23 04:04:35 +01:00			`'preference': 2,`
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`})`

[xhamster] Futher simplification 2014-01-23 04:04:35 +01:00			`self._sort_formats(formats)`

[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`return {`
			`'id': video_id,`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`'title': title,`
			`'description': description,`
			`'upload_date': upload_date,`
			`'uploader_id': uploader_id,`
			`'thumbnail': thumbnail,`
			`'duration': duration,`
			`'view_count': view_count,`
			`'like_count': int_or_none(like_count),`
			`'dislike_count': int_or_none(dislike_count),`
			`'comment_count': int_or_none(comment_count),`
[xhamster] Add support for age_limit (Instead of #1627) 2013-10-19 21:09:48 +02:00			`'age_limit': age_limit,`
[xhamster] Fix and improve 2014-02-19 19:42:15 +01:00			`'formats': formats,`
[XHamsterIE] Extract SD and HD video 2013-10-26 20:38:54 +02:00			`}`