From 6a1c4fbfcb57c138b8ace305127c2654844e8099 Mon Sep 17 00:00:00 2001 From: peugeot Date: Fri, 26 Dec 2014 15:49:12 +0100 Subject: [PATCH 01/16] [hellporno] new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/hellporno.py | 56 +++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/hellporno.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fd0ebffe3..de5206729 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -166,6 +166,7 @@ from .groupon import GrouponIE from .hark import HarkIE from .heise import HeiseIE +from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hornbunny import HornBunnyIE diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py new file mode 100644 index 000000000..06f85127f --- /dev/null +++ b/youtube_dl/extractor/hellporno.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +class HellPornoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hellporno\.com/videos/(?P[^/]+)' + _TEST = { + 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/', + 'md5': '1fee339c610d2049699ef2aa699439f1', + 'info_dict': { + 'id': '149116', + 'ext': 'mp4', + 'title': 'Dixie is posing with naked ass very erotic', + 'description': 'md5:5ba02cbf31eff820147b3cc25306d89a', + 'categories': list, # NSFW + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, 'main') + + video_id = self._html_search_regex(r'video_id:\s*\'([^\']+)\'', webpage, 'id') + + video_url = self._html_search_regex(r'video_url:\s*\'([^\']+)\'', webpage, 'video_url') + + ext = self._html_search_regex(r'postfix:\s*\'([^\']+)\'', webpage, 'ext')[1:] + + title = self._html_search_regex( + r'([^<]+)\s*-\s*Hell Porno', webpage, 'title') + + description = self._html_search_meta('description', webpage, 'description', fatal=False) + + thumbnail = self._html_search_regex( + r'preview_url:\s*\'([^\']+)\'', + webpage, 'thumbnail', fatal=False) + + categories_str = self._html_search_regex( + r' Date: Mon, 29 Dec 2014 10:38:07 +0100 Subject: [PATCH 02/16] [hellporno] simplify --- youtube_dl/extractor/hellporno.py | 32 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py index 06f85127f..754322cdf 100644 --- a/youtube_dl/extractor/hellporno.py +++ b/youtube_dl/extractor/hellporno.py @@ -1,55 +1,51 @@ from __future__ import unicode_literals - -import re - from .common import InfoExtractor + class HellPornoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hellporno\.com/videos/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?hellporno\.com/videos/(?P[^/]+)' _TEST = { 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/', 'md5': '1fee339c610d2049699ef2aa699439f1', 'info_dict': { 'id': '149116', + 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic', 'ext': 'mp4', 'title': 'Dixie is posing with naked ass very erotic', - 'description': 'md5:5ba02cbf31eff820147b3cc25306d89a', - 'categories': list, # NSFW 'thumbnail': 're:https?://.*\.jpg$', 'age_limit': 18, } } def _real_extract(self, url): - webpage = self._download_webpage(url, 'main') + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex(r'video_id:\s*\'([^\']+)\'', webpage, 'id') + video_id = self._html_search_regex( + r'video_id:\s*\'([^\']+)\'', webpage, 'id') - video_url = self._html_search_regex(r'video_url:\s*\'([^\']+)\'', webpage, 'video_url') + ext = self._html_search_regex( + r'postfix:\s*\'([^\']+)\'', webpage, 'ext')[1:] - ext = self._html_search_regex(r'postfix:\s*\'([^\']+)\'', webpage, 'ext')[1:] + video_url = self._html_search_regex( + r'video_url:\s*\'([^\']+)\'', webpage, 'video_url') title = self._html_search_regex( r'([^<]+)\s*-\s*Hell Porno', webpage, 'title') - description = self._html_search_meta('description', webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( r'preview_url:\s*\'([^\']+)\'', webpage, 'thumbnail', fatal=False) - categories_str = self._html_search_regex( - r' Date: Mon, 29 Dec 2014 11:31:22 +0100 Subject: [PATCH 03/16] [xxxymovies] new ectractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/xxxymovies.py | 63 ++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 youtube_dl/extractor/xxxymovies.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fd0ebffe3..c8a77616e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -512,6 +512,7 @@ from .xnxx import XNXXIE from .xvideos import XVideosIE from .xtube import XTubeUserIE, XTubeIE +from .xxxymovies import XXXYMoviesIE from .yahoo import ( YahooIE, YahooSearchIE, diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py new file mode 100644 index 000000000..4a9144661 --- /dev/null +++ b/youtube_dl/extractor/xxxymovies.py @@ -0,0 +1,63 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, +) + + +class XXXYMoviesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?xxxymovies\.com/videos/(?P\d+)/(?P[^/]+)' + _TEST = { + 'url': 'http://xxxymovies.com/videos/138669/ecstatic-orgasm-sofcore/', + 'md5': '810b1bdbbffff89dd13bdb369fe7be4b', + 'info_dict': { + 'id': '138669', + 'display_id': 'ecstatic-orgasm-sofcore', + 'ext': 'mp4', + 'title': 'Ecstatic Orgasm Sofcore', + 'duration': 931, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') + + title = self._html_search_regex( + r'(.*?)\s*-\s*XXXYMovies.com', webpage, 'title') + + thumbnail = self._html_search_regex( + r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False) + + categories = self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + + duration = parse_duration(self._search_regex( + r'Duration:\s*(\d+:\d+)', webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._html_search_regex( + r'
\s*(\d+)', webpage, 'view count', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'categories': categories, + 'duration': duration, + 'view_count': view_count, + 'age_limit': 18, + } From 6343a5f68e6dbf5a4fc1b0c7d0bec9e1ddc57143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Dec 2014 21:05:21 +0600 Subject: [PATCH 04/16] [xxxymovies] Improve --- youtube_dl/extractor/xxxymovies.py | 36 ++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py index 4a9144661..5c8f17eb2 100644 --- a/youtube_dl/extractor/xxxymovies.py +++ b/youtube_dl/extractor/xxxymovies.py @@ -20,35 +20,51 @@ class XXXYMoviesIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ecstatic Orgasm Sofcore', 'duration': 931, + 'categories': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') display_id = mobj.group('display_id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - video_url = self._html_search_regex( + video_url = self._search_regex( r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') title = self._html_search_regex( - r'(.*?)\s*-\s*XXXYMovies.com', webpage, 'title') + [r'
\s*

([^<]+)

', + r'(.*?)\s*-\s*XXXYMovies\.com'], + webpage, 'title') - thumbnail = self._html_search_regex( - r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False) + thumbnail = self._search_regex( + r"preview_url\s*:\s*'([^']+)'", + webpage, 'thumbnail', fatal=False) categories = self._html_search_meta( 'keywords', webpage, 'categories', default='').split(',') duration = parse_duration(self._search_regex( - r'Duration:\s*(\d+:\d+)', webpage, 'duration', fatal=False)) + r'Duration:\s*(\d+:\d+)', + webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( - r'
\s*(\d+)', webpage, 'view count', fatal=False)) + r'
\s*(\d+)', + webpage, 'view count', fatal=False)) + like_count = int_or_none(self._search_regex( + r'>\s*Likes? \((\d+)\)', + webpage, 'like count', fatal=False)) + dislike_count = int_or_none(self._search_regex( + r'>\s*Dislike \((\d+)\)', + webpage, 'dislike count', fatal=False)) + + age_limit = self._rta_search(webpage) return { 'id': video_id, @@ -59,5 +75,7 @@ def _real_extract(self, url): 'categories': categories, 'duration': duration, 'view_count': view_count, - 'age_limit': 18, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'age_limit': age_limit, } From 355e41466d8edcb5b2457dfa0c5715ef87ff5832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Dec 2014 21:33:41 +0600 Subject: [PATCH 05/16] [hellporno] Extract all formats and improve --- youtube_dl/extractor/hellporno.py | 47 ++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py index 754322cdf..7a1c75b65 100644 --- a/youtube_dl/extractor/hellporno.py +++ b/youtube_dl/extractor/hellporno.py @@ -1,5 +1,12 @@ from __future__ import unicode_literals + +import re + from .common import InfoExtractor +from ..utils import ( + js_to_json, + remove_end, +) class HellPornoIE(InfoExtractor): @@ -19,23 +26,36 @@ class HellPornoIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - r'video_id:\s*\'([^\']+)\'', webpage, 'id') + title = remove_end(self._html_search_regex( + r'([^<]+)', webpage, 'title'), ' - Hell Porno') - ext = self._html_search_regex( - r'postfix:\s*\'([^\']+)\'', webpage, 'ext')[1:] + flashvars = self._parse_json(self._search_regex( + r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), + display_id, transform_source=js_to_json) - video_url = self._html_search_regex( - r'video_url:\s*\'([^\']+)\'', webpage, 'video_url') + video_id = flashvars.get('video_id') + thumbnail = flashvars.get('preview_url') + ext = flashvars.get('postfix', '.mp4')[1:] - title = self._html_search_regex( - r'([^<]+)\s*-\s*Hell Porno', webpage, 'title') - - thumbnail = self._html_search_regex( - r'preview_url:\s*\'([^\']+)\'', - webpage, 'thumbnail', fatal=False) + formats = [] + for video_url_key in ['video_url', 'video_alt_url']: + video_url = flashvars.get(video_url_key) + if not video_url: + continue + video_text = flashvars.get('%s_text' % video_url_key) + fmt = { + 'url': video_url, + 'ext': ext, + 'format_id': video_text, + } + m = re.search(r'^(?P\d+)[pP]', video_text) + if m: + fmt['height'] = int(m.group('height')) + formats.append(fmt) + self._sort_formats(formats) categories = self._html_search_meta( 'keywords', webpage, 'categories', default='').split(',') @@ -43,10 +63,9 @@ def _real_extract(self, url): return { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'title': title, - 'ext': ext, 'thumbnail': thumbnail, 'categories': categories, 'age_limit': 18, + 'formats': formats, } From 429ddfd38dfb53aa05f0e2a77aeeb4ac5d20a17d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 30 Dec 2014 01:50:28 +0600 Subject: [PATCH 06/16] [cnn] Add support for hln URL format (Closes #4595) --- youtube_dl/extractor/cnn.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 1bff005d6..93e8d0de3 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P.+?/(?P[^/]+?)(?:\.cnn(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', @@ -35,6 +35,16 @@ class CNNIE(InfoExtractor): "description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", "upload_date": "20130821", } + }, { + 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', + 'md5': 'f14d02ebd264df951feb2400e2c25a1b', + 'info_dict': { + 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', + 'ext': 'mp4', + 'title': 'Nashville Ep. 1: Hand crafted skateboards', + 'description': 'md5:e7223a503315c9f150acac52e76de086', + 'upload_date': '20141222', + } }] def _real_extract(self, url): From da3f7fb7f84e47de6aa0b29b16f78cb5bdf7d746 Mon Sep 17 00:00:00 2001 From: t0mm0 <dev@onairsoftware.co.uk> Date: Sun, 28 Dec 2014 17:07:32 +0000 Subject: [PATCH 07/16] [hitbox] add extractor for hitbox vods --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/hitbox.py | 104 +++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 youtube_dl/extractor/hitbox.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ea4faf2a6..3300dfeb4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -172,6 +172,7 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE +from .hitbox import HitboxIE from .hornbunny import HornBunnyIE from .hostingbulk import HostingBulkIE from .hotnewhiphop import HotNewHipHopIE diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py new file mode 100644 index 000000000..239da3cd4 --- /dev/null +++ b/youtube_dl/extractor/hitbox.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, +) + + +class HitboxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.hitbox.tv/video/358062', + 'info_dict': { + 'id': '358062', + 'title': 'Megaman', + 'alt_title': 'Megaman', + 'description': '', + 'ext': 'mp4', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 3834, + 'resolution': 'SD 480p', + 'uploader_id': 'supergreatfriend', + 'view_count': int, + 'upload_date': '20141225', + 'categories': [None], + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.hitbox.tv/video/203213', + 'info_dict': { + 'id': '203213', + 'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy', + 'alt_title': 'hitboxlive - Aug 9th #6', + 'description': '', + 'ext': 'mp4', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 215, + 'resolution': 'HD 720p', + 'uploader_id': 'hitboxlive', + 'view_count': int, + 'upload_date': '20140809', + 'categories': ['Live Show'], + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + thumb_base = 'https://edge.sf.hitbox.tv' + metadata = self._download_json( + 'https://www.hitbox.tv/api/media/video/%s' % (video_id), video_id + ) + + video_meta = metadata.get('video', [])[0] + title = video_meta.get('media_status') + alt_title = video_meta.get('media_title') + description = video_meta.get('media_description') + duration = int(float(video_meta.get('media_duration'))) + uploader = video_meta.get('media_user_name') + views = int(video_meta.get('media_views')) + upload_date = unified_strdate(video_meta.get('media_date_added')) + categories = [video_meta.get('category_name')] + thumbs = [ + {'url': thumb_base + video_meta.get('media_thumbnail'), + 'width': 320, + 'height': 180}, + {'url': thumb_base + video_meta.get('media_thumbnail_large'), + 'width': 768, + 'height': 432}, + ] + + player_config = self._download_json( + 'https://www.hitbox.tv/api/player/config/video/%s' % (video_id), + video_id + ) + + clip = player_config.get('clip') + video_url = clip.get('url') + res = clip.get('bitrates', [])[0].get('label') + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'url': video_url, + 'ext': 'mp4', + 'thumbnails': thumbs, + 'duration': duration, + 'resolution': res, + 'uploader_id': uploader, + 'view_count': views, + 'upload_date': upload_date, + 'categories': categories, + 'protocol': 'm3u8', + } From e3947e2b7fe1cd81f841daa1c4dc2ca72af8aefe Mon Sep 17 00:00:00 2001 From: t0mm0 <dev@onairsoftware.co.uk> Date: Mon, 29 Dec 2014 20:10:59 +0000 Subject: [PATCH 08/16] [hitbox] add support for live streams --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/hitbox.py | 146 +++++++++++++++++++++---------- 2 files changed, 102 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3300dfeb4..4f5a1ce18 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -172,7 +172,7 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE -from .hitbox import HitboxIE +from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE from .hostingbulk import HostingBulkIE from .hotnewhiphop import HotNewHipHopIE diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index 239da3cd4..eab2749ec 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -1,5 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import ( @@ -9,33 +10,13 @@ class HitboxIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.hitbox.tv/video/358062', - 'info_dict': { - 'id': '358062', - 'title': 'Megaman', - 'alt_title': 'Megaman', - 'description': '', - 'ext': 'mp4', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 3834, - 'resolution': 'SD 480p', - 'uploader_id': 'supergreatfriend', - 'view_count': int, - 'upload_date': '20141225', - 'categories': [None], - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { + _TEST = { 'url': 'http://www.hitbox.tv/video/203213', 'info_dict': { 'id': '203213', 'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy', 'alt_title': 'hitboxlive - Aug 9th #6', - 'description': '', + 'description': '\n', 'ext': 'mp4', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 215, @@ -49,24 +30,28 @@ class HitboxIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) + } + def _extract_metadata(self, url, video_id): thumb_base = 'https://edge.sf.hitbox.tv' metadata = self._download_json( - 'https://www.hitbox.tv/api/media/video/%s' % (video_id), video_id + '%s/%s' % (url, video_id), video_id ) - video_meta = metadata.get('video', [])[0] + date = 'media_live_since' + media_type = 'livestream' + if metadata.get('media_type') == 'video': + media_type = 'video' + date = 'media_date_added' + + video_meta = metadata.get(media_type, [])[0] title = video_meta.get('media_status') alt_title = video_meta.get('media_title') - description = video_meta.get('media_description') + description = video_meta.get('media_description_md') duration = int(float(video_meta.get('media_duration'))) uploader = video_meta.get('media_user_name') views = int(video_meta.get('media_views')) - upload_date = unified_strdate(video_meta.get('media_date_added')) + upload_date = unified_strdate(video_meta.get(date)) categories = [video_meta.get('category_name')] thumbs = [ {'url': thumb_base + video_meta.get('media_thumbnail'), @@ -77,6 +62,28 @@ def _real_extract(self, url): 'height': 432}, ] + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'ext': 'mp4', + 'thumbnails': thumbs, + 'duration': duration, + 'uploader_id': uploader, + 'view_count': views, + 'upload_date': upload_date, + 'categories': categories, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + metadata = self._extract_metadata( + 'https://www.hitbox.tv/api/media/video', + video_id + ) + player_config = self._download_json( 'https://www.hitbox.tv/api/player/config/video/%s' % (video_id), video_id @@ -86,19 +93,68 @@ def _real_extract(self, url): video_url = clip.get('url') res = clip.get('bitrates', [])[0].get('label') - return { - 'id': video_id, - 'title': title, - 'alt_title': alt_title, - 'description': description, - 'url': video_url, + metadata['resolution'] = res + metadata['url'] = video_url + metadata['protocol'] = 'm3u8' + + return metadata + + +class HitboxLiveIE(HitboxIE): + _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)' + _TEST = { + 'url': 'http://www.hitbox.tv/dimak', + 'info_dict': { + 'id': 'dimak', 'ext': 'mp4', - 'thumbnails': thumbs, - 'duration': duration, - 'resolution': res, - 'uploader_id': uploader, - 'view_count': views, - 'upload_date': upload_date, - 'categories': categories, - 'protocol': 'm3u8', - } + 'description': str, + 'upload_date': str, + 'title': str, + 'uploader_id': 'Dimak', + }, + 'params': { + # live + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + metadata = self._extract_metadata( + 'https://www.hitbox.tv/api/media/live', + video_id + ) + + player_config = self._download_json( + 'https://www.hitbox.tv/api/player/config/live/%s' % (video_id), + video_id + ) + + formats = [] + cdns = player_config.get('cdns') + servers = [] + for cdn in cdns: + base_url = cdn.get('netConnectionUrl') + host = re.search('.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1) + if base_url not in servers: + servers.append(base_url) + for stream in cdn.get('bitrates'): + label = stream.get('label') + if label != 'Auto': + formats.append({ + 'url': '%s/%s' % (base_url, stream.get('url')), + 'ext': 'mp4', + 'vbr': stream.get('bitrate'), + 'resolution': label, + 'rtmp_live': True, + 'format_note': host, + 'page_url': url, + 'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf', + }) + + self._sort_formats(formats) + metadata['formats'] = formats + metadata['is_live'] = True + metadata['title'] = self._live_title(metadata.get('title')) + return metadata From 0c0a70f4c6839903b326d7d9074e93235defaa5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 30 Dec 2014 03:22:07 +0600 Subject: [PATCH 09/16] [hitbox] Minor changes --- youtube_dl/extractor/hitbox.py | 58 +++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index eab2749ec..84bd7c080 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -1,14 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( - unified_strdate, + clean_html, + parse_iso8601, + float_or_none, + int_or_none, + compat_str, ) class HitboxIE(InfoExtractor): + IE_NAME = 'hitbox' _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.hitbox.tv/video/203213', @@ -16,13 +22,14 @@ class HitboxIE(InfoExtractor): 'id': '203213', 'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy', 'alt_title': 'hitboxlive - Aug 9th #6', - 'description': '\n', + 'description': '', 'ext': 'mp4', 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 215, + 'duration': 215.1666, 'resolution': 'HD 720p', - 'uploader_id': 'hitboxlive', + 'uploader': 'hitboxlive', 'view_count': int, + 'timestamp': 1407576133, 'upload_date': '20140809', 'categories': ['Live Show'], }, @@ -35,8 +42,7 @@ class HitboxIE(InfoExtractor): def _extract_metadata(self, url, video_id): thumb_base = 'https://edge.sf.hitbox.tv' metadata = self._download_json( - '%s/%s' % (url, video_id), video_id - ) + '%s/%s' % (url, video_id), video_id) date = 'media_live_since' media_type = 'livestream' @@ -47,11 +53,13 @@ def _extract_metadata(self, url, video_id): video_meta = metadata.get(media_type, [])[0] title = video_meta.get('media_status') alt_title = video_meta.get('media_title') - description = video_meta.get('media_description_md') - duration = int(float(video_meta.get('media_duration'))) + description = clean_html( + video_meta.get('media_description') or + video_meta.get('media_description_md')) + duration = float_or_none(video_meta.get('media_duration')) uploader = video_meta.get('media_user_name') - views = int(video_meta.get('media_views')) - upload_date = unified_strdate(video_meta.get(date)) + views = int_or_none(video_meta.get('media_views')) + timestamp = parse_iso8601(video_meta.get(date), ' ') categories = [video_meta.get('category_name')] thumbs = [ {'url': thumb_base + video_meta.get('media_thumbnail'), @@ -70,9 +78,9 @@ def _extract_metadata(self, url, video_id): 'ext': 'mp4', 'thumbnails': thumbs, 'duration': duration, - 'uploader_id': uploader, + 'uploader': uploader, 'view_count': views, - 'upload_date': upload_date, + 'timestamp': timestamp, 'categories': categories, } @@ -81,13 +89,11 @@ def _real_extract(self, url): metadata = self._extract_metadata( 'https://www.hitbox.tv/api/media/video', - video_id - ) + video_id) player_config = self._download_json( - 'https://www.hitbox.tv/api/player/config/video/%s' % (video_id), - video_id - ) + 'https://www.hitbox.tv/api/player/config/video/%s' % video_id, + video_id) clip = player_config.get('clip') video_url = clip.get('url') @@ -101,16 +107,18 @@ def _real_extract(self, url): class HitboxLiveIE(HitboxIE): + IE_NAME = 'hitbox:live' _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)' _TEST = { 'url': 'http://www.hitbox.tv/dimak', 'info_dict': { 'id': 'dimak', 'ext': 'mp4', - 'description': str, - 'upload_date': str, - 'title': str, - 'uploader_id': 'Dimak', + 'description': 'md5:c9f80fa4410bc588d7faa40003fc7d0e', + 'timestamp': int, + 'upload_date': compat_str, + 'title': compat_str, + 'uploader': 'Dimak', }, 'params': { # live @@ -123,13 +131,11 @@ def _real_extract(self, url): metadata = self._extract_metadata( 'https://www.hitbox.tv/api/media/live', - video_id - ) + video_id) player_config = self._download_json( - 'https://www.hitbox.tv/api/player/config/live/%s' % (video_id), - video_id - ) + 'https://www.hitbox.tv/api/player/config/live/%s' % video_id, + video_id) formats = [] cdns = player_config.get('cdns') From beb95e778170895c1435f55b71e3d76cbd9d8bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ce=CC=81dric=20Luthi?= <cedric.luthi@gmail.com> Date: Mon, 29 Dec 2014 22:58:14 +0100 Subject: [PATCH 10/16] [youtube] Fix videos with age gate and encrypted signatures The `sts` value is available on the embed webpage, get it from there. Fixes #4108. --- youtube_dl/extractor/youtube.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 550e18733..175e43272 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -766,11 +766,13 @@ def _real_extract(self, url): age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube + url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') data = compat_urllib_parse.urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''), + r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage( @@ -968,11 +970,10 @@ def _map_to_format_list(urlmap): elif 's' in url_data: encrypted_sig = url_data['s'][0] - if not age_gate: - jsplayer_url_json = self._search_regex( - r'"assets":.+?"js":\s*("[^"]+")', - video_webpage, 'JS player URL') - player_url = json.loads(jsplayer_url_json) + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + embed_webpage if age_gate else video_webpage, 'JS player URL') + player_url = json.loads(jsplayer_url_json) if player_url is None: player_url_json = self._search_regex( r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', From a87bb090d95bdfe02429c37a2a10a4733ae69fc6 Mon Sep 17 00:00:00 2001 From: t0mm0 <dev@onairsoftware.co.uk> Date: Mon, 29 Dec 2014 23:06:56 +0000 Subject: [PATCH 11/16] [daum] update 'full id' regex fixes #4566 --- youtube_dl/extractor/daum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index c6b813f58..b4b3b795c 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -38,7 +38,7 @@ def _real_extract(self, url): canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) full_id = self._search_regex( - r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]', + r'src=["\']http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"\']', webpage, 'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( From 1c57e7f1f496a6f1fad43c0b193ce534fc9ce67b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 30 Dec 2014 16:55:53 +0600 Subject: [PATCH 12/16] [daum] Improve full_id regex --- youtube_dl/extractor/daum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index b4b3b795c..934da765e 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -38,7 +38,7 @@ def _real_extract(self, url): canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) full_id = self._search_regex( - r'src=["\']http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"\']', + r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', webpage, 'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( From 3ee08848dbc2314df39e6931c75920b406733868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 30 Dec 2014 17:12:12 +0600 Subject: [PATCH 13/16] Credit @0xced for #4598 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index bb4d8b4d1..29ce9e3e4 100644 --- a/AUTHORS +++ b/AUTHORS @@ -96,3 +96,4 @@ Mathias Rav Petr Kutalek Will Glynn Max Reimann +Cédric Luthi From fccae2b911970d0ffa97800b27e70b1937cd3058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 30 Dec 2014 17:26:21 +0600 Subject: [PATCH 14/16] [youtube] Add test for age-gate video with encrypted signature --- youtube_dl/extractor/youtube.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 175e43272..3da83e3a8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -418,6 +418,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'upload_date': '20140605', }, }, + # Age-gate video with encrypted signature + { + 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU', + 'info_dict': { + 'id': '6kLq3WMV1nU', + 'ext': 'mp4', + 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', + 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'uploader': 'LloydVEVO', + 'uploader_id': 'LloydVEVO', + 'upload_date': '20110629', + }, + }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) { 'url': '__2ABJjxzNo', From a349873226c873a2f3bea9f5dffe167053cb1666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 30 Dec 2014 22:28:07 +0600 Subject: [PATCH 15/16] [atresplayer] Add extractor (Closes #2341) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/atresplayer.py | 111 ++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 youtube_dl/extractor/atresplayer.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4f5a1ce18..c15786ad7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -25,6 +25,7 @@ ArteTVDDCIE, ArteTVEmbedIE, ) +from .atresplayer import AtresPlayerIE from .audiomack import AudiomackIE from .auengine import AUEngineIE from .azubu import AzubuIE diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py new file mode 100644 index 000000000..7e987b2a0 --- /dev/null +++ b/youtube_dl/extractor/atresplayer.py @@ -0,0 +1,111 @@ +from __future__ import unicode_literals + +import time +import hmac + +from .common import InfoExtractor +from ..utils import ( + compat_str, + compat_urllib_request, + int_or_none, + float_or_none, + xpath_text, + ExtractorError, +) + + +class AtresPlayerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html' + _TESTS = [ + { + 'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html', + 'md5': 'efd56753cda1bb64df52a3074f62e38a', + 'info_dict': { + 'id': 'capitulo-10-especial-solidario-nochebuena', + 'ext': 'mp4', + 'title': 'Especial Solidario de Nochebuena', + 'description': 'md5:e2d52ff12214fa937107d21064075bf1', + 'duration': 5527.6, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, + { + 'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', + 'only_matching': True, + }, + ] + + _USER_AGENT = 'Dalvik/1.6.0 (Linux; U; Android 4.3; GT-I9300 Build/JSS15J' + _MAGIC = 'QWtMLXs414Yo+c#_+Q#K@NN)' + _TIMESTAMP_SHIFT = 30000 + + _TIME_API_URL = 'http://servicios.atresplayer.com/api/admin/time.json' + _URL_VIDEO_TEMPLATE = 'https://servicios.atresplayer.com/api/urlVideo/{1}/{0}/{1}|{2}|{3}.json' + _PLAYER_URL_TEMPLATE = 'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s' + _EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + episode_id = self._search_regex( + r'episode="([^"]+)"', webpage, 'episode id') + + timestamp = int_or_none(self._download_webpage( + self._TIME_API_URL, + video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) + timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT) + token = hmac.new(self._MAGIC.encode('utf-8'), episode_id + timestamp_shifted).hexdigest() + + formats = [] + for fmt in ['windows', 'android_tablet']: + request = compat_urllib_request.Request( + self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token)) + request.add_header('Youtubedl-user-agent', self._USER_AGENT) + + fmt_json = self._download_json( + request, video_id, 'Downloading %s video JSON' % fmt) + + result = fmt_json.get('resultDes') + if result.lower() != 'ok': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, result), expected=True) + + for _, video_url in fmt_json['resultObject'].items(): + if video_url.endswith('/Manifest'): + formats.extend(self._extract_f4m_formats(video_url[:-9] + '/manifest.f4m', video_id)) + else: + formats.append({ + 'url': video_url, + 'format_id': 'android', + 'preference': 1, + }) + self._sort_formats(formats) + + player = self._download_json( + self._PLAYER_URL_TEMPLATE % episode_id, + episode_id) + + path_data = player.get('pathData') + + episode = self._download_xml( + self._EPISODE_URL_TEMPLATE % path_data, + video_id, 'Downloading episode XML') + + duration = float_or_none(xpath_text( + episode, './media/asset/info/technical/contentDuration', 'duration')) + + art = episode.find('./media/asset/info/art') + title = xpath_text(art, './name', 'title') + description = xpath_text(art, './description', 'description') + thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } From e83eebb12f984c1614204e53c09dc5124b52b45c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 30 Dec 2014 22:46:23 +0600 Subject: [PATCH 16/16] [atresplayer] Fix python3 bug --- youtube_dl/extractor/atresplayer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 7e987b2a0..72e83bfc2 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -56,7 +56,10 @@ def _real_extract(self, url): self._TIME_API_URL, video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT) - token = hmac.new(self._MAGIC.encode('utf-8'), episode_id + timestamp_shifted).hexdigest() + token = hmac.new( + self._MAGIC.encode('ascii'), + (episode_id + timestamp_shifted).encode('utf-8') + ).hexdigest() formats = [] for fmt in ['windows', 'android_tablet']: