From b66745288e50cff42ff711e63242b5d97e80cd4f Mon Sep 17 00:00:00 2001 From: net Date: Sat, 27 Sep 2014 20:21:46 +0300 Subject: [PATCH 1/3] [sport5] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sport5.py | 70 ++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 youtube_dl/extractor/sport5.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1f1fc0eb26..c3a4d3c9aa 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -340,6 +340,7 @@ from .spiegeltv import SpiegeltvIE from .spike import SpikeIE from .sportdeutschland import SportDeutschlandIE +from .sport5 import Sport5IE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamcloud import StreamcloudIE diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py new file mode 100644 index 0000000000..9a4e39a43e --- /dev/null +++ b/youtube_dl/extractor/sport5.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from youtube_dl.utils import compat_str, compat_urlretrieve + + + +class Sport5IE(InfoExtractor): + _VALID_URL = r'http://.*sport5\.co\.il' + _TESTS = [{ + 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', + 'info_dict': { + 'id': 's5-Y59xx1-GUh2', + 'ext': 'mp4', + 'title': 'md5:4a2a5eba7e7dc88fdc446cbca8a41c79', + } + }, { + 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', + 'info_dict': { + 'id': 's5-SiXxx1-hKh2', + 'ext': 'mp4', + 'title': 'md5:5cb1c6bfc0f16086e59f6683013f8e02', + } + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + webpage = self._download_webpage(url, '') + + media_id = self._html_search_regex('clipId=(s5-\w+-\w+)', webpage, 'media id') + + xml = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % media_id, + media_id, 'Downloading media XML') + + title = xml.find('./Title').text + duration = xml.find('./Duration').text + description = xml.find('./Description').text + thumbnail = xml.find('./PosterLinks/PosterIMG').text + player_url = xml.find('./PlaybackLinks/PlayerUrl').text + file_els = xml.findall('./PlaybackLinks/FileURL') + + formats = [] + + for file_el in file_els: + bitrate = file_el.attrib.get('bitrate') + width = int(file_el.attrib.get('width')) + height = int(file_el.attrib.get('height')) + formats.append({ + 'url': compat_str(file_el.text), + 'ext': 'mp4', + 'height': height, + 'width': width + }) + + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'player_url': player_url, + } \ No newline at end of file From 68b09730461de20395cee9427dc469fa9edc4022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Sep 2014 02:07:42 +0700 Subject: [PATCH 2/3] [YoutubeDL] Expect all kind of strings in urlopen Now it doesn't fail if req is python2's str --- youtube_dl/YoutubeDL.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a1713dc5ad..b485dbdf1c 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1250,12 +1250,13 @@ def urlopen(self, req): # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) # To work around aforementioned issue we will replace request's original URL with # percent-encoded one - url = req if isinstance(req, compat_str) else req.get_full_url() + req_is_string = isinstance(req, basestring) + url = req if req_is_string else req.get_full_url() url_escaped = escape_url(url) # Substitute URL if any change after escaping if url != url_escaped: - if isinstance(req, compat_str): + if req_is_string: req = url_escaped else: req = compat_urllib_request.Request( From 0b75c2a88ba56a84322db6cc1a298d7e52b44b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Sep 2014 02:31:14 +0700 Subject: [PATCH 3/3] [sport5] Capture error message and improve --- youtube_dl/extractor/sport5.py | 88 +++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py index 9a4e39a43e..3f680bfc63 100644 --- a/youtube_dl/extractor/sport5.py +++ b/youtube_dl/extractor/sport5.py @@ -4,67 +4,89 @@ import re from .common import InfoExtractor -from youtube_dl.utils import compat_str, compat_urlretrieve - +from ..utils import ExtractorError class Sport5IE(InfoExtractor): - _VALID_URL = r'http://.*sport5\.co\.il' - _TESTS = [{ + _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P\d+)' + _TESTS = [ + { 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', 'info_dict': { 'id': 's5-Y59xx1-GUh2', 'ext': 'mp4', - 'title': 'md5:4a2a5eba7e7dc88fdc446cbca8a41c79', - } + 'title': 'ולנסיה-קורדובה 0:3', + 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', + 'duration': 228, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', }, { 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', 'info_dict': { 'id': 's5-SiXxx1-hKh2', 'ext': 'mp4', - 'title': 'md5:5cb1c6bfc0f16086e59f6683013f8e02', - } + 'title': 'GOALS_CELTIC_270914.mp4', + 'description': '', + 'duration': 87, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + media_id = mobj.group('id') - webpage = self._download_webpage(url, '') + webpage = self._download_webpage(url, media_id) - media_id = self._html_search_regex('clipId=(s5-\w+-\w+)', webpage, 'media id') + video_id = self._html_search_regex('clipId=([\w-]+)', webpage, 'video id') - xml = self._download_xml( - 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % media_id, - media_id, 'Downloading media XML') + metadata = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, + video_id) - title = xml.find('./Title').text - duration = xml.find('./Duration').text - description = xml.find('./Description').text - thumbnail = xml.find('./PosterLinks/PosterIMG').text - player_url = xml.find('./PlaybackLinks/PlayerUrl').text - file_els = xml.findall('./PlaybackLinks/FileURL') + error = metadata.find('./Error') + if error is not None: + raise ExtractorError( + '%s returned error: %s - %s' % ( + self.IE_NAME, + error.find('./Name').text, + error.find('./Description').text), + expected=True) - formats = [] + title = metadata.find('./Title').text + description = metadata.find('./Description').text + duration = int(metadata.find('./Duration').text) - for file_el in file_els: - bitrate = file_el.attrib.get('bitrate') - width = int(file_el.attrib.get('width')) - height = int(file_el.attrib.get('height')) - formats.append({ - 'url': compat_str(file_el.text), - 'ext': 'mp4', - 'height': height, - 'width': width - }) + posters_el = metadata.find('./PosterLinks') + thumbnails = [{ + 'url': thumbnail.text, + 'width': int(thumbnail.get('width')), + 'height': int(thumbnail.get('height')), + } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] + categories_el = metadata.find('./Categories') + categories = [ + cat.get('name') for cat in categories_el.findall('./Category') + ] if categories_el is not None else [] + + formats = [{ + 'url': fmt.text, + 'ext': 'mp4', + 'vbr': int(fmt.get('bitrate')), + 'width': int(fmt.get('width')), + 'height': int(fmt.get('height')), + } for fmt in metadata.findall('./PlaybackLinks/FileURL')] self._sort_formats(formats) return { - 'id': media_id, + 'id': video_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, + 'thumbnails': thumbnails, 'duration': duration, + 'categories': categories, 'formats': formats, - 'player_url': player_url, } \ No newline at end of file