From 691008087b902fa731a8f4f840c1821c93505840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 31 Aug 2013 15:05:59 +0200 Subject: [PATCH 1/7] Add an automatic page generator for the supported sites (related #156) They are listed in the "supportedsites.html" page. --- devscripts/gh-pages/update-sites.py | 33 +++++++++++++++++++++++++++++ devscripts/release.sh | 1 + 2 files changed, 34 insertions(+) create mode 100755 devscripts/gh-pages/update-sites.py diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py new file mode 100755 index 000000000..fa4bb2beb --- /dev/null +++ b/devscripts/gh-pages/update-sites.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +import sys +import os +import textwrap + +# We must be able to import youtube_dl +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +import youtube_dl + +def main(): + with open('supportedsites.html.in', 'r', encoding='utf-8') as tmplf: + template = tmplf.read() + + ie_htmls = [] + for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME): + ie_html = '{}'.format(ie.IE_NAME) + try: + ie_html += ': {}'.format(ie.IE_DESC) + except AttributeError: + pass + if ie.working() == False: + ie_html += ' (Currently broken)' + ie_htmls.append('
  • {}
  • '.format(ie_html)) + + template = template.replace('@SITES@', textwrap.indent('\n'.join(ie_htmls), '\t')) + + with open('supportedsites.html', 'w', encoding='utf-8') as sitesf: + sitesf.write(template) + +if __name__ == '__main__': + main() diff --git a/devscripts/release.sh b/devscripts/release.sh index 24c9ad8d8..62c68a6cf 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -85,6 +85,7 @@ ROOT=$(pwd) "$ROOT/devscripts/gh-pages/sign-versions.py" < "$ROOT/updates_key.pem" "$ROOT/devscripts/gh-pages/generate-download.py" "$ROOT/devscripts/gh-pages/update-copyright.py" + "$ROOT/devscripts/gh-pages/update-sites.py" git add *.html *.html.in update git commit -m "release $version" git show HEAD From 6c758d79de48956b90d9e78aec695ee0b10b00d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 31 Aug 2013 22:35:39 +0200 Subject: [PATCH 2/7] [metacafe] Add more cases for detecting the uploader detection (reported in #1343) --- youtube_dl/extractor/metacafe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index e38dc98b4..e537648ff 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -122,7 +122,7 @@ def _real_extract(self, url): video_title = self._html_search_regex(r'(?im)(.*) - Video', webpage, u'title') description = self._og_search_description(webpage) video_uploader = self._html_search_regex( - r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("channel","([^"]+)"\);', + r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, u'uploader nickname', fatal=False) return { From 8e4e89f1c236e1bec38c5363c1c341930056211e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 2 Sep 2013 11:54:09 +0200 Subject: [PATCH 3/7] Add an extractor for VeeHD (closes #1359) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/veehd.py | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/veehd.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 90f1a4418..9f56e427c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -89,6 +89,7 @@ from .unistra import UnistraIE from .ustream import UstreamIE from .vbox7 import Vbox7IE +from .veehd import VeeHDIE from .veoh import VeohIE from .vevo import VevoIE from .videofyme import VideofyMeIE diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py new file mode 100644 index 000000000..3a99a29c6 --- /dev/null +++ b/youtube_dl/extractor/veehd.py @@ -0,0 +1,56 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + get_element_by_id, + clean_html, +) + +class VeeHDIE(InfoExtractor): + _VALID_URL = r'https?://veehd.com/video/(?P\d+)' + + _TEST = { + u'url': u'http://veehd.com/video/4686958', + u'file': u'4686958.mp4', + u'info_dict': { + u'title': u'Time Lapse View from Space ( ISS)', + u'uploader_id': u'spotted', + u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"', + webpage, u'player path') + player_url = compat_urlparse.urljoin(url, player_path) + player_page = self._download_webpage(player_url, video_id, + u'Downloading player page') + config_json = self._search_regex(r'value=\'config=({.+?})\'', + player_page, u'config json') + config = json.loads(config_json) + + video_url = compat_urlparse.unquote(config['clip']['url']) + title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0]) + uploader_id = self._html_search_regex(r'(.+?)', + webpage, u'uploader') + thumbnail = self._search_regex(r'(.*?) Date: Tue, 3 Sep 2013 10:48:56 +0200 Subject: [PATCH 4/7] [vimeo] add support for videos that embed the download url in the player page (fixes #1364) --- youtube_dl/extractor/vimeo.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 512e06e2a..dee4175ef 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -44,6 +44,16 @@ class VimeoIE(InfoExtractor): u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography', }, }, + { + u'url': u'http://player.vimeo.com/video/54469442', + u'file': u'54469442.mp4', + u'md5': u'619b811a4417aa4abe78dc653becf511', + u'note': u'Videos that embed the url in the player page', + u'info_dict': { + u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', + u'uploader': u'The BLN & Business of Software', + }, + }, ] def _login(self): @@ -112,7 +122,8 @@ def _real_extract(self, url, new_video=True): # Extract the config JSON try: - config = webpage.split(' = {config:')[1].split(',assets:')[0] + config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + webpage, u'info section', flags=re.DOTALL) config = json.loads(config) except: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): @@ -132,7 +143,9 @@ def _real_extract(self, url, new_video=True): video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None # Extract video thumbnail - video_thumbnail = config["video"]["thumbnail"] + video_thumbnail = config["video"].get("thumbnail") + if video_thumbnail is None: + _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1] # Extract video description video_description = get_element_by_attribute("itemprop", "description", webpage) @@ -154,14 +167,15 @@ def _real_extract(self, url, new_video=True): # TODO bind to format param codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] files = { 'hd': [], 'sd': [], 'other': []} + config_files = config["video"].get("files") or config["request"].get("files") for codec_name, codec_extension in codecs: - if codec_name in config["video"]["files"]: - if 'hd' in config["video"]["files"][codec_name]: + if codec_name in config_files: + if 'hd' in config_files[codec_name]: files['hd'].append((codec_name, codec_extension, 'hd')) - elif 'sd' in config["video"]["files"][codec_name]: + elif 'sd' in config_files[codec_name]: files['sd'].append((codec_name, codec_extension, 'sd')) else: - files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0])) + files['other'].append((codec_name, codec_extension, config_files[codec_name][0])) for quality in ('hd', 'sd', 'other'): if len(files[quality]) > 0: @@ -173,8 +187,12 @@ def _real_extract(self, url, new_video=True): else: raise ExtractorError(u'No known codec found') - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) + video_url = None + if isinstance(config_files[video_codec], dict): + video_url = config_files[video_codec][video_quality].get("url") + if video_url is None: + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, video_quality, video_codec.upper()) return [{ 'id': video_id, From 9c2ade40de53bae865c5267642651c81d16e48a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 3 Sep 2013 11:11:36 +0200 Subject: [PATCH 5/7] [vimeo] Handle Assertions Error when trying to get the description In some pages the html tags are not closed, python 2.6 cannot handle it. --- youtube_dl/extractor/vimeo.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index dee4175ef..4a7d82b7a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -148,9 +148,17 @@ def _real_extract(self, url, new_video=True): _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1] # Extract video description - video_description = get_element_by_attribute("itemprop", "description", webpage) - if video_description: video_description = clean_html(video_description) - else: video_description = u'' + video_description = None + try: + video_description = get_element_by_attribute("itemprop", "description", webpage) + if video_description: video_description = clean_html(video_description) + except AssertionError as err: + # On some pages like (http://player.vimeo.com/video/54469442) the + # html tags are not closed, python 2.6 cannot handle it + if err.args[0] == 'we should not get here!': + pass + else: + raise # Extract upload date video_upload_date = None From 4ff7a0f1f6e6b1ad1743330d318dfe85806923b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 3 Sep 2013 11:33:59 +0200 Subject: [PATCH 6/7] [dailymotion] improve the regex for extracting the video info --- youtube_dl/extractor/dailymotion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 1ea449ca8..439033d23 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -55,7 +55,8 @@ def _real_extract(self, url): embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id embed_page = self._download_webpage(embed_url, video_id, u'Downloading embed page') - info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info') + info = self._search_regex(r'var info = ({.*?}),$', embed_page, + 'video info', flags=re.MULTILINE) info = json.loads(info) # TODO: support choosing qualities From c8dbccde30d9ca06d4c9305329a9aacd10420276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 3 Sep 2013 11:51:01 +0200 Subject: [PATCH 7/7] [orf] Remove the test video, they seem to expire in one week --- youtube_dl/extractor/orf.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 41ef8e992..cfca2a063 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -14,19 +14,6 @@ class ORFIE(InfoExtractor): _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P\d+)' - _TEST = { - u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter', - u'file': u'6566957.flv', - u'info_dict': { - u'title': u'Wetter', - u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at', - }, - u'params': { - # It uses rtmp - u'skip_download': True, - } - } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id')