From 689fb748ee1ba8e61f99d21a3bcb1bc83b708649 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 11 Sep 2015 04:44:17 +0100 Subject: [PATCH 01/18] [utlis] add extract_attributes for extracting html tags attributes --- youtube_dl/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 206dd56bc..bcebf9cc5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -248,6 +248,14 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'): + attributes = re.findall(attributes_regex, attributes_str) + attributes_dict = {} + if attributes: + attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes} + return attributes_dict + + def clean_html(html): """Clean an HTML snippet into a readable string""" From ed1269000f24a6ddc683a295ff402ef3ded5c4fb Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 11 Sep 2015 04:46:21 +0100 Subject: [PATCH 02/18] [brightcove] add support for brightcove in page embed(fixes #6824) --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/brightcove.py | 92 ++++++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 21 ++++++- 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 57f55b479..fcd9edec3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -59,7 +59,10 @@ from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..a07c0888f 100644 --- 
a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -22,6 +22,10 @@ fix_xml_ampersands, unescapeHTML, unsmuggle_url, + js_to_json, + int_or_none, + parse_iso8601, + extract_attributes, ) @@ -346,3 +350,91 @@ def _extract_video_info(self, video_info): if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + +class BrightcoveInPageEmbedIE(InfoExtractor): + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' + TEST = { + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'flv', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'duration': 165768, + } + } + + @staticmethod + def _extract_url(webpage): + video_attributes = re.search(r'(?s)]*)>.*?', webpage) + if video_attributes: + video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') + account_id = video_attributes.get('account') + player_id = video_attributes.get('player') + embed = video_attributes.get('embed') + video_id = video_attributes.get('video-id') + if account_id and player_id and embed and video_id: + return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id) + return None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + account_id, player_id, embed, video_id = mobj.groups() + + webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + + catalog = self._parse_json( + js_to_json( + self._search_regex( + r'catalog\(({[^}]+})\);', + webpage, + 'catalog' + ) + ), + video_id + ) + policy_key = 
catalog['policyKey'] + + req = compat_urllib_request.Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + description = json_data.get('description') + thumbnail = json_data.get('name') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = int_or_none(json_data.get('duration')) + + formats = [] + for source in json_data.get('sources'): + source_type = source.get('type') + if source_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + else: + src = source.get('src') + if src: + formats.append({ + 'url': src, + 'abr': source.get('avg_bitrate'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'filesize': source.get('size'), + 'container': source.get('container'), + 'vcodec': source.get('container'), + }) + else: + formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..7a3a7f66b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,7 +29,10 @@ url_basename, xpath_text, ) -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -1012,6 +1015,17 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 
'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, } ] @@ -1288,6 +1302,11 @@ def _playlist_from_matches(matches, getter=None, ie=None): 'entries': entries, } + # Look for Brightcove In Page Embed: + brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage) + if brightcove_in_page_embed_url: + return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', From 53407e3f383ed80c67db9e06b8c3480257aa3184 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 23 Sep 2015 14:02:13 +0100 Subject: [PATCH 03/18] [brightcove] fix streaming_src extraction --- youtube_dl/extractor/brightcove.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a07c0888f..e4a7befee 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -413,7 +413,7 @@ def _real_extract(self, url): if source_type == 'application/x-mpegURL': formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) else: - src = source.get('src') + src = source.get('src') or source.get('streaming_src') if src: formats.append({ 'url': src, @@ -424,8 +424,6 @@ def _real_extract(self, url): 'container': source.get('container'), 'vcodec': source.get('container'), }) - else: - formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) self._sort_formats(formats) From c01e1a96aa964ef6d5f0bf7675dbe34096b1d2c8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 30 Sep 2015 11:20:43 +0100 Subject: [PATCH 04/18] [brightcove] fix test and fields extraction --- youtube_dl/extractor/brightcove.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) 
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index e4a7befee..b41cee91b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -354,14 +354,18 @@ def _extract_video_info(self, video_info): class BrightcoveInPageEmbedIE(InfoExtractor): _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' - TEST = { + _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', 'info_dict': { 'id': '4463358922001', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Meet the man behind Popcorn Time', - 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'timestamp': 1441391203, + 'upload_date': '20150904', 'duration': 165768, + 'uploader_id': '929656772001', } } @@ -403,7 +407,7 @@ def _real_extract(self, url): title = json_data['name'] description = json_data.get('description') - thumbnail = json_data.get('name') + thumbnail = json_data.get('thumbnail') timestamp = parse_iso8601(json_data.get('published_at')) duration = int_or_none(json_data.get('duration')) @@ -417,12 +421,13 @@ def _real_extract(self, url): if src: formats.append({ 'url': src, - 'abr': source.get('avg_bitrate'), + 'tbr': source.get('avg_bitrate'), 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), 'filesize': source.get('size'), 'container': source.get('container'), - 'vcodec': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), }) self._sort_formats(formats) @@ -435,4 +440,5 @@ def _real_extract(self, url): 'timestamp': timestamp, 'duration': duration, 'formats': formats, + 'uploader_id': account_id, } From 9550ca506fccf9c9d795816cc0a7817ff262ef45 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 31 Oct 2015 19:36:04 +0100 
Subject: [PATCH 05/18] [utils] change extract_attributes to work in python 2 --- youtube_dl/extractor/brightcove.py | 3 +-- youtube_dl/utils.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b41cee91b..c6ad1d065 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -383,8 +383,7 @@ def _extract_url(webpage): return None def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - account_id, player_id, embed, video_id = mobj.groups() + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bcebf9cc5..518cea98b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -252,7 +252,8 @@ def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s attributes = re.findall(attributes_regex, attributes_str) attributes_dict = {} if attributes: - attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes} + for (attribute_name, attribute_value) in attributes: + attributes_dict[attribute_name] = attribute_value return attributes_dict From a662489877aa8d88b898a4984c2e580b9edbe7de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 05:09:50 +0600 Subject: [PATCH 06/18] [brightcove:embedinpage] Make more robust and extract rtmp streams --- youtube_dl/extractor/brightcove.py | 117 +++++++++++++++++++---------- 1 file changed, 79 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 2c7d968a8..2ad35ec90 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -23,6 +23,7 @@ unescapeHTML, unsmuggle_url, js_to_json, + 
float_or_none, int_or_none, parse_iso8601, extract_attributes, @@ -353,7 +354,7 @@ def _extract_video_info(self, video_info): class BrightcoveInPageEmbedIE(InfoExtractor): - _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[\da-f-]+)_(?P[a-z]+)/index\.html\?.*videoId=(?P\d+)' _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -385,59 +386,99 @@ def _extract_url(webpage): def _real_extract(self, url): account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) - catalog = self._parse_json( - js_to_json( - self._search_regex( - r'catalog\(({[^}]+})\);', - webpage, - 'catalog' - ) - ), - video_id - ) - policy_key = catalog['policyKey'] + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') req = compat_urllib_request.Request( - 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' + % (account_id, video_id), headers={'Accept': 'application/json;pk=%s' % policy_key}) json_data = self._download_json(req, video_id) title = json_data['name'] + + formats = [] + for source in 
json_data.get('sources', []): + source_type = source.get('type') + src = source.get('src') + if source_type == 'application/x-mpegURL': + if not src: + continue + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + streaming_src = source.get('streaming_src') + stream_name, app_name = source.get('stream_name'), source.get('app_name') + if not src and not streaming_src and (not stream_name or not app_name): + continue + tbr = float_or_none(source.get('avg_bitrate'), 1000) + height = int_or_none(source.get('height')) + f = { + 'tbr': tbr, + 'width': int_or_none(source.get('width')), + 'height': height, + 'filesize': int_or_none(source.get('size')), + 'container': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), + } + + def build_format_id(kind): + format_id = kind + if tbr: + format_id += '-%dk' % int(tbr) + if height: + format_id += '-%dp' % height + return format_id + + if src or streaming_src: + f.update({ + 'url': src or streaming_src, + 'format_id': build_format_id('http' if src else 'http-streaming'), + 'preference': 2 if src else 1, + }) + else: + f.update({ + 'url': app_name, + 'play_path': stream_name, + 'format_id': build_format_id('rtmp'), + }) + formats.append(f) + self._sort_formats(formats) + description = json_data.get('description') thumbnail = json_data.get('thumbnail') timestamp = parse_iso8601(json_data.get('published_at')) - duration = int_or_none(json_data.get('duration')) - - formats = [] - for source in json_data.get('sources'): - source_type = source.get('type') - if source_type == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) - else: - src = source.get('src') or source.get('streaming_src') - if src: - formats.append({ - 'url': src, - 'tbr': source.get('avg_bitrate'), - 'width': 
int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - 'filesize': source.get('size'), - 'container': source.get('container'), - 'vcodec': source.get('codec'), - 'ext': source.get('container').lower(), - }) - - self._sort_formats(formats) + duration = float_or_none(json_data.get('duration'), 1000) + tags = json_data.get('tags', []) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, - 'timestamp': timestamp, 'duration': duration, - 'formats': formats, + 'timestamp': timestamp, 'uploader_id': account_id, + 'formats': formats, + 'tags': tags, } From 536f819eda975224666374a9ce83cc3472f5aa5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 05:51:05 +0600 Subject: [PATCH 07/18] [brightcove] Imrove extraction of new embeds --- youtube_dl/extractor/brightcove.py | 42 +++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 2ad35ec90..d494b8b67 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -354,7 +354,7 @@ def _extract_video_info(self, video_info): class BrightcoveInPageEmbedIE(InfoExtractor): - _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[\da-f-]+)_(?P[a-z]+)/index\.html\?.*videoId=(?P\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*videoId=(?P\d+)' _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -370,18 +370,34 @@ class BrightcoveInPageEmbedIE(InfoExtractor): } } - @staticmethod - def _extract_url(webpage): - video_attributes = re.search(r'(?s)]*)>.*?', webpage) - if video_attributes: - video_attributes = extract_attributes(video_attributes.group(), 
r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') - account_id = video_attributes.get('account') - player_id = video_attributes.get('player') - embed = video_attributes.get('embed') - video_id = video_attributes.get('video-id') - if account_id and player_id and embed and video_id: - return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id) - return None + def _extract_urls(self, webpage): + # Reference: + # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript) + # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html + + entries = [] + + # Look for iframe embeds [1] + for _, url in re.findall( + r']+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(self.url_result(self._proto_relative_url(url))) + # Look for embed_in_page embeds [2] + # According to examples from [3] it's unclear whether video id may be optional + # and what to do when it is + for video_id, account_id, player_id, embed in re.findall( + r'''(?sx) + ]+ + data-video-id=["\'](\d+)["\'][^>]*>.*? + .*? 
+ ]+ + src=["\'](?:https?:)?//players\.brightcove\.net/ + (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js + ''', webpage): + entries.append(self.url_result( + 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' + % (account_id, player_id, embed, video_id))) + return entries def _real_extract(self, url): account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() From 4fcaa4f4a5ef328009bef53ebc491ebe76452550 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 05:54:16 +0600 Subject: [PATCH 08/18] [brightcove] Rename extractor to brightcove legacy Old embedding approaches are now "Legacy Studio" --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/brightcove.py | 2 +- youtube_dl/extractor/generic.py | 4 ++-- youtube_dl/extractor/nowness.py | 4 ++-- youtube_dl/extractor/safari.py | 4 ++-- youtube_dl/extractor/space.py | 6 +++--- youtube_dl/extractor/tlc.py | 6 +++--- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 08cb93d76..8a0f76d7e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -61,7 +61,7 @@ from .br import BRIE from .breakcom import BreakIE from .brightcove import ( - BrightcoveIE, + BrightcoveLegacyIE, BrightcoveInPageEmbedIE, ) from .buzzfeed import BuzzFeedIE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index d494b8b67..4dbc2e975 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -30,7 +30,7 @@ ) -class BrightcoveIE(InfoExtractor): +class BrightcoveLegacyIE(InfoExtractor): _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 34d930a2d..8f99dd9b1 100644 --- 
a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -31,7 +31,7 @@ xpath_text, ) from .brightcove import ( - BrightcoveIE, + BrightcoveLegacyIE, BrightcoveInPageEmbedIE, ) from .nbc import NBCSportsVPlayerIE @@ -1305,7 +1305,7 @@ def _playlist_from_matches(matches, getter=None, ie=None): urlrs, playlist_id=video_id, playlist_title=video_title) # Look for BrightCove: - bc_urls = BrightcoveIE._extract_brightcove_urls(webpage) + bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: self.to_screen('Brightcove video detected.') entries = [{ diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index b97f62fdb..dab487ea4 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,7 +1,7 @@ # encoding: utf-8 from __future__ import unicode_literals -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE from .common import InfoExtractor from ..utils import ExtractorError from ..compat import ( @@ -22,7 +22,7 @@ def _extract_url_result(self, post): 'http://www.nowness.com/iframe?id=%s' % video_id, video_id, note='Downloading player JavaScript', errnote='Unable to download player JavaScript') - bc_url = BrightcoveIE._extract_brightcove_url(player_code) + bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) if bc_url is None: raise ExtractorError('Could not find player definition') return self.url_result(bc_url, 'Brightcove') diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index a602af692..4f1f05c6a 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE from ..compat import ( compat_urllib_parse, @@ -112,7 +112,7 @@ def _real_extract(self, url): '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), part) - bc_url = 
BrightcoveIE._extract_brightcove_url(webpage) + bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) if not bc_url: raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index c2d0d36a6..2f190f764 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE from ..utils import RegexNotFoundError, ExtractorError @@ -31,8 +31,8 @@ def _real_extract(self, url): brightcove_url = self._og_search_video_url(webpage) except RegexNotFoundError: # Other videos works fine with the info from the object - brightcove_url = BrightcoveIE._extract_brightcove_url(webpage) + brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) if brightcove_url is None: raise ExtractorError( 'The webpage does not contain a video', expected=True) - return self.url_result(brightcove_url, BrightcoveIE.ie_key()) + return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key()) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index 13263614c..d6d038a8d 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor -from .brightcove import BrightcoveIE +from .brightcove import BrightcoveLegacyIE from .discovery import DiscoveryIE from ..compat import compat_urlparse @@ -66,6 +66,6 @@ def _real_extract(self, url): return { '_type': 'url', - 'url': BrightcoveIE._extract_brightcove_url(iframe), - 'ie': BrightcoveIE.ie_key(), + 'url': BrightcoveLegacyIE._extract_brightcove_url(iframe), + 'ie': BrightcoveLegacyIE.ie_key(), } From 5c17f0a67a3a0518f448825eee54d16045acd63c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 05:55:59 +0600 Subject: [PATCH 09/18] [brightcove:embedinpage] Rename 
extractor to brightcove new It's not actually embed_in_page but "New Studio" and allows both iframe and embed_in_page embeds --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/brightcove.py | 2 +- youtube_dl/extractor/generic.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8a0f76d7e..64ce3210b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -62,7 +62,7 @@ from .breakcom import BreakIE from .brightcove import ( BrightcoveLegacyIE, - BrightcoveInPageEmbedIE, + BrightcoveNewIE, ) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4dbc2e975..5aca0b378 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -353,7 +353,7 @@ def _extract_video_info(self, video_info): return info -class BrightcoveInPageEmbedIE(InfoExtractor): +class BrightcoveNewIE(InfoExtractor): _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*videoId=(?P\d+)' _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8f99dd9b1..0797e1a90 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -32,7 +32,7 @@ ) from .brightcove import ( BrightcoveLegacyIE, - BrightcoveInPageEmbedIE, + BrightcoveNewIE, ) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE @@ -1322,7 +1322,7 @@ def _playlist_from_matches(matches, getter=None, ie=None): } # Look for Brightcove In Page Embed: - brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage) + brightcove_in_page_embed_url = BrightcoveNewIE._extract_url(webpage) if brightcove_in_page_embed_url: return 
self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') From e721d857c2b24c10c09626a4a79172d85e0dc5fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 05:56:51 +0600 Subject: [PATCH 10/18] [brightcove] Clarify IE_NAMEs --- youtube_dl/extractor/brightcove.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 5aca0b378..8ee5486fe 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -31,6 +31,7 @@ class BrightcoveLegacyIE(InfoExtractor): + IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' @@ -354,6 +355,7 @@ def _extract_video_info(self, video_info): class BrightcoveNewIE(InfoExtractor): + IE_NAME = 'brightcove:new' _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[^/]+)_(?P[^/]+)/index\.html\?.*videoId=(?P\d+)' _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', From 24af85298ed1862ac809677e70ff59f3e9ee3234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 06:01:56 +0600 Subject: [PATCH 11/18] [brightcove] Fix _extract_urls --- youtube_dl/extractor/brightcove.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 8ee5486fe..1c7783dcb 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -372,7 +372,8 @@ class BrightcoveNewIE(InfoExtractor): } } - def _extract_urls(self, webpage): + @staticmethod + def _extract_urls(webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe # 2. 
http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript) @@ -383,7 +384,7 @@ def _extract_urls(self, webpage): # Look for iframe embeds [1] for _, url in re.findall( r']+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(self.url_result(self._proto_relative_url(url))) + entries.append(url) # Look for embed_in_page embeds [2] # According to examples from [3] it's unclear whether video id may be optional # and what to do when it is @@ -396,9 +397,9 @@ def _extract_urls(self, webpage): src=["\'](?:https?:)?//players\.brightcove\.net/ (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js ''', webpage): - entries.append(self.url_result( + entries.append( 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' - % (account_id, player_id, embed, video_id))) + % (account_id, player_id, embed, video_id)) return entries def _real_extract(self, url): From f6519f89b09be788549f68ba12f0cc31c55d9751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 06:03:07 +0600 Subject: [PATCH 12/18] [generic] Extract Brightcove New Studio embeds --- youtube_dl/extractor/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0797e1a90..334864db3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1321,10 +1321,10 @@ def _playlist_from_matches(matches, getter=None, ie=None): 'entries': entries, } - # Look for Brightcove In Page Embed: - brightcove_in_page_embed_url = BrightcoveNewIE._extract_url(webpage) - if brightcove_in_page_embed_url: - return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') + # Look for Brightcove New Studio embeds + bc_urls = BrightcoveNewIE._extract_urls(webpage) + if bc_urls: + return _playlist_from_matches(bc_urls, ie='BrightcoveNew') # Look for embedded rtl.nl player matches 
= re.findall( From 1f4b722b00fd5c24468cd4d072e8b5c5428ca515 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 06:03:32 +0600 Subject: [PATCH 13/18] [generic] Clarify Brightcove Legacy Studio comment --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 334864db3..8ba0a9913 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1304,7 +1304,7 @@ def _playlist_from_matches(matches, getter=None, ie=None): return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) - # Look for BrightCove: + # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: self.to_screen('Brightcove video detected.') From 3b7d9aa487399e06bba5dc03c90b6576c2b067b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 06:05:46 +0600 Subject: [PATCH 14/18] Rename all references to legacy studio Brightcove extractor --- youtube_dl/extractor/aljazeera.py | 4 ++-- youtube_dl/extractor/generic.py | 8 ++++---- youtube_dl/extractor/nowness.py | 2 +- youtube_dl/extractor/safari.py | 2 +- youtube_dl/extractor/space.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 184a14a4f..5b2c0dc9a 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -15,7 +15,7 @@ class AlJazeeraIE(InfoExtractor): 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', 'uploader': 'Al Jazeera English', }, - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'skip': 'Not accessible from Travis CI server', } @@ -32,5 +32,5 @@ def _real_extract(self, url): 
'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc' '&%40videoPlayer={0}'.format(brightcove_id) ), - 'ie_key': 'Brightcove', + 'ie_key': 'BrightcoveLegacy', } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8ba0a9913..51516a38a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -278,7 +278,7 @@ class GenericIE(InfoExtractor): # it also tests brightcove videos that need to set the 'Referer' in the # http requests { - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { 'id': '2765128793001', @@ -302,7 +302,7 @@ class GenericIE(InfoExtractor): 'uploader': 'thestar.com', 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -317,7 +317,7 @@ class GenericIE(InfoExtractor): }, { # https://github.com/rg3/youtube-dl/issues/3541 - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', 'info_dict': { 'id': '3866516442001', @@ -1311,7 +1311,7 @@ def _playlist_from_matches(matches, getter=None, ie=None): entries = [{ '_type': 'url', 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'Brightcove' + 'ie_key': 'BrightcoveLegacy' } for bc_url in bc_urls] return { diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index dab487ea4..0fba55833 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -25,7 +25,7 @@ def _extract_url_result(self, post): bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) if bc_url is 
None: raise ExtractorError('Could not find player definition') - return self.url_result(bc_url, 'Brightcove') + return self.url_result(bc_url, 'BrightcoveLegacy') elif source == 'vimeo': return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') elif source == 'youtube': diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 4f1f05c6a..e9e33d0a3 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -116,7 +116,7 @@ def _real_extract(self, url): if not bc_url: raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) - return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'Brightcove') + return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy') class SafariCourseIE(SafariBaseIE): diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index 2f190f764..ebb5d6ec0 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -10,7 +10,7 @@ class SpaceIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' _TEST = { - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', 'info_dict': { 'id': '2780937028001', From 75eac8961ee2ff004891ec57d5a2fec4f0b5574d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Nov 2015 06:07:24 +0600 Subject: [PATCH 15/18] [brightcove] Remove unused import --- youtube_dl/extractor/brightcove.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 1c7783dcb..ef34ae48f 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -26,7 +26,6 @@ float_or_none, int_or_none, parse_iso8601, - extract_attributes, ) From c7b959ce383040f1d507eef0e43041029583b307 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Nov 2015 06:07:44 +0600 Subject: [PATCH 16/18] [utils] Remove unused function --- youtube_dl/utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 65556d056..d39f313a4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -259,15 +259,6 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) -def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'): - attributes = re.findall(attributes_regex, attributes_str) - attributes_dict = {} - if attributes: - for (attribute_name, attribute_value) in attributes: - attributes_dict[attribute_name] = attribute_value - return attributes_dict - - def clean_html(html): """Clean an HTML snippet into a readable string""" From fd91257c4019a1956cc59eac1232f2c413b9747d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Nov 2015 06:08:36 +0600 Subject: [PATCH 17/18] [brightcove] Order imports alphabetically --- youtube_dl/extractor/brightcove.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ef34ae48f..f137ba8c6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -20,12 +20,12 @@ ExtractorError, find_xpath_attr, fix_xml_ampersands, - unescapeHTML, - unsmuggle_url, - js_to_json, float_or_none, + js_to_json, int_or_none, parse_iso8601, + unescapeHTML, + unsmuggle_url, ) From e01b432ad38b36b1ba6cb1b6dccecec51f9fc1e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 Nov 2015 06:11:17 +0600 Subject: [PATCH 18/18] [brightcove:new] Fix test --- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py 
b/youtube_dl/extractor/brightcove.py index f137ba8c6..6b184157c 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -366,7 +366,7 @@ class BrightcoveNewIE(InfoExtractor): 'description': 'md5:eac376a4fe366edc70279bfb681aea16', 'timestamp': 1441391203, 'upload_date': '20150904', - 'duration': 165768, + 'duration': 165.768, 'uploader_id': '929656772001', } }