From 574b2a7393ef389792d5010704e505ef0eaaa5e8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 16:07:36 +0100 Subject: [PATCH] [nbc:nbcnews] improve extraction(fixes #6922) - extract more metadata and formats - relax regex --- youtube_dl/extractor/nbc.py | 137 ++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 2202cfa33..bb0817e34 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,13 +3,16 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from .theplatform import ThePlatformIE from ..utils import ( - ExtractorError, find_xpath_attr, lowercase_escape, smuggle_url, unescapeHTML, + update_url_query, + int_or_none, + HEADRequest, + parse_iso8601, ) @@ -131,10 +134,10 @@ def _real_extract(self, url): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') -class NBCNewsIE(InfoExtractor): +class NBCNewsIE(ThePlatformIE): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P<id>\d+)| - (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+)) + ([^/]+/)*(?P<display_id>[^/?]+)) ''' _TESTS = [ @@ -149,15 +152,14 @@ class NBCNewsIE(InfoExtractor): }, }, { - 'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236', - 'md5': 'b2421750c9f260783721d898f4c42063', + 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', + 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'I1wpAI_zmhsQ', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', }, - 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', @@ -168,17 +170,29 @@ class NBCNewsIE(InfoExtractor): 'title': 'FULL EPISODE: Family 
Business', 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', }, + 'skip': 'This page is unavailable.', }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d', + 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'sekXqyTVnmN3', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', }, }, + { + 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', + 'md5': 'a49e173825e5fcd15c13fc297fced39d', + 'info_dict': { + 'id': '529953347624', + 'ext': 'mp4', + 'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', + 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + }, + 'expected_warnings': ['http-6000 is not available'] + }, { 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', 'only_matching': True, @@ -202,49 +216,80 @@ def _real_extract(self, url): } else: # "feature" and "nightly-news" pages use theplatform.com - title = mobj.group('title') - webpage = self._download_webpage(url, title) + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + info = None bootstrap_json = self._search_regex( - r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', - webpage, 'bootstrap json', flags=re.MULTILINE) - bootstrap = self._parse_json(bootstrap_json, video_id) - info = bootstrap['results'][0]['video'] - mpxid = info['mpxId'] + r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', + webpage, 'bootstrap json', default=None) + if bootstrap_json: + bootstrap = self._parse_json(bootstrap_json, display_id) + info = bootstrap['results'][0]['video'] + else: + player_instance_json = self._search_regex( + r'videoObj\s*:\s*({.+})', webpage, 'player instance') + info = 
self._parse_json(player_instance_json, display_id) + video_id = info['mpxId'] + title = info['title'] - base_urls = [ - info['fallbackPlaylistUrl'], - info['associatedPlaylistUrl'], - ] + subtitles = {} + caption_links = info.get('captionLinks') + if caption_links: + for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): + sub_url = caption_links.get(sub_key) + if sub_url: + subtitles.setdefault('en', []).append({ + 'url': sub_url, + 'ext': sub_ext, + }) - for base_url in base_urls: - if not base_url: + formats = [] + for video_asset in info['videoAssets']: + video_url = video_asset.get('publicUrl') + if not video_url: continue - playlist_url = base_url + '?form=MPXNBCNewsAPI' - - try: - all_videos = self._download_json(playlist_url, title) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError): - continue - raise - - if not all_videos or 'videos' not in all_videos: + container = video_asset.get('format') + asset_type = video_asset.get('assetType') or '' + if container == 'ISM' or asset_type == 'FireTV-Once': continue - - try: - info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid) - break - except StopIteration: - continue - - if info is None: - raise ExtractorError('Could not find video in playlists') + elif asset_type == 'OnceURL': + tp_formats, tp_subtitles = self._extract_theplatform_smil( + video_url, video_id) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + else: + tbr = int_or_none(video_asset.get('bitRate'), 1000) + format_id = 'http%s' % ('-%d' % tbr if tbr else '') + video_url = update_url_query( + video_url, {'format': 'redirect'}) + # resolve the url so that we can check availability and detect the correct extension + head = self._request_webpage( + HEADRequest(video_url), video_id, + 'Checking %s url' % format_id, + '%s is not available' % format_id, + fatal=False) + if head: + video_url = head.geturl() + formats.append({ + 'format_id': 
format_id, + 'url': video_url, + 'width': int_or_none(video_asset.get('width')), + 'height': int_or_none(video_asset.get('height')), + 'tbr': tbr, + 'container': video_asset.get('format'), + }) + self._sort_formats(formats) return { - '_type': 'url', - # We get the best quality video - 'url': info['videoAssets'][-1]['publicUrl'], - 'ie_key': 'ThePlatform', + 'id': video_id, + 'title': title, + 'description': info.get('description'), + 'thumbnail': info.get('description'), + 'thumbnail': info.get('thumbnail'), + 'duration': int_or_none(info.get('duration')), + 'timestamp': parse_iso8601(info.get('pubDate')), + 'formats': formats, + 'subtitles': subtitles, }