From 979a9dd4c4d46e0f2b11bc4bcac51ad8d446d186 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 11:57:13 +0200 Subject: [PATCH] _html_search_regex with clean_html superpowers --- test/tests.json | 2 +- youtube_dl/InfoExtractors.py | 151 ++++++++++++++++------------------- 2 files changed, 72 insertions(+), 81 deletions(-) diff --git a/test/tests.json b/test/tests.json index c39d1d9c1..82da27d5b 100644 --- a/test/tests.json +++ b/test/tests.json @@ -325,7 +325,7 @@ "file": "wshh6a7q1ny0G34ZwuIO.mp4", "md5": "9d04de741161603bf7071bbf4e883186", "info_dict": { - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick! " + "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } }, { diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 86cc7c748..6060a5988 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -222,6 +222,16 @@ def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0 u'please report this issue on GitHub.' % _name) return None + def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags) + if res: + return clean_html(res).strip() + else: + return res + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. @@ -1923,9 +1933,8 @@ def _real_extract(self, url): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - video_title = self._search_regex('

([^<]+)

', + video_title = self._html_search_regex('

([^<]+)

', webpage, u'title') - video_title = unescapeHTML(video_title) info = { 'id': video_id, @@ -2087,7 +2096,7 @@ def _real_extract(self,url): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - video_title = self._search_regex('([^<]+)', + video_title = self._html_search_regex('([^<]+)', webpage, u'title') video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') @@ -2169,7 +2178,7 @@ def _real_extract(self,url): video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') video_swfobj = compat_urllib_parse.unquote(video_swfobj) - video_title = self._search_regex("(.*?)", + video_title = self._html_search_regex("(.*?)", webpage, u'title') return [{ @@ -2371,17 +2380,14 @@ def _real_extract(self, url): self.report_extraction(showName) webpage = self._download_webpage(url, showName) - videoDesc = self._search_regex('(.*?)\s+-\s+XVID', + video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID', webpage, u'title') # Extract video thumbnail @@ -2665,7 +2671,7 @@ def _real_extract(self, url): webpage, u'title') # Extract description - video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] @@ -2837,12 +2843,10 @@ def _real_extract(self, url): note='Downloading course info page', errnote='Unable to download course info page') - info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['title'] = unescapeHTML(info['title']) + info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['description'] = self._search_regex('<description>([^<]+)</description>', + info['description'] = self._html_search_regex('<description>([^<]+)</description>', coursepage, u'description', fatal=False) - if info['description']: info['description'] = unescapeHTML(info['description']) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2903,15 +2907,13 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) - song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage, u'song name', fatal=False) - if song_name: song_name = unescapeHTML(song_name) - video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', webpage, u'title') - video_title = unescapeHTML(video_title) - mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage, u'mtvn_uri', fatal=False) content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', @@ -3067,7 +3069,7 @@ def _real_extract(self, url): webpage, u'video URL') video_url = compat_urllib_parse.unquote(video_url) - video_title = self._search_regex(self.VIDEO_TITLE_RE, + video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title') video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, @@ -3108,7 +3110,7 @@ def _real_extract(self, url): self.report_extraction(video_id) # Extract update date - upload_date = self._search_regex('title="Timestamp">(.*?)</a>', + upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename @@ -3116,12 +3118,12 @@ def _real_extract(self, url): upload_date = upload_date.strftime('%Y%m%d') # Extract uploader - uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>', + uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', webpage, u'uploader', fatal=False) # Extract title # Get the first line for title - video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', + video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage, 'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video @@ -3175,13 +3177,13 @@ def _real_extract(self, url): video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] - title = self._search_regex(r'<meta property="og:title" content="(.*?)"', + title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') # It isn't there in the HTML it returns to us - # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) + # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) - description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) + description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, @@ -3337,17 +3339,14 @@ def _real_extract(self, url): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, u'video URL', flags=re.DOTALL) - video_url = unescapeHTML(video_url) - title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", + title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - title = clean_html(title) - video_description = self._search_regex(r'.+)"', + video_title = self._html_search_regex(r'data-title="(?P.+)"', webpage, u'title') - uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', + uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', webpage, u'uploader', fatal=False, flags=re.DOTALL) - if uploader: uploader = unescapeHTML(uploader.strip()) - thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', + thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage, u'thumbnail', fatal=False) info = { @@ -3454,11 +3452,11 @@ def _real_extract(self, url): else: ext = 'flv' - video_title = self._search_regex(r"<title>(.*)", + video_title = self._html_search_regex(r"(.*)", webpage_src, u'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />', + thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', webpage_src, u'thumbnail', fatal=False) if not thumbnail: @@ -3640,7 +3638,7 @@ def _real_extract(self, url): #Get the uploaded date VIDEO_UPLOADED_RE = r'
Added (?P[0-9\/]+) by' - upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, @@ -3668,7 +3666,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) # Get the video title - video_title = self._search_regex(r'(?P<title>.*)', + video_title = self._html_search_regex(r'(?P<title>.*)', webpage, u'title').strip() # Get the embed page @@ -3747,13 +3745,11 @@ def _real_extract(self, url): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'[\S\s]+?

(?P.+?)

', + uploader = self._html_search_regex(r'
[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) - if uploader: uploader = clean_html(uploader) info = { 'id': video_id, @@ -3907,9 +3903,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'
(.*?)
', + video_title = self._html_search_regex(r'
(.*?)
', webpage, u'title') - video_title = unescapeHTML(video_title) xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -3948,15 +3943,13 @@ def _real_extract(self, url): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._search_regex(r'', + video_uploader = self._html_search_regex(r'By:.*?(\w+)', webpage, u'uploader', fatal=False) info = { @@ -4033,9 +4026,8 @@ def _real_extract(self, url): # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - video_title = self._search_regex(r'(?P<title>.*?)', + video_title = self._html_search_regex(r'(?P<title>.*?)', webpage, u'title', flags=re.DOTALL) - video_title = unescapeHTML(video_title) return [{'id': video_id, 'url': video_url, @@ -4105,10 +4097,10 @@ def _real_extract(self,url): self.report_extraction(video_id) - video_url = self._search_regex(r'', + video_url = self._html_search_regex(r'', webpage, u'video URL') - video_title = self._search_regex('

(.+?)

', + video_title = self._html_search_regex('

(.+?)

', webpage, u'title') return [{ @@ -4132,7 +4124,7 @@ def _real_extract(self,url): self.report_extraction(video_id) - video_url = self._search_regex(r'.*?)]]>', @@ -4161,13 +4153,13 @@ def _real_extract(self, url): video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', webpage, u'video URL') - video_title = self._search_regex(r'.*?

(.+?)

', + uploader = self._html_search_regex(r'
.*?

(.+?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ @@ -4230,7 +4222,7 @@ def _real_extract(self, url): first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - node_id = self._search_regex(r'(\d+-\d+)', + node_id = self._html_search_regex(r'(\d+-\d+)', first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' @@ -4243,13 +4235,13 @@ def _real_extract(self, url): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - video_title = self._search_regex(r'(.*?)', + video_url = self._html_search_regex(r'(.*?)', data, u'video URL') return [{ @@ -4321,12 +4313,11 @@ def _real_extract(self,url): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - video_title = self._search_regex(r'(?P<title>.+?) - xHamster\.com', + video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', webpage, u'title') - video_title = unescapeHTML(video_title) # Can't see the description anywhere in the UI - # video_description = self._search_regex(r'Description: (?P[^<]+)', + # video_description = self._html_search_regex(r'Description: (?P[^<]+)', # webpage, u'description', fatal=False) # if video_description: video_description = unescapeHTML(video_description) @@ -4337,7 +4328,7 @@ def _real_extract(self,url): video_upload_date = None self._downloader.report_warning(u'Unable to extract upload date') - video_uploader_id = self._search_regex(r']+>(?P[^>]+)', + video_uploader_id = self._html_search_regex(r']+>(?P[^>]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', @@ -4373,7 +4364,7 @@ def _real_extract(self, url): self.report_extraction(track_id) - html_tracks = self._search_regex(r'', + html_tracks = self._html_search_regex(r'', response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks)