Use _download_xml in more extractors

This commit is contained in:
Jaime Marquínez Ferrándiz 2013-12-10 21:03:53 +01:00
parent a0088bdf93
commit 1825836235
4 changed files with 25 additions and 22 deletions

View File

@ -1,5 +1,4 @@
import re import re
import xml.etree.ElementTree
import json import json
from .common import InfoExtractor from .common import InfoExtractor
@ -65,18 +64,18 @@ def _real_extract(self, url):
uploader_id = mobj.group('company') uploader_id = mobj.group('company')
playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
playlist_snippet = self._download_webpage(playlist_url, movie) def fix_html(s):
playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
# The ' in the onClick attributes are not escaped, it couldn't be parsed # The ' in the onClick attributes are not escaped, it couldn't be parsed
# with xml.etree.ElementTree.fromstring
# like: http://trailers.apple.com/trailers/wb/gravity/ # like: http://trailers.apple.com/trailers/wb/gravity/
def _clean_json(m): def _clean_json(m):
return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) s = re.sub(self._JSON_RE, _clean_json, s)
playlist_html = u'<html>' + playlist_cleaned + u'</html>' s = u'<html>' + s + u'</html>'
return s
doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
doc = xml.etree.ElementTree.fromstring(playlist_html)
playlist = [] playlist = []
for li in doc.findall('./div/ul/li'): for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick'] on_click = li.find('.//a').attrib['onClick']

View File

@ -1,9 +1,9 @@
import re import re
import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
find_xpath_attr, find_xpath_attr,
fix_xml_all_ampersand,
) )
@ -30,12 +30,10 @@ def _real_extract(self, url):
# it includes a required token # it includes a required token
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
playlist_page = self._download_webpage( pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
video_id, u'Downloading video info') video_id, u'Downloading video info',
# Fix broken xml transform_source=fix_xml_all_ampersand)
playlist_page = re.sub('&', '&amp;', playlist_page)
pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
track_doc = pdoc.find('trackList/track') track_doc = pdoc.find('trackList/track')
def find_param(name): def find_param(name):

View File

@ -1,8 +1,10 @@
import re import re
import xml.etree.ElementTree
import operator import operator
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
fix_xml_all_ampersand,
)
class MetacriticIE(InfoExtractor): class MetacriticIE(InfoExtractor):
@ -23,9 +25,8 @@ def _real_extract(self, url):
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&' # The xml is not well formatted, there are raw '&'
info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
video_id, u'Downloading info xml').replace('&', '&amp;') video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = [] formats = []

View File

@ -1057,3 +1057,8 @@ def month_by_name(name):
return ENGLISH_NAMES.index(name) + 1 return ENGLISH_NAMES.index(name) + 1
except ValueError: except ValueError:
return None return None
def fix_xml_all_ampersand(xml_str):
    """Escape every '&' in an XML string as '&amp;'.

    The substitution is unconditional: an ampersand that already starts an
    entity is escaped as well, so '&amp;' becomes '&amp;amp;'.
    """
    # Cut the text at each '&' and stitch the pieces back together with the
    # escaped form between them.
    pieces = xml_str.split(u'&')
    return u'&amp;'.join(pieces)