[mtv] fix mtv.com and more(?)

This commit is contained in:
Unknown 2020-10-09 07:06:49 +02:00
parent 39b7f3ec15
commit b6e0c7d2e3
2 changed files with 46 additions and 2 deletions

View File

@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
compat_xpath, compat_xpath,
compat_urlparse,
) )
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
@ -22,6 +23,7 @@ from ..utils import (
unescapeHTML, unescapeHTML,
update_url_query, update_url_query,
url_basename, url_basename,
get_domain,
xpath_text, xpath_text,
) )
@ -253,7 +255,39 @@ class MTVServicesInfoExtractor(InfoExtractor):
return try_get(feed, lambda x: x['result']['data']['id'], compat_str) return try_get(feed, lambda x: x['result']['data']['id'], compat_str)
def _extract_new_triforce_mgid(self, webpage, url='', data_zone=None, video_id=None):
    """Derive an mgid from the site's triforce v8 manifest feed.

    The manifest is requested from
    ``https://<domain>/feeds/triforce/manifest/v8?url=<quoted page url>``.
    When the manifest announces a redirect, the manifest for the new
    location is requested once more from the same domain.

    ``webpage`` and ``data_zone`` are accepted but not used here.

    Returns the string ``mgid:arc:episode:<domain>:<itemId>``, or None
    when no item id can be found (download failure, unexpected JSON).

    Raises ExtractorError (expected) when no domain can be derived from
    ``url``.
    """
    domain = get_domain(url)
    if domain is None:
        raise ExtractorError(
            '[%s] could not get domain' % self.IE_NAME,
            expected=True)

    def download_manifest(page_url):
        # The page URL is always sent with an http:// scheme.
        # NOTE(review): presumably the feed only knows http URLs - confirm.
        page_url = page_url.replace('https://', 'http://')
        # NOTE(review): compat_urlparse.quote may be unavailable on
        # Python 2 (urlparse module); verify against ..compat.
        manifest_url = 'https://%s/feeds/triforce/manifest/v8?url=%s' % (
            domain, compat_urlparse.quote(page_url, safe=''))
        return self._download_json(manifest_url, video_id, fatal=False)

    manifest = download_manifest(url)

    # The download is non-fatal, so ``manifest`` may not be a dict at all;
    # every lookup goes through try_get instead of chained .get() calls so
    # that a failed download or missing key does not raise here.
    if try_get(manifest, lambda x: x['manifest']['type']) == 'redirect':
        new_loc = try_get(
            manifest, lambda x: x['manifest']['newLocation'], compat_str)
        if new_loc:
            self.to_screen('Found a redirect. Downloading manifest from new location')
            manifest = download_manifest(new_loc)

    item_id = try_get(
        manifest, lambda x: x['manifest']['reporting']['itemId'], compat_str)
    if not item_id:
        self.to_screen('Found no id!')
        return

    # 'episode' can be anything. 'content' is used often as well
    return 'mgid:arc:episode:%s:%s' % (domain, item_id)
def _extract_mgid(self, webpage, url):
try: try:
# the url can be http://media.mtvnservices.com/fb/{mgid}.swf # the url can be http://media.mtvnservices.com/fb/{mgid}.swf
# or http://media.mtvnservices.com/{mgid} # or http://media.mtvnservices.com/{mgid}
@ -275,6 +309,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
mgid = self._search_regex( mgid = self._search_regex(
r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None) r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None)
if not mgid:
mgid = self._extract_new_triforce_mgid(webpage, url)
if not mgid: if not mgid:
mgid = self._extract_triforce_mgid(webpage) mgid = self._extract_triforce_mgid(webpage)
@ -283,7 +320,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
title = url_basename(url) title = url_basename(url)
webpage = self._download_webpage(url, title) webpage = self._download_webpage(url, title)
mgid = self._extract_mgid(webpage) mgid = self._extract_mgid(webpage, url)
videos_info = self._get_videos_info(mgid) videos_info = self._get_videos_info(mgid)
return videos_info return videos_info

View File

@ -1984,6 +1984,7 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
class HTMLAttributeParser(compat_HTMLParser): class HTMLAttributeParser(compat_HTMLParser):
"""Trivial HTML parser to gather the attributes for a single element""" """Trivial HTML parser to gather the attributes for a single element"""
def __init__(self): def __init__(self):
self.attrs = {} self.attrs = {}
compat_HTMLParser.__init__(self) compat_HTMLParser.__init__(self)
@ -2378,6 +2379,7 @@ class GeoRestrictedError(ExtractorError):
This exception may be thrown when a video is not available from your This exception may be thrown when a video is not available from your
geographic location due to geographic restrictions imposed by a website. geographic location due to geographic restrictions imposed by a website.
""" """
def __init__(self, msg, countries=None): def __init__(self, msg, countries=None):
super(GeoRestrictedError, self).__init__(msg, expected=True) super(GeoRestrictedError, self).__init__(msg, expected=True)
self.msg = msg self.msg = msg
@ -3558,6 +3560,11 @@ def remove_quotes(s):
return s return s
def get_domain(url):
    """Return the host part of *url* without scheme and leading "www.".

    The scheme (http/https) and a leading "www." are optional and are
    dropped. The result is everything up to the first "/", "?" or "#",
    so a port, if present, stays part of the returned value. The host
    must contain at least one dot.

    Returns None when *url* holds no such host or is not a string.
    """
    try:
        mobj = re.match(
            r'(?:https?://)?(?:www\.)?(?P<domain>[^\n/?#]+\.[^\n/?#]+)',
            url)
    except TypeError:
        # url is not a string (e.g. None): there is no domain to report
        return None
    return mobj.group('domain') if mobj else None
def url_basename(url): def url_basename(url):
path = compat_urlparse.urlparse(url).path path = compat_urlparse.urlparse(url).path
return path.strip('/').split('/')[-1] return path.strip('/').split('/')[-1]