[crunchyroll] parse vilos media data (closes #17343)

This commit is contained in:
Remita Amine 2018-09-01 08:16:28 +01:00
parent ed6919e737
commit 54a5be4dba
2 changed files with 141 additions and 112 deletions

View File

@ -7,7 +7,7 @@
from hashlib import sha1 from hashlib import sha1
from math import pow, sqrt, floor from math import pow, sqrt, floor
from .common import InfoExtractor from .vrv import VRVIE
from ..compat import ( from ..compat import (
compat_b64decode, compat_b64decode,
compat_etree_fromstring, compat_etree_fromstring,
@ -18,6 +18,8 @@
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
bytes_to_intlist, bytes_to_intlist,
extract_attributes,
float_or_none,
intlist_to_bytes, intlist_to_bytes,
int_or_none, int_or_none,
lowercase_escape, lowercase_escape,
@ -26,14 +28,13 @@
unified_strdate, unified_strdate,
urlencode_postdata, urlencode_postdata,
xpath_text, xpath_text,
extract_attributes,
) )
from ..aes import ( from ..aes import (
aes_cbc_decrypt, aes_cbc_decrypt,
) )
class CrunchyrollBaseIE(InfoExtractor): class CrunchyrollBaseIE(VRVIE):
_LOGIN_URL = 'https://www.crunchyroll.com/login' _LOGIN_URL = 'https://www.crunchyroll.com/login'
_LOGIN_FORM = 'login_form' _LOGIN_FORM = 'login_form'
_NETRC_MACHINE = 'crunchyroll' _NETRC_MACHINE = 'crunchyroll'
@ -148,7 +149,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Wanna be the Strongest in the World Episode 1 An Idol-Wrestler is Born!', 'title': 'Wanna be the Strongest in the World Episode 1 An Idol-Wrestler is Born!',
'description': 'md5:2d17137920c64f2f49981a7797d275ef', 'description': 'md5:2d17137920c64f2f49981a7797d275ef',
'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'uploader': 'Yomiuri Telecasting Corporation (YTV)',
'upload_date': '20131013', 'upload_date': '20131013',
'url': 're:(?!.*&amp)', 'url': 're:(?!.*&amp)',
@ -221,7 +222,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'info_dict': { 'info_dict': {
'id': '535080', 'id': '535080',
'ext': 'mp4', 'ext': 'mp4',
'title': '11eyes Episode 1 Piros éjszaka - Red Night', 'title': '11eyes Episode 1 Red Night ~ Piros éjszaka',
'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".', 'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',
'uploader': 'Marvelous AQL Inc.', 'uploader': 'Marvelous AQL Inc.',
'upload_date': '20091021', 'upload_date': '20091021',
@ -437,13 +438,18 @@ def _real_extract(self, url):
if 'To view this, please log in to verify you are 18 or older.' in webpage: if 'To view this, please log in to verify you are 18 or older.' in webpage:
self.raise_login_required() self.raise_login_required()
media = self._parse_json(self._search_regex(
r'vilos\.config\.media\s*=\s*({.+?});',
webpage, 'vilos media', default='{}'), video_id)
media_metadata = media.get('metadata') or {}
video_title = self._html_search_regex( video_title = self._html_search_regex(
r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
webpage, 'video_title') webpage, 'video_title')
video_title = re.sub(r' {2,}', ' ', video_title) video_title = re.sub(r' {2,}', ' ', video_title)
video_description = self._parse_json(self._html_search_regex( video_description = (self._parse_json(self._html_search_regex(
r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
webpage, 'description', default='{}'), video_id).get('description') webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
if video_description: if video_description:
video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
video_upload_date = self._html_search_regex( video_upload_date = self._html_search_regex(
@ -456,6 +462,12 @@ def _real_extract(self, url):
[r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
webpage, 'video_uploader', fatal=False) webpage, 'video_uploader', fatal=False)
formats = []
for stream in media.get('streams', []):
formats.extend(self._extract_vrv_formats(
stream.get('url'), video_id, stream.get('format'),
stream.get('audio_lang'), stream.get('hardsub_lang')))
if not formats:
available_fmts = [] available_fmts = []
for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
attrs = extract_attributes(a) attrs = extract_attributes(a)
@ -468,8 +480,10 @@ def _real_extract(self, url):
available_fmts = re.findall(p, webpage) available_fmts = re.findall(p, webpage)
if available_fmts: if available_fmts:
break break
if not available_fmts:
available_fmts = self._FORMAT_IDS.keys()
video_encode_ids = [] video_encode_ids = []
formats = []
for fmt in available_fmts: for fmt in available_fmts:
stream_quality, stream_format = self._FORMAT_IDS[fmt] stream_quality, stream_format = self._FORMAT_IDS[fmt]
video_format = fmt + 'p' video_format = fmt + 'p'
@ -549,6 +563,16 @@ def _real_extract(self, url):
'media_id': video_id, 'media_id': video_id,
}) })
subtitles = {}
for subtitle in media.get('subtitles', []):
subtitle_url = subtitle.get('url')
if not subtitle_url:
continue
subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
'url': subtitle_url,
'ext': subtitle.get('format', 'ass'),
})
if not subtitles:
subtitles = self.extract_subtitles(video_id, webpage) subtitles = self.extract_subtitles(video_id, webpage)
# webpage provide more accurate data than series_title from XML # webpage provide more accurate data than series_title from XML
@ -557,8 +581,8 @@ def _real_extract(self, url):
webpage, 'series', fatal=False) webpage, 'series', fatal=False)
season = xpath_text(metadata, 'series_title') season = xpath_text(metadata, 'series_title')
episode = xpath_text(metadata, 'episode_title') episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title')
episode_number = int_or_none(xpath_text(metadata, 'episode_number')) episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number'))
season_number = int_or_none(self._search_regex( season_number = int_or_none(self._search_regex(
r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
@ -568,7 +592,8 @@ def _real_extract(self, url):
'id': video_id, 'id': video_id,
'title': video_title, 'title': video_title,
'description': video_description, 'description': video_description,
'thumbnail': xpath_text(metadata, 'episode_image_url'), 'duration': float_or_none(media_metadata.get('duration'), 1000),
'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'),
'uploader': video_uploader, 'uploader': video_uploader,
'upload_date': video_upload_date, 'upload_date': video_upload_date,
'series': series, 'series': series,

View File

@ -72,7 +72,7 @@ def _get_cms_resource(self, resource_key, video_id):
class VRVIE(VRVBaseIE): class VRVIE(VRVBaseIE):
IE_NAME = 'vrv' IE_NAME = 'vrv'
_VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
_TEST = { _TESTS = [{
'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
'info_dict': { 'info_dict': {
'id': 'GR9PNZ396', 'id': 'GR9PNZ396',
@ -85,7 +85,28 @@ class VRVIE(VRVBaseIE):
# m3u8 download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
} }]
def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
    """Extract the adaptive formats of a single VRV-style stream.

    :param url: manifest URL of the stream; a falsy value yields no formats.
    :param video_id: id forwarded to the manifest extraction helpers.
    :param stream_format: ``'hls'`` or ``'dash'``; anything else yields
        no formats.
    :param audio_lang: audio language of the stream, or ``None``. When set
        it is stored as ``language`` on every format whose ``acodec`` is
        not ``'none'``.
    :param hardsub_lang: language of burnt-in subtitles, or ``None``. It
        takes precedence over ``audio_lang`` when naming the formats.
    :return: the list produced by ``_extract_m3u8_formats`` or
        ``_extract_mpd_formats`` (both called with ``fatal=False``), or an
        empty list when the stream is skipped.
    """
    if not url or stream_format not in ('hls', 'dash'):
        return []

    stream_id = hardsub_lang or audio_lang
    if stream_id:
        format_id = '%s-%s' % (stream_format, stream_id)
        label = stream_id
    else:
        # Neither language is known: fall back to the bare stream format so
        # that no literal 'None' leaks into the format id or the note.
        format_id = stream_format
        label = stream_format

    if stream_format == 'hls':
        adaptive_formats = self._extract_m3u8_formats(
            url, video_id, 'mp4', m3u8_id=format_id,
            note='Downloading %s m3u8 information' % label,
            fatal=False)
    else:
        # The guard above leaves 'dash' as the only other possibility.
        adaptive_formats = self._extract_mpd_formats(
            url, video_id, mpd_id=format_id,
            note='Downloading %s MPD information' % label,
            fatal=False)

    if audio_lang:
        # Tag only the formats that carry audio; video-only ones are
        # reported with acodec == 'none'.
        for f in adaptive_formats:
            if f.get('acodec') != 'none':
                f['language'] = audio_lang
    return adaptive_formats
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -115,26 +136,9 @@ def _real_extract(self, url):
for stream_type, streams in streams_json.get('streams', {}).items(): for stream_type, streams in streams_json.get('streams', {}).items():
if stream_type in ('adaptive_hls', 'adaptive_dash'): if stream_type in ('adaptive_hls', 'adaptive_dash'):
for stream in streams.values(): for stream in streams.values():
stream_url = stream.get('url') formats.extend(self._extract_vrv_formats(
if not stream_url: stream.get('url'), video_id, stream_type.split('_')[1],
continue audio_locale, stream.get('hardsub_locale')))
stream_id = stream.get('hardsub_locale') or audio_locale
format_id = '%s-%s' % (stream_type.split('_')[1], stream_id)
if stream_type == 'adaptive_hls':
adaptive_formats = self._extract_m3u8_formats(
stream_url, video_id, 'mp4', m3u8_id=format_id,
note='Downloading %s m3u8 information' % stream_id,
fatal=False)
else:
adaptive_formats = self._extract_mpd_formats(
stream_url, video_id, mpd_id=format_id,
note='Downloading %s MPD information' % stream_id,
fatal=False)
if audio_locale:
for f in adaptive_formats:
if f.get('acodec') != 'none':
f['language'] = audio_locale
formats.extend(adaptive_formats)
self._sort_formats(formats) self._sort_formats(formats)
subtitles = {} subtitles = {}