[kinja] add support for Kinja embeds

closes #5756
closes #11282
closes #22237
closes #22384
This commit is contained in:
Remita Amine 2019-11-06 19:56:10 +01:00
parent d64ec1242e
commit 55adb63e54
4 changed files with 241 additions and 52 deletions

View File

@ -513,6 +513,7 @@
from .ketnet import KetnetIE from .ketnet import KetnetIE
from .khanacademy import KhanAcademyIE from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE from .kinopoisk import KinoPoiskIE
from .konserthusetplay import KonserthusetPlayIE from .konserthusetplay import KonserthusetPlayIE
from .kontrtube import KontrTubeIE from .kontrtube import KontrTubeIE

View File

@ -119,6 +119,7 @@
from .expressen import ExpressenIE from .expressen import ExpressenIE
from .zype import ZypeIE from .zype import ZypeIE
from .odnoklassniki import OdnoklassnikiIE from .odnoklassniki import OdnoklassnikiIE
from .kinja import KinjaEmbedIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1487,16 +1488,18 @@ class GenericIE(InfoExtractor):
'timestamp': 1432570283, 'timestamp': 1432570283,
}, },
}, },
# OnionStudios embed # Kinja embed
{ {
'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
'info_dict': { 'info_dict': {
'id': '2855', 'id': '106351',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Dont Understand Bitcoin? This Man Will Mumble An Explanation At You', 'title': 'Dont Understand Bitcoin? This Man Will Mumble An Explanation At You',
'description': 'Migrated from OnionStudios',
'thumbnail': r're:^https?://.*\.jpe?g$', 'thumbnail': r're:^https?://.*\.jpe?g$',
'uploader': 'ClickHole', 'uploader': 'clickhole',
'uploader_id': 'clickhole', 'upload_date': '20150527',
'timestamp': 1432744860,
} }
}, },
# SnagFilms embed # SnagFilms embed
@ -2894,6 +2897,12 @@ def _real_extract(self, url):
if senate_isvp_url: if senate_isvp_url:
return self.url_result(senate_isvp_url, 'SenateISVP') return self.url_result(senate_isvp_url, 'SenateISVP')
# Look for Kinja embeds
kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url)
if kinja_embed_urls:
return self.playlist_from_matches(
kinja_embed_urls, video_id, video_title)
# Look for OnionStudios embeds # Look for OnionStudios embeds
onionstudios_url = OnionStudiosIE._extract_url(webpage) onionstudios_url = OnionStudiosIE._extract_url(webpage)
if onionstudios_url: if onionstudios_url:

View File

@ -0,0 +1,221 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urllib_parse_unquote,
)
from ..utils import (
int_or_none,
parse_iso8601,
strip_or_none,
try_get,
unescapeHTML,
urljoin,
)
class KinjaEmbedIE(InfoExtractor):
IENAME = 'kinja:embed'
_DOMAIN_REGEX = r'''(?:[^.]+\.)?
(?:
avclub|
clickhole|
deadspin|
gizmodo|
jalopnik|
jezebel|
kinja|
kotaku|
lifehacker|
splinternews|
the(?:inventory|onion|root|takeout)
)\.com'''
_COMMON_REGEX = r'''/
(?:
ajax/inset|
embed/video
)/iframe\?.*?\bid='''
_VALID_URL = r'''(?x)https?://%s%s
(?P<type>
fb|
imgur|
instagram|
jwp(?:layer)?-video|
kinjavideo|
mcp|
megaphone|
ooyala|
soundcloud(?:-playlist)?|
tumblr-post|
twitch-stream|
twitter|
ustream-channel|
vimeo|
vine|
youtube-(?:list|video)
)-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX)
_TESTS = [{
'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E',
'only_matching': True,
}, {
'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE',
'only_matching': True,
}]
_JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform')
_PROVIDER_MAP = {
'fb': ('facebook.com/video.php?v=', 'Facebook'),
'imgur': ('imgur.com/', 'Imgur'),
'instagram': ('instagram.com/p/', 'Instagram'),
'jwplayer-video': _JWPLATFORM_PROVIDER,
'jwp-video': _JWPLATFORM_PROVIDER,
'megaphone': ('player.megaphone.fm/', 'Generic'),
'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'),
'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'),
'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'),
'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'),
'twitch-stream': ('twitch.tv/', 'TwitchStream'),
'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'),
'ustream-channel': ('ustream.tv/embed/', 'Ustream'),
'vimeo': ('vimeo.com/', 'Vimeo'),
'vine': ('vine.co/v/', 'Vine'),
'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'),
'youtube-video': ('youtube.com/embed/', 'Youtube'),
}
@staticmethod
def _extract_urls(webpage, url):
return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer(
r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX),
webpage)]
def _real_extract(self, url):
video_type, video_id = re.match(self._VALID_URL, url).groups()
provider = self._PROVIDER_MAP.get(video_type)
if provider:
video_id = compat_urllib_parse_unquote(video_id)
if video_type == 'tumblr-post':
video_id, blog = video_id.split('-', 1)
result_url = provider[0] % (blog, video_id)
elif video_type == 'youtube-list':
video_id, playlist_id = video_id.split('/')
result_url = provider[0] % (video_id, playlist_id)
else:
if video_type == 'ooyala':
video_id = video_id.split('/')[0]
result_url = provider[0] + video_id
return self.url_result('http://' + result_url, provider[1])
if video_type == 'kinjavideo':
data = self._download_json(
'https://kinja.com/api/core/video/views/videoById',
video_id, query={'videoId': video_id})['data']
title = data['title']
formats = []
for k in ('signedPlaylist', 'streaming'):
m3u8_url = data.get(k + 'Url')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
self._sort_formats(formats)
thumbnail = None
poster = data.get('poster') or {}
poster_id = poster.get('id')
if poster_id:
thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg')
return {
'id': video_id,
'title': title,
'description': strip_or_none(data.get('description')),
'formats': formats,
'tags': data.get('tags'),
'timestamp': int_or_none(try_get(
data, lambda x: x['postInfo']['publishTimeMillis']), 1000),
'thumbnail': thumbnail,
'uploader': data.get('network'),
}
else:
video_data = self._download_json(
'https://api.vmh.univision.com/metadata/v1/content/' + video_id,
video_id)['videoMetadata']
iptc = video_data['photoVideoMetadataIPTC']
title = iptc['title']['en']
fmg = video_data.get('photoVideoMetadata_fmg') or {}
tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com'
data = self._download_json(
tvss_domain + '/api/v3/video-auth/url-signature-tokens',
video_id, query={'mcpids': video_id})['data'][0]
formats = []
rendition_url = data.get('renditionUrl')
if rendition_url:
formats = self._extract_m3u8_formats(
rendition_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False)
fallback_rendition_url = data.get('fallbackRenditionUrl')
if fallback_rendition_url:
formats.append({
'format_id': 'fallback',
'tbr': int_or_none(self._search_regex(
r'_(\d+)\.mp4', fallback_rendition_url,
'bitrate', default=None)),
'url': fallback_rendition_url,
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str),
'uploader': fmg.get('network'),
'duration': int_or_none(iptc.get('fileDuration')),
'formats': formats,
'description': try_get(iptc, lambda x: x['description']['en'], compat_str),
'timestamp': parse_iso8601(iptc.get('dateReleased')),
}

View File

@ -4,13 +4,8 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..compat import compat_str
compat_str, from ..utils import js_to_json
int_or_none,
js_to_json,
parse_iso8601,
try_get,
)
class OnionStudiosIE(InfoExtractor): class OnionStudiosIE(InfoExtractor):
@ -20,7 +15,7 @@ class OnionStudiosIE(InfoExtractor):
'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
'md5': '5a118d466d62b5cd03647cf2c593977f', 'md5': '5a118d466d62b5cd03647cf2c593977f',
'info_dict': { 'info_dict': {
'id': '2937', 'id': '3459881',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Hannibal charges forward, stops for a cocktail', 'title': 'Hannibal charges forward, stops for a cocktail',
'description': 'md5:545299bda6abf87e5ec666548c6a9448', 'description': 'md5:545299bda6abf87e5ec666548c6a9448',
@ -53,43 +48,6 @@ def _real_extract(self, url):
mcp_id = compat_str(self._parse_json(self._search_regex( mcp_id = compat_str(self._parse_json(self._search_regex(
r'window\.mcpMapping\s*=\s*({.+?});', webpage, r'window\.mcpMapping\s*=\s*({.+?});', webpage,
'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id']) 'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id'])
video_data = self._download_json( return self.url_result(
'https://api.vmh.univision.com/metadata/v1/content/' + mcp_id, 'http://kinja.com/ajax/inset/iframe?id=mcp-' + mcp_id,
mcp_id)['videoMetadata'] 'KinjaEmbed', mcp_id)
iptc = video_data['photoVideoMetadataIPTC']
title = iptc['title']['en']
fmg = video_data.get('photoVideoMetadata_fmg') or {}
tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com'
data = self._download_json(
tvss_domain + '/api/v3/video-auth/url-signature-tokens',
mcp_id, query={'mcpids': mcp_id})['data'][0]
formats = []
rendition_url = data.get('renditionUrl')
if rendition_url:
formats = self._extract_m3u8_formats(
rendition_url, mcp_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False)
fallback_rendition_url = data.get('fallbackRenditionUrl')
if fallback_rendition_url:
formats.append({
'format_id': 'fallback',
'tbr': int_or_none(self._search_regex(
r'_(\d+)\.mp4', fallback_rendition_url,
'bitrate', default=None)),
'url': fallback_rendition_url,
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str),
'uploader': fmg.get('network'),
'duration': int_or_none(iptc.get('fileDuration')),
'formats': formats,
'description': try_get(iptc, lambda x: x['description']['en'], compat_str),
'timestamp': parse_iso8601(iptc.get('dateReleased')),
}