Update to ytdl-commit-cf2dbec

cf2dbec630

Except: [kakao] improve info extraction and detect geo restriction
d8085580f6
This commit is contained in:
pukkandan 2021-02-20 02:14:36 +05:30
parent 5e41dca334
commit bc2ca1bb75
19 changed files with 1013 additions and 395 deletions

View File

@ -12,6 +12,7 @@
from youtube_dlc.extractor import (
YoutubePlaylistIE,
YoutubeTabIE,
YoutubeIE,
)
@ -57,14 +58,22 @@ def test_youtube_toptracks(self):
entries = result['entries']
self.assertEqual(len(entries), 100)
def test_youtube_flat_playlist_titles(self):
def test_youtube_flat_playlist_extraction(self):
dl = FakeYDL()
dl.params['extract_flat'] = True
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv')
ie = YoutubeTabIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc')
self.assertIsPlaylist(result)
for entry in result['entries']:
self.assertTrue(entry.get('title'))
entries = list(result['entries'])
self.assertTrue(len(entries) == 1)
video = entries[0]
self.assertEqual(video['_type'], 'url_transparent')
self.assertEqual(video['ie_key'], 'Youtube')
self.assertEqual(video['id'], 'BaW_jenozKc')
self.assertEqual(video['url'], 'BaW_jenozKc')
self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐')
self.assertEqual(video['duration'], 10)
self.assertEqual(video['uploader'], 'Philipp Hagemeister')
if __name__ == '__main__':

View File

@ -324,20 +324,42 @@ def _real_extract(self, url):
formats = []
for a in video_node.findall('.//asset'):
file_name = xpath_text(a, './fileName', default=None)
if not file_name:
continue
format_type = a.attrib.get('type')
format_url = url_or_none(file_name)
if format_url:
ext = determine_ext(file_name)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, display_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=format_type or 'hls', fatal=False))
continue
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
update_url_query(format_url, {'hdcore': '3.7.0'}),
display_id, f4m_id=format_type or 'hds', fatal=False))
continue
f = {
'format_id': a.attrib['type'],
'width': int_or_none(a.find('./frameWidth').text),
'height': int_or_none(a.find('./frameHeight').text),
'vbr': int_or_none(a.find('./bitrateVideo').text),
'abr': int_or_none(a.find('./bitrateAudio').text),
'vcodec': a.find('./codecVideo').text,
'tbr': int_or_none(a.find('./totalBitrate').text),
'format_id': format_type,
'width': int_or_none(xpath_text(a, './frameWidth')),
'height': int_or_none(xpath_text(a, './frameHeight')),
'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
'abr': int_or_none(xpath_text(a, './bitrateAudio')),
'vcodec': xpath_text(a, './codecVideo'),
'tbr': int_or_none(xpath_text(a, './totalBitrate')),
}
if a.find('./serverPrefix').text:
f['url'] = a.find('./serverPrefix').text
f['playpath'] = a.find('./fileName').text
server_prefix = xpath_text(a, './serverPrefix', default=None)
if server_prefix:
f.update({
'url': server_prefix,
'playpath': file_name,
})
else:
f['url'] = a.find('./fileName').text
if not format_url:
continue
f['url'] = format_url
formats.append(f)
self._sort_formats(formats)

View File

@ -7,19 +7,21 @@
from .gigya import GigyaBaseIE
from ..compat import compat_HTTPError
from ..utils import (
extract_attributes,
ExtractorError,
strip_or_none,
clean_html,
extract_attributes,
float_or_none,
get_element_by_class,
int_or_none,
merge_dicts,
str_or_none,
strip_or_none,
url_or_none,
)
class CanvasIE(InfoExtractor):
_VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
_VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'md5': '68993eda72ef62386a15ea2cf3c93107',
@ -332,3 +334,51 @@ def _real_extract(self, url):
'display_id': display_id,
'season_number': int_or_none(page.get('episode_season')),
})
class DagelijkseKostIE(InfoExtractor):
IE_DESC = 'dagelijksekost.een.be'
_VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
'md5': '30bfffc323009a3e5f689bef6efa2365',
'info_dict': {
'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
'display_id': 'hachis-parmentier-met-witloof',
'ext': 'mp4',
'title': 'Hachis parmentier met witloof',
'description': 'md5:9960478392d87f63567b5b117688cdc5',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 283.02,
},
'expected_warnings': ['is not a supported codec'],
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
title = strip_or_none(get_element_by_class(
'dish-metadata__title', webpage
) or self._html_search_meta(
'twitter:title', webpage))
description = clean_html(get_element_by_class(
'dish-description', webpage)
) or self._html_search_meta(
('description', 'twitter:description', 'og:description'),
webpage)
video_id = self._html_search_regex(
r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
group='id')
return {
'_type': 'url_transparent',
'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
'ie_key': CanvasIE.ie_key(),
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
}

View File

@ -1,12 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
import calendar
import datetime
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_timezone,
int_or_none,
parse_duration,
parse_resolution,
@ -97,8 +99,9 @@ def _real_extract(self, url):
timestamp = None
data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
try:
timestamp = datetime.datetime.strptime(
data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp()
timezone, data_utc = extract_timezone(data_utc)
timestamp = calendar.timegm((datetime.datetime.strptime(
data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
except TypeError:
pass

View File

@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
@ -10,11 +11,13 @@
ExtractorError,
float_or_none,
int_or_none,
strip_or_none,
unified_timestamp,
)
class DPlayIE(InfoExtractor):
_PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)'
_VALID_URL = r'''(?x)https?://
(?P<domain>
(?:www\.)?(?P<host>d
@ -24,7 +27,7 @@ class DPlayIE(InfoExtractor):
)
)|
(?P<subdomain_country>es|it)\.dplay\.com
)/[^/]+/(?P<id>[^/]+/[^/?#]+)'''
)/[^/]+''' + _PATH_REGEX
_TESTS = [{
# non geo restricted, via secure api, unsigned download hls URL
@ -151,21 +154,47 @@ class DPlayIE(InfoExtractor):
'only_matching': True,
}]
def _process_errors(self, e, geo_countries):
info = self._parse_json(e.cause.read().decode('utf-8'), None)
error = info['errors'][0]
error_code = error.get('code')
if error_code == 'access.denied.geoblocked':
self.raise_geo_restricted(countries=geo_countries)
elif error_code in ('access.denied.missingpackage', 'invalid.token'):
raise ExtractorError(
'This video is only available for registered users. You may want to use --cookies.', expected=True)
raise ExtractorError(info['errors'][0]['detail'], expected=True)
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers['Authorization'] = 'Bearer ' + self._download_json(
disco_base + 'token', display_id, 'Downloading token',
query={
'realm': realm,
})['data']['attributes']['token']
def _download_video_playback_info(self, disco_base, video_id, headers):
streaming = self._download_json(
disco_base + 'playback/videoPlaybackInfo/' + video_id,
video_id, headers=headers)['data']['attributes']['streaming']
streaming_list = []
for format_id, format_dict in streaming.items():
streaming_list.append({
'type': format_id,
'url': format_dict.get('url'),
})
return streaming_list
def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
geo_countries = [country.upper()]
self._initialize_geo_bypass({
'countries': geo_countries,
})
disco_base = 'https://%s/' % disco_host
token = self._download_json(
disco_base + 'token', display_id, 'Downloading token',
query={
'realm': realm,
})['data']['attributes']['token']
headers = {
'Referer': url,
'Authorization': 'Bearer ' + token,
}
self._update_disco_api_headers(headers, disco_base, display_id, realm)
try:
video = self._download_json(
disco_base + 'content/videos/' + display_id, display_id,
headers=headers, query={
@ -176,31 +205,28 @@ def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
'include': 'images,primaryChannel,show,tags'
})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
self._process_errors(e, geo_countries)
raise
video_id = video['data']['id']
info = video['data']['attributes']
title = info['name'].strip()
formats = []
try:
streaming = self._download_json(
disco_base + 'playback/videoPlaybackInfo/' + video_id,
display_id, headers=headers)['data']['attributes']['streaming']
streaming = self._download_video_playback_info(
disco_base, video_id, headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
error = info['errors'][0]
error_code = error.get('code')
if error_code == 'access.denied.geoblocked':
self.raise_geo_restricted(countries=geo_countries)
elif error_code == 'access.denied.missingpackage':
self.raise_login_required()
raise ExtractorError(info['errors'][0]['detail'], expected=True)
self._process_errors(e, geo_countries)
raise
for format_id, format_dict in streaming.items():
for format_dict in streaming:
if not isinstance(format_dict, dict):
continue
format_url = format_dict.get('url')
if not format_url:
continue
format_id = format_dict.get('type')
ext = determine_ext(format_url)
if format_id == 'dash' or ext == 'mpd':
formats.extend(self._extract_mpd_formats(
@ -248,7 +274,7 @@ def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
'id': video_id,
'display_id': display_id,
'title': title,
'description': info.get('description'),
'description': strip_or_none(info.get('description')),
'duration': float_or_none(info.get('videoDuration'), 1000),
'timestamp': unified_timestamp(info.get('publishStart')),
'series': series,
@ -268,3 +294,75 @@ def _real_extract(self, url):
host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
return self._get_disco_api_info(
url, display_id, host, 'dplay' + country, country)
class DiscoveryPlusIE(DPlayIE):
_VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
'info_dict': {
'id': '1140794',
'display_id': 'property-brothers-forever-home/food-and-family',
'ext': 'mp4',
'title': 'Food and Family',
'description': 'The brothers help a Richmond family expand their single-level home.',
'duration': 2583.113,
'timestamp': 1609304400,
'upload_date': '20201230',
'creator': 'HGTV',
'series': 'Property Brothers: Forever Home',
'season_number': 1,
'episode_number': 1,
},
'skip': 'Available for Premium users',
}]
def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0'
def _download_video_playback_info(self, disco_base, video_id, headers):
return self._download_json(
disco_base + 'playback/v3/videoPlaybackInfo',
video_id, headers=headers, data=json.dumps({
'deviceInfo': {
'adBlocker': False,
},
'videoId': video_id,
'wisteriaProperties': {
'platform': 'desktop',
},
}).encode('utf-8'))['data']['attributes']['streaming']
def _real_extract(self, url):
display_id = self._match_id(url)
return self._get_disco_api_info(
url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us')
class HGTVDeIE(DPlayIE):
_VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
_TESTS = [{
'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
'info_dict': {
'id': '151205',
'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
'ext': 'mp4',
'title': 'Wer braucht schon eine Toilette',
'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
'duration': 1177.024,
'timestamp': 1595705400,
'upload_date': '20200725',
'creator': 'HGTV',
'series': 'Tiny House - klein, aber oho',
'season_number': 3,
'episode_number': 3,
},
'params': {
'format': 'bestvideo',
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
return self._get_disco_api_info(
url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')

View File

@ -0,0 +1,193 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
xpath_text,
determine_ext,
float_or_none,
ExtractorError,
)
class DreiSatIE(InfoExtractor):
IE_NAME = '3sat'
_GEO_COUNTRIES = ['DE']
_VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)'
_TESTS = [
{
'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
'md5': 'be37228896d30a88f315b638900a026e',
'info_dict': {
'id': '45918',
'ext': 'mp4',
'title': 'Waidmannsheil',
'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
'uploader': 'SCHWEIZWEIT',
'uploader_id': '100000210',
'upload_date': '20140913'
},
'params': {
'skip_download': True, # m3u8 downloads
}
},
{
'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
'only_matching': True,
},
]
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
param_groups = {}
for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)):
group_id = param_group.get(self._xpath_ns(
'id', 'http://www.w3.org/XML/1998/namespace'))
params = {}
for param in param_group:
params[param.get('name')] = param.get('value')
param_groups[group_id] = params
formats = []
for video in smil.findall(self._xpath_ns('.//video', namespace)):
src = video.get('src')
if not src:
continue
bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
group_id = video.get('paramGroup')
param_group = param_groups[group_id]
for proto in param_group['protocols'].split(','):
formats.append({
'url': '%s://%s' % (proto, param_group['host']),
'app': param_group['app'],
'play_path': src,
'ext': 'flv',
'format_id': '%s-%d' % (proto, bitrate),
'tbr': bitrate,
})
self._sort_formats(formats)
return formats
def extract_from_xml_url(self, video_id, xml_url):
doc = self._download_xml(
xml_url, video_id,
note='Downloading video info',
errnote='Failed to download video info')
status_code = xpath_text(doc, './status/statuscode')
if status_code and status_code != 'ok':
if status_code == 'notVisibleAnymore':
message = 'Video %s is not available' % video_id
else:
message = '%s returned error: %s' % (self.IE_NAME, status_code)
raise ExtractorError(message, expected=True)
title = xpath_text(doc, './/information/title', 'title', True)
urls = []
formats = []
for fnode in doc.findall('.//formitaeten/formitaet'):
video_url = xpath_text(fnode, 'url')
if not video_url or video_url in urls:
continue
urls.append(video_url)
is_available = 'http://www.metafilegenerator' not in video_url
geoloced = 'static_geoloced_online' in video_url
if not is_available or geoloced:
continue
format_id = fnode.attrib['basetype']
format_m = re.match(r'''(?x)
(?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
(?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
''', format_id)
ext = determine_ext(video_url, None) or format_m.group('container')
if ext == 'meta':
continue
elif ext == 'smil':
formats.extend(self._extract_smil_formats(
video_url, video_id, fatal=False))
elif ext == 'm3u8':
# the certificates are misconfigured (see
# https://github.com/ytdl-org/youtube-dl/issues/8665)
if video_url.startswith('https://'):
continue
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id=format_id, fatal=False))
else:
quality = xpath_text(fnode, './quality')
if quality:
format_id += '-' + quality
abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000)
vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000)
tbr = int_or_none(self._search_regex(
r'_(\d+)k', video_url, 'bitrate', None))
if tbr and vbr and not abr:
abr = tbr - vbr
formats.append({
'format_id': format_id,
'url': video_url,
'ext': ext,
'acodec': format_m.group('acodec'),
'vcodec': format_m.group('vcodec'),
'abr': abr,
'vbr': vbr,
'tbr': tbr,
'width': int_or_none(xpath_text(fnode, './width')),
'height': int_or_none(xpath_text(fnode, './height')),
'filesize': int_or_none(xpath_text(fnode, './filesize')),
'protocol': format_m.group('proto').lower(),
})
geolocation = xpath_text(doc, './/details/geolocation')
if not formats and geolocation and geolocation != 'none':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
self._sort_formats(formats)
thumbnails = []
for node in doc.findall('.//teaserimages/teaserimage'):
thumbnail_url = node.text
if not thumbnail_url:
continue
thumbnail = {
'url': thumbnail_url,
}
thumbnail_key = node.get('key')
if thumbnail_key:
m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
if m:
thumbnail['width'] = int(m.group(1))
thumbnail['height'] = int(m.group(2))
thumbnails.append(thumbnail)
upload_date = unified_strdate(xpath_text(doc, './/details/airtime'))
return {
'id': video_id,
'title': title,
'description': xpath_text(doc, './/information/detail'),
'duration': int_or_none(xpath_text(doc, './/details/lengthSec')),
'thumbnails': thumbnails,
'uploader': xpath_text(doc, './/details/originChannelTitle'),
'uploader_id': xpath_text(doc, './/details/originChannelId'),
'upload_date': upload_date,
'formats': formats,
}
def _real_extract(self, url):
video_id = self._match_id(url)
details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id
return self.extract_from_xml_url(video_id, details_url)

View File

@ -182,6 +182,7 @@
CanvasIE,
CanvasEenIE,
VrtNUIE,
DagelijkseKostIE,
)
from .carambatv import (
CarambaTVIE,
@ -309,7 +310,12 @@
DouyuShowIE,
DouyuTVIE,
)
from .dplay import DPlayIE
from .dplay import (
DPlayIE,
DiscoveryPlusIE,
HGTVDeIE,
)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE
from .drtv import (
@ -1107,6 +1113,11 @@
VivoIE,
)
from .showroomlive import ShowRoomLiveIE
from .simplecast import (
SimplecastIE,
SimplecastEpisodeIE,
SimplecastPodcastIE,
)
from .sina import SinaIE
from .sixplay import SixPlayIE
from .skyit import (
@ -1165,11 +1176,6 @@
BellatorIE,
ParamountNetworkIE,
)
from .storyfire import (
StoryFireIE,
StoryFireUserIE,
StoryFireSeriesIE,
)
from .stitcher import StitcherIE
from .sport5 import Sport5IE
from .sportbox import SportBoxIE
@ -1193,6 +1199,11 @@
from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .storyfire import (
StoryFireIE,
StoryFireUserIE,
StoryFireSeriesIE,
)
from .streamable import StreamableIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
@ -1652,6 +1663,7 @@
ZattooLiveIE,
)
from .zdf import ZDFIE, ZDFChannelIE
from .zhihu import ZhihuIE
from .zingmp3 import ZingMp3IE
from .zoom import ZoomIE
from .zype import ZypeIE

View File

@ -133,6 +133,7 @@
from .rumble import RumbleEmbedIE
from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE
from .simplecast import SimplecastIE
class GenericIE(InfoExtractor):
@ -2240,6 +2241,15 @@ class GenericIE(InfoExtractor):
'duration': 159,
},
},
{
# Simplecast player embed
'url': 'https://www.bio.org/podcast',
'info_dict': {
'id': 'podcast',
'title': 'I AM BIO Podcast | BIO',
},
'playlist_mincount': 52,
},
]
def report_following_redirect(self, new_url):
@ -2794,6 +2804,12 @@ def _real_extract(self, url):
return self.playlist_from_matches(
matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
# Look for Simplecast embeds
simplecast_urls = SimplecastIE._extract_urls(webpage)
if simplecast_urls:
return self.playlist_from_matches(
simplecast_urls, video_id, video_title)
# Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches:

View File

@ -2,10 +2,11 @@
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
determine_ext,
int_or_none,
try_get,
unescapeHTML,
url_or_none,
)
@ -14,7 +15,7 @@ class NineGagIE(InfoExtractor):
IE_NAME = '9gag'
_VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)'
_TEST = {
_TESTS = [{
'url': 'https://9gag.com/gag/ae5Ag7B',
'info_dict': {
'id': 'ae5Ag7B',
@ -29,7 +30,11 @@ class NineGagIE(InfoExtractor):
'dislike_count': int,
'comment_count': int,
}
}
}, {
# HTML escaped title
'url': 'https://9gag.com/gag/av5nvyb',
'only_matching': True,
}]
def _real_extract(self, url):
post_id = self._match_id(url)
@ -43,7 +48,7 @@ def _real_extract(self, url):
'The given url does not contain a video',
expected=True)
title = post['title']
title = unescapeHTML(post['title'])
duration = None
formats = []

View File

@ -0,0 +1,160 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_podcast_url,
int_or_none,
parse_iso8601,
strip_or_none,
try_get,
urlencode_postdata,
)
class SimplecastBaseIE(InfoExtractor):
_UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
_API_BASE = 'https://api.simplecast.com/'
def _call_api(self, path_tmpl, video_id):
return self._download_json(
self._API_BASE + path_tmpl % video_id, video_id)
def _call_search_api(self, resource, resource_id, resource_url):
return self._download_json(
'https://api.simplecast.com/%ss/search' % resource, resource_id,
data=urlencode_postdata({'url': resource_url}))
def _parse_episode(self, episode):
episode_id = episode['id']
title = episode['title'].strip()
audio_file = episode.get('audio_file') or {}
audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url']
season = episode.get('season') or {}
season_href = season.get('href')
season_id = None
if season_href:
season_id = self._search_regex(
r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX,
season_href, 'season id', default=None)
webpage_url = episode.get('episode_url')
channel_url = None
if webpage_url:
channel_url = self._search_regex(
r'(https?://[^/]+\.simplecast\.com)',
webpage_url, 'channel url', default=None)
return {
'id': episode_id,
'display_id': episode.get('slug'),
'title': title,
'url': clean_podcast_url(audio_file_url),
'webpage_url': webpage_url,
'channel_url': channel_url,
'series': try_get(episode, lambda x: x['podcast']['title']),
'season_number': int_or_none(season.get('number')),
'season_id': season_id,
'thumbnail': episode.get('image_url'),
'episode_id': episode_id,
'episode_number': int_or_none(episode.get('number')),
'description': strip_or_none(episode.get('description')),
'timestamp': parse_iso8601(episode.get('published_at')),
'duration': int_or_none(episode.get('duration')),
'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')),
}
class SimplecastIE(SimplecastBaseIE):
IE_NAME = 'simplecast'
_VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX
_COMMON_TEST_INFO = {
'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
'ext': 'mp3',
'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
'episode_number': 1,
'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
'description': 'md5:34752789d3d2702e2d2c975fbd14f357',
'season_number': 1,
'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
'series': 'The RE:BIND.io Podcast',
'duration': 5343,
'timestamp': 1580979475,
'upload_date': '20200206',
'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$',
}
_TESTS = [{
'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876',
'md5': '8c93be7be54251bf29ee97464eabe61c',
'info_dict': _COMMON_TEST_INFO,
}, {
'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'''(?x)<iframe[^>]+src=["\']
(
https?://(?:embed\.simplecast\.com/[0-9a-f]{8}|
player\.simplecast\.com/%s
))''' % SimplecastBaseIE._UUID_REGEX, webpage)
def _real_extract(self, url):
episode_id = self._match_id(url)
episode = self._call_api('episodes/%s', episode_id)
return self._parse_episode(episode)
class SimplecastEpisodeIE(SimplecastBaseIE):
IE_NAME = 'simplecast:episode'
_VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)'
_TEST = {
'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
'md5': '8c93be7be54251bf29ee97464eabe61c',
'info_dict': SimplecastIE._COMMON_TEST_INFO,
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
episode = self._call_search_api(
'episode', mobj.group(1), mobj.group(0))
return self._parse_episode(episode)
class SimplecastPodcastIE(SimplecastBaseIE):
IE_NAME = 'simplecast:podcast'
_VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)'
_TESTS = [{
'url': 'https://the-re-bind-io-podcast.simplecast.com',
'playlist_mincount': 33,
'info_dict': {
'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c',
'title': 'The RE:BIND.io Podcast',
},
}, {
'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes',
'only_matching': True,
}]
def _real_extract(self, url):
subdomain = self._match_id(url)
site = self._call_search_api('site', subdomain, url)
podcast = site['podcast']
podcast_id = podcast['id']
podcast_title = podcast.get('title')
def entries():
episodes = self._call_api('podcasts/%s/episodes', podcast_id)
for episode in (episodes.get('collection') or []):
info = self._parse_episode(episode)
info['series'] = podcast_title
yield info
return self.playlist_result(entries(), podcast_id, podcast_title)

View File

@ -1,255 +1,151 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import functools
from .common import InfoExtractor
from ..utils import (
# HEADRequest,
int_or_none,
OnDemandPagedList,
smuggle_url,
)
class StoryFireIE(InfoExtractor):
_VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire.app.link))/(?P<id>[^/\s]+)'
_TESTS = [{
class StoryFireBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/'
def _call_api(self, path, video_id, resource, query=None):
return self._download_json(
'https://storyfire.com/app/%s/%s' % (path, video_id), video_id,
'Downloading %s JSON metadata' % resource, query=query)
def _parse_video(self, video):
title = video['title']
vimeo_id = self._search_regex(
r'https?://player\.vimeo\.com/external/(\d+)',
video['vimeoVideoURL'], 'vimeo id')
# video_url = self._request_webpage(
# HEADRequest(video['vimeoVideoURL']), video_id).geturl()
# formats = []
# for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]:
# formats.extend(self._extract_m3u8_formats(
# v_url, video_id, 'mp4', 'm3u8_native',
# m3u8_id='hls' + suffix, fatal=False))
# formats.extend(self._extract_mpd_formats(
# v_url.replace('.m3u8', '.mpd'), video_id,
# mpd_id='dash' + suffix, fatal=False))
# self._sort_formats(formats)
uploader_id = video.get('hostID')
return {
'_type': 'url_transparent',
'id': vimeo_id,
'title': title,
'description': video.get('description'),
'url': smuggle_url(
'https://player.vimeo.com/video/' + vimeo_id, {
'http_headers': {
'Referer': 'https://storyfire.com/',
}
}),
# 'formats': formats,
'thumbnail': video.get('storyImage'),
'view_count': int_or_none(video.get('views')),
'like_count': int_or_none(video.get('likesCount')),
'comment_count': int_or_none(video.get('commentsCount')),
'duration': int_or_none(video.get('videoDuration')),
'timestamp': int_or_none(video.get('publishDate')),
'uploader': video.get('username'),
'uploader_id': uploader_id,
'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None,
'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')),
}
class StoryFireIE(StoryFireBaseIE):
_VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P<id>[0-9a-f]{24})'
_TEST = {
'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181',
'md5': '560953bfca81a69003cfa5e53ac8a920',
'md5': 'caec54b9e4621186d6079c7ec100c1eb',
'info_dict': {
'id': '5df1d132b6378700117f9181',
'id': '378954662',
'ext': 'mp4',
'title': 'Buzzfeed Teaches You About Memes',
'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
'timestamp': 1576129028,
'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies',
'description': 'md5:0b4e28021548e144bed69bb7539e62ea',
'uploader': 'whang!',
'upload_date': '20191212',
'duration': 418,
'view_count': int,
'like_count': int,
'comment_count': int,
},
'params': {'format': 'bestvideo'} # There are no merged formats in the playlist.
}, {
'url': 'https://storyfire.app.link/5GxAvWOQr8', # Alternate URL format, with unrelated short ID
'md5': '7a2dc6d60c4889edfed459c620fe690d',
'info_dict': {
'id': '5f1e11ecd78a57b6c702001d',
'ext': 'm4a',
'title': 'Weird Nintendo Prototype Leaks',
'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis',
'timestamp': 1595808576,
'upload_date': '20200727',
'uploader': 'whang!',
'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
'params': {
'skip_download': True,
},
'params': {'format': 'bestaudio'} # Verifying audio extraction
}]
_aformats = {
'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10},
'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1},
'expected_warnings': ['Unable to download JSON metadata']
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
# Extracting the json blob is mandatory to proceed with extraction.
jsontext = self._html_search_regex(
r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
webpage, 'json_data')
json = self._parse_json(jsontext, video_id)
# The currentVideo field in the json is mandatory
# because it contains the only link to the m3u playlist
video = json['props']['initialState']['video']['currentVideo']
videourl = video['vimeoVideoURL'] # Video URL is mandatory
# Extract other fields from the json in an error tolerant fashion
# ID may be incorrect (on short URL format), correct it.
parsed_id = video.get('_id')
if parsed_id:
video_id = parsed_id
title = video.get('title')
description = video.get('description')
thumbnail = video.get('storyImage')
views = video.get('views')
likes = video.get('likesCount')
comments = video.get('commentsCount')
duration = video.get('videoDuration')
publishdate = video.get('publishDate') # Apparently epoch time, day only
uploader = video.get('username')
uploader_id = video.get('hostID')
# Construct an uploader URL
uploader_url = None
if uploader_id:
uploader_url = "https://storyfire.com/user/%s/video" % uploader_id
# Collect root playlist to determine formats
formats = self._extract_m3u8_formats(
videourl, video_id, 'mp4', 'm3u8_native')
# Modify formats to fill in missing information about audio codecs
for format in formats:
aformat = self._aformats.get(format['format_id'])
if aformat:
format['acodec'] = aformat['acodec']
format['abr'] = aformat['abr']
format['quality'] = aformat['preference']
format['ext'] = 'm4a'
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'ext': "mp4",
'url': videourl,
'formats': formats,
'thumbnail': thumbnail,
'view_count': views,
'like_count': likes,
'comment_count': comments,
'duration': duration,
'timestamp': publishdate,
'uploader': uploader,
'uploader_id': uploader_id,
'uploader_url': uploader_url,
}
video = self._call_api(
'generic/video-detail', video_id, 'video')['video']
return self._parse_video(video)
class StoryFireUserIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video'
_TESTS = [{
'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video',
'info_dict': {
'id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
'title': 'whang!',
},
'playlist_mincount': 18
}, {
class StoryFireUserIE(StoryFireBaseIE):
_VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P<id>[^/]+)/video'
_TEST = {
'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video',
'info_dict': {
'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2',
'title': 'McJuggerNuggets',
},
'playlist_mincount': 143
}]
# Generator for fetching playlist items
def _enum_videos(self, baseurl, user_id, firstjson):
totalVideos = int(firstjson['videosCount'])
haveVideos = 0
json = firstjson
for page in itertools.count(1):
for video in json['videos']:
id = video['_id']
url = "https://storyfire.com/video-details/%s" % id
haveVideos += 1
yield {
'_type': 'url',
'id': id,
'url': url,
'ie_key': 'StoryFire',
'title': video.get('title'),
'description': video.get('description'),
'view_count': video.get('views'),
'comment_count': video.get('commentsCount'),
'duration': video.get('videoDuration'),
'timestamp': video.get('publishDate'),
'playlist_mincount': 151,
}
# Are there more pages we could fetch?
if haveVideos < totalVideos:
pageurl = baseurl + ("%i" % haveVideos)
json = self._download_json(pageurl, user_id,
note='Downloading page %s' % page)
_PAGE_SIZE = 20
# Are there any videos in the new json?
videos = json.get('videos')
if not videos or len(videos) == 0:
break # no videos
else:
break # We have fetched all the videos, stop
def _fetch_page(self, user_id, page):
videos = self._call_api(
'publicVideos', user_id, 'page %d' % (page + 1), {
'skip': page * self._PAGE_SIZE,
})['videos']
for video in videos:
yield self._parse_video(video)
def _real_extract(self, url):
user_id = self._match_id(url)
baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id
# Download first page to ensure it can be downloaded, and get user information if available.
firstpage = baseurl + "0"
firstjson = self._download_json(firstpage, user_id)
title = None
videos = firstjson.get('videos')
if videos and len(videos):
title = videos[1].get('username')
return {
'_type': 'playlist',
'entries': self._enum_videos(baseurl, user_id, firstjson),
'id': user_id,
'title': title,
}
entries = OnDemandPagedList(functools.partial(
self._fetch_page, user_id), self._PAGE_SIZE)
return self.playlist_result(entries, user_id)
class StoryFireSeriesIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)'
class StoryFireSeriesIE(StoryFireBaseIE):
_VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/',
'info_dict': {
'id': '-Lq6MsuIHLODO6d2dDkr',
},
'playlist_mincount': 13
'playlist_mincount': 13,
}, {
'url': 'https://storyfire.com/write/series/stories/the_mortal_one/',
'info_dict': {
'id': 'the_mortal_one',
},
'playlist_count': 0 # This playlist has entries, but no videos.
}, {
'url': 'https://storyfire.com/write/series/stories/story_time',
'info_dict': {
'id': 'story_time',
},
'playlist_mincount': 10
'playlist_count': 0,
}]
# Generator for returning playlist items
# This object is substantially different than the one in the user videos page above
def _enum_videos(self, jsonlist):
for video in jsonlist:
id = video['_id']
if video.get('hasVideo'): # Boolean element
url = "https://storyfire.com/video-details/%s" % id
yield {
'_type': 'url',
'id': id,
'url': url,
'ie_key': 'StoryFire',
'title': video.get('title'),
'description': video.get('description'),
'view_count': video.get('views'),
'likes_count': video.get('likesCount'),
'comment_count': video.get('commentsCount'),
'duration': video.get('videoDuration'),
'timestamp': video.get('publishDate'),
}
def _extract_videos(self, stories):
for story in stories.values():
if story.get('hasVideo'):
yield self._parse_video(story)
def _real_extract(self, url):
list_id = self._match_id(url)
listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id
json = self._download_json(listurl, list_id)
return {
'_type': 'playlist',
'entries': self._enum_videos(json),
'id': list_id
}
series_id = self._match_id(url)
stories = self._call_api(
'seriesStories', series_id, 'series stories')
return self.playlist_result(self._extract_videos(stories), series_id)

View File

@ -4,21 +4,22 @@
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
parse_age_limit,
qualities,
random_birthday,
try_get,
unified_timestamp,
urljoin,
)
class VideoPressIE(InfoExtractor):
_VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)'
_ID_REGEX = r'[\da-zA-Z]{8}'
_PATH_REGEX = r'video(?:\.word)?press\.com/embed/'
_VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX)
_TESTS = [{
'url': 'https://videopress.com/embed/kUJmAcSf',
'md5': '706956a6c875873d51010921310e4bc6',
@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor):
# 17+, requires birth_* params
'url': 'https://videopress.com/embed/iH3gstfZ',
'only_matching': True,
}, {
'url': 'https://video.wordpress.com/embed/kUJmAcSf',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)',
r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX),
webpage)
def _real_extract(self, url):
video_id = self._match_id(url)
query = random_birthday('birth_year', 'birth_month', 'birth_day')
query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width'
video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
video_id, query=query)
title = video['title']
def base_url(scheme):
return try_get(
video, lambda x: x['file_url_base'][scheme], compat_str)
base_url = base_url('https') or base_url('http')
file_url_base = video.get('file_url_base') or {}
base_url = file_url_base.get('https') or file_url_base.get('http')
QUALITIES = ('std', 'dvd', 'hd')
quality = qualities(QUALITIES)
formats = []
for format_id, f in video['files'].items():
for format_id, f in (video.get('files') or {}).items():
if not isinstance(f, dict):
continue
for ext, path in f.items():
@ -75,12 +77,14 @@ def base_url(scheme):
'ext': determine_ext(path, ext),
'quality': quality(format_id),
})
original_url = try_get(video, lambda x: x['original'], compat_str)
original_url = video.get('original')
if original_url:
formats.append({
'url': original_url,
'format_id': 'original',
'quality': len(QUALITIES),
'width': int_or_none(video.get('width')),
'height': int_or_none(video.get('height')),
})
self._sort_formats(formats)

View File

@ -22,6 +22,7 @@
parse_iso8601,
sanitized_Request,
std_headers,
try_get,
)
@ -42,7 +43,7 @@ class VikiBaseIE(InfoExtractor):
_ERRORS = {
'geo': 'Sorry, this content is not available in your region.',
'upcoming': 'Sorry, this content is not yet available.',
# 'paywall': 'paywall',
'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers',
}
def _prepare_call(self, path, timestamp=None, post_data=None):
@ -94,11 +95,13 @@ def _raise_error(self, error):
expected=True)
def _check_errors(self, data):
for reason, status in data.get('blocking', {}).items():
for reason, status in (data.get('blocking') or {}).items():
if status and reason in self._ERRORS:
message = self._ERRORS[reason]
if reason == 'geo':
self.raise_geo_restricted(msg=message)
elif reason == 'paywall':
self.raise_login_required(message)
raise ExtractorError('%s said: %s' % (
self.IE_NAME, message), expected=True)
@ -143,13 +146,19 @@ class VikiIE(VikiBaseIE):
'info_dict': {
'id': '1023585v',
'ext': 'mp4',
'title': 'Heirs Episode 14',
'uploader': 'SBS',
'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
'title': 'Heirs - Episode 14',
'uploader': 'SBS Contents Hub',
'timestamp': 1385047627,
'upload_date': '20131121',
'age_limit': 13,
'duration': 3570,
'episode_number': 14,
},
'params': {
'format': 'bestvideo',
},
'skip': 'Blocked in the US',
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
# clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@ -165,7 +174,8 @@ class VikiIE(VikiBaseIE):
'uploader': 'Arirang TV',
'like_count': int,
'age_limit': 0,
}
},
'skip': 'Sorry. There was an error loading this video',
}, {
'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
'info_dict': {
@ -183,7 +193,7 @@ class VikiIE(VikiBaseIE):
}, {
# episode
'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
'md5': '94e0e34fd58f169f40c184f232356cfe',
'md5': '0a53dc252e6e690feccd756861495a8c',
'info_dict': {
'id': '44699v',
'ext': 'mp4',
@ -195,6 +205,10 @@ class VikiIE(VikiBaseIE):
'uploader': 'group8',
'like_count': int,
'age_limit': 13,
'episode_number': 1,
},
'params': {
'format': 'bestvideo',
},
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
@ -221,7 +235,7 @@ class VikiIE(VikiBaseIE):
}, {
# non-English description
'url': 'http://www.viki.com/videos/158036v-love-in-magic',
'md5': 'adf9e321a0ae5d0aace349efaaff7691',
'md5': '41faaba0de90483fb4848952af7c7d0d',
'info_dict': {
'id': '158036v',
'ext': 'mp4',
@ -232,6 +246,10 @@ class VikiIE(VikiBaseIE):
'title': 'Love In Magic',
'age_limit': 13,
},
'params': {
'format': 'bestvideo',
},
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}]
def _real_extract(self, url):
@ -249,22 +267,19 @@ def _real_extract(self, url):
self._check_errors(video)
title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
episode_number = int_or_none(video.get('number'))
if not title:
title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
container_titles = video.get('container', {}).get('titles', {})
title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id
container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {}
container_title = self.dict_selection(container_titles, 'en')
title = '%s - %s' % (container_title, title)
description = self.dict_selection(video.get('descriptions', {}), 'en')
duration = int_or_none(video.get('duration'))
timestamp = parse_iso8601(video.get('created_at'))
uploader = video.get('author')
like_count = int_or_none(video.get('likes', {}).get('count'))
age_limit = parse_age_limit(video.get('rating'))
like_count = int_or_none(try_get(video, lambda x: x['likes']['count']))
thumbnails = []
for thumbnail_id, thumbnail in video.get('images', {}).items():
for thumbnail_id, thumbnail in (video.get('images') or {}).items():
thumbnails.append({
'id': thumbnail_id,
'url': thumbnail.get('url'),
@ -289,7 +304,7 @@ def _real_extract(self, url):
}]
except AttributeError:
# fall-back to the old way if there isn't a streamSubtitles attribute
for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items():
subtitles[subtitle_lang] = [{
'ext': subtitles_format,
'url': self._prepare_call(
@ -300,13 +315,15 @@ def _real_extract(self, url):
'id': video_id,
'title': title,
'description': description,
'duration': duration,
'timestamp': timestamp,
'uploader': uploader,
'duration': int_or_none(video.get('duration')),
'timestamp': parse_iso8601(video.get('created_at')),
'uploader': video.get('author'),
'uploader_url': video.get('author_url'),
'like_count': like_count,
'age_limit': age_limit,
'age_limit': parse_age_limit(video.get('rating')),
'thumbnails': thumbnails,
'subtitles': subtitles,
'episode_number': episode_number,
}
formats = []
@ -400,7 +417,7 @@ class VikiChannelIE(VikiBaseIE):
'info_dict': {
'id': '50c',
'title': 'Boys Over Flowers',
'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59',
},
'playlist_mincount': 71,
}, {
@ -411,6 +428,7 @@ class VikiChannelIE(VikiBaseIE):
'description': 'md5:05bf5471385aa8b21c18ad450e350525',
},
'playlist_count': 127,
'skip': 'Page not found',
}, {
'url': 'http://www.viki.com/news/24569c-showbiz-korea',
'only_matching': True,

View File

@ -221,10 +221,12 @@ def _parse_config(self, config, video_id):
'is_live': is_live,
}
def _extract_original_format(self, url, video_id):
def _extract_original_format(self, url, video_id, unlisted_hash=None):
query = {'action': 'load_download_config'}
if unlisted_hash:
query['unlisted_hash'] = unlisted_hash
download_data = self._download_json(
url, video_id, fatal=False,
query={'action': 'load_download_config'},
url, video_id, fatal=False, query=query,
headers={'X-Requested-With': 'XMLHttpRequest'})
if download_data:
source_file = download_data.get('source_file')
@ -504,6 +506,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
{
'url': 'https://vimeo.com/160743502/abd0e13fb4',
'only_matching': True,
},
{
# requires passing unlisted_hash(a52724358e) to load_download_config request
'url': 'https://vimeo.com/392479337/a52724358e',
'only_matching': True,
}
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
@ -668,7 +675,8 @@ def _real_extract(self, url):
if config.get('view') == 4:
config = self._verify_player_video_password(redirect_url, video_id, headers)
vod = config.get('video', {}).get('vod', {})
video = config.get('video') or {}
vod = video.get('vod') or {}
def is_rented():
if '>You rented this title.<' in webpage:
@ -728,7 +736,7 @@ def is_rented():
formats = []
source_format = self._extract_original_format(
'https://vimeo.com/' + video_id, video_id)
'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash'))
if source_format:
formats.append(source_format)

View File

@ -1,40 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import (
int_or_none,
month_by_abbreviation,
parse_filesize,
unified_strdate,
)
class XboxClipsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})'
_TEST = {
_VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TESTS = [{
'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
'info_dict': {
'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
'ext': 'mp4',
'title': 'Iabdulelah playing Titanfall',
'title': 'iAbdulElah playing Titanfall',
'filesize_approx': 26800000,
'upload_date': '20140807',
'duration': 56,
}
}
}, {
'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
if '/video.php' in url:
qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0])
video_url = self._html_search_regex(
r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL')
title = self._html_search_regex(
r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title')
upload_date = unified_strdate(self._html_search_regex(
r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False))
webpage = self._download_webpage(url, video_id)
info = self._parse_html5_media_entries(url, webpage, video_id)[0]
title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
upload_date = None
mobj = re.search(
r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})',
webpage)
if mobj:
upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1))
filesize = parse_filesize(self._html_search_regex(
r'>Size: ([^<]+)<', webpage, 'file size', fatal=False))
duration = int_or_none(self._html_search_regex(
@ -42,12 +57,12 @@ def _real_extract(self, url):
view_count = int_or_none(self._html_search_regex(
r'>Views: (\d+)<', webpage, 'view count', fatal=False))
return {
info.update({
'id': video_id,
'url': video_url,
'title': title,
'upload_date': upload_date,
'filesize_approx': filesize,
'duration': duration,
'view_count': view_count,
}
})
return info

View File

@ -1,8 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import hashlib
import itertools
import re
from .common import InfoExtractor
from ..compat import compat_str
@ -209,10 +210,18 @@ def _extract_tracks(self, source, item_id, url, tld):
missing_track_ids = [
track_id for track_id in track_ids
if track_id not in present_track_ids]
# Request missing tracks in chunks to avoid exceeding max HTTP header size,
# see https://github.com/ytdl-org/youtube-dl/issues/27355
_TRACKS_PER_CHUNK = 250
for chunk_num in itertools.count(0):
start = chunk_num * _TRACKS_PER_CHUNK
end = start + _TRACKS_PER_CHUNK
missing_track_ids_req = missing_track_ids[start:end]
assert missing_track_ids_req
missing_tracks = self._call_api(
'track-entries', tld, url, item_id,
'Downloading missing tracks JSON', {
'entries': ','.join(missing_track_ids),
'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), {
'entries': ','.join(missing_track_ids_req),
'lang': tld,
'external-domain': 'music.yandex.%s' % tld,
'overembed': 'false',
@ -220,6 +229,8 @@ def _extract_tracks(self, source, item_id, url, tld):
})
if missing_tracks:
tracks.extend(missing_tracks)
if end >= len(missing_track_ids):
break
return tracks

View File

@ -324,7 +324,9 @@ def _extract_video(self, renderer):
r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
'view count', default=None))
uploader = try_get(
renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
renderer,
(lambda x: x['ownerText']['runs'][0]['text'],
lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
return {
'_type': 'url_transparent',
'ie_key': YoutubeIE.ie_key(),
@ -340,64 +342,70 @@ def _extract_video(self, renderer):
class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
_INVIDIOUS_SITES = (
# invidious-redirect websites
r'(?:www\.)?redirect\.invidious\.io',
r'(?:(?:www|dev)\.)?invidio\.us',
# Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
r'(?:www\.)?invidious\.pussthecat\.org',
r'(?:www\.)?invidious\.048596\.xyz',
r'(?:www\.)?invidious\.zee\.li',
r'(?:www\.)?vid\.puffyan\.us',
r'(?:(?:www|au)\.)?ytprivate\.com',
r'(?:www\.)?invidious\.namazso\.eu',
r'(?:www\.)?invidious\.ethibox\.fr',
r'(?:www\.)?inv\.skyn3t\.in',
r'(?:www\.)?invidious\.himiko\.cloud',
r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
# youtube-dl invidious instances list
r'(?:(?:www|no)\.)?invidiou\.sh',
r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
r'(?:www\.)?invidious\.kabi\.tk',
r'(?:www\.)?invidious\.13ad\.de',
r'(?:www\.)?invidious\.mastodon\.host',
r'(?:www\.)?invidious\.zapashcanon\.fr',
r'(?:www\.)?invidious\.kavin\.rocks',
r'(?:www\.)?invidious\.tube',
r'(?:www\.)?invidiou\.site',
r'(?:www\.)?invidious\.site',
r'(?:www\.)?invidious\.xyz',
r'(?:www\.)?invidious\.nixnet\.xyz',
r'(?:www\.)?invidious\.drycat\.fr',
r'(?:www\.)?tube\.poal\.co',
r'(?:www\.)?tube\.connect\.cafe',
r'(?:www\.)?vid\.wxzm\.sx',
r'(?:www\.)?vid\.mint\.lgbt',
r'(?:www\.)?yewtu\.be',
r'(?:www\.)?yt\.elukerio\.org',
r'(?:www\.)?yt\.lelux\.fi',
r'(?:www\.)?invidious\.ggc-project\.de',
r'(?:www\.)?yt\.maisputain\.ovh',
r'(?:www\.)?invidious\.toot\.koeln',
r'(?:www\.)?invidious\.fdn\.fr',
r'(?:www\.)?watch\.nettohikari\.com',
r'(?:www\.)?kgg2m7yk5aybusll\.onion',
r'(?:www\.)?qklhadlycap4cnod\.onion',
r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
)
_VALID_URL = r"""(?x)^
(
(?:https?://|//) # http(s):// or protocol-independent URL
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com/|
(?:www\.)?hooktube\.com/|
(?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
(?:www\.)?invidious\.pussthecat\.org/|
(?:www\.)?invidious\.048596\.xyz/|
(?:www\.)?invidious\.zee\.li/|
(?:www\.)?vid\.puffyan\.us/|
(?:(?:www|au)\.)?ytprivate\.com/|
(?:www\.)?invidious\.namazso\.eu/|
(?:www\.)?invidious\.ethibox\.fr/|
(?:www\.)?inv\.skyn3t\.in/|
(?:www\.)?invidious\.himiko\.cloud/|
(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion/|
(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion/|
(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion/|
(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion/|
(?:(?:www|dev)\.)?invidio\.us/|
(?:(?:www|no)\.)?invidiou\.sh/|
(?:(?:www|fi)\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/|
(?:www\.)?invidious\.13ad\.de/|
(?:www\.)?invidious\.mastodon\.host/|
(?:www\.)?invidious\.zapashcanon\.fr/|
(?:www\.)?invidious\.kavin\.rocks/|
(?:www\.)?invidious\.tube/|
(?:www\.)?invidiou\.site/|
(?:www\.)?invidious\.site/|
(?:www\.)?invidious\.xyz/|
(?:www\.)?invidious\.nixnet\.xyz/|
(?:www\.)?invidious\.drycat\.fr/|
(?:www\.)?tube\.poal\.co/|
(?:www\.)?tube\.connect\.cafe/|
(?:www\.)?vid\.wxzm\.sx/|
(?:www\.)?vid\.mint\.lgbt/|
(?:www\.)?yewtu\.be/|
(?:www\.)?yt\.elukerio\.org/|
(?:www\.)?yt\.lelux\.fi/|
(?:www\.)?invidious\.ggc-project\.de/|
(?:www\.)?yt\.maisputain\.ovh/|
(?:www\.)?invidious\.toot\.koeln/|
(?:www\.)?invidious\.fdn\.fr/|
(?:www\.)?watch\.nettohikari\.com/|
(?:www\.)?kgg2m7yk5aybusll\.onion/|
(?:www\.)?qklhadlycap4cnod\.onion/|
(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
(?:www\.)?deturl\.com/www\.youtube\.com|
(?:www\.)?pwnyoutube\.com|
(?:www\.)?hooktube\.com|
(?:www\.)?yourepeat\.com|
tube\.majestyc\.net|
%(invidious)s|
youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
@ -412,6 +420,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
youtu\.be| # just youtu.be/xxxx
vid\.plus| # or vid.plus/xxxx
zwearz\.com/watch| # or zwearz.com/watch/xxxx
%(invidious)s
)/
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
@ -424,7 +433,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
)
)
(?(1).+)? # if we found the ID, everything can follow
$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
$""" % {
'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
'invidious': '|'.join(_INVIDIOUS_SITES),
}
_PLAYER_INFO_RE = (
r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
@ -1031,6 +1043,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://invidio.us/watch?v=BaW_jenozKc',
'only_matching': True,
},
{
'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
'only_matching': True,
},
{
# from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
'only_matching': True,
},
{
# DRM protected
'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
@ -1169,6 +1190,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': True,
},
},
{
# controversial video, only works with bpctr when authenticated with cookies
'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
'only_matching': True,
},
]
def __init__(self, *args, **kwargs):
@ -1426,7 +1452,7 @@ def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
base_url = self.http_scheme() + '//www.youtube.com/'
webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1'
webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1&bpctr=9999999999'
webpage = self._download_webpage(webpage_url, video_id, fatal=False)
player_response = None

View File

@ -0,0 +1,69 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import float_or_none, int_or_none
class ZhihuIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://www.zhihu.com/zvideo/1342930761977176064',
'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464',
'info_dict': {
'id': '1342930761977176064',
'ext': 'mp4',
'title': '写春联也太难了吧!',
'thumbnail': r're:^https?://.*\.jpg',
'uploader': '桥半舫',
'timestamp': 1612959715,
'upload_date': '20210210',
'uploader_id': '244ecb13b0fd7daf92235288c8ca3365',
'duration': 146.333,
'view_count': int,
'like_count': int,
'comment_count': int,
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
zvideo = self._download_json(
'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id)
title = zvideo['title']
video = zvideo.get('video') or {}
formats = []
for format_id, q in (video.get('playlist') or {}).items():
play_url = q.get('url') or q.get('play_url')
if not play_url:
continue
formats.append({
'asr': int_or_none(q.get('sample_rate')),
'filesize': int_or_none(q.get('size')),
'format_id': format_id,
'fps': int_or_none(q.get('fps')),
'height': int_or_none(q.get('height')),
'tbr': float_or_none(q.get('bitrate')),
'url': play_url,
'width': int_or_none(q.get('width')),
})
self._sort_formats(formats)
author = zvideo.get('author') or {}
url_token = author.get('url_token')
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': video.get('thumbnail') or zvideo.get('image_url'),
'uploader': author.get('name'),
'timestamp': int_or_none(zvideo.get('published_at')),
'uploader_id': author.get('id'),
'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None,
'duration': float_or_none(video.get('duration')),
'view_count': int_or_none(zvideo.get('play_count')),
'like_count': int_or_none(zvideo.get('liked_count')),
'comment_count': int_or_none(zvideo.get('comment_count')),
}

View File

@ -127,10 +127,13 @@ def is_webp(path):
except PostProcessingError as err:
self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err))
if not check_executable('AtomicParsley', ['-v']):
atomicparsley = next((
x for x in ['AtomicParsley', 'atomicparsley']
if check_executable(x, ['-v'])), None)
if atomicparsley is None:
raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
cmd = [encodeFilename('AtomicParsley', True),
cmd = [encodeFilename(atomicparsley, True),
encodeFilename(filename, True),
encodeArgument('--artwork'),
encodeFilename(thumbnail_filename, True),