[extractor] Framework for embed detection (#4307)

This commit is contained in:
pukkandan 2022-08-01 06:52:03 +05:30
parent 47304e07dc
commit 8f97a15d1c
8 changed files with 149 additions and 77 deletions

View File

@ -9,11 +9,13 @@
write_string, write_string,
) )
# These bloat the lazy_extractors, so allow them to passthrough silently
ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}
class LazyLoadMetaClass(type): class LazyLoadMetaClass(type):
def __getattr__(cls, name): def __getattr__(cls, name):
# "_TESTS" bloat the lazy_extractors if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
if '_real_class' not in cls.__dict__ and name != 'get_testcases':
write_string( write_string(
'WARNING: Falling back to normal extractor since lazy extractor ' 'WARNING: Falling back to normal extractor since lazy extractor '
f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n') f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')

View File

@ -11,7 +11,7 @@
from inspect import getsource from inspect import getsource
NO_ATTR = object() NO_ATTR = object()
STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit'] STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
CLASS_METHODS = [ CLASS_METHODS = [
'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable' 'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
] ]
@ -116,11 +116,6 @@ def build_lazy_ie(ie, name, attr_base):
}.get(base.__name__, base.__name__) for base in ie.__bases__) }.get(base.__name__, base.__name__) for base in ie.__bases__)
s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases) s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
valid_url = getattr(ie, '_VALID_URL', None)
if not valid_url and hasattr(ie, '_make_valid_url'):
valid_url = ie._make_valid_url()
if valid_url:
s += f' _VALID_URL = {valid_url!r}\n'
return s + '\n'.join(extra_ie_code(ie, attr_base)) return s + '\n'.join(extra_ie_code(ie, attr_base))

View File

@ -1566,7 +1566,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
result_type = ie_result.get('_type', 'video') result_type = ie_result.get('_type', 'video')
if result_type in ('url', 'url_transparent'): if result_type in ('url', 'url_transparent'):
ie_result['url'] = sanitize_url(ie_result['url']) ie_result['url'] = sanitize_url(
ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
if ie_result.get('original_url'): if ie_result.get('original_url'):
extra_info.setdefault('original_url', ie_result['original_url']) extra_info.setdefault('original_url', ie_result['original_url'])

View File

@ -402,11 +402,11 @@ class BrightcoveNewIE(AdobePassIE):
@staticmethod @staticmethod
def _extract_url(ie, webpage): def _extract_url(ie, webpage):
urls = BrightcoveNewIE._extract_urls(ie, webpage) urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)
return urls[0] if urls else None return urls[0] if urls else None
@staticmethod @staticmethod
def _extract_urls(ie, webpage): def _extract_brightcove_urls(ie, webpage):
# Reference: # Reference:
# 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
# 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag

View File

@ -14,6 +14,7 @@
import re import re
import sys import sys
import time import time
import types
import urllib.parse import urllib.parse
import urllib.request import urllib.request
import xml.etree.ElementTree import xml.etree.ElementTree
@ -23,6 +24,7 @@
from ..downloader import FileDownloader from ..downloader import FileDownloader
from ..downloader.f4m import get_base_url, remove_encrypted_media from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..utils import ( from ..utils import (
IDENTITY,
JSON_LD_RE, JSON_LD_RE,
NO_DEFAULT, NO_DEFAULT,
ExtractorError, ExtractorError,
@ -59,6 +61,7 @@
parse_m3u8_attributes, parse_m3u8_attributes,
parse_resolution, parse_resolution,
sanitize_filename, sanitize_filename,
sanitize_url,
sanitized_Request, sanitized_Request,
str_or_none, str_or_none,
str_to_int, str_to_int,
@ -431,14 +434,26 @@ class InfoExtractor:
title, description etc. title, description etc.
Subclasses of this should define a _VALID_URL regexp and, re-define the Subclasses of this should also be added to the list of extractors and
_real_extract() and (optionally) _real_initialize() methods. should define a _VALID_URL regexp and, re-define the _real_extract() and
Probably, they should also be added to the list of extractors. (optionally) _real_initialize() methods.
Subclasses may also override suitable() if necessary, but ensure the function Subclasses may also override suitable() if necessary, but ensure the function
signature is preserved and that this function imports everything it needs signature is preserved and that this function imports everything it needs
(except other extractors), so that lazy_extractors works correctly. (except other extractors), so that lazy_extractors works correctly.
Subclasses can define a list of _EMBED_REGEX, which will be searched for in
the HTML of Generic webpages. They may also override _extract_embed_urls
or _extract_from_webpage as necessary. While these are normally classmethods,
_extract_from_webpage is allowed to be an instance method.
_extract_from_webpage may raise self.StopExtraction() to stop further
processing of the webpage and obtain exclusive rights to it. This is useful
when the extractor cannot reliably be matched using just the URL,
e.g. for Invidious/PeerTube instances.
Embed-only extractors can be defined by setting _VALID_URL = False.
To support username + password (or netrc) login, the extractor must define a To support username + password (or netrc) login, the extractor must define a
_NETRC_MACHINE and re-define _perform_login(username, password) and _NETRC_MACHINE and re-define _perform_login(username, password) and
(optionally) _initialize_pre_login() methods. The _perform_login method will (optionally) _initialize_pre_login() methods. The _perform_login method will
@ -476,6 +491,8 @@ class InfoExtractor:
_NETRC_MACHINE = None _NETRC_MACHINE = None
IE_DESC = None IE_DESC = None
SEARCH_KEY = None SEARCH_KEY = None
_VALID_URL = None
_EMBED_REGEX = []
def _login_hint(self, method=NO_DEFAULT, netrc=None): def _login_hint(self, method=NO_DEFAULT, netrc=None):
password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
@ -499,12 +516,12 @@ def __init__(self, downloader=None):
@classmethod @classmethod
def _match_valid_url(cls, url): def _match_valid_url(cls, url):
if cls._VALID_URL is False:
return None
# This does not use has/getattr intentionally - we want to know whether # This does not use has/getattr intentionally - we want to know whether
# we have cached the regexp for *this* class, whereas getattr would also # we have cached the regexp for *this* class, whereas getattr would also
# match the superclass # match the superclass
if '_VALID_URL_RE' not in cls.__dict__: if '_VALID_URL_RE' not in cls.__dict__:
if '_VALID_URL' not in cls.__dict__:
cls._VALID_URL = cls._make_valid_url()
cls._VALID_URL_RE = re.compile(cls._VALID_URL) cls._VALID_URL_RE = re.compile(cls._VALID_URL)
return cls._VALID_URL_RE.match(url) return cls._VALID_URL_RE.match(url)
@ -1143,10 +1160,12 @@ def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent
'url': url, 'url': url,
} }
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs): @classmethod
urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {})) def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
for m in orderedSet(map(getter, matches) if getter else matches)) getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
return self.playlist_result(urls, playlist_id, playlist_title, **kwargs) return cls.playlist_result(
(cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
playlist_id, playlist_title, **kwargs)
@staticmethod @staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs): def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
@ -1353,12 +1372,20 @@ def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs
def _dc_search_uploader(self, html): def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader') return self._html_search_meta('dc.creator', html, 'uploader')
def _rta_search(self, html): @staticmethod
def _rta_search(html):
# See http://www.rtalabel.org/index.php?content=howtofaq#single # See http://www.rtalabel.org/index.php?content=howtofaq#single
if re.search(r'(?ix)<meta\s+name="rating"\s+' if re.search(r'(?ix)<meta\s+name="rating"\s+'
r' content="RTA-5042-1996-1400-1577-RTA"', r' content="RTA-5042-1996-1400-1577-RTA"',
html): html):
return 18 return 18
# And then there are the jokers who advertise that they use RTA, but actually don't.
AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
]
if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
return 18
return 0 return 0
def _media_rating_search(self, html): def _media_rating_search(self, html):
@ -1965,14 +1992,9 @@ def http_scheme(self):
else 'https:') else 'https:')
def _proto_relative_url(self, url, scheme=None): def _proto_relative_url(self, url, scheme=None):
if url is None: scheme = scheme or self.http_scheme()
return url assert scheme.endswith(':')
if url.startswith('//'): return sanitize_url(url, scheme=scheme[:-1])
if scheme is None:
scheme = self.http_scheme()
return scheme + url
else:
return url
def _sleep(self, timeout, video_id, msg_template=None): def _sleep(self, timeout, video_id, msg_template=None):
if msg_template is None: if msg_template is None:
@ -3767,10 +3789,12 @@ def geo_verification_headers(self):
headers['Ytdl-request-proxy'] = geo_verification_proxy headers['Ytdl-request-proxy'] = geo_verification_proxy
return headers return headers
def _generic_id(self, url): @staticmethod
def _generic_id(url):
return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
def _generic_title(self, url): @staticmethod
def _generic_title(url):
return urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
@staticmethod @staticmethod
@ -3816,6 +3840,37 @@ def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_l
self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}') self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
return True return True
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield everything this extractor finds embedded in a webpage.

    _extract_from_webpage is called on the class itself when looking it up
    on the class gives a bound method (as a classmethod does). Otherwise it
    is called on whatever ydl.get_info_extractor() returns for this class'
    ie_key(). A falsy result is treated as "nothing found".
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        extractor = cls
    else:
        extractor = ydl.get_info_extractor(cls.ie_key())
    found = extractor._extract_from_webpage(url, webpage)
    yield from found or []
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result (tied to this extractor class) per embed URL.

    The URLs come from _extract_embed_urls; a falsy return is treated as
    empty, and the URLs are passed through orderedSet(..., lazy=True)
    before being wrapped.
    """
    embed_urls = cls._extract_embed_urls(url, webpage) or []
    for embed_url in orderedSet(embed_urls, lazy=True):
        yield cls.url_result(embed_url, cls)
@classmethod
def _extract_embed_urls(cls, url, webpage):
"""@returns all the embed urls on the webpage"""
if '_EMBED_URL_RE' not in cls.__dict__:
assert isinstance(cls._EMBED_REGEX, (list, tuple))
for idx, regex in enumerate(cls._EMBED_REGEX):
assert regex.count('(?P<url>') == 1, \
f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
for regex in cls._EMBED_URL_RE:
for mobj in regex.finditer(webpage):
embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
if cls._VALID_URL is False or cls.suitable(embed_url):
yield embed_url
class StopExtraction(Exception):
    """Raised from _extract_from_webpage to stop further processing of the
    webpage and claim it exclusively for the raising extractor."""
class SearchInfoExtractor(InfoExtractor): class SearchInfoExtractor(InfoExtractor):
""" """
@ -3826,8 +3881,8 @@ class SearchInfoExtractor(InfoExtractor):
_MAX_RESULTS = float('inf') _MAX_RESULTS = float('inf')
@classmethod @classproperty
def _make_valid_url(cls): def _VALID_URL(cls):
return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def _real_extract(self, query): def _real_extract(self, query):

View File

@ -3,6 +3,8 @@
import urllib.parse import urllib.parse
import xml.etree.ElementTree import xml.etree.ElementTree
from . import gen_extractor_classes
from .common import InfoExtractor # isort: split
from .ant1newsgr import Ant1NewsGrEmbedIE from .ant1newsgr import Ant1NewsGrEmbedIE
from .anvato import AnvatoIE from .anvato import AnvatoIE
from .apa import APAIE from .apa import APAIE
@ -14,7 +16,6 @@
from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
from .channel9 import Channel9IE from .channel9 import Channel9IE
from .cloudflarestream import CloudflareStreamIE from .cloudflarestream import CloudflareStreamIE
from .common import InfoExtractor
from .commonprotocols import RtmpIE from .commonprotocols import RtmpIE
from .condenast import CondeNastIE from .condenast import CondeNastIE
from .dailymail import DailyMailIE from .dailymail import DailyMailIE
@ -115,6 +116,7 @@
determine_ext, determine_ext,
dict_get, dict_get,
float_or_none, float_or_none,
format_field,
int_or_none, int_or_none,
is_html, is_html,
js_to_json, js_to_json,
@ -2641,8 +2643,15 @@ def report_following_redirect(self, new_url):
"""Report information extraction.""" """Report information extraction."""
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
def report_detected(self, name): def report_detected(self, name, num=1, note=None):
self._downloader.write_debug(f'Identified a {name}') if num > 1:
name += 's'
elif not num:
return
else:
num = 'a'
self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
def _extract_rss(self, url, video_id, doc): def _extract_rss(self, url, video_id, doc):
NS_MAP = { NS_MAP = {
@ -2854,8 +2863,7 @@ def _real_extract(self, url):
if not self.get_param('test', False) and not is_intentional: if not self.get_param('test', False) and not is_intentional:
force = self.get_param('force_generic_extractor', False) force = self.get_param('force_generic_extractor', False)
self.report_warning( self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))
'%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
first_bytes = full_response.read(512) first_bytes = full_response.read(512)
@ -2933,6 +2941,22 @@ def _real_extract(self, url):
self.report_detected('Camtasia video') self.report_detected('Camtasia video')
return camtasia_res return camtasia_res
info_dict.update({
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
'title': (self._og_search_title(webpage, default=None)
or self._html_extract_title(webpage, 'video title', default='video')),
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'age_limit': self._rta_search(webpage),
})
domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
# Sometimes embedded video player is hidden behind percent encoding # Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
# Unescaping the whole page allows to handle those cases in a generic way # Unescaping the whole page allows to handle those cases in a generic way
@ -2946,40 +2970,12 @@ def _real_extract(self, url):
r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
lambda x: unescapeHTML(x.group(0)), webpage) lambda x: unescapeHTML(x.group(0)), webpage)
# it's tempting to parse this further, but you would # TODO: Remove
# have to take into account all the variations like video_title, video_description, video_thumbnail, age_limit, video_uploader = \
# Video Title - Site Name info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
video_title = (self._og_search_title(webpage, default=None)
or self._html_extract_title(webpage, 'video title', default='video'))
# Try to detect age limit automatically # TODO: Move Embeds
age_limit = self._rta_search(webpage) self._downloader.write_debug('Looking for single embeds')
# And then there are the jokers who advertise that they use RTA,
# but actually don't.
AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
]
if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
age_limit = 18
# video uploader is domain name
video_uploader = self._search_regex(
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
video_description = self._og_search_description(webpage, default=None)
video_thumbnail = self._og_search_thumbnail(webpage, default=None)
info_dict.update({
'title': video_title,
'description': video_description,
'thumbnail': video_thumbnail,
'age_limit': age_limit,
})
self._downloader.write_debug('Looking for video embeds')
# Look for Brightcove Legacy Studio embeds # Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
@ -2998,7 +2994,7 @@ def _real_extract(self, url):
} }
# Look for Brightcove New Studio embeds # Look for Brightcove New Studio embeds
bc_urls = BrightcoveNewIE._extract_urls(self, webpage) bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
if bc_urls: if bc_urls:
return self.playlist_from_matches( return self.playlist_from_matches(
bc_urls, video_id, video_title, bc_urls, video_id, video_title,
@ -3246,7 +3242,7 @@ def _real_extract(self, url):
return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
# Look for embedded Spotify player # Look for embedded Spotify player
spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage) spotify_urls = SpotifyBaseIE._extract_urls(webpage)
if spotify_urls: if spotify_urls:
return self.playlist_from_matches(spotify_urls, video_id, video_title) return self.playlist_from_matches(spotify_urls, video_id, video_title)
@ -3837,6 +3833,30 @@ def _real_extract(self, url):
tiktok_urls = TikTokIE._extract_urls(webpage) tiktok_urls = TikTokIE._extract_urls(webpage)
if tiktok_urls: if tiktok_urls:
return self.playlist_from_matches(tiktok_urls, video_id, video_title) return self.playlist_from_matches(tiktok_urls, video_id, video_title)
# TODO: END: Move Embeds
self._downloader.write_debug('Looking for embeds')
embeds = []
for ie in gen_extractor_classes():
gen = ie.extract_from_webpage(self._downloader, url, webpage)
current_embeds = []
try:
while True:
current_embeds.append(next(gen))
except self.StopExtraction:
self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
embeds and 'discarding other embeds')
embeds = current_embeds
break
except StopIteration:
self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
embeds.extend(current_embeds)
del current_embeds
if len(embeds) == 1:
return {**info_dict, **embeds[0]}
elif embeds:
return self.playlist_result(embeds, **info_dict)
# Look for HTML5 media # Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
@ -4119,7 +4139,6 @@ def filter_video(urls):
entries.append(self.url_result(video_url, 'Youtube')) entries.append(self.url_result(video_url, 'Youtube'))
continue continue
# here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0] video_id = os.path.splitext(video_id)[0]
headers = { headers = {
'referer': full_response.geturl() 'referer': full_response.geturl()

View File

@ -98,7 +98,7 @@ def _extract_episode(self, episode, series):
} }
@classmethod @classmethod
def _extract_embed_urls(cls, webpage): def _extract_urls(cls, webpage):
return re.findall( return re.findall(
r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"', r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"',
webpage) webpage)

View File

@ -705,13 +705,13 @@ def sanitize_path(s, force=False):
return os.path.join(*sanitized_path) return os.path.join(*sanitized_path)
def sanitize_url(url): def sanitize_url(url, *, scheme='http'):
# Prepend protocol-less URLs with `http:` scheme in order to mitigate # Prepend protocol-less URLs with `http:` scheme in order to mitigate
# the number of unwanted failures due to missing protocol # the number of unwanted failures due to missing protocol
if url is None: if url is None:
return return
elif url.startswith('//'): elif url.startswith('//'):
return 'http:%s' % url return f'{scheme}:{url}'
# Fix some common typos seen so far # Fix some common typos seen so far
COMMON_TYPOS = ( COMMON_TYPOS = (
# https://github.com/ytdl-org/youtube-dl/issues/15649 # https://github.com/ytdl-org/youtube-dl/issues/15649