[ie/nebula] Overhaul extractors (#8566)

Closes #4300, Closes #5814, Closes #7588, Closes #6334, Closes #6538
Authored by: elyse0, pukkandan, seproDev

Co-authored-by: Elyse <26639800+elyse0@users.noreply.github.com>
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>
This commit is contained in:
sepro 2023-11-20 02:03:33 +01:00 committed by GitHub
parent 3237f8ba29
commit 45d82be65f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 336 additions and 232 deletions

View File

@ -214,8 +214,9 @@ def sanitize(key, value):
test_info_dict = { test_info_dict = {
key: sanitize(key, value) for key, value in got_dict.items() key: sanitize(key, value) for key, value in got_dict.items()
if value is not None and key not in IGNORED_FIELDS and not any( if value is not None and key not in IGNORED_FIELDS and (
key.startswith(f'{prefix}_') for prefix in IGNORED_PREFIXES) not any(key.startswith(f'{prefix}_') for prefix in IGNORED_PREFIXES)
or key == '_old_archive_ids')
} }
# display_id may be generated from id # display_id may be generated from id

View File

@ -1247,6 +1247,7 @@
from .ndtv import NDTVIE from .ndtv import NDTVIE
from .nebula import ( from .nebula import (
NebulaIE, NebulaIE,
NebulaClassIE,
NebulaSubscriptionsIE, NebulaSubscriptionsIE,
NebulaChannelIE, NebulaChannelIE,
) )

View File

@ -3,140 +3,140 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..networking.exceptions import HTTPError from ..networking.exceptions import HTTPError
from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start from ..utils import (
ExtractorError,
int_or_none,
make_archive_id,
parse_iso8601,
smuggle_url,
try_call,
unsmuggle_url,
update_url_query,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' _BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
class NebulaBaseIE(InfoExtractor): class NebulaBaseIE(InfoExtractor):
_NETRC_MACHINE = 'watchnebula' _NETRC_MACHINE = 'watchnebula'
_token = _api_token = None
_nebula_api_token = None def _perform_login(self, username, password):
_nebula_bearer_token = None
def _perform_nebula_auth(self, username, password):
if not username or not password:
self.raise_login_required(method='password')
data = json.dumps({'email': username, 'password': password}).encode('utf8')
response = self._download_json(
'https://api.watchnebula.com/api/v1/auth/login/',
data=data, fatal=False, video_id=None,
headers={
'content-type': 'application/json',
# Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
'cookie': ''
},
note='Logging in to Nebula with supplied credentials',
errnote='Authentication failed or rejected')
if not response or not response.get('key'):
self.raise_login_required(method='password')
return response['key']
def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
assert method in ('GET', 'POST',)
assert auth_type in ('api', 'bearer',)
def inner_call():
authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
return self._download_json(
url, video_id, note=note, headers={'Authorization': authorization},
data=b'' if method == 'POST' else None)
try: try:
return inner_call() response = self._download_json(
except ExtractorError as exc: 'https://nebula.tv/auth/login/', None,
# if 401 or 403, attempt credential re-auth and retry 'Logging in to Nebula', 'Login failed',
if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.status in (401, 403): data=json.dumps({'email': username, 'password': password}).encode(),
self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') headers={'content-type': 'application/json'})
self._perform_login() except ExtractorError as e:
return inner_call() if isinstance(e.cause, HTTPError) and e.cause.status == 400:
else: raise ExtractorError('Login failed: Invalid username or password', expected=True)
raise
self._api_token = traverse_obj(response, ('key', {str}))
if not self._api_token:
raise ExtractorError('Login failed: No token')
def _call_api(self, *args, **kwargs):
if self._token:
kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
try:
return self._download_json(*args, **kwargs)
except ExtractorError as e:
if not isinstance(e.cause, HTTPError) or e.cause.status not in (401, 403):
raise
self.to_screen(
f'Reauthorizing with Nebula and retrying, because last API call resulted in error {e.cause.status}')
self._real_initialize()
if self._token:
kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
return self._download_json(*args, **kwargs)
def _real_initialize(self):
if not self._api_token:
self._api_token = try_call(
lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value)
self._token = self._download_json(
'https://users.api.nebula.app/api/v1/authorization/', None,
headers={'Authorization': f'Token {self._api_token}'} if self._api_token else None,
note='Authorizing to Nebula', data=b'')['token']
def _extract_formats(self, content_id, slug):
for retry in (False, True):
try:
fmts, subs = self._extract_m3u8_formats_and_subtitles(
f'https://content.api.nebula.app/{content_id.split(":")[0]}s/{content_id}/manifest.m3u8',
slug, 'mp4', query={
'token': self._token,
'app_version': '23.10.0',
'platform': 'ios',
})
return {'formats': fmts, 'subtitles': subs}
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
self.raise_login_required()
if not retry and isinstance(e.cause, HTTPError) and e.cause.status == 403:
self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error')
self._real_initialize()
continue
raise raise
def _fetch_nebula_bearer_token(self): def _extract_video_metadata(self, episode):
""" channel_url = traverse_obj(
Get a Bearer token for the Nebula API. This will be required to fetch video meta data. episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False)
"""
response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
method='POST',
note='Authorizing to Nebula')
return response['token']
def _fetch_video_formats(self, slug):
stream_info = self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/stream/',
video_id=slug,
auth_type='bearer',
note='Fetching video stream info')
manifest_url = stream_info['manifest']
return self._extract_m3u8_formats_and_subtitles(manifest_url, slug, 'mp4')
def _build_video_info(self, episode):
fmts, subs = self._fetch_video_formats(episode['slug'])
channel_slug = episode['channel_slug']
channel_title = episode['channel_title']
zype_id = episode.get('zype_id')
return { return {
'id': remove_start(episode['id'], 'video_episode:'), 'id': episode['id'].partition(':')[2],
'display_id': episode['slug'], **traverse_obj(episode, {
'formats': fmts, 'display_id': 'slug',
'subtitles': subs, 'title': 'title',
'webpage_url': f'https://nebula.tv/{episode["slug"]}', 'description': 'description',
'title': episode['title'], 'timestamp': ('published_at', {parse_iso8601}),
'description': episode['description'], 'duration': ('duration', {int_or_none}),
'timestamp': parse_iso8601(episode['published_at']), 'channel_id': 'channel_slug',
'thumbnails': [{ 'uploader_id': 'channel_slug',
# 'id': tn.get('name'), # this appears to be null 'channel': 'channel_title',
'url': tn['original'], 'uploader': 'channel_title',
'height': key, 'series': 'channel_title',
} for key, tn in episode['assets']['thumbnail'].items()], 'creator': 'channel_title',
'duration': episode['duration'], 'thumbnail': ('images', 'thumbnail', 'src', {url_or_none}),
'channel': channel_title, 'episode_number': ('order', {int_or_none}),
'channel_id': channel_slug, # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE
'channel_url': f'https://nebula.tv/{channel_slug}', '_old_archive_ids': ('zype_id', {lambda x: [
'uploader': channel_title, make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}),
'uploader_id': channel_slug, }),
'uploader_url': f'https://nebula.tv/{channel_slug}', 'channel_url': channel_url,
'series': channel_title, 'uploader_url': channel_url,
'creator': channel_title,
'extractor_key': NebulaIE.ie_key(),
'extractor': NebulaIE.IE_NAME,
'_old_archive_ids': [make_archive_id(NebulaIE, zype_id)] if zype_id else None,
} }
def _perform_login(self, username=None, password=None):
self._nebula_api_token = self._perform_nebula_auth(username, password)
self._nebula_bearer_token = self._fetch_nebula_bearer_token()
class NebulaIE(NebulaBaseIE): class NebulaIE(NebulaBaseIE):
_VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)' _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
_TESTS = [ _TESTS = [{
{
'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
'md5': '14944cfee8c7beeea106320c47560efc',
'info_dict': { 'info_dict': {
'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',
'ext': 'mp4', 'ext': 'mp4',
'title': 'That Time Disney Remade Beauty and the Beast', 'title': 'That Time Disney Remade Beauty and the Beast',
'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We werent able to remove it without reducing video quality, so its presented here in its original context.', 'description': 'md5:2aae3c4cfc5ee09a1ecdff0909618cf4',
'upload_date': '20180731', 'upload_date': '20180731',
'timestamp': 1533009600, 'timestamp': 1533009600,
'channel': 'Lindsay Ellis', 'channel': 'Lindsay Ellis',
'channel_id': 'lindsayellis', 'channel_id': 'lindsayellis',
'uploader': 'Lindsay Ellis', 'uploader': 'Lindsay Ellis',
'uploader_id': 'lindsayellis', 'uploader_id': 'lindsayellis',
'uploader_url': 'https://nebula.tv/lindsayellis', 'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis',
'series': 'Lindsay Ellis', 'series': 'Lindsay Ellis',
'display_id': 'that-time-disney-remade-beauty-and-the-beast', 'display_id': 'that-time-disney-remade-beauty-and-the-beast',
'channel_url': 'https://nebula.tv/lindsayellis', 'channel_url': r're:https://nebula\.(tv|app)/lindsayellis',
'creator': 'Lindsay Ellis', 'creator': 'Lindsay Ellis',
'duration': 2212, 'duration': 2212,
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
'_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'],
}, },
}, 'params': {'skip_download': 'm3u8'},
{ }, {
'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
'md5': 'd05739cf6c38c09322422f696b569c23', 'md5': 'd05739cf6c38c09322422f696b569c23',
'info_dict': { 'info_dict': {
@ -156,10 +156,11 @@ class NebulaIE(NebulaBaseIE):
'duration': 841, 'duration': 841,
'channel_url': 'https://nebula.tv/d-day', 'channel_url': 'https://nebula.tv/d-day',
'uploader_url': 'https://nebula.tv/d-day', 'uploader_url': 'https://nebula.tv/d-day',
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
'_old_archive_ids': ['nebula 5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'],
}, },
}, 'params': {'skip_download': 'm3u8'},
{ }, {
'url': 'https://nebula.tv/videos/money-episode-1-the-draw', 'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
'md5': 'ebe28a7ad822b9ee172387d860487868', 'md5': 'ebe28a7ad822b9ee172387d860487868',
'info_dict': { 'info_dict': {
@ -178,55 +179,130 @@ class NebulaIE(NebulaBaseIE):
'channel_url': 'https://nebula.tv/tom-scott-presents-money', 'channel_url': 'https://nebula.tv/tom-scott-presents-money',
'series': 'Tom Scott Presents: Money', 'series': 'Tom Scott Presents: Money',
'display_id': 'money-episode-1-the-draw', 'display_id': 'money-episode-1-the-draw',
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
'creator': 'Tom Scott Presents: Money', 'creator': 'Tom Scott Presents: Money',
'_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'],
}, },
}, 'params': {'skip_download': 'm3u8'},
{ }, {
'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
'info_dict': {
'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d',
'ext': 'mp4',
'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
'title': 'Did the US Really Blow Up the NordStream Pipelines?',
'description': 'md5:b4e2a14e3ff08f546a3209c75261e789',
'upload_date': '20230223',
'timestamp': 1677144070,
'channel': 'TLDR News EU',
'channel_id': 'tldrnewseu',
'uploader': 'TLDR News EU',
'uploader_id': 'tldrnewseu',
'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu',
'duration': 524,
'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu',
'series': 'TLDR News EU',
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
'creator': 'TLDR News EU',
'_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'],
}, },
{ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw', 'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
'only_matching': True, 'only_matching': True,
}, }]
]
def _fetch_video_metadata(self, slug):
return self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/',
video_id=slug,
auth_type='bearer',
note='Fetching video meta data')
def _real_extract(self, url): def _real_extract(self, url):
slug = self._match_id(url) slug = self._match_id(url)
video = self._fetch_video_metadata(slug) url, smuggled_data = unsmuggle_url(url, {})
return self._build_video_info(video) if smuggled_data.get('id'):
return {
'id': smuggled_data['id'],
'display_id': slug,
'title': '',
**self._extract_formats(smuggled_data['id'], slug),
}
metadata = self._call_api(
f'https://content.api.nebula.app/content/videos/{slug}',
slug, note='Fetching video metadata')
return {
**self._extract_video_metadata(metadata),
**self._extract_formats(metadata['id'], slug),
}
class NebulaClassIE(NebulaBaseIE):
IE_NAME = 'nebula:class'
_VALID_URL = rf'{_BASE_URL_RE}/(?P<id>[-\w]+)/(?P<ep>\d+)'
_TESTS = [{
'url': 'https://nebula.tv/copyright-for-fun-and-profit/14',
'info_dict': {
'id': 'd7432cdc-c608-474d-942c-f74345daed7b',
'ext': 'mp4',
'display_id': '14',
'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit',
'episode_number': 14,
'thumbnail': 'https://dj423fildxgac.cloudfront.net/d533718d-9307-42d4-8fb0-e283285e99c9',
'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit',
'duration': 646,
'episode': 'Episode 14',
'title': 'Photos, Sculpture, and Video',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
slug, episode = self._match_valid_url(url).group('id', 'ep')
url, smuggled_data = unsmuggle_url(url, {})
if smuggled_data.get('id'):
return {
'id': smuggled_data['id'],
'display_id': slug,
'title': '',
**self._extract_formats(smuggled_data['id'], slug),
}
metadata = self._call_api(
f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons',
slug, note='Fetching video metadata')
return {
**self._extract_video_metadata(metadata),
**self._extract_formats(metadata['id'], slug),
}
class NebulaSubscriptionsIE(NebulaBaseIE): class NebulaSubscriptionsIE(NebulaBaseIE):
IE_NAME = 'nebula:subscriptions' IE_NAME = 'nebula:subscriptions'
_VALID_URL = rf'{_BASE_URL_RE}/myshows' _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)'
_TESTS = [ _TESTS = [{
{
'url': 'https://nebula.tv/myshows', 'url': 'https://nebula.tv/myshows',
'playlist_mincount': 1, 'playlist_mincount': 1,
'info_dict': { 'info_dict': {
'id': 'myshows', 'id': 'myshows',
}, },
}, }]
]
def _generate_playlist_entries(self): def _generate_playlist_entries(self):
next_url = 'https://content.watchnebula.com/library/video/?page_size=100' next_url = update_url_query('https://content.api.nebula.app/video_episodes/', {
page_num = 1 'following': 'true',
while next_url: 'include': 'engagement',
channel = self._call_nebula_api(next_url, 'myshows', auth_type='bearer', 'ordering': '-published_at',
note=f'Retrieving subscriptions page {page_num}') })
for page_num in itertools.count(1):
channel = self._call_api(
next_url, 'myshows', note=f'Retrieving subscriptions page {page_num}')
for episode in channel['results']: for episode in channel['results']:
yield self._build_video_info(episode) metadata = self._extract_video_metadata(episode)
next_url = channel['next'] yield self.url_result(smuggle_url(
page_num += 1 f'https://nebula.tv/videos/{metadata["display_id"]}',
{'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
next_url = channel.get('next')
if not next_url:
return
def _real_extract(self, url): def _real_extract(self, url):
return self.playlist_result(self._generate_playlist_entries(), 'myshows') return self.playlist_result(self._generate_playlist_entries(), 'myshows')
@ -234,9 +310,8 @@ def _real_extract(self, url):
class NebulaChannelIE(NebulaBaseIE): class NebulaChannelIE(NebulaBaseIE):
IE_NAME = 'nebula:channel' IE_NAME = 'nebula:channel'
_VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)' _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos/)(?P<id>[-\w]+)/?(?:$|[?#])'
_TESTS = [ _TESTS = [{
{
'url': 'https://nebula.tv/tom-scott-presents-money', 'url': 'https://nebula.tv/tom-scott-presents-money',
'info_dict': { 'info_dict': {
'id': 'tom-scott-presents-money', 'id': 'tom-scott-presents-money',
@ -252,30 +327,57 @@ class NebulaChannelIE(NebulaBaseIE):
'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
}, },
'playlist_mincount': 2, 'playlist_mincount': 2,
}, {
'url': 'https://nebula.tv/johnnyharris',
'info_dict': {
'id': 'johnnyharris',
'title': 'Johnny Harris',
'description': 'I make videos about maps and many other things.',
}, },
] 'playlist_mincount': 90,
}, {
'url': 'https://nebula.tv/copyright-for-fun-and-profit',
'info_dict': {
'id': 'copyright-for-fun-and-profit',
'title': 'Copyright for Fun and Profit',
'description': 'md5:6690248223eed044a9f11cd5a24f9742',
},
'playlist_count': 23,
}]
def _generate_playlist_entries(self, collection_id, channel): def _generate_playlist_entries(self, collection_id, collection_slug):
episodes = channel['episodes']['results'] next_url = f'https://content.api.nebula.app/video_channels/{collection_id}/video_episodes/?ordering=-published_at'
for page_num in itertools.count(2): for page_num in itertools.count(1):
for episode in episodes: episodes = self._call_api(next_url, collection_slug, note=f'Retrieving channel page {page_num}')
yield self._build_video_info(episode) for episode in episodes['results']:
next_url = channel['episodes']['next'] metadata = self._extract_video_metadata(episode)
yield self.url_result(smuggle_url(
episode.get('share_url') or f'https://nebula.tv/videos/{metadata["display_id"]}',
{'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
next_url = episodes.get('next')
if not next_url: if not next_url:
break break
channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer',
note=f'Retrieving channel page {page_num}') def _generate_class_entries(self, channel):
episodes = channel['episodes']['results'] for lesson in channel['lessons']:
metadata = self._extract_video_metadata(lesson)
yield self.url_result(smuggle_url(
lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}',
{'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata)
def _real_extract(self, url): def _real_extract(self, url):
collection_id = self._match_id(url) collection_slug = self._match_id(url)
channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' channel = self._call_api(
channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') f'https://content.api.nebula.app/content/{collection_slug}/?include=lessons',
channel_details = channel['details'] collection_slug, note='Retrieving channel')
if channel.get('type') == 'class':
entries = self._generate_class_entries(channel)
else:
entries = self._generate_playlist_entries(channel['id'], collection_slug)
return self.playlist_result( return self.playlist_result(
entries=self._generate_playlist_entries(collection_id, channel), entries=entries,
playlist_id=collection_id, playlist_id=collection_slug,
playlist_title=channel_details['title'], playlist_title=channel.get('title'),
playlist_description=channel_details['description'] playlist_description=channel.get('description'))
)