Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2024-06-01 19:08:14 +02:00)

Compare commits: fa57a99ba0 ... 2945d26039

13 commits:
2945d26039
8e15177b41
dd9ad97b1f
61b17437dc
dd74aa0bca
e5e91ad05d
7308dc895c
1f719e1934
a8edca98f5
827560f2b9
5db908bebf
e2243c2033
960b8931c6
README.md
@@ -666,7 +666,7 @@ ## Filesystem Options:
                                     The name of the browser to load cookies
                                     from. Currently supported browsers are:
                                     brave, chrome, chromium, edge, firefox,
-                                    opera, safari, vivaldi. Optionally, the
+                                    opera, safari, vivaldi, whale. Optionally, the
                                     KEYRING used for decrypting Chromium cookies
                                     on Linux, the name/path of the PROFILE to
                                     load cookies from, and the CONTAINER name

yt_dlp/cookies.py
@@ -46,7 +46,7 @@
 from .utils._utils import _YDLLogger
 from .utils.networking import normalize_url

-CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
+CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'}
 SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}

@@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'),
             'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'),
             'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'),
+            'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'),
         }[browser_name]

     elif sys.platform == 'darwin':

@@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(appdata, 'Microsoft Edge'),
             'opera': os.path.join(appdata, 'com.operasoftware.Opera'),
             'vivaldi': os.path.join(appdata, 'Vivaldi'),
+            'whale': os.path.join(appdata, 'Naver/Whale'),
         }[browser_name]

     else:

@@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(config, 'microsoft-edge'),
             'opera': os.path.join(config, 'opera'),
             'vivaldi': os.path.join(config, 'vivaldi'),
+            'whale': os.path.join(config, 'naver-whale'),
         }[browser_name]

     # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE:

@@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name):
         'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium',
         'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium',
         'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome',
+        'whale': 'Whale',
     }[browser_name]

     browsers_without_profiles = {'opera'}

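Taken together, the README and cookies.py hunks register Naver Whale as a Chromium-based browser: it joins CHROMIUM_BASED_BROWSERS (and therefore SUPPORTED_BROWSERS), gets a user-data directory on each platform, and uses the keyring name 'Whale' everywhere. A minimal sketch of how the added mapping resolves, with the paths copied from the hunks above; the appdata/config lookups here are simplified stand-ins for the ones cookies.py actually computes:

    import os
    import sys

    def whale_user_data_dir():
        # Per-platform locations copied from the diff above
        if sys.platform in ('win32', 'cygwin'):
            appdata_local = os.path.expandvars('%LOCALAPPDATA%')  # stand-in lookup
            return os.path.join(appdata_local, R'Naver\Naver Whale\User Data')
        if sys.platform == 'darwin':
            appdata = os.path.expanduser('~/Library/Application Support')  # stand-in lookup
            return os.path.join(appdata, 'Naver/Whale')
        # everything else falls back to the XDG config directory
        config = os.environ.get('XDG_CONFIG_HOME') or os.path.expanduser('~/.config')
        return os.path.join(config, 'naver-whale')

    print(whale_user_data_dir())

Since the README text and SUPPORTED_BROWSERS derive from the same set, --cookies-from-browser whale should be accepted once these hunks land.
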
yt_dlp/extractor/common.py
@@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
         if urlh is False:
             assert not fatal
             return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
+                                             encoding=encoding, data=data)
         return (content, urlh)

     @staticmethod

@@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
             'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
             expected=True)

-    def _request_dump_filename(self, url, video_id):
-        basen = f'{video_id}_{url}'
+    def _request_dump_filename(self, url, video_id, data=None):
+        if data is not None:
+            data = hashlib.md5(data).hexdigest()
+        basen = join_nonempty(video_id, data, url, delim='_')
         trim_length = self.get_param('trim_file_name') or 240
         if len(basen) > trim_length:
             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()

@@ -1028,16 +1031,18 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
         except LookupError:
             return webpage_bytes.decode('utf-8', 'replace')

-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
+                              prefix=None, encoding=None, data=None):
         webpage_bytes = urlh.read()
         if prefix is not None:
             webpage_bytes = prefix + webpage_bytes
+        url_or_request = self._create_request(url_or_request, data)
         if self.get_param('dump_intermediate_pages', False):
             self.to_screen('Dumping request to ' + urlh.url)
             dump = base64.b64encode(webpage_bytes).decode('ascii')
             self._downloader.to_screen(dump)
         if self.get_param('write_pages'):
-            filename = self._request_dump_filename(urlh.url, video_id)
+            filename = self._request_dump_filename(urlh.url, video_id, url_or_request.data)
             self.to_screen(f'Saving request to {filename}')
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)

@@ -1098,7 +1103,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
                              impersonate=None, require_impersonation=False):
             if self.get_param('load_pages'):
                 url_or_request = self._create_request(url_or_request, data, headers, query)
-                filename = self._request_dump_filename(url_or_request.url, video_id)
+                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                 self.to_screen(f'Loading request from {filename}')
                 try:
                     with open(filename, 'rb') as dumpf:

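These common.py hunks thread the request body (data) through to _request_dump_filename, so --write-pages and --load-pages can tell apart POST requests to the same URL: the payload is reduced to an md5 digest and joined into the dump filename. A self-contained sketch of the naming scheme; join_nonempty is re-implemented inline, and the tail after the trim is a guess since the hunk cuts off there:

    import hashlib

    def join_nonempty(*values, delim='_'):
        # Inline stand-in for yt_dlp.utils.join_nonempty: drop falsy parts
        return delim.join(str(v) for v in values if v)

    def request_dump_filename(url, video_id, data=None, trim_length=240):
        # POST payloads are hashed so that two requests to the same URL
        # with different bodies dump to different files
        if data is not None:
            data = hashlib.md5(data).hexdigest()
        basen = join_nonempty(video_id, data, url, delim='_')
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h  # assumed continuation
        return basen + '.dump'  # hypothetical tail; the real method also sanitises

    # Same URL, same video_id, different payloads -> different dump files
    print(request_dump_filename('https://example.com/api', 'xyz', b'{"page": 1}'))
    print(request_dump_filename('https://example.com/api', 'xyz', b'{"page": 2}'))
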
yt_dlp/extractor/radiofrance.py
@@ -1,6 +1,5 @@
 import itertools
 import re
-import urllib.parse

 from .common import InfoExtractor
 from ..utils import (

@@ -237,7 +236,8 @@ def _real_extract(self, url):

         if substation_id:
             webpage = self._download_webpage(url, station_id)
-            api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
+            api_response = self._search_json(r'webradioLive:\s*', webpage, station_id, substation_id,
+                                             transform_source=js_to_json)
         else:
             api_response = self._download_json(
                 f'https://www.radiofrance.fr/{station_id}/api/live', station_id)

@@ -267,10 +267,10 @@ def _real_extract(self, url):
 class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
     """Subclasses must set _METADATA_KEY"""

-    def _call_api(self, content_id, cursor, page_num):
+    def _call_api(self, station, content_id, cursor):
         raise NotImplementedError('This method must be implemented by subclasses')

-    def _generate_playlist_entries(self, content_id, content_response):
+    def _generate_playlist_entries(self, station, content_id, content_response):
         for page_num in itertools.count(2):
             for entry in content_response['items']:
                 yield self.url_result(

@@ -281,28 +281,30 @@ def _generate_playlist_entries(self, content_id, content_response):
                         'thumbnail': ('visual', 'src'),
                     }))

-            next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
-            if not next_cursor:
+            if not content_response['next']:
                 break

-            content_response = self._call_api(content_id, next_cursor, page_num)
+            content_response = self._call_api(station, content_id, content_response['next'])

     def _real_extract(self, url):
-        display_id = self._match_id(url)
+        playlist_id = self._match_id(url)
+        # If it is a podcast playlist, get the name of the station it is on;
+        # profile page playlists are not attached to a station currently
+        station = self._match_valid_url(url).group('station') if isinstance(self, RadioFrancePodcastIE) else None

-        metadata = self._download_json(
-            'https://www.radiofrance.fr/api/v2.1/path', display_id,
-            query={'value': urllib.parse.urlparse(url).path})['content']
-
-        content_id = metadata['id']
+        # Get data for the first page, and the uuid for the playlist
+        metadata = self._call_api(station, playlist_id, 1)
+        uuid = traverse_obj(metadata, ('metadata', 'id'))

         return self.playlist_result(
-            self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
-            display_id=display_id, **{**traverse_obj(metadata, {
+            self._generate_playlist_entries(station, playlist_id, metadata),
+            uuid,
+            display_id=playlist_id,
+            **{**traverse_obj(metadata['metadata'], {
                 'title': 'title',
                 'description': 'standFirst',
                 'thumbnail': ('visual', 'src'),
-            }), **traverse_obj(metadata, {
+            }), **traverse_obj(metadata['metadata'], {
                 'title': 'name',
                 'description': 'role',
             })})

@@ -311,7 +313,7 @@ def _real_extract(self, url):
 class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
     _VALID_URL = rf'''(?x)
         {RadioFranceBaseIE._VALID_URL_BASE}
-        /(?:{RadioFranceBaseIE._STATIONS_RE})
+        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
         /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
     '''

@@ -326,15 +328,15 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
         },
         'playlist_mincount': 11,
     }, {
-        'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
+        'url': 'https://www.radiofrance.fr/franceinter/podcasts/avec-la-langue',
         'info_dict': {
-            'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
-            'display_id': 'jean-marie-le-pen-l-obsession-nationale',
-            'title': 'Jean-Marie Le Pen, l\'obsession nationale',
-            'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
+            'id': '53a95989-7c61-48c7-873c-6a71009101bb',
+            'display_id': 'avec-la-langue',
+            'title': 'Avec la langue',
+            'description': 'md5:4ddb6d4ed46dbbdee611b8e16e4af868',
             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
         },
-        'playlist_count': 7,
+        'playlist_mincount': 36,
     }, {
         'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
         'info_dict': {

@@ -349,7 +351,7 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
             'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
             'display_id': 'certains-l-aiment-fip',
             'title': 'Certains l’aiment Fip',
-            'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
+            'description': 'md5:7c373cdcec7a024f12fa34de7612e44e',
             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
         },
         'playlist_mincount': 321,

@@ -363,10 +365,27 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
     _METADATA_KEY = 'expressions'

-    def _call_api(self, podcast_id, cursor, page_num):
-        return self._download_json(
-            f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
-            note=f'Downloading page {page_num}', query={'pageCursor': cursor})
+    def _call_api(self, station, podcast_id, cursor):
+        # The data is stored in the last <script> tag on the page
+        url = 'https://www.radiofrance.fr/' + station + '/podcasts/' + podcast_id + '?p=' + str(cursor)
+        webpage = self._download_webpage(url, podcast_id, note=f'Downloading {podcast_id} page {cursor}')
+
+        resp = dict()
+
+        # _search_json cannot parse the data as it contains javascript;
+        # therefore, parse the array of episode objects separately
+        resp['items'] = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
+                                          contains_pattern=r'\[.+\]', transform_source=js_to_json)
+
+        # the pagination data is stored in a javascript object 'a'
+        lastPage = int(re.search(r'a\.lastPage\s*=\s*(\d+);', webpage).group(1))
+        hasMorePages = cursor < lastPage
+        resp['next'] = cursor + 1 if hasMorePages else None
+
+        resp['metadata'] = self._search_json(r'content:\s*', webpage, podcast_id, podcast_id,
+                                             transform_source=js_to_json)
+
+        return resp


 class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):

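After this rewrite, every _call_api implementation returns a plain dict with 'items' (the episodes on the current page), 'next' (the next page number, or None) and 'metadata', and _generate_playlist_entries simply follows 'next' until it runs out. A toy sketch of that contract, with a fake fetcher standing in for the webpage scraping above:

    def generate_entries(call_api, content_id):
        # Toy version of the paging loop in _generate_playlist_entries:
        # consume 'items', then follow 'next' until it is None
        response = call_api(content_id, cursor=1)
        while True:
            yield from response['items']
            if not response['next']:
                break
            response = call_api(content_id, cursor=response['next'])

    def fake_call_api(content_id, cursor, last_page=3):
        # Stand-in for RadioFrancePodcastIE._call_api: three pages, one item each
        return {
            'items': [f'{content_id}-episode-{cursor}'],
            'next': cursor + 1 if cursor < last_page else None,
            'metadata': {'id': 'some-uuid', 'title': content_id},
        }

    print(list(generate_entries(fake_call_api, 'avec-la-langue')))
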
@@ -380,7 +399,7 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
             'title': 'Thomas Pesquet',
             'description': 'Astronaute à l\'agence spatiale européenne',
         },
-        'playlist_mincount': 212,
+        'playlist_mincount': 100,
     }, {
         'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
         'info_dict': {

@@ -398,15 +417,43 @@
     _METADATA_KEY = 'documents'

-    def _call_api(self, profile_id, cursor, page_num):
-        resp = self._download_json(
-            f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
-            note=f'Downloading page {page_num}', query={
-                'relation': 'personality',
-                'cursor': cursor,
-            })
+    def _call_api(self, station, profile_id, cursor):
+        url = 'https://www.radiofrance.fr/personnes/' + profile_id + '?p=' + str(cursor)
+        webpage = self._download_webpage(url, profile_id, note=f'Downloading {profile_id} page {cursor}')
+
+        resp = dict()
+
+        # On profile pages, the data is stored in a javascript array in the final <script>.
+        # Each episode is stored as a[0] = { id: ... }; a[1] = { id: ... }; (from page 2 onwards).
+        # If a page has a thumbnail, the 'a' variable contains the image data,
+        # and episode data is stored in b[0]...
+        resp['items'] = []
+        podcastindex = 0
+        nextmatch = True
+        while nextmatch:
+            nextmatch = self._search_json(r'\w+\[' + str(podcastindex) + r'\]\s*=\s*', webpage, profile_id,
+                                          profile_id, transform_source=js_to_json, fatal=False, default=None)
+            podcastindex += 1
+            if nextmatch is not None:
+                resp['items'].append(nextmatch)
+
+        # There is more than one pagination key in the final <script>;
+        # pick the pagination object that is within a documents object
+        pagedata = self._search_json(r'documents\s*:\s*', webpage, profile_id, profile_id,
+                                     transform_source=js_to_json)
+        lastPage = traverse_obj(pagedata, ('pagination', 'lastPage'))
+        hasMorePages = cursor < lastPage
+        resp['next'] = cursor + 1 if hasMorePages else None
+
+        resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,
+                                             transform_source=js_to_json)
+        # If the image data is stored separately rather than in the main content area
+        if resp['metadata']['visual'] and isinstance(resp['metadata']['visual'], str):
+            imagedata = dict()
+            imagedata['src'] = self._og_search_thumbnail(webpage)
+            resp['metadata']['visual'] = imagedata

-        resp['next'] = traverse_obj(resp, ('pagination', 'next'))
         return resp

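The profile variant scans numbered assignments (a[0] = {...}; a[1] = {...}; ...) in the page's final <script> until the probe stops matching. A rough sketch of that scan using plain re/json on an invented script snippet; in the extractor, _search_json with js_to_json does this far more tolerantly:

    import json
    import re

    # Invented stand-in for the tail of a profile page's last <script> tag
    script = 'a[0] = {"id": "ep-one"}; a[1] = {"id": "ep-two"}; a.lastPage = 4;'

    items = []
    index = 0
    while True:
        # Same idea as the r'\w+\[N\]\s*=\s*' probe in the diff above
        m = re.search(r'\w+\[%d\]\s*=\s*(\{.*?\})\s*;' % index, script)
        if not m:
            break
        items.append(json.loads(m.group(1)))
        index += 1

    last_page = int(re.search(r'\w+\.lastPage\s*=\s*(\d+);', script).group(1))
    print(items, last_page)  # [{'id': 'ep-one'}, {'id': 'ep-two'}] 4
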
yt_dlp/extractor/youtube.py
@@ -3317,7 +3317,36 @@ def _extract_heatmap(self, data):
             'value': ('intensityScoreNormalized', {float_or_none}),
         })) or None

-    def _extract_comment(self, comment_renderer, parent=None):
+    def _extract_comment(self, entities, parent=None):
+        comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
+        if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))):
+            return
+
+        toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
+        time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
+
+        return {
+            'id': comment_id,
+            'parent': parent or 'root',
+            **traverse_obj(comment_entity_payload, {
+                'text': ('properties', 'content', 'content', {str}),
+                'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
+                'author_id': ('author', 'channelId', {self.ucid_or_none}),
+                'author': ('author', 'displayName', {str}),
+                'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
+                'author_is_uploader': ('author', 'isCreator', {bool}),
+                'author_is_verified': ('author', 'isVerified', {bool}),
+                'author_url': ('author', 'channelCommand', 'innertubeCommand', (
+                    ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url')
+                ), {lambda x: urljoin('https://www.youtube.com', x)}),
+            }, get_all=False),
+            'is_favorited': (None if toolbar_entity_payload is None else
+                             toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'),
+            '_time_text': time_text,  # FIXME: non-standard, but we need a way of showing that it is an estimate.
+            'timestamp': self._parse_time_text(time_text),
+        }
+
+    def _extract_comment_old(self, comment_renderer, parent=None):
         comment_id = comment_renderer.get('commentId')
         if not comment_id:
             return

@@ -3398,21 +3427,39 @@ def extract_header(contents):
                     break
             return _continuation

-        def extract_thread(contents):
+        def extract_thread(contents, entity_payloads):
             if not parent:
                 tracker['current_page_thread'] = 0
             for content in contents:
                 if not parent and tracker['total_parent_comments'] >= max_parents:
                     yield
                 comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
-                comment_renderer = get_first(
-                    (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
-                    expected_type=dict, default={})
-
-                comment = self._extract_comment(comment_renderer, parent)
+
+                # old comment format
+                if not entity_payloads:
+                    comment_renderer = get_first(
+                        (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
+                        expected_type=dict, default={})
+
+                    comment = self._extract_comment_old(comment_renderer, parent)
+
+                # new comment format
+                else:
+                    view_model = (
+                        traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
+                        or traverse_obj(content, ('commentViewModel', {dict})))
+                    comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
+                    if not comment_keys:
+                        continue
+                    entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
+                    comment = self._extract_comment(entities, parent)
+                    if comment:
+                        comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None
+
                 if not comment:
                     continue
                 comment_id = comment['id']

                 if comment.get('is_pinned'):
                     tracker['pinned_comment_ids'].add(comment_id)
                 # Sometimes YouTube may break and give us infinite looping comments.

@@ -3505,7 +3552,7 @@ def extract_thread(contents):
             check_get_keys = None
             if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
                 check_get_keys = [[*continuation_items_path, ..., (
-                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
+                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
             try:
                 response = self._extract_response(
                     item_id=None, query=continuation,

@@ -3529,6 +3576,7 @@ def extract_thread(contents):
                     raise
             is_forced_continuation = False
             continuation = None
+            mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
             for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
                 if is_first_continuation:
                     continuation = extract_header(continuation_items)

@@ -3537,7 +3585,7 @@ def extract_thread(contents):
                         break
                     continue

-                for entry in extract_thread(continuation_items):
+                for entry in extract_thread(continuation_items, mutations):
                     if not entry:
                         return
                     yield entry

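In the new comment format, the fields themselves live in frameworkUpdates.entityBatchUpdate.mutations entity payloads, and each commentViewModel only carries commentKey/toolbarStateKey references into that list; that is why extract_thread now receives the mutations and _extract_comment receives the matched entities. A toy sketch of the key matching, with invented sample payloads shaped like the paths the diff traverses:

    # Invented sample mutations, shaped like the paths the diff traverses
    mutations = [
        {'entityKey': 'key-comment-1',
         'payload': {'commentEntityPayload': {
             'properties': {'commentId': 'UgxAbc', 'publishedTime': '2 days ago',
                            'content': {'content': 'Nice video!'}},
             'author': {'displayName': 'someone'}}}},
        {'entityKey': 'key-toolbar-1',
         'payload': {'engagementToolbarStateEntityPayload': {
             'heartState': 'TOOLBAR_HEART_STATE_HEARTED'}}},
    ]
    view_model = {'commentKey': 'key-comment-1', 'toolbarStateKey': 'key-toolbar-1'}

    # Same selection logic as the diff: keep the mutations whose entityKey
    # matches either key on the view model, then read the two payloads
    comment_keys = [view_model['commentKey'], view_model['toolbarStateKey']]
    entities = [m for m in mutations if m['entityKey'] in comment_keys]

    comment = next(e['payload']['commentEntityPayload'] for e in entities
                   if 'commentEntityPayload' in e['payload'])
    toolbar = next(e['payload']['engagementToolbarStateEntityPayload'] for e in entities
                   if 'engagementToolbarStateEntityPayload' in e['payload'])

    print(comment['properties']['commentId'],
          toolbar['heartState'] == 'TOOLBAR_HEART_STATE_HEARTED')  # UgxAbc True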