yt-dlp/yt_dlp/extractor/weibo.py
c-basalt 15b252dfd2
[ie/weibo] Fix extraction (#8463)
Closes #8445
Authored by: c-basalt
2023-11-11 20:02:59 +00:00

252 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import random
import itertools
import urllib.parse
from .common import InfoExtractor
from ..utils import (
int_or_none,
make_archive_id,
mimetype2ext,
parse_resolution,
str_or_none,
strip_jsonp,
traverse_obj,
url_or_none,
urlencode_postdata,
urljoin,
)
class WeiboBaseIE(InfoExtractor):
def _update_visitor_cookies(self, visitor_url, video_id):
headers = {'Referer': visitor_url}
chrome_ver = self._search_regex(
r'Chrome/(\d+)', self.get_param('http_headers')['User-Agent'], 'user agent version', default='90')
visitor_data = self._download_json(
'https://passport.weibo.com/visitor/genvisitor', video_id,
note='Generating first-visit guest request',
headers=headers, transform_source=strip_jsonp,
data=urlencode_postdata({
'cb': 'gen_callback',
'fp': json.dumps({
'os': '1',
'browser': f'Chrome{chrome_ver},0,0,0',
'fonts': 'undefined',
'screenInfo': '1920*1080*24',
'plugins': ''
}, separators=(',', ':'))}))['data']
self._download_webpage(
'https://passport.weibo.com/visitor/visitor', video_id,
note='Running first-visit callback to get guest cookies',
headers=headers, query={
'a': 'incarnate',
't': visitor_data['tid'],
'w': 3 if visitor_data.get('new_tid') else 2,
'c': f'{visitor_data.get("confidence", 100):03d}',
'gc': '',
'cb': 'cross_domain',
'from': 'weibo',
'_rand': random.random(),
})
def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
self._update_visitor_cookies(urlh.url, video_id)
webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
return self._parse_json(webpage, video_id, fatal=fatal)
def _extract_formats(self, video_info):
media_info = traverse_obj(video_info, ('page_info', 'media_info'))
formats = traverse_obj(media_info, (
'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
'url': 'url',
'format': ('quality_desc', {str}),
'format_id': ('label', {str}),
'ext': ('mime', {mimetype2ext}),
'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),
'vcodec': ('video_codecs', {str}),
'fps': ('fps', {int_or_none}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'filesize': ('size', {int_or_none}),
'acodec': ('audio_codecs', {str}),
'asr': ('audio_sample_rate', {int_or_none}),
'audio_channels': ('audio_channels', {int_or_none}),
}))
if not formats: # fallback, should be barely used
for url in set(traverse_obj(media_info, (..., {url_or_none}))):
if 'label=' in url: # filter out non-video urls
format_id, resolution = self._search_regex(
r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
group=(1, 2), default=(None, None))
formats.append({
'url': url,
'format_id': format_id,
**parse_resolution(resolution),
**traverse_obj(media_info, (
'video_details', lambda _, v: v['label'].startswith(format_id), {
'size': ('size', {int_or_none}),
'tbr': ('bitrate', {int_or_none}),
}
), get_all=False),
})
return formats
def _parse_video_info(self, video_info, video_id=None):
return {
'id': video_id,
'extractor_key': WeiboIE.ie_key(),
'extractor': WeiboIE.IE_NAME,
'formats': self._extract_formats(video_info),
'http_headers': {'Referer': 'https://weibo.com/'},
'_old_archive_ids': [make_archive_id('WeiboMobile', video_id)],
**traverse_obj(video_info, {
'id': (('id', 'id_str', 'mid'), {str_or_none}),
'display_id': ('mblogid', {str_or_none}),
'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
'description': ('text_raw', {str}),
'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
'thumbnail': ('page_info', 'page_pic', {url_or_none}),
'uploader': ('user', 'screen_name', {str}),
'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
'like_count': ('attitudes_count', {int_or_none}),
'repost_count': ('reposts_count', {int_or_none}),
}, get_all=False),
'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
}
class WeiboIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://weibo.com/7827771738/N4xlMvjhI',
'info_dict': {
'id': '4910815147462302',
'ext': 'mp4',
'display_id': 'N4xlMvjhI',
'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
'duration': 918,
'timestamp': 1686312819,
'upload_date': '20230609',
'thumbnail': r're:https://.*\.jpg',
'uploader': '睡前视频基地',
'uploader_id': '7827771738',
'uploader_url': 'https://weibo.com/u/7827771738',
'view_count': int,
'like_count': int,
'repost_count': int,
'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
},
}, {
'url': 'https://m.weibo.cn/status/4189191225395228',
'info_dict': {
'id': '4189191225395228',
'ext': 'mp4',
'display_id': 'FBqgOmDxO',
'title': '柴犬柴犬的秒拍视频',
'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
'duration': 53,
'timestamp': 1514264429,
'upload_date': '20171226',
'thumbnail': r're:https://.*\.jpg',
'uploader': '柴犬柴犬',
'uploader_id': '5926682210',
'uploader_url': 'https://weibo.com/u/5926682210',
'view_count': int,
'like_count': int,
'repost_count': int,
}
}, {
'url': 'https://weibo.com/0/4224132150961381',
'note': 'no playback_list example',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._parse_video_info(self._weibo_download_json(
f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id))
class WeiboVideoIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'
_TESTS = [{
'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
'info_dict': {
'id': '4797700463137878',
'ext': 'mp4',
'display_id': 'LEZDodaiW',
'title': '稍微了解了一下靡烟miya感觉这东西也太二了',
'description': '稍微了解了一下靡烟miya感觉这东西也太二了 http://t.cn/A6aerGsM ',
'duration': 76,
'timestamp': 1659344278,
'upload_date': '20220801',
'thumbnail': r're:https://.*\.jpg',
'uploader': '君子爱财陈平安',
'uploader_id': '3905382233',
'uploader_url': 'https://weibo.com/u/3905382233',
'view_count': int,
'like_count': int,
'repost_count': int,
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()
video_info = self._weibo_download_json(
f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}',
video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo']
return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE)
class WeiboUserIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'
_TESTS = [{
'url': 'https://weibo.com/u/2066652961?tabtype=video',
'info_dict': {
'id': '2066652961',
'title': '萧影殿下的视频',
'description': '萧影殿下的全部视频',
'uploader': '萧影殿下',
},
'playlist_mincount': 195,
}]
def _fetch_page(self, uid, cursor=0, page=1):
return self._weibo_download_json(
'https://weibo.com/ajax/profile/getWaterFallContent',
uid, note=f'Downloading videos page {page}',
query={'uid': uid, 'cursor': cursor})['data']
def _entries(self, uid, first_page):
cursor = 0
for page in itertools.count(1):
response = first_page if page == 1 else self._fetch_page(uid, cursor, page)
for video_info in traverse_obj(response, ('list', ..., {dict})):
yield self._parse_video_info(video_info)
cursor = response.get('next_cursor')
if (int_or_none(cursor) or -1) < 0:
break
def _real_extract(self, url):
uid = self._match_id(url)
first_page = self._fetch_page(uid)
uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)
metainfo = {
'title': f'{uploader}的视频',
'description': f'{uploader}的全部视频',
'uploader': uploader,
} if uploader else {}
return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)