[extractor/bilibili] Add space.bilibili extractors (#4468)

Authored by: lockmatrix
This commit is contained in:
Locke 2022-09-16 23:59:02 +08:00 committed by GitHub
parent 2314b4d89f
commit 2b9d02167f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 119 additions and 29 deletions

View File

@ -190,7 +190,9 @@
BilibiliAudioIE, BilibiliAudioIE,
BilibiliAudioAlbumIE, BilibiliAudioAlbumIE,
BiliBiliPlayerIE, BiliBiliPlayerIE,
BilibiliChannelIE, BilibiliSpaceVideoIE,
BilibiliSpaceAudioIE,
BilibiliSpacePlaylistIE,
BiliIntlIE, BiliIntlIE,
BiliIntlSeriesIE, BiliIntlSeriesIE,
BiliLiveIE, BiliLiveIE,

View File

@ -2,8 +2,8 @@
import hashlib import hashlib
import itertools import itertools
import functools import functools
import re
import math import math
import re
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from ..compat import ( from ..compat import (
@ -13,23 +13,24 @@
) )
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
InAdvancePagedList,
OnDemandPagedList,
filter_dict, filter_dict,
int_or_none,
float_or_none, float_or_none,
int_or_none,
mimetype2ext, mimetype2ext,
parse_count,
parse_iso8601, parse_iso8601,
qualities, qualities,
traverse_obj,
parse_count,
smuggle_url, smuggle_url,
srt_subtitles_timecode, srt_subtitles_timecode,
str_or_none, str_or_none,
strip_jsonp, strip_jsonp,
traverse_obj,
unified_timestamp, unified_timestamp,
unsmuggle_url, unsmuggle_url,
urlencode_postdata, urlencode_postdata,
url_or_none, url_or_none,
OnDemandPagedList
) )
@ -505,39 +506,126 @@ def _real_extract(self, url):
season_info.get('bangumi_title'), season_info.get('evaluate')) season_info.get('bangumi_title'), season_info.get('evaluate'))
class BilibiliSpaceBaseIE(InfoExtractor):
    # Shared pagination plumbing for the space.bilibili.com playlist extractors.

    def _extract_playlist(self, fetch_page, get_metadata, get_entries):
        """Build a lazily paged list of entries from caller-supplied callbacks.

        fetch_page(page_number) is called with 1-based page numbers (the first
        page is requested as 1) and returns the data for that page.
        get_metadata(first_page) returns a dict that contains at least
        'page_count' and 'page_size'.
        get_entries(page_data) yields the playlist entries of a single page.

        Returns a (metadata, paged_list) tuple.
        """
        first_page = fetch_page(1)
        metadata = get_metadata(first_page)

        def page_entries(idx):
            # InAdvancePagedList passes 0-based page indices, whereas fetch_page
            # expects 1-based page numbers. Index 0 reuses the page that was
            # already downloaded for the metadata instead of requesting it again.
            return get_entries(fetch_page(idx + 1) if idx else first_page)

        paged_list = InAdvancePagedList(
            page_entries, metadata['page_count'], metadata['page_size'])

        return metadata, paged_list
class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE):
    # Matches both the bare channel URL and its explicit "/video" tab.
    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)(?P<video>/video)?/?(?:[?#]|$)'
    _TESTS = [{
        'url': 'https://space.bilibili.com/3985676/video',
        'info_dict': {
            'id': '3985676',
        },
        'playlist_mincount': 178,
    }]

    def _real_extract(self, url):
        """Return a playlist of all videos uploaded by the matched user id."""
        url_match = self._match_valid_url(url)
        playlist_id = url_match.group('id')
        is_video_url = url_match.group('video')

        if not is_video_url:
            # A bare channel URL is ambiguous, so tell the user what they get.
            self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. '
                           'To download audios, add a "/audio" to the URL')

        def fetch_page(page_idx):
            # One page of the uploader's video listing.
            response = self._download_json(
                'https://api.bilibili.com/x/space/arc/search', playlist_id,
                note=f'Downloading page {page_idx}',
                query={'mid': playlist_id, 'pn': page_idx, 'jsonp': 'jsonp'})
            return response['data']

        def get_metadata(page_data):
            # The API reports the total entry count; derive the page count from it.
            page_info = page_data['page']
            page_size = page_info['ps']
            entry_count = page_info['count']
            return {
                'page_count': math.ceil(entry_count / page_size),
                'page_size': page_size,
            }

        def get_entries(page_data):
            # A missing video list is treated as an empty page.
            videos = traverse_obj(page_data, ('list', 'vlist')) or []
            for video in videos:
                yield self.url_result(
                    f'https://www.bilibili.com/video/{video["bvid"]}', BiliBiliIE, video['bvid'])

        _, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
        return self.playlist_result(paged_list, playlist_id)
class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE):
    # Playlist extractor for the "/audio" tab of a space.bilibili.com user page.
    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio'
    _TESTS = [{
        'url': 'https://space.bilibili.com/3985676/audio',
        'info_dict': {
            'id': '3985676',
        },
        'playlist_mincount': 1,
    }]

    def _real_extract(self, url):
        """Return a playlist of all audio tracks uploaded by the matched user id."""
        playlist_id = self._match_id(url)

        def fetch_page(page_idx):
            # One page (30 songs requested) of the uploader's audio listing.
            return self._download_json(
                'https://api.bilibili.com/audio/music-service/web/song/upper', playlist_id,
                note=f'Downloading page {page_idx}',
                query={'uid': playlist_id, 'pn': page_idx, 'ps': 30, 'order': 1, 'jsonp': 'jsonp'})['data']

        def get_metadata(page_data):
            # This endpoint reports the page count directly.
            return {
                'page_count': page_data['pageCount'],
                'page_size': page_data['pageSize'],
            }

        def get_entries(page_data):
            # `or []` also covers an explicit null "data" field, which
            # .get('data', []) would pass through as None and fail to iterate.
            for entry in page_data.get('data') or []:
                yield self.url_result(
                    f'https://www.bilibili.com/audio/au{entry["id"]}', BilibiliAudioIE, entry['id'])

        metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
        return self.playlist_result(paged_list, playlist_id)
class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
    # Playlist extractor for a user's "collectiondetail" season pages.
    # The dot after "space" is escaped so that only the real host name matches.
    _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)'
    _TESTS = [{
        'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',
        'info_dict': {
            'id': '2142762_57445',
            'title': '《底特律 变人》'
        },
        'playlist_mincount': 31,
    }]

    def _real_extract(self, url):
        """Return a titled playlist of the videos in the matched collection."""
        mid, sid = self._match_valid_url(url).group('mid', 'sid')
        # The playlist id combines the user id and the season id from the URL.
        playlist_id = f'{mid}_{sid}'

        def fetch_page(page_idx):
            # One page (30 archives requested) of the collection listing.
            return self._download_json(
                'https://api.bilibili.com/x/polymer/space/seasons_archives_list',
                playlist_id, note=f'Downloading page {page_idx}',
                query={'mid': mid, 'season_id': sid, 'page_num': page_idx, 'page_size': 30})['data']

        def get_metadata(page_data):
            # The API reports the total entry count; derive the page count from it.
            page_size = page_data['page']['page_size']
            entry_count = page_data['page']['total']
            return {
                'page_count': math.ceil(entry_count / page_size),
                'page_size': page_size,
                'title': traverse_obj(page_data, ('meta', 'name'))
            }

        def get_entries(page_data):
            # `or []` also covers an explicit null "archives" field, which
            # .get('archives', []) would pass through as None and fail to iterate.
            for entry in page_data.get('archives') or []:
                yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}',
                                      BiliBiliIE, entry['bvid'])

        metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
        return self.playlist_result(paged_list, playlist_id, metadata['title'])
class BilibiliCategoryIE(InfoExtractor): class BilibiliCategoryIE(InfoExtractor):