From c61473c1d617a4d5432248815f22dcb46906acaf Mon Sep 17 00:00:00 2001 From: MMM Date: Wed, 9 Nov 2022 04:30:15 +0100 Subject: [PATCH] [extractor/bitchute] Improve `BitChuteChannelIE` (#5066) Authored by: flashdagger, pukkandan --- yt_dlp/extractor/bitchute.py | 138 ++++++++++++++++++++++++----------- yt_dlp/utils.py | 2 + 2 files changed, 99 insertions(+), 41 deletions(-) diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index 87d04468a..f4b6a9a0e 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -1,14 +1,18 @@ -import itertools +import functools import re from .common import InfoExtractor from ..utils import ( ExtractorError, HEADRequest, + OnDemandPagedList, clean_html, get_element_by_class, + get_elements_html_by_class, int_or_none, orderedSet, + parse_count, + parse_duration, traverse_obj, unified_strdate, urlencode_postdata, @@ -109,51 +113,103 @@ def _real_extract(self, url): class BitChuteChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://www.bitchute.com/channel/victoriaxrave/', - 'playlist_mincount': 185, + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.bitchute.com/channel/bitchute/', 'info_dict': { - 'id': 'victoriaxrave', + 'id': 'bitchute', + 'title': 'BitChute', + 'description': 'md5:5329fb3866125afa9446835594a9b138', }, - } + 'playlist': [ + { + 'md5': '7e427d7ed7af5a75b5855705ec750e2b', + 'info_dict': { + 'id': 'UGlrF9o9b-Q', + 'ext': 'mp4', + 'filesize': None, + 'title': 'This is the first video on #BitChute !', + 'description': 'md5:a0337e7b1fe39e32336974af8173a034', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'BitChute', + 'upload_date': '20170103', + 'duration': 16, + 'view_count': int, + }, + } + ], + 'params': { + 'skip_download': True, + 'playlist_items': '-1', + }, + }, { + 'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/', 
+ 'playlist_mincount': 20, + 'info_dict': { + 'id': 'wV9Imujxasw9', + 'title': 'Bruce MacDonald and "The Light of Darkness"', + 'description': 'md5:04913227d2714af1d36d804aa2ab6b1e', + } + }] _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + PAGE_SIZE = 25 + HTML_CLASS_NAMES = { + 'channel': { + 'container': 'channel-videos-container', + 'title': 'channel-videos-title', + 'description': 'channel-videos-text', + }, + 'playlist': { + 'container': 'playlist-video', + 'title': 'title', + 'description': 'description', + } - def _entries(self, channel_id): - channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id - offset = 0 - for page_num in itertools.count(1): - data = self._download_json( - '%sextend/' % channel_url, channel_id, - 'Downloading channel page %d' % page_num, - data=urlencode_postdata({ - 'csrfmiddlewaretoken': self._TOKEN, - 'name': '', - 'offset': offset, - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'Referer': channel_url, - 'X-Requested-With': 'XMLHttpRequest', - 'Cookie': 'csrftoken=%s' % self._TOKEN, - }) - if data.get('success') is False: - break - html = data.get('html') - if not html: - break - video_ids = re.findall( - r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', - html) - if not video_ids: - break - offset += len(video_ids) - for video_id in video_ids: - yield self.url_result( - 'https://www.bitchute.com/video/%s' % video_id, - ie=BitChuteIE.ie_key(), video_id=video_id) + } + + @staticmethod + def _make_url(playlist_id, playlist_type): + return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/' + + def _fetch_page(self, playlist_id, playlist_type, page_num): + playlist_url = self._make_url(playlist_id, playlist_type) + data = self._download_json( + f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}', + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': page_num 
* self.PAGE_SIZE, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': playlist_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': f'csrftoken={self._TOKEN}', + }) + if not data.get('success'): + return + classes = self.HTML_CLASS_NAMES[playlist_type] + for video_html in get_elements_html_by_class(classes['container'], data.get('html')): + video_id = self._search_regex( + r'<a\b[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None) + if not video_id: + continue + yield self.url_result( + f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True, + title=clean_html(get_element_by_class(classes['title'], video_html)), + description=clean_html(get_element_by_class(classes['description'], video_html)), + duration=parse_duration(get_element_by_class('video-duration', video_html)), + view_count=parse_count(clean_html(get_element_by_class('video-views', video_html)))) def _real_extract(self, url): - channel_id = self._match_id(url) + playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id') + webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id) + + page_func = functools.partial(self._fetch_page, playlist_id, playlist_type) return self.playlist_result( - self._entries(channel_id), playlist_id=channel_id) + OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id, + title=self._html_extract_title(webpage, default=None), + description=self._html_search_meta( + ('description', 'og:description', 'twitter:description'), webpage, default=None), + playlist_count=int_or_none(self._html_search_regex( + r'(\d+)\s+videos?', webpage, 'playlist count', default=None))) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index d0513496e..b7e7cb7d7 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -418,6 +418,8 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w Return the text (content) and the html (whole) 
of the tag with the specified attribute in the passed HTML document """ + if not value: + return quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'