From f72218c1992d1eed446b3236a91e7613cec6039a Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 4 Nov 2022 19:38:38 +0530 Subject: [PATCH] [extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesn't seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel --- yt_dlp/extractor/bitchute.py | 113 +++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 52 deletions(-) diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index c9cbb6d1d..87d04468a 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -4,8 +4,12 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, - GeoRestrictedError, + HEADRequest, + clean_html, + get_element_by_class, + int_or_none, orderedSet, + traverse_obj, unified_strdate, urlencode_postdata, ) @@ -18,7 +22,7 @@ class BitChuteIE(InfoExtractor): 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/', 'md5': '7e427d7ed7af5a75b5855705ec750e2b', 'info_dict': { - 'id': 'szoMrox2JEI', + 'id': 'UGlrF9o9b-Q', 'ext': 'mp4', 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', @@ -26,6 +30,21 @@ class BitChuteIE(InfoExtractor): 'uploader': 'BitChute', 'upload_date': '20170103', }, + }, { + # video not downloadable in browser, but we can recover it + 'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/', + 'md5': '05c12397d5354bf24494885b08d24ed1', + 'info_dict': { + 'id': '2s6B3nZjAk7R', + 'ext': 'mp4', + 'filesize': 71537926, + 'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control', + 'description': 'md5:228ee93bd840a24938f536aeac9cf749', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'BitChute', + 'upload_date': '20181113', + }, + 'params': {'check_formats': None}, }, { 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', 'only_matching': True, @@ -34,67 +53,57 @@ 
class BitChuteIE(InfoExtractor): 'only_matching': True, }] + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', + 'Referer': 'https://www.bitchute.com/', + } + + def _check_format(self, video_url, video_id): + urls = orderedSet( + re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url) + for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')) + for url in urls: + try: + response = self._request_webpage( + HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS) + except ExtractorError as e: + self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}') + continue + return { + 'url': url, + 'filesize': int_or_none(response.headers.get('Content-Length')) + } + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', - }) + f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS) - title = self._html_search_regex( - (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'([^<]+)'), - webpage, 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', - default=None) or self._og_search_description(webpage) + publish_date = clean_html(get_element_by_class('video-publish-date', webpage)) + entries = self._parse_html5_media_entries(url, webpage, video_id) - format_urls = [] - for mobj in re.finditer( - r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - format_urls.append(mobj.group('url')) - format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) - - formats = [ - {'url': format_url} - for format_url in orderedSet(format_urls)] + formats = [] + for format_ in traverse_obj(entries, (0, 'formats', 
...)): + if self.get_param('check_formats') is not False: + format_.update(self._check_format(format_.pop('url'), video_id) or {}) + if 'url' not in format_: + continue + formats.append(format_) if not formats: - entries = self._parse_html5_media_entries( - url, webpage, video_id) - if not entries: - error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video') - if error == 'Video Unavailable': - raise GeoRestrictedError(error) - raise ExtractorError(error, expected=True) - formats = entries[0]['formats'] - - self._check_formats(formats, video_id) - if not formats: - raise self.raise_no_formats('Video is unavailable', expected=True, video_id=video_id) + self.raise_no_formats( + 'Video is unavailable. Please make sure this video is playable in the browser ' + 'before reporting this issue.', expected=True, video_id=video_id) self._sort_formats(formats) - description = self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail( - webpage, default=None) or self._html_search_meta( - 'twitter:image:src', webpage, 'thumbnail') - uploader = self._html_search_regex( - (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', - r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), - webpage, 'uploader', fatal=False) - - upload_date = unified_strdate(self._search_regex( - r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', - webpage, 'upload date', fatal=False)) - return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, + 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': clean_html(get_element_by_class('owner', 
webpage)), + 'upload_date': unified_strdate(self._search_regex( + r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)), 'formats': formats, }