From d3d81cc98f554d0adb87d24bfd6fabaaa803944d Mon Sep 17 00:00:00 2001 From: ringus1 Date: Tue, 5 Sep 2023 22:35:23 +0200 Subject: [PATCH] [ie/facebook] Fix webpage extraction (#7890) Closes #7901 Authored by: ringus1 --- yt_dlp/extractor/facebook.py | 41 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 4fd17b574..c30a6b06a 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -74,6 +74,22 @@ class FacebookIE(InfoExtractor): _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ + 'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/', + 'info_dict': { + 'id': '3676516585958356', + 'ext': 'mp4', + 'title': 'dr Adam Przygoda', + 'description': 'md5:34675bda53336b1d16400265c2bb9b3b', + 'uploader': 'RADIO KICKS FM', + 'upload_date': '20230818', + 'timestamp': 1692346159, + 'thumbnail': r're:^https?://.*', + 'uploader_id': '100063551323670', + 'duration': 3132.184, + 'view_count': int, + 'concurrent_view_count': 0, + }, + }, { 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'md5': '6a40d33c0eccbb1af76cf0485a052659', 'info_dict': { @@ -97,7 +113,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl', + 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', 'duration': 131.03, 'concurrent_view_count': int, }, @@ -179,7 +195,7 @@ class FacebookIE(InfoExtractor): 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', - 'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl', + 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl', 'concurrent_view_count': int, 'thumbnail': r're:^https?://.*', 'view_count': int, @@ -274,7 +290,7 @@ class FacebookIE(InfoExtractor): 'title': 'Josef', 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, - 'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl', + 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', 'timestamp': 1549275572, 'duration': 3.413, 'uploader': 'Josef Novak', @@ -401,9 +417,9 @@ def _extract_from_url(self, url, video_id): def extract_metadata(webpage): post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( - r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] + r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)] post = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] + ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) @@ -493,14 +509,14 @@ def process_formats(info): def extract_relay_data(_filter): return self._parse_json(self._search_regex( - r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, + r'data-sjs>({.*?%s.*?})' % _filter, webpage, 'replay data', default='{}'), video_id, fatal=False) or {} def extract_relay_prefetched_data(_filter): - replay_data = extract_relay_data(_filter) - for require in (replay_data.get('require') or []): - if require[0] == 'RelayPrefetchedStreamCache': - return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + return traverse_obj(extract_relay_data(_filter), ( + 'require', (None, (..., ..., ..., '__bbox', 'require')), + lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ..., + '__bbox', 'result', 'data', {dict}), get_all=False) or {} if not video_data: server_js_data = self._parse_json(self._search_regex([ @@ -511,7 +527,7 @@ def extract_relay_prefetched_data(_filter): if not video_data: data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)') if data: entries = [] @@ -526,7 +542,8 @@ def parse_graphql_video(video): formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), - ('playable_url_dash', '')): + ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), + ('browser_native_sd_url', 'sd')): playable_url = video.get(key) if not playable_url: continue