From 7d3b98be4c4567b985ba7d7b17057e930457edc9 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 18 Aug 2022 20:57:46 +0000 Subject: [PATCH] [extractor/instagram] Fix extraction (#4696) Closes #4657, #4532, #4475 Authored by: bashonly, pritam20ps05 --- yt_dlp/extractor/instagram.py | 168 ++++++++++++++++++++-------------- 1 file changed, 97 insertions(+), 71 deletions(-) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 94db75640..1d8e79495 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -39,37 +39,42 @@ class InstagramBaseIE(InfoExtractor): _NETRC_MACHINE = 'instagram' _IS_LOGGED_IN = False + _API_BASE_URL = 'https://i.instagram.com/api/v1' + _LOGIN_URL = 'https://www.instagram.com/accounts/login' + _API_HEADERS = { + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'Origin': 'https://www.instagram.com', + 'Accept': '*/*', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36', + } + def _perform_login(self, username, password): if self._IS_LOGGED_IN: return login_webpage = self._download_webpage( - 'https://www.instagram.com/accounts/login/', None, - note='Downloading login webpage', errnote='Failed to download login webpage') + self._LOGIN_URL, None, note='Downloading login webpage', errnote='Failed to download login webpage') - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - login_webpage, 'shared data', default='{}'), - None) + shared_data = self._parse_json(self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', login_webpage, 'shared data', default='{}'), None) - login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ - 'Accept': '*/*', - 'X-IG-App-ID': '936619743392459', - 'X-ASBD-ID': '198387', - 'X-IG-WWW-Claim': '0', - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRFToken': shared_data['config']['csrf_token'], - 'X-Instagram-AJAX': shared_data['rollout_hash'], - 'Referer': 'https://www.instagram.com/', - }, data=urlencode_postdata({ - 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', - 'username': username, - 'queryParams': '{}', - 'optIntoOneTap': 'false', - 'stopDeletionNonce': '', - 'trustedDeviceRecords': '{}', - })) + login = self._download_json( + f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={ + **self._API_HEADERS, + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) if not login.get('authenticated'): if login.get('message'): @@ -134,7 +139,7 @@ def _extract_nodes(self, nodes, is_direct=False): } def _extract_product_media(self, product_media): - media_id = product_media.get('code') or product_media.get('id') + media_id = product_media.get('code') or _pk_to_id(product_media.get('pk')) vcodec = product_media.get('video_codec') dash_manifest_raw = product_media.get('video_dash_manifest') videos_list = product_media.get('video_versions') @@ -179,7 +184,7 @@ def _extract_product(self, product_info): user_info = product_info.get('user') or {} info_dict = { - 'id': product_info.get('code') or product_info.get('id'), + 'id': product_info.get('code') or _pk_to_id(product_info.get('pk')), 'title': product_info.get('title') or f'Video by {user_info.get("username")}', 'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none), 'timestamp': int_or_none(product_info.get('taken_at')), @@ -360,49 +365,74 @@ def _extract_embed_urls(cls, url, webpage): def _real_extract(self, url): video_id, url = self._match_valid_url(url).group('id', 'url') - general_info = self._download_json( - f'https://www.instagram.com/graphql/query/?query_hash=9f8827793ef34641b2fb195d4d41151c' - f'&variables=%7B"shortcode":"{video_id}",' - '"parent_comment_count":10,"has_threaded_comments":true}', video_id, fatal=False, errnote=False, - headers={ - 'Accept': '*', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', - 'Authority': 'www.instagram.com', - 'Referer': 'https://www.instagram.com', - 'x-ig-app-id': '936619743392459', - }) - media = traverse_obj(general_info, ('data', 'shortcode_media')) or {} + media, webpage = {}, '' + + api_check = self._download_json( + f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}', + video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {} + csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken') + + if not csrf_token: + self.report_warning('No csrf token set by Instagram API', video_id) + elif api_check.get('status') != 'ok': + self.report_warning('Instagram API is not granting access', video_id) + else: + if self._get_cookies(url).get('sessionid'): + media = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, + fatal=False, note='Downloading video info', headers={ + **self._API_HEADERS, + 'X-CSRFToken': csrf_token.value, + }), ('items', 0)) + if media: + return self._extract_product(media) + + variables = { + 'shortcode': video_id, + 'child_comment_count': 3, + 'fetch_comment_count': 40, + 'parent_comment_count': 24, + 'has_threaded_comments': True, + } + general_info = self._download_json( + 'https://www.instagram.com/graphql/query/', video_id, fatal=False, + headers={ + **self._API_HEADERS, + 'X-CSRFToken': csrf_token.value, + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }, query={ + 'query_hash': '9f8827793ef34641b2fb195d4d41151c', + 'variables': json.dumps(variables, separators=(',', ':')), + }) + media = traverse_obj(general_info, ('data', 'shortcode_media')) + if not media: - self.report_warning('General metadata extraction failed', video_id) + self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) + webpage, urlh = self._download_webpage_handle(url, video_id) + shared_data = self._search_json( + r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) - info = self._download_json( - f'https://i.instagram.com/api/v1/media/{_id_to_pk(video_id)}/info/', video_id, - fatal=False, note='Downloading video info', errnote=False, headers={ - 'Accept': '*', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', - 'Authority': 'www.instagram.com', - 'Referer': 'https://www.instagram.com', - 'x-ig-app-id': '936619743392459', - }) - if info: - media.update(info['items'][0]) - return self._extract_product(media) + if self._LOGIN_URL not in urlh.geturl(): + media.update(traverse_obj( + shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), + ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) + else: + self.report_warning('Main webpage is locked behind the login page. Retrying with embed webpage') + webpage = self._download_webpage( + f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) + additional_data = self._search_json( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*', webpage, 'additional data', video_id, fatal=False) + if not additional_data: + self.raise_login_required('Requested content was not found, the content might be private') - webpage = self._download_webpage( - f'https://www.instagram.com/p/{video_id}/embed/', video_id, - note='Downloading embed webpage', fatal=False) - if not webpage: - self.raise_login_required('Requested content was not found, the content might be private') + product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) + if product_item: + media.update(product_item) + return self._extract_product(media) - additional_data = self._search_json( - r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*', webpage, 'additional data', video_id, fatal=False) - product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) - if product_item: - media.update(product_item) - return self._extract_product(media) - - media.update(traverse_obj( - additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}) + media.update(traverse_obj( + additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}) username = traverse_obj(media, ('owner', 'username')) or self._search_regex( r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False) @@ -649,12 +679,8 @@ def _real_extract(self, url): story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' videos = traverse_obj(self._download_json( - f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', - story_id, errnote=False, fatal=False, headers={ - 'X-IG-App-ID': 936619743392459, - 'X-ASBD-ID': 198387, - 'X-IG-WWW-Claim': 0, - }), 'reels') + f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', + story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') if not videos: self.raise_login_required('You need to log in to access this content')