2021-10-31 06:08:04 +01:00
# coding: utf-8
2014-02-10 20:24:12 +01:00
2018-03-21 17:43:03 +01:00
import itertools
2018-04-10 20:51:57 +02:00
import hashlib
2018-03-15 14:33:36 +01:00
import json
2013-07-01 21:08:54 +02:00
import re
2021-10-22 02:53:45 +02:00
import time
2013-07-01 21:08:54 +02:00
from . common import InfoExtractor
2018-04-17 17:37:50 +02:00
from . . compat import (
compat_HTTPError ,
)
2015-06-07 19:46:33 +02:00
from . . utils import (
2018-04-17 17:37:50 +02:00
ExtractorError ,
2021-04-01 10:28:33 +02:00
float_or_none ,
2016-03-24 09:29:33 +01:00
get_element_by_attribute ,
2015-06-07 19:46:33 +02:00
int_or_none ,
2016-03-24 09:30:01 +01:00
lowercase_escape ,
2018-04-17 17:37:50 +02:00
std_headers ,
2021-12-23 23:13:10 +01:00
str_to_int ,
2021-11-24 13:52:42 +01:00
traverse_obj ,
2018-07-21 14:08:28 +02:00
url_or_none ,
2021-10-22 02:53:45 +02:00
urlencode_postdata ,
2015-06-07 19:46:33 +02:00
)
2013-07-01 21:08:54 +02:00
2014-02-10 20:24:12 +01:00
2021-10-31 06:08:04 +01:00
class InstagramBaseIE(InfoExtractor):
    _NETRC_MACHINE = 'instagram'
    # Shared across all Instagram extractor subclasses so the login request
    # is issued at most once per process (set on the base class below).
    _IS_LOGGED_IN = False

    def _login(self):
        """Log in to Instagram via its AJAX login endpoint.

        No-op when no credentials are configured or when a previous
        extractor instance in this process already logged in.
        Raises ExtractorError (with the server message when available)
        if authentication fails.
        """
        username, password = self._get_login_info()
        if username is None or self._IS_LOGGED_IN:
            return

        login_webpage = self._download_webpage(
            'https://www.instagram.com/accounts/login/', None,
            note='Downloading login webpage', errnote='Failed to download login webpage')

        # window._sharedData carries the CSRF token and rollout hash that
        # must be echoed back in the login request headers below.
        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                login_webpage, 'shared data', default='{}'),
            None)

        login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={
            'Accept': '*/*',
            'X-IG-App-ID': '936619743392459',
            'X-ASBD-ID': '198387',
            'X-IG-WWW-Claim': '0',
            'X-Requested-With': 'XMLHttpRequest',
            'X-CSRFToken': shared_data['config']['csrf_token'],
            'X-Instagram-AJAX': shared_data['rollout_hash'],
            'Referer': 'https://www.instagram.com/',
        }, data=urlencode_postdata({
            # Instagram's browser password envelope, version 0: the plain
            # password prefixed with a timestamp (no actual encryption).
            'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
            'username': username,
            'queryParams': '{}',
            'optIntoOneTap': 'false',
            'stopDeletionNonce': '',
            'trustedDeviceRecords': '{}',
        }))

        if not login.get('authenticated'):
            # Branch order matters: prefer the server-provided message,
            # then distinguish wrong password (truthy 'user') from an
            # unknown username ('user' explicitly False).
            if login.get('message'):
                raise ExtractorError(f'Unable to login: {login["message"]}')
            elif login.get('user'):
                raise ExtractorError('Unable to login: Sorry, your password was incorrect. Please double-check your password.', expected=True)
            elif login.get('user') is False:
                raise ExtractorError('Unable to login: The username you entered doesn\'t belong to an account. Please check your username and try again.', expected=True)
            raise ExtractorError('Unable to login')
        # Set on the base class (not self) so sibling extractors skip re-login.
        InstagramBaseIE._IS_LOGGED_IN = True

    def _real_initialize(self):
        self._login()

    def _get_count(self, media, kind, *keys):
        """Return an integer count from ``media``.

        Tries ``media[kind]['count']`` first, then each
        ``media['edge_media_<key>']['count']`` fallback in order.
        """
        return traverse_obj(
            media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys),
            expected_type=int_or_none)

    def _get_dimension(self, name, media, webpage=None):
        # Prefer the GraphQL 'dimensions' object; fall back to the
        # og:video:<name> / video:<name> meta tags when a webpage is given.
        return (
            traverse_obj(media, ('dimensions', name), expected_type=int_or_none)
            or int_or_none(self._html_search_meta(
                (f'og:video:{name}', f'video:{name}'), webpage or '', default=None)))

    def _extract_nodes(self, nodes, is_direct=False):
        """Yield info dicts for the video nodes in ``nodes``.

        With ``is_direct`` true, emits directly playable entries (url,
        dimensions, Referer header); otherwise emits url-type entries
        delegated back to the Instagram extractor. Non-video nodes and
        (in the indirect case) nodes without a shortcode are skipped.
        """
        for idx, node in enumerate(nodes, start=1):
            if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
                continue

            video_id = node.get('shortcode')

            if is_direct:
                info = {
                    'id': video_id or node['id'],
                    'url': node.get('video_url'),
                    'width': self._get_dimension('width', node),
                    'height': self._get_dimension('height', node),
                    'http_headers': {
                        'Referer': 'https://www.instagram.com/',
                    }
                }
            elif not video_id:
                continue
            else:
                info = {
                    '_type': 'url',
                    'ie_key': 'Instagram',
                    'id': video_id,
                    'url': f'https://instagram.com/p/{video_id}',
                }

            yield {
                **info,
                'title': node.get('title') or (f'Video {idx}' if is_direct else None),
                'description': traverse_obj(
                    node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str),
                'thumbnail': traverse_obj(
                    node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none),
                'duration': float_or_none(node.get('video_duration')),
                'timestamp': int_or_none(node.get('taken_at_timestamp')),
                'view_count': int_or_none(node.get('video_view_count')),
                'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
                'like_count': self._get_count(node, 'likes', 'preview_like'),
            }
2021-10-31 06:08:04 +01:00
2021-11-05 22:31:34 +01:00
class InstagramIOSIE(InfoExtractor):
    IE_DESC = 'IOS instagram:// URL'
    _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)'
    _TESTS = [{
        'url': 'instagram://media?id=482584233761418119',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y  F O R  A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'add_ie': ['Instagram']
    }]

    def _get_id(self, id):
        """Convert a numeric media id into the shortcode used in web post URLs.

        Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id
        """
        alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
        # Ignore anything after an underscore (e.g. a trailing user-id part).
        remaining = int(id.split('_')[0])
        digits = []
        # Repeated divmod by 64 yields the base-64 digits least-significant
        # first; reverse at the end to get the shortcode.
        while remaining > 0:
            remaining, digit = divmod(remaining, 64)
            digits.append(alphabet[digit])
        return ''.join(reversed(digits))

    def _real_extract(self, url):
        # Translate the app-scheme URL into a web URL and hand it off to
        # the main Instagram extractor.
        shortcode = self._get_id(self._match_id(url))
        return {
            '_type': 'url_transparent',
            'url': f'http://instagram.com/tv/{shortcode}/',
            'ie_key': 'Instagram',
        }
2021-10-31 06:08:04 +01:00
class InstagramIE(InstagramBaseIE):
    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
        'md5': '0d2da106a9d2631273e192b372806516',
        'info_dict': {
            'id': 'aye83DjauH',
            'ext': 'mp4',
            'title': 'Video by naomipq',
            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1371748545,
            'upload_date': '20130620',
            'uploader_id': 'naomipq',
            'uploader': 'B E A U T Y  F O R  A S H E S',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
    }, {
        # missing description
        'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
        'info_dict': {
            'id': 'BA-pQFBG8HZ',
            'ext': 'mp4',
            'title': 'Video by britneyspears',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 0,
            'timestamp': 1453760977,
            'upload_date': '20160125',
            'uploader_id': 'britneyspears',
            'uploader': 'Britney Spears',
            'like_count': int,
            'comment_count': int,
            'comments': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # multi video post
        'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
        'playlist': [{
            'info_dict': {
                'id': 'BQ0dSaohpPW',
                'ext': 'mp4',
                'title': 'Video 1',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dTpOhuHT',
                'ext': 'mp4',
                'title': 'Video 2',
            },
        }, {
            'info_dict': {
                'id': 'BQ0dT7RBFeF',
                'ext': 'mp4',
                'title': 'Video 3',
            },
        }],
        'info_dict': {
            'id': 'BQ0eAlwhDrw',
            'title': 'Post by instagram',
            'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
        },
    }, {
        # IGTV
        'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
        'info_dict': {
            'id': 'BkfuX9UB-eK',
            'ext': 'mp4',
            'title': 'Fingerboarding Tricks with @cass.fb',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 53.83,
            'timestamp': 1530032919,
            'upload_date': '20180626',
            'uploader_id': 'instagram',
            'uploader': 'Instagram',
            'like_count': int,
            'comment_count': int,
            'comments': list,
            'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
        }
    }, {
        'url': 'https://instagram.com/p/-Cmh1cukG2/',
        'only_matching': True,
    }, {
        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/tv/aye83DjauH/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
        'only_matching': True,
    }, {
        'url': 'https://www.instagram.com/marvelskies.fc/reel/CWqAgUZgCku/',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_embed_url(webpage):
        """Find an Instagram URL embedded in an arbitrary webpage.

        Returns the iframe embed URL when present, otherwise the link
        inside an 'instagram-media' blockquote, otherwise None.
        """
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
            webpage)
        if mobj:
            return mobj.group('url')

        blockquote_el = get_element_by_attribute(
            'class', 'instagram-media', webpage)
        if blockquote_el is None:
            return

        mobj = re.search(
            r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
        if mobj:
            return mobj.group('link')

    def _real_extract(self, url):
        video_id, url = self._match_valid_url(url).group('id', 'url')
        webpage, urlh = self._download_webpage_handle(url, video_id)
        # A redirect to the login page means the post page itself is gated;
        # the /embed/ page often still exposes (reduced) metadata.
        if 'www.instagram.com/accounts/login' in urlh.geturl():
            self.report_warning('Main webpage is locked behind the login page. '
                                'Retrying with embed webpage (Note that some metadata might be missing)')
            webpage = self._download_webpage(
                'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage')

        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                webpage, 'shared data', default='{}'),
            video_id, fatal=False)

        # Media metadata: try the newer graphql location first, then the
        # legacy 'media' key.
        media = traverse_obj(
            shared_data,
            ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
            ('entry_data', 'PostPage', 0, 'media'),
            expected_type=dict)

        # _sharedData.entry_data.PostPage is empty when authenticated (see
        # https://github.com/ytdl-org/youtube-dl/pull/22880)
        if not media:
            additional_data = self._parse_json(
                self._search_regex(
                    r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
                    webpage, 'additional data', default='{}'),
                video_id, fatal=False)
            media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}

        # No metadata anywhere and we were bounced to the login page:
        # the content genuinely requires an account.
        if not media and 'www.instagram.com/accounts/login' in urlh.geturl():
            self.raise_login_required('You need to log in to access this content')

        uploader_id = traverse_obj(media, ('owner', 'username')) or self._search_regex(
            r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'uploader id', fatal=False)

        description = (
            traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str)
            or media.get('caption'))
        if not description:
            # Last resort: scrape the raw JSON text; unescape \uXXXX sequences.
            description = self._search_regex(
                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
            if description is not None:
                description = lowercase_escape(description)

        video_url = media.get('video_url')
        if not video_url:
            # Sidecar children present => multi-video post: return a playlist.
            nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or []
            if nodes:
                return self.playlist_result(
                    self._extract_nodes(nodes, True), video_id,
                    'Post by %s' % uploader_id if uploader_id else None, description)

            video_url = self._og_search_video_url(webpage, secure=False)

        formats = [{
            'url': video_url,
            'width': self._get_dimension('width', media, webpage),
            'height': self._get_dimension('height', media, webpage),
        }]

        # Some posts additionally expose an inline DASH manifest.
        dash = traverse_obj(media, ('dash_info', 'video_dash_manifest'))
        if dash:
            formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
        self._sort_formats(formats)

        comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))
        comments = [{
            'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
            'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
            'id': traverse_obj(comment_dict, ('node', 'id')),
            'text': traverse_obj(comment_dict, ('node', 'text')),
            'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
        } for comment_dict in comment_data] if comment_data else None

        # Thumbnail candidates, in order of preference: the sized
        # display_resources list, bare display_src/display_url keys,
        # then the og:image meta tag.
        display_resources = (
            media.get('display_resources')
            or [{'src': media.get(key)} for key in ('display_src', 'display_url')]
            or [{'src': self._og_search_thumbnail(webpage)}])
        thumbnails = [{
            'url': thumbnail['src'],
            'width': thumbnail.get('config_width'),
            'height': thumbnail.get('config_height'),
        } for thumbnail in display_resources if thumbnail.get('src')]

        return {
            'id': video_id,
            'formats': formats,
            'title': media.get('title') or 'Video by %s' % uploader_id,
            'description': description,
            'duration': float_or_none(media.get('video_duration')),
            'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
            'uploader_id': uploader_id,
            'uploader': traverse_obj(media, ('owner', 'full_name')),
            'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex(
                r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)),
            'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
            'comments': comments,
            'thumbnails': thumbnails,
            'http_headers': {
                # Instagram CDN refuses requests without an instagram.com Referer.
                'Referer': 'https://www.instagram.com/',
            }
        }
2014-03-23 16:06:03 +01:00
2021-10-31 06:08:04 +01:00
class InstagramPlaylistBaseIE(InstagramBaseIE):
    # Caches the GIS template that worked, so later pages (and later
    # extractions) skip the trial-and-error below. Shared per class.
    _gis_tmpl = None  # used to cache GIS request type

    def _parse_graphql(self, webpage, item_id):
        # Reads a webpage and returns its GraphQL data.
        return self._parse_json(
            self._search_regex(
                r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
            item_id)

    def _extract_graphql(self, data, url):
        # Parses GraphQL queries containing videos and generates a playlist.
        # Generator: yields entries page by page (12 items per request)
        # until the timeline reports no further pages.
        # Subclasses must provide _QUERY_HASH, _parse_timeline_from and
        # _query_vars_for.
        uploader_id = self._match_id(url)
        csrf_token = data['config']['csrf_token']
        # Fallback value used when the page does not expose rhx_gis.
        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'

        cursor = ''
        for page_num in itertools.count(1):
            variables = {
                'first': 12,
                'after': cursor,
            }
            variables.update(self._query_vars_for(data))
            variables = json.dumps(variables)

            if self._gis_tmpl:
                gis_tmpls = [self._gis_tmpl]
            else:
                # Candidate inputs for the X-Instagram-GIS signature,
                # tried in order until one is accepted.
                gis_tmpls = [
                    '%s' % rhx_gis,
                    '',
                    '%s:%s' % (rhx_gis, csrf_token),
                    '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
                ]

            # try all of the ways to generate a GIS query, and not only use the
            # first one that works, but cache it for future requests
            for gis_tmpl in gis_tmpls:
                try:
                    json_data = self._download_json(
                        'https://www.instagram.com/graphql/query/', uploader_id,
                        'Downloading JSON page %d' % page_num, headers={
                            'X-Requested-With': 'XMLHttpRequest',
                            # Signature = md5('<template>:<variables json>').
                            'X-Instagram-GIS': hashlib.md5(
                                ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
                        }, query={
                            'query_hash': self._QUERY_HASH,
                            'variables': variables,
                        })
                    media = self._parse_timeline_from(json_data)
                    self._gis_tmpl = gis_tmpl
                    break
                except ExtractorError as e:
                    # if it's an error caused by a bad query, and there are
                    # more GIS templates to try, ignore it and keep trying
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                        if gis_tmpl != gis_tmpls[-1]:
                            continue
                    raise

            nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or []
            if not nodes:
                break
            yield from self._extract_nodes(nodes)

            # Advance pagination; stop when the API says there is no next
            # page or fails to provide a cursor.
            has_next_page = traverse_obj(media, ('page_info', 'has_next_page'))
            cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str)
            if not has_next_page or not cursor:
                break

    def _real_extract(self, url):
        user_or_tag = self._match_id(url)
        webpage = self._download_webpage(url, user_or_tag)
        data = self._parse_graphql(webpage, user_or_tag)

        self._set_cookie('instagram.com', 'ig_pr', '1')

        return self.playlist_result(
            self._extract_graphql(data, url), user_or_tag, user_or_tag)
2021-10-31 06:08:04 +01:00
class InstagramUserIE(InstagramPlaylistBaseIE):
    """Extracts a user's timeline as a playlist (see InstagramPlaylistBaseIE)."""
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'
    _TESTS = [{
        'url': 'https://instagram.com/porsche',
        'info_dict': {
            'id': 'porsche',
            'title': 'porsche',
        },
        'playlist_count': 5,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 5,
        }
    }]

    # FIX: the original ended this line with a stray comma, which made the
    # attribute a 1-tuple ('42323...',). That only worked because urlencode
    # with doseq unpacks sequences; store the plain string instead.
    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        return data['data']['user']['edge_owner_to_timeline_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page
        return {
            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
        }
2021-10-31 06:08:04 +01:00
class InstagramTagIE(InstagramPlaylistBaseIE):
    """Extracts a hashtag's media timeline as a playlist (see InstagramPlaylistBaseIE)."""
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
    IE_DESC = 'Instagram hashtag search URLs'
    IE_NAME = 'instagram:tag'
    _TESTS = [{
        'url': 'https://instagram.com/explore/tags/lolcats',
        'info_dict': {
            'id': 'lolcats',
            'title': 'lolcats',
        },
        'playlist_count': 50,
        'params': {
            'extract_flat': True,
            'skip_download': True,
            'playlistend': 50,
        }
    }]

    # FIX: the original ended this line with a stray comma, which made the
    # attribute a 1-tuple ('f92f5...',). That only worked because urlencode
    # with doseq unpacks sequences; store the plain string instead.
    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'

    @staticmethod
    def _parse_timeline_from(data):
        # extracts the media timeline data from a GraphQL result
        return data['data']['hashtag']['edge_hashtag_to_media']

    @staticmethod
    def _query_vars_for(data):
        # returns a dictionary of variables to add to the timeline query based
        # on the GraphQL of the original page
        return {
            'tag_name':
                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
        }