2015-07-25 20:21:42 +06:00
# coding: utf-8
2014-02-08 21:55:28 +07:00
from __future__ import unicode_literals
2021-03-24 15:10:19 +01:00
import functools
2016-08-16 04:36:23 +07:00
import itertools
2021-03-24 15:10:19 +01:00
import json
2019-02-24 21:01:25 +07:00
import re
2014-02-08 21:55:28 +07:00
2015-02-19 16:46:41 +01:00
from . common import InfoExtractor
2021-02-24 11:52:30 +00:00
from . . compat import (
compat_etree_Element ,
compat_HTTPError ,
2021-03-24 15:10:19 +01:00
compat_parse_qs ,
2021-04-20 20:51:55 +01:00
compat_str ,
2021-03-24 15:10:19 +01:00
compat_urllib_parse_urlparse ,
2021-02-24 11:52:30 +00:00
compat_urlparse ,
)
2015-05-01 03:59:13 +06:00
from . . utils import (
2021-02-24 11:52:30 +00:00
ExtractorError ,
2021-03-24 15:10:19 +01:00
OnDemandPagedList ,
2017-05-26 22:12:24 +07:00
clean_html ,
2016-08-07 18:01:50 +07:00
dict_get ,
2015-07-25 20:21:42 +06:00
float_or_none ,
2017-05-26 22:12:24 +07:00
get_element_by_class ,
2015-05-01 03:59:13 +06:00
int_or_none ,
2018-06-03 04:07:59 +07:00
js_to_json ,
2015-07-25 20:21:42 +06:00
parse_duration ,
parse_iso8601 ,
2021-04-20 20:51:55 +01:00
strip_or_none ,
2016-08-07 18:01:50 +07:00
try_get ,
2015-10-10 20:34:06 +06:00
unescapeHTML ,
2021-04-20 20:51:55 +01:00
unified_timestamp ,
2019-02-24 21:01:25 +07:00
url_or_none ,
2017-05-26 22:12:24 +07:00
urlencode_postdata ,
urljoin ,
2015-05-01 03:59:13 +06:00
)
2014-02-08 21:55:28 +07:00
2015-07-30 00:55:06 +06:00
2015-02-19 16:46:41 +01:00
class BBCCoUkIE ( InfoExtractor ) :
2014-02-08 21:55:28 +07:00
# Extractor identity shown to users.
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'

# Programme/version id: one of p/b/m followed by 7 characters, or "w"
# followed by 7 to 14 characters (lowercase letters and digits).
_ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'

# Verbose regex: whitespace inside the pattern is insignificant, but tokens
# such as "(?:" and "(?P<id>" must not be split by spaces.
_VALID_URL = r'''(?x)
    https?://
        (?:www\.)?bbc\.co\.uk/
        (?:
            programmes/(?!articles/)|
            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
            music/(?:clips|audiovideo/popular)[/#]|
            radio/player/|
            sounds/play/|
            events/[^/]+/play/[^/]+/
        )
        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
    ''' % _ID_REGEX

_LOGIN_URL = 'https://account.bbc.com/signin'
_NETRC_MACHINE = 'bbc'

# Filled with (media set, programme id).
_MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
# Media sets are tried in this order.
_MEDIA_SETS = [
    # Provides HQ HLS streams with even better quality than pc mediaset but fails
    # with geolocation in some cases when it's not even geo restricted at all (e.g.
    # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
    'iptv-all',
    'pc',
]

# XML namespace of the legacy EMP playlist documents.
_EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
2014-02-09 04:00:24 +07:00
# Test fixtures for this extractor. Each entry is a dict holding the page URL
# to extract and either the expected metadata (info_dict) or an only_matching
# flag for URLs that only need to match the URL pattern. Optional keys seen
# below: params (all of them skip the download), skip (reason the case is
# disabled) and note (free-form label such as Audio / Video).
_TESTS = [
{
2014-02-19 06:46:14 +07:00
' url ' : ' http://www.bbc.co.uk/programmes/b039g8p7 ' ,
2014-02-09 04:00:24 +07:00
' info_dict ' : {
2014-02-19 06:46:14 +07:00
' id ' : ' b039d07m ' ,
2016-07-17 17:29:36 +07:00
' ext ' : ' flv ' ,
2019-08-26 23:04:38 +08:00
' title ' : ' Kaleidoscope, Leonard Cohen ' ,
2015-01-02 22:13:26 +06:00
' description ' : ' The Canadian poet and songwriter reflects on his musical career. ' ,
2014-02-09 04:00:24 +07:00
} ,
' params ' : {
2016-07-17 17:29:36 +07:00
# rtmp download
2014-02-09 04:00:24 +07:00
' skip_download ' : True ,
}
2014-02-08 21:55:28 +07:00
} ,
2014-02-09 04:00:24 +07:00
{
' url ' : ' http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/ ' ,
' info_dict ' : {
' id ' : ' b00yng1d ' ,
' ext ' : ' flv ' ,
' title ' : ' The Man in Black: Series 3: The Printed Name ' ,
' description ' : " Mark Gatiss introduces Nicholas Pierpan ' s chilling tale of a writer ' s devilish pact with a mysterious man. Stars Ewan Bailey. " ,
' duration ' : 1800 ,
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
2014-02-18 00:26:12 +07:00
} ,
' skip ' : ' Episode is no longer available on BBC iPlayer Radio ' ,
2014-02-09 04:00:24 +07:00
} ,
{
' url ' : ' http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/ ' ,
' info_dict ' : {
' id ' : ' b00yng1d ' ,
' ext ' : ' flv ' ,
2014-02-09 04:04:21 +07:00
' title ' : ' The Voice UK: Series 3: Blind Auditions 5 ' ,
2016-02-14 15:37:17 +06:00
' description ' : ' Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone. ' ,
2014-02-09 04:04:21 +07:00
' duration ' : 5100 ,
2014-02-09 04:00:24 +07:00
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
} ,
2016-07-17 17:29:36 +07:00
' skip ' : ' Currently BBC iPlayer TV programmes are available to play in the UK only ' ,
2014-11-30 22:37:56 +06:00
} ,
{
' url ' : ' http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion ' ,
' info_dict ' : {
' id ' : ' b03k3pb7 ' ,
' ext ' : ' flv ' ,
' title ' : " Tomorrow ' s Worlds: The Unearthly History of Science Fiction " ,
' description ' : ' 2. Invasion ' ,
' duration ' : 3600 ,
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
} ,
2016-07-17 17:29:36 +07:00
' skip ' : ' Currently BBC iPlayer TV programmes are available to play in the UK only ' ,
2014-12-29 03:00:24 +06:00
} , {
' url ' : ' http://www.bbc.co.uk/programmes/b04v20dw ' ,
' info_dict ' : {
' id ' : ' b04v209v ' ,
' ext ' : ' flv ' ,
' title ' : ' Pete Tong, The Essential New Tune Special ' ,
' description ' : " Pete has a very special mix - all of 2014 ' s Essential New Tunes! " ,
' duration ' : 10800 ,
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
2015-12-05 16:51:13 +08:00
} ,
' skip ' : ' Episode is no longer available on BBC iPlayer Radio ' ,
2015-01-03 20:43:40 +06:00
} , {
2016-01-04 02:55:25 +06:00
' url ' : ' http://www.bbc.co.uk/music/clips/p022h44b ' ,
2015-01-03 20:43:40 +06:00
' note ' : ' Audio ' ,
' info_dict ' : {
2016-01-04 02:55:25 +06:00
' id ' : ' p022h44j ' ,
2016-07-17 17:29:36 +07:00
' ext ' : ' flv ' ,
2016-01-04 02:55:25 +06:00
' title ' : ' BBC Proms Music Guides, Rachmaninov: Symphonic Dances ' ,
' description ' : " In this Proms Music Guide, Andrew McGregor looks at Rachmaninov ' s Symphonic Dances. " ,
' duration ' : 227 ,
2015-01-03 20:43:40 +06:00
} ,
' params ' : {
2016-07-17 17:29:36 +07:00
# rtmp download
2015-01-03 20:43:40 +06:00
' skip_download ' : True ,
}
} , {
' url ' : ' http://www.bbc.co.uk/music/clips/p025c0zz ' ,
' note ' : ' Video ' ,
' info_dict ' : {
' id ' : ' p025c103 ' ,
2016-07-17 17:29:36 +07:00
' ext ' : ' flv ' ,
2015-01-03 20:43:40 +06:00
' title ' : ' Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three) ' ,
' description ' : ' Rae Morris performs Closer for BBC Three at Reading 2014 ' ,
' duration ' : 226 ,
} ,
' params ' : {
2016-07-17 17:29:36 +07:00
# rtmp download
2015-01-03 20:43:40 +06:00
' skip_download ' : True ,
}
2015-05-01 04:02:56 +06:00
} , {
' url ' : ' http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls ' ,
' info_dict ' : {
' id ' : ' p02n76xf ' ,
' ext ' : ' flv ' ,
' title ' : ' Natural World, 2015-2016: 2. Super Powered Owls ' ,
' description ' : ' md5:e4db5c937d0e95a7c6b5e654d429183d ' ,
' duration ' : 3540 ,
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
} ,
2016-07-17 17:29:36 +07:00
' skip ' : ' geolocation ' ,
2015-06-18 22:00:13 +08:00
} , {
' url ' : ' http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition ' ,
' info_dict ' : {
' id ' : ' b05zmgw1 ' ,
' ext ' : ' flv ' ,
' description ' : ' Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique. ' ,
' title ' : ' Royal Academy Summer Exhibition ' ,
' duration ' : 3540 ,
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
} ,
2016-07-17 17:29:36 +07:00
' skip ' : ' geolocation ' ,
2015-09-26 20:07:12 +06:00
} , {
# iptv-all mediaset fails with geolocation however there is no geo restriction
# for this programme at all
2016-01-04 02:55:25 +06:00
' url ' : ' http://www.bbc.co.uk/programmes/b06rkn85 ' ,
2015-09-26 20:07:12 +06:00
' info_dict ' : {
2016-01-04 02:55:25 +06:00
' id ' : ' b06rkms3 ' ,
2015-09-26 20:07:12 +06:00
' ext ' : ' flv ' ,
2016-01-04 02:55:25 +06:00
' title ' : " Best of the Mini-Mixes 2015: Part 3, Annie Mac ' s Friday Night - BBC Radio 1 " ,
' description ' : " Annie has part three in the Best of the Mini-Mixes 2015, plus the year ' s Most Played! " ,
2015-09-26 20:07:12 +06:00
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
} ,
2016-07-17 17:29:36 +07:00
' skip ' : ' Now it \' s really geo-restricted ' ,
2016-01-28 23:27:48 +06:00
} , {
2019-03-09 19:14:41 +07:00
# compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
2016-01-28 23:27:48 +06:00
' url ' : ' http://www.bbc.co.uk/programmes/p028bfkf/player ' ,
' info_dict ' : {
' id ' : ' p028bfkj ' ,
2016-07-17 17:29:36 +07:00
' ext ' : ' flv ' ,
2016-01-28 23:27:48 +06:00
' title ' : ' Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews ' ,
' description ' : ' Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews ' ,
} ,
' params ' : {
2016-07-17 17:29:36 +07:00
# rtmp download
2016-01-28 23:27:48 +06:00
' skip_download ' : True ,
} ,
2019-08-26 23:16:18 +08:00
} , {
' url ' : ' https://www.bbc.co.uk/sounds/play/m0007jzb ' ,
' note ' : ' Audio ' ,
' info_dict ' : {
' id ' : ' m0007jz9 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' BBC Proms, 2019, Prom 34: West– Eastern Divan Orchestra ' ,
' description ' : " Live BBC Proms. West– Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich. " ,
' duration ' : 9840 ,
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
}
2015-01-02 20:37:54 +06:00
} , {
' url ' : ' http://www.bbc.co.uk/iplayer/playlist/p01dvks4 ' ,
' only_matching ' : True ,
2015-01-03 20:43:40 +06:00
} , {
' url ' : ' http://www.bbc.co.uk/music/clips#p02frcc3 ' ,
' only_matching ' : True ,
2015-01-30 23:47:09 +06:00
} , {
' url ' : ' http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo ' ,
' only_matching ' : True ,
2016-01-02 19:22:39 +06:00
} , {
' url ' : ' http://www.bbc.co.uk/radio/player/p03cchwf ' ,
' only_matching ' : True ,
2017-06-29 22:29:28 +07:00
} , {
' url ' : ' https://www.bbc.co.uk/music/audiovideo/popular#p055bc55 ' ,
' only_matching ' : True ,
2017-08-30 05:27:56 +07:00
} , {
' url ' : ' http://www.bbc.co.uk/programmes/w3csv1y9 ' ,
' only_matching ' : True ,
2018-08-20 02:05:07 +07:00
} , {
' url ' : ' https://www.bbc.co.uk/programmes/m00005xn ' ,
' only_matching ' : True ,
} , {
' url ' : ' https://www.bbc.co.uk/programmes/w172w4dww1jqt5s ' ,
' only_matching ' : True ,
2017-06-29 22:29:28 +07:00
} ]
2014-02-09 04:00:24 +07:00
2017-05-26 22:12:24 +07:00
def _login(self):
    """Sign in to the BBC account site when credentials are configured.

    Returns immediately when no username is available. Otherwise downloads
    the sign-in page, fills its hidden inputs with the credentials and posts
    the form to the action URL found in the page (or to the sign-in URL
    itself when no action is found).

    Raises:
        ExtractorError: if the final URL after posting still contains the
            sign-in URL. The page's 'form-message' text is included in the
            error when one is present.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    login_page = self._download_webpage(
        self._LOGIN_URL, None, 'Downloading signin page')

    login_form = self._hidden_inputs(login_page)
    login_form.update({
        'username': username,
        'password': password,
    })

    # The form action may be relative, so resolve it against the sign-in URL.
    post_url = urljoin(self._LOGIN_URL, self._search_regex(
        r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
        'post url', default=self._LOGIN_URL, group='url'))

    response, urlh = self._download_webpage_handle(
        post_url, None, 'Logging in', data=urlencode_postdata(login_form),
        headers={'Referer': self._LOGIN_URL})

    # Still being on the sign-in URL after the POST means the login failed.
    if self._LOGIN_URL in urlh.geturl():
        error = clean_html(get_element_by_class('form-message', response))
        if error:
            raise ExtractorError(
                'Unable to login: %s' % error, expected=True)
        raise ExtractorError('Unable to log in')
def _real_initialize(self):
    """Initialise the extractor by attempting a login."""
    self._login()
2015-07-30 00:55:06 +06:00
class MediaSelectionError(Exception):
    """Error result reported inside a media selection response.

    Attributes:
        id: the error identifier taken from the response.
    """

    def __init__(self, id):
        # Forward the identifier to Exception so that str(e) and e.args carry
        # it; otherwise the exception prints as an empty string.
        Exception.__init__(self, id)
        self.id = id
2014-02-09 04:00:24 +07:00
def _extract_asx_playlist(self, connection, programme_id):
    """Download the ASX playlist a connection points to and list its refs.

    Args:
        connection: mapping whose 'href' is the playlist URL.
        programme_id: id passed to the XML download helper.

    Returns:
        List with the 'href' attribute of every Entry/ref element.
    """
    asx = self._download_xml(
        connection.get('href'), programme_id, 'Downloading ASX playlist')
    return [ref.get('href') for ref in asx.findall('./Entry/ref')]
def _extract_items(self, playlist):
    """Return the direct 'item' children (EMP playlist namespace) of playlist."""
    return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
2014-02-09 04:00:24 +07:00
def _extract_medias(self, media_selection):
    """Return the 'media' list of a media selection response.

    Raises:
        BBCCoUkIE.MediaSelectionError: if the response has a truthy 'result'
            entry; that value becomes the error id.
    """
    error = media_selection.get('result')
    if error:
        raise BBCCoUkIE.MediaSelectionError(error)
    return media_selection.get('media') or []
2014-02-09 04:00:24 +07:00
def _extract_connections(self, media):
    """Return the 'connection' list of a media entry, or [] when absent/empty."""
    return media.get('connection') or []
2014-02-09 04:00:24 +07:00
2015-02-19 16:46:41 +01:00
def _get_subtitles(self, media, programme_id):
    """Build the subtitles dict for a captions media entry.

    Tries each connection in turn and keeps the first one whose caption
    document downloads and parses as XML.

    Returns:
        {'en': [{'url': ..., 'ext': 'ttml'}]} for the first working
        connection, or an empty dict when none works.
    """
    subtitles = {}
    for connection in self._extract_connections(media):
        cc_url = url_or_none(connection.get('href'))
        if not cc_url:
            continue
        # Non-fatal download: a broken caption URL just moves on to the next.
        captions = self._download_xml(
            cc_url, programme_id, 'Downloading captions', fatal=False)
        if not isinstance(captions, compat_etree_Element):
            continue
        subtitles['en'] = [
            {
                'url': connection.get('href'),
                'ext': 'ttml',
            },
        ]
        break
    return subtitles
2014-02-08 21:55:28 +07:00
2015-07-30 00:55:06 +06:00
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError.

    The message names this extractor (IE_NAME) and the error id.
    """
    raise ExtractorError(
        '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
        expected=True)
2014-11-30 22:37:56 +06:00
def _download_media_selector(self, programme_id):
    """Fetch formats and subtitles, trying each media set in order.

    A media set failing with 'notukerror', 'geolocation' or
    'selectionunavailable' is skipped in favour of the next one. Any other
    media selection error is raised at once as an ExtractorError. If every
    media set fails, the last recorded error is raised.

    Returns:
        Whatever _download_media_selector_url returns for the first media
        set that succeeds.
    """
    last_exception = None
    for media_set in self._MEDIA_SETS:
        try:
            return self._download_media_selector_url(
                self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
        except BBCCoUkIE.MediaSelectionError as e:
            if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                last_exception = e
                continue
            self._raise_extractor_error(e)
    self._raise_extractor_error(last_exception)
2015-07-25 20:21:42 +06:00
def _download_media_selector_url(self, url, programme_id=None):
    """Download a media selection JSON document and process it.

    HTTP 403 and 404 are passed as expected statuses to the download helper.

    Returns:
        The result of _process_media_selector for the downloaded document.
    """
    media_selection = self._download_json(
        url, programme_id, 'Downloading media selection JSON',
        expected_status=(403, 404))
    return self._process_media_selector(media_selection, programme_id)
2014-02-08 21:55:28 +07:00
2015-07-25 20:21:42 +06:00
def _process_media_selector(self, media_selection, programme_id):
    """Turn a media selection document into formats and subtitles.

    Video/audio media contribute one or more formats per connection,
    depending on the connection's supplier, transfer format and protocol.
    Captions media are passed to extract_subtitles. Other kinds are ignored.

    Returns:
        (formats, subtitles): formats is a list of format dicts; subtitles
        is None unless a captions media entry was seen.
    """
    formats = []
    subtitles = None
    urls = []  # hrefs already handled, so duplicate connections are skipped

    for media in self._extract_medias(media_selection):
        kind = media.get('kind')
        if kind == 'captions':
            subtitles = self.extract_subtitles(media, programme_id)
            continue
        if kind not in ('video', 'audio'):
            continue

        bitrate = int_or_none(media.get('bitrate'))
        encoding = media.get('encoding')
        width = int_or_none(media.get('width'))
        height = int_or_none(media.get('height'))
        file_size = int_or_none(media.get('media_file_size'))

        for connection in self._extract_connections(media):
            href = connection.get('href')
            if href in urls:
                continue
            if href:
                urls.append(href)
            conn_kind = connection.get('kind')
            protocol = connection.get('protocol')
            supplier = connection.get('supplier')
            transfer_format = connection.get('transferFormat')
            format_id = supplier or conn_kind or protocol
            # ASX playlist
            if supplier == 'asx':
                for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                    formats.append({
                        'url': ref,
                        'format_id': 'ref%s_%s' % (i, format_id),
                    })
            elif transfer_format == 'dash':
                formats.extend(self._extract_mpd_formats(
                    href, programme_id, mpd_id=format_id, fatal=False))
            elif transfer_format == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_id, fatal=False))
            elif transfer_format == 'hds':
                formats.extend(self._extract_f4m_formats(
                    href, programme_id, f4m_id=format_id, fatal=False))
            else:
                # format_id is None when the connection has no supplier, kind
                # or protocol. Such a connection is dropped below (unknown
                # protocol), so do not crash here appending to None.
                if not supplier and bitrate and format_id:
                    format_id += '-%d' % bitrate
                fmt = {
                    'format_id': format_id,
                    'filesize': file_size,
                }
                if kind == 'video':
                    fmt.update({
                        'width': width,
                        'height': height,
                        'tbr': bitrate,
                        'vcodec': encoding,
                    })
                else:
                    fmt.update({
                        'abr': bitrate,
                        'acodec': encoding,
                        'vcodec': 'none',
                    })
                if protocol in ('http', 'https'):
                    # Direct link
                    fmt.update({
                        'url': href,
                    })
                elif protocol == 'rtmp':
                    application = connection.get('application', 'ondemand')
                    auth_string = connection.get('authString')
                    identifier = connection.get('identifier')
                    server = connection.get('server')
                    fmt.update({
                        'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                        'play_path': identifier,
                        'app': '%s?%s' % (application, auth_string),
                        'page_url': 'http://www.bbc.co.uk',
                        'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                        'rtmp_live': False,
                        'ext': 'flv',
                    })
                else:
                    # Unsupported protocol: no format for this connection.
                    continue
                formats.append(fmt)
    return formats, subtitles
2014-02-09 04:00:24 +07:00
2014-12-29 03:00:24 +06:00
def _download_playlist(self, playlist_id):
    """Resolve a playlist id through the JSON playlist, else the legacy one.

    Reads title and summary from the default available version's smpConfig
    and fetches formats for each item of kind 'programme' or
    'radioProgramme'; the last such item provides the returned values.

    Falls back to the legacy playlist when the JSON playlist request fails
    with HTTP 404, when it has no default available version, or when that
    version has no programme item.

    Returns:
        (programme_id, title, description, duration, formats, subtitles)

    Raises:
        ExtractorError: re-raised when the failure is not an HTTP 404.
    """
    try:
        playlist = self._download_json(
            'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
            playlist_id, 'Downloading playlist JSON')

        version = playlist.get('defaultAvailableVersion')
        if version:
            smp_config = version['smpConfig']
            title = smp_config['title']
            description = smp_config['summary']
            result = None
            for item in smp_config['items']:
                kind = item['kind']
                if kind not in ('programme', 'radioProgramme'):
                    continue
                programme_id = item.get('vpid')
                duration = int_or_none(item.get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
                result = (programme_id, title, description, duration, formats, subtitles)
            # Only return when a programme item was found; previously an
            # items list without one hit unbound locals here.
            if result is not None:
                return result
    except ExtractorError as ee:
        if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
            raise

    # fallback to legacy playlist
    return self._process_legacy_playlist(playlist_id)
def _process_legacy_playlist_url(self, url, display_id):
    """Download the legacy playlist at url and extract its programme info."""
    playlist = self._download_legacy_playlist_url(url, display_id)
    return self._extract_from_legacy_playlist(playlist, display_id)
def _process_legacy_playlist(self, playlist_id):
    """Process the legacy iPlayer playlist that belongs to playlist_id."""
    return self._process_legacy_playlist_url(
        'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
def _download_legacy_playlist_url(self, url, playlist_id=None):
    """Download a legacy playlist as XML and return the download helper's result."""
    return self._download_xml(
        url, playlist_id, 'Downloading legacy playlist XML')
2014-12-29 03:00:24 +06:00
2015-07-25 20:21:42 +06:00
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract programme info from a legacy (EMP XML) playlist.

    Args:
        playlist: parsed playlist XML element.
        playlist_id: id used in messages and as the fallback programme id.

    Returns:
        (programme_id, title, description, duration, formats, subtitles)
        for the last item of kind 'programme' or 'radioProgramme'.

    Raises:
        ExtractorError: if the playlist has a noItems element (the message
            depends on its reason), or if it has no programme item at all.
    """
    ns = self._EMP_PLAYLIST_NS

    no_items = playlist.find('./{%s}noItems' % ns)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute that looks like a programme id, else None.
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value

    def get_programme_id(item):
        # The mediator child is consulted first, since that was the only
        # effective lookup before. The item's own attributes were queried but
        # the result was thrown away; they now act as the fallback.
        mediator = item.find('./{%s}mediator' % ns)
        if mediator is not None:
            value = get_from_attributes(mediator)
            if value:
                return value
        return get_from_attributes(item)

    result = None
    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind not in ('programme', 'radioProgramme'):
            continue
        title = playlist.find('./{%s}title' % ns).text
        description_el = playlist.find('./{%s}summary' % ns)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id
        result = (programme_id, title, description, duration, formats, subtitles)

    if result is None:
        # Without this check the return below hit unbound locals.
        raise ExtractorError(
            'Unable to find a programme item in legacy playlist %s' % playlist_id)

    return result
2014-11-30 22:37:56 +06:00
def _real_extract(self, url):
    """Extract a single programme from a bbc.co.uk page.

    The programme (version) id is looked up in the page's mediator.bind()
    player JSON, then in any "vpid" JSON property. When neither is present
    the id matched from the URL is resolved through the playlist endpoints.

    Returns:
        Info dict with id, title, description, thumbnail, duration, formats
        and subtitles.

    Raises:
        ExtractorError: with the page's own message when the page shows a
            playout error message.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    error = self._search_regex(
        r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
        webpage, 'error', default=None)
    if error:
        raise ExtractorError(error, expected=True)

    programme_id = None
    duration = None

    tviplayer = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)

    if tviplayer:
        player = self._parse_json(tviplayer, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
             r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
        description = self._search_regex(
            (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
             r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
            webpage, 'description', default=None)
        if not description:
            description = self._html_search_meta('description', webpage)
    else:
        # No version id on the page: resolve everything via the playlist.
        programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)

    self._sort_formats(formats)

    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
2015-06-20 08:22:13 -05:00
2015-07-25 20:21:42 +06:00
class BBCIE ( BBCCoUkIE ) :
IE_NAME = ' bbc '
IE_DESC = ' BBC '
_VALID_URL = r ' https?://(?:www \ .)?bbc \ .(?:com|co \ .uk)/(?:[^/]+/)+(?P<id>[^/#?]+) '
2015-06-20 08:22:13 -05:00
2020-12-26 16:54:24 +01:00
_MEDIA_SETS = [
' mobile-tablet-main ' ,
' pc ' ,
2015-07-30 00:55:06 +06:00
]
2015-06-20 08:22:13 -05:00
_TESTS = [ {
2015-10-10 23:56:55 +06:00
# article with multiple videos embedded with data-playable containing vpids
2015-06-20 08:22:13 -05:00
' url ' : ' http://www.bbc.com/news/world-europe-32668511 ' ,
' info_dict ' : {
' id ' : ' world-europe-32668511 ' ,
2019-08-26 23:04:38 +08:00
' title ' : ' Russia stages massive WW2 parade ' ,
2015-07-25 20:21:42 +06:00
' description ' : ' md5:00ff61976f6081841f759a08bf78cc9c ' ,
2015-06-20 08:22:13 -05:00
} ,
' playlist_count ' : 2 ,
2015-07-17 02:47:02 -05:00
} , {
2015-10-10 23:56:55 +06:00
# article with multiple videos embedded with data-playable (more videos)
2015-06-20 08:22:13 -05:00
' url ' : ' http://www.bbc.com/news/business-28299555 ' ,
' info_dict ' : {
' id ' : ' business-28299555 ' ,
' title ' : ' Farnborough Airshow: Video highlights ' ,
2015-07-25 20:21:42 +06:00
' description ' : ' BBC reports and video highlights at the Farnborough Airshow. ' ,
2015-06-20 08:22:13 -05:00
} ,
' playlist_count ' : 9 ,
2015-07-25 20:21:42 +06:00
' skip ' : ' Save time ' ,
2015-07-27 22:05:51 +06:00
} , {
# article with multiple videos embedded with `new SMP()`
2015-10-10 23:56:55 +06:00
# broken
2015-07-27 22:05:51 +06:00
' url ' : ' http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460 ' ,
' info_dict ' : {
' id ' : ' 3662a707-0af9-3149-963f-47bea720b460 ' ,
2016-03-13 15:56:34 +06:00
' title ' : ' BUGGER ' ,
2015-07-27 22:05:51 +06:00
} ,
' playlist_count ' : 18 ,
2015-07-17 02:47:02 -05:00
} , {
2015-10-10 23:56:55 +06:00
# single video embedded with data-playable containing vpid
2015-06-20 08:22:13 -05:00
' url ' : ' http://www.bbc.com/news/world-europe-32041533 ' ,
' info_dict ' : {
' id ' : ' p02mprgb ' ,
2015-10-11 00:37:39 +06:00
' ext ' : ' mp4 ' ,
2015-06-20 08:22:13 -05:00
' title ' : ' Aerial footage showed the site of the crash in the Alps - courtesy BFM TV ' ,
2015-10-11 00:37:39 +06:00
' description ' : ' md5:2868290467291b37feda7863f7a83f54 ' ,
2015-06-20 08:22:13 -05:00
' duration ' : 47 ,
2015-07-25 20:21:42 +06:00
' timestamp ' : 1427219242 ,
2015-06-25 00:31:32 -05:00
' upload_date ' : ' 20150324 ' ,
2015-06-20 08:22:13 -05:00
} ,
' params ' : {
2015-07-25 20:21:42 +06:00
# rtmp download
2015-06-20 08:22:13 -05:00
' skip_download ' : True ,
}
2015-07-17 02:47:02 -05:00
} , {
2015-10-10 23:56:55 +06:00
# article with single video embedded with data-playable containing XML playlist
# with direct video links as progressiveDownloadUrl (for now these are extracted)
# and playlist with f4m and m3u8 as streamingUrl
2015-06-20 11:04:46 -05:00
' url ' : ' http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu ' ,
' info_dict ' : {
2015-07-25 20:21:42 +06:00
' id ' : ' 150615_telabyad_kentin_cogu ' ,
2015-06-20 11:04:46 -05:00
' ext ' : ' mp4 ' ,
2016-08-06 19:36:12 +01:00
' title ' : " YPG: Tel Abyad ' ı n tamamı kontrolümüzde" ,
2016-07-19 22:49:38 +07:00
' description ' : ' md5:33a4805a855c9baf7115fcbde57e7025 ' ,
2015-07-25 20:21:42 +06:00
' timestamp ' : 1434397334 ,
2015-06-25 00:31:32 -05:00
' upload_date ' : ' 20150615 ' ,
2015-06-20 11:04:46 -05:00
} ,
' params ' : {
' skip_download ' : True ,
}
2015-10-10 23:14:25 +06:00
} , {
2015-10-10 23:56:55 +06:00
# single video embedded with data-playable containing XML playlists (regional section)
2015-06-20 11:04:46 -05:00
' url ' : ' http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw ' ,
' info_dict ' : {
2015-07-25 20:21:42 +06:00
' id ' : ' 150619_video_honduras_militares_hospitales_corrupcion_aw ' ,
2015-06-20 11:04:46 -05:00
' ext ' : ' mp4 ' ,
2015-07-25 20:21:42 +06:00
' title ' : ' Honduras militariza sus hospitales por nuevo escándalo de corrupción ' ,
2016-07-19 22:49:38 +07:00
' description ' : ' md5:1525f17448c4ee262b64b8f0c9ce66c8 ' ,
2015-07-25 20:21:42 +06:00
' timestamp ' : 1434713142 ,
2015-06-25 00:31:32 -05:00
' upload_date ' : ' 20150619 ' ,
2015-06-20 11:04:46 -05:00
} ,
' params ' : {
' skip_download ' : True ,
}
2015-08-04 20:44:22 +06:00
} , {
# single video from video playlist embedded with vxp-playlist-data JSON
' url ' : ' http://www.bbc.com/news/video_and_audio/must_see/33376376 ' ,
' info_dict ' : {
' id ' : ' p02w6qjc ' ,
2015-10-11 00:37:39 +06:00
' ext ' : ' mp4 ' ,
2015-08-04 20:44:22 +06:00
' title ' : ''' Judge Mindy Glazer: " I ' m sorry to see you here... I always wondered what happened to you " ''' ,
' duration ' : 56 ,
2015-12-05 16:54:25 +08:00
' description ' : ''' Judge Mindy Glazer: " I ' m sorry to see you here... I always wondered what happened to you " ''' ,
2015-08-04 20:44:22 +06:00
} ,
' params ' : {
' skip_download ' : True ,
}
2015-07-25 20:21:42 +06:00
} , {
# single video story with digitalData
' url ' : ' http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret ' ,
' info_dict ' : {
' id ' : ' p02q6gc4 ' ,
' ext ' : ' flv ' ,
' title ' : ' Sri Lanka’ s spicy secret ' ,
' description ' : ' As a new train line to Jaffna opens up the country’ s north, travellers can experience a truly distinct slice of Tamil culture. ' ,
' timestamp ' : 1437674293 ,
' upload_date ' : ' 20150723 ' ,
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
}
} , {
# single video story without digitalData
' url ' : ' http://www.bbc.com/autos/story/20130513-hyundais-rock-star ' ,
' info_dict ' : {
' id ' : ' p018zqqg ' ,
2015-10-11 00:37:39 +06:00
' ext ' : ' mp4 ' ,
2015-07-25 20:21:42 +06:00
' title ' : ' Hyundai Santa Fe Sport: Rock star ' ,
' description ' : ' md5:b042a26142c4154a6e472933cf20793d ' ,
2015-10-11 00:25:09 +06:00
' timestamp ' : 1415867444 ,
' upload_date ' : ' 20141113 ' ,
2015-07-25 20:21:42 +06:00
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
}
2016-08-07 18:01:50 +07:00
} , {
# single video embedded with Morph
' url ' : ' http://www.bbc.co.uk/sport/live/olympics/36895975 ' ,
' info_dict ' : {
' id ' : ' p041vhd0 ' ,
' ext ' : ' mp4 ' ,
' title ' : " Nigeria v Japan - Men ' s First Round " ,
' description ' : ' Live coverage of the first round from Group B at the Amazonia Arena. ' ,
' duration ' : 7980 ,
' uploader ' : ' BBC Sport ' ,
' uploader_id ' : ' bbc_sport ' ,
} ,
' params ' : {
# m3u8 download
' skip_download ' : True ,
} ,
' skip ' : ' Georestricted to UK ' ,
2015-07-25 20:21:42 +06:00
} , {
2015-10-10 23:56:55 +06:00
# single video with playlist.sxml URL in playlist param
2015-07-25 20:21:42 +06:00
' url ' : ' http://www.bbc.com/sport/0/football/33653409 ' ,
' info_dict ' : {
' id ' : ' p02xycnp ' ,
2015-10-11 00:37:39 +06:00
' ext ' : ' mp4 ' ,
2015-07-25 20:21:42 +06:00
' title ' : ' Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend? ' ,
2015-10-22 21:12:29 +08:00
' description ' : ' BBC Sport \' s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo. ' ,
2015-07-25 20:21:42 +06:00
' duration ' : 140 ,
} ,
' params ' : {
# rtmp download
' skip_download ' : True ,
}
2015-10-10 20:55:46 +06:00
} , {
2015-10-10 23:56:55 +06:00
# article with multiple videos embedded with playlist.sxml in playlist param
2015-10-10 20:55:46 +06:00
' url ' : ' http://www.bbc.com/sport/0/football/34475836 ' ,
' info_dict ' : {
' id ' : ' 34475836 ' ,
2016-03-13 15:59:54 +06:00
' title ' : ' Jurgen Klopp: Furious football from a witty and winning coach ' ,
2016-04-16 18:00:19 +08:00
' description ' : ' Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp. ' ,
2015-10-10 20:55:46 +06:00
} ,
' playlist_count ' : 3 ,
2016-03-13 15:59:54 +06:00
} , {
# school report article with single video
' url ' : ' http://www.bbc.co.uk/schoolreport/35744779 ' ,
' info_dict ' : {
' id ' : ' 35744779 ' ,
' title ' : ' School which breaks down barriers in Jerusalem ' ,
} ,
' playlist_count ' : 1 ,
2015-07-25 20:21:42 +06:00
} , {
# single video with playlist URL from weather section
' url ' : ' http://www.bbc.com/weather/features/33601775 ' ,
' only_matching ' : True ,
} , {
# custom redirection to www.bbc.com
2021-04-20 20:51:55 +01:00
# also, video with window.__INITIAL_DATA__
2015-07-25 20:21:42 +06:00
' url ' : ' http://www.bbc.co.uk/news/science-environment-33661876 ' ,
2021-04-20 20:51:55 +01:00
' info_dict ' : {
' id ' : ' p02xzws1 ' ,
' ext ' : ' mp4 ' ,
' title ' : " Pluto may have ' nitrogen glaciers ' " ,
2021-04-21 03:00:56 +07:00
' description ' : ' md5:6a95b593f528d7a5f2605221bc56912f ' ,
2021-04-20 20:51:55 +01:00
' thumbnail ' : r ' re:https?://.+/.+ \ .jpg ' ,
' timestamp ' : 1437785037 ,
' upload_date ' : ' 20150725 ' ,
} ,
2016-03-27 23:22:51 +06:00
} , {
# single video article embedded with data-media-vpid
' url ' : ' http://www.bbc.co.uk/sport/rowing/35908187 ' ,
' only_matching ' : True ,
2018-06-03 04:07:59 +07:00
} , {
' url ' : ' https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1 ' ,
' info_dict ' : {
' id ' : ' p06556y7 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend? ' ,
' description ' : ' md5:4b7dfd063d5a789a1512e99662be3ddd ' ,
} ,
' params ' : {
' skip_download ' : True ,
}
2018-07-21 11:49:55 +01:00
} , {
# window.__PRELOADED_STATE__
' url ' : ' https://www.bbc.co.uk/radio/play/b0b9z4yl ' ,
' info_dict ' : {
' id ' : ' b0b9z4vz ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Prom 6: An American in Paris and Turangalila ' ,
' description ' : ' md5:51cf7d6f5c8553f197e58203bc78dff8 ' ,
' uploader ' : ' Radio 3 ' ,
' uploader_id ' : ' bbc_radio_three ' ,
} ,
2018-12-31 23:20:40 +07:00
} , {
' url ' : ' http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 ' ,
' info_dict ' : {
' id ' : ' p06w9tws ' ,
' ext ' : ' mp4 ' ,
' title ' : ' md5:2fabf12a726603193a2879a055f72514 ' ,
' description ' : ' Learn English words and phrases from this story ' ,
} ,
' add_ie ' : [ BBCCoUkIE . ie_key ( ) ] ,
2021-02-24 11:52:30 +00:00
} , {
# BBC Reel
' url ' : ' https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness ' ,
' info_dict ' : {
' id ' : ' p07c6sb9 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' How positive thinking is harming your happiness ' ,
' alt_title ' : ' The downsides of positive thinking ' ,
' description ' : ' md5:fad74b31da60d83b8265954ee42d85b4 ' ,
' duration ' : 235 ,
' thumbnail ' : r ' re:https?://.+/p07c9dsr.jpg ' ,
' upload_date ' : ' 20190604 ' ,
' categories ' : [ ' Psychology ' ] ,
} ,
2015-06-20 08:22:13 -05:00
} ]
2015-07-25 20:21:42 +06:00
@classmethod
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    The more specific BBC extractors listed below take precedence: as soon
    as one of them reports the URL as suitable this extractor declines it.
    Otherwise the decision is delegated to the parent implementation.
    """
    more_specific_ies = (
        BBCCoUkIE,
        BBCCoUkArticleIE,
        BBCCoUkIPlayerEpisodesIE,
        BBCCoUkIPlayerGroupIE,
        BBCCoUkPlaylistIE,
    )
    for specific_ie in more_specific_ies:
        if specific_ie.suitable(url):
            return False
    return super(BBCIE, cls).suitable(url)
2015-07-25 20:21:42 +06:00
def _extract_from_media_meta ( self , media_meta , video_id ) :
# Direct links to media in media metadata (e.g.
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
# TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
source_files = media_meta . get ( ' sourceFiles ' )
if source_files :
return [ {
' url ' : f [ ' url ' ] ,
' format_id ' : format_id ,
' ext ' : f . get ( ' encoding ' ) ,
' tbr ' : float_or_none ( f . get ( ' bitrate ' ) , 1000 ) ,
' filesize ' : int_or_none ( f . get ( ' filesize ' ) ) ,
} for format_id , f in source_files . items ( ) if f . get ( ' url ' ) ] , [ ]
programme_id = media_meta . get ( ' externalId ' )
if programme_id :
return self . _download_media_selector ( programme_id )
# Process playlist.sxml as legacy playlist
href = media_meta . get ( ' href ' )
if href :
playlist = self . _download_legacy_playlist_url ( href )
_ , _ , _ , _ , formats , subtitles = self . _extract_from_legacy_playlist ( playlist , video_id )
return formats , subtitles
return [ ] , [ ]
2015-10-10 21:32:27 +06:00
def _extract_from_playlist_sxml ( self , url , playlist_id , timestamp ) :
programme_id , title , description , duration , formats , subtitles = \
self . _process_legacy_playlist_url ( url , playlist_id )
self . _sort_formats ( formats )
return {
' id ' : programme_id ,
' title ' : title ,
' description ' : description ,
' duration ' : duration ,
' timestamp ' : timestamp ,
' formats ' : formats ,
' subtitles ' : subtitles ,
}
2015-06-20 08:22:13 -05:00
def _real_extract(self, url):
    """Extract the media embedded in a BBC page.

    The page is downloaded once and then probed for a series of embedding
    schemes, in this order; the first scheme that yields media decides the
    result:

    1. ``playlist.sxml`` URLs and ``data-playable`` attributes
    2. a ``data-pid`` video group (delegated to ``BBCCoUkIE``)
    3. a single vpid found in the markup
    4. the BBC Reel ``initial-data`` script
    5. a Morph payload
    6. ``window.__PRELOADED_STATE__``
    7. ``bbcthreeConfig``
    8. ``window.__INITIAL_DATA__``
    9. ``new SMP(...)`` / ``setPlaylist(...)`` embed URLs
    10. ``data-media-meta``, ``mediaAssetPage.init`` or the
        ``vxp-playlist-data`` script

    Returns an info dict for a single video, a ``url_result`` or a
    playlist result.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
    timestamp = json_ld_info.get('timestamp')

    playlist_title = json_ld_info.get('title')
    if not playlist_title:
        playlist_title = self._og_search_title(
            webpage, default=None) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'playlist title', default=None)

        if playlist_title:
            # Drop the trailing " - BBC ..." site suffix
            playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()

    playlist_description = json_ld_info.get(
        'description') or self._og_search_description(webpage, default=None)

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    if data_playables:
        for _, data_playable_json in data_playables:
            data_playable = self._parse_json(
                unescapeHTML(data_playable_json), playlist_id, fatal=False)
            if not data_playable:
                continue
            settings = data_playable.get('settings', {})
            if settings:
                # data-playable with video vpid in settings.playlistObject.items (e.g.
                # http://www.bbc.com/news/world-us-canada-34473351)
                playlist_object = settings.get('playlistObject', {})
                if playlist_object:
                    items = playlist_object.get('items')
                    if items and isinstance(items, list):
                        programme_id = items[0].get('vpid')
                        # Without a vpid there is nothing to request from
                        # the media selector, so skip this embed
                        if not programme_id:
                            continue
                        # Fall back to the page title rather than failing
                        # with KeyError when the embed carries no title
                        title = playlist_object.get('title') or playlist_title
                        description = playlist_object.get('summary')
                        duration = int_or_none(items[0].get('duration'))
                        formats, subtitles = self._download_media_selector(programme_id)
                        self._sort_formats(formats)
                        entries.append({
                            'id': programme_id,
                            'title': title,
                            'description': description,
                            'timestamp': timestamp,
                            'duration': duration,
                            'formats': formats,
                            'subtitles': subtitles,
                        })
            else:
                # data-playable without vpid but with a playlist.sxml URLs
                # in otherSettings.playlist (e.g.
                # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                if playlist:
                    entry = None
                    for key in ('streaming', 'progressiveDownload'):
                        playlist_url = playlist.get('%sUrl' % key)
                        if not playlist_url:
                            continue
                        try:
                            info = self._extract_from_playlist_sxml(
                                playlist_url, playlist_id, timestamp)
                            if not entry:
                                entry = info
                            else:
                                # Merge the second playlist into the first entry
                                entry['title'] = info['title']
                                entry['formats'].extend(info['formats'])
                        except ExtractorError as e:
                            # Some playlist URL may fail with 500, at the same time
                            # the other one may work fine (e.g.
                            # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
                                continue
                            raise
                    if entry:
                        self._sort_formats(entry['formats'])
                        entries.append(entry)

    if entries:
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
    group_id = self._search_regex(
        r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
        webpage, 'group id', default=None)
    if group_id:
        return self.url_result(
            'https://www.bbc.co.uk/programmes/%s' % group_id,
            ie=BBCCoUkIE.ie_key())

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
         r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
         r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # The non-fatal parse may yield nothing for malformed data, hence
        # the empty-dict fallback before calling .get on the result
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
    initial_data = self._parse_json(self._html_search_regex(
        r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
        webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
    if initial_data:
        init_data = try_get(
            initial_data, lambda x: x['initData']['items'][0], dict) or {}
        smp_data = init_data.get('smpData') or {}
        clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
        version_id = clip_data.get('versionID')
        if version_id:
            title = smp_data['title']
            formats, subtitles = self._download_media_selector(version_id)
            self._sort_formats(formats)
            image_url = smp_data.get('holdingImageURL')
            display_date = init_data.get('displayDate')
            topic_title = init_data.get('topicTitle')

            return {
                'id': version_id,
                'title': title,
                'formats': formats,
                'alt_title': init_data.get('shortTitle'),
                'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
                'description': smp_data.get('summary') or init_data.get('shortSummary'),
                'upload_date': display_date.replace('-', '') if display_date else None,
                'subtitles': subtitles,
                'duration': int_or_none(clip_data.get('duration')),
                'categories': [topic_title] if topic_title else None,
            }

    # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
    # There are several setPayload calls may be present but the video
    # seems to be always related to the first one
    morph_payload = self._parse_json(
        self._search_regex(
            r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
            webpage, 'morph payload', default='{}'),
        playlist_id, fatal=False)
    if morph_payload:
        components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
        for component in components:
            if not isinstance(component, dict):
                continue
            lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
            if not lead_media:
                continue
            identifiers = lead_media.get('identifiers')
            if not identifiers or not isinstance(identifiers, dict):
                continue
            programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
            if not programme_id:
                continue
            title = lead_media.get('title') or self._og_search_title(webpage)
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            description = lead_media.get('summary')
            uploader = lead_media.get('masterBrand')
            uploader_id = lead_media.get('mid')
            duration = None
            duration_d = lead_media.get('duration')
            if isinstance(duration_d, dict):
                duration = parse_duration(dict_get(
                    duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
            return {
                'id': programme_id,
                'title': title,
                'description': description,
                'duration': duration,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'formats': formats,
                'subtitles': subtitles,
            }

    preload_state = self._parse_json(self._search_regex(
        r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if preload_state:
        current_programme = preload_state.get('programmes', {}).get('current') or {}
        programme_id = current_programme.get('id')
        if current_programme and programme_id and current_programme.get('type') == 'playable_item':
            title = current_programme.get('titles', {}).get('tertiary') or playlist_title
            formats, subtitles = self._download_media_selector(programme_id)
            self._sort_formats(formats)
            synopses = current_programme.get('synopses') or {}
            network = current_programme.get('network') or {}
            duration = int_or_none(
                current_programme.get('duration', {}).get('value'))
            thumbnail = None
            image_url = current_programme.get('image_url')
            if image_url:
                thumbnail = image_url.replace('{recipe}', 'raw')
            return {
                'id': programme_id,
                'title': title,
                'description': dict_get(synopses, ('long', 'medium', 'short')),
                'thumbnail': thumbnail,
                'duration': duration,
                'uploader': network.get('short_title'),
                'uploader_id': network.get('id'),
                'formats': formats,
                'subtitles': subtitles,
            }

    bbc3_config = self._parse_json(
        self._search_regex(
            r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
            'bbcthree config', default='{}'),
        playlist_id, transform_source=js_to_json, fatal=False) or {}
    payload = bbc3_config.get('payload') or {}
    if payload:
        clip = payload.get('currentClip') or {}
        clip_vpid = clip.get('vpid')
        clip_title = clip.get('title')
        if clip_vpid and clip_title:
            formats, subtitles = self._download_media_selector(clip_vpid)
            self._sort_formats(formats)
            return {
                'id': clip_vpid,
                'title': clip_title,
                'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
                'description': clip.get('description'),
                'duration': parse_duration(clip.get('duration')),
                'formats': formats,
                'subtitles': subtitles,
            }
        bbc3_playlist = try_get(
            payload, lambda x: x['content']['bbcMedia']['playlist'],
            dict)
        if bbc3_playlist:
            playlist_title = bbc3_playlist.get('title') or playlist_title
            thumbnail = bbc3_playlist.get('holdingImageURL')
            entries = []
            # A playlist without an 'items' key yields an empty result
            # instead of raising KeyError
            for bbc3_item in bbc3_playlist.get('items') or []:
                programme_id = bbc3_item.get('versionID')
                if not programme_id:
                    continue
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': playlist_title,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'formats': formats,
                    'subtitles': subtitles,
                })
            return self.playlist_result(
                entries, playlist_id, playlist_title, playlist_description)

    initial_data = self._parse_json(self._search_regex(
        r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
        'preload state', default='{}'), playlist_id, fatal=False)
    if initial_data:
        def parse_media(media):
            # Appends one entry to the enclosing `entries` list for every
            # media item that has both an id and a title
            if not media:
                return
            for item in (try_get(media, lambda x: x['media']['items'], list) or []):
                item_id = item.get('id')
                item_title = item.get('title')
                if not (item_id and item_title):
                    continue
                formats, subtitles = self._download_media_selector(item_id)
                self._sort_formats(formats)
                item_desc = None
                blocks = try_get(media, lambda x: x['summary']['blocks'], list)
                if blocks:
                    summary = []
                    for block in blocks:
                        text = try_get(block, lambda x: x['model']['text'], compat_str)
                        if text:
                            summary.append(text)
                    if summary:
                        item_desc = '\n\n'.join(summary)
                item_time = None
                for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
                    if try_get(meta, lambda x: x['label']) == 'Published':
                        item_time = unified_timestamp(meta.get('timestamp'))
                        break
                entries.append({
                    'id': item_id,
                    'title': item_title,
                    'thumbnail': item.get('holdingImageUrl'),
                    'formats': formats,
                    'subtitles': subtitles,
                    'timestamp': item_time,
                    'description': strip_or_none(item_desc),
                })
        for resp in (initial_data.get('data') or {}).values():
            name = resp.get('name')
            if name == 'media-experience':
                parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
            elif name == 'article':
                for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
                    if block.get('type') != 'media':
                        continue
                    parse_media(block.get('model'))
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)

    def extract_all(pattern):
        # Parse every match of `pattern` as JSON, dropping unparsable ones
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            # The non-fatal parse may yield nothing; fall back to an empty
            # dict so the next probe below still gets a chance
            media_asset_page = self._parse_json(
                media_asset, playlist_id, fatal=False) or {}
            medias = []
            for video in media_asset_page.get('videos', {}).values():
                medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
2015-10-22 21:13:03 +08:00
class BBCCoUkArticleIE(InfoExtractor):
    """Extractor for bbc.co.uk programme article pages.

    Every clip referenced from the article is returned as an entry of a
    playlist named after the article.
    """
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips embedded in the article at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage)
        # Only strip when a description was actually found, so that a page
        # without one does not abort the extraction with AttributeError
        if description:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
2016-06-17 23:42:52 +07:00
class BBCCoUkPlaylistBaseIE(InfoExtractor):
    """Shared logic for paginated bbc.co.uk playlist pages.

    Subclasses are expected to provide ``_VIDEO_ID_TEMPLATE``,
    ``_URL_TEMPLATE`` and ``_extract_title_and_description``, all of which
    are used below but not defined here.
    """

    def _entries(self, webpage, url, playlist_id):
        """Yield a ``url_result`` for each video id, following pagination.

        When *url* carries an explicit ``page`` query parameter only the
        already downloaded *webpage* is scanned; otherwise "next" links are
        followed until none is left.
        """
        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        single_page = 'page' in query

        # The page passed in is the first one, so downloads start at 2
        page_num = 2
        while True:
            video_ids = re.findall(
                self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)
            for video_id in video_ids:
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())

            if single_page:
                return

            next_page = self._search_regex(
                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                webpage, 'next page url', default=None, group='url')
            if not next_page:
                return

            # NOTE(review): page_num is also passed as the fourth positional
            # argument - confirm against the _download_webpage signature
            # that this is intended.
            next_url = compat_urlparse.urljoin(url, next_page)
            webpage = self._download_webpage(
                next_url, playlist_id,
                'Downloading page %d' % page_num, page_num)
            page_num += 1

    def _real_extract(self, url):
        """Return a playlist of all videos listed at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title, description = self._extract_title_and_description(webpage)

        entries = self._entries(webpage, url, playlist_id)
        return self.playlist_result(entries, playlist_id, title, description)
2016-06-17 23:42:52 +07:00
2021-03-24 15:10:19 +01:00
class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
    """Shared logic for iPlayer listings that are fetched page by page.

    Subclasses supply the API access and field accessors used below
    (``_call_api``, ``_get_elements``, ``_get_episode``,
    ``_get_episode_image``, ``_get_episode_field``, ``_get_playlist_data``,
    ``_get_playlist_title``) together with ``_PAGE_SIZE`` and
    ``_DESCRIPTION_KEY``.
    """
    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX

    @staticmethod
    def _get_default(episode, key, default_key='default'):
        """Return ``episode[key][default_key]`` via ``try_get``."""
        return try_get(episode, lambda data: data[key][default_key])

    def _get_description(self, data):
        """Pick the largest available synopsis stored under ``_DESCRIPTION_KEY``."""
        return dict_get(
            data.get(self._DESCRIPTION_KEY) or {},
            ('large', 'medium', 'small'))

    def _fetch_page(self, programme_id, per_page, series_id, page):
        """Yield a ``url`` entry for each episode on zero-based *page*."""
        # The API is called with page + 1 for the zero-based page index
        api_data = self._call_api(programme_id, per_page, page + 1, series_id)
        for element in self._get_elements(api_data):
            episode = self._get_episode(element)
            episode_id = episode.get('id')
            if not episode_id:
                continue
            image = self._get_episode_image(episode)
            thumbnail = image.replace('{recipe}', 'raw') if image else None
            category = self._get_default(episode, 'labels', 'category')
            yield {
                '_type': 'url',
                'id': episode_id,
                'title': self._get_episode_field(episode, 'subtitle'),
                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                'thumbnail': thumbnail,
                'description': self._get_description(episode),
                'categories': [category] if category else None,
                'series': self._get_episode_field(episode, 'title'),
                'ie_key': BBCCoUkIE.ie_key(),
            }

    def _real_extract(self, url):
        """Return the listing at *url* as a playlist.

        With an explicit ``page`` query parameter only that page is fetched
        (36 entries per page); otherwise all pages are exposed lazily
        through ``OnDemandPagedList``.
        """
        pid = self._match_id(url)
        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        series_id = qs.get('seriesId', [None])[0]
        page = qs.get('page', [None])[0]

        if page:
            fetch_page = functools.partial(
                self._fetch_page, pid, 36, series_id)
            entries = fetch_page(int(page) - 1)
        else:
            fetch_page = functools.partial(
                self._fetch_page, pid, self._PAGE_SIZE, series_id)
            entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

        # A single-item request is made just for the playlist metadata
        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
        playlist_title = self._get_playlist_title(playlist_data)
        playlist_description = self._get_description(playlist_data)
        return self.playlist_result(
            entries, pid, playlist_title, playlist_description)
class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
    """iPlayer ``/episodes/`` listings, backed by the graph.ibl.api.bbc.co.uk API."""
    IE_NAME = 'bbc.co.uk:iplayer:episodes'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance',
            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
        },
        'playlist_mincount': 8,
    }, {
        # all seasons
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 10,
    }, {
        # explicit season
        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
        'info_dict': {
            'id': 'b094m5t9',
            'title': 'Doctor Foster',
            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
        },
        'playlist_mincount': 5,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 37,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
        'info_dict': {
            'id': 'm0004c4v',
            'title': 'Beechgrove',
            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
        },
        'playlist_mincount': 1,
    }]
    _PAGE_SIZE = 100
    _DESCRIPTION_KEY = 'synopsis'

    def _get_episode_image(self, episode):
        """Return the default image of *episode*."""
        return self._get_default(episode, 'image')

    def _get_episode_field(self, episode, field):
        """Return the default value stored under *field* of *episode*."""
        return self._get_default(episode, field)

    @staticmethod
    def _get_elements(data):
        """Return the result list of an API response."""
        entities = data['entities']
        return entities['results']

    @staticmethod
    def _get_episode(element):
        """Return the episode dict of a result element, or an empty dict."""
        episode = element.get('episode')
        return episode or {}

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Query the API for programme *pid* and return its ``programme`` data.

        *series_id*, when given, is sent as the ``sliceId`` variable.
        """
        variables = {
            'id': pid,
            'page': page,
            'perPage': per_page,
        }
        if series_id:
            variables['sliceId'] = series_id
        request_body = json.dumps({
            'id': '5692d93d5aac8d796a0305e895e61551',
            'variables': variables,
        }).encode('utf-8')
        response = self._download_json(
            'https://graph.ibl.api.bbc.co.uk/', pid,
            headers={'Content-Type': 'application/json'},
            data=request_body)
        return response['data']['programme']

    @staticmethod
    def _get_playlist_data(data):
        """Return *data* unchanged; the programme data is the playlist data."""
        return data

    def _get_playlist_title(self, data):
        """Return the default title of the playlist."""
        return self._get_default(data, 'title')
class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
    """Playlist extractor for ``bbc.co.uk/iplayer/group/<pid>`` pages.

    Provides the data-access hooks used by BBCCoUkIPlayerPlaylistBaseIE
    for the ``group_episodes`` data served by the ``ibl/v1/groups`` API.
    """

    IE_NAME = 'bbc.co.uk:iplayer:group'
    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
    _TESTS = [{
        # Available for over a year unlike 30 days for most other programmes
        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
        'info_dict': {
            'id': 'p02tcc32',
            'title': 'Bohemian Icons',
            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
        },
        'playlist_mincount': 10,
    }, {
        # all pages
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 47,
    }, {
        # explicit page
        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
        'info_dict': {
            'id': 'p081d7j7',
            'title': 'Music in Scotland',
            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
        },
        'playlist_mincount': 11,
    }]

    # Not read inside this class; presumably consumed by the base class
    # when paging through results and building descriptions -- confirm there.
    _PAGE_SIZE = 200
    _DESCRIPTION_KEY = 'synopses'

    def _get_episode_image(self, episode):
        """Return the episode image via the inherited ``_get_default``,
        called with the keys ``images`` and ``standard``."""
        return self._get_default(episode, 'images', 'standard')

    def _get_episode_field(self, episode, field):
        """Return ``episode[field]``, or None when the key is absent."""
        return episode.get(field)

    @staticmethod
    def _get_elements(data):
        """Return the list stored under ``data['elements']`` (KeyError if missing)."""
        return data['elements']

    @staticmethod
    def _get_episode(element):
        """Return ``element`` unchanged: each element is used directly as the episode."""
        return element

    def _call_api(self, pid, per_page, page=1, series_id=None):
        """Request one page of a group's episodes and return its ``group_episodes`` node.

        pid       -- group id, interpolated into the request URL
        per_page  -- sent as the ``per_page`` query parameter
        page      -- sent as the ``page`` query parameter (defaults to 1)
        series_id -- accepted for signature parity with the sibling
                     extractor; not used by this request

        Raises KeyError if the response lacks ``group_episodes``.
        """
        response = self._download_json(
            'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
            pid, query={
                'page': page,
                'per_page': per_page,
            })
        return response['group_episodes']

    @staticmethod
    def _get_playlist_data(data):
        """Return the ``group`` node of the API data (KeyError if missing)."""
        return data['group']

    def _get_playlist_title(self, data):
        """Return the playlist's ``title``, or None when the key is absent."""
        return data.get('title')
2016-06-17 23:42:52 +07:00
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
    """Playlist extractor for ``bbc.co.uk/programmes/<pid>/`` listing pages
    (``episodes``, ``broadcasts`` and ``clips``)."""

    IE_NAME = 'bbc.co.uk:playlist'
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
    # Not read inside this class; presumably consumed by
    # BBCCoUkPlaylistBaseIE to locate and link entries -- confirm there.
    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
    _TESTS = [{
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'info_dict': {
            'id': 'b05rcz9v',
            'title': 'The Disappearance - Clips - BBC Four',
            'description': 'French thriller serial about a missing teenager.',
        },
        'playlist_mincount': 7,
    }, {
        # multipage playlist, explicit page
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 24,
    }, {
        # multipage playlist, all pages
        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
        'info_dict': {
            'id': 'b00mfl7n',
            'title': 'Frozen Planet - Clips - BBC One',
            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
        },
        'playlist_mincount': 142,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
        'only_matching': True,
    }, {
        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
        'only_matching': True,
    }]

    def _extract_title_and_description(self, webpage):
        """Return ``(title, description)`` taken from the page's Open Graph data.

        The title lookup is made with ``fatal=False``; the description
        lookup uses the helper's defaults.
        """
        title = self._og_search_title(webpage, fatal=False)
        description = self._og_search_description(webpage)
        return title, description