[soundcloud] fix download url extraction (closes #24394)

This commit is contained in:
Remita Amine 2020-03-22 09:24:07 +01:00
parent c76cdf2382
commit a6c5859d6b

View File

@ -27,6 +27,7 @@
unified_timestamp, unified_timestamp,
update_url_query, update_url_query,
url_or_none, url_or_none,
urlhandle_detect_ext,
) )
@ -96,7 +97,7 @@ class SoundcloudIE(InfoExtractor):
'repost_count': int, 'repost_count': int,
} }
}, },
# not streamable song, preview # geo-restricted
{ {
'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
'info_dict': { 'info_dict': {
@ -108,17 +109,13 @@ class SoundcloudIE(InfoExtractor):
'uploader_id': '9615865', 'uploader_id': '9615865',
'timestamp': 1337635207, 'timestamp': 1337635207,
'upload_date': '20120521', 'upload_date': '20120521',
'duration': 30, 'duration': 227.155,
'license': 'all-rights-reserved', 'license': 'all-rights-reserved',
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
}, },
'params': {
# rtmp
'skip_download': True,
},
}, },
# private link # private link
{ {
@ -229,7 +226,6 @@ class SoundcloudIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, },
# not available via api.soundcloud.com/i1/tracks/id/streams
{ {
'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
@ -250,11 +246,9 @@ class SoundcloudIE(InfoExtractor):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
}, },
'expected_warnings': ['Unable to download JSON metadata'],
} }
] ]
_API_BASE = 'https://api.soundcloud.com/'
_API_V2_BASE = 'https://api-v2.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/'
_BASE_URL = 'https://soundcloud.com/' _BASE_URL = 'https://soundcloud.com/'
_IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
@ -316,10 +310,9 @@ def _real_initialize(self):
def _resolv_url(cls, url): def _resolv_url(cls, url):
return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url
def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2): def _extract_info_dict(self, info, full_title=None, secret_token=None):
track_id = compat_str(info['id']) track_id = compat_str(info['id'])
title = info['title'] title = info['title']
track_base_url = self._API_BASE + 'tracks/%s' % track_id
format_urls = set() format_urls = set()
formats = [] formats = []
@ -328,21 +321,22 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2
query['secret_token'] = secret_token query['secret_token'] = secret_token
if info.get('downloadable') and info.get('has_downloads_left'): if info.get('downloadable') and info.get('has_downloads_left'):
format_url = update_url_query( download_url = update_url_query(
info.get('download_url') or track_base_url + '/download', query) self._API_V2_BASE + 'tracks/' + track_id + '/download', query)
format_urls.add(format_url) redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri')
if version == 2: if redirect_url:
v1_info = self._download_json( urlh = self._request_webpage(
track_base_url, track_id, query=query, fatal=False) or {} HEADRequest(redirect_url), track_id, fatal=False)
else: if urlh:
v1_info = info format_url = urlh.geturl()
formats.append({ format_urls.add(format_url)
'format_id': 'download', formats.append({
'ext': v1_info.get('original_format') or 'mp3', 'format_id': 'download',
'filesize': int_or_none(v1_info.get('original_content_size')), 'ext': urlhandle_detect_ext(urlh) or 'mp3',
'url': format_url, 'filesize': int_or_none(urlh.headers.get('Content-Length')),
'preference': 10, 'url': format_url,
}) 'preference': 10,
})
def invalid_url(url): def invalid_url(url):
return not url or url in format_urls return not url or url in format_urls
@ -406,42 +400,11 @@ def add_format(f, protocol, is_preview=False):
}, 'http' if protocol == 'progressive' else protocol, }, 'http' if protocol == 'progressive' else protocol,
t.get('snipped') or '/preview/' in format_url) t.get('snipped') or '/preview/' in format_url)
if not formats:
# Old API, does not work for some tracks (e.g.
# https://soundcloud.com/giovannisarani/mezzo-valzer)
# and might serve preview URLs (e.g.
# http://www.soundcloud.com/snbrn/ele)
format_dict = self._download_json(
track_base_url + '/streams', track_id,
'Downloading track url', query=query, fatal=False) or {}
for key, stream_url in format_dict.items():
if invalid_url(stream_url):
continue
format_urls.add(stream_url)
mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key)
if mobj:
protocol, ext, abr = mobj.groups()
add_format({
'abr': abr,
'ext': ext,
'url': stream_url,
}, protocol)
if not formats:
# We fallback to the stream_url in the original info, this
# cannot be always used, sometimes it can give an HTTP 404 error
urlh = self._request_webpage(
HEADRequest(info.get('stream_url') or track_base_url + '/stream'),
track_id, query=query, fatal=False)
if urlh:
stream_url = urlh.geturl()
if not invalid_url(stream_url):
add_format({'url': stream_url}, 'http')
for f in formats: for f in formats:
f['vcodec'] = 'none' f['vcodec'] = 'none'
if not formats and info.get('policy') == 'BLOCK':
self.raise_geo_restricted()
self._sort_formats(formats) self._sort_formats(formats)
user = info.get('user') or {} user = info.get('user') or {}
@ -511,16 +474,10 @@ def _real_extract(self, url):
resolve_title += '/%s' % token resolve_title += '/%s' % token
info_json_url = self._resolv_url(self._BASE_URL + resolve_title) info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
version = 2
info = self._download_json( info = self._download_json(
info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False) info_json_url, full_title, 'Downloading info JSON', query=query)
if not info:
info = self._download_json(
info_json_url.replace(self._API_V2_BASE, self._API_BASE),
full_title, 'Downloading info JSON', query=query)
version = 1
return self._extract_info_dict(info, full_title, token, version) return self._extract_info_dict(info, full_title, token)
class SoundcloudPlaylistBaseIE(SoundcloudIE): class SoundcloudPlaylistBaseIE(SoundcloudIE):