[extractor/common] Recognize src attribute from HTML5 media elements (#3899)

Authored by: Lesmiscore
This commit is contained in:
Lesmiscore 2022-05-29 22:48:04 +09:00 committed by GitHub
parent ee27297f82
commit 222a230871
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 23 additions and 2 deletions

View File

@ -502,6 +502,24 @@ def test_parse_html5_media_entries(self):
}], }],
}) })
# from https://0000.studio/
# with type attribute but without extension in URL
expect_dict(
self,
self.ie._parse_html5_media_entries(
'https://0000.studio',
r'''
<video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
</video>
''', None)[0],
{
'formats': [{
'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
'ext': 'mp4',
}],
})
def test_extract_jwplayer_data_realworld(self): def test_extract_jwplayer_data_realworld(self):
# from http://www.suffolk.edu/sjc/ # from http://www.suffolk.edu/sjc/
expect_dict( expect_dict(

View File

@ -3197,7 +3197,8 @@ def parse_content_type(content_type):
return f return f
return {} return {}
def _media_formats(src, cur_media_type, type_info={}): def _media_formats(src, cur_media_type, type_info=None):
type_info = type_info or {}
full_url = absolute_url(src) full_url = absolute_url(src)
ext = type_info.get('ext') or determine_ext(full_url) ext = type_info.get('ext') or determine_ext(full_url)
if ext == 'm3u8': if ext == 'm3u8':
@ -3215,6 +3216,7 @@ def _media_formats(src, cur_media_type, type_info={}):
formats = [{ formats = [{
'url': full_url, 'url': full_url,
'vcodec': 'none' if cur_media_type == 'audio' else None, 'vcodec': 'none' if cur_media_type == 'audio' else None,
'ext': ext,
}] }]
return is_plain_url, formats return is_plain_url, formats
@ -3241,7 +3243,8 @@ def _media_formats(src, cur_media_type, type_info={}):
media_attributes = extract_attributes(media_tag) media_attributes = extract_attributes(media_tag)
src = strip_or_none(media_attributes.get('src')) src = strip_or_none(media_attributes.get('src'))
if src: if src:
_, formats = _media_formats(src, media_type) f = parse_content_type(media_attributes.get('type'))
_, formats = _media_formats(src, media_type, f)
media_info['formats'].extend(formats) media_info['formats'].extend(formats)
media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
if media_content: if media_content: