[extractor] Extract storyboards from SMIL manifests (#1128)

Authored by: fstirlitz
This commit is contained in:
Felix S 2021-10-02 18:43:42 +00:00 committed by GitHub
parent 0eaec13ba6
commit 9359f3d4f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 56 additions and 12 deletions

View File

@ -3029,9 +3029,7 @@ def record_download_archive(self, info_dict):
@staticmethod @staticmethod
def format_resolution(format, default='unknown'): def format_resolution(format, default='unknown'):
if format.get('vcodec') == 'none': if format.get('vcodec') == 'none' and format.get('acodec') != 'none':
if format.get('acodec') == 'none':
return 'images'
return 'audio only' return 'audio only'
if format.get('resolution') is not None: if format.get('resolution') is not None:
return format['resolution'] return format['resolution']
@ -3043,6 +3041,8 @@ def format_resolution(format, default='unknown'):
res = '%dx?' % format['width'] res = '%dx?' % format['width']
else: else:
res = default res = default
if format.get('vcodec') == 'none' and format.get('acodec') == 'none':
res += ' (images)'
return res return res
def _format_note(self, fdict): def _format_note(self, fdict):

View File

@ -2346,14 +2346,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
rtmp_count = 0 rtmp_count = 0
http_count = 0 http_count = 0
m3u8_count = 0 m3u8_count = 0
imgs_count = 0
srcs = [] srcs = set()
media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
for medium in media: for medium in media:
src = medium.get('src') src = medium.get('src')
if not src or src in srcs: if not src or src in srcs:
continue continue
srcs.append(src) srcs.add(src)
bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
filesize = int_or_none(medium.get('size') or medium.get('fileSize')) filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
@ -2427,6 +2428,24 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
'height': height, 'height': height,
}) })
for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
src = medium.get('src')
if not src or src in srcs:
continue
srcs.add(src)
imgs_count += 1
formats.append({
'format_id': 'imagestream-%d' % (imgs_count),
'url': src,
'ext': mimetype2ext(medium.get('type')),
'acodec': 'none',
'vcodec': 'none',
'width': int_or_none(medium.get('width')),
'height': int_or_none(medium.get('height')),
'format_note': 'SMIL storyboards',
})
return formats return formats
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):

View File

@ -4546,20 +4546,24 @@ def mimetype2ext(mt):
if mt is None: if mt is None:
return None return None
ext = { mt, _, params = mt.partition(';')
mt = mt.strip()
FULL_MAP = {
'audio/mp4': 'm4a', 'audio/mp4': 'm4a',
# Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
# it's the most popular one # it's the most popular one
'audio/mpeg': 'mp3', 'audio/mpeg': 'mp3',
'audio/x-wav': 'wav', 'audio/x-wav': 'wav',
}.get(mt) 'audio/wav': 'wav',
'audio/wave': 'wav',
}
ext = FULL_MAP.get(mt)
if ext is not None: if ext is not None:
return ext return ext
_, _, res = mt.rpartition('/') SUBTYPE_MAP = {
res = res.split(';')[0].strip().lower()
return {
'3gpp': '3gp', '3gpp': '3gp',
'smptett+xml': 'tt', 'smptett+xml': 'tt',
'ttaf+xml': 'dfxp', 'ttaf+xml': 'dfxp',
@ -4578,7 +4582,28 @@ def mimetype2ext(mt):
'quicktime': 'mov', 'quicktime': 'mov',
'mp2t': 'ts', 'mp2t': 'ts',
'x-wav': 'wav', 'x-wav': 'wav',
}.get(res, res) 'filmstrip+json': 'fs',
'svg+xml': 'svg',
}
_, _, subtype = mt.rpartition('/')
ext = SUBTYPE_MAP.get(subtype.lower())
if ext is not None:
return ext
SUFFIX_MAP = {
'json': 'json',
'xml': 'xml',
'zip': 'zip',
'gzip': 'gz',
}
_, _, suffix = subtype.partition('+')
ext = SUFFIX_MAP.get(suffix)
if ext is not None:
return ext
return subtype.replace('+', '.')
def parse_codecs(codecs_str): def parse_codecs(codecs_str):