From 2647c933b8ed22f95dd8e9866c4db031867a1bc8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 29 Dec 2022 16:32:54 +0000 Subject: [PATCH] [extractor/wistia] Improve extension detection (#5415) Closes #5053 Authored by: bashonly, Grub4k, pukkandan --- yt_dlp/extractor/wistia.py | 41 ++++++++----- yt_dlp/utils.py | 122 +++++++++++++++++++++++-------------- 2 files changed, 104 insertions(+), 59 deletions(-) diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index 38dcc2f5b..884fa4b5f 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -6,12 +6,15 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + HEADRequest, + determine_ext, float_or_none, int_or_none, parse_qs, traverse_obj, try_get, update_url_query, + urlhandle_detect_ext, ) @@ -34,6 +37,16 @@ def _download_embed_config(self, config_type, config_id, referer): return embed_config + def _get_real_ext(self, url): + ext = determine_ext(url, default_ext='bin') + if ext == 'bin': + urlh = self._request_webpage( + HEADRequest(url), None, note='Checking media extension', + errnote='HEAD request returned error', fatal=False) + if urlh: + ext = urlhandle_detect_ext(urlh, default='bin') + return 'mp4' if ext == 'mov' else ext + def _extract_media(self, embed_config): data = embed_config['media'] video_id = data['hashedId'] @@ -51,13 +64,13 @@ def _extract_media(self, embed_config): continue elif atype in ('still', 'still_image'): thumbnails.append({ - 'url': aurl, + 'url': aurl.replace('.bin', f'.{self._get_real_ext(aurl)}'), 'width': int_or_none(a.get('width')), 'height': int_or_none(a.get('height')), 'filesize': int_or_none(a.get('size')), }) else: - aext = a.get('ext') + aext = a.get('ext') or self._get_real_ext(aurl) display_name = a.get('display_name') format_id = atype if atype and atype.endswith('_video') and display_name: @@ -169,26 +182,26 @@ class WistiaIE(WistiaBaseIE): 'md5': '10c1ce9c4dde638202513ed17a3767bd', 'info_dict': { 'id': 'a6ndpko1wg', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'Episode 2: Boxed Water\'s retention is thirsty', 'upload_date': '20210324', 'description': 'md5:da5994c2c2d254833b412469d9666b7a', 'duration': 966.0, 'timestamp': 1616614369, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.png', } }, { 'url': 'wistia:5vd7p4bct5', 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { 'id': '5vd7p4bct5', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679', 'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f', 'upload_date': '20220915', 'timestamp': 1663258727, 'duration': 623.019, - 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.(?:jpg|bin)$', + 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.jpg$', }, }, { 'url': 'wistia:sh7fpupwlt', @@ -208,25 +221,25 @@ class WistiaIE(WistiaBaseIE): 'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool', 'info_dict': { 'id': 'cqwukac3z1', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content', 'duration': 158.125, 'timestamp': 1618974400, 'description': 'md5:27abc99a758573560be72600ef95cece', 'upload_date': '20210421', - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.jpg', } }, { 'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { 'id': '5vd7p4bct5', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', 'upload_date': '20220915', 'timestamp': 1663258727, 'duration': 623.019, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.jpg', 'description': 'a Paywall Videos video', }, }] @@ -302,9 +315,9 @@ class WistiaChannelIE(WistiaBaseIE): 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n', 'info_dict': { 'id': 'sp5dqjzw3n', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'The Roof S2: The Modern CRO', - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.png', 'duration': 86.487, 'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n', 'timestamp': 1619790290, @@ -334,12 +347,12 @@ class WistiaChannelIE(WistiaBaseIE): 'info_dict': { 'id': 'pz0m0l0if3', 'title': 'A Framework for Improving Product Team Performance', - 'ext': 'bin', + 'ext': 'mp4', 'timestamp': 1653935275, 'upload_date': '20220530', 'description': 'Learn how to help your company improve and achieve your product related goals.', 'duration': 1854.39, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.png', }, 'params': {'noplaylist': True, 'skip_download': True}, }] diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 65408bf19..3947dcf2e 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3480,67 +3480,93 @@ def error_to_str(err): return f'{type(err).__name__}: {err}' -def mimetype2ext(mt): - if mt is None: +def mimetype2ext(mt, default=NO_DEFAULT): + if not isinstance(mt, str): + if default is not NO_DEFAULT: + return default return None - mt, _, params = mt.partition(';') - mt = mt.strip() - - FULL_MAP = { - 'audio/mp4': 'm4a', - # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as - # it's the most popular one - 'audio/mpeg': 'mp3', - 'audio/x-wav': 'wav', - 'audio/wav': 'wav', - 'audio/wave': 'wav', - } - - ext = FULL_MAP.get(mt) - if ext is not None: - return ext - - SUBTYPE_MAP = { + MAP = { + # video '3gpp': '3gp', - 'smptett+xml': 'tt', - 'ttaf+xml': 'dfxp', - 'ttml+xml': 'ttml', - 'x-flv': 'flv', - 'x-mp4-fragmented': 'mp4', - 'x-ms-sami': 'sami', - 'x-ms-wmv': 'wmv', + 'mp2t': 'ts', + 'mp4': 'mp4', + 'mpeg': 'mpeg', 'mpegurl': 'm3u8', - 'x-mpegurl': 'm3u8', - 'vnd.apple.mpegurl': 'm3u8', + 'quicktime': 'mov', + 'webm': 'webm', + 'vp9': 'vp9', + 'x-flv': 'flv', + 'x-m4v': 'm4v', + 'x-matroska': 'mkv', + 'x-mng': 'mng', + 'x-mp4-fragmented': 'mp4', + 'x-ms-asf': 'asf', + 'x-ms-wmv': 'wmv', + 'x-msvideo': 'avi', + + # application (streaming playlists) 'dash+xml': 'mpd', 'f4m+xml': 'f4m', 'hds+xml': 'f4m', + 'vnd.apple.mpegurl': 'm3u8', 'vnd.ms-sstr+xml': 'ism', - 'quicktime': 'mov', - 'mp2t': 'ts', + 'x-mpegurl': 'm3u8', + + # audio + 'audio/mp4': 'm4a', + # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. + # Using .mp3 as it's the most popular one + 'audio/mpeg': 'mp3', + 'audio/webm': 'weba', + 'audio/x-matroska': 'mka', + 'audio/x-mpegurl': 'm3u', + 'midi': 'mid', + 'ogg': 'ogg', + 'wav': 'wav', + 'wave': 'wav', + 'x-aac': 'aac', + 'x-flac': 'flac', + 'x-m4a': 'm4a', + 'x-realaudio': 'ra', 'x-wav': 'wav', - 'filmstrip+json': 'fs', + + # image + 'avif': 'avif', + 'bmp': 'bmp', + 'gif': 'gif', + 'jpeg': 'jpg', + 'png': 'png', 'svg+xml': 'svg', - } + 'tiff': 'tif', + 'vnd.wap.wbmp': 'wbmp', + 'webp': 'webp', + 'x-icon': 'ico', + 'x-jng': 'jng', + 'x-ms-bmp': 'bmp', - _, _, subtype = mt.rpartition('/') - ext = SUBTYPE_MAP.get(subtype.lower()) - if ext is not None: - return ext + # caption + 'filmstrip+json': 'fs', + 'smptett+xml': 'tt', + 'ttaf+xml': 'dfxp', + 'ttml+xml': 'ttml', + 'x-ms-sami': 'sami', - SUFFIX_MAP = { + # misc + 'gzip': 'gz', 'json': 'json', 'xml': 'xml', 'zip': 'zip', - 'gzip': 'gz', } - _, _, suffix = subtype.partition('+') - ext = SUFFIX_MAP.get(suffix) - if ext is not None: - return ext + mimetype = mt.partition(';')[0].strip().lower() + _, _, subtype = mimetype.rpartition('/') + ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1]) + if ext: + return ext + elif default is not NO_DEFAULT: + return default return subtype.replace('+', '.') @@ -3634,7 +3660,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): return 'mkv' if allow_mkv else preferences[-1] -def urlhandle_detect_ext(url_handle): +def urlhandle_detect_ext(url_handle, default=NO_DEFAULT): getheader = url_handle.headers.get cd = getheader('Content-Disposition') @@ -3645,7 +3671,13 @@ def urlhandle_detect_ext(url_handle): if e: return e - return mimetype2ext(getheader('Content-Type')) + meta_ext = getheader('x-amz-meta-name') + if meta_ext: + e = meta_ext.rpartition('.')[2] + if e: + return e + + return mimetype2ext(getheader('Content-Type'), default=default) def encode_data_uri(data, mime_type):