From e74a3c6dcc30ba16455749c3c5dbb9477961c175 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 9 Dec 2022 15:17:51 +0530 Subject: [PATCH] [extractor/hotstar] Improve format metadata --- yt_dlp/extractor/hotstar.py | 40 ++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index 8725c9436f..cea1812f15 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -148,6 +148,12 @@ class HotStarIE(HotStarBaseIE): 'dr': 'dynamic_range', } + _TAG_FIELDS = { + 'language': 'language', + 'acodec': 'audio_codec', + 'vcodec': 'video_codec', + } + @classmethod def _video_url(cls, video_id, video_type=None, *, slug='ignore_me', root=None): assert None in (video_type, root) @@ -182,24 +188,22 @@ def _real_extract(self, url): for key, prefix in self._IGNORE_MAP.items() for ignore in self._configuration_arg(key)): continue + tag_dict = dict((t.split(':', 1) + [None])[:2] for t in tags.split(';')) format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: continue format_url = re.sub(r'(?<=//staragvod)(\d)', r'web\1', format_url) - dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') ext = determine_ext(format_url) current_formats, current_subs = [], {} try: if 'package:hls' in tags or ext == 'm3u8': current_formats, current_subs = self._extract_m3u8_formats_and_subtitles( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id=f'{dr}-hls', headers=headers) + format_url, video_id, ext='mp4', headers=headers) elif 'package:dash' in tags or ext == 'mpd': current_formats, current_subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id=f'{dr}-dash', headers=headers) + format_url, video_id, headers=headers) elif ext == 'f4m': pass # XXX: produce broken files else: @@ -213,20 +217,32 @@ def _real_extract(self, url): geo_restricted = True continue - if tags and 
'encryption:plain' not in tags: + if tag_dict.get('encryption') not in ('plain', None): for f in current_formats: f['has_drm'] = True - if tags and 'language' in tags: - lang = re.search(r'language:(?P<lang>[a-z]+)', tags).group('lang') - for f in current_formats: - if not f.get('langauge'): - f['language'] = lang + for f in current_formats: + for k, v in self._TAG_FIELDS.items(): + if not f.get(k): + f[k] = tag_dict.get(v) + if f.get('vcodec') != 'none' and not f.get('dynamic_range'): + f['dynamic_range'] = tag_dict.get('dynamic_range') + if f.get('acodec') != 'none' and not f.get('audio_channels'): + f['audio_channels'] = { + 'stereo': 2, + 'dolby51': 6, + }.get(tag_dict.get('audio_channel')) + f['format_note'] = join_nonempty( + tag_dict.get('ladder'), + tag_dict.get('audio_channel') if f.get('acodec') != 'none' else None, + f.get('format_note'), + delim=', ') formats.extend(current_formats) subs = self._merge_subtitles(subs, current_subs) if not formats and geo_restricted: self.raise_geo_restricted(countries=['IN'], metadata_available=True) + self._remove_duplicate_formats(formats) for f in formats: f.setdefault('http_headers', {}).update(headers) @@ -235,7 +251,7 @@ def _real_extract(self, url): 'title': video_data.get('title'), 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), - 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), + 'timestamp': int_or_none(traverse_obj(video_data, 'broadcastDate', 'startDate')), 'formats': formats, 'subtitles': subs, 'channel': video_data.get('channelName'),