From a25a424323267e3f6f9f63c0b62df499bd7b8d46 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 1 Apr 2024 02:20:03 +0530 Subject: [PATCH] [ie/youtube] Calculate more accurate `filesize` YouTube provides slightly different duration for each format. Calculating file-size based on this duration instead of the video duration gives more accurate results. Ref: https://github.com/yt-dlp/yt-dlp/issues/1400#issuecomment-2007441207 --- yt_dlp/extractor/youtube.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index b41191b7f..1f1db1ad3 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -33,6 +33,7 @@ clean_html, datetime_from_str, dict_get, + filesize_from_tbr, filter_dict, float_or_none, format_field, @@ -55,6 +56,7 @@ str_to_int, strftime_or_none, traverse_obj, + try_call, try_get, unescapeHTML, unified_strdate, @@ -3839,11 +3841,12 @@ def build_fragments(f): 10 if audio_track.get('audioIsDefault') and 10 else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 else -1) + format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)})) # Some formats may have much smaller duration than others (possibly damaged during encoding) # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 # Make sure to avoid false positives with small duration differences. # E.g. __2ABJjxzNo, ySuUZEjARPY - is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500) + is_damaged = try_call(lambda: format_duration < duration // 2) if is_damaged: self.report_warning( f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) @@ -3873,6 +3876,7 @@ def build_fragments(f): 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, 'has_drm': bool(fmt.get('drmFamilies')), 'tbr': tbr, + 'filesize_approx': filesize_from_tbr(tbr, format_duration), 'url': fmt_url, 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(audio_track.get('id', '').split('.')[0],