From 73af5cc817ff19d21cb432c5a4e9e37dd35a353d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Jun 2017 21:18:33 +0700 Subject: [PATCH] [YoutubeDL] Skip malformed formats for better extraction robustness --- youtube_dl/YoutubeDL.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c05103bb6..b3a6d4d3b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1448,17 +1448,25 @@ def sanitize_numeric_fields(info): if not formats: raise ExtractorError('No video formats found!') + def is_wellformed(f): + url = f.get('url') + valid_url = url and isinstance(url, compat_str) + if not valid_url: + self.report_warning( + '"url" field is missing or empty - skipping format, ' + 'there is an error in extractor') + return valid_url + + # Filter out malformed formats for better extraction robustness + formats = list(filter(is_wellformed, formats)) + formats_dict = {} # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): - if 'url' not in format: - raise ExtractorError('Missing "url" key in result (index %d)' % i) - sanitize_string_field(format, 'format_id') sanitize_numeric_fields(format) format['url'] = sanitize_url(format['url']) - if format.get('format_id') is None: format['format_id'] = compat_str(i) else: