From d22dec74ffa2a53a1c04770af37d39f384f3d56c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 19:20:12 +0600 Subject: [PATCH 1/5] Add `--force-generic-extractor` For some extractors it is hard to work out a good _VALID_URL, so we use very vague and unrestrictive ones, e.g. just allowing anything after the hostname and capturing part of the URL as the id. If a page matched by one of these extractors happens to have a video embed from a different hoster or platform, and this scenario is not handled in the extractor itself, we end up unable to download this embed until the extractor is fixed to support embeds of this kind. Forcing the downloader to use the generic extractor can be a neat temporary solution for this problem. Example: FiveTV extractor with Tvigle embed - http://www.5-tv.ru/rabota/broadcasts/48/ --- youtube_dl/YoutubeDL.py | 6 ++++++ youtube_dl/__init__.py | 1 + youtube_dl/extractor/generic.py | 4 +++- youtube_dl/options.py | 4 ++++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b1f792d4e..4b801a917 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -139,6 +139,7 @@ class YoutubeDL(object): outtmpl: Template for output names. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. + force_generic_extractor: Force downloader to use the generic extractor nooverwrites: Prevent overwriting files. playliststart: Playlist item to start at. playlistend: Playlist item to end at. 
@@ -282,6 +283,7 @@ def __init__(self, params=None, auto_init=True): self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr + self._force_generic_extractor_required = params.get('force_generic_extractor', False) self.params = params self.cache = Cache(self) @@ -633,6 +635,10 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={}, extra_info is a dict containing the extra values to add to each result ''' + if not ie_key and self._force_generic_extractor_required: + self._force_generic_extractor_required = False + ie_key = 'Generic' + if ie_key: ies = [self.get_info_extractor(ie_key)] else: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ace17857c..215b616de 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -293,6 +293,7 @@ def _real_main(argv=None): 'autonumber_size': opts.autonumber_size, 'restrictfilenames': opts.restrictfilenames, 'ignoreerrors': opts.ignoreerrors, + 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, 'retries': opts_retries, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40d869c53..3d672197c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -977,7 +977,9 @@ def _real_extract(self, url): 'upload_date': upload_date, } - if not self._downloader.params.get('test', False) and not is_intentional: + if (not self._downloader.params.get('test', False) and + not is_intentional and + not self._downloader.params.get('force_generic_extractor', False)): self._downloader.report_warning('Falling back on generic information extractor.') if not full_response: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 689fa7595..096ab6137 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -150,6 +150,10 @@ def _hide_login_info(opts): '--extractor-descriptions', 
action='store_true', dest='list_extractor_descriptions', default=False, help='Output descriptions of all supported extractors') + general.add_option( + '--force-generic-extractor', + action='store_true', dest='force_generic_extractor', default=False, + help='Force extraction to use the generic extractor') general.add_option( '--default-search', dest='default_search', metavar='PREFIX', From 9f4323252abade4f10b0884682f92cedc78b4d4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 21:56:50 +0600 Subject: [PATCH 2/5] [YoutubeDL] Fix for multiple URLs --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4b801a917..8dbad7cf8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -283,7 +283,6 @@ def __init__(self, params=None, auto_init=True): self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr - self._force_generic_extractor_required = params.get('force_generic_extractor', False) self.params = params self.cache = Cache(self) @@ -1504,6 +1503,7 @@ def download(self, url_list): for url in url_list: try: + self._force_generic_extractor_required = self.params.get('force_generic_extractor', False) # It also downloads the videos res = self.extract_info(url) except UnavailableVideoError: From 61aa5ba36eea3b7cf8c3570ab33604dd2c13b855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 02:05:21 +0600 Subject: [PATCH 3/5] [YoutubeDL] Remove global state for force_generic_extractor flag in favor of passing argument --- youtube_dl/YoutubeDL.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8dbad7cf8..dd2d8cb3c 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -627,15 +627,14 @@ def add_extra_info(info_dict, 
extra_info): info_dict.setdefault(key, value) def extract_info(self, url, download=True, ie_key=None, extra_info={}, - process=True): + process=True, force_generic_extractor=False): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. extra_info is a dict containing the extra values to add to each result ''' - if not ie_key and self._force_generic_extractor_required: - self._force_generic_extractor_required = False + if not ie_key and force_generic_extractor: ie_key = 'Generic' if ie_key: @@ -663,7 +662,7 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={}, } self.add_default_extra_info(ie_result, ie, url) if process: - return self.process_ie_result(ie_result, download, extra_info) + return self.process_ie_result(ie_result, download, extra_info, force_generic_extractor=False) else: return ie_result except ExtractorError as de: # An error we somewhat expected @@ -688,7 +687,7 @@ def add_default_extra_info(self, ie_result, ie, url): 'extractor_key': ie.ie_key(), }) - def process_ie_result(self, ie_result, download=True, extra_info={}): + def process_ie_result(self, ie_result, download=True, extra_info={}, force_generic_extractor=False): """ Take the result of the ie(may be modified) and resolve all unresolved references (URLs, playlist items). 
@@ -716,7 +715,8 @@ def process_ie_result(self, ie_result, download=True, extra_info={}): return self.extract_info(ie_result['url'], download, ie_key=ie_result.get('ie_key'), - extra_info=extra_info) + extra_info=extra_info, + force_generic_extractor=force_generic_extractor) elif result_type == 'url_transparent': # Use the information from the embedding page info = self.extract_info( @@ -1503,9 +1503,9 @@ def download(self, url_list): for url in url_list: try: - self._force_generic_extractor_required = self.params.get('force_generic_extractor', False) # It also downloads the videos - res = self.extract_info(url) + res = self.extract_info( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: From 0072afca8e02052c77dc3b7009e51114887e31b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 02:21:29 +0600 Subject: [PATCH 4/5] [YoutubeDL] Remove force_generic_extractor arg from process_ie_result --- youtube_dl/YoutubeDL.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dd2d8cb3c..a7d3a1c01 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -662,7 +662,7 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={}, } self.add_default_extra_info(ie_result, ie, url) if process: - return self.process_ie_result(ie_result, download, extra_info, force_generic_extractor=False) + return self.process_ie_result(ie_result, download, extra_info) else: return ie_result except ExtractorError as de: # An error we somewhat expected @@ -687,7 +687,7 @@ def add_default_extra_info(self, ie_result, ie, url): 'extractor_key': ie.ie_key(), }) - def process_ie_result(self, ie_result, download=True, extra_info={}, force_generic_extractor=False): + def process_ie_result(self, ie_result, download=True, extra_info={}): """ 
Take the result of the ie(may be modified) and resolve all unresolved references (URLs, playlist items). @@ -715,8 +715,7 @@ def process_ie_result(self, ie_result, download=True, extra_info={}, force_gener return self.extract_info(ie_result['url'], download, ie_key=ie_result.get('ie_key'), - extra_info=extra_info, - force_generic_extractor=force_generic_extractor) + extra_info=extra_info) elif result_type == 'url_transparent': # Use the information from the embedding page info = self.extract_info( From 2fece970b80022574a6b54c936820897cfd10d70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 24 Jun 2015 03:08:24 +0600 Subject: [PATCH 5/5] [extractor/generic] Clarify generic extraction warning --- youtube_dl/extractor/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3d672197c..c8582bda9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -977,10 +977,10 @@ def _real_extract(self, url): 'upload_date': upload_date, } - if (not self._downloader.params.get('test', False) and - not is_intentional and - not self._downloader.params.get('force_generic_extractor', False)): - self._downloader.report_warning('Falling back on generic information extractor.') + if not self._downloader.params.get('test', False) and not is_intentional: + force = self._downloader.params.get('force_generic_extractor', False) + self._downloader.report_warning( + '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) if not full_response: request = compat_urllib_request.Request(url)