[extractor, cleanup] Refactor _download_... methods

2025-02-18 23:46:48 +01:00 · 2022-05-31 23:13:26 +05:30 · 2022-05-31 23:13:26 +05:30 · 617f658b7e
commit 617f658b7e
parent 8a7f6d7a15
1 changed files with 101 additions and 168 deletions
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -791,8 +791,35 @@ class InfoExtractor:
        """
        Return a tuple (page content as string, URL handle).
-        See _download_webpage docstring for arguments specification.
+        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)
        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]
@ -887,102 +914,6 @@ class InfoExtractor:
        return content
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.
        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Requestobject
        video_id -- Video/playlist/item identifier (string)
        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh
    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as an xml.etree.ElementTree.Element.
        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]
    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
@ -995,44 +926,6 @@ class InfoExtractor:
            else:
                self.report_warning(errmsg + str(ve))
    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).
        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh
    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.
        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
        if transform_source:
            json_string = transform_source(json_string)
@ -1058,43 +951,83 @@ class InfoExtractor:
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)
-    def _download_socket_json_handle(
+    def __create_download_methods(name, parser, note, errnote, return_value):
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).
-        See _download_webpage docstring for arguments specification.
+        def parse(ie, content, *args, **kwargs):
-        """
+            if parser is None:
-        res = self._download_webpage_handle(
+                return content
-            url_or_request, video_id, note, errnote, fatal=fatal,
+            # parser is fetched by name so subclasses can override it
-            encoding=encoding, data=data, headers=headers, query=query,
+            return getattr(ie, parser)(content, *args, **kwargs)
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh
-    def _download_socket_json(
+        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote,
-            self, url_or_request, video_id, note='Polling socket',
+                            transform_source=None, fatal=True, *args, **kwargs):
-            errnote='Unable to poll socket', transform_source=None,
+            res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, *args, **kwargs)
-            fatal=True, encoding=None, data=None, headers={}, query={},
+            if res is False:
-            expected_status=None):
+                return res
-        """
+            content, urlh = res
-        Return the JSON object as a dict.
+            return parse(self, content, video_id, transform_source, fatal), urlh
-        See _download_webpage docstring for arguments specification.
+        def download_content(
                self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, *args, **kwargs):
            args = [url_or_request, video_id, note, errnote, transform_source, *args]
            if parser is None:
                args.pop(4)  # transform_source
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(*args, **kwargs)
            return res if res is False else res[0]
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}
                See _download_webpage_handle docstring for other arguments specification
            '''
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
    # Build the downloader method pairs at class-definition time. Each call to
    # __create_download_methods returns (handle-returning function,
    # content-returning function); the parser is passed by method name so it
    # is looked up on the instance when the download runs.
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # No parser is given for plain webpages, so the content is passed through
    # unparsed. Only the content-returning function (index 1) is kept.
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        The download is retried when it fails with
        compat_http_client.IncompleteRead; once the number of failures
        reaches `tries`, that exception is re-raised.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries (5 when not specified)

        See _download_webpage_handle docstring for other arguments specification.
        """
        # NB: tries and timeout are unused; should they be deprecated?
        # If so, this is the place to emit
        # self._downloader.deprecation_warning(...) when tries != 1 or when
        # timeout is given explicitly.

        # Resolve the sentinel before it can reach self._sleep; otherwise a
        # retry with the default timeout would sleep on NO_DEFAULT itself.
        if timeout is NO_DEFAULT:
            timeout = 5

        try_count = 0
        while True:
            try:
                # The positional None fills the transform_source slot of the
                # generated downloader, which has no parser for plain webpages
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except compat_http_client.IncompleteRead:
                try_count += 1
                if try_count >= tries:
                    raise
                self._sleep(timeout, video_id)
    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')