This commit is contained in:
coletdjnz 2023-07-29 11:47:50 +12:00
parent 4bf912282a
commit ba17d88d5e
No known key found for this signature in database
GPG Key ID: 91984263BB39894A
8 changed files with 382 additions and 2 deletions

View File

@ -399,6 +399,7 @@ class YoutubeDL:
- "detect_or_warn": check whether we can do anything
about it, warn otherwise (default)
source_address: Client-side IP address to bind to.
impersonate: curl-impersonate target name to impersonate for requests.
sleep_interval_requests: Number of seconds to sleep between requests
during extraction
sleep_interval: Number of seconds to sleep before each download when
@ -3977,7 +3978,7 @@ def get_encoding(stream):
})) or 'none'))
write_debug(f'Proxy map: {self.proxies}')
# write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers)}')
write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}')
for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items():
display_list = ['%s%s' % (
klass.__name__, '' if klass.__name__ == name else f' as {name}')
@ -4105,6 +4106,7 @@ def build_request_director(self, handlers):
'client_certificate_key': 'client_certificate_key',
'client_certificate_password': 'client_certificate_password',
},
'impersonate': 'impersonate',
}),
))
return director

View File

@ -910,6 +910,7 @@ def parse_options(argv=None):
'postprocessors': postprocessors,
'fixup': opts.fixup,
'source_address': opts.source_address,
'impersonate': opts.impersonate,
'call_home': opts.call_home,
'sleep_interval_requests': opts.sleep_interval_requests,
'sleep_interval': opts.sleep_interval,

View File

@ -56,6 +56,10 @@
# See https://github.com/yt-dlp/yt-dlp/issues/2633
websockets = None
try:
import curl_cffi
except ImportError:
curl_cffi = None
try:
import xattr # xattr or pyxattr

View File

@ -11,3 +11,7 @@
# isort: split
# TODO: all request handlers should be safely imported
from . import _urllib
try:
from . import _curlcffi # noqa: F401
except ImportError:
pass

View File

@ -0,0 +1,308 @@
import io
import os
import re
from enum import IntEnum
from .common import Features, Request, RequestHandler, Response, register_rh
from .exceptions import (
CertificateVerifyError,
HTTPError,
IncompleteRead,
SSLError,
TransportError,
)
from .impersonate import ImpersonateRequestHandler
from ._helper import InstanceStoreMixin, select_proxy
from ..cookies import LenientSimpleCookie
from ..dependencies import curl_cffi
from ..utils import int_or_none, traverse_obj
if curl_cffi is None:
raise ImportError('curl_cffi is not installed')
import curl_cffi.requests
from curl_cffi import ffi
from curl_cffi.const import CurlInfo, CurlOpt
class CurlCFFISession(curl_cffi.requests.Session):
    """curl_cffi Session subclass with yt-dlp specific curl option adjustments.

    Re-applies per-handle options on every `curl` access and aligns libcurl's
    redirect handling with yt-dlp's other request handlers.
    """

    def __init__(
        self,
        verbose=False,
        **kwargs
    ):
        super().__init__(**kwargs)
        # When True, enable libcurl's VERBOSE debug output on each curl handle
        self.verbose = verbose

    @property
    def curl(self):
        # due to how curl_cffi handles threading, options must be (re-)applied
        # on every handle access rather than once at session creation
        # NOTE(review): assumes super().curl may return a fresh/thread-local
        # handle — confirm against the installed curl_cffi version
        curl = super().curl
        if self.verbose:
            curl.setopt(CurlOpt.VERBOSE, 1)
        return curl

    def _set_curl_options(self, curl, method: str, url: str, *args, **kwargs):
        # Hook into curl_cffi's per-request option setup.
        res = super()._set_curl_options(curl, method, url, *args, **kwargs)
        # The request body may be passed positionally (args[1]) or by keyword
        data = traverse_obj(kwargs, 'data') or traverse_obj(args, 1)
        # Attempt to align curl redirect handling with ours
        # (clear any stale custom method left over from a previous request)
        curl.setopt(CurlOpt.CUSTOMREQUEST, ffi.NULL)
        if data and method != 'POST':
            # Don't strip data on 301,302,303 redirects for PUT etc.
            curl.setopt(CurlOpt.POSTREDIR, 1 | 2 | 4)  # CURL_REDIR_POST_ALL
        if method not in ('GET', 'POST'):
            curl.setopt(CurlOpt.CUSTOMREQUEST, method.encode())
        return res
def get_error_code(error):
    """Extract the numeric libcurl error code (CURLcode) from a curl_cffi error.

    curl_cffi embeds the code in the exception message as ``ErrCode: <n>``.
    Returns the code as an int, or None when no code can be parsed — the
    original ``re.search(...).group(1)`` raised AttributeError on messages
    without an ``ErrCode:`` marker.
    """
    match = re.search(r'ErrCode:\s+(\d+)', str(error))
    # \d+ always converts cleanly, so plain int() suffices here
    return int(match.group(1)) if match else None
@register_rh
class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
    """Request handler backed by curl_cffi (Python bindings for curl-impersonate).

    Supports impersonating browser TLS/HTTP fingerprints via the
    ``impersonate`` extension / handler option. Sessions are cached and
    reused through InstanceStoreMixin.
    """
    RH_NAME = 'curl_cffi'
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    # Target names accepted by curl_cffi's ``impersonate=`` argument
    _SUPPORTED_IMPERSONATE_TARGETS = curl_cffi.requests.BrowserType._member_names_

    def _create_instance(self):
        # Called by InstanceStoreMixin when no cached session exists
        session_opts = {}
        if self.verbose:
            session_opts['verbose'] = True
        session = CurlCFFISession(**session_opts)
        return session

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # 'impersonate' is validated by ImpersonateRequestHandler; pop it so it
        # is not reported as an unsupported/unhandled extension
        extensions.pop('impersonate', None)

    def _generate_set_cookie(self, cookiejar):
        # Serialize each cookie in the jar into a Set-Cookie style string
        # suitable for feeding to libcurl via CURLOPT_COOKIELIST
        for cookie in cookiejar:
            encoder = LenientSimpleCookie()
            values = []
            _, value = encoder.value_encode(cookie.value)
            values.append(f'{cookie.name}={value}')
            if cookie.domain:
                values.append(f'Domain={cookie.domain}')
            if cookie.path:
                values.append(f'Path={cookie.path}')
            if cookie.secure:
                values.append('Secure')
            if cookie.expires:
                values.append(f'Expires={cookie.expires}')
            if cookie.version:
                values.append(f'Version={cookie.version}')
            yield '; '.join(values)

    def _send(self, request: Request):
        # XXX: curl_cffi reads the whole response at once into memory
        # Streaming is not yet supported.
        # See: https://github.com/yifeikong/curl_cffi/issues/26
        max_redirects_exceeded = False
        session: CurlCFFISession = self._get_instance()
        # Per-request cookiejar extension takes precedence over the handler's
        cookiejar = request.extensions.get('cookiejar') or self.cookiejar

        # Reset the internal curl cookie store to ensure consistency with our cookiejar
        # See: https://curl.se/libcurl/c/CURLOPT_COOKIELIST.html
        # XXX: does this actually work?
        session.curl.setopt(CurlOpt.COOKIELIST, b'ALL')
        session.cookies.clear()
        for cookie_str in self._generate_set_cookie(cookiejar):
            session.curl.setopt(CurlOpt.COOKIELIST, ('set-cookie: ' + cookie_str).encode())

        # XXX: if we need to change http version
        # session.curl.setopt(CurlOpt.HTTP_VERSION, 2)

        if self.source_address is not None:
            # Bind outgoing connections to the configured client-side address
            session.curl.setopt(CurlOpt.INTERFACE, self.source_address.encode())

        proxies = (request.proxies or self.proxies).copy()
        if 'no' in proxies:
            session.curl.setopt(CurlOpt.NOPROXY, proxies['no'].encode())
            proxies.pop('no', None)
        if 'all' in proxies:
            session.curl.setopt(CurlOpt.PROXY, proxies['all'].encode())
        else:
            # curl doesn't support per protocol proxies, so we select the one that matches the request protocol
            proxy = select_proxy(request.url, proxies=proxies)
            if proxy:
                session.curl.setopt(CurlOpt.PROXY, proxy.encode())

        # Strip std default headers when impersonating, so the target
        # browser's own header fingerprint is used instead
        headers = self._get_impersonate_headers(request)

        try:
            curl_response = session.request(
                method=request.method,
                url=request.url,
                headers=headers,
                data=request.data,
                verify=self.verify,
                max_redirects=5,
                timeout=request.extensions.get('timeout') or self.timeout,
                impersonate=self._get_impersonate_target(request),
            )
        except curl_cffi.requests.errors.RequestsError as e:
            # Map libcurl error codes onto yt-dlp's networking exceptions
            error_code = get_error_code(e)
            if error_code in (CurlECode.PEER_FAILED_VERIFICATION, CurlECode.OBSOLETE51):
                # Error code 51 used to be this in curl <7.62.0
                # See: https://curl.se/libcurl/c/libcurl-errors.html
                raise CertificateVerifyError(cause=e) from e
            elif error_code == CurlECode.SSL_CONNECT_ERROR:
                raise SSLError(cause=e) from e
            elif error_code == CurlECode.TOO_MANY_REDIRECTS:
                # The response isn't exposed on too many redirects.
                # We are creating a dummy response here, but it's
                # not ideal since it only contains initial request data
                max_redirects_exceeded = True
                # NOTE(review): Response/Request referenced via the ``cookies``
                # submodule — verify these import paths against the installed
                # curl_cffi version
                curl_response = curl_cffi.requests.cookies.Response(
                    curl=session.curl,
                    request=curl_cffi.requests.cookies.Request(
                        url=request.url,
                        headers=curl_cffi.requests.headers.Headers(request.headers),
                        method=request.method,
                    ))

                # We can try extract *some* data from curl
                curl_response.url = session.curl.getinfo(CurlInfo.EFFECTIVE_URL).decode()
                curl_response.status_code = session.curl.getinfo(CurlInfo.RESPONSE_CODE)
            elif error_code == CurlECode.PARTIAL_FILE:
                raise IncompleteRead(
                    # XXX: do we need partial to have the content?
                    partial=[''] * int(session.curl.getinfo(CurlInfo.SIZE_DOWNLOAD)),
                    expected=session.curl.getinfo(CurlInfo.CONTENT_LENGTH_DOWNLOAD),
                    cause=e) from e
            else:
                raise TransportError(cause=e) from e

        response = Response(
            io.BytesIO(curl_response.content),
            headers=curl_response.headers,
            url=curl_response.url,
            status=curl_response.status_code)

        # XXX: this won't apply cookies from intermediate responses in a redirect chain
        # curl_cffi doesn't support CurlInfo.COOKIELIST yet which we need to reliably read cookies
        # See: https://github.com/yifeikong/curl_cffi/issues/4
        for cookie in session.cookies.jar:
            cookiejar.set_cookie(cookie)

        if not 200 <= response.status < 300:
            # Mirror urllib handler behaviour: non-2xx raises HTTPError;
            # redirect_loop marks the dummy response from TOO_MANY_REDIRECTS
            raise HTTPError(response, redirect_loop=max_redirects_exceeded)

        return response
# https://curl.se/libcurl/c/libcurl-errors.html
class CurlECode(IntEnum):
    """libcurl error codes (CURLcode), as embedded in curl_cffi error messages.

    Values mirror the CURLE_* constants from libcurl. The OBSOLETE* members
    are codes retired by libcurl but kept for matching against older libcurl
    versions (e.g. code 51 was the peer-verification failure before 7.62.0).
    """
    OK = 0
    UNSUPPORTED_PROTOCOL = 1
    FAILED_INIT = 2
    URL_MALFORMAT = 3
    NOT_BUILT_IN = 4
    COULDNT_RESOLVE_PROXY = 5
    COULDNT_RESOLVE_HOST = 6
    COULDNT_CONNECT = 7
    WEIRD_SERVER_REPLY = 8
    REMOTE_ACCESS_DENIED = 9
    FTP_ACCEPT_FAILED = 10
    FTP_WEIRD_PASS_REPLY = 11
    FTP_ACCEPT_TIMEOUT = 12
    FTP_WEIRD_PASV_REPLY = 13
    FTP_WEIRD_227_FORMAT = 14
    FTP_CANT_GET_HOST = 15
    HTTP2 = 16
    FTP_COULDNT_SET_TYPE = 17
    PARTIAL_FILE = 18
    FTP_COULDNT_RETR_FILE = 19
    OBSOLETE20 = 20
    QUOTE_ERROR = 21
    HTTP_RETURNED_ERROR = 22
    WRITE_ERROR = 23
    OBSOLETE24 = 24
    UPLOAD_FAILED = 25
    READ_ERROR = 26
    OUT_OF_MEMORY = 27
    OPERATION_TIMEDOUT = 28
    OBSOLETE29 = 29
    FTP_PORT_FAILED = 30
    FTP_COULDNT_USE_REST = 31
    OBSOLETE32 = 32
    RANGE_ERROR = 33
    HTTP_POST_ERROR = 34
    SSL_CONNECT_ERROR = 35
    BAD_DOWNLOAD_RESUME = 36
    FILE_COULDNT_READ_FILE = 37
    LDAP_CANNOT_BIND = 38
    LDAP_SEARCH_FAILED = 39
    OBSOLETE40 = 40
    FUNCTION_NOT_FOUND = 41
    ABORTED_BY_CALLBACK = 42
    BAD_FUNCTION_ARGUMENT = 43
    OBSOLETE44 = 44
    INTERFACE_FAILED = 45
    OBSOLETE46 = 46
    TOO_MANY_REDIRECTS = 47
    UNKNOWN_OPTION = 48
    SETOPT_OPTION_SYNTAX = 49
    OBSOLETE50 = 50
    OBSOLETE51 = 51
    GOT_NOTHING = 52
    SSL_ENGINE_NOTFOUND = 53
    SSL_ENGINE_SETFAILED = 54
    SEND_ERROR = 55
    RECV_ERROR = 56
    OBSOLETE57 = 57
    SSL_CERTPROBLEM = 58
    SSL_CIPHER = 59
    PEER_FAILED_VERIFICATION = 60
    BAD_CONTENT_ENCODING = 61
    OBSOLETE62 = 62
    FILESIZE_EXCEEDED = 63
    USE_SSL_FAILED = 64
    SEND_FAIL_REWIND = 65
    SSL_ENGINE_INITFAILED = 66
    LOGIN_DENIED = 67
    TFTP_NOTFOUND = 68
    TFTP_PERM = 69
    REMOTE_DISK_FULL = 70
    TFTP_ILLEGAL = 71
    TFTP_UNKNOWNID = 72
    REMOTE_FILE_EXISTS = 73
    TFTP_NOSUCHUSER = 74
    OBSOLETE75 = 75
    OBSOLETE76 = 76
    SSL_CACERT_BADFILE = 77
    REMOTE_FILE_NOT_FOUND = 78
    SSH = 79
    SSL_SHUTDOWN_FAILED = 80
    AGAIN = 81
    SSL_CRL_BADFILE = 82
    SSL_ISSUER_ERROR = 83
    FTP_PRET_FAILED = 84
    RTSP_CSEQ_ERROR = 85
    RTSP_SESSION_ERROR = 86
    FTP_BAD_FILE_LIST = 87
    CHUNK_FAILED = 88
    NO_CONNECTION_AVAILABLE = 89
    SSL_PINNEDPUBKEYNOTMATCH = 90
    SSL_INVALIDCERTSTATUS = 91
    HTTP2_STREAM = 92
    RECURSIVE_API_CALL = 93
    AUTH_ERROR = 94
    HTTP3 = 95
    QUIC_CONNECT_ERROR = 96
    PROXY = 97
    SSL_CLIENTCERT = 98
    UNRECOVERABLE_POLL = 99

View File

@ -373,7 +373,7 @@ def handle_response_read_exceptions(e):
raise TransportError(cause=e) from e
@register_rh
#@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
_SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
_SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')

View File

@ -0,0 +1,56 @@
from abc import ABC
from .exceptions import UnsupportedRequest
from ..utils.networking import std_headers
from .common import RequestHandler
from ..compat.types import NoneType
class ImpersonateRequestHandler(RequestHandler, ABC):
    """
    Base class for request handlers capable of browser impersonation.

    Validates the ``impersonate`` extension (and the handler-level
    ``impersonate`` option) against the handler's supported targets, and
    provides helpers for resolving the effective target and for stripping
    yt-dlp's default headers when impersonating.

    The following may be defined:
    `_SUPPORTED_IMPERSONATE_TARGETS`: a tuple of supported targets to
    impersonate, in curl-impersonate target name format. Any Request with
    an impersonate target not in this list raises UnsupportedRequest.
    Set to None to disable this check.
    """
    _SUPPORTED_IMPERSONATE_TARGETS: tuple = ()

    def __init__(self, *, impersonate=None, **kwargs):
        super().__init__(**kwargs)
        # Handler-level default impersonation target (may be None)
        self.impersonate = impersonate

    def _get_impersonate_target(self, request):
        # The per-request extension takes precedence over the handler default
        target = request.extensions.get('impersonate')
        return target or self.impersonate

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        self._check_impersonate_target(extensions.get('impersonate'))

    def _check_impersonate_target(self, target):
        assert isinstance(target, (str, NoneType))
        if target is None or self._SUPPORTED_IMPERSONATE_TARGETS is None:
            return
        # XXX: this will raise even if the handler doesn't support the impersonate extension
        if target in self._SUPPORTED_IMPERSONATE_TARGETS:
            return
        raise UnsupportedRequest(f'Unsupported impersonate target: {target}')

    def _validate(self, request):
        super()._validate(request)
        self._check_impersonate_target(self.impersonate)

    def _get_impersonate_headers(self, request):
        headers = self._merge_headers(request.headers)
        if self._get_impersonate_target(request):
            # remove all headers present in std_headers so the impersonated
            # browser's own fingerprint is sent instead
            headers.pop('User-Agent', None)
            for name in std_headers:
                if headers.get(name) == std_headers[name]:
                    headers.pop(name, None)
        return headers

View File

@ -510,6 +510,11 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
metavar='IP', dest='source_address', default=None,
help='Client-side IP address to bind to',
)
network.add_option(
'--impersonate',
metavar='TARGET', dest='impersonate', default=None,
help='curl-impersonate target name to impersonate for requests.',
)
network.add_option(
'-4', '--force-ipv4',
action='store_const', const='0.0.0.0', dest='source_address',