from __future__ import annotations

import contextlib
import functools
import http.client
import logging
import re
import socket
import warnings

from ..dependencies import brotli, requests, urllib3
from ..utils import bug_reports_message, int_or_none, variadic
from ..utils.networking import normalize_url

if requests is None:
    raise ImportError('requests module is not installed')

if urllib3 is None:
    raise ImportError('urllib3 module is not installed')

urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))

if urllib3_version < (1, 26, 17):
    urllib3._yt_dlp__version = f'{urllib3.__version__} (unsupported)'
    raise ImportError('Only urllib3 >= 1.26.17 is supported')

if requests.__build__ < 0x023202:
    requests._yt_dlp__version = f'{requests.__version__} (unsupported)'
    raise ImportError('Only requests >= 2.32.2 is supported')

import requests.adapters
import requests.utils
import urllib3.connection
import urllib3.exceptions
import urllib3.util

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    create_socks_proxy_socket,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import (
    Features,
    RequestHandler,
    Response,
    register_preference,
    register_rh,
)
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..socks import ProxyError as SocksProxyError

SUPPORTED_ENCODINGS = [
    'gzip', 'deflate',
]

if brotli is not None:
    SUPPORTED_ENCODINGS.append('br')

'''
Override urllib3's behavior to not convert lower-case percent-encoded characters
to upper-case during the url normalization process.

RFC 3986 defines that lower- and upper-case percent-encoded hexadecimal characters are equivalent
and that normalizers should convert them to uppercase for consistency [1].

However, some sites may have an incorrect implementation where they provide
a percent-encoded url that is then compared case-sensitively. [2]

While this is a very rare case, since urllib does not do this normalization step, it
is best to avoid it in requests too for compatibility reasons.

1: https://tools.ietf.org/html/rfc3986#section-2.1
2: https://github.com/streamlink/streamlink/pull/4003
'''


class Urllib3PercentREOverride:
    def __init__(self, r: re.Pattern):
        self.re = r

    # pass through all other attribute calls to the original re
    def __getattr__(self, item):
        return self.re.__getattribute__(item)

    def subn(self, repl, string, *args, **kwargs):
        return string, self.re.subn(repl, string, *args, **kwargs)[1]
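
# How the override works: urllib3 unpacks the (new_string, match_count) result
# of PERCENT_RE.subn() on a url component and uses the match count to decide
# whether percent-encodings are present. Returning the *original* string
# together with the real match count skips the uppercasing substitution while
# keeping urllib3's bookkeeping intact.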

# urllib3 >= 1.25.8 uses subn:
# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
import urllib3.util.url

if hasattr(urllib3.util.url, 'PERCENT_RE'):
    urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0
    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
else:
    warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
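
# Illustrative effect of the patch (not executed here): a url such as
# http://example.com/a%2fb keeps its lower-case '%2f' escape after urllib3's
# normalization instead of being rewritten to '%2F'.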

'''
Workaround for issue in urllib3.util.ssl_: ssl_wrap_socket does not pass
server_hostname to SSLContext.wrap_socket if server_hostname is an IP,
however this is an issue because we set check_hostname to True in our SSLContext.

Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_socket to pass server_hostname regardless.

This has been fixed in urllib3 2.0+.
See: https://github.com/urllib3/urllib3/issues/517
'''

if urllib3_version < (2, 0, 0):
    with contextlib.suppress(Exception):
        urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True


# Requests will not automatically handle no_proxy by default
# due to buggy no_proxy handling with proxy dict [1].
# 1. https://github.com/psf/requests/issues/5000
requests.adapters.select_proxy = select_proxy


class RequestsResponseAdapter(Response):
    def __init__(self, res: requests.models.Response):
        super().__init__(
            fp=res.raw, headers=res.headers, url=res.url,
            status=res.status_code, reason=res.reason)

        self._requests_response = res

    def read(self, amt: int | None = None):
        try:
            # Interact with urllib3 response directly.
            return self.fp.read(amt, decode_content=True)

        # See urllib3.response.HTTPResponse.read() for exceptions raised on read
        except urllib3.exceptions.SSLError as e:
            raise SSLError(cause=e) from e

        except urllib3.exceptions.ProtocolError as e:
            # IncompleteRead is always contained within ProtocolError
            # See urllib3.response.HTTPResponse._error_catcher()
            ir_err = next(
                (err for err in (e.__context__, e.__cause__, *variadic(e.args))
                 if isinstance(err, http.client.IncompleteRead)), None)
            if ir_err is not None:
                # `urllib3.exceptions.IncompleteRead` is a subclass of `http.client.IncompleteRead`
                # but uses an `int` for its `partial` property.
                partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
                raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
            raise TransportError(cause=e) from e

        except urllib3.exceptions.HTTPError as e:
            # catch-all for any other urllib3 response exceptions
            raise TransportError(cause=e) from e
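
# Sketch of consumer-side handling (illustrative only): callers of the adapter
# deal exclusively with yt-dlp's networking exceptions, never urllib3's:
#
#   try:
#       data = res.read()
#   except IncompleteRead as e:
#       ...  # e.partial and e.expected are ints on every urllib3 version
#   except TransportError:
#       ...  # any other transport-level failure during the read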


class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
    def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
        self._pm_args = {}
        if ssl_context:
            self._pm_args['ssl_context'] = ssl_context
        if source_address:
            self._pm_args['source_address'] = (source_address, 0)
        self._proxy_ssl_context = proxy_ssl_context or ssl_context
        super().__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        return super().init_poolmanager(*args, **kwargs, **self._pm_args)

    def proxy_manager_for(self, proxy, **proxy_kwargs):
        extra_kwargs = {}
        if not proxy.lower().startswith('socks') and self._proxy_ssl_context:
            extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
        return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)

    # Skip `requests` internal verification; we use our own SSLContext
    def cert_verify(*args, **kwargs):
        pass

    # requests 2.32.2+: Reimplementation without `_urllib3_request_context`
    def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None):
        url = urllib3.util.parse_url(request.url).url
        manager = self.poolmanager
        if proxy := select_proxy(url, proxies):
            manager = self.proxy_manager_for(proxy)
        return manager.connection_from_url(url)
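
# Note: requests 2.32.2+ normally builds a fresh SSL context per request inside
# get_connection_with_tls_context() via `_urllib3_request_context`. The
# reimplementation above bypasses that step, since our own SSLContext is
# already injected through init_poolmanager()/proxy_manager_for().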


class RequestsSession(requests.sessions.Session):
    """
    Ensure unified redirect method handling with our urllib redirect handler.
    """

    def rebuild_method(self, prepared_request, response):
        new_method = get_redirect_method(prepared_request.method, response.status_code)

        # HACK: requests removes headers/body on redirect unless code was a 307/308.
        if new_method == prepared_request.method:
            response._real_status_code = response.status_code
            response.status_code = 308

        prepared_request.method = new_method

        # Requests fails to resolve dot segments on absolute redirect locations
        # See: https://github.com/yt-dlp/yt-dlp/issues/9020
        prepared_request.url = normalize_url(prepared_request.url)

    def rebuild_auth(self, prepared_request, response):
        # HACK: undo status code change from rebuild_method, if applicable.
        # rebuild_auth runs after requests would remove headers/body based on status code
        if hasattr(response, '_real_status_code'):
            response.status_code = response._real_status_code
            del response._real_status_code
        return super().rebuild_auth(prepared_request, response)


class Urllib3LoggingFilter(logging.Filter):

    def filter(self, record):
        # Ignore HTTP request messages since HTTPConnection prints those
        return record.msg != '%s://%s:%s "%s %s %s" %s %s'
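
# The filtered template is urllib3.connectionpool's per-request debug line,
# which would otherwise render like:
#   https://example.com:443 "GET /path HTTP/1.1" 200 1234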


class Urllib3LoggingHandler(logging.Handler):
    """Redirect urllib3 logs to our logger"""

    def __init__(self, logger, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._logger = logger

    def emit(self, record):
        try:
            msg = self.format(record)
            if record.levelno >= logging.ERROR:
                self._logger.error(msg)
            else:
                self._logger.stdout(msg)

        except Exception:
            self.handleError(record)


@register_rh
class RequestsRH(RequestHandler, InstanceStoreMixin):

    """Requests RequestHandler
    https://github.com/psf/requests
    """
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'requests'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Forward urllib3 debug messages to our logger
        logger = logging.getLogger('urllib3')
        self.__logging_handler = Urllib3LoggingHandler(logger=self._logger)
        self.__logging_handler.setFormatter(logging.Formatter('requests: %(message)s'))
        self.__logging_handler.addFilter(Urllib3LoggingFilter())
        logger.addHandler(self.__logging_handler)
        # TODO: Use a logger filter to suppress pool reuse warning instead
        logger.setLevel(logging.ERROR)

        if self.verbose:
            # Setting this globally is not ideal, but is easier than hacking with urllib3.
            # It could technically be problematic for scripts embedding yt-dlp.
            # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
            urllib3.connection.HTTPConnection.debuglevel = 1
            logger.setLevel(logging.DEBUG)
        # this is expected if we are using --no-check-certificate
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def close(self):
        self._clear_instances()
        # Remove the logging handler that contains a reference to our logger
        # See: https://github.com/yt-dlp/yt-dlp/issues/8922
        logging.getLogger('urllib3').removeHandler(self.__logging_handler)

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
        extensions.pop('legacy_ssl', None)
        extensions.pop('keep_header_casing', None)

    def _create_instance(self, cookiejar, legacy_ssl_support=None):
        session = RequestsSession()
        http_adapter = RequestsHTTPAdapter(
            ssl_context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support),
            source_address=self.source_address,
            max_retries=urllib3.util.retry.Retry(False),
        )
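        # Drop requests' default-mounted adapters and mount only ours, so every
        # http(s) request goes through the SSLContext/source_address configured above.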
        session.adapters.clear()
        session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
        session.mount('https://', http_adapter)
        session.mount('http://', http_adapter)
        session.cookies = cookiejar
        session.trust_env = False  # no need, we already load proxies from env
        return session

    def _prepare_headers(self, _, headers):
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)

    def _send(self, request):
        headers = self._get_headers(request)
        max_redirects_exceeded = False

        session = self._get_instance(
            cookiejar=self._get_cookiejar(request),
            legacy_ssl_support=request.extensions.get('legacy_ssl'),
        )

        try:
            requests_res = session.request(
                method=request.method,
                url=request.url,
                data=request.data,
                headers=headers,
                timeout=self._calculate_timeout(request),
                proxies=self._get_proxies(request),
                allow_redirects=True,
                stream=True,
            )

        except requests.exceptions.TooManyRedirects as e:
            max_redirects_exceeded = True
            requests_res = e.response

        except requests.exceptions.SSLError as e:
            if 'CERTIFICATE_VERIFY_FAILED' in str(e):
                raise CertificateVerifyError(cause=e) from e
            raise SSLError(cause=e) from e

        except requests.exceptions.ProxyError as e:
            raise ProxyError(cause=e) from e

        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            raise TransportError(cause=e) from e

        except urllib3.exceptions.HTTPError as e:
            # Catch any urllib3 exceptions that may leak through
            raise TransportError(cause=e) from e

        except requests.exceptions.RequestException as e:
            # Miscellaneous Requests exceptions. May not necessarily be network related, e.g. InvalidURL
            raise RequestError(cause=e) from e

        res = RequestsResponseAdapter(requests_res)

        if not 200 <= res.status < 300:
            raise HTTPError(res, redirect_loop=max_redirects_exceeded)

        return res


@register_preference(RequestsRH)
def requests_preference(rh, request):
    return 100
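
# A positive preference ranks this handler above ones left at the default
# preference of 0 (e.g. the urllib handler), so RequestsRH is chosen first
# when several registered handlers support the same request.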


# Use our socks proxy implementation with requests to avoid an extra dependency.
class SocksHTTPConnection(urllib3.connection.HTTPConnection):
    def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
        self._proxy_args = _socks_options
        super().__init__(*args, **kwargs)

    def _new_conn(self):
        try:
            return create_connection(
                address=(self._proxy_args['addr'], self._proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), self._proxy_args))
        except (socket.timeout, TimeoutError) as e:
            raise urllib3.exceptions.ConnectTimeoutError(
                self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e
        except SocksProxyError as e:
            raise urllib3.exceptions.ProxyError(str(e), e) from e
        except OSError as e:
            raise urllib3.exceptions.NewConnectionError(
                self, f'Failed to establish a new connection: {e}') from e


class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
    pass


class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
    ConnectionCls = SocksHTTPConnection


class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
    ConnectionCls = SocksHTTPSConnection


class SocksProxyManager(urllib3.PoolManager):

    def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
        connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy)
        super().__init__(num_pools, headers, **connection_pool_kw)
        self.pool_classes_by_scheme = {
            'http': SocksHTTPConnectionPool,
            'https': SocksHTTPSConnectionPool,
        }
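

# requests' HTTPAdapter.proxy_manager_for() resolves the SOCKSProxyManager
# name from this module when it sees a socks proxy scheme; pointing it at our
# implementation provides socks support without the optional requests[socks]
# (PySocks) dependency.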
requests.adapters.SOCKSProxyManager = SocksProxyManager