_urllib.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. from __future__ import annotations
  2. import functools
  3. import http.client
  4. import io
  5. import socket
  6. import ssl
  7. import urllib.error
  8. import urllib.parse
  9. import urllib.request
  10. import urllib.response
  11. import zlib
  12. from urllib.request import (
  13. DataHandler,
  14. FileHandler,
  15. FTPHandler,
  16. HTTPCookieProcessor,
  17. HTTPDefaultErrorHandler,
  18. HTTPErrorProcessor,
  19. UnknownHandler,
  20. )
  21. from ._helper import (
  22. InstanceStoreMixin,
  23. add_accept_encoding_header,
  24. get_redirect_method,
  25. make_socks_proxy_opts,
  26. select_proxy,
  27. )
  28. from .common import Features, RequestHandler, Response, register_rh
  29. from .exceptions import (
  30. CertificateVerifyError,
  31. HTTPError,
  32. IncompleteRead,
  33. ProxyError,
  34. RequestError,
  35. SSLError,
  36. TransportError,
  37. )
  38. from ..dependencies import brotli
  39. from ..socks import ProxyError as SocksProxyError
  40. from ..socks import sockssocket
  41. from ..utils import update_url_query
  42. from ..utils.networking import normalize_url
  43. SUPPORTED_ENCODINGS = ['gzip', 'deflate']
  44. CONTENT_DECODE_ERRORS = [zlib.error, OSError]
  45. if brotli:
  46. SUPPORTED_ENCODINGS.append('br')
  47. CONTENT_DECODE_ERRORS.append(brotli.error)
def _create_http_connection(http_class, source_address, *args, **kwargs):
    """Instantiate *http_class* (an http.client connection class).

    If *source_address* is not None, patch the connection so outgoing sockets
    bind to that local address and only remote addresses of the matching IP
    family (IPv4 vs IPv6) are attempted.
    """
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A '.' in the source address implies an IPv4 literal; otherwise IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate address in turn, remembering the last failure
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:
    http://techknack.net/python-urllib2-handlers/
    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        # context: ssl.SSLContext passed through to do_open() for HTTPS
        # source_address: local address to bind outgoing connections to
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        # Swap in a SOCKS-capable connection class when ProxyHandler has
        # tagged the request with the internal Ytdl-socks-proxy header
        # (the header is popped so it is never sent over the wire)
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        # Handles both raw deflate streams and zlib-wrapped ones
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        # NOTE: only reached when the optional brotli dependency is available
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Rewrap the decoded body so callers see a normal response object
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    # delete first so only the escaped value remains
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
  186. def make_socks_conn_class(base_class, socks_proxy):
  187. assert issubclass(base_class, (
  188. http.client.HTTPConnection, http.client.HTTPSConnection))
  189. proxy_args = make_socks_proxy_opts(socks_proxy)
  190. class SocksConnection(base_class):
  191. def connect(self):
  192. self.sock = sockssocket()
  193. self.sock.setproxy(**proxy_args)
  194. if type(self.timeout) in (int, float): # noqa: E721
  195. self.sock.settimeout(self.timeout)
  196. self.sock.connect((self.host, self.port))
  197. if isinstance(self, http.client.HTTPSConnection):
  198. self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)
  199. return SocksConnection
  200. class RedirectHandler(urllib.request.HTTPRedirectHandler):
  201. """YoutubeDL redirect handler
  202. The code is based on HTTPRedirectHandler implementation from CPython [1].
  203. This redirect handler fixes and improves the logic to better align with RFC7261
  204. and what browsers tend to do [2][3]
  205. 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
  206. 2. https://datatracker.ietf.org/doc/html/rfc7231
  207. 3. https://github.com/python/cpython/issues/91306
  208. """
  209. http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
  210. def redirect_request(self, req, fp, code, msg, headers, newurl):
  211. if code not in (301, 302, 303, 307, 308):
  212. raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
  213. new_data = req.data
  214. # Technically the Cookie header should be in unredirected_hdrs,
  215. # however in practice some may set it in normal headers anyway.
  216. # We will remove it here to prevent any leaks.
  217. remove_headers = ['Cookie']
  218. new_method = get_redirect_method(req.get_method(), code)
  219. # only remove payload if method changed (e.g. POST to GET)
  220. if new_method != req.get_method():
  221. new_data = None
  222. remove_headers.extend(['Content-Length', 'Content-Type'])
  223. new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}
  224. return urllib.request.Request(
  225. newurl, headers=new_headers, origin_req_host=req.origin_req_host,
  226. unverifiable=True, method=new_method, data=new_data)
  227. class ProxyHandler(urllib.request.BaseHandler):
  228. handler_order = 100
  229. def __init__(self, proxies=None):
  230. self.proxies = proxies
  231. # Set default handlers
  232. for type in ('http', 'https', 'ftp'):
  233. setattr(self, '%s_open' % type, lambda r, meth=self.proxy_open: meth(r))
  234. def proxy_open(self, req):
  235. proxy = select_proxy(req.get_full_url(), self.proxies)
  236. if proxy is None:
  237. return
  238. if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
  239. req.add_header('Ytdl-socks-proxy', proxy)
  240. # hypervideo's http/https handlers do wrapping the socket with socks
  241. return None
  242. return urllib.request.ProxyHandler.proxy_open(
  243. self, req, proxy, None)
  244. class PUTRequest(urllib.request.Request):
  245. def get_method(self):
  246. return 'PUT'
  247. class HEADRequest(urllib.request.Request):
  248. def get_method(self):
  249. return 'HEAD'
  250. def update_Request(req, url=None, data=None, headers=None, query=None):
  251. req_headers = req.headers.copy()
  252. req_headers.update(headers or {})
  253. req_data = data if data is not None else req.data
  254. req_url = update_url_query(url or req.get_full_url(), query)
  255. req_get_method = req.get_method()
  256. if req_get_method == 'HEAD':
  257. req_type = HEADRequest
  258. elif req_get_method == 'PUT':
  259. req_type = PUTRequest
  260. else:
  261. req_type = urllib.request.Request
  262. new_req = req_type(
  263. req_url, data=req_data, headers=req_headers,
  264. origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
  265. if hasattr(req, 'timeout'):
  266. new_req.timeout = req.timeout
  267. return new_req
  268. class UrllibResponseAdapter(Response):
  269. """
  270. HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
  271. """
  272. def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
  273. # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
  274. # HTTPResponse: .getcode() was deprecated, .status always existed [2]
  275. # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
  276. # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
  277. super().__init__(
  278. fp=res, headers=res.headers, url=res.url,
  279. status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))
  280. def read(self, amt=None):
  281. try:
  282. return self.fp.read(amt)
  283. except Exception as e:
  284. handle_response_read_exceptions(e)
  285. raise e
  286. def handle_sslerror(e: ssl.SSLError):
  287. if not isinstance(e, ssl.SSLError):
  288. return
  289. if isinstance(e, ssl.SSLCertVerificationError):
  290. raise CertificateVerifyError(cause=e) from e
  291. raise SSLError(cause=e) from e
def handle_response_read_exceptions(e):
    """Translate an exception raised while reading a response body into the
    unified networking exception hierarchy; unrecognised exceptions return
    silently so the caller can re-raise them itself."""
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        # Checked before OSError: ssl.SSLError is an OSError subclass, and
        # handle_sslerror always raises for genuine SSL errors
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler implementation backed by the standard-library urllib stack."""
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        # enable_file_urls: opt-in support for file:// URLs (off by default)
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # Pop the extensions this handler supports; anything left over is
        # treated as unsupported by the base class
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        # Build an OpenerDirector wired with this module's custom handlers
        # (proxy / HTTP / redirect) plus the stock urllib ones
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]
        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        # Openers are cached per (proxies, cookiejar) via InstanceStoreMixin
        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected
        return UrllibResponseAdapter(res)