import http.client
import os
import random
import socket
import ssl
import time
import urllib.error

from .common import FileDownloader
from ..utils import (
    ContentTooShortError,
    RetryManager,
    ThrottledDownload,
    XAttrMetadataError,
    XAttrUnavailableError,
    encodeFilename,
    int_or_none,
    parse_http_range,
    sanitized_Request,
    try_call,
    write_xattr,
)

# Exceptions that can be raised while reading a response body. urllib does not
# wrap these in URLError, so establish_connection() below catches them directly.
RESPONSE_READ_EXCEPTIONS = (
    TimeoutError,
    socket.timeout,  # compat: py < 3.10
    ConnectionError,
    ssl.SSLError,
    http.client.HTTPException
)


class HttpFD(FileDownloader):
    def real_download(self, filename, info_dict):
        url = info_dict['url']
        request_data = info_dict.get('request_data', None)

        class DownloadContext(dict):
            __getattr__ = dict.get
            __setattr__ = dict.__setitem__
            __delattr__ = dict.__delitem__
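
        # NOTE: missing attributes on a DownloadContext read as None (dict.get),
        # so checks such as `ctx.throttle_start is None` work without explicit
        # initialisation.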
        ctx = DownloadContext()
        ctx.filename = filename
        ctx.tmpfilename = self.temp_name(filename)
        ctx.stream = None

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        add_headers = info_dict.get('http_headers')
        if add_headers:
            headers.update(add_headers)

        is_test = self.params.get('test', False)
        chunk_size = self._TEST_FILE_SIZE if is_test else (
            self.params.get('http_chunk_size')
            or info_dict.get('downloader_options', {}).get('http_chunk_size')
            or 0)
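        # A chunk_size of 0 disables chunked downloading; any positive value
        # makes establish_connection() issue ranged requests of roughly that size.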

        ctx.open_mode = 'wb'
        ctx.resume_len = 0
        ctx.block_size = self.params.get('buffersize', 1024)
        ctx.start_time = time.time()

        # parse given Range
        req_start, req_end, _ = parse_http_range(headers.get('Range'))
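        # e.g. parse_http_range('bytes=0-1023') -> (0, 1023, None), and an
        # absent or unparsable header -> (None, None, None) (assumed from its
        # use here and in the Content-Range check below)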

        if self.params.get('continuedl', True):
            # Establish possible resume length
            if os.path.isfile(encodeFilename(ctx.tmpfilename)):
                ctx.resume_len = os.path.getsize(
                    encodeFilename(ctx.tmpfilename))

        ctx.is_resume = ctx.resume_len > 0

        class SucceedDownload(Exception):
            pass

        class RetryDownload(Exception):
            def __init__(self, source_error):
                self.source_error = source_error

        class NextFragment(Exception):
            pass
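
        # These three exceptions drive the RetryManager loop at the bottom of
        # real_download(): SucceedDownload short-circuits success, RetryDownload
        # re-runs the attempt, and NextFragment advances a chunked download
        # without consuming a retry.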

        def establish_connection():
            ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size)
                              if not is_test and chunk_size else chunk_size)
            if ctx.resume_len > 0:
                range_start = ctx.resume_len
                if req_start is not None:
                    # offset the beginning of Range to be within request
                    range_start += req_start
                if ctx.is_resume:
                    self.report_resuming_byte(ctx.resume_len)
                ctx.open_mode = 'ab'
            elif req_start is not None:
                range_start = req_start
            elif ctx.chunk_size > 0:
                range_start = 0
            else:
                range_start = None
            ctx.is_resume = False

            if ctx.chunk_size:
                chunk_aware_end = range_start + ctx.chunk_size - 1
                # we're not allowed to download outside Range
                range_end = chunk_aware_end if req_end is None else min(chunk_aware_end, req_end)
            elif req_end is not None:
                # there's no need for chunked downloads, so download until the end of Range
                range_end = req_end
            else:
                range_end = None

            # try_call() swallows the TypeError raised when either bound is None,
            # so these guards only fire when both values are known
            if try_call(lambda: range_start > range_end):
                ctx.resume_len = 0
                ctx.open_mode = 'wb'
                raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})'))

            if try_call(lambda: range_end >= ctx.content_len):
                range_end = ctx.content_len - 1

            request = sanitized_Request(url, request_data, headers)
            has_range = range_start is not None
            if has_range:
                request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}')
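                # e.g. resuming at byte 4096 with a 1 MiB chunk produces
                # 'Range: bytes=4096-1052671'; the end is left empty when
                # range_end is None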
            # Establish connection
            try:
                ctx.data = self.ydl.urlopen(request)
                # When trying to resume, the Content-Range header of the response has
                # to be checked against the requested Range header, because some
                # webservers don't support resuming and serve the whole file with no
                # Content-Range set in the response despite the requested Range (see
                # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799)
                if has_range:
                    content_range = ctx.data.headers.get('Content-Range')
                    content_range_start, content_range_end, content_len = parse_http_range(content_range)
                    # Content-Range is present and matches requested Range, resume is possible
                    if range_start == content_range_start and (
                            # Non-chunked download
                            not ctx.chunk_size
                            # Chunked download and requested piece or
                            # its part is promised to be served
                            or content_range_end == range_end
                            or content_len < range_end):
                        ctx.content_len = content_len
                        if content_len or req_end:
                            # data_len is what the user actually asked for, clamped
                            # to the total length the server reports
                            ctx.data_len = min(content_len or req_end, req_end or content_len) - (req_start or 0)
                        return
                    # Content-Range is either not present or invalid. Assuming the remote
                    # webserver is trying to send the whole file, resume is not possible,
                    # so wipe the local file and perform a full redownload
                    self.report_unable_to_resume()
                    ctx.resume_len = 0
                    ctx.open_mode = 'wb'
                ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None))
            except urllib.error.HTTPError as err:
                if err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        ctx.data = self.ydl.urlopen(
                            sanitized_Request(url, request_data, headers))
                        content_length = ctx.data.info()['Content-Length']
                    except urllib.error.HTTPError as err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None
                                and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation of the above condition: in issue #175 it was
                            # revealed that YouTube sometimes adds or removes a few bytes
                            # from the end of the file, changing the file size slightly
                            # and causing problems for some users. So the file is
                            # considered completely downloaded if its size differs by
                            # less than 100 bytes from the one on the hard drive.
                            self.report_file_already_downloaded(ctx.filename)
                            self.try_rename(ctx.tmpfilename, ctx.filename)
                            self._hook_progress({
                                'filename': ctx.filename,
                                'status': 'finished',
                                'downloaded_bytes': ctx.resume_len,
                                'total_bytes': ctx.resume_len,
                            }, info_dict)
                            raise SucceedDownload()
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            ctx.resume_len = 0
                            ctx.open_mode = 'wb'
                            return
                elif err.code < 500 or err.code >= 600:
                    # Unexpected HTTP error
                    raise
                raise RetryDownload(err)
            except urllib.error.URLError as err:
                if isinstance(err.reason, ssl.CertificateError):
                    raise
                raise RetryDownload(err)
            # In urllib.request.AbstractHTTPHandler, the response is partially read on request.
            # Any errors that occur during this will not be wrapped by URLError
            except RESPONSE_READ_EXCEPTIONS as err:
                raise RetryDownload(err)

        def close_stream():
            if ctx.stream is not None:
                if not ctx.tmpfilename == '-':
                    ctx.stream.close()
                ctx.stream = None

        def download():
            data_len = ctx.data.info().get('Content-length', None)

            # Range HTTP header may be ignored/unsupported by a webserver
            # (e.g. extractor/scivee.py, extractor/bambuser.py).
            # However, for a test we still would like to download just a piece of a file.
            # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
            # block size when downloading a file.
            if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
                data_len = self._TEST_FILE_SIZE

            if data_len is not None:
                data_len = int(data_len) + ctx.resume_len
                min_data_len = self.params.get('min_filesize')
                max_data_len = self.params.get('max_filesize')
                if min_data_len is not None and data_len < min_data_len:
                    self.to_screen(
                        f'\r[download] File is smaller than min-filesize ({data_len} bytes < {min_data_len} bytes). Aborting.')
                    return False
                if max_data_len is not None and data_len > max_data_len:
                    self.to_screen(
                        f'\r[download] File is larger than max-filesize ({data_len} bytes > {max_data_len} bytes). Aborting.')
                    return False

            byte_counter = 0 + ctx.resume_len
            block_size = ctx.block_size
            start = time.time()

            # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
            now = None  # needed for slow_down() in the first loop run
            before = start  # start measuring
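
            # retry() records how much data was already written (the file size
            # on disk, or byte_counter when streaming to stdout) before
            # re-raising as RetryDownload, so the next attempt resumes rather
            # than restarts.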
            def retry(e):
                close_stream()
                ctx.resume_len = (byte_counter if ctx.tmpfilename == '-'
                                  else os.path.getsize(encodeFilename(ctx.tmpfilename)))
                raise RetryDownload(e)

            while True:
                try:
                    # Download and write
                    data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
                except RESPONSE_READ_EXCEPTIONS as err:
                    retry(err)

                byte_counter += len(data_block)

                # exit loop when download is finished
                if len(data_block) == 0:
                    break

                # Open destination file just in time
                if ctx.stream is None:
                    try:
                        ctx.stream, ctx.tmpfilename = self.sanitize_open(
                            ctx.tmpfilename, ctx.open_mode)
                        assert ctx.stream is not None
                        ctx.filename = self.undo_temp_name(ctx.tmpfilename)
                        self.report_destination(ctx.filename)
                    except OSError as err:
                        self.report_error('unable to open for writing: %s' % str(err))
                        return False

                    if self.params.get('xattr_set_filesize', False) and data_len is not None:
                        try:
                            write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode())
                        except (XAttrUnavailableError, XAttrMetadataError) as err:
                            self.report_error('unable to set filesize xattr: %s' % str(err))

                try:
                    ctx.stream.write(data_block)
                except OSError as err:
                    self.to_stderr('\n')
                    self.report_error('unable to write data: %s' % str(err))
                    return False

                # Apply rate limit
                self.slow_down(start, now, byte_counter - ctx.resume_len)

                # end measuring of one loop run
                now = time.time()
                after = now

                # Adjust block size
                if not self.params.get('noresizebuffer', False):
                    block_size = self.best_block_size(after - before, len(data_block))

                before = after

                # Progress message
                speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
                if ctx.data_len is None:
                    eta = None
                else:
                    eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len)

                self._hook_progress({
                    'status': 'downloading',
                    'downloaded_bytes': byte_counter,
                    'total_bytes': ctx.data_len,
                    'tmpfilename': ctx.tmpfilename,
                    'filename': ctx.filename,
                    'eta': eta,
                    'speed': speed,
                    'elapsed': now - ctx.start_time,
                    'ctx_id': info_dict.get('ctx_id'),
                }, info_dict)

                if data_len is not None and byte_counter == data_len:
                    break

                if speed and speed < (self.params.get('throttledratelimit') or 0):
                    # The speed must stay below the limit for 3 seconds
                    # This prevents raising an error when the speed drops only temporarily
                    if ctx.throttle_start is None:
                        ctx.throttle_start = now
                    elif now - ctx.throttle_start > 3:
                        if ctx.stream is not None and ctx.tmpfilename != '-':
                            ctx.stream.close()
                        raise ThrottledDownload()
                elif speed:
                    ctx.throttle_start = None
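
            # A chunked download that stopped short of content_len is not an
            # error: remember the progress and loop around for the next ranged
            # request via NextFragment.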
            if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len:
                ctx.resume_len = byte_counter
                # ctx.block_size = block_size
                raise NextFragment()

            if ctx.stream is None:
                self.to_stderr('\n')
                self.report_error('Did not get any data blocks')
                return False

            if ctx.tmpfilename != '-':
                ctx.stream.close()

            if data_len is not None and byte_counter != data_len:
                err = ContentTooShortError(byte_counter, int(data_len))
                retry(err)

            self.try_rename(ctx.tmpfilename, ctx.filename)

            # Update file modification time
            if self.params.get('updatetime', True):
                info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))

            self._hook_progress({
                'downloaded_bytes': byte_counter,
                'total_bytes': byte_counter,
                'filename': ctx.filename,
                'status': 'finished',
                'elapsed': time.time() - ctx.start_time,
                'ctx_id': info_dict.get('ctx_id'),
            }, info_dict)

            return True
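
        # Drive the download: each attempt opens a connection and streams the
        # body. NextFragment does not count against --retries (the attempt
        # counter is decremented back), while any unexpected exception closes
        # the stream before propagating.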
        for retry in RetryManager(self.params.get('retries'), self.report_retry):
            try:
                establish_connection()
                return download()
            except RetryDownload as err:
                retry.error = err.source_error
                continue
            except NextFragment:
                retry.error = None
                retry.attempt -= 1
                continue
            except SucceedDownload:
                return True
            except:  # noqa: E722
                close_stream()
                raise
        return False
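

# A minimal usage sketch (hypothetical wiring; in practice YoutubeDL selects
# and instantiates the downloader itself, so treat the lines below as an
# illustration under that assumption, not a supported entry point):
#
#   from yt_dlp import YoutubeDL
#
#   ydl = YoutubeDL({'http_chunk_size': 10485760})  # ~10 MiB ranged chunks
#   fd = HttpFD(ydl, ydl.params)
#   fd.real_download('video.mp4', {'url': 'https://example.com/video.mp4'})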