123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386 |
- import http.client
- import os
- import random
- import socket
- import ssl
- import time
- import urllib.error
- from .common import FileDownloader
- from ..utils import (
- ContentTooShortError,
- RetryManager,
- ThrottledDownload,
- XAttrMetadataError,
- XAttrUnavailableError,
- encodeFilename,
- int_or_none,
- parse_http_range,
- sanitized_Request,
- try_call,
- write_xattr,
- )
# Exception types that can surface while a response is being read.
# The downloader catches these and turns them into a retry instead of
# aborting the whole download.
RESPONSE_READ_EXCEPTIONS = (
    TimeoutError,
    socket.timeout,  # compat: py < 3.10
    ConnectionError,
    ssl.SSLError,
    http.client.HTTPException,
)
class HttpFD(FileDownloader):
    """File downloader that fetches a single URL over HTTP(S).

    Supports resuming a partially downloaded temp file, honouring a
    caller-supplied ``Range`` header, optional chunked ("fragmented") range
    requests, rate limiting/throttle detection and retrying on transient
    network errors.
    """

    def real_download(self, filename, info_dict):
        """Download ``info_dict['url']`` into ``filename``.

        Keys read from ``info_dict``: ``url`` (required), ``request_data``,
        ``http_headers``, ``downloader_options`` (``http_chunk_size``) and
        ``ctx_id``.  ``info_dict['filetime']`` is written when the
        ``updatetime`` param is enabled.

        Returns ``True`` when the file was downloaded (or was found to be
        already complete), ``False`` when the download was aborted (size
        limits, I/O errors, no data) or the retry loop finished without
        success.  Non-retryable HTTP errors, certificate errors and
        ``ThrottledDownload`` propagate to the caller.
        """
        url = info_dict['url']
        request_data = info_dict.get('request_data')

        class DownloadContext(dict):
            # Attribute-style access on a dict; missing attributes read as None
            # (because __getattr__ is dict.get).
            __getattr__ = dict.get
            __setattr__ = dict.__setitem__
            __delattr__ = dict.__delitem__

        ctx = DownloadContext()
        ctx.filename = filename
        ctx.tmpfilename = self.temp_name(filename)
        ctx.stream = None

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        add_headers = info_dict.get('http_headers')
        if add_headers:
            headers.update(add_headers)

        is_test = self.params.get('test', False)
        chunk_size = self._TEST_FILE_SIZE if is_test else (
            self.params.get('http_chunk_size')
            or info_dict.get('downloader_options', {}).get('http_chunk_size')
            or 0)

        ctx.open_mode = 'wb'
        ctx.resume_len = 0
        ctx.block_size = self.params.get('buffersize', 1024)
        ctx.start_time = time.time()

        # parse given Range
        req_start, req_end, _ = parse_http_range(headers.get('Range'))

        if self.params.get('continuedl', True):
            # Establish possible resume length
            if os.path.isfile(encodeFilename(ctx.tmpfilename)):
                ctx.resume_len = os.path.getsize(
                    encodeFilename(ctx.tmpfilename))

        ctx.is_resume = ctx.resume_len > 0

        # Internal control-flow signals between the nested helpers and the
        # retry loop at the bottom of this method.
        class SucceedDownload(Exception):
            pass

        class RetryDownload(Exception):
            def __init__(self, source_error):
                self.source_error = source_error

        class NextFragment(Exception):
            pass

        def establish_connection():
            """Open ``ctx.data`` for the next byte range and validate the response."""
            # Slightly randomise the chunk size (95-100%) for real downloads
            ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size)
                              if not is_test and chunk_size else chunk_size)
            if ctx.resume_len > 0:
                range_start = ctx.resume_len
                if req_start is not None:
                    # offset the beginning of Range to be within request
                    range_start += req_start
                if ctx.is_resume:
                    self.report_resuming_byte(ctx.resume_len)
                ctx.open_mode = 'ab'
            elif req_start is not None:
                range_start = req_start
            elif ctx.chunk_size > 0:
                range_start = 0
            else:
                range_start = None
            # Only report "resuming" once, for the very first connection
            ctx.is_resume = False

            if ctx.chunk_size:
                chunk_aware_end = range_start + ctx.chunk_size - 1
                # we're not allowed to download outside Range
                range_end = chunk_aware_end if req_end is None else min(chunk_aware_end, req_end)
            elif req_end is not None:
                # there's no need for chunked downloads, so download until the end of Range
                range_end = req_end
            else:
                range_end = None

            # try_call is used so that comparisons involving None do not raise
            if try_call(lambda: range_start > range_end):
                ctx.resume_len = 0
                ctx.open_mode = 'wb'
                raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})'))

            if try_call(lambda: range_end >= ctx.content_len):
                range_end = ctx.content_len - 1

            request = sanitized_Request(url, request_data, headers)
            has_range = range_start is not None
            if has_range:
                request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}')
            # Establish connection
            try:
                ctx.data = self.ydl.urlopen(request)
                # When trying to resume, the Content-Range HTTP header of the response has to be
                # checked to match the value of the requested Range HTTP header. This is due to
                # webservers that don't support resuming and serve a whole file with no
                # Content-Range set in response despite the requested Range (see
                # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799)
                if has_range:
                    content_range = ctx.data.headers.get('Content-Range')
                    content_range_start, content_range_end, content_len = parse_http_range(content_range)
                    # Content-Range is present and matches requested Range, resume is possible
                    if range_start == content_range_start and (
                            # Non-chunked download
                            not ctx.chunk_size
                            # Chunked download and requested piece or
                            # its part is promised to be served
                            or content_range_end == range_end
                            # Guard against an unknown total length so the comparison
                            # cannot raise TypeError (presumably parse_http_range yields
                            # None when the total is absent - verify against utils)
                            or (content_len is not None and content_len < range_end)):
                        ctx.content_len = content_len
                        if content_len or req_end:
                            ctx.data_len = min(content_len or req_end, req_end or content_len) - (req_start or 0)
                        return
                    # Content-Range is either not present or invalid. Assuming remote webserver is
                    # trying to send the whole file, resume is not possible, so wiping the local file
                    # and performing entire redownload
                    self.report_unable_to_resume()
                    ctx.resume_len = 0
                    ctx.open_mode = 'wb'
                ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None))
            except urllib.error.HTTPError as err:
                if err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        ctx.data = self.ydl.urlopen(
                            sanitized_Request(url, request_data, headers))
                        content_length = ctx.data.info()['Content-Length']
                    except urllib.error.HTTPError as reopen_err:
                        # NOTE: a distinct name is required here. Re-using ``err`` would
                        # unbind the outer ``err`` when this clause ends, and the retry
                        # below would fail with UnboundLocalError.
                        if reopen_err.code < 500 or reopen_err.code >= 600:
                            raise
                        # Server-side (5xx) failure: worth another attempt
                        raise RetryDownload(reopen_err)
                    else:
                        # Examine the reported length
                        if (content_length is not None
                                and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(ctx.filename)
                            self.try_rename(ctx.tmpfilename, ctx.filename)
                            self._hook_progress({
                                'filename': ctx.filename,
                                'status': 'finished',
                                'downloaded_bytes': ctx.resume_len,
                                'total_bytes': ctx.resume_len,
                            }, info_dict)
                            raise SucceedDownload()
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            ctx.resume_len = 0
                            ctx.open_mode = 'wb'
                            return
                elif err.code < 500 or err.code >= 600:
                    # Unexpected HTTP error
                    raise
                raise RetryDownload(err)
            except urllib.error.URLError as err:
                if isinstance(err.reason, ssl.CertificateError):
                    raise
                raise RetryDownload(err)
            # In urllib.request.AbstractHTTPHandler, the response is partially read on request.
            # Any errors that occur during this will not be wrapped by URLError
            except RESPONSE_READ_EXCEPTIONS as err:
                raise RetryDownload(err)

        def close_stream():
            """Close the output stream (unless writing to '-') and forget it."""
            if ctx.stream is not None:
                if not ctx.tmpfilename == '-':
                    ctx.stream.close()
                ctx.stream = None

        def download():
            """Read ``ctx.data`` to the end, writing blocks to the temp file."""
            data_len = ctx.data.info().get('Content-length', None)

            # Range HTTP header may be ignored/unsupported by a webserver
            # (e.g. extractor/scivee.py, extractor/bambuser.py).
            # However, for a test we still would like to download just a piece of a file.
            # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
            # block size when downloading a file.
            if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
                data_len = self._TEST_FILE_SIZE

            if data_len is not None:
                data_len = int(data_len) + ctx.resume_len
                min_data_len = self.params.get('min_filesize')
                max_data_len = self.params.get('max_filesize')
                if min_data_len is not None and data_len < min_data_len:
                    self.to_screen(
                        f'\r[download] File is smaller than min-filesize ({data_len} bytes < {min_data_len} bytes). Aborting.')
                    return False
                if max_data_len is not None and data_len > max_data_len:
                    self.to_screen(
                        f'\r[download] File is larger than max-filesize ({data_len} bytes > {max_data_len} bytes). Aborting.')
                    return False

            byte_counter = ctx.resume_len
            block_size = ctx.block_size
            start = time.time()

            # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
            now = None  # needed for slow_down() in the first loop run
            before = start  # start measuring

            def retry(e):
                """Record how much is on disk and request another attempt."""
                close_stream()
                if ctx.tmpfilename == '-':
                    ctx.resume_len = byte_counter
                else:
                    try:
                        ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))
                    except FileNotFoundError:
                        # Nothing was written yet (the failure happened before the
                        # first block), so the next attempt starts from scratch.
                        ctx.resume_len = 0
                raise RetryDownload(e)

            while True:
                try:
                    # Download and write
                    data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
                except RESPONSE_READ_EXCEPTIONS as err:
                    retry(err)

                byte_counter += len(data_block)

                # exit loop when download is finished
                if len(data_block) == 0:
                    break

                # Open destination file just in time
                if ctx.stream is None:
                    try:
                        ctx.stream, ctx.tmpfilename = self.sanitize_open(
                            ctx.tmpfilename, ctx.open_mode)
                        assert ctx.stream is not None
                        ctx.filename = self.undo_temp_name(ctx.tmpfilename)
                        self.report_destination(ctx.filename)
                    except OSError as err:
                        self.report_error(f'unable to open for writing: {err}')
                        return False

                    if self.params.get('xattr_set_filesize', False) and data_len is not None:
                        try:
                            write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode())
                        except (XAttrUnavailableError, XAttrMetadataError) as err:
                            self.report_error(f'unable to set filesize xattr: {err}')

                try:
                    ctx.stream.write(data_block)
                except OSError as err:
                    self.to_stderr('\n')
                    self.report_error(f'unable to write data: {err}')
                    return False

                # Apply rate limit
                self.slow_down(start, now, byte_counter - ctx.resume_len)

                # end measuring of one loop run
                now = time.time()
                after = now

                # Adjust block size
                if not self.params.get('noresizebuffer', False):
                    block_size = self.best_block_size(after - before, len(data_block))

                before = after

                # Progress message
                speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
                if ctx.data_len is None:
                    eta = None
                else:
                    eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len)

                self._hook_progress({
                    'status': 'downloading',
                    'downloaded_bytes': byte_counter,
                    'total_bytes': ctx.data_len,
                    'tmpfilename': ctx.tmpfilename,
                    'filename': ctx.filename,
                    'eta': eta,
                    'speed': speed,
                    'elapsed': now - ctx.start_time,
                    'ctx_id': info_dict.get('ctx_id'),
                }, info_dict)

                if data_len is not None and byte_counter == data_len:
                    break

                if speed and speed < (self.params.get('throttledratelimit') or 0):
                    # The speed must stay below the limit for 3 seconds
                    # This prevents raising error when the speed temporarily goes down
                    if ctx.throttle_start is None:
                        ctx.throttle_start = now
                    elif now - ctx.throttle_start > 3:
                        close_stream()
                        raise ThrottledDownload()
                elif speed:
                    ctx.throttle_start = None

            # Chunked download with more data remaining: ask the retry loop for the next range
            if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len:
                ctx.resume_len = byte_counter
                raise NextFragment()

            if ctx.stream is None:
                self.to_stderr('\n')
                self.report_error('Did not get any data blocks')
                return False
            if ctx.tmpfilename != '-':
                ctx.stream.close()

            if data_len is not None and byte_counter != data_len:
                err = ContentTooShortError(byte_counter, int(data_len))
                retry(err)

            self.try_rename(ctx.tmpfilename, ctx.filename)

            # Update file modification time
            if self.params.get('updatetime', True):
                info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))

            self._hook_progress({
                'downloaded_bytes': byte_counter,
                'total_bytes': byte_counter,
                'filename': ctx.filename,
                'status': 'finished',
                'elapsed': time.time() - ctx.start_time,
                'ctx_id': info_dict.get('ctx_id'),
            }, info_dict)

            return True

        for retry in RetryManager(self.params.get('retries'), self.report_retry):
            try:
                establish_connection()
                return download()
            except RetryDownload as err:
                retry.error = err.source_error
                continue
            except NextFragment:
                # Fetching the next chunk is not a failure: do not consume a retry
                retry.error = None
                retry.attempt -= 1
                continue
            except SucceedDownload:
                return True
            except BaseException:
                # Anything else (including KeyboardInterrupt): release the file, then propagate
                close_stream()
                raise
        return False
|