123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286 |
- #!/usr/bin/env python3
- from gevent import monkey
- monkey.patch_all()
- import gevent.socket
- from youtube import yt_app
- from youtube import util
- # these are just so the files get run - they import yt_app and add routes to it
- from youtube import watch, search, playlist, channel, local_playlist, comments, subscriptions
- import settings
- from gevent.pywsgi import WSGIServer
- import urllib
- import urllib3
- import socket
- import socks, sockshandler
- import subprocess
- import re
- import sys
- import time
def youtu_be(env, start_response):
    '''Handle a youtu.be short link by turning it into a /watch request.

    Everything after the leading slash of PATH_INFO is taken as the video
    id. The environ is rewritten in place (PATH_INFO becomes '/watch' and
    the id is added to QUERY_STRING as the `v` parameter) and the request
    is then delegated to yt_app.
    '''
    video_id = env['PATH_INFO'][1:]
    env['PATH_INFO'] = '/watch'

    # Keep whatever parameters the request already carried and put the
    # video id after them.
    existing_query = env['QUERY_STRING']
    env['QUERY_STRING'] = (
        existing_query + '&v=' + video_id if existing_query
        else 'v=' + video_id
    )

    yield from yt_app(env, start_response)
RANGE_RE = re.compile(r'bytes=(\d+-(?:\d+)?)')


def parse_range(range_header, content_length):
    '''Turn a Range header value into an inclusive (start_byte, end_byte).

    Two forms are recognised: "bytes=200-1000" and the open-ended
    "bytes=200-". For the open-ended form the end is computed as
    start_byte + content_length - 1, which is the true last byte only when
    content_length counts the bytes from start_byte onward.

    Any other format (for example a suffix range such as "bytes=-500")
    is reported with a printed message and None is returned.
    '''
    parsed = RANGE_RE.fullmatch(range_header.strip())
    if parsed is None:
        print('Unsupported range header format:', range_header)
        return None

    # The captured group always contains exactly one '-'.
    first, _, last = parsed.group(1).partition('-')
    start_byte = int(first)
    end_byte = int(last) if last else start_byte + content_length - 1
    return start_byte, end_byte
def proxy_site(env, start_response, video=False):
    '''Stream https://<SERVER_NAME><PATH_INFO>[?<QUERY_STRING>] to the client.

    This is a WSGI generator: it fetches the remote URL through
    util.fetch_url_response, calls start_response once with the status and
    headers of the first upstream response, and yields the body in chunks.
    A Range header sent by the client (HTTP_RANGE) is forwarded upstream.

    If the upstream body ends before Content-Length bytes have arrived, the
    missing part is requested again with a Range header and streaming
    continues. A given byte position is attempted three times before giving
    up; the counter resets whenever progress is made.

    With video=True the function additionally:
      * strips the trailing "/name/..." portion of videoplayback URLs,
      * routes through Tor when settings.route_tor == 2 or the query string
        has a truthy integer `use_tor` parameter,
      * allows up to 10 redirects,
      * appends an "Access-Control-Allow-Origin: *" response header.

    The cleanup callback returned by util.fetch_url_response is always
    invoked for every response obtained, including when the client
    disconnects mid-stream (the generator is closed) or an error is raised
    while streaming.
    '''
    send_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Accept': '*/*',
    }
    if 'HTTP_RANGE' in env:
        send_headers['Range'] = env['HTTP_RANGE']

    url = "https://" + env['SERVER_NAME'] + env['PATH_INFO']
    # remove /name portion
    if video and '/videoplayback/name/' in url:
        url = url[0:url.rfind('/name/')]
    if env['QUERY_STRING']:
        url += '?' + env['QUERY_STRING']

    try_num = 1
    first_attempt = True
    current_attempt_position = 0
    while try_num <= 3:   # Try a given byte position three times
        if not first_attempt:
            print('(Try %d)' % try_num, 'Trying with', send_headers['Range'])

        if video:
            params = urllib.parse.parse_qs(env['QUERY_STRING'])
            # parse_qs maps every name to a list of values, so the default
            # is a one-element list as well.
            params_use_tor = int(params.get('use_tor', ['0'])[0])
            use_tor = (settings.route_tor == 2) or params_use_tor
            response, cleanup_func = util.fetch_url_response(
                url, send_headers, use_tor=use_tor, max_redirects=10)
        else:
            response, cleanup_func = util.fetch_url_response(url, send_headers)

        retry = False
        # From here on the response must be cleaned up no matter how this
        # attempt ends: normal completion, an exception, or GeneratorExit
        # raised at the yield when the consumer closes the generator.
        try:
            response_headers = response.headers
            if isinstance(response_headers,
                          urllib3._collections.HTTPHeaderDict):
                response_headers = response_headers.items()
            if video:
                response_headers = (list(response_headers)
                                    + [('Access-Control-Allow-Origin', '*')])

            # Headers may be sent to the client only once; retries just
            # continue the body that is already being streamed.
            if first_attempt:
                start_response(str(response.status) + ' ' + response.reason,
                               response_headers)

            content_length = int(
                dict(response_headers).get('Content-Length', 0))
            if response.status >= 400:
                print('Error: YouTube returned "%d %s" while routing %s' % (
                    response.status, response.reason, url.split('?')[0]))

            total_received = 0
            while True:
                # a bit over 3 seconds of 360p video
                # we want each TCP packet to transmit in large multiples,
                # such as 65,536, so we shouldn't read in small chunks
                # such as 8192 lest that causes the socket library to limit
                # the TCP window size
                # Might need fine-tuning, since this gives us 4*65536
                # The tradeoff is that larger values (such as 6 seconds)
                # only allows video to buffer in those increments, meaning
                # user must wait until the entire chunk is downloaded
                # before video starts playing
                content_part = response.read(32*8192)
                total_received += len(content_part)
                if content_part:
                    yield content_part
                    continue

                # End of this response's body.
                # Sometimes YouTube closes the connection before sending
                # all of the content. Retry with a range request for the
                # missing content. See
                # https://github.com/user234683/youtube-local/issues/40
                if total_received < content_length:
                    if 'Range' in send_headers:
                        int_range = parse_range(send_headers['Range'],
                                                content_length)
                        if not int_range:  # give up b/c unrecognized range
                            break
                        start, end = int_range
                    else:
                        start, end = 0, (content_length - 1)

                    fail_byte = start + total_received
                    send_headers['Range'] = 'bytes=%d-%d' % (fail_byte, end)
                    print(
                        'Warning: YouTube closed the connection before byte',
                        str(fail_byte) + '.', 'Expected', start+content_length,
                        'bytes.'
                    )

                    retry = True
                    first_attempt = False
                    # Only count consecutive failures at the same position;
                    # any progress starts the count over.
                    if fail_byte == current_attempt_position:
                        try_num += 1
                    else:
                        try_num = 1
                        current_attempt_position = fail_byte
                break
        finally:
            cleanup_func(response)

        if not retry:
            break
        # YouTube will return 503 Service Unavailable if you do a bunch
        # of range requests too quickly.
        time.sleep(1)
    else:   # no break
        print('Error: YouTube closed the connection before',
              'providing all content. Retried three times:', url.split('?')[0])
def proxy_video(env, start_response):
    '''WSGI handler that runs proxy_site with its video mode switched on.'''
    video_stream = proxy_site(env, start_response, video=True)
    yield from video_stream
# Maps a domain suffix to the WSGI callable that serves it. site_dispatch
# looks up progressively longer suffixes of the requested host name in this
# table, so an entry also covers the subdomains of its key.
site_handlers = {
    # Handled by the application itself
    'youtube.com': yt_app,
    'youtube-nocookie.com': yt_app,
    # Short links, rewritten into /watch requests
    'youtu.be': youtu_be,
    # Fetched from the remote host and relayed as-is
    'ytimg.com': proxy_site,
    'ggpht.com': proxy_site,
    'googleusercontent.com': proxy_site,
    'sponsor.ajay.app': proxy_site,
    # Relayed with the video-specific behaviour of proxy_site
    'googlevideo.com': proxy_video,
}
def split_url(url):
    '''Separate a URL into its host name and everything from the path on.

    https://sub.example.com/foo/bar.html gives
    ('sub.example.com', '/foo/bar.html'). The http:// or https:// scheme is
    optional, and a URL without a path gives an empty second element. The
    host must contain at least one dot.

    Raises ValueError when the URL does not have this shape.
    '''
    # XXX: Is this regex safe from REDOS?
    # python STILL doesn't have a proper regular expression engine like grep uses built in...
    url_pattern = r'(?:https?://)?([\w-]+(?:\.[\w-]+)+?)(/.*|$)'
    parts = re.match(url_pattern, url)
    if parts is None:
        raise ValueError('Invalid or unsupported url: ' + url)

    # The two capturing groups are the host and the remainder, in that order.
    host, remainder = parts.groups()
    return host, remainder
def error_code(code, start_response):
    '''Start a header-less response with status `code` and return its body.

    code is a full status line such as '404 Not Found'. It is passed to
    start_response unchanged and also returned, encoded to bytes, so the
    caller can yield it as the response body.
    '''
    # PEP 3333 requires response_headers to be an actual list; some servers
    # (e.g. wsgiref) reject any other type, so an empty tuple is not safe.
    start_response(code, [])
    return code.encode()
def site_dispatch(env, start_response):
    '''Top-level WSGI application: route a request to the handler for its site.

    The request path is expected to hold the target URL, e.g.
    /https://www.youtube.com/watch. The steps are:

      * '?' characters inside the query string are replaced by '&'.
      * When REQUEST_URI is present, PATH_INFO is rebuilt from it.
      * POST requests from an address other than 127.0.0.1 / ::1 get
        403 Forbidden unless settings.allow_foreign_post_requests is set.
      * An empty path or '/' redirects to /https://youtube.com.
      * Otherwise the path is split into SERVER_NAME and PATH_INFO (both
        written back into env) and the handler is looked up in
        site_handlers by progressively longer domain suffixes
        ('com', 'ytimg.com', 'i.ytimg.com', ...). The first match serves
        the request.
      * 404 Not Found is returned when the path is not a supported URL or
        no handler matches.

    Any exception raised while handling is reported to the server as a
    500 response and then re-raised.
    '''
    client_address = env['REMOTE_ADDR']
    try:
        # correct malformed query string with ? separators instead of &
        env['QUERY_STRING'] = env['QUERY_STRING'].replace('?', '&')

        # Fix PATH_INFO for UWSGI
        if 'REQUEST_URI' in env:
            env['PATH_INFO'] = urllib.parse.unquote(
                env['REQUEST_URI'].split('?')[0]
            )

        method = env['REQUEST_METHOD']
        path = env['PATH_INFO']

        if (method == "POST"
                and client_address not in ('127.0.0.1', '::1')
                and not settings.allow_foreign_post_requests):
            yield error_code('403 Forbidden', start_response)
            return

        # redirect localhost:8080 to localhost:8080/https://youtube.com
        if path == '' or path == '/':
            start_response('302 Found', [('Location', '/https://youtube.com')])
            return

        try:
            env['SERVER_NAME'], env['PATH_INFO'] = split_url(path[1:])
        except ValueError:
            yield error_code('404 Not Found', start_response)
            return

        # Try the shortest suffix first and grow it one label at a time.
        base_name = ''
        for domain in reversed(env['SERVER_NAME'].split('.')):
            base_name = domain if base_name == '' else domain + '.' + base_name
            try:
                handler = site_handlers[base_name]
            except KeyError:
                continue
            yield from handler(env, start_response)
            break
        else:   # did not break
            yield error_code('404 Not Found', start_response)
    except Exception:
        # A handler may already have called start_response. PEP 3333 only
        # permits a second call when exc_info is supplied; if the headers
        # have already gone out, the server re-raises the exception from
        # start_response itself instead of letting the error text be
        # appended to a half-sent body.
        start_response('500 Internal Server Error', [], sys.exc_info())
        yield b'500 Internal Server Error'
        raise
class FilteredRequestLog:
    '''File-like log target that hides noisy, successful proxy requests.

    Lines are forwarded to sys.stderr unless they describe a GET request
    answered with 200 or 206 for one of the listed thumbnail, avatar,
    caption or video URLs.
    '''

    # Verbose-mode pattern: whitespace inside it is ignored, literal spaces
    # are written as "\ ".
    filter_re = re.compile(r'''(?x)
        "GET\ /https://(
        i[.]ytimg[.]com/|
        www[.]youtube[.]com/data/subscription_thumbnails/|
        yt3[.]ggpht[.]com/|
        www[.]youtube[.]com/api/timedtext|
        [-\w]+[.]googlevideo[.]com/).*"\ (200|206)
        ''')

    def write(self, s):
        '''Write `s` to sys.stderr unless it matches filter_re.'''
        if self.filter_re.search(s) is None:
            sys.stderr.write(s)
if __name__ == '__main__':
    # Bind to every interface only when the settings allow connections from
    # other machines; otherwise listen on loopback alone.
    ip_server = '0.0.0.0' if settings.allow_foreign_addresses else '127.0.0.1'
    server = WSGIServer((ip_server, settings.port_number), site_dispatch,
                        log=FilteredRequestLog())
    print('Starting httpserver at http://%s:%s/' %
          (ip_server, settings.port_number))
    server.serve_forever()

# for uwsgi, gunicorn, etc.
application = site_dispatch
|