server.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. #!/usr/bin/env python3
  2. from gevent import monkey
  3. monkey.patch_all()
  4. import gevent.socket
  5. from youtube import yt_app
  6. from youtube import util
  7. # these are just so the files get run - they import yt_app and add routes to it
  8. from youtube import watch, search, playlist, channel, local_playlist, comments, subscriptions
  9. import settings
  10. from gevent.pywsgi import WSGIServer
  11. import urllib
  12. import urllib3
  13. import socket
  14. import socks, sockshandler
  15. import subprocess
  16. import re
  17. import sys
  18. import time
  19. def youtu_be(env, start_response):
  20. id = env['PATH_INFO'][1:]
  21. env['PATH_INFO'] = '/watch'
  22. if not env['QUERY_STRING']:
  23. env['QUERY_STRING'] = 'v=' + id
  24. else:
  25. env['QUERY_STRING'] += '&v=' + id
  26. yield from yt_app(env, start_response)
  27. RANGE_RE = re.compile(r'bytes=(\d+-(?:\d+)?)')
  28. def parse_range(range_header, content_length):
  29. # Range header can be like bytes=200-1000 or bytes=200-
  30. # amount_received is the length of bytes from the range that have already
  31. # been received
  32. match = RANGE_RE.fullmatch(range_header.strip())
  33. if not match:
  34. print('Unsupported range header format:', range_header)
  35. return None
  36. start, end = match.group(1).split('-')
  37. start_byte = int(start)
  38. if not end:
  39. end_byte = start_byte + content_length - 1
  40. else:
  41. end_byte = int(end)
  42. return start_byte, end_byte
  43. def proxy_site(env, start_response, video=False):
  44. send_headers = {
  45. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
  46. 'Accept': '*/*',
  47. }
  48. current_range_start = 0
  49. range_end = None
  50. if 'HTTP_RANGE' in env:
  51. send_headers['Range'] = env['HTTP_RANGE']
  52. url = "https://" + env['SERVER_NAME'] + env['PATH_INFO']
  53. # remove /name portion
  54. if video and '/videoplayback/name/' in url:
  55. url = url[0:url.rfind('/name/')]
  56. if env['QUERY_STRING']:
  57. url += '?' + env['QUERY_STRING']
  58. try_num = 1
  59. first_attempt = True
  60. current_attempt_position = 0
  61. while try_num <= 3: # Try a given byte position three times
  62. if not first_attempt:
  63. print('(Try %d)' % try_num, 'Trying with', send_headers['Range'])
  64. if video:
  65. params = urllib.parse.parse_qs(env['QUERY_STRING'])
  66. params_use_tor = int(params.get('use_tor', '0')[0])
  67. use_tor = (settings.route_tor == 2) or params_use_tor
  68. response, cleanup_func = util.fetch_url_response(url, send_headers,
  69. use_tor=use_tor,
  70. max_redirects=10)
  71. else:
  72. response, cleanup_func = util.fetch_url_response(url, send_headers)
  73. response_headers = response.headers
  74. if isinstance(response_headers, urllib3._collections.HTTPHeaderDict):
  75. response_headers = response_headers.items()
  76. if video:
  77. response_headers = (list(response_headers)
  78. +[('Access-Control-Allow-Origin', '*')])
  79. if first_attempt:
  80. start_response(str(response.status) + ' ' + response.reason,
  81. response_headers)
  82. content_length = int(dict(response_headers).get('Content-Length', 0))
  83. if response.status >= 400:
  84. print('Error: YouTube returned "%d %s" while routing %s' % (
  85. response.status, response.reason, url.split('?')[0]))
  86. total_received = 0
  87. retry = False
  88. while True:
  89. # a bit over 3 seconds of 360p video
  90. # we want each TCP packet to transmit in large multiples,
  91. # such as 65,536, so we shouldn't read in small chunks
  92. # such as 8192 lest that causes the socket library to limit the
  93. # TCP window size
  94. # Might need fine-tuning, since this gives us 4*65536
  95. # The tradeoff is that larger values (such as 6 seconds) only
  96. # allows video to buffer in those increments, meaning user must
  97. # wait until the entire chunk is downloaded before video starts
  98. # playing
  99. content_part = response.read(32*8192)
  100. total_received += len(content_part)
  101. if not content_part:
  102. # Sometimes YouTube closes the connection before sending all of
  103. # the content. Retry with a range request for the missing
  104. # content. See
  105. # https://github.com/user234683/youtube-local/issues/40
  106. if total_received < content_length:
  107. if 'Range' in send_headers:
  108. int_range = parse_range(send_headers['Range'],
  109. content_length)
  110. if not int_range: # give up b/c unrecognized range
  111. break
  112. start, end = int_range
  113. else:
  114. start, end = 0, (content_length - 1)
  115. fail_byte = start + total_received
  116. send_headers['Range'] = 'bytes=%d-%d' % (fail_byte, end)
  117. print(
  118. 'Warning: YouTube closed the connection before byte',
  119. str(fail_byte) + '.', 'Expected', start+content_length,
  120. 'bytes.'
  121. )
  122. retry = True
  123. first_attempt = False
  124. if fail_byte == current_attempt_position:
  125. try_num += 1
  126. else:
  127. try_num = 1
  128. current_attempt_position = fail_byte
  129. break
  130. yield content_part
  131. cleanup_func(response)
  132. if retry:
  133. # YouTube will return 503 Service Unavailable if you do a bunch
  134. # of range requests too quickly.
  135. time.sleep(1)
  136. continue
  137. else:
  138. break
  139. else: # no break
  140. print('Error: YouTube closed the connection before',
  141. 'providing all content. Retried three times:', url.split('?')[0])
  142. def proxy_video(env, start_response):
  143. yield from proxy_site(env, start_response, video=True)
  144. site_handlers = {
  145. 'youtube.com': yt_app,
  146. 'youtube-nocookie.com': yt_app,
  147. 'youtu.be': youtu_be,
  148. 'ytimg.com': proxy_site,
  149. 'ggpht.com': proxy_site,
  150. 'googleusercontent.com': proxy_site,
  151. 'sponsor.ajay.app': proxy_site,
  152. 'googlevideo.com': proxy_video,
  153. }
  154. def split_url(url):
  155. ''' Split https://sub.example.com/foo/bar.html into ('sub.example.com', '/foo/bar.html')'''
  156. # XXX: Is this regex safe from REDOS?
  157. # python STILL doesn't have a proper regular expression engine like grep uses built in...
  158. match = re.match(r'(?:https?://)?([\w-]+(?:\.[\w-]+)+?)(/.*|$)', url)
  159. if match is None:
  160. raise ValueError('Invalid or unsupported url: ' + url)
  161. return match.group(1), match.group(2)
  162. def error_code(code, start_response):
  163. start_response(code, ())
  164. return code.encode()
  165. def site_dispatch(env, start_response):
  166. client_address = env['REMOTE_ADDR']
  167. try:
  168. # correct malformed query string with ? separators instead of &
  169. env['QUERY_STRING'] = env['QUERY_STRING'].replace('?', '&')
  170. # Fix PATH_INFO for UWSGI
  171. if 'REQUEST_URI' in env:
  172. env['PATH_INFO'] = urllib.parse.unquote(
  173. env['REQUEST_URI'].split('?')[0]
  174. )
  175. method = env['REQUEST_METHOD']
  176. path = env['PATH_INFO']
  177. if (method == "POST"
  178. and client_address not in ('127.0.0.1', '::1')
  179. and not settings.allow_foreign_post_requests):
  180. yield error_code('403 Forbidden', start_response)
  181. return
  182. # redirect localhost:8080 to localhost:8080/https://youtube.com
  183. if path == '' or path == '/':
  184. start_response('302 Found', [('Location', '/https://youtube.com')])
  185. return
  186. try:
  187. env['SERVER_NAME'], env['PATH_INFO'] = split_url(path[1:])
  188. except ValueError:
  189. yield error_code('404 Not Found', start_response)
  190. return
  191. base_name = ''
  192. for domain in reversed(env['SERVER_NAME'].split('.')):
  193. if base_name == '':
  194. base_name = domain
  195. else:
  196. base_name = domain + '.' + base_name
  197. try:
  198. handler = site_handlers[base_name]
  199. except KeyError:
  200. continue
  201. else:
  202. yield from handler(env, start_response)
  203. break
  204. else: # did not break
  205. yield error_code('404 Not Found', start_response)
  206. return
  207. except Exception:
  208. start_response('500 Internal Server Error', ())
  209. yield b'500 Internal Server Error'
  210. raise
  211. return
  212. class FilteredRequestLog:
  213. '''Don't log noisy thumbnail and avatar requests'''
  214. filter_re = re.compile(r'''(?x)
  215. "GET\ /https://(
  216. i[.]ytimg[.]com/|
  217. www[.]youtube[.]com/data/subscription_thumbnails/|
  218. yt3[.]ggpht[.]com/|
  219. www[.]youtube[.]com/api/timedtext|
  220. [-\w]+[.]googlevideo[.]com/).*"\ (200|206)
  221. ''')
  222. def __init__(self):
  223. pass
  224. def write(self, s):
  225. if not self.filter_re.search(s):
  226. sys.stderr.write(s)
  227. if __name__ == '__main__':
  228. if settings.allow_foreign_addresses:
  229. server = WSGIServer(('0.0.0.0', settings.port_number), site_dispatch,
  230. log=FilteredRequestLog())
  231. ip_server = '0.0.0.0'
  232. else:
  233. server = WSGIServer(('127.0.0.1', settings.port_number), site_dispatch,
  234. log=FilteredRequestLog())
  235. ip_server = '127.0.0.1'
  236. print('Starting httpserver at http://%s:%s/' %
  237. (ip_server, settings.port_number))
  238. server.serve_forever()
  239. # for uwsgi, gunicorn, etc.
  240. application = site_dispatch