server.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. #!/usr/bin/env python3
  2. from gevent import monkey
  3. monkey.patch_all()
  4. import gevent.socket
  5. from youtube import yt_app
  6. from youtube import util
  7. # these are just so the files get run - they import yt_app and add routes to it
  8. from youtube import watch, search, playlist, channel, local_playlist, comments, subscriptions
  9. import settings
  10. from gevent.pywsgi import WSGIServer
  11. import urllib
  12. import urllib3
  13. import socket
  14. import socks, sockshandler
  15. import subprocess
  16. import re
  17. import sys
  18. import time
  def youtu_be(env, start_response):
      '''Serve a youtu.be short link through the regular watch page.

      The video id is PATH_INFO with its first character dropped. The
      request is rewritten in place: PATH_INFO becomes '/watch' and a
      'v=<id>' parameter is added to QUERY_STRING. The rewritten request is
      then delegated to yt_app.
      '''
      video_id = env['PATH_INFO'][1:]
      env['PATH_INFO'] = '/watch'

      video_param = 'v=' + video_id
      query = env['QUERY_STRING']
      # Only join with '&' when the request already carried a query string.
      env['QUERY_STRING'] = query + '&' + video_param if query else video_param

      yield from yt_app(env, start_response)
  RANGE_RE = re.compile(r'bytes=(\d+-(?:\d+)?)')


  def parse_range(range_header, content_length):
      '''Turn a Range header value into absolute byte positions.

      Two forms are recognised: 'bytes=200-1000' and the open-ended
      'bytes=200-'. Surrounding whitespace is stripped before matching.
      For the open-ended form the end position is computed as
      start + content_length - 1.

      Returns a (start_byte, end_byte) tuple of ints, or None (after
      printing a message) when the header does not match RANGE_RE.
      '''
      parsed = RANGE_RE.fullmatch(range_header.strip())
      if parsed is None:
          print('Unsupported range header format:', range_header)
          return None

      first, _, last = parsed.group(1).partition('-')
      start_byte = int(first)
      end_byte = int(last) if last else start_byte + content_length - 1
      return start_byte, end_byte
  def proxy_site(env, start_response, video=False):
      '''Relay a request to https://<SERVER_NAME><PATH_INFO> and stream the body.

      This is a generator WSGI handler. The client's Range header, if any,
      is forwarded upstream. start_response is called once, on the first
      attempt, with the upstream status and headers. The body is yielded in
      chunks read 32*8192 bytes at a time.

      If the upstream connection ends before Content-Length bytes have been
      received, the missing tail is requested again with a Range header.
      A given byte position is tried at most three times; making progress
      resets the counter.

      Parameters:
          env: WSGI environ. SERVER_NAME, PATH_INFO and QUERY_STRING are
              read; HTTP_RANGE is read when present.
          start_response: WSGI start_response callable.
          video: when true, the '/name/...' tail of '/videoplayback/name/'
              URLs is removed, and the fetch is made with max_redirects=10
              and use_tor taken from settings.route_tor == 2 or the
              'use_tor' query parameter.
      '''
      send_headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
          'Accept': '*/*',
      }
      if 'HTTP_RANGE' in env:
          send_headers['Range'] = env['HTTP_RANGE']

      url = "https://" + env['SERVER_NAME'] + env['PATH_INFO']
      # remove /name portion
      if video and '/videoplayback/name/' in url:
          url = url[0:url.rfind('/name/')]
      if env['QUERY_STRING']:
          url += '?' + env['QUERY_STRING']

      try_num = 1
      first_attempt = True
      current_attempt_position = 0
      while try_num <= 3:   # Try a given byte position three times
          if not first_attempt:
              print('(Try %d)' % try_num, 'Trying with', send_headers['Range'])

          if video:
              params = urllib.parse.parse_qs(env['QUERY_STRING'])
              params_use_tor = int(params.get('use_tor', '0')[0])
              use_tor = (settings.route_tor == 2) or params_use_tor
              response, cleanup_func = util.fetch_url_response(
                  url, send_headers, use_tor=use_tor, max_redirects=10)
          else:
              response, cleanup_func = util.fetch_url_response(url, send_headers)

          retry = False
          # Everything that touches the response runs under try/finally so
          # that it is released even when the consumer closes this generator
          # early (client disconnect) or an exception is raised mid-stream.
          try:
              response_headers = response.getheaders()
              if isinstance(response_headers,
                            urllib3._collections.HTTPHeaderDict):
                  response_headers = response_headers.items()

              # Status and headers can only be sent to the client once.
              if first_attempt:
                  start_response(str(response.status) + ' ' + response.reason,
                                 response_headers)

              # Length of *this* response; for a retry that is the length of
              # the re-requested tail, which is what parse_range expects.
              content_length = int(
                  dict(response_headers).get('Content-Length', 0))
              if response.status >= 400:
                  print('Error: YouTube returned "%d %s" while routing %s' % (
                      response.status, response.reason, url.split('?')[0]))

              total_received = 0
              while True:
                  # a bit over 3 seconds of 360p video
                  # we want each TCP packet to transmit in large multiples,
                  # such as 65,536, so we shouldn't read in small chunks
                  # such as 8192 lest that causes the socket library to limit
                  # the TCP window size
                  # Might need fine-tuning, since this gives us 4*65536
                  # The tradeoff is that larger values (such as 6 seconds)
                  # only allows video to buffer in those increments, meaning
                  # user must wait until the entire chunk is downloaded
                  # before video starts playing
                  content_part = response.read(32*8192)
                  total_received += len(content_part)
                  if not content_part:
                      # Sometimes YouTube closes the connection before
                      # sending all of the content. Retry with a range
                      # request for the missing content. See
                      # https://github.com/user234683/youtube-local/issues/40
                      if total_received < content_length:
                          if 'Range' in send_headers:
                              int_range = parse_range(send_headers['Range'],
                                                      content_length)
                              if not int_range:   # give up b/c unrecognized range
                                  break
                              start, end = int_range
                          else:
                              start, end = 0, (content_length - 1)

                          fail_byte = start + total_received
                          send_headers['Range'] = 'bytes=%d-%d' % (fail_byte, end)
                          print(
                              'Warning: YouTube closed the connection before byte',
                              str(fail_byte) + '.', 'Expected', start+content_length,
                              'bytes.'
                          )

                          retry = True
                          first_attempt = False
                          # Count an attempt only when no progress was made
                          # since the previous failure.
                          if fail_byte == current_attempt_position:
                              try_num += 1
                          else:
                              try_num = 1
                              current_attempt_position = fail_byte
                      break
                  yield content_part
          finally:
              cleanup_func(response)

          if retry:
              # YouTube will return 503 Service Unavailable if you do a bunch
              # of range requests too quickly.
              time.sleep(1)
              continue
          break
      else:   # no break
          print('Error: YouTube closed the connection before',
                'providing all content. Retried three times:', url.split('?')[0])
  def proxy_video(env, start_response):
      '''Stream a request through proxy_site with its video mode switched on.'''
      video_stream = proxy_site(env, start_response, video=True)
      yield from video_stream
  # Maps a domain suffix to the WSGI handler that serves requests for it.
  # Entries are grouped by handler; insertion order is unchanged.
  site_handlers = {
      # Domains answered by yt_app.
      **dict.fromkeys(('youtube.com', 'youtube-nocookie.com'), yt_app),
      # Short links are rewritten to a watch request first.
      'youtu.be': youtu_be,
      # Hosts relayed through proxy_site.
      **dict.fromkeys(
          ('ytimg.com', 'yt3.ggpht.com', 'lh3.googleusercontent.com',
           'sponsor.ajay.app'),
          proxy_site,
      ),
      # Relayed through proxy_site in video mode.
      'googlevideo.com': proxy_video,
  }
  def split_url(url):
      '''Separate a URL into its host name and everything after it.

      'https://sub.example.com/foo/bar.html' gives
      ('sub.example.com', '/foo/bar.html'). The http:// or https:// scheme
      is optional, the host needs at least two dot-separated labels, and the
      second element is '' when nothing follows the host.

      Raises ValueError when the url does not have that shape.
      '''
      # XXX: Is this regex safe from REDOS?
      # python STILL doesn't have a proper regular expression engine like
      # grep uses built in...
      pattern = r'(?:https?://)?([\w-]+(?:\.[\w-]+)+?)(/.*|$)'
      found = re.match(pattern, url)
      if found is not None:
          return found.group(1, 2)
      raise ValueError('Invalid or unsupported url: ' + url)
  def error_code(code, start_response):
      '''Start a header-less response with status `code` and return its body.

      Parameters:
          code: full status line as a str, e.g. '404 Not Found'.
          start_response: WSGI start_response callable.

      Returns the status line encoded to bytes, for use as the response
      body.
      '''
      # PEP 3333 requires response_headers to be a real list; servers that
      # enforce this reject a tuple, so pass an empty list instead of ().
      start_response(code, [])
      return code.encode()
  def site_dispatch(env, start_response):
      '''Top-level WSGI application: route a request to a per-site handler.

      The request path has the form '/<url>', for example
      '/https://youtube.com/watch'. The host part of <url> is written to
      env['SERVER_NAME'] and the remainder to env['PATH_INFO']. The handler
      is then found by looking up progressively longer domain suffixes of
      the host in site_handlers ('com', 'youtube.com', 'www.youtube.com').

      Responses produced here:
          403 - POST from a non-loopback address while
                settings.allow_foreign_post_requests is false.
          302 - empty path or '/', redirected to '/https://youtube.com'.
          404 - the path is not a usable url, or no handler is registered
                for any suffix of the host.
          500 - any Exception; it is re-raised after the error response.
      '''
      client_address = env['REMOTE_ADDR']
      try:
          # correct malformed query string with ? separators instead of &
          env['QUERY_STRING'] = env['QUERY_STRING'].replace('?', '&')

          # Fix PATH_INFO for UWSGI
          if 'REQUEST_URI' in env:
              env['PATH_INFO'] = urllib.parse.unquote(
                  env['REQUEST_URI'].split('?')[0]
              )

          method = env['REQUEST_METHOD']
          path = env['PATH_INFO']

          if (method == "POST"
                  and client_address not in ('127.0.0.1', '::1')
                  and not settings.allow_foreign_post_requests):
              yield error_code('403 Forbidden', start_response)
              return

          # redirect localhost:8080 to localhost:8080/https://youtube.com
          if path == '' or path == '/':
              start_response('302 Found', [('Location', '/https://youtube.com')])
              return

          try:
              env['SERVER_NAME'], env['PATH_INFO'] = split_url(path[1:])
          except ValueError:
              yield error_code('404 Not Found', start_response)
              return

          # Grow the candidate from the last label leftwards and dispatch to
          # the first suffix that has a registered handler.
          base_name = ''
          for domain in reversed(env['SERVER_NAME'].split('.')):
              if base_name == '':
                  base_name = domain
              else:
                  base_name = domain + '.' + base_name

              try:
                  handler = site_handlers[base_name]
              except KeyError:
                  continue
              else:
                  yield from handler(env, start_response)
                  break
          else:   # did not break
              yield error_code('404 Not Found', start_response)
              return
      except Exception:
          # Follow the PEP 3333 error pattern: headers are a list, and
          # exc_info is supplied so that, if a handler already sent headers,
          # start_response re-raises instead of letting the error text be
          # appended to a partially streamed body.
          start_response('500 Internal Server Error', [], sys.exc_info())
          yield b'500 Internal Server Error'
          raise
      return
  class FilteredRequestLog:
      '''Don't log noisy thumbnail and avatar requests

      File-like log target: write() forwards a line to sys.stderr unless it
      is a successful (200) GET for one of the filtered URL prefixes.
      '''
      # The pattern is compiled in verbose mode, where bare whitespace is
      # ignored, so each literal space is written as [ ]; with plain spaces
      # the pattern can never match a request line. It is also left
      # unanchored because the request line sits in the middle of the log
      # line (gevent's pywsgi puts the client address and timestamp first).
      filter_re = re.compile(r"""(?x)
          "GET[ ]/https://(i[.]ytimg[.]com/|
          www[.]youtube[.]com/data/subscription_thumbnails/|
          yt3[.]ggpht[.]com/|
          www[.]youtube[.]com/api/timedtext).*"[ ]200
          """)

      def write(self, s):
          '''Write log line `s` to stderr unless filter_re matches it.'''
          if not self.filter_re.search(s):
              sys.stderr.write(s)
  if __name__ == '__main__':
      # Listen on every interface only when foreign addresses are allowed;
      # otherwise stay on the loopback address.
      ip_server = '0.0.0.0' if settings.allow_foreign_addresses else '127.0.0.1'
      server = WSGIServer((ip_server, settings.port_number), site_dispatch,
                          log=FilteredRequestLog())
      print('Starting httpserver at http://%s:%s/' %
            (ip_server, settings.port_number))
      server.serve_forever()

  # for uwsgi, gunicorn, etc.
  application = site_dispatch