netcache.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912
  1. #!/usr/bin/env python3
  2. import os
  3. import sys
  4. import urllib.parse
  5. import argparse
  6. import codecs
  7. import getpass
  8. import socket
  9. import ssl
  10. import glob
  11. import datetime
  12. import hashlib
  13. import sqlite3
  14. from ssl import CertificateError
  15. import ansicat
  16. import offutils
  17. from offutils import xdg
  18. import time
  19. try:
  20. import chardet
  21. _HAS_CHARDET = True
  22. except ModuleNotFoundError:
  23. _HAS_CHARDET = False
  24. try:
  25. from cryptography import x509
  26. from cryptography.hazmat.backends import default_backend
  27. _HAS_CRYPTOGRAPHY = True
  28. _BACKEND = default_backend()
  29. except(ModuleNotFoundError,ImportError):
  30. _HAS_CRYPTOGRAPHY = False
  31. try:
  32. import requests
  33. _DO_HTTP = True
  34. except (ModuleNotFoundError,ImportError):
  35. _DO_HTTP = False
# This list is also used as a list of supported protocols
standard_ports = {
        "gemini" : 1965,
        "gopher" : 70,
        "finger" : 79,
        "http"   : 80,
        "https"  : 443,
        "spartan": 300,
}
# Scheme assumed for bare hostnames (see normalize_url).
default_protocol = "gemini"
# Request line terminator used by the gemini protocol.
CRLF = '\r\n'
# Default network timeout, in seconds.
DEFAULT_TIMEOUT = 10
# Maximum number of consecutive gemini redirects followed by _fetch_gemini.
_MAX_REDIRECTS = 5
# monkey-patch Gemini support in urllib.parse
# see https://github.com/python/cpython/blob/master/Lib/urllib/parse.py
urllib.parse.uses_relative.append("gemini")
urllib.parse.uses_netloc.append("gemini")
urllib.parse.uses_relative.append("spartan")
urllib.parse.uses_netloc.append("spartan")
class UserAbortException(Exception):
    """Raised when the user declines to follow a gemini redirect."""
    pass
  57. def parse_mime(mime):
  58. options = {}
  59. if mime:
  60. if ";" in mime:
  61. splited = mime.split(";",maxsplit=1)
  62. mime = splited[0]
  63. if len(splited) >= 1:
  64. options_list = splited[1].split()
  65. for o in options_list:
  66. spl = o.split("=",maxsplit=1)
  67. if len(spl) > 0:
  68. options[spl[0]] = spl[1]
  69. return mime, options
  70. def normalize_url(url):
  71. if "://" not in url and ("./" not in url and url[0] != "/"):
  72. if not url.startswith("mailto:"):
  73. url = "gemini://" + url
  74. return url
  75. def cache_last_modified(url):
  76. if not url:
  77. return None
  78. path = get_cache_path(url)
  79. if path and os.path.isfile(path):
  80. return os.path.getmtime(path)
  81. else:
  82. return None
  83. def is_cache_valid(url,validity=0):
  84. # Validity is the acceptable time for
  85. # a cache to be valid (in seconds)
  86. # If 0, then any cache is considered as valid
  87. # (use validity = 1 if you want to refresh everything)
  88. if offutils.is_local(url):
  89. return True
  90. cache = get_cache_path(url)
  91. if cache :
  92. # If path is too long, we always return True to avoid
  93. # fetching it.
  94. if len(cache) > 259:
  95. print("We return False because path is too long")
  96. return False
  97. if os.path.exists(cache) and not os.path.isdir(cache):
  98. if validity > 0 :
  99. last_modification = cache_last_modified(url)
  100. now = time.time()
  101. age = now - last_modification
  102. return age < validity
  103. else:
  104. return True
  105. else:
  106. #Cache has not been build
  107. return False
  108. else:
  109. #There’s not even a cache!
  110. return False
def get_cache_path(url,add_index=True):
    """Map a URL to its on-disk cache path, or None if unsupported.

    The layout is xdg("cache")/scheme/host/path. Directory-like URLs
    get an index file name appended (index.gmi, index.html, gophermap
    or index.txt depending on the scheme) unless add_index=False.
    Local URLs (file://, mailto:, list://) map to local paths directly.
    Returns None for URLs without a supported scheme/host or when the
    resulting path would exceed the 259-character OS limit.
    """
    # Sometimes, cache_path became a folder! (which happens for index.html/index.gmi)
    # In that case, we need to reconstruct it
    # if add_index=False, we don't add that "index.gmi" at the ends of the cache_path
    # First, we parse the URL
    if not url:
        return None
    parsed = urllib.parse.urlparse(url)
    if url[0] == "/" or url.startswith("./") or os.path.exists(url):
        scheme = "file"
    elif parsed.scheme:
        scheme = parsed.scheme
    else:
        scheme = default_protocol
    if scheme in ["file","mailto","list"]:
        local = True
        host = ""
        port = None
        # file:// is 7 char
        if url.startswith("file://"):
            path = url[7:]
        elif scheme == "mailto":
            path = parsed.path
        elif url.startswith("list://"):
            # Lists live under xdg("data")/lists; "list://" alone maps to
            # the whole folder, "list://name" to "name.gmi" inside it.
            listdir = os.path.join(xdg("data"),"lists")
            listname = url[7:].lstrip("/")
            if listname in [""]:
                name = "My Lists"
                path = listdir
            else:
                name = listname
                path = os.path.join(listdir, "%s.gmi"%listname)
        else:
            path = url
    else:
        local = False
        # Convert unicode hostname to punycode using idna RFC3490
        host = parsed.hostname #.encode("idna").decode()
        port = parsed.port or standard_ports.get(scheme, 0)
        # special gopher selector case: the first path character is the
        # gopher item type, the rest is the selector.
        if scheme == "gopher":
            if len(parsed.path) >= 2:
                itemtype = parsed.path[1]
                path = parsed.path[2:]
            else:
                itemtype = "1"
                path = ""
            # NOTE(review): this mime value is computed but never used in
            # this function — the same mapping is redone in _fetch_gopher.
            if itemtype == "0":
                mime = "text/gemini"
            elif itemtype == "1":
                mime = "text/gopher"
            elif itemtype == "h":
                mime = "text/html"
            elif itemtype in ("9","g","I","s",";"):
                mime = "binary"
            else:
                mime = "text/gopher"
        else:
            path = parsed.path
        if parsed.query:
            # we don't add the query if path is too long because path above 260 char
            # are not supported and crash python.
            # Also, very long query are usually useless stuff
            if len(path+parsed.query) < 258:
                path += "/" + parsed.query
    # Now, we have a partial path. Let's make it full path.
    if local:
        cache_path = path
    elif scheme and host:
        cache_path = os.path.expanduser(xdg("cache") + scheme + "/" + host + path)
        # There's an OS limitation of 260 characters per path.
        # We will thus cut the path enough to add the index afterward
        cache_path = cache_path[:249]
        # this is a gross hack to give a name to
        # index files. This will break if the index is not
        # index.gmi. I don't know how to know the real name
        # of the file. But first, we need to ensure that the domain name
        # finish by "/". Else, the cache will create a file, not a folder.
        if scheme.startswith("http"):
            index = "index.html"
        elif scheme == "finger":
            index = "index.txt"
        elif scheme == "gopher":
            index = "gophermap"
        else:
            index = "index.gmi"
        if path == "" or os.path.isdir(cache_path):
            if not cache_path.endswith("/"):
                cache_path += "/"
            if not url.endswith("/"):
                url += "/"
        if add_index and cache_path.endswith("/"):
            cache_path += index
        # sometimes, the index itself is a dir
        # like when folder/index.gmi?param has been created
        # and we try to access folder
        if add_index and os.path.isdir(cache_path):
            cache_path += "/" + index
    else:
        # URL is missing either a supported scheme or a valid host
        #print("Error: %s is not a supported url"%url)
        return None
    if len(cache_path) > 259:
        print("Path is too long. This is an OS limitation.\n\n")
        print(url)
        return None
    return cache_path
  218. def write_body(url,body,mime=None):
  219. ## body is a copy of the raw gemtext
  220. ## Write_body() also create the cache !
  221. # DEFAULT GEMINI MIME
  222. mime, options = parse_mime(mime)
  223. cache_path = get_cache_path(url)
  224. if cache_path:
  225. if mime and mime.startswith("text/"):
  226. mode = "w"
  227. else:
  228. mode = "wb"
  229. cache_dir = os.path.dirname(cache_path)
  230. # If the subdirectory already exists as a file (not a folder)
  231. # We remove it (happens when accessing URL/subfolder before
  232. # URL/subfolder/file.gmi.
  233. # This causes loss of data in the cache
  234. # proper solution would be to save "sufolder" as "sufolder/index.gmi"
  235. # If the subdirectory doesn’t exist, we recursively try to find one
  236. # until it exists to avoid a file blocking the creation of folders
  237. root_dir = cache_dir
  238. while not os.path.exists(root_dir):
  239. root_dir = os.path.dirname(root_dir)
  240. if os.path.isfile(root_dir):
  241. os.remove(root_dir)
  242. os.makedirs(cache_dir,exist_ok=True)
  243. with open(cache_path, mode=mode) as f:
  244. f.write(body)
  245. f.close()
  246. return cache_path
  247. def set_error(url,err):
  248. # If we get an error, we want to keep an existing cache
  249. # but we need to touch it or to create an empty one
  250. # to avoid hitting the error at each refresh
  251. cache = get_cache_path(url)
  252. if is_cache_valid(url):
  253. os.utime(cache)
  254. elif cache:
  255. cache_dir = os.path.dirname(cache)
  256. root_dir = cache_dir
  257. while not os.path.exists(root_dir):
  258. root_dir = os.path.dirname(root_dir)
  259. if os.path.isfile(root_dir):
  260. os.remove(root_dir)
  261. os.makedirs(cache_dir,exist_ok=True)
  262. if os.path.isdir(cache_dir):
  263. with open(cache, "w") as c:
  264. c.write(str(datetime.datetime.now())+"\n")
  265. c.write("ERROR while caching %s\n\n" %url)
  266. c.write("*****\n\n")
  267. c.write(str(type(err)) + " = " + str(err))
  268. #cache.write("\n" + str(err.with_traceback(None)))
  269. c.write("\n*****\n\n")
  270. c.write("If you believe this error was temporary, type ""reload"".\n")
  271. c.write("The ressource will be tentatively fetched during next sync.\n")
  272. c.close()
  273. return cache
  274. def _fetch_http(url,max_size=None,timeout=DEFAULT_TIMEOUT,accept_bad_ssl_certificates=False,**kwargs):
  275. if not _DO_HTTP: return None
  276. def too_large_error(url,length,max_size):
  277. err = "Size of %s is %s Mo\n"%(url,length)
  278. err += "Offpunk only download automatically content under %s Mo\n" %(max_size/1000000)
  279. err += "To retrieve this content anyway, type 'reload'."
  280. return set_error(url,err)
  281. if accept_bad_ssl_certificates:
  282. requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL:@SECLEVEL=1'
  283. requests.packages.urllib3.disable_warnings()
  284. verify=False
  285. else:
  286. requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL:@SECLEVEL=2'
  287. verify=True
  288. header = {}
  289. header["User-Agent"] = "Netcache"
  290. with requests.get(url,verify=verify,headers=header, stream=True,timeout=DEFAULT_TIMEOUT) as response:
  291. if "content-type" in response.headers:
  292. mime = response.headers['content-type']
  293. else:
  294. mime = None
  295. if "content-length" in response.headers:
  296. length = int(response.headers['content-length'])
  297. else:
  298. length = 0
  299. if max_size and length > max_size:
  300. response.close()
  301. return too_large_error(url,str(length/100),max_size)
  302. elif max_size and length == 0:
  303. body = b''
  304. downloaded = 0
  305. for r in response.iter_content():
  306. body += r
  307. #We divide max_size for streamed content
  308. #in order to catch them faster
  309. size = sys.getsizeof(body)
  310. max = max_size/2
  311. current = round(size*100/max,1)
  312. if current > downloaded:
  313. downloaded = current
  314. print(" -> Receiving stream: %s%% of allowed data"%downloaded,end='\r')
  315. #print("size: %s (%s\% of maxlenght)"%(size,size/max_size))
  316. if size > max_size/2:
  317. response.close()
  318. return too_large_error(url,"streaming",max_size)
  319. response.close()
  320. else:
  321. body = response.content
  322. response.close()
  323. if mime and "text/" in mime:
  324. body = body.decode("UTF-8","replace")
  325. cache = write_body(url,body,mime)
  326. return cache
  327. def _fetch_gopher(url,timeout=DEFAULT_TIMEOUT,**kwargs):
  328. parsed =urllib.parse.urlparse(url)
  329. host = parsed.hostname
  330. port = parsed.port or 70
  331. if len(parsed.path) >= 2:
  332. itemtype = parsed.path[1]
  333. selector = parsed.path[2:]
  334. else:
  335. itemtype = "1"
  336. selector = ""
  337. addresses = socket.getaddrinfo(host, port, family=0,type=socket.SOCK_STREAM)
  338. s = socket.create_connection((host,port))
  339. for address in addresses:
  340. s = socket.socket(address[0], address[1])
  341. s.settimeout(timeout)
  342. try:
  343. s.connect(address[4])
  344. break
  345. except OSError as e:
  346. err = e
  347. if parsed.query:
  348. request = selector + "\t" + parsed.query
  349. else:
  350. request = selector
  351. request += "\r\n"
  352. s.sendall(request.encode("UTF-8"))
  353. response1 = s.makefile("rb")
  354. response = response1.read()
  355. # Transcode response into UTF-8
  356. #if itemtype in ("0","1","h"):
  357. if not itemtype in ("9","g","I","s",";"):
  358. # Try most common encodings
  359. for encoding in ("UTF-8", "ISO-8859-1"):
  360. try:
  361. response = response.decode("UTF-8")
  362. break
  363. except UnicodeDecodeError:
  364. pass
  365. else:
  366. # try to find encoding
  367. if _HAS_CHARDET:
  368. detected = chardet.detect(response)
  369. response = response.decode(detected["encoding"])
  370. else:
  371. raise UnicodeDecodeError
  372. if itemtype == "0":
  373. mime = "text/gemini"
  374. elif itemtype == "1":
  375. mime = "text/gopher"
  376. elif itemtype == "h":
  377. mime = "text/html"
  378. elif itemtype in ("9","g","I","s",";"):
  379. mime = None
  380. else:
  381. # by default, we should consider Gopher
  382. mime = "text/gopher"
  383. cache = write_body(url,response,mime)
  384. return cache
  385. def _fetch_finger(url,timeout=DEFAULT_TIMEOUT,**kwargs):
  386. parsed = urllib.parse.urlparse(url)
  387. host = parsed.hostname
  388. port = parsed.port or standard_ports["finger"]
  389. query = parsed.path.lstrip("/") + "\r\n"
  390. with socket.create_connection((host,port)) as sock:
  391. sock.settimeout(timeout)
  392. sock.send(query.encode())
  393. response = sock.makefile("rb").read().decode("UTF-8")
  394. cache = write_body(response,"text/plain")
  395. return cache
# Originally copied from reference spartan client by Michael Lazar
def _fetch_spartan(url,**kwargs):
    """Fetch a spartan:// URL and cache the response body.

    Handles the three spartan status classes: 2 (success, body
    follows), 3 (redirect, followed recursively) and anything else
    (an error page is written via set_error). Returns the cache path.
    """
    cache = None
    url_parts = urllib.parse.urlparse(url)
    host = url_parts.hostname
    port = url_parts.port or standard_ports["spartan"]
    path = url_parts.path or "/"
    query = url_parts.query
    redirect_url = None
    with socket.create_connection((host,port)) as sock:
        # The query string, percent-decoded, is the request body.
        if query:
            data = urllib.parse.unquote_to_bytes(query)
        else:
            data = b""
        # Hostname goes out as punycode, the path as re-quoted ASCII.
        encoded_host = host.encode("idna")
        ascii_path = urllib.parse.unquote_to_bytes(path)
        encoded_path = urllib.parse.quote_from_bytes(ascii_path).encode("ascii")
        # Request line: "<host> <path> <content-length>\r\n".
        # NOTE(review): `data` itself is never sent after this header,
        # although its length is announced — a server expecting
        # len(data) bytes may stall. Verify against the spartan spec.
        sock.send(b"%s %s %d\r\n" % (encoded_host,encoded_path,len(data)))
        fp = sock.makefile("rb")
        # Response header: "<status> <meta>\r\n".
        response = fp.readline(4096).decode("ascii").strip("\r\n")
        parts = response.split(" ",maxsplit=1)
        code,meta = int(parts[0]),parts[1]
        if code == 2:
            # Success: meta is the mime type; decode text bodies.
            body = fp.read()
            if meta.startswith("text"):
                body = body.decode("UTF-8")
            cache = write_body(url,body,meta)
        elif code == 3:
            # Redirect: meta is the new path on the same host.
            redirect_url = url_parts._replace(path=meta).geturl()
        else:
            return set_error(url,"Spartan code %s: Error %s"%(code,meta))
    # Follow the redirect only after the first socket is closed.
    # NOTE(review): there is no redirect-loop or depth protection here.
    if redirect_url:
        cache = _fetch_spartan(redirect_url)
    return cache
def _validate_cert(address, host, cert,accept_bad_ssl=False,automatic_choice=None):
    """
    Validate a TLS certificate in TOFU mode.
    If the cryptography module is installed:
    - Check the certificate Common Name or SAN matches `host`
    - Check the certificate's not valid before date is in the past
    - Check the certificate's not valid after date is in the future
    Whether the cryptography module is installed or not, check the
    certificate's fingerprint against the TOFU database to see if we've
    previously encountered a different certificate for this IP address and
    hostname.
    """
    now = datetime.datetime.utcnow()
    if _HAS_CRYPTOGRAPHY:
        # Using the cryptography module we can get detailed access
        # to the properties of even self-signed certs, unlike in
        # the standard ssl library...
        c = x509.load_der_x509_certificate(cert, _BACKEND)
        # Check certificate validity dates
        # NOTE(review): this guard looks inverted — the date checks only
        # run when accept_bad_ssl is True; one would expect
        # `if not accept_bad_ssl:` here. Confirm the intended semantics.
        if accept_bad_ssl:
            if c.not_valid_before >= now:
                raise CertificateError("Certificate not valid until: {}!".format(c.not_valid_before))
            elif c.not_valid_after <= now:
                raise CertificateError("Certificate expired as of: {})!".format(c.not_valid_after))
        # Check certificate hostnames: collect the CN and every SAN entry.
        names = []
        common_name = c.subject.get_attributes_for_oid(x509.oid.NameOID.COMMON_NAME)
        if common_name:
            names.append(common_name[0].value)
        try:
            names.extend([alt.value for alt in c.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME).value])
        except x509.ExtensionNotFound:
            pass
        names = set(names)
        for name in names:
            try:
                # ssl._dnsname_match (private ssl API) raises
                # CertificateError on mismatch.
                ssl._dnsname_match(str(name), host)
                break
            except CertificateError:
                continue
        else:
            # If we didn't break out, none of the names were valid
            raise CertificateError("Hostname does not match certificate common name or any alternative names.")
    # TOFU step: compare this certificate's SHA-256 fingerprint with
    # what previous visits recorded for (hostname, address).
    sha = hashlib.sha256()
    sha.update(cert)
    fingerprint = sha.hexdigest()
    db_path = os.path.join(xdg("config"), "tofu.db")
    db_conn = sqlite3.connect(db_path)
    db_cur = db_conn.cursor()
    db_cur.execute("""CREATE TABLE IF NOT EXISTS cert_cache
        (hostname text, address text, fingerprint text,
        first_seen date, last_seen date, count integer)""")
    # Have we been here before?
    db_cur.execute("""SELECT fingerprint, first_seen, last_seen, count
        FROM cert_cache
        WHERE hostname=? AND address=?""", (host, address))
    cached_certs = db_cur.fetchall()
    # If so, check for a match
    if cached_certs:
        max_count = 0
        most_frequent_cert = None
        for cached_fingerprint, first, last, count in cached_certs:
            # Track the most frequently seen cert for the warning below.
            if count > max_count:
                max_count = count
                most_frequent_cert = cached_fingerprint
            if fingerprint == cached_fingerprint:
                # Matched!
                db_cur.execute("""UPDATE cert_cache
                    SET last_seen=?, count=?
                    WHERE hostname=? AND address=? AND fingerprint=?""",
                    (now, count+1, host, address, fingerprint))
                db_conn.commit()
                break
        else:
            # Unknown fingerprint for a known host: possible MITM.
            # Show the user what we know about the previous certificate.
            certdir = os.path.join(xdg("config"), "cert_cache")
            with open(os.path.join(certdir, most_frequent_cert+".crt"), "rb") as fp:
                previous_cert = fp.read()
            if _HAS_CRYPTOGRAPHY:
                # Load the most frequently seen certificate to see if it has
                # expired
                previous_cert = x509.load_der_x509_certificate(previous_cert, _BACKEND)
                previous_ttl = previous_cert.not_valid_after - now
                print(previous_ttl)
            print("****************************************")
            print("[SECURITY WARNING] Unrecognised certificate!")
            print("The certificate presented for {} ({}) has never been seen before.".format(host, address))
            print("This MIGHT be a Man-in-the-Middle attack.")
            print("A different certificate has previously been seen {} times.".format(max_count))
            if _HAS_CRYPTOGRAPHY:
                if previous_ttl < datetime.timedelta():
                    print("That certificate has expired, which reduces suspicion somewhat.")
                else:
                    print("That certificate is still valid for: {}".format(previous_ttl))
            print("****************************************")
            print("Attempt to verify the new certificate fingerprint out-of-band:")
            print(fingerprint)
            if automatic_choice:
                choice = automatic_choice
            else:
                choice = input("Accept this new certificate? Y/N ").strip().lower()
            if choice in ("y", "yes"):
                # Accepted: record the new fingerprint alongside the old ones.
                db_cur.execute("""INSERT INTO cert_cache
                    VALUES (?, ?, ?, ?, ?, ?)""",
                    (host, address, fingerprint, now, now, 1))
                db_conn.commit()
                with open(os.path.join(certdir, fingerprint+".crt"), "wb") as fp:
                    fp.write(cert)
            else:
                raise Exception("TOFU Failure!")
    # If not, cache this cert
    else:
        db_cur.execute("""INSERT INTO cert_cache
            VALUES (?, ?, ?, ?, ?, ?)""",
            (host, address, fingerprint, now, now, 1))
        db_conn.commit()
        certdir = os.path.join(xdg("config"), "cert_cache")
        if not os.path.exists(certdir):
            os.makedirs(certdir)
        with open(os.path.join(certdir, fingerprint+".crt"), "wb") as fp:
            fp.write(cert)
def _fetch_gemini(url,timeout=DEFAULT_TIMEOUT,interactive=True,accept_bad_ssl_certificates=False,
        **kwargs):
    """Fetch a gemini:// URL and return (cache_path, effective_url).

    Resolves the host (IPv6 preferred), opens a TLS connection with
    TOFU certificate pinning (see _validate_cert), then dispatches on
    the gemini status code: 1x input prompts (re-fetched with the
    answer as query), 3x redirects (followed recursively), 4x/5x/6x
    errors (RuntimeError) and 2x success (body written to the cache).
    Returns (None, None) when a 1x status arrives non-interactively.
    """
    cache = None
    newurl = url
    url_parts = urllib.parse.urlparse(url)
    host = url_parts.hostname
    port = url_parts.port or standard_ports["gemini"]
    path = url_parts.path or "/"
    query = url_parts.query
    # In AV-98, this was the _send_request method
    # Send a selector to a given host and port.
    # Returns the resolved address and binary file with the reply.
    host = host.encode("idna").decode()
    # Do DNS resolution
    # DNS lookup - will get IPv4 and IPv6 records if IPv6 is enabled
    if ":" in host:
        # This is likely a literal IPv6 address, so we can *only* ask for
        # IPv6 addresses or getaddrinfo will complain
        family_mask = socket.AF_INET6
    elif socket.has_ipv6:
        # Accept either IPv4 or IPv6 addresses
        family_mask = 0
    else:
        # IPv4 only
        family_mask = socket.AF_INET
    addresses = socket.getaddrinfo(host, port, family=family_mask,
            type=socket.SOCK_STREAM)
    # Sort addresses so IPv6 ones come first
    addresses.sort(key=lambda add: add[0] == socket.AF_INET6, reverse=True)
    ## Continuation of send_request
    # Prepare TLS context. Hostname/cert checks are disabled here because
    # validation is done manually, TOFU-style, by _validate_cert below.
    protocol = ssl.PROTOCOL_TLS_CLIENT if sys.version_info.minor >=6 else ssl.PROTOCOL_TLSv1_2
    context = ssl.SSLContext(protocol)
    context.check_hostname=False
    context.verify_mode = ssl.CERT_NONE
    # Impose minimum TLS version
    ## In 3.7 and above, this is easy...
    if sys.version_info.minor >= 7:
        context.minimum_version = ssl.TLSVersion.TLSv1_2
    ## Otherwise, it seems very hard...
    ## The below is less strict than it ought to be, but trying to disable
    ## TLS v1.1 here using ssl.OP_NO_TLSv1_1 produces unexpected failures
    ## with recent versions of OpenSSL. What a mess...
    else:
        context.options |= ssl.OP_NO_SSLv3
        context.options |= ssl.OP_NO_SSLv2
    # Try to enforce sensible ciphers
    try:
        context.set_ciphers("AESGCM+ECDHE:AESGCM+DHE:CHACHA20+ECDHE:CHACHA20+DHE:!DSS:!SHA1:!MD5:@STRENGTH")
    except ssl.SSLError:
        # Rely on the server to only support sensible things, I guess...
        pass
    # Connect to remote host by any address possible
    err = None
    for address in addresses:
        try:
            s = socket.socket(address[0], address[1])
            s.settimeout(timeout)
            s = context.wrap_socket(s, server_hostname = host)
            s.connect(address[4])
            break
        except OSError as e:
            err = e
    else:
        # If we couldn't connect to *any* of the addresses, just
        # bubble up the exception from the last attempt and deny
        # knowledge of earlier failures.
        raise err
    # Do TOFU
    cert = s.getpeercert(binary_form=True)
    # Remember that we showed the current cert to this domain...
    # TODO : accept badssl and automatic choice
    _validate_cert(address[4][0], host, cert,automatic_choice="y")
    # Send request and wrap response in a file descriptor.
    # The request line is the normalized URL itself, CRLF-terminated.
    url = urllib.parse.urlparse(url)
    new_netloc = host
    # Handle IPV6 hostname
    if ":" in new_netloc:
        new_netloc = "[" + new_netloc + "]"
    if port != standard_ports["gemini"]:
        new_netloc += ":" + str(port)
    # From here on, `url` is a plain string again.
    url = urllib.parse.urlunparse(url._replace(netloc=new_netloc))
    s.sendall((url + CRLF).encode("UTF-8"))
    f= s.makefile(mode = "rb")
    ## end of send_request in AV98
    # Spec dictates <META> should not exceed 1024 bytes,
    # so maximum valid header length is 1027 bytes.
    header = f.readline(1027)
    header = urllib.parse.unquote(header.decode("UTF-8"))
    if not header or header[-1] != '\n':
        raise RuntimeError("Received invalid header from server!")
    header = header.strip()
    # Validate header
    status, meta = header.split(maxsplit=1)
    if len(meta) > 1024 or len(status) != 2 or not status.isnumeric():
        f.close()
        raise RuntimeError("Received invalid header from server!")
    # Update redirect loop/maze escaping state
    # NOTE(review): previous_redirectors is re-created empty on every
    # (recursive) call, so the loop/max-redirect checks below can never
    # trigger across redirects — confirm against the refactoring TODO.
    if not status.startswith("3"):
        previous_redirectors = set()
        #TODO FIXME
    else:
        # we set a previous_redirectors anyway because refactoring in progress
        previous_redirectors = set()
    # Handle non-SUCCESS headers, which don't have a response body
    # Inputs
    if status.startswith("1"):
        if interactive:
            print(meta)
            if status == "11":
                # Status 11: sensitive input, don't echo it.
                user_input = getpass.getpass("> ")
            else:
                #TODO:FIXME we should not ask for user input while non-interactive
                user_input = input("> ")
            # Re-fetch the same URL with the answer as the query string.
            newurl = url.split("?")[0]
            return _fetch_gemini(newurl+"?"+user_input)
        else:
            return None,None
    # Redirects
    elif status.startswith("3"):
        newurl = urllib.parse.urljoin(url,meta)
        if newurl == url:
            raise RuntimeError("URL redirects to itself!")
        elif newurl in previous_redirectors:
            raise RuntimeError("Caught in redirect loop!")
        elif len(previous_redirectors) == _MAX_REDIRECTS:
            raise RuntimeError("Refusing to follow more than %d consecutive redirects!" % _MAX_REDIRECTS)
        # TODO: redirections handling should be refactored
        # elif "interactive" in options and not options["interactive"]:
        #     follow = self.automatic_choice
        # # Never follow cross-domain redirects without asking
        # elif new_gi.host.encode("idna") != gi.host.encode("idna"):
        #     follow = input("Follow cross-domain redirect to %s? (y/n) " % new_gi.url)
        # # Never follow cross-protocol redirects without asking
        # elif new_gi.scheme != gi.scheme:
        #     follow = input("Follow cross-protocol redirect to %s? (y/n) " % new_gi.url)
        # # Don't follow *any* redirect without asking if auto-follow is off
        # elif not self.options["auto_follow_redirects"]:
        #     follow = input("Follow redirect to %s? (y/n) " % new_gi.url)
        # # Otherwise, follow away
        else:
            follow = "yes"
        if follow.strip().lower() not in ("y", "yes"):
            raise UserAbortException()
        previous_redirectors.add(url)
        # if status == "31":
        #     # Permanent redirect
        #     self.permanent_redirects[gi.url] = new_gi.url
        return _fetch_gemini(newurl)
    # Errors
    elif status.startswith("4") or status.startswith("5"):
        raise RuntimeError(meta)
    # Client cert
    elif status.startswith("6"):
        error = "Handling certificates for status 6X are not supported by offpunk\n"
        error += "See bug #31 for discussion about the problem"
        raise RuntimeError(error)
    # Invalid status
    elif not status.startswith("2"):
        raise RuntimeError("Server returned undefined status code %s!" % status)
    # If we're here, this must be a success and there's a response body
    assert status.startswith("2")
    mime = meta
    # Read the response body over the network
    fbody = f.read()
    # DEFAULT GEMINI MIME
    if mime == "":
        mime = "text/gemini; charset=utf-8"
    shortmime, mime_options = parse_mime(mime)
    if "charset" in mime_options:
        try:
            codecs.lookup(mime_options["charset"])
        except LookupError:
            #raise RuntimeError("Header declared unknown encoding %s" % mime_options)
            # If the encoding is wrong, there's a high probably it's UTF-8 with a bad header
            mime_options["charset"] = "UTF-8"
    if shortmime.startswith("text/"):
        # Get the charset and default to UTF-8 in none
        encoding = mime_options.get("charset", "UTF-8")
        try:
            body = fbody.decode(encoding)
        except UnicodeError:
            raise RuntimeError("Could not decode response body using %s\
                    encoding declared in header!" % encoding)
    else:
        body = fbody
    cache = write_body(url,body,mime)
    return cache,newurl
  738. def fetch(url,offline=False,download_image_first=True,images_mode="readable",validity=0,**kwargs):
  739. url = normalize_url(url)
  740. newurl = url
  741. path=None
  742. print_error = "print_error" in kwargs.keys() and kwargs["print_error"]
  743. #Firt, we look if we have a valid cache, even if offline
  744. #If we are offline, any cache is better than nothing
  745. if is_cache_valid(url,validity=validity) or (offline and is_cache_valid(url,validity=0)):
  746. path = get_cache_path(url)
  747. #if the cache is a folder, we should add a "/" at the end of the URL
  748. if not url.endswith("/") and os.path.isdir(get_cache_path(url,add_index=False)) :
  749. newurl = url+"/"
  750. elif offline and is_cache_valid(url,validity=0):
  751. path = get_cache_path(url)
  752. elif "://" in url and not offline:
  753. try:
  754. scheme = url.split("://")[0]
  755. if scheme not in standard_ports:
  756. if print_error:
  757. print("%s is not a supported protocol"%scheme)
  758. path = None
  759. elif scheme in ("http","https"):
  760. if _DO_HTTP:
  761. path=_fetch_http(url,**kwargs)
  762. else:
  763. print("HTTP requires python-requests")
  764. elif scheme == "gopher":
  765. path=_fetch_gopher(url,**kwargs)
  766. elif scheme == "finger":
  767. path=_fetch_finger(url,**kwargs)
  768. elif scheme == "gemini":
  769. path,newurl=_fetch_gemini(url,**kwargs)
  770. else:
  771. print("scheme %s not implemented yet")
  772. except UserAbortException:
  773. return None, newurl
  774. except Exception as err:
  775. cache = set_error(url, err)
  776. # Print an error message
  777. # we fail silently when sync_only
  778. if isinstance(err, socket.gaierror):
  779. if print_error:
  780. print("ERROR: DNS error!")
  781. elif isinstance(err, ConnectionRefusedError):
  782. if print_error:
  783. print("ERROR1: Connection refused!")
  784. elif isinstance(err, ConnectionResetError):
  785. if print_error:
  786. print("ERROR2: Connection reset!")
  787. elif isinstance(err, (TimeoutError, socket.timeout)):
  788. if print_error:
  789. print("""ERROR3: Connection timed out!
  790. Slow internet connection? Use 'set timeout' to be more patient.""")
  791. elif isinstance(err, FileExistsError):
  792. if print_error:
  793. print("""ERROR5: Trying to create a directory which already exists
  794. in the cache : """)
  795. print(err)
  796. elif _DO_HTTP and isinstance(err,requests.exceptions.SSLError):
  797. if print_error:
  798. print("""ERROR6: Bad SSL certificate:\n""")
  799. print(err)
  800. print("""\n If you know what you are doing, you can try to accept bad certificates with the following command:\n""")
  801. print("""set accept_bad_ssl_certificates True""")
  802. elif _DO_HTTP and isinstance(err,requests.exceptions.ConnectionError):
  803. if print_error:
  804. print("""ERROR7: Cannot connect to URL:\n""")
  805. print(str(err))
  806. else:
  807. if print_error:
  808. import traceback
  809. print("ERROR4: " + str(type(err)) + " : " + str(err))
  810. #print("\n" + str(err.with_traceback(None)))
  811. print(traceback.format_exc())
  812. return cache, newurl
  813. # We download images contained in the document (from full mode)
  814. if not offline and download_image_first and images_mode:
  815. renderer = ansicat.renderer_from_file(path,newurl)
  816. if renderer:
  817. for image in renderer.get_images(mode=images_mode):
  818. #Image should exist, should be an url (not a data image)
  819. #and should not be already cached
  820. if image and not image.startswith("data:image/") and not is_cache_valid(image):
  821. width = offutils.term_width() - 1
  822. toprint = "Downloading %s" %image
  823. toprint = toprint[:width]
  824. toprint += " "*(width-len(toprint))
  825. print(toprint,end="\r")
  826. #d_i_f and images_mode are False/None to avoid recursive downloading
  827. #if that ever happen
  828. fetch(image,offline=offline,download_image_first=False,\
  829. images_mode=None,validity=0,**kwargs)
  830. return path, newurl
  831. def main():
  832. descri="Netcache is a command-line tool to retrieve, cache and access networked content.\n\
  833. By default, netcache will returns a cached version of a given URL, downloading it \
  834. only if not existing. A validity duration, in seconds, can also be given so that \
  835. netcache downloads the content only if the existing cache is older than the validity."
  836. # Parse arguments
  837. parser = argparse.ArgumentParser(prog="netcache",description=descri)
  838. parser.add_argument("--path", action="store_true",
  839. help="return path to the cache instead of the content of the cache")
  840. parser.add_argument("--offline", action="store_true",
  841. help="Do not attempt to download, return cached version or error")
  842. parser.add_argument("--max-size", type=int,
  843. help="Cancel download of items above that size (value in Mb).")
  844. parser.add_argument("--timeout", type=int,
  845. help="Time to wait before cancelling connection (in second).")
  846. parser.add_argument("--cache-validity",type=int, default=0,
  847. help="maximum age, in second, of the cached version before \
  848. redownloading a new version")
  849. # No argument: write help
  850. parser.add_argument('url', metavar='URL', nargs='*',
  851. help='download URL and returns the content or the path to a cached version')
  852. # --validity : returns the date of the cached version, Null if no version
  853. # --force-download : download and replace cache, even if valid
  854. args = parser.parse_args()
  855. param = {}
  856. for u in args.url:
  857. if args.offline:
  858. path = get_cache_path(u)
  859. else:
  860. path,url = fetch(u,max_size=args.max_size,timeout=args.timeout,\
  861. validity=args.cache_validity)
  862. if args.path:
  863. print(path)
  864. else:
  865. with open(path,"r") as f:
  866. print(f.read())
  867. f.close()
  868. if __name__== '__main__':
  869. main()