URLHelper.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. #!/usr/bin/env python3
  2. # vim: tabstop=4 shiftwidth=4 expandtab
  3. import re
  4. import urllib.parse
  # Teach urllib.parse about the gemini:// scheme: schemes listed in
  # ``uses_relative`` may be resolved by urljoin(), and schemes listed in
  # ``uses_netloc`` are treated as carrying a network location (host) part.
  # The membership guards keep the registration idempotent, so re-executing
  # this module (e.g. via importlib.reload) does not add duplicate entries.
  if "gemini" not in urllib.parse.uses_relative:
      urllib.parse.uses_relative.append("gemini")
  if "gemini" not in urllib.parse.uses_netloc:
      urllib.parse.uses_netloc.append("gemini")
  class URLHelper():
      """
      Helpers for validating, normalising and block-checking gemini:// URLs.

      Block rules are plain URL prefixes, one per line, read once from a
      blocklist file when the helper is constructed.
      """

      def __init__(self, blocklist: str = "blocklist.txt"):
          """
          Load the block rules from ``blocklist``.

          A missing file is not an error: the helper then keeps an empty
          rule set, so :meth:`isBlocked` returns False for every URL.

          :param blocklist: path of a UTF-8 text file holding one URL
              prefix per line; blank lines are ignored.
          """
          self.blockrules: set = set()
          # EAFP: open the file directly instead of probing for existence
          # first. This avoids the exists()/open() race and does not rely
          # on pathlib.Path, which this module never imports.
          try:
              with open(blocklist, encoding="utf-8") as blockfile:
                  # Strip surrounding whitespace so stray spaces around a
                  # rule cannot silently disable it; drop blank lines.
                  rules = {line.strip() for line in blockfile}
          except (FileNotFoundError, NotADirectoryError):
              return
          self.blockrules = rules - {""}

      def isBlocked(self, url) -> bool:
          """
          Check whether a URL is blocked by the rules.

          The URL is normalised with :meth:`~URLHelper.resolve` first and is
          considered blocked when the result starts with any rule.

          :param url: the URL to check
          :return: True if any block rule is a prefix of the resolved URL
          """
          url = self.resolve(url)
          # str.startswith accepts a tuple of prefixes; an empty tuple
          # (no rules loaded) yields False.
          return url.startswith(tuple(self.blockrules))

      @classmethod
      def mightBeAURL(cls, url: str) -> bool:
          """
          Naive URL validation.

          Accepts anything shaped like ``<scheme>://<host-with-a-dot>...``;
          the scheme is not restricted to gemini.

          >>> URLHelper.mightBeAURL("gemini://example.com/feed")
          True
          >>> URLHelper.mightBeAURL("my feed")
          False
          """
          pattern = r'^[\w]+://[^/]+\.[^/]+.*'
          return bool(re.match(pattern, url))

      @classmethod
      def getNetLoc(cls, url: str) -> str:
          """
          Return the network location (``host[:port]``) part of ``url``.

          >>> URLHelper.getNetLoc("gemini://example.com/feed")
          'example.com'
          """
          return urllib.parse.urlparse(url).netloc

      @staticmethod
      def _removeDotSegments(path: str) -> str:
          """
          Remove "." and ".." segments from a URL path.

          ".." never climbs above the root, and a path that ends in a dot
          segment keeps a trailing slash (``/a/b/..`` becomes ``/a/``).
          """
          if not path:
              return path
          segments = path.split("/")
          kept = []
          for segment in segments:
              if segment == ".":
                  continue
              if segment == "..":
                  # kept[0] is the empty string in front of the leading
                  # "/", which marks the root; never pop it.
                  if len(kept) > 1:
                      kept.pop()
                  continue
              kept.append(segment)
          if segments[-1] in (".", ".."):
              kept.append("")
          return "/".join(kept)

      @classmethod
      def resolve(cls, url: str, url2: str = "") -> str:
          """
          Resolve relative paths in URLs.

          Both arguments are percent-decoded, ``url2`` is joined onto
          ``url``, ``gemini://`` is prepended when the result does not
          already start with it, and "." / ".." path segments are removed.
          Only scheme, network location and path are kept; params, query
          and fragment are dropped.

          >>> URLHelper.resolve("gemini://example.com/1/../2")
          'gemini://example.com/2'

          :param url: base URL (absolute, or scheme-less host/path)
          :param url2: optional reference to resolve against ``url``
          :return: the normalised ``gemini://`` URL
          """
          url = urllib.parse.unquote(url)
          url2 = urllib.parse.unquote(url2)
          fullUrl = urllib.parse.urljoin(url, url2)
          if not fullUrl.startswith("gemini://"):
              fullUrl = f"gemini://{fullUrl}"
          parseResult = urllib.parse.urlparse(fullUrl)
          # urljoin() hands the base back untouched when the reference is
          # empty, so dot segments can still be present here. Normalise the
          # parsed path directly rather than feeding it back through
          # urljoin(), where a path starting with "//" would be read as a
          # new network location.
          path = cls._removeDotSegments(parseResult.path)
          return f"{parseResult.scheme}://{parseResult.netloc}{path}"