URLHelper.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. #!/usr/bin/env python3
  2. # vim: tabstop=4 shiftwidth=4 expandtab
import posixpath
import re
import urllib.parse
from pathlib import Path, PosixPath
  6. class URLHelper():
  7. def __init__(self, blocklist: str = "blocklist.txt"):
  8. self.blockrules: set = set()
  9. if not Path(blocklist).exists():
  10. return
  11. with open(blocklist) as blockfile:
  12. self.blockrules = set(blockfile.read().split("\n")) - {""}
  13. def isBlocked(self, url) -> bool:
  14. """
  15. Check whether a URL is blocked by the rules.
  16. This method calls :meth:`~URLHelper.resolve`.
  17. """
  18. url = self.resolve(url)
  19. for rule in self.blockrules:
  20. if url.startswith(rule):
  21. return True
  22. return False
  23. @classmethod
  24. def mightBeAURL(cls, url: str) -> bool:
  25. """
  26. Naive URL validation.
  27. >>> URLHelper.mightBeAURL("gemini://example.com/feed")
  28. True
  29. >>> URLHelper.mightBeAURL("my feed")
  30. False
  31. """
  32. pattern = r'^[\w]+://[^/]+\.[^/]+.*'
  33. return bool(re.match(pattern, url))
  34. @classmethod
  35. def correct(cls, url: str) -> str:
  36. """
  37. Unquote a URL and add gemini:// scheme if needed.
  38. >>> URLHelper.correct("example.com/my%20feed")
  39. 'gemini://example.com/my feed'
  40. """
  41. url = urllib.parse.unquote(url)
  42. if not re.findall(r'^[\w:]*//', url):
  43. url = "gemini://" + url
  44. elif not urllib.parse.urlparse(url).netloc:
  45. url = "gemini:" + url
  46. return url
  47. @classmethod
  48. def resolve(cls, url: str) -> str:
  49. """
  50. Resolve relative paths in URLs.
  51. This method calls :meth:`~URLHelper.correct` beforehand.
  52. >>> URLHelper.resolve("gemini://example.com/1/../2")
  53. 'gemini://example.com/2'
  54. """
  55. url = urllib.parse.urlparse(cls.correct(url))
  56. if not url.path:
  57. path = ""
  58. elif not url.path.startswith("/"):
  59. raise ValueError("Not an absoulute URL")
  60. else:
  61. path = str(PosixPath(url.path).resolve())
  62. # restore lost trailing slash
  63. if url.path.endswith("/"):
  64. path += "/"
  65. return urllib.parse.urlunparse(url._replace(path=path))
  66. if __name__ == "__main__":
  67. import doctest
  68. doctest.testmod()