#!/usr/bin/env python3
# vim: tabstop=4 shiftwidth=4 expandtab
import posixpath
import re
import urllib.parse
from pathlib import Path, PosixPath
class URLHelper():
    """
    Helpers for validating, correcting, resolving and block-checking URLs.

    An instance holds a set of block rules (URL prefixes) loaded from a
    text file; the class methods work without an instance.
    """

    def __init__(self, blocklist: str = "blocklist.txt"):
        """
        Load block rules from ``blocklist``, one rule per line.

        Empty lines are ignored. If the file does not exist, the rule set
        stays empty.
        """
        self.blockrules: set = set()

        if not Path(blocklist).exists():
            return

        with open(blocklist) as blockfile:
            self.blockrules = set(blockfile.read().split("\n")) - {""}

    def isBlocked(self, url: str) -> bool:
        """
        Check whether a URL is blocked by the rules.

        A URL is blocked when its resolved form starts with any of the
        rules in ``self.blockrules``.

        This method calls :meth:`~URLHelper.resolve`.
        """
        url = self.resolve(url)
        return any(url.startswith(rule) for rule in self.blockrules)

    @classmethod
    def mightBeAURL(cls, url: str) -> bool:
        """
        Naive URL validation.

        Accepts anything shaped like ``scheme://host.tld...``; the host
        part only has to contain a dot and no slash.

        >>> URLHelper.mightBeAURL("gemini://example.com/feed")
        True
        >>> URLHelper.mightBeAURL("my feed")
        False
        """
        pattern = r'^[\w]+://[^/]+\.[^/]+.*'
        return bool(re.match(pattern, url))

    @classmethod
    def correct(cls, url: str) -> str:
        """
        Unquote a URL and add gemini:// scheme if needed.

        After unquoting, ``gemini://`` is prepended when the text does not
        begin with an (optional) scheme followed by ``//``. Otherwise, if
        :func:`urllib.parse.urlparse` finds no network location,
        ``gemini:`` is prepended.

        >>> URLHelper.correct("example.com/my%20feed")
        'gemini://example.com/my feed'
        """
        url = urllib.parse.unquote(url)

        if not re.match(r'^[\w:]*//', url):
            url = "gemini://" + url
        elif not urllib.parse.urlparse(url).netloc:
            url = "gemini:" + url

        return url

    @classmethod
    def resolve(cls, url: str) -> str:
        """
        Resolve relative paths in URLs.

        The path is normalised purely lexically (``.``, ``..`` and
        duplicate slashes are collapsed); the local filesystem is never
        consulted. A trailing slash in the input path is kept.

        This method calls :meth:`~URLHelper.correct` beforehand.

        :raises ValueError: if the corrected URL has a non-empty path that
            does not start with ``/``.

        >>> URLHelper.resolve("gemini://example.com/1/../2")
        'gemini://example.com/2'
        >>> URLHelper.resolve("gemini://example.com/")
        'gemini://example.com/'
        """
        parsed = urllib.parse.urlparse(cls.correct(url))
        path = parsed.path

        if path:
            if not path.startswith("/"):
                raise ValueError("Not an absoulute URL")

            # Lexical normalisation only: a URL path is not a local file,
            # so symlinks on this machine must not influence the result.
            normalised = posixpath.normpath(path)

            # normpath keeps a leading "//" (it is special in POSIX);
            # in a URL path it is just a duplicate slash.
            if normalised.startswith("//"):
                normalised = "/" + normalised.lstrip("/")

            # restore lost trailing slash, but never turn "/" into "//"
            if path.endswith("/") and not normalised.endswith("/"):
                normalised += "/"

            path = normalised

        return urllib.parse.urlunparse(parsed._replace(path=path))
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in the
    # docstrings of this module.
    from doctest import testmod

    testmod()
|