CyberTailor
/
gemini-antenna
원본 프로젝트 : tinyrabbit/gemini-antenna


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
							#!/usr/bin/env python3
# vim: tabstop=4 shiftwidth=4 expandtab

import posixpath
import re
import urllib.parse
from pathlib import Path, PosixPath

class URLHelper():
    """
    Helpers for validating, normalising and block-listing feed URLs.

    An instance holds the set of block rules (URL prefixes) loaded from a
    text file. The URL clean-up helpers are class methods and can be used
    without creating an instance.
    """

    def __init__(self, blocklist: str = "blocklist.txt"):
        """
        Load block rules from the ``blocklist`` file, one rule per line.

        Empty lines are discarded. If the file does not exist, the rule set
        is left empty.
        """
        self.blockrules: set = set()
        if not Path(blocklist).exists():
            return

        with open(blocklist) as blockfile:
            self.blockrules = set(blockfile.read().split("\n")) - {""}

    def isBlocked(self, url) -> bool:
        """
        Check whether a URL is blocked by the rules.
        This method calls :meth:`~URLHelper.resolve`.

        A URL is blocked when its resolved form starts with any rule.
        :raises ValueError: propagated from :meth:`~URLHelper.resolve`.
        """
        resolved = self.resolve(url)
        return any(resolved.startswith(rule) for rule in self.blockrules)

    @classmethod
    def mightBeAURL(cls, url: str) -> bool:
        """
        Naive URL validation.

        Accepts anything shaped like ``scheme://host.tld...``; the host part
        must contain at least one dot.

        >>> URLHelper.mightBeAURL("gemini://example.com/feed")
        True
        >>> URLHelper.mightBeAURL("my feed")
        False
        """
        pattern = r'^[\w]+://[^/]+\.[^/]+.*'
        return bool(re.match(pattern, url))

    @classmethod
    def correct(cls, url: str) -> str:
        """
        Unquote a URL and add gemini:// scheme if needed.

        >>> URLHelper.correct("example.com/my%20feed")
        'gemini://example.com/my feed'
        """
        url = urllib.parse.unquote(url)

        if not re.findall(r'^[\w:]*//', url):
            # No "//" authority marker at the start: treat as a bare host/path.
            url = "gemini://" + url
        elif not urllib.parse.urlparse(url).netloc:
            url = "gemini:" + url
        # NOTE(review): a scheme-relative URL such as "//example.com/x" has a
        # netloc, so neither branch fires and it is returned without a
        # scheme -- confirm this is intended.

        return url

    @classmethod
    def resolve(cls, url: str) -> str:
        """
        Resolve relative paths in URLs.
        This method calls :meth:`~URLHelper.correct` beforehand.

        The path is normalised lexically ("." and ".." segments and repeated
        slashes are removed); the local filesystem is never consulted.
        A trailing slash on the input path is preserved.

        :raises ValueError: if the path is non-empty and does not start
            with "/".

        >>> URLHelper.resolve("gemini://example.com/1/../2")
        'gemini://example.com/2'
        >>> URLHelper.resolve("gemini://example.com/a/../")
        'gemini://example.com/'
        """
        parsed = urllib.parse.urlparse(cls.correct(url))

        if not parsed.path:
            path = ""
        elif not parsed.path.startswith("/"):
            raise ValueError("Not an absoulute URL")
        else:
            # Purely textual normalisation: URL paths are not local files, so
            # symlinks on this machine must not influence the result.
            path = posixpath.normpath(parsed.path)
            # normpath keeps exactly two leading slashes (POSIX rule);
            # for a URL path a single one is wanted.
            if path.startswith("//"):
                path = path[1:]
            # restore lost trailing slash (but never turn "/" into "//")
            if parsed.path.endswith("/") and not path.endswith("/"):
                path += "/"
        return urllib.parse.urlunparse(parsed._replace(path=path))

if __name__ == "__main__":
    # Run as a script: execute the doctest examples embedded in this
    # module's docstrings.
    import doctest
    doctest.testmod()