tinyrabbit
/
gemini-antenna


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
							#!/usr/bin/env python3
# vim: tabstop=4 shiftwidth=4 expandtab

import posixpath
import re
import urllib.parse
from pathlib import Path, PosixPath

class URLHelper:
    """
    Helpers for validating, normalising and block-listing feed URLs.

    An instance holds a set of block rules (URL prefixes) loaded from a text
    file; the class methods are pure string helpers and need no instance.
    """

    def __init__(self, blocklist: str = "blocklist.txt"):
        """
        Load block rules from a text file, one URL prefix per line.

        Blank lines are ignored and surrounding whitespace is stripped from
        every rule, so a stray trailing space cannot silently disable a rule.

        :param blocklist: Path of the rules file. If it does not exist the
            helper starts with an empty rule set.
        """
        self.blockrules: set = set()
        if not Path(blocklist).exists():
            return

        with open(blocklist, encoding="utf-8") as blockfile:
            stripped = (line.strip() for line in blockfile)
            self.blockrules = {rule for rule in stripped if rule}

    def isBlocked(self, url) -> bool:
        """
        Check whether a URL is blocked by the rules.
        This method calls :meth:`~URLHelper.resolve`.

        A URL is blocked when its resolved form starts with any rule.

        :raises ValueError: propagated from :meth:`~URLHelper.resolve`.
        """
        url = self.resolve(url)
        return any(url.startswith(rule) for rule in self.blockrules)

    @classmethod
    def mightBeAURL(cls, url: str) -> bool:
        """
        Naive URL validation.

        Only checks for the shape ``scheme://host.tld...``.

        >>> URLHelper.mightBeAURL("gemini://example.com/feed")
        True
        >>> URLHelper.mightBeAURL("my feed")
        False
        """
        pattern = r'^[\w]+://[^/]+\.[^/]+.*'
        return bool(re.match(pattern, url))

    @classmethod
    def correct(cls, url: str) -> str:
        """
        Unquote a URL and add gemini:// scheme if needed.

        >>> URLHelper.correct("example.com/my%20feed")
        'gemini://example.com/my feed'
        >>> URLHelper.correct("//example.com/feed")
        'gemini://example.com/feed'
        """
        url = urllib.parse.unquote(url)

        if not re.match(r'[\w:]*//', url):
            # No "//" authority marker at the start: treat it as a bare host.
            url = "gemini://" + url
        elif not urllib.parse.urlparse(url).scheme:
            # Starts with "//" but has no scheme (protocol-relative URL):
            # only the scheme is missing.
            url = "gemini:" + url

        return url

    @classmethod
    def resolve(cls, url: str) -> str:
        """
        Resolve relative paths in URLs.
        This method calls :meth:`~URLHelper.correct` beforehand.

        The path is normalised purely lexically; the local filesystem is
        never consulted, so symlinks on the host cannot alter the result.

        :raises ValueError: if the URL has a non-empty path that does not
            start with ``/``.

        >>> URLHelper.resolve("gemini://example.com/1/../2")
        'gemini://example.com/2'
        >>> URLHelper.resolve("gemini://example.com/a/./b/")
        'gemini://example.com/a/b/'
        """
        url = urllib.parse.urlparse(cls.correct(url))

        if not url.path:
            path = ""
        elif not url.path.startswith("/"):
            raise ValueError("Not an absoulute URL")
        else:
            # normpath keeps a leading "//" (a POSIX special case that means
            # nothing in a URL), so collapse it to a single slash.
            path = "/" + posixpath.normpath(url.path).lstrip("/")
            # restore lost trailing slash
            if url.path.endswith("/") and path != "/":
                path += "/"
        return urllib.parse.urlunparse(url._replace(path=path))