lark
/
gemini-antenna
派生自 tinyrabbit/gemini-antenna


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
							#!/usr/bin/env python3
# vim: tabstop=4 shiftwidth=4 expandtab

import urllib.parse
import re
import feedparser
import time
from datetime import datetime
# Register the "gemini" scheme with urllib. urljoin() only resolves relative
# references against a base URL whose scheme is listed in uses_relative, and
# only treats the "//host" part as a network location for schemes listed in
# uses_netloc. The parsers below call urljoin() with gemini:// base URLs, so
# both registrations must happen before any of them runs.
urllib.parse.uses_relative.append("gemini")
urllib.parse.uses_netloc.append("gemini")

# collapse whitespace
def _cw(text):
    return re.sub(r'\s', ' ', text)

def parsegemsub(feed, baseurl):
    """Parse a gemsub (gemtext subscription) page into feed entries.

    The first ``#`` heading matched by the author pattern is used as the
    author of every entry. Each link line of the form
    ``=> URL YYYY-MM-DD [title]`` becomes one FeedEntry; link lines whose
    date cannot be parsed are skipped.

    Args:
        feed: Text of the gemtext document.
        baseurl: URL of the document. It is stored as the feed URL of every
            entry and used as the base for resolving relative links.

    Returns:
        A list of FeedEntry objects (possibly empty), or None when no
        heading is found to take the author from.
    """
    authorpattern = r'^#\s*([^#\r\n]+)'
    entriespattern = r'^=>\s*(\S+)\s+(\d{4}-\d{2}-\d{2})[^\r\n\S]*([^\r\n]*)'

    authormatch = re.search(authorpattern, feed, re.MULTILINE)
    if authormatch is None:
        return None
    author = authormatch.group(1)

    entries = []
    for href, datestring, label in re.findall(entriespattern, feed, re.MULTILINE):
        # Gemsub dates carry no time of day: pin the entry to noon, then
        # convert to an integer unix timestamp.
        try:
            moment = datetime.strptime(datestring + " 12:00:00", "%Y-%m-%d %H:%M:%S")
            updated = int(moment.timestamp())
        except (ValueError, OverflowError, OSError):
            # Looks like a date but is not one (e.g. month 13), or cannot be
            # represented as a timestamp on this platform: skip the line.
            continue
        # A gemsub feed often has relative links; urljoin absolutizes them
        # and resolves "." / ".." segments itself. No textual stripping of
        # "/." is done afterwards, because that would corrupt legitimate
        # paths such as "/.well-known/" or "/.hidden.gmi".
        link = urllib.parse.urljoin(baseurl, href)
        # Fall back to the date when the link line has no label.
        title = label or datestring
        entries.append(FeedEntry(baseurl, author, updated, title, link))
    return entries

def parsetwtxt(feed, baseurl):
    """Parse a twtxt feed into TwtxtEntry objects.

    The author is taken from a ``# nick = NAME`` metadata comment when one
    is present, otherwise *baseurl* is used as the author. Every line of the
    form ``TIMESTAMP<TAB>TEXT`` whose timestamp parses becomes one entry;
    all other lines are ignored.

    Args:
        feed: Text of the twtxt file.
        baseurl: URL of the feed, stored as the feed URL of every entry.

    Returns:
        A list of TwtxtEntry objects (possibly empty).
    """
    authorpattern = r'^#\s*nick\s*=\s*(\S+)'
    # This is a naive match, but only lines whose first field validates as
    # a timestamp are kept.
    entriespattern = r'^(\S+)\t([^\r\n]+)'
    # Accepted timestamp layouts: whole seconds, and the same layout with
    # fractional seconds. Both require a UTC offset.
    timestampformats = ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S.%f%z")

    nickmatch = re.search(authorpattern, feed, re.MULTILINE)
    author = nickmatch.group(1) if nickmatch else baseurl

    entries = []
    for stamp, twt in re.findall(entriespattern, feed, re.MULTILINE):
        # None (not 0) marks "unparsed": 0 is a valid unix timestamp.
        posted = None
        for layout in timestampformats:
            try:
                posted = int(datetime.strptime(stamp, layout).timestamp())
            except (ValueError, OverflowError):
                continue
            break
        if posted is None:
            continue
        entries.append(TwtxtEntry(feedurl=baseurl, author=author, posted=posted, twt=twt))
    return entries

def parsexml(feed, baseurl):
    """Parse an XML feed with feedparser into FeedEntry objects.

    Each entry's author is its own author name when present, otherwise the
    feed-level author name, otherwise the feed title; entries for which none
    of these is available are skipped. An entry is also skipped when it
    lacks an update time, a title or a usable link.

    Args:
        feed: The feed document, passed straight to ``feedparser.parse``.
        baseurl: URL of the feed. It is stored as the feed URL of every
            entry, used as the base for resolving relative links, and its
            scheme prefix selects which of an entry's links is used.

    Returns:
        A list of FeedEntry objects (possibly empty), or None when the
        parse result has no ``entries`` key.
    """
    scheme = baseurl.split("://")[0]
    parsedfeed = feedparser.parse(feed)
    feedmeta = parsedfeed['feed']

    # Default author: the feed author's name, or lacking that the feed title.
    feedauthor = None
    if 'author_detail' in feedmeta and 'name' in feedmeta['author_detail']:
        feedauthor = _cw(feedmeta['author_detail']['name'])
    if not feedauthor and 'title' in feedmeta:
        feedauthor = _cw(feedmeta['title'])

    if 'entries' not in parsedfeed:
        return None

    entries = []
    for entry in parsedfeed['entries']:
        try:  # The feed could miss all sorts of fields...
            if 'author_detail' in entry and 'name' in entry['author_detail']:
                author = _cw(entry['author_detail']['name'])
            elif feedauthor:
                author = feedauthor
            else:
                continue
            updated = int(time.mktime(entry['updated_parsed']))  # Seconds since epoch
            title = _cw(entry['title'])
            if entry['links']:
                # Several links may be offered; take the first whose href
                # starts with the feed's own scheme. None means no match.
                link = next(
                    (candidate['href'] for candidate in entry['links']
                     if candidate['href'].startswith(scheme)),
                    None,
                )
            else:
                link = _cw(entry['link'])
            if not link:
                continue
            # urljoin absolutizes relative links and resolves "." / ".."
            # segments itself. No textual stripping of "/." is done
            # afterwards, because that would corrupt legitimate paths such
            # as "/.well-known/" or "/.hidden.gmi".
            link = urllib.parse.urljoin(baseurl, link)
        except (KeyError, AttributeError, TypeError, ValueError, OverflowError, OSError):
            # Missing or malformed field: best effort, drop this entry only.
            continue
        entries.append(FeedEntry(baseurl, author, updated, title, link))
    return entries


class FeedEntry:
    """A single item produced by the gemsub and XML feed parsers.

    Plain value holder: the constructor stores its arguments unchanged
    under attributes of the same names.
    """

    def __init__(self, feedurl, author, updated, title, link):
        # Where the entry came from and who it is attributed to.
        self.feedurl, self.author = feedurl, author
        # Update time (the parsers in this module pass an integer unix
        # timestamp), display title and target URL.
        self.updated, self.title, self.link = updated, title, link

class TwtxtEntry:
    """A single post produced by the twtxt parser.

    Plain value holder: the constructor stores its arguments unchanged
    under attributes of the same names.
    """

    def __init__(self, feedurl, author, posted, twt):
        # Where the post came from and who it is attributed to.
        self.feedurl, self.author = feedurl, author
        # Posting time (the twtxt parser passes an integer unix timestamp)
        # and the text of the post.
        self.posted, self.twt = posted, twt