- #!/usr/bin/env python3
- # vim: tabstop=4 shiftwidth=4 expandtab
- import antennaDB
- from os import getenv
- import datetime
- import time
- import configparser
- import argparse
- import gemcall
- import URLHelper
- from multiFeedParsing import parsetwtxt,parsegemsub,parsexml,FeedEntry,TwtxtEntry
- from pageGeneration import updateStatFile,generateFeedPage,generateAtomFeed,generateTwtxtPage,generateTwtxtFile
# Uniform log messages and output
def log(msg: str = "", logfile: str = "", response: "int | None" = None) -> None:
    """Append a UTC-timestamped message to the log file.

    If a Gemini response code is given, also echo it with the message to
    stdout as a Gemini response line (this script runs as a CGI handler).

    Parameters:
        msg: the message to record.
        logfile: path of the log file to append to.
        response: Gemini status code to send to the client, or None.
    """
    # BUG FIX: use a context manager so the handle is closed even if
    # write() raises; the original also shadowed the function name 'log'
    # with the file object.
    with open(logfile, "a") as fh:
        fh.write(datetime.datetime.utcnow().isoformat() + " " + msg + "\n")
    if response:
        print(f"{str(response)} {msg}\r\n")
def getFeed(uh: URLHelper = None, feedurl: str = None) -> dict:
    """Fetch a feed over Gemini, following at most two redirects.

    Returns {"response": gemcall.Response-or-None, "url": str} where "url"
    is the URL that was actually fetched.  "response" is None when the
    fetch failed or the URL is blocked; blocked feeds are also deleted
    from the database.
    """
    # This is a bit messy, but we want to allow a few redirects and
    # still keep track of which URL we're actually fetching.
    # We'll also remove the feed entirely if it's blocked by the rules.
    response = None
    correctedfeedurl = feedurl
    for _attempt in range(3):
        correctedfeedurl = uh.resolve(feedurl)
        if not uh.mightBeAURL(correctedfeedurl):
            # BUG FIX: the original referenced the undefined name
            # 'logfilei' here, raising NameError instead of logging.
            log(msg = f"ERROR: pretty sure '{feedurl}' is not a real URL...", logfile = logfile, response = 40)
            exit(2)
        if uh.isBlocked(correctedfeedurl):
            log(msg = f"ERROR: feed URL '{feedurl}' is blocked by rules.", logfile = logfile, response = 40)
            db.deleteFeeds([correctedfeedurl])
            break
        try:
            response = gemcall.request(correctedfeedurl)
        except Exception:
            # BUG FIX: narrowed from a bare 'except:' that would also
            # swallow SystemExit/KeyboardInterrupt.
            log(msg = f"ERROR: failed to fetch feed from '{correctedfeedurl}'", logfile = logfile, response = 40)
            break
        if response.responsecode == 30 or response.responsecode == 31:
            log(msg = f"INFO: following redirect from '{correctedfeedurl}' to '{response.meta}'.", logfile = logfile)
            # BUG FIX: feed the redirect target back into the next loop
            # iteration.  The original assigned only to correctedfeedurl,
            # which the next pass immediately clobbered with
            # uh.resolve(feedurl), so redirects were never followed.
            feedurl = response.meta
            correctedfeedurl = response.meta
        elif response.responsecode != 20:
            log(msg = f"ERROR: bad response for feed '{correctedfeedurl}': '{str(response.responsecode)} {response.meta}'", logfile = logfile, response = 40)
            response = None
            break
        else:
            break
    return { "response": response, "url": correctedfeedurl }
# The feed URL arrives as the CGI query string; prompt for one if absent.
feedurl = getenv('QUERY_STRING')
if not feedurl:
    print("10 Feed URL:\r\n")
    exit()

# The only command-line option is the path to the config file; without it
# there is nothing useful to do, so show the help text and stop.
parser = argparse.ArgumentParser(description="Ingest feed $QUERY_STRING to Antenna, according to settings in --config file.")
parser.add_argument("--config", help="Path to config file.")
args = parser.parse_args()
if not args.config:
    parser.print_help()
    exit()
# Load the configuration this ingest run will work from.
config = configparser.ConfigParser()
config.read(args.config)
nonpublic = config["nonpublic"]
public = config["public"]
rules = config["rules"]

# Both directories are mandatory; bail out with a clear message otherwise.
rootdir = nonpublic.get("rootdir", "")
outputdir = public.get("outputdir", "")
if not rootdir:
    print("ERROR: The value 'rootdir' is missing in the [nonpublic] section of config file.")
    exit(1)
if not outputdir:
    print("ERROR: The value 'outputdir' is missing in the [public] section of config file.")
    exit(1)

# Paths and helpers derived from the config, with sensible defaults.
db = antennaDB.AntennaDB(f"{rootdir}/{nonpublic.get('db','antenna.sqlite')}")
blocklist = f"{rootdir}/{nonpublic.get('blocklist','blocklist.txt')}"
logfile = f"{rootdir}/{nonpublic.get('logfile','antenna.log')}"
uh = URLHelper.URLHelper(blocklist = blocklist)

# Entries older than this epoch timestamp are rejected/pruned.
# NOTE(review): time.mktime() interprets its argument as *local* time, so
# pairing it with utcnow() skews the cutoff by the UTC offset; presumably
# the stored entry timestamps are derived the same way — confirm before
# changing this computation.
cutoffdays = int(rules.get('agelimit', '7'))
agelimit = int(time.mktime(datetime.datetime.utcnow().utctimetuple())) - cutoffdays * 24 * 3600
# Fetch the feed (following redirects); on failure getFeed() has already
# logged and reported the reason, so just stop quietly.
feedResponse = getFeed(uh, feedurl)
response = feedResponse["response"]
correctedfeedurl = feedResponse["url"]
if not response:
    exit()
try: # 300kb should be enough for most feeds
    feed = response.read(300*1024).decode('UTF-8')
except Exception:
    # BUG FIX: narrowed from a bare 'except:', which would also swallow
    # SystemExit/KeyboardInterrupt.  Read or UTF-8 decode failures are
    # reported to the client as a Gemini 40.
    log(msg = f"ERROR: failed to properly read content from '{correctedfeedurl}'", logfile = logfile, response = 40)
    exit()
-
# Since we received an updated feed we'll start by removing existing entries
log(msg = f"INFO: fetched feed from '{correctedfeedurl}', removing from DB", logfile = logfile)
db.deleteFeeds([correctedfeedurl])

# Try the gemsub parser first and fall back to Atom/XML.
log(msg = f"INFO: attempting to parse feed '{correctedfeedurl}' as gemlog feed", logfile = logfile)
preliminaryEntries = parsegemsub(feed, correctedfeedurl) or parsexml(feed, correctedfeedurl)

# Keep only entries whose links pass the block rules, resolving each link.
# (reversed() mirrors the original pop()-from-the-end processing order.)
entries = []
for entry in reversed(preliminaryEntries):
    if uh.isBlocked(entry.link):
        log(msg = f"ERROR: entry URL '{entry.link}' is blocked by rules.", logfile = logfile)
    else:
        entry.link = uh.resolve(entry.link)
        entries.append(entry)

if entries:
    db.insertFeedEntries(entries, agelimit)
else:
    # Nothing parsed as a gemlog feed; try twtxt as a last resort.
    log(msg = f"INFO: attempting to parse feed '{correctedfeedurl}' as twtxt", logfile = logfile)
    db.insertTwtxtEntries(parsetwtxt(feed, correctedfeedurl), agelimit)
db.pruneDB(agelimit)
# Short of getting all entries and comparing them fully to each other there's really no way to tell whether anything needs regeneration.
twts = db.getTwts()
feedEntries = db.getEntries()
# Distinct feed URLs, for the stats file.
feedURLs = {entry.feedurl for entry in feedEntries}
# Output file config keys and their defaults:
# twtgmi = twtxt.gmi
# twtxt = twtxt.txt
# atom = atom.xml
# stats = stats.tsv
# index = index.gmi
updateStatFile(len(feedURLs), len(feedEntries), f"{outputdir}/{public.get('stats','stats.tsv')}")
generateFeedPage(feedEntries, f"{outputdir}/{public.get('index', 'index.gmi')}")
generateAtomFeed(feedEntries, f"{outputdir}/{public.get('atom', 'atom.xml')}")
generateTwtxtPage(twts, f"{outputdir}/{public.get('twtgmi', 'twtxt.gmi')}")
# BUG FIX: twtxt.txt was generated with generateTwtxtPage (gemtext output),
# leaving the imported generateTwtxtFile unused; use the plain-text
# generator for the .txt file.
generateTwtxtFile(twts, f"{outputdir}/{public.get('twtxt', 'twtxt.txt')}")
print("20 text/gemini\r\nThank you for your submission! Antenna has now been updated.\n")