direct-ingestion.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. #!/usr/bin/env python3
  2. # vim: tabstop=4 shiftwidth=4 expandtab
  3. import antennaDB
  4. from os import getenv
  5. import datetime
  6. import time
  7. import configparser
  8. import argparse
  9. import gemcall
  10. import URLHelper
  11. from multiFeedParsing import parsetwtxt,parsegemsub,parsexml,FeedEntry,TwtxtEntry
  12. from pageGeneration import updateStatFile,generateFeedPage,generateAtomFeed,generateTwtxtPage,generateTwtxtFile
  13. # Uniform log messages and output
  14. def log(msg: str = "", logfile: str = "", response:int = None) -> None:
  15. log = open(logfile, "a")
  16. log.write(datetime.datetime.utcnow().isoformat() + " " + msg + "\n")
  17. log.close()
  18. if response:
  19. print(f"{str(response)} {msg}\r\n")
  20. def getFeed(uh: URLHelper = None,feedurl: str = None) -> { "response": gemcall.Response, "url": str } :
  21. # This is a bit messy, but we want to allow a few redirects and
  22. # still keep track of which URL we're actually fetching.
  23. # We'll also remove the feed entirely if it's blocked by the rules.
  24. response = None
  25. for i in [0,1,2]:
  26. correctedfeedurl = uh.resolve(feedurl)
  27. if not uh.mightBeAURL(correctedfeedurl):
  28. log(msg = f"ERROR: pretty sure '{feedurl}' is not a real URL...",logfile = logfilei, response = 40)
  29. exit(2)
  30. if uh.isBlocked(correctedfeedurl):
  31. log(msg = f"ERROR: feed URL '{feedurl}' is blocked by rules.",logfile = logfile, response = 40)
  32. db.deleteFeeds([correctedfeedurl])
  33. break
  34. try:
  35. response = gemcall.request(correctedfeedurl)
  36. except:
  37. log(msg = f"ERROR: failed to fetch feed from '{correctedfeedurl}'",logfile = logfile, response = 40)
  38. break
  39. if response.responsecode == 30 or response.responsecode == 31:
  40. log(msg = f"INFO: following redirect from '{correctedfeedurl}' to '{response.meta}'.",logfile = logfile)
  41. correctedfeedurl = response.meta
  42. elif response.responsecode != 20:
  43. log(msg = f"ERROR: bad response for feed '{correctedfeedurl}': '{str(response.responsecode)} {response.meta}'",logfile = logfile, response = 40)
  44. response = None
  45. break
  46. else:
  47. break
  48. return { "response":response, "url":correctedfeedurl}
  49. feedurl = getenv('QUERY_STRING')
  50. if not feedurl:
  51. print("10 Feed URL:\r\n")
  52. exit()
  53. argparser = argparse.ArgumentParser(description="Ingest feed $QUERY_STRING to Antenna, according to settings in --config file.")
  54. argparser.add_argument("--config", help="Path to config file.")
  55. args = argparser.parse_args()
  56. if not args.config:
  57. argparser.print_help()
  58. exit()
  59. config = configparser.ConfigParser()
  60. config.read(args.config)
  61. nonpublic = config["nonpublic"]
  62. public = config["public"]
  63. rules = config["rules"]
  64. rootdir = nonpublic.get("rootdir","")
  65. outputdir = public.get("outputdir","")
  66. if not rootdir:
  67. print("ERROR: The value 'rootdir' is missing in the [nonpublic] section of config file.")
  68. exit(1)
  69. if not outputdir:
  70. print("ERROR: The value 'outputdir' is missing in the [public] section of config file.")
  71. exit(1)
  72. db = antennaDB.AntennaDB(f"{rootdir}/{nonpublic.get('db','antenna.sqlite')}")
  73. blocklist = f"{rootdir}/{nonpublic.get('blocklist','blocklist.txt')}"
  74. logfile = f"{rootdir}/{nonpublic.get('logfile','antenna.log')}"
  75. uh = URLHelper.URLHelper(blocklist = blocklist)
  76. agelimit = int(time.mktime(datetime.datetime.utcnow().utctimetuple())) - (3600*24*int(rules.get('agelimit','7')))
  77. feedResponse = getFeed(uh, feedurl)
  78. response = feedResponse["response"]
  79. correctedfeedurl = feedResponse["url"]
  80. if not response:
  81. exit()
  82. try: # 300kb should be enough for most feeds
  83. feed = response.read(300*1024).decode('UTF-8')
  84. except:
  85. log(msg = f"ERROR: failed to properly read content from '{correctedfeedurl}'",logfile = logfile, response = 40)
  86. exit()
  87. # Since we received an updated feed we'll start by removing existing entries
  88. log(msg = f"INFO: fetched feed from '{correctedfeedurl}', removing from DB",logfile = logfile)
  89. db.deleteFeeds([correctedfeedurl])
  90. log(msg = f"INFO: attempting to parse feed '{correctedfeedurl}' as gemlog feed",logfile = logfile)
  91. preliminaryEntries = parsegemsub(feed, correctedfeedurl) or parsexml(feed, correctedfeedurl)
  92. entries = []
  93. while len(preliminaryEntries) > 0:
  94. entry = preliminaryEntries.pop()
  95. if not uh.isBlocked(entry.link):
  96. entry.link = uh.resolve(entry.link)
  97. entries.append(entry)
  98. else:
  99. log(msg = f"ERROR: entry URL '{entry.link}' is blocked by rules.",logfile = logfile)
  100. if entries:
  101. db.insertFeedEntries(entries, agelimit)
  102. else:
  103. log(msg = f"INFO: attempting to parse feed '{correctedfeedurl}' as twtxt",logfile = logfile)
  104. db.insertTwtxtEntries(parsetwtxt(feed, correctedfeedurl), agelimit)
  105. db.pruneDB(agelimit)
  106. # Short of getting all entries and comparing them fully to each other there's really no way to tell whether anything needs regeneration.
  107. twts = db.getTwts()
  108. feedEntries = db.getEntries()
  109. feedURLs = set()
  110. for entry in feedEntries:
  111. feedURLs.add(entry.feedurl)
  112. # twtgmi = twtxt.gmi
  113. # twtxt = twtxt.txt
  114. # atom = atom.xml
  115. # stats = stats.tsv
  116. # index = index.gmi
  117. updateStatFile(len(feedURLs), len(feedEntries), f"{outputdir}/{public.get('stats','stats.tsv')}")
  118. generateFeedPage(feedEntries, f"{outputdir}/{public.get('index', 'index.gmi')}")
  119. generateAtomFeed(feedEntries, f"{outputdir}/{public.get('atom', 'atom.xml')}")
  120. generateTwtxtPage(twts, f"{outputdir}/{public.get('twtgmi', 'twtxt.gmi')}")
  121. generateTwtxtPage(twts, f"{outputdir}/{public.get('twtxt', 'twtxt.txt')}")
  122. print("20 text/gemini\r\nThank you for your submission! Antenna has now been updated.\n")