|
@@ -1,17 +1,15 @@
|
|
|
#!/usr/bin/env python3
|
|
|
# vim: tabstop=4 shiftwidth=4 expandtab
|
|
|
|
|
|
-import urllib.parse
|
|
|
+import URLHelper
|
|
|
import re
|
|
|
import feedparser
|
|
|
import time
|
|
|
from datetime import datetime
|
|
|
-urllib.parse.uses_relative.append("gemini")
|
|
|
-urllib.parse.uses_netloc.append("gemini")
|
|
|
|
|
|
# collapse whitespace
|
|
|
def _cw(text):
|
|
|
- return re.sub(r'\s', ' ', text)
|
|
|
+ return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
def parsegemsub(feed, baseurl):
|
|
|
entries = []
|
|
@@ -23,6 +21,7 @@ def parsegemsub(feed, baseurl):
|
|
|
author = authorpatternmatch[0]
|
|
|
else:
|
|
|
return None
|
|
|
+ uh = URLHelper.URLHelper()
|
|
|
for entrypatternmatch in entriespatternmatches:
|
|
|
# Get our YYYY-MM-DD string, add time of day, parse to datetime.datetime, convert to unix timestamp and cast to int
|
|
|
try:
|
|
@@ -30,7 +29,7 @@ def parsegemsub(feed, baseurl):
|
|
|
except:
|
|
|
continue
|
|
|
# A gemsub feed can often have relative links, we'll have to absolutize them
|
|
|
- link = urllib.parse.urljoin(baseurl, entrypatternmatch[0]).replace('/..','').replace('/.','')
|
|
|
+ link = uh.resolve(baseurl, entrypatternmatch[0])
|
|
|
title = entrypatternmatch[2] if entrypatternmatch[2] else entrypatternmatch[1]
|
|
|
entries.append(FeedEntry(baseurl, author, updated, title, link))
|
|
|
return entries
|
|
@@ -56,9 +55,11 @@ def parsetwtxt(feed, baseurl):
|
|
|
return entries
|
|
|
|
|
|
def parsexml(feed, baseurl):
|
|
|
- scheme = baseurl.split("://")[0]
|
|
|
+ scheme = "gemini"
|
|
|
entries = []
|
|
|
parsedfeed = feedparser.parse(feed)
|
|
|
+ uh = URLHelper.URLHelper()
|
|
|
+ baseurl = uh.resolve(baseurl)
|
|
|
|
|
|
# Let's set author name, or lacking that use the feed title.
|
|
|
feedauthor = _cw(parsedfeed['feed']['author_detail']['name']) if parsedfeed['feed'].has_key('author_detail') and parsedfeed['feed']['author_detail'].has_key('name') else None
|
|
@@ -83,7 +84,9 @@ def parsexml(feed, baseurl):
|
|
|
link = _cw(entry['link'])
|
|
|
if not link:
|
|
|
continue
|
|
|
- link = urllib.parse.urljoin(baseurl, link).replace('/..','').replace('/.','')
|
|
|
+ link = uh.resolve(baseurl, link)
|
|
|
+ if not uh.getNetLoc(link) == uh.getNetLoc(baseurl):
|
|
|
+ continue
|
|
|
except:
|
|
|
continue
|
|
|
entries.append(FeedEntry(baseurl, author, updated, title, link))
|