# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4
# Akregator archive to newsbeuter sqlite3 converter
import os.path
import sqlite3 as sq3
import sys
import urlparse as up
from ConfigParser import ConfigParser

import metakit as mk
# Running count of imported articles across all feeds; also used as the
# explicit rss_item primary key.
idxarticle = 0

# Converter settings come from export.cfg:
#   [settings] akregator = directory holding Akregator's .mk4 archives
#   [settings] output    = newsbeuter sqlite3 cache file to create
cparse = ConfigParser()
cparse.read("export.cfg")
path = cparse.get("settings", "akregator")
sq3file = cparse.get("settings", "output")
# os.path.join tolerates a configured path with or without a trailing
# separator; plain concatenation silently built a wrong filename when
# the separator was missing.
indexfile = os.path.join(path, "archiveindex.mk4")
def readfeed(file, feedurl):
    """Copy every article from one per-feed Metakit archive into sqlite3.

    file    -- path of the feed's .mk4 archive
    feedurl -- feed URL, stored as rss_item.feedurl on each article
    """
    global idxarticle
    storage = mk.storage(file, 0)
    articles = storage.getas(storage.description())
    for article in articles:
        writedb_article(article, feedurl)
        # Counted for every archived row, including ones the writer skips.
        idxarticle += 1
def getfeed(feed):
    """Import one feed listed in the Akregator index.

    Writes the rss_feed row, derives the feed's on-disk Metakit archive
    name from its URL, and imports all of its articles.
    """
    global idxarticle
    sys.stdout.write("Reading " + feed.url + ": ")
    writedb_feed(feed)
    # Akregator names each per-feed archive after the feed URL with ':'
    # and '/' replaced by '_'.  (An unused urlparse() call was removed,
    # and the local no longer shadows the `file` builtin.)
    archive = feed.url.replace(":", "_").replace("/", "_")
    # join() works whether or not the configured path has a trailing '/'.
    archive = os.path.join(path, archive + ".mk4")
    readfeed(archive, feed.url)
    # Progress: cumulative article count after this feed.
    print(idxarticle)
def initdb():
    """Drop any previous newsbeuter tables and create fresh, empty ones."""
    print("Deleting old tables")
    # One-shot bulk import: durability is not needed, so skip fsyncs.
    sqdb.execute("PRAGMA synchronous = OFF")
    for table in ("google_replay", "rss_feed", "rss_item"):
        sqdb.execute("drop table if exists " + table)
    print("Creating new tables")
    # Schemas copied verbatim from newsbeuter's own cache layout.
    sqdb.execute("CREATE TABLE google_replay ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, guid VARCHAR(64) NOT NULL, state INTEGER NOT NULL, ts INTEGER NOT NULL )")
    sqdb.execute('CREATE TABLE rss_feed ( rssurl VARCHAR(1024) PRIMARY KEY NOT NULL, url VARCHAR(1024) NOT NULL, title VARCHAR(1024) NOT NULL , lastmodified INTEGER(11) NOT NULL DEFAULT 0, is_rtl INTEGER(1) NOT NULL DEFAULT 0, etag VARCHAR(128) NOT NULL DEFAULT "" )')
    sqdb.execute('CREATE TABLE rss_item ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, guid VARCHAR(64) NOT NULL, title VARCHAR(1024) NOT NULL, author VARCHAR(1024) NOT NULL, url VARCHAR(1024) NOT NULL, feedurl VARCHAR(1024) NOT NULL, pubDate INTEGER NOT NULL, content VARCHAR(65535) NOT NULL, unread INTEGER(1) NOT NULL , enclosure_url VARCHAR(1024), enclosure_type VARCHAR(1024), enqueued INTEGER(1) NOT NULL DEFAULT 0, flags VARCHAR(52), deleted INTEGER(1) NOT NULL DEFAULT 0, base VARCHAR(128) NOT NULL DEFAULT "")')
def writedb_feed(feed):
    """Insert one rss_feed row for this feed."""
    parsed = up.urlparse(feed.url)
    # The mk4 archive keeps no feed title, so fall back to the hostname;
    # the site root serves as the feed's link URL.
    site = parsed.scheme + "://" + parsed.netloc
    row = (feed.url, site, feed.lastfetch, parsed.netloc)
    sqdb.execute("insert into rss_feed (rssurl, url, lastmodified, title) values(?, ?, ?, ?)", row)
def writedb_article(article, feedurl):
    """Insert one rss_item row; articles with an empty title are skipped."""
    global idxarticle
    if not len(article.title):
        print("Skipped " + article.guid + " with empty title")
        return
    # Prefer the archived full text over the summary when present.
    content = article.content if len(article.content) else article.description
    row = (
        idxarticle,
        article.guid,
        article.title.decode("utf-8"),
        article.authorName.decode("utf-8"),
        article.link,
        feedurl,
        article.pubDate,
        content.decode("utf-8"),
        0,  # unread flag: imported articles start as read
    )
    sqdb.execute("insert into rss_item (id, guid, title, author, url, feedurl, pubDate, content, unread) values (?,?,?,?,?,?,?,?,?)", row)
# --- main conversion driver ---------------------------------------------
print("Starting conversion:")
# Open Akregator's master index; each row describes one subscribed feed.
db = mk.storage(indexfile, 0)
vw = db.getas(db.description())
print("Opening up database files")
sqdb = sq3.connect(sq3file)
initdb()
for feedrow in vw:
    getfeed(feedrow)
print("Imported " + str(idxarticle) + " articles")
print("Creating indexes:")
# Same indexes newsbeuter itself maintains on its cache.
for ddl in (
    "CREATE INDEX idx_deleted ON rss_item(deleted)",
    "CREATE INDEX idx_feedurl ON rss_item(feedurl)",
    "CREATE INDEX idx_guid ON rss_item(guid)",
    "CREATE INDEX idx_lastmodified ON rss_feed(lastmodified)",
    "CREATE INDEX idx_rssurl ON rss_feed(rssurl)",
):
    sqdb.execute(ddl)
sqdb.execute("vacuum")
print("Vacuuming complete, now closing database.")
sqdb.commit()
sqdb.close()
print("Finished!")