gnusrss.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. import configparser
  4. import urllib.parse
  5. import requests
  6. import os.path
  7. import sqlite3
  8. import feedparser
  9. import argparse
  10. import hashlib
  11. import time
  12. from os import listdir
  13. from sys import argv
  14. from xml.dom import minidom
  15. from io import BytesIO
  16. from html.parser import HTMLParser
  17. from re import findall
  18. from sys import exit
  19. class Database:
  20. """Manage the database."""
  21. def __init__(self, database='gnusrss.db'):
  22. """
  23. Connect to the database.
  24. database -- string containig the filepath of the db
  25. (default: gnusrss.db)
  26. """
  27. self.connection = sqlite3.connect(database)
  28. def create_tables(self):
  29. """Create table and columns."""
  30. current = self.connection.cursor()
  31. current.execute('DROP TABLE IF EXISTS items')
  32. current.execute('CREATE TABLE items(id INTEGER PRIMARY KEY,'
  33. 'feed TEXT, post TEXT, posted INTEGER, url '
  34. 'TEXT, lastbuild TIMESTAMP, guid TEXT)')
  35. def insert_data(self, param):
  36. """
  37. Insert all the article's information to the table.
  38. Keyword arguments:
  39. param -- list containing all the values
  40. """
  41. self.connection.execute('INSERT INTO items(feed, post, posted'
  42. ', url, lastbuild, guid) VALUES(?, ?,'
  43. '?, ?, ?, ?)', (param))
  44. self.connection.commit()
  45. def select(self, param):
  46. """
  47. Return a select.
  48. Keyword arguments:
  49. param -- string containing a sql select
  50. """
  51. current = self.connection.cursor()
  52. current.execute(param)
  53. rows = current.fetchall()
  54. return rows
  55. def close(self):
  56. """Close the database."""
  57. self.connection.close()
  58. class StupidParser(HTMLParser):
  59. """Just a HTML parser."""
  60. def __init__(self):
  61. try:
  62. HTMLParser.__init__(self, convert_charrefs=True)
  63. except:
  64. # python 3.2 support
  65. HTMLParser.__init__(self)
  66. self.data = []
  67. def handle_data(self, data):
  68. self.data.append(data)
  69. def return_value(self):
  70. return ''.join(self.data)
  71. class GNUsrss:
  72. def parse_feed(self, feed, post_format):
  73. """
  74. Request the feed, parse it and return requested values on a list
  75. of lists.
  76. Keyword arguments:
  77. feed -- string containing the url or the filepath of the feed
  78. post_format -- string containing RSS keywords surrounded by {}
  79. Comment:
  80. Here it's saved way more tags that aren't necessary. They're added just
  81. to add more metadata just because it's clearer when viewing the sqlite.
  82. """
  83. article = []
  84. xml = feedparser.parse(feed)
  85. entries_keys = list(xml.entries[0].keys())
  86. feed_keys = list(xml.feed.keys())
  87. # Very ugly way to test existence, but seems to be the only way
  88. if 'published' in entries_keys:
  89. lastbuild = xml.entries[0].published
  90. elif 'published' in feed_keys:
  91. lastbuild = xml.feed.published
  92. elif 'updated' in entries_keys:
  93. lastbuild = xml.entries[0].updated
  94. elif 'updated' in feed_keys:
  95. lastbuild = xml.feed.updated
  96. else:
  97. # Since the feed doesn't have a date, I'll create it
  98. lastbuild = time.strftime("%a, %d %b %Y %H:%M:%S GMT")
  99. if 'link' in feed_keys:
  100. rss_link = xml.feed.link
  101. else:
  102. rss_link = 'http://' + xml.entries[0].link.split('/')[2]
  103. for item in xml['items']:
  104. values = {}
  105. for i in entries_keys:
  106. if i in post_format:
  107. values[i] = item[i]
  108. post = post_format.format(**values)
  109. # Stupid HTML code adding to complete the post to parse it
  110. post = '<html>' + post + '</html>'
  111. parser = StupidParser()
  112. parser.feed(post)
  113. post = parser.return_value()
  114. if 'guid' in entries_keys:
  115. guid = item['guid']
  116. else:
  117. # Since the feed doesn't have a guid, I'll create it
  118. guid = hashlib.sha1(post.encode()).hexdigest()
  119. article.append([rss_link, post, item['link'], lastbuild, guid])
  120. return article
  121. def post(self, article, gs_node, username, password, insecure):
  122. """
  123. Post the articles to GNU Social.
  124. Keyword arguments:
  125. article -- list containing a most of what is necessary on the insert
  126. gs_node -- string containing the url of the GNU Social node
  127. username -- string containing the user of GNU Social
  128. password -- string containing the password of GNU Social
  129. """
  130. msg = article[1].split()
  131. api = (gs_node + '/api/statuses/update.xml')
  132. # Check for twitter images and call post_image if required
  133. for word in msg:
  134. if 'pic.twitter.com/' in word:
  135. image = self.post_image(word, gs_node, username, password, insecure)
  136. if image is not None:
  137. index = msg.index(word)
  138. msg[index] = image
  139. else:
  140. pass
  141. msg = ' '.join(msg)
  142. post_data = {'status': msg, 'source': 'gnusrss'}
  143. if insecure == 'yes':
  144. req = requests.post(api, auth=(username, password), data=post_data,
  145. verify=False)
  146. else:
  147. req = requests.post(api, auth=(username, password), data=post_data)
  148. response = req.status_code
  149. return response
  150. def post_image(self, picture, gs_node, username, password, insecure):
  151. """
  152. Upload a picture to GNU Social hosting and return a string with the
  153. new url.
  154. Keyword arguments:
  155. picture -- string containing the twitter url of a picture
  156. gs_node -- string containing the url of the GNU Social node
  157. username -- string containing the user of GNU Social
  158. password -- string containing the password of GNU Social
  159. """
  160. pic = ""
  161. found = False
  162. api = gs_node + '/api/statusnet/media/upload'
  163. # If the picture doesn't exist or is not well written, show must go on
  164. try:
  165. html = urllib.request.urlopen('https://' + picture).read().decode(
  166. 'utf-8').splitlines()
  167. except:
  168. return picture
  169. # Search the hardcoded tag name of the picture
  170. for part in html:
  171. if picture in part:
  172. found = True
  173. if 'data-image-url' in part and found is True:
  174. pic = part.split('"')[1]
  175. break
  176. # If there's a video instead of a picture, just exit
  177. if not pic:
  178. return None
  179. req = requests.get(pic)
  180. pic = req.content
  181. img = {'media': ('useless.jpg', pic)}
  182. if insecure == 'yes':
  183. req = requests.post(api, auth=(username, password), verify=False,
  184. files=img)
  185. else:
  186. req = requests.post(api, auth=(username, password), files=img)
  187. buffer = req.content
  188. xmldoc = minidom.parseString(buffer)
  189. item = xmldoc.getElementsByTagName('rsp')
  190. url = item.item(0).getElementsByTagName('mediaurl')[0].firstChild.data
  191. return url
  192. def compare(self, feeds):
  193. """
  194. Compare the picked feed to the saved on the database and return
  195. list of lists if new.
  196. Keyword argument:
  197. feeds -- list of lists containing all actual feeds on the RSS file
  198. """
  199. db = Database()
  200. old = db.select('select guid from items;')
  201. new_feed = []
  202. posted = []
  203. # make the list accesible
  204. for x in old:
  205. posted.append(x[0])
  206. for feed in feeds:
  207. if feed[4] not in posted:
  208. new_feed.append(feed)
  209. db.close()
  210. return new_feed
  211. def shortener(self, post):
  212. """
  213. Return a shortened url.
  214. Keyword argument:
  215. post -- string containing a url to be shortened
  216. """
  217. api = ('http://qttr.at/yourls-api.php?format=xml&action=shorturl'
  218. '&signature=b6afeec983&url=' + post)
  219. req = requests.post(api)
  220. buffer = req.content
  221. xmldoc = minidom.parseString(buffer)
  222. item = xmldoc.getElementsByTagName('result')
  223. url = item.item(0).getElementsByTagName('shorturl')[0].firstChild.data
  224. return url
  225. def shorten_all(self, post):
  226. """
  227. Short all the urls from a notice.
  228. Keyword arguments:
  229. post - list containing all the data related to the post to GS
  230. """
  231. # Regex taken from stackoverflow, thanks guys
  232. # It doesn't identify pic.twitter.com url, which is good
  233. urls = findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&~#=+]|[!*\(\),]'
  234. '|(?:%[0-9a-fA-F][0-9a-fA-F]))+', post[1])
  235. separate = post[1].split(' ')
  236. # Clean shitty carriage return
  237. tmp = []
  238. for i in separate:
  239. i = i.replace('\n', ' ')
  240. tmp.append(i)
  241. separate = tmp
  242. for i in urls:
  243. shortened = self.shortener(i)
  244. position = separate.index(i)
  245. separate[position] = shortened
  246. post[1] = ' '.join(separate)
  247. return post
  248. class Config:
  249. def create(self, config_name):
  250. """
  251. Create config file.
  252. Keyword argument:
  253. config_name -- string containing the config's name to be created
  254. """
  255. print('Hi! Now we\'ll create de config file!')
  256. feed = input('Please introduce the feed\'s url: ')
  257. username = input('Please introduce your username '
  258. '(user@server.com): ')
  259. password = input('Please introduce your password: ')
  260. shorten = input('Do you need to shorten the urls that you '
  261. 'post? Please take in account \nthat you '
  262. 'should only use it if your node only has 140'
  263. ' characters. \nAnswer with "yes" or just press '
  264. 'enter if you don\'t want to use it: ')
  265. fallback_feed = input('Please introduce your feed\'s fallback'
  266. 'url. If you don\'t want or have one,\n'
  267. 'just press enter: ')
  268. print('Now we\'re going to fetch the feed. Please wait...')
  269. feed_file = feedparser.parse(feed)
  270. keys = list(feed_file.entries[0].keys())
  271. print('Done! The tags are: ')
  272. for tag in keys:
  273. print('\t' + tag)
  274. post_format = input('The XML has been parsed. Choose wich '
  275. 'format you want:\nPlease put the tags '
  276. 'inside the square brackets\nEx: {title}'
  277. ' - {link} by @{author}: ')
  278. insecure = input('Do you want to allow insecure connection to your GNU '
  279. 'social server?\nAnswer with "yes" or just press '
  280. 'enter if you don\'t want to use it: ')
  281. config = configparser.ConfigParser()
  282. config['feeds'] = {}
  283. config['feeds']['feed'] = feed
  284. config['feeds']['user'] = username
  285. config['feeds']['password'] = password
  286. config['feeds']['shorten'] = shorten
  287. config['feeds']['fallback_feed'] = fallback_feed
  288. config['feeds']['format'] = post_format
  289. config['feeds']['insecure'] = insecure
  290. with open(config_name + '.ini', 'w') as configfile:
  291. config.write(configfile)
  292. def get(self, name):
  293. """
  294. Parse config file and return it on a list.
  295. Keyword arguments:
  296. name -- string containing the config's name
  297. """
  298. config = []
  299. parser = configparser.SafeConfigParser()
  300. parser.read(name)
  301. for name, value in parser.items('feeds'):
  302. config.append(value)
  303. return config
  304. class ParseOptions():
  305. """Parse command line options of this program."""
  306. def __init__(self):
  307. parser = argparse.ArgumentParser(description='Post feeds to GNU '
  308. 'Social', prog='gnusrss')
  309. parser.add_argument('-c', '--create-config', metavar='file_name',
  310. dest='create_config', help='creates a config file')
  311. parser.add_argument('-C', '--create-db', dest='create_database',
  312. action='store_true', help='creates the database')
  313. parser.add_argument('-p', '--post', metavar='config_file',
  314. dest='post', help='posts feeds')
  315. parser.add_argument('-P', '--post-all', dest='post_all',
  316. action='store_true', help='posts all feeds')
  317. parser.add_argument('-k', '--populate-database', metavar='file_name',
  318. dest='populate_database', help='fetch the RSS and'
  319. ' save it in the database')
  320. parser.add_argument('-v', '--version', dest='version',
  321. action='store_true', help='show version in the '
  322. 'database')
  323. self.db = Database()
  324. self.gs = GNUsrss()
  325. self.cnf = Config()
  326. self.args = parser.parse_args()
  327. # Make all options accesible within self
  328. self.create_database = self.args.create_database
  329. self.create_config = self.args.create_config
  330. self.post = self.args.post
  331. self.post_all = self.args.post_all
  332. self.populate_database = self.args.populate_database
  333. self.version = self.args.version
  334. self.parser = parser
  335. def declare_config(self):
  336. """Assign all config parameters to a self object."""
  337. config = self.cnf.get(self.config_name)
  338. self.feed = config[0]
  339. self.user = config[1].split('@')[0]
  340. self.password = config[2]
  341. self.shorten = config[3]
  342. self.fallback_feed = config[4]
  343. self.format = config[5]
  344. # Always use SSL
  345. self.server = 'https://' + config[1].split('@')[1]
  346. # Test since in versions previous to 0.2.2 didn't exist
  347. try:
  348. self.insecure = config[6]
  349. except:
  350. self.insecure = ''
  351. def post_notice(self):
  352. """Post notice to GNU social."""
  353. file_name = self.config_name
  354. # If first feed and fallback feed aren't available, fail gracefully
  355. try:
  356. posts = self.gs.parse_feed(self.feed, self.format)
  357. except Exception as e:
  358. print(e)
  359. if self.fallback_feed:
  360. posts = self.gs.parse_feed(self.fallback_feed, self.format)
  361. else:
  362. print('There\'s been a problem with ' + file_name + ' file.')
  363. return None
  364. posts = list(reversed(posts))
  365. new = self.gs.compare(posts)
  366. if new:
  367. # Post only the older item
  368. self.to_post = new[0]
  369. if self.shorten == 'yes':
  370. self.to_post = self.gs.shorten_all(self.to_post)
  371. if not self.populate_database:
  372. code = self.gs.post(self.to_post, self.server, self.user,
  373. self.password, self.insecure)
  374. self.save_in_database(code)
  375. def save_in_database(self, code):
  376. """
  377. Save posts in database
  378. Keyword arguments:
  379. code -- HTML code of the notice's post to GNU social
  380. """
  381. if self.create_config or self.populate_database or int(code) == \
  382. int(200):
  383. self.db.insert_data([self.to_post[0], self.to_post[1], 1,
  384. self.to_post[2], self.to_post[3],
  385. self.to_post[4]])
  386. elif code == 400:
  387. print('The notice couldn\'t be posted')
  388. def pointers(self):
  389. """This are the options of the program."""
  390. if self.version:
  391. print("v0.2.2.3")
  392. exit()
  393. if self.create_database:
  394. if os.path.exists('gnusrss.db'):
  395. overwrite = input('The database already exists. Are you '
  396. 'sure you want to overwrite it? (y/n) ')
  397. if overwrite == 'y':
  398. self.db.create_tables()
  399. else:
  400. self.db.create_tables()
  401. if not self.create_config and not self.populate_database and \
  402. not self.post and not self.post_all:
  403. self.db.close()
  404. if self.create_config:
  405. self.config_name = self.create_config + '.ini'
  406. self.cnf.create(self.create_config)
  407. populate = input('Do you want to populate the database? (y) Or you'
  408. ' prefer to post old items? (n) ')
  409. if populate == 'y':
  410. self.declare_config()
  411. posts = self.gs.parse_feed(self.feed, self.format)
  412. for post in posts:
  413. self.to_post = post
  414. self.save_in_database(0)
  415. self.db.close()
  416. elif self.post:
  417. self.config_name = self.post
  418. self.declare_config()
  419. self.post_notice()
  420. self.db.close()
  421. elif self.post_all:
  422. for config in listdir('.'):
  423. if config.endswith('.ini'):
  424. self.config_name = config
  425. self.declare_config()
  426. self.post_notice()
  427. self.db.close()
  428. elif self.populate_database:
  429. self.config_name = self.populate_database
  430. self.declare_config()
  431. posts = self.gs.parse_feed(self.feed, self.format)
  432. for post in posts:
  433. self.to_post = post
  434. self.save_in_database(0)
  435. self.db.close()
  436. elif len(argv) == 1:
  437. self.parser.print_help()
  438. if __name__ == "__main__":
  439. options = ParseOptions()
  440. options.pointers()