twitter2rss.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. #!/usr/bin/env python3.4
  2. # -*- coding: utf-8 -*-
  3. from html.parser import HTMLParser
  4. from sys import argv
  5. from threading import Thread
  6. from queue import Queue
  7. import datetime
  8. import PyRSS2Gen
  9. import hashlib
  10. import requests
  11. import sys
  12. import os
class twitterParser(HTMLParser):
    """HTMLParser subclass that collects tweet text (and retweet headers)
    from a Twitter profile page."""

    def __init__(self):
        """
        Initialize __init__ with some variables used later.

        declared variables:
        self.recording -- int depth counter; > 0 while inside a tag of interest
        self.data -- list containing the parsed HTML (one list per tweet)
        self.attributes -- list containing HTML tag's attributes
        self.tempData -- temporary list collecting the current tweet's chunks
        self.id -- 'p' or 'span'; which kind of tag opened the recording
        """
        HTMLParser.__init__(self, convert_charrefs=True)
        self.recording = 0
        self.data = []
        self.attributes = []
        self.tempData = []
        self.id = ''

    def handle_starttag(self, tag, attrs):
        """
        Identify when the tags of interest begin and start recording the data.

        return -- just a way to break out of the method early
        """
        self.tag = tag
        if self.tag not in ['p', 'span', 'img']:
            return
        elif self.recording:
            # Already recording: a nested tag of interest, track the depth
            # so the matching end tag does not stop recording too early.
            self.recording += 1
            return
        # Key to find the tweets and identify if they are retweets.
        # It is likely to change over time, as Twitter do the same
        for name, value in attrs:
            if self.tag == 'p' and name == 'class' \
                    and 'TweetTextSize TweetTextSize' in value:
                # Tweet body paragraph.
                self.recording += 1
                self.attributes += attrs
                self.id = 'p'
                break
            elif self.tag == 'span' and name == 'class' \
                    and 'js-retweet-text' in value:
                # "X retweeted" header span preceding a retweet.
                self.recording += 1
                self.attributes += attrs
                self.id = 'span'
                break
            elif self.tag == 'img' and name == 'class' and \
                    'Emoji Emoji--forText' in value:
                # Inline emoji image inside a tweet body.
                self.recording += 1
                self.attributes += attrs
                self.id = 'p'
                break
            else:
                return

    def handle_endtag(self, tag):
        """Identify when the tags of interest end and stop recording data."""
        self.tag = tag
        if tag == 'p' and self.recording:
            self.recording -= 1
        elif tag == 'span' and self.recording:
            self.recording -= 1
        elif tag == 'img' and self.id == 'p' and self.recording:
            self.recording -= 1

    def handle_data(self, data):
        """When recording, save the data; otherwise flush any buffered tweet."""
        if self.recording:
            self.tempData.append(data)
        elif self.tempData != []:
            # First data event after recording stopped: flush the buffer.
            if self.id == 'p':
                self.data.append(self.tempData)
                self.tempData = []
            elif self.id == 'span':
                # Random hash to identify retweets
                self.tempData += [' 59bcc3ad6775562f845953cf01624225']
                self.data.append(self.tempData)
                self.tempData = []

    def return_value(self):
        """
        Return all saved data.

        return -- list of lists of chopped strings, one inner list per tweet
        """
        return self.data
  93. def retrieve_html(url):
  94. """
  95. Retrieve HTML code from url.
  96. url -- string containing an url to be retrieved
  97. return -- string containing HTML code or nothing if there's an error
  98. """
  99. try:
  100. code = requests.get(url).text
  101. except:
  102. return
  103. return code
  104. def sanitize(tweet):
  105. """
  106. Sanitize data. Tweet is a list of chopped up strings that need to be
  107. reassembled. Also, it takes out some weird chars.
  108. tweet -- list containing chopped strings with the data
  109. return -- string containing sanitized tweet
  110. """
  111. final = ''
  112. counter = 0
  113. errors = ['…', '\xa0']
  114. for part in tweet:
  115. if part not in errors:
  116. try:
  117. if 'https://' in part:
  118. final += ' '
  119. elif 'http://' in part:
  120. final += ' '
  121. elif 'pic.twitter.com/' in part:
  122. final += ' '
  123. except:
  124. pass
  125. final += part
  126. counter += 1
  127. if final:
  128. return final
  129. def create_feed(user, feeds):
  130. """
  131. Create feed file.
  132. user -- string containing twitter's username
  133. feeds -- list containing tweets
  134. """
  135. user = user.strip()
  136. items = []
  137. for feed in feeds:
  138. i = 0
  139. limite = 5
  140. cuatro_primeras = ''
  141. split = feed.split()
  142. if len(split) <= 5:
  143. limite = len(split)
  144. for i in range(0, limite):
  145. cuatro_primeras += split[i] + ' '
  146. i += 1
  147. # GUID specified to improve feed readers reading
  148. guid = hashlib.sha1(cuatro_primeras.encode()).hexdigest()
  149. item = PyRSS2Gen.RSSItem(
  150. title='@' + user + ' says: ' + cuatro_primeras + '...',
  151. link='https://twitter.com/' + user,
  152. description=feed,
  153. guid=PyRSS2Gen.Guid(guid, isPermaLink=False)
  154. )
  155. items.append(item)
  156. rss = PyRSS2Gen.RSS2(
  157. title='@' + user + ' Twitter\'s feed.',
  158. link='https://twitter.com/' + user,
  159. description='@' + user + ' Twitter\'s feed.',
  160. lastBuildDate=datetime.datetime.now(),
  161. items=items
  162. )
  163. rss.write_xml(open("feeds/" + user + ".xml", "w"), encoding='utf-8')
  164. def slave():
  165. """
  166. It creates threads and executes the __main__ part according to the
  167. 'threads' variable defined in __main__.
  168. """
  169. while True:
  170. tweets = []
  171. user = repr(q.get())
  172. user = user.replace("\\n", "")
  173. user = user.replace("'", "")
  174. code = retrieve_html('https://twitter.com/' + user + '?lang=en')
  175. if code == "":
  176. q.task_done()
  177. break
  178. parser = twitterParser()
  179. parser.feed(code)
  180. data = parser.return_value()
  181. for tweet in data:
  182. tweet = sanitize(tweet)
  183. tweets.append(tweet)
  184. tweets = mark_as_retweet(tweets)
  185. create_feed(user, tweets)
  186. q.task_done()
  187. def mark_as_retweet(tweets):
  188. """
  189. Mark tweet as retweet seeking a concrete number.
  190. tweets -- list of strings containing sanitized tweets
  191. return -- list of strings maked as retweets with the '♻' symbol
  192. """
  193. coincidence = []
  194. for num in enumerate(tweets):
  195. if '59bcc3ad6775562f845953cf01624225' in num[1]:
  196. coincidence.append(num[0])
  197. for coinc in coincidence:
  198. if coinc < len(tweets):
  199. tweets[coinc+1] = '♻' + tweets[coinc+1]
  200. for coinc in reversed(coincidence):
  201. tweets.pop(coinc)
  202. return tweets
  203. if __name__ == "__main__":
  204. # This variable can be modified
  205. threads = 2
  206. q = Queue()
  207. if not os.path.exists('feeds'):
  208. os.mkdir(("feeds"))
  209. for i in range(threads):
  210. t = Thread(target=slave)
  211. t.daemon = True
  212. t.start()
  213. if len(argv) == 2:
  214. user = argv[1]
  215. q.put(user)
  216. # block the end of the program until all threads are finished
  217. q.join()
  218. else:
  219. try:
  220. feed_file = open('twitter_users', 'r')
  221. except:
  222. print("The file twitter_users does not exist."
  223. " You must create it to continue.")
  224. sys.exit()
  225. feed = feed_file.readlines()
  226. feed_file.close()
  227. for user in feed:
  228. q.put(user)
  229. # block the end of the program until all threads are finished
  230. q.join()