twitter.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. """
  2. Twitter (Social media)
  3. @website https://twitter.com/
  4. @provide-api yes (https://dev.twitter.com/docs/using-search)
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content, img_src, publishedDate
  9. @todo none (publishedDate is parsed when a tweet exposes a timestamp)
  10. """
  11. from lxml import html
  12. from datetime import datetime
  13. from searx.engines.xpath import extract_text
  14. from searx.url_utils import urlencode, urljoin
  15. # engine dependent config
  16. categories = ['social media']
  17. language_support = True
  18. # search-url
  19. base_url = 'https://twitter.com/'
  20. search_url = base_url + 'search?'
  21. # specific xpath variables
  22. results_xpath = '//li[@data-item-type="tweet"]'
  23. avatar_xpath = './/img[contains(@class, "avatar")]/@src'
  24. link_xpath = './/small[@class="time"]//a'
  25. title_xpath = './/span[contains(@class, "username")]'
  26. content_xpath = './/p[contains(@class, "tweet-text")]'
  27. timestamp_xpath = './/span[contains(@class,"_timestamp")]'
  28. # do search-request
  29. def request(query, params):
  30. params['url'] = search_url + urlencode({'q': query})
  31. # set language if specified
  32. if params['language'] != 'all':
  33. params['cookies']['lang'] = params['language'].split('-')[0]
  34. else:
  35. params['cookies']['lang'] = 'en'
  36. return params
  37. # get response from search-request
  38. def response(resp):
  39. results = []
  40. dom = html.fromstring(resp.text)
  41. # parse results
  42. for tweet in dom.xpath(results_xpath):
  43. try:
  44. link = tweet.xpath(link_xpath)[0]
  45. content = extract_text(tweet.xpath(content_xpath)[0])
  46. img_src = tweet.xpath(avatar_xpath)[0]
  47. img_src = img_src.replace('_bigger', '_normal')
  48. except Exception:
  49. continue
  50. url = urljoin(base_url, link.attrib.get('href'))
  51. title = extract_text(tweet.xpath(title_xpath))
  52. pubdate = tweet.xpath(timestamp_xpath)
  53. if len(pubdate) > 0:
  54. timestamp = float(pubdate[0].attrib.get('data-time'))
  55. publishedDate = datetime.fromtimestamp(timestamp, None)
  56. # append result
  57. results.append({'url': url,
  58. 'title': title,
  59. 'content': content,
  60. 'img_src': img_src,
  61. 'publishedDate': publishedDate})
  62. else:
  63. # append result
  64. results.append({'url': url,
  65. 'title': title,
  66. 'content': content,
  67. 'img_src': img_src})
  68. # return results
  69. return results