digg.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. """
  2. Digg (News, Social media)
  3. @website https://digg.com/
  4. @provide-api no
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content, publishedDate, thumbnail
  9. """
  10. from urllib import quote_plus
  11. from json import loads
  12. from lxml import html
  13. from cgi import escape
  14. from dateutil import parser
  # engine dependent config: result categories this engine reports under,
  # and whether it supports requesting further result pages
  categories = ['news', 'social media']
  paging = True

  # search-url: '{query}' and '{position}' are filled in by request()
  base_url = 'https://digg.com/'
  search_url = ''.join((
      base_url,
      'api/search/{query}.json?position={position}&format=html',
  ))

  # specific xpath variables, evaluated by response():
  # results_xpath is applied to the whole parsed document, the remaining
  # expressions are relative to a single matched result element
  results_xpath = '//article'
  link_xpath = './/small[@class="time"]//a'
  title_xpath = './/h2//a//text()'
  content_xpath = './/p//text()'
  pubdate_xpath = './/time'
  # do search-request
  def request(query, params):
      """Fill in the search URL for the requested result page.

      The 1-based ``params['pageno']`` is converted into the ``position``
      value placed in the URL, advancing by 10 per page.  The query is
      URL-quoted with ``quote_plus`` before being inserted.

      ``params`` is updated in place (its ``'url'`` entry is set) and the
      same dict is returned.
      """
      position = 10 * (params['pageno'] - 1)
      encoded_query = quote_plus(query)

      params['url'] = search_url.format(query=encoded_query,
                                        position=position)
      return params
  # get response from search-request
  def response(resp):
      """Turn the JSON reply of the search request into a list of results.

      ``resp.text`` is decoded as JSON.  The markup stored under its ``html``
      key is parsed, and every element matching ``results_xpath`` becomes one
      result dict with the keys ``url``, ``title``, ``content``, ``template``
      and ``thumbnail``.  ``publishedDate`` is added only when the article has
      a ``datetime`` attribute that ``dateutil`` can parse.

      An article without an image, without a timestamp, or with an unparsable
      timestamp is still returned (with an empty thumbnail and/or without
      ``publishedDate``) rather than aborting the whole page with an
      exception.

      Returns an empty list when the reply has no usable ``html`` payload.
      """
      results = []

      search_result = loads(resp.text)

      # nothing to parse when the payload is missing, empty or null
      if 'html' not in search_result or not search_result['html']:
          return results

      dom = html.fromstring(search_result['html'])

      # parse results
      for result in dom.xpath(results_xpath):
          url = result.attrib.get('data-contenturl')
          title = ''.join(result.xpath(title_xpath))
          content = escape(''.join(result.xpath(content_xpath)))

          # the thumbnail is optional: tolerate a missing <img> or 'src'
          images = result.xpath('.//img')
          thumbnail = (images[0].attrib.get('src') if images else None) or ''

          # http to https
          thumbnail = thumbnail.replace("http://static.digg.com",
                                        "https://static.digg.com")

          entry = {'url': url,
                   'title': title,
                   'content': content,
                   'template': 'videos.html',
                   'thumbnail': thumbnail}

          # the publication date is optional as well: only attach it when a
          # timestamp is present and dateutil manages to parse it
          time_nodes = result.xpath(pubdate_xpath)
          pubdate = time_nodes[0].attrib.get('datetime') if time_nodes else None
          if pubdate:
              try:
                  entry['publishedDate'] = parser.parse(pubdate)
              except (ValueError, OverflowError):
                  # malformed timestamp: keep the result, drop the date
                  pass

          # append result
          results.append(entry)

      # return results
      return results