digg.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 Digg (News, Social media)
"""
# pylint: disable=missing-function-docstring

from urllib.parse import urlencode
from datetime import datetime

from lxml import html

from searx.utils import eval_xpath, extract_text

# about
about = {
    "website": 'https://digg.com',
    "wikidata_id": 'Q270478',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['news', 'social media']
paging = True
base_url = 'https://digg.com'
results_per_page = 10

# search-url
search_url = base_url + (
    '/search'
    '?{query}'
    '&size={size}'
    '&offset={offset}'
)
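
# Illustration (not part of the original engine): with the defaults above and the
# offset logic in request() below, a first-page search for "climate" formats to
#
#   https://digg.com/search?q=climate&size=10&offset=1
#
# where 'q=climate' is produced by urlencode({'q': 'climate'}).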


def request(query, params):
    offset = (params['pageno'] - 1) * results_per_page + 1
    params['url'] = search_url.format(
        query=urlencode({'q': query}),
        size=results_per_page,
        offset=offset,
    )
    return params
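
# Usage sketch (an assumption, not SearXNG's actual call site): the engine is handed
# a dict-like `params` that already contains 'pageno'; request() only reads that key
# and writes 'url', leaving everything else untouched.
#
#   params = {'pageno': 2}
#   request('climate', params)
#   # params['url'] == 'https://digg.com/search?q=climate&size=10&offset=11'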


def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    results_list = eval_xpath(dom, '//section[contains(@class, "search-results")]')

    for result in results_list:
        titles = eval_xpath(result, '//article//header//h2')
        contents = eval_xpath(result, '//article//p')
        urls = eval_xpath(result, '//header/a/@href')
        published_dates = eval_xpath(result, '//article/div/div/time/@datetime')

        for (title, content, url, published_date) in zip(titles, contents, urls, published_dates):
            results.append({
                'url': url,
                'publishedDate': datetime.strptime(published_date, '%Y-%m-%dT%H:%M:%SZ'),
                'title': extract_text(title),
                'content': extract_text(content),
            })

    return results
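
# Offline sketch of response(): the markup below is an assumption reverse-engineered
# from the XPath expressions above, not actual Digg HTML, and it is kept as a comment
# so the engine module itself is unchanged. FakeResp mimics only the .text attribute
# that response() reads.
#
#   class FakeResp:
#       text = '''<section class="search-results">
#         <article>
#           <header><a href="https://digg.com/2021/example"><h2>Example title</h2></a></header>
#           <p>Teaser paragraph.</p>
#           <div><div><time datetime="2021-01-01T12:00:00Z"></time></div></div>
#         </article>
#       </section>'''
#
#   response(FakeResp())
#   # -> [{'url': 'https://digg.com/2021/example',
#   #      'publishedDate': datetime.datetime(2021, 1, 1, 12, 0),
#   #      'title': 'Example title',
#   #      'content': 'Teaser paragraph.'}]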