yahoo_news.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Yahoo (News)

Yahoo News is "English only" and does not offer localized or language-specific
queries.

"""
# pylint: disable=invalid-name

import re
from urllib.parse import urlencode
from datetime import datetime, timedelta
from dateutil import parser
from lxml import html

from searx.utils import (
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)

from searx.engines.yahoo import parse_url
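# parse_url() is shared with the Yahoo web engine (searx.engines.yahoo); it
# unwraps Yahoo's redirect / click-tracking links so results point at the
# target URL directly.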

# about
about = {
    "website": 'https://news.yahoo.com',
    "wikidata_id": 'Q3044717',
    "official_api_documentation": 'https://developer.yahoo.com/api/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

language_support = False
time_range_support = False
safesearch = False
paging = True
categories = ['news']

# search-url
search_url = (
    # fmt: off
    'https://news.search.yahoo.com/search'
    '?{query}&b={offset}'
    # fmt: on
)

AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
AGO_TIMEDELTA = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),
    'year': timedelta(days=365),
}
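# Illustrative example, grounded in the code above: a relative date such as
# "3 hours ago" matches AGO_RE as ('3', 'hour'), and the publication time is
# then approximated as datetime.now() - 3 * AGO_TIMEDELTA['hour'].  Months
# and years use fixed 30- and 365-day approximations.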


def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1
    params['url'] = search_url.format(offset=offset, query=urlencode({'p': query}))
    # searx injects a per-engine `logger` into each engine module at load
    # time, which is why it is not imported above.
    logger.debug("query_url --> %s", params['url'])
    return params
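# Illustrative example: for the query "solar power" on page 2, request()
# builds https://news.search.yahoo.com/search?p=solar+power&b=11, where `b`
# is the 1-based result offset, advancing 10 results per page.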


def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):

        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
        if url is None:
            continue
        url = parse_url(url)

        title = extract_text(result.xpath('.//h4/a'))
        content = extract_text(result.xpath('.//p'))
        # Yahoo lazy-loads thumbnails; the image URL is carried in the img
        # tag's data-src attribute rather than src.
        thumbnail = eval_xpath_getindex(result, './/img/@data-src', 0, None)

        item = {'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail}

        # publication date: either a relative date ("3 hours ago") handled by
        # AGO_RE, or an absolute date parsed by dateutil
        pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
        ago = AGO_RE.search(pub_date)
        if ago:
            number = int(ago.group(1))
            delta = AGO_TIMEDELTA[ago.group(2)]
            pub_date = datetime.now() - delta * number
        else:
            try:
                pub_date = parser.parse(pub_date)
            except parser.ParserError:
                pub_date = None

        if pub_date is not None:
            item['publishedDate'] = pub_date
        results.append(item)

    for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
        results.append({'suggestion': extract_text(suggestion)})

    return results
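

# A minimal smoke-test sketch, only run when this file is executed directly.
# Assumptions: the HTML fragment below merely mimics Yahoo's markup, and
# `_FakeResponse` is a hypothetical stand-in for the HTTP response object
# searx normally passes to response().
if __name__ == '__main__':

    class _FakeResponse:  # hypothetical helper for this sketch only
        text = (
            '<ol class="searchCenterMiddle"><li>'
            '<h4><a href="https://news.example.com/a">Example headline</a></h4>'
            '<p>Example snippet</p>'
            '<span class="s-time">2 hours ago</span>'
            '</li></ol>'
        )

    for item in response(_FakeResponse()):
        print(item)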