yahoo_news.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """Yahoo (News)
Yahoo News is "English only" and does not offer localized or language queries.
  4. """
  5. # pylint: disable=invalid-name, missing-function-docstring
  6. import re
  7. from urllib.parse import urlencode
  8. from datetime import datetime, timedelta
  9. from dateutil import parser
  10. from lxml import html
  11. from searx import logger
  12. from searx.utils import (
  13. eval_xpath_list,
  14. eval_xpath_getindex,
  15. extract_text,
  16. )
  17. from searx.engines.yahoo import parse_url
  18. logger = logger.getChild('yahoo_news engine')
  19. # about
  20. about = {
  21. "website": 'https://news.yahoo.com',
  22. "wikidata_id": 'Q3044717',
  23. "official_api_documentation": 'https://developer.yahoo.com/api/',
  24. "use_official_api": False,
  25. "require_api_key": False,
  26. "results": 'HTML',
  27. }
  28. language_support = False
  29. time_range_support = False
  30. safesearch = False
  31. paging = True
  32. categories = ['news']
  33. # search-url
  34. search_url = (
  35. 'https://news.search.yahoo.com/search'
  36. '?{query}&b={offset}'
  37. )
  38. AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
  39. AGO_TIMEDELTA = {
  40. 'minute': timedelta(minutes=1),
  41. 'hour': timedelta(hours=1),
  42. 'day': timedelta(days=1),
  43. 'week': timedelta(days=7),
  44. 'month': timedelta(days=30),
  45. 'year': timedelta(days=365),
  46. }
  47. def request(query, params):
  48. offset = (params['pageno'] - 1) * 10 + 1
  49. params['url'] = search_url.format(
  50. offset = offset,
  51. query = urlencode({'p': query})
  52. )
  53. logger.debug("query_url --> %s", params['url'])
  54. return params
  55. def response(resp):
  56. results = []
  57. dom = html.fromstring(resp.text)
  58. # parse results
  59. for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
  60. url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
  61. if url is None:
  62. continue
  63. url = parse_url(url)
  64. title = extract_text(result.xpath('.//h4/a'))
  65. content = extract_text(result.xpath('.//p'))
  66. img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)
  67. item = {
  68. 'url': url,
  69. 'title': title,
  70. 'content': content,
  71. 'img_src' : img_src
  72. }
  73. pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
  74. ago = AGO_RE.search(pub_date)
  75. if ago:
  76. number = int(ago.group(1))
  77. delta = AGO_TIMEDELTA[ago.group(2)]
  78. pub_date = datetime.now() - delta * number
  79. else:
  80. try:
  81. pub_date = parser.parse(pub_date)
  82. except parser.ParserError:
  83. pub_date = None
  84. if pub_date is not None:
  85. item['publishedDate'] = pub_date
  86. results.append(item)
  87. for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
  88. results.append({'suggestion': extract_text(suggestion)})
  89. return results