# yandex.py (4.1 KB)
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """Yandex (Web, images)"""
  3. from json import loads
  4. from urllib.parse import urlencode
  5. from html import unescape
  6. from lxml import html
  7. from searx.exceptions import SearxEngineCaptchaException
  8. from searx.utils import humanize_bytes, eval_xpath, eval_xpath_list, extract_text, extr
  9. # Engine metadata
  10. about = {
  11. "website": 'https://yandex.com/',
  12. "wikidata_id": 'Q5281',
  13. "official_api_documentation": "?",
  14. "use_official_api": False,
  15. "require_api_key": False,
  16. "results": 'HTML',
  17. }
  18. # Engine configuration
  19. categories = []
  20. paging = True
  21. search_type = ""
  22. # Search URL
  23. base_url_web = 'https://yandex.com/search/site/'
  24. base_url_images = 'https://yandex.com/images/search'
  25. results_xpath = '//li[contains(@class, "serp-item")]'
  26. url_xpath = './/a[@class="b-serp-item__title-link"]/@href'
  27. title_xpath = './/h3[@class="b-serp-item__title"]/a[@class="b-serp-item__title-link"]/span'
  28. content_xpath = './/div[@class="b-serp-item__content"]//div[@class="b-serp-item__text"]'
  29. def catch_bad_response(resp):
  30. if resp.url.path.startswith('/showcaptcha'):
  31. raise SearxEngineCaptchaException()
  32. def request(query, params):
  33. query_params_web = {
  34. "tmpl_version": "releases",
  35. "text": query,
  36. "web": "1",
  37. "frame": "1",
  38. "searchid": "3131712",
  39. }
  40. query_params_images = {
  41. "text": query,
  42. "uinfo": "sw-1920-sh-1080-ww-1125-wh-999",
  43. }
  44. if params['pageno'] > 1:
  45. query_params_web.update({"p": params["pageno"] - 1})
  46. query_params_images.update({"p": params["pageno"] - 1})
  47. params["cookies"] = {'cookie': "yp=1716337604.sp.family%3A0#1685406411.szm.1:1920x1080:1920x999"}
  48. if search_type == 'web':
  49. params['url'] = f"{base_url_web}?{urlencode(query_params_web)}"
  50. elif search_type == 'images':
  51. params['url'] = f"{base_url_images}?{urlencode(query_params_images)}"
  52. return params
  53. def response(resp):
  54. if search_type == 'web':
  55. catch_bad_response(resp)
  56. dom = html.fromstring(resp.text)
  57. results = []
  58. for result in eval_xpath_list(dom, results_xpath):
  59. results.append(
  60. {
  61. 'url': extract_text(eval_xpath(result, url_xpath)),
  62. 'title': extract_text(eval_xpath(result, title_xpath)),
  63. 'content': extract_text(eval_xpath(result, content_xpath)),
  64. }
  65. )
  66. return results
  67. if search_type == 'images':
  68. catch_bad_response(resp)
  69. html_data = html.fromstring(resp.text)
  70. html_sample = unescape(html.tostring(html_data, encoding='unicode'))
  71. content_between_tags = extr(
  72. html_sample, '{"location":"/images/search/', 'advRsyaSearchColumn":null}}', default="fail"
  73. )
  74. json_data = '{"location":"/images/search/' + content_between_tags + 'advRsyaSearchColumn":null}}'
  75. if content_between_tags == "fail":
  76. content_between_tags = extr(html_sample, '{"location":"/images/search/', 'false}}}')
  77. json_data = '{"location":"/images/search/' + content_between_tags + 'false}}}'
  78. json_resp = loads(json_data)
  79. results = []
  80. for _, item_data in json_resp['initialState']['serpList']['items']['entities'].items():
  81. title = item_data['snippet']['title']
  82. source = item_data['snippet']['url']
  83. thumb = item_data['image']
  84. fullsize_image = item_data['viewerData']['dups'][0]['url']
  85. height = item_data['viewerData']['dups'][0]['h']
  86. width = item_data['viewerData']['dups'][0]['w']
  87. filesize = item_data['viewerData']['dups'][0]['fileSizeInBytes']
  88. humanized_filesize = humanize_bytes(filesize)
  89. results.append(
  90. {
  91. 'title': title,
  92. 'url': source,
  93. 'img_src': fullsize_image,
  94. 'filesize': humanized_filesize,
  95. 'thumbnail_src': thumb,
  96. 'template': 'images.html',
  97. 'resolution': f'{width} x {height}',
  98. }
  99. )
  100. return results
  101. return []