qwant.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. Qwant (Web, Images, News, Social)
  4. """
  5. from datetime import datetime
  6. from json import loads
  7. from urllib.parse import urlencode
  8. from searx.utils import html_to_text, match_language
  9. from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
  10. from searx.raise_for_httperror import raise_for_httperror
  11. # about
  12. about = {
  13. "website": 'https://www.qwant.com/',
  14. "wikidata_id": 'Q14657870',
  15. "official_api_documentation": None,
  16. "use_official_api": True,
  17. "require_api_key": False,
  18. "results": 'JSON',
  19. }
  20. # engine dependent config
  21. categories = []
  22. paging = True
  23. supported_languages_url = 'https://qwant.com/region'
  24. category_to_keyword = {'general': 'web',
  25. 'images': 'images',
  26. 'news': 'news'}
  27. # search-url
  28. url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
  29. # do search-request
  30. def request(query, params):
  31. offset = (params['pageno'] - 1) * 10
  32. if categories[0] and categories[0] in category_to_keyword:
  33. params['url'] = url.format(keyword=category_to_keyword[categories[0]],
  34. query=urlencode({'q': query}),
  35. offset=offset)
  36. else:
  37. params['url'] = url.format(keyword='web',
  38. query=urlencode({'q': query}),
  39. offset=offset)
  40. # add language tag
  41. if params['language'] != 'all':
  42. language = match_language(params['language'], supported_languages, language_aliases)
  43. params['url'] += '&locale=' + language.replace('-', '_').lower()
  44. params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
  45. params['raise_for_httperror'] = False
  46. return params
  47. # get response from search-request
  48. def response(resp):
  49. results = []
  50. # According to https://www.qwant.com/js/app.js
  51. if resp.status_code == 429:
  52. raise SearxEngineCaptchaException()
  53. # raise for other errors
  54. raise_for_httperror(resp)
  55. # load JSON result
  56. search_results = loads(resp.text)
  57. # check for an API error
  58. if search_results.get('status') != 'success':
  59. raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))
  60. # return empty array if there are no results
  61. if 'data' not in search_results:
  62. return []
  63. data = search_results.get('data', {})
  64. res = data.get('result', {})
  65. # parse results
  66. for result in res.get('items', {}):
  67. title = html_to_text(result['title'])
  68. res_url = result['url']
  69. content = html_to_text(result['desc'])
  70. if category_to_keyword.get(categories[0], '') == 'web':
  71. results.append({'title': title,
  72. 'content': content,
  73. 'url': res_url})
  74. elif category_to_keyword.get(categories[0], '') == 'images':
  75. thumbnail_src = result['thumbnail']
  76. img_src = result['media']
  77. results.append({'template': 'images.html',
  78. 'url': res_url,
  79. 'title': title,
  80. 'content': '',
  81. 'thumbnail_src': thumbnail_src,
  82. 'img_src': img_src})
  83. elif category_to_keyword.get(categories[0], '') == 'news':
  84. published_date = datetime.fromtimestamp(result['date'], None)
  85. media = result.get('media', [])
  86. if len(media) > 0:
  87. img_src = media[0].get('pict', {}).get('url', None)
  88. else:
  89. img_src = None
  90. results.append({'url': res_url,
  91. 'title': title,
  92. 'publishedDate': published_date,
  93. 'content': content,
  94. 'img_src': img_src})
  95. return results
  96. # get supported languages from their site
  97. def _fetch_supported_languages(resp):
  98. # list of regions is embedded in page as a js object
  99. response_text = resp.text
  100. response_text = response_text[response_text.find('regionalisation'):]
  101. response_text = response_text[response_text.find('{'):response_text.find(');')]
  102. regions_json = loads(response_text)
  103. supported_languages = {}
  104. for lang in regions_json['languages'].values():
  105. for country in lang['countries']:
  106. lang_code = "{lang}-{country}".format(lang=lang['code'], country=country)
  107. supported_languages[lang_code] = {'name': lang['name']}
  108. return supported_languages