# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Qwant (Web, News, Images, Videos)

This engine uses the Qwant API (https://api.qwant.com/v3). The API is
undocumented but can be reverse engineered by reading the network log of
https://www.qwant.com/ queries.

This implementation is used by different qwant engines in the settings.yml::

  - name: qwant
    categories: general
    ...
  - name: qwant news
    categories: news
    ...
  - name: qwant images
    categories: images
    ...
  - name: qwant videos
    categories: videos
    ...

"""
from datetime import (
    datetime,
    timedelta,
)
from json import loads
from urllib.parse import urlencode

from flask_babel import gettext

from searx.utils import match_language
from searx.exceptions import SearxEngineAPIException
from searx.raise_for_httperror import raise_for_httperror

# about
about = {
    "website": 'https://www.qwant.com/',
    "wikidata_id": 'Q14657870',
    "official_api_documentation": None,
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = []
paging = True
supported_languages_url = about['website']

category_to_keyword = {
    'general': 'web',
    'news': 'news',
    'images': 'images',
    'videos': 'videos',
}

# search-url
url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}'
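# For example (illustrative only, not an official API example), the second
# page of an images query expands the template above to:
#   https://api.qwant.com/v3/search/images?q=test&count=50&offset=50
# request() below fills in keyword, query, count and offset, and appends
# the locale parameter.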


def request(query, params):
    """Qwant search request"""
    keyword = category_to_keyword[categories[0]]
    count = 10  # web: count must be equal to 10

    if keyword == 'images':
        count = 50
        offset = (params['pageno'] - 1) * count
        # count + offset must be lower than 250
        offset = min(offset, 199)
    else:
        offset = (params['pageno'] - 1) * count
        # count + offset must be lower than 50
        offset = min(offset, 40)

    params['url'] = url.format(
        keyword=keyword,
        query=urlencode({'q': query}),
        offset=offset,
        count=count,
    )

    # add language tag
    if params['language'] == 'all':
        params['url'] += '&locale=en_US'
    else:
        language = match_language(
            params['language'],
            supported_languages,
            language_aliases,
        )
        params['url'] += '&locale=' + language.replace('-', '_')

    params['raise_for_httperror'] = False
    return params
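
# Usage sketch (illustrative: in practice the searx framework sets the
# module attributes from settings.yml and fills in `params`):
#
#   categories = ['general']   # normally set via settings.yml
#   params = {'pageno': 1, 'language': 'all'}
#   request('test', params)
#   # params['url'] ->
#   #   'https://api.qwant.com/v3/search/web?q=test&count=10&offset=0&locale=en_US'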


def response(resp):
    """Get response from Qwant's search request"""
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements

    keyword = category_to_keyword[categories[0]]
    results = []

    # load JSON result
    search_results = loads(resp.text)
    data = search_results.get('data', {})

    # check for an API error
    if search_results.get('status') != 'success':
        msg = ",".join(data.get('message', ['unknown']))
        raise SearxEngineAPIException('API error::' + msg)

    # raise for other errors
    raise_for_httperror(resp)
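
    # Illustrative shape of a successful WEB response, inferred from the
    # parsing below (the API is undocumented, so this is an assumption, not
    # an official schema):
    #
    #   {"status": "success",
    #    "data": {"result": {"items": {"mainline": [
    #        {"type": "web", "items": [{"title": ..., "url": ..., "desc": ...}]},
    #        ...
    #    ]}}}}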
    if keyword == 'web':
        # The WEB query contains a list named 'mainline'.  This list can
        # contain different result types (e.g. mainline[0]['type'] returns the
        # type of the result items in mainline[0]['items']).
        mainline = data.get('result', {}).get('items', {}).get('mainline', {})
    else:
        # Queries on News, Images and Videos do not have a list named
        # 'mainline' in the response.  The result items are directly in the
        # list result['items'].
        mainline = data.get('result', {}).get('items', [])
        mainline = [
            {'type': keyword, 'items': mainline},
        ]

    # return empty array if there are no results
    if not mainline:
        return []

    for row in mainline:
        mainline_type = row.get('type', 'web')
        if mainline_type != keyword:
            continue

        if mainline_type == 'ads':
            # ignore ads
            continue

        mainline_items = row.get('items', [])
        for item in mainline_items:
            title = item.get('title', None)
            res_url = item.get('url', None)

            if mainline_type == 'web':
                content = item['desc']
                results.append(
                    {
                        'title': title,
                        'url': res_url,
                        'content': content,
                    }
                )

            elif mainline_type == 'news':
                pub_date = item['date']
                if pub_date is not None:
                    pub_date = datetime.fromtimestamp(pub_date)
                news_media = item.get('media', [])
                img_src = None
                if news_media:
                    img_src = news_media[0].get('pict', {}).get('url', None)
                results.append(
                    {
                        'title': title,
                        'url': res_url,
                        'publishedDate': pub_date,
                        'img_src': img_src,
                    }
                )

            elif mainline_type == 'images':
                thumbnail = item['thumbnail']
                img_src = item['media']
                results.append(
                    {
                        'title': title,
                        'url': res_url,
                        'template': 'images.html',
                        'thumbnail_src': thumbnail,
                        'img_src': img_src,
                    }
                )

            elif mainline_type == 'videos':
                # Some videos do not have a description: qwant-video returns
                # an empty string, while such a video from a qwant-web query
                # lacks the 'desc' key entirely.
                d, s, c = item.get('desc'), item.get('source'), item.get('channel')
                content_parts = []
                if d:
                    content_parts.append(d)
                if s:
                    content_parts.append("%s: %s " % (gettext("Source"), s))
                if c:
                    content_parts.append("%s: %s " % (gettext("Channel"), c))
                content = ' // '.join(content_parts)
                length = item['duration']
                if length is not None:
                    length = timedelta(milliseconds=length)
                pub_date = item['date']
                if pub_date is not None:
                    pub_date = datetime.fromtimestamp(pub_date)
                thumbnail = item['thumbnail']
                # From some locations (DE and others?) the s2 host responds
                # with a 'Please wait ..' page and does not deliver the
                # thumbnail, so switch to s1.
                thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1)
                results.append(
                    {
                        'title': title,
                        'url': res_url,
                        'content': content,
                        'publishedDate': pub_date,
                        'thumbnail': thumbnail,
                        'template': 'videos.html',
                        'length': length,
                    }
                )

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    # list of regions is embedded in page as a js object
    response_text = resp.text
    response_text = response_text[response_text.find('INITIAL_PROPS'):]
    response_text = response_text[response_text.find('{'):response_text.find('</script>')]

    regions_json = loads(response_text)

    supported_languages = []
    for country, langs in regions_json['locales'].items():
        for lang in langs['langs']:
            lang_code = "{lang}-{country}".format(lang=lang, country=country)
            supported_languages.append(lang_code)

    return supported_languages
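

# Illustrative shape of the embedded locales object consumed above (inferred
# from the loop, not an official structure):
#
#   {"locales": {"ch": {"langs": ["de", "fr", "it"]}, ...}}
#
# which would yield language codes such as "de-ch", "fr-ch" and "it-ch".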