duckduckgo_definitions.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo (Instant Answer API)
"""

import json
from urllib.parse import urlencode, urlparse, urljoin

from lxml import html

from searx import logger
from searx.data import WIKIDATA_UNITS
from searx.engines.duckduckgo import language_aliases
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom

logger = logger.getChild('duckduckgo_definitions')

# about
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

URL = 'https://api.duckduckgo.com/'\
    + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
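# e.g. for the query "searx" the formatted URL is:
# https://api.duckduckgo.com/?q=searx&format=json&pretty=0&no_redirect=1&d=1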

WIKIDATA_PREFIX = [
    'http://www.wikidata.org/entity/',
    'https://www.wikidata.org/entity/'
]

replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
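# e.g. replace_http_by_https('http://www.wikidata.org/entity/Q2')
#   -> 'https://www.wikidata.org/entity/Q2'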


def is_broken_text(text):
    """duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>"

    The href URL is broken and the "Related website" text may contain some HTML.
    The best solution seems to be to ignore these results.
    """
    return text.startswith('http') and ' ' in text


def result_to_text(text, htmlResult):
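    """Return a plain-text title for a DDG result: prefer the text of the
    first link in the 'Result' HTML snippet, falling back to ``text``;
    return None when the result looks broken (see is_broken_text).
    """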
    # TODO : remove result ending with "Meaning" or "Category"
    result = None
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        result = extract_text(a[0])
    else:
        result = text
    if not is_broken_text(result):
        return result
    return None


def request(query, params):
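    """Fill in the Instant Answer API URL and set an Accept-Language header
    reduced to the base language (e.g. 'fr-FR' -> 'fr').
    """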
    params['url'] = URL.format(query=urlencode({'q': query}))
    language = match_language(params['language'], supported_languages, language_aliases)
    language = language.split('-')[0]
    params['headers']['Accept-Language'] = language
    return params


def response(resp):
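    """Parse the Instant Answer JSON into searx results.

    Depending on the payload, this produces a direct answer, plain URL
    results, suggestions, and an infobox dict with the keys 'infobox',
    'id', 'content', 'img_src', 'attributes', 'urls' and 'relatedTopics'.
    """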
    results = []

    search_res = json.loads(resp.text)

    # search_res.get('Entity') possible values (not exhaustive):
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
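        # 'calc' and 'ip' answers are skipped; presumably the former relies on
        # DDG's client-side javascript and the latter would report the proxy's
        # IP address (an assumption: the original code gives no reason).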
        if search_res.get('AnswerType') not in ['calc', 'ip']:
            results.append({'answer': html_to_text(answer)})

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text,
                                            ddg_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''),
                                  'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'),
                                            topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem: it is always in English
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstractURL,
                     'official': True})
        results.append({'url': abstractURL,
                        'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem: it is always in English
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a value that is just a pair of double quotes
                if data_value == '""':
                    continue

                # Is it an external URL?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label,
                                 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shut down
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already a URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label,
                                       'value': area_to_str(data_value),
                                       'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinates on Earth (Q2):
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinates NOT on Earth
                        attributes.append({'label': data_label,
                                           'value': data_value,
                                           'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label,
                                       'value': data_value})

            if coordinates:
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap',
                             'url': url,
                             'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
           len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'],
                            'title': heading,
                            'content': content})
        else:
            results.append({'infobox': heading,
                            'id': infobox_id,
                            'content': content,
                            'img_src': image,
                            'attributes': attributes,
                            'urls': urls,
                            'relatedTopics': relatedTopics})

    return results


def unit_to_str(unit):
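    """Strip the Wikidata entity prefix and look the bare Q-id up in
    WIKIDATA_UNITS to get a unit symbol; return the value unchanged when
    it is not a known Wikidata unit URL.
    """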
    for prefix in WIKIDATA_PREFIX:
        if unit.startswith(prefix):
            wikidata_entity = unit[len(prefix):]
            return WIKIDATA_UNITS.get(wikidata_entity, unit)
    return unit


def area_to_str(area):
    """parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}"""
    unit = unit_to_str(area.get('unit'))
    if unit is not None:
        try:
            amount = float(area.get('amount'))
            return '{} {}'.format(amount, unit)
        except ValueError:
            pass
    return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
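
# Example (hypothetical mapping): with the dict from the docstring above,
# area_to_str returns '20.99 km²' if WIKIDATA_UNITS maps Q712226 to 'km²';
# when the Q-id is unknown, the entity URL itself is used as the unit string.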