duckduckgo_definitions.py
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. DuckDuckGo Instant Answer API
  4. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  5. The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from
  6. reverse engineering we can see that some services (e.g. instant answers) still
  7. in use from the DDG search engine.
  8. As far we can say the *instant answers* API does not support languages, or at
  9. least we could not find out how language support should work. It seems that
  10. most of the features are based on English terms.
  11. """
  12. from typing import TYPE_CHECKING
  13. from urllib.parse import urlencode, urlparse, urljoin
  14. from lxml import html
  15. from searx.data import WIKIDATA_UNITS
  16. from searx.utils import extract_text, html_to_text, get_string_replaces_function
  17. from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
  18. from searx.result_types import EngineResults
  19. if TYPE_CHECKING:
  20. import logging
  21. logger: logging.Logger
# about — engine metadata displayed by the searx "engines" page
about = {
    "website": 'https://duckduckgo.com/',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# forward the user's Accept-Language header to DDG (the instant-answer API
# itself seems English-only, see module docstring)
send_accept_language_header = True

# Instant Answer endpoint; {query} is filled by request() with the urlencoded
# search terms.  no_redirect=1 keeps !bang queries from returning a redirect.
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

# URL prefixes under which DDG reports Wikidata entity IDs (see unit_to_str)
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']

# helper that rewrites 'http:' to 'https:' in infobox IDs so they can be
# merged with wikidata's infobox results
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
  35. def is_broken_text(text):
  36. """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
  37. The href URL is broken, the "Related website" may contains some HTML.
  38. The best solution seems to ignore these results.
  39. """
  40. return text.startswith('http') and ' ' in text
  41. def result_to_text(text, htmlResult):
  42. # TODO : remove result ending with "Meaning" or "Category" # pylint: disable=fixme
  43. result = None
  44. dom = html.fromstring(htmlResult)
  45. a = dom.xpath('//a')
  46. if len(a) >= 1:
  47. result = extract_text(a[0])
  48. else:
  49. result = text
  50. if not is_broken_text(result):
  51. return result
  52. return None
  53. def request(query, params):
  54. params['url'] = URL.format(query=urlencode({'q': query}))
  55. return params
def response(resp) -> EngineResults:
    """Parse the Instant Answer JSON payload into searx results.

    Produces (depending on what the payload contains): an Answer, plain URL
    results, suggestions from related topics, and finally either a single URL
    result or an infobox built from heading/abstract/attributes/urls.
    """
    # pylint: disable=too-many-locals, too-many-branches, too-many-statements
    results = EngineResults()

    search_res = resp.json()

    # search_res.get('Entity') possible values (not exhaustive) :
    # * continent / country / department / location / waterfall
    # * actor / musician / artist
    # * book / performing art / film / television / media franchise / concert tour / playwright
    # * prepared food
    # * website / software / os / programming language / file format / software engineer
    # * company

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        answer_type = search_res.get('AnswerType')
        logger.debug('AnswerType="%s" Answer="%s"', answer_type, answer)
        # 'calc' / 'ip' answers are skipped; other answer types may embed HTML,
        # hence html_to_text()
        if isinstance(answer, str) and answer_type not in ['calc', 'ip']:
            results.add(
                results.types.Answer(
                    answer=html_to_text(answer),
                    url=search_res.get('AbstractURL', ''),
                )
            )

    # add infobox
    if 'Definition' in search_res:
        content = content + search_res.get('Definition', '')

    if 'Abstract' in search_res:
        content = content + search_res.get('Abstract', '')

    # image: DDG may return a relative path — resolve it against duckduckgo.com
    image = search_res.get('Image')
    image = None if image == '' else image
    if image is not None and urlparse(image).netloc == '':
        image = urljoin('https://duckduckgo.com', image)

    # urls
    # Official website, Wikipedia page
    for ddg_result in search_res.get('Results', []):
        firstURL = ddg_result.get('FirstURL')
        text = ddg_result.get('Text')
        if firstURL is not None and text is not None:
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics: either flat entries ('FirstURL') or nested groups ('Topics')
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL')
            text = ddg_result.get('Text')
            if not is_broken_text(text):
                suggestion = result_to_text(text, ddg_result.get('Result'))
                # don't suggest the heading the user already searched for
                if suggestion != heading and suggestion is not None:
                    results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''), 'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('Text'), topic_result.get('Result'))
                if suggestion != heading and suggestion is not None:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL, 'official': True})
        results.append({'url': abstractURL, 'title': heading})

    # definition
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # attributes
    # some will be converted to urls
    if 'Infobox' in search_res:
        infobox = search_res.get('Infobox')
        if 'content' in infobox:
            osm_zoom = 17
            coordinates = None
            for info in infobox.get('content'):
                data_type = info.get('data_type')
                data_label = info.get('label')
                data_value = info.get('value')

                # Workaround: ddg may return a double quote
                if data_value == '""':
                    continue

                # Is it an external URL ?
                # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
                # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
                # * netflix_id
                external_url = get_external_url(data_type, data_value)
                if external_url is not None:
                    urls.append({'title': data_label, 'url': external_url})
                elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
                    # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
                    # ignore wiki_maps_trigger: reference to a javascript
                    # ignore google_play_artist_id: service shutdown
                    pass
                elif data_type == 'string' and data_label == 'Website':
                    # There is already an URL for the website
                    pass
                elif data_type == 'area':
                    attributes.append({'label': data_label, 'value': area_to_str(data_value), 'entity': 'P2046'})
                    osm_zoom = area_to_osm_zoom(data_value.get('amount'))
                elif data_type == 'coordinates':
                    # Q2 is the Wikidata entity for planet Earth
                    if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
                        # coordinate on Earth
                        # get the zoom information from the area
                        coordinates = info
                    else:
                        # coordinate NOT on Earth
                        attributes.append({'label': data_label, 'value': data_value, 'entity': 'P625'})
                elif data_type == 'string':
                    attributes.append({'label': data_label, 'value': data_value})

            if coordinates:
                # build an OpenStreetMap link, zoomed according to the area (if any)
                data_label = coordinates.get('label')
                data_value = coordinates.get('value')
                latitude = data_value.get('latitude')
                longitude = data_value.get('longitude')
                url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
                urls.append({'title': 'OpenStreetMap', 'url': url, 'entity': 'P625'})

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title' # pylint: disable=fixme
        # with nothing but a single URL, emit a plain result instead of an infobox
        if image is None and len(attributes) == 0 and len(urls) == 1 and len(relatedTopics) == 0 and len(content) == 0:
            results.append({'url': urls[0]['url'], 'title': heading, 'content': content})
        else:
            results.append(
                {
                    'infobox': heading,
                    'id': infobox_id,
                    'content': content,
                    'img_src': image,
                    'attributes': attributes,
                    'urls': urls,
                    'relatedTopics': relatedTopics,
                }
            )

    return results
  201. def unit_to_str(unit):
  202. for prefix in WIKIDATA_PREFIX:
  203. if unit.startswith(prefix):
  204. wikidata_entity = unit[len(prefix) :]
  205. real_unit = WIKIDATA_UNITS.get(wikidata_entity)
  206. if real_unit is None:
  207. return unit
  208. return real_unit['symbol']
  209. return unit
  210. def area_to_str(area):
  211. """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
  212. unit = unit_to_str(area.get('unit'))
  213. if unit is not None:
  214. try:
  215. amount = float(area.get('amount'))
  216. return '{} {}'.format(amount, unit)
  217. except ValueError:
  218. pass
  219. return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))