# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module implements the Wikidata engine.  Some implementations are shared
from :ref:`wikipedia engine`.

"""
# pylint: disable=missing-class-docstring

from typing import TYPE_CHECKING
from hashlib import md5
from urllib.parse import urlencode, unquote
from json import loads

from dateutil.parser import isoparse
from babel.dates import format_datetime, format_date, format_time, get_datetime_format

from searx.data import WIKIDATA_UNITS
from searx.network import post, get
from searx.utils import searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import (
    fetch_wikimedia_traits,
    get_wiki_params,
)
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits
# about
about = {
    "website": 'https://wikidata.org/',
    "wikidata_id": 'Q2013',
    "official_api_documentation": 'https://query.wikidata.org/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

display_type = ["infobox"]
"""A list of display types composed from ``infobox`` and ``list``.  The latter
one will add a hit to the result list.  The first one will show a hit in the
info box.  Both values can be set, or one of the two can be set."""
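
# Note (illustrative, not part of this module): ``display_type`` is a module
# attribute, so -- like other engine attributes in SearXNG -- it can be
# overridden per instance from the engine's section in ``settings.yml``, e.g.
#
#   - name: wikidata
#     engine: wikidata
#     display_type: ["infobox", "list"]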

# SPARQL
SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql'
SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain'
WIKIDATA_PROPERTIES = {
    'P434': 'MusicBrainz',
    'P435': 'MusicBrainz',
    'P436': 'MusicBrainz',
    'P966': 'MusicBrainz',
    'P345': 'IMDb',
    'P2397': 'YouTube',
    'P1651': 'YouTube',
    'P2002': 'Twitter',
    'P2013': 'Facebook',
    'P2003': 'Instagram',
    'P4033': 'Mastodon',
    'P11947': 'Lemmy',
    'P12622': 'PeerTube',
}

# SERVICE wikibase:mwapi : https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI
# SERVICE wikibase:label: https://en.wikibooks.org/wiki/SPARQL/SERVICE_-_Label#Manual_Label_SERVICE
# https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates
# https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model
# optimization:
# * https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization
# * https://github.com/blazegraph/database/wiki/QueryHints
QUERY_TEMPLATE = """
SELECT ?item ?itemLabel ?itemDescription ?lat ?long %SELECT%
WHERE
{
  SERVICE wikibase:mwapi {
        bd:serviceParam wikibase:endpoint "www.wikidata.org";
        wikibase:api "EntitySearch";
        wikibase:limit 1;
        mwapi:search "%QUERY%";
        mwapi:language "%LANGUAGE%".
        ?item wikibase:apiOutputItem mwapi:item.
  }
  hint:Prior hint:runFirst "true".

  %WHERE%

  SERVICE wikibase:label {
      bd:serviceParam wikibase:language "%LANGUAGE%,en".
      ?item rdfs:label ?itemLabel .
      ?item schema:description ?itemDescription .
      %WIKIBASE_LABELS%
  }
}
GROUP BY ?item ?itemLabel ?itemDescription ?lat ?long %GROUP_BY%
"""

# Get the calendar names and the property names
QUERY_PROPERTY_NAMES = """
SELECT ?item ?name
WHERE {
    {
      SELECT ?item
      WHERE { ?item wdt:P279* wd:Q12132 }
    } UNION {
      VALUES ?item { %ATTRIBUTES% }
    }
    OPTIONAL { ?item rdfs:label ?name. }
}
"""

# see the property "dummy value" of https://www.wikidata.org/wiki/Q2013 (Wikidata)
# hard coded here to avoid an additional SPARQL request when the server starts
DUMMY_ENTITY_URLS = set(
    "http://www.wikidata.org/entity/" + wid for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402")
)

# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1
# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html
sparql_string_escape = get_string_replaces_function(
    # fmt: off
    {
        '\t': '\\\t',
        '\n': '\\\n',
        '\r': '\\\r',
        '\b': '\\\b',
        '\f': '\\\f',
        '\"': '\\\"',
        '\'': '\\\'',
        '\\': '\\\\'
    }
    # fmt: on
)
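
# Escaping sketch (derived from the table above): every listed character is
# prefixed with a backslash, so a query like ``It's 100% "pure"`` becomes
# ``It\'s 100% \"pure\"`` before it is substituted into %QUERY% below.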

replace_http_by_https = get_string_replaces_function({'http:': 'https:'})


def get_headers():
    # user agent: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits
    return {'Accept': 'application/sparql-results+json', 'User-Agent': searx_useragent()}


def get_label_for_entity(entity_id, language):
    name = WIKIDATA_PROPERTIES.get(entity_id)
    if name is None:
        name = WIKIDATA_PROPERTIES.get((entity_id, language))
    if name is None:
        name = WIKIDATA_PROPERTIES.get((entity_id, language.split('-')[0]))
    if name is None:
        name = WIKIDATA_PROPERTIES.get((entity_id, 'en'))
    if name is None:
        name = entity_id
    return name
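
# Lookup order used above, sketched for entity 'P571' and language 'fr-CA':
#   WIKIDATA_PROPERTIES['P571']            hard coded names / unit symbols
#   WIKIDATA_PROPERTIES[('P571', 'fr-CA')] full locale (added by init())
#   WIKIDATA_PROPERTIES[('P571', 'fr')]    bare language
#   WIKIDATA_PROPERTIES[('P571', 'en')]    English fallback
#   'P571'                                 the entity id itself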

def send_wikidata_query(query, method='GET'):
    if method == 'GET':
        # query will be cached by wikidata
        http_response = get(SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query}), headers=get_headers())
    else:
        # query won't be cached by wikidata
        http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers())
    if http_response.status_code != 200:
        logger.debug('SPARQL endpoint error %s', http_response.content.decode())
    logger.debug('request time %s', str(http_response.elapsed))
    http_response.raise_for_status()
    return loads(http_response.content.decode())
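
# Usage sketch (this is how init() below consumes the answer): the decoded
# JSON follows the SPARQL 1.1 results format, so callers iterate over
# send_wikidata_query(query)['results']['bindings'].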

def request(query, params):
    eng_tag, _wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
    query, attributes = get_query(query, eng_tag)
    logger.debug("request --> language %s // len(attributes): %s", eng_tag, len(attributes))

    params['method'] = 'POST'
    params['url'] = SPARQL_ENDPOINT_URL
    params['data'] = {'query': query}
    params['headers'] = get_headers()
    params['language'] = eng_tag
    params['attributes'] = attributes

    return params


def response(resp):

    results = []
    jsonresponse = loads(resp.content.decode())

    language = resp.search_params['language']
    attributes = resp.search_params['attributes']
    logger.debug("response --> language %s // len(attributes): %s", language, len(attributes))

    seen_entities = set()
    for result in jsonresponse.get('results', {}).get('bindings', []):
        attribute_result = {key: value['value'] for key, value in result.items()}
        entity_url = attribute_result['item']
        if entity_url not in seen_entities and entity_url not in DUMMY_ENTITY_URLS:
            seen_entities.add(entity_url)
            results += get_results(attribute_result, attributes, language)
        else:
            logger.debug('The SPARQL request returns duplicate entities: %s', str(attribute_result))

    return results
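
# Shape sketch of one row handled by response() above (values abridged):
#   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q90'},
#    'itemLabel': {'type': 'literal', 'xml:lang': 'en', 'value': 'Paris'}, ...}
# attribute_result flattens it to {'item': '...', 'itemLabel': 'Paris', ...}.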

_IMG_SRC_DEFAULT_URL_PREFIX = "https://commons.wikimedia.org/wiki/Special:FilePath/"
_IMG_SRC_NEW_URL_PREFIX = "https://upload.wikimedia.org/wikipedia/commons/thumb/"


def get_thumbnail(img_src):
    """Get Thumbnail image from wikimedia commons

    Images from commons.wikimedia.org are (HTTP) redirected to
    upload.wikimedia.org.  The redirected URL can be calculated by this
    function.

    - https://stackoverflow.com/a/33691240

    """
    logger.debug('get_thumbnail(): %s', img_src)
    if img_src is not None and _IMG_SRC_DEFAULT_URL_PREFIX in img_src.split()[0]:
        img_src_name = unquote(img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[0].replace("%20", "_"))
        img_src_name_first = img_src_name
        img_src_name_second = img_src_name

        if ".svg" in img_src_name.split()[0]:
            img_src_name_second = img_src_name + ".png"

        img_src_size = img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").split("?", 1)[1]
        img_src_size = img_src_size[img_src_size.index("=") + 1 : img_src_size.index("&")]
        img_src_name_md5 = md5(img_src_name.encode("utf-8")).hexdigest()
        img_src = (
            _IMG_SRC_NEW_URL_PREFIX
            + img_src_name_md5[0]
            + "/"
            + img_src_name_md5[0:2]
            + "/"
            + img_src_name_first
            + "/"
            + img_src_size
            + "px-"
            + img_src_name_second
        )
        logger.debug('get_thumbnail() redirected: %s', img_src)

    return img_src
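
# Worked example (hash digits are placeholders, not computed): an input like
#   https://commons.wikimedia.org/wiki/Special:FilePath/Example.jpg?width=500&height=400
# is rewritten to
#   https://upload.wikimedia.org/wikipedia/commons/thumb/<m>/<mn>/Example.jpg/500px-Example.jpg
# where <m> / <mn> are the first one and two hex digits of md5('Example.jpg'),
# and a trailing '.png' is appended to the last component for SVG sources.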

def get_results(attribute_result, attributes, language):
    # pylint: disable=too-many-branches
    results = []
    infobox_title = attribute_result.get('itemLabel')
    infobox_id = attribute_result['item']
    infobox_id_lang = None
    infobox_urls = []
    infobox_attributes = []
    infobox_content = attribute_result.get('itemDescription', [])
    img_src = None
    img_src_priority = 0

    for attribute in attributes:
        value = attribute.get_str(attribute_result, language)
        if value is not None and value != '':
            attribute_type = type(attribute)

            if attribute_type in (WDURLAttribute, WDArticle):
                # get_select() method: there is group_concat(distinct ...;separator=", ")
                # split the value here
                for url in value.split(', '):
                    infobox_urls.append({'title': attribute.get_label(language), 'url': url, **attribute.kwargs})
                    # "normal" results (not infobox) include official website and Wikipedia links.
                    if "list" in display_type and (attribute.kwargs.get('official') or attribute_type == WDArticle):
                        results.append({'title': infobox_title, 'url': url, "content": infobox_content})
                    # update the infobox_id with the wikipedia URL
                    # first the local wikipedia URL, and as fallback the english wikipedia URL
                    if attribute_type == WDArticle and (
                        (attribute.language == 'en' and infobox_id_lang is None) or attribute.language != 'en'
                    ):
                        infobox_id_lang = attribute.language
                        infobox_id = url
            elif attribute_type == WDImageAttribute:
                # this attribute is an image.
                # replace the current image only if this attribute's priority
                # is higher (the infobox contains only one image).
                if attribute.priority > img_src_priority:
                    img_src = get_thumbnail(value)
                    img_src_priority = attribute.priority
            elif attribute_type == WDGeoAttribute:
                # geocoordinate link
                # use the area to get the OSM zoom
                # Note: ignore the unit (must be km² otherwise the calculation is wrong)
                # Should use normalized value p:P2046/psn:P2046/wikibase:quantityAmount
                area = attribute_result.get('P2046')
                osm_zoom = area_to_osm_zoom(area) if area else 19
                url = attribute.get_geo_url(attribute_result, osm_zoom=osm_zoom)
                if url:
                    infobox_urls.append({'title': attribute.get_label(language), 'url': url, 'entity': attribute.name})
            else:
                infobox_attributes.append(
                    {'label': attribute.get_label(language), 'value': value, 'entity': attribute.name}
                )

    if infobox_id:
        infobox_id = replace_http_by_https(infobox_id)

    # add the wikidata URL at the end
    infobox_urls.append({'title': 'Wikidata', 'url': attribute_result['item']})

    if (
        "list" in display_type
        and img_src is None
        and len(infobox_attributes) == 0
        and len(infobox_urls) == 1
        and len(infobox_content) == 0
    ):
        results.append({'url': infobox_urls[0]['url'], 'title': infobox_title, 'content': infobox_content})
    elif "infobox" in display_type:
        results.append(
            {
                'infobox': infobox_title,
                'id': infobox_id,
                'content': infobox_content,
                'img_src': img_src,
                'urls': infobox_urls,
                'attributes': infobox_attributes,
            }
        )
    return results

def get_query(query, language):
    attributes = get_attributes(language)
    select = [a.get_select() for a in attributes]
    where = list(filter(lambda s: len(s) > 0, [a.get_where() for a in attributes]))
    wikibase_label = list(filter(lambda s: len(s) > 0, [a.get_wikibase_label() for a in attributes]))
    group_by = list(filter(lambda s: len(s) > 0, [a.get_group_by() for a in attributes]))
    query = (
        QUERY_TEMPLATE.replace('%QUERY%', sparql_string_escape(query))
        .replace('%SELECT%', ' '.join(select))
        .replace('%WHERE%', '\n  '.join(where))
        .replace('%WIKIBASE_LABELS%', '\n      '.join(wikibase_label))
        .replace('%GROUP_BY%', ' '.join(group_by))
        .replace('%LANGUAGE%', language)
    )
    return query, attributes
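
# Expansion sketch for a single WDAttribute('P1082') (population), using the
# methods defined below:
#   %SELECT%  -> (group_concat(distinct ?P1082;separator=", ") as ?P1082s)
#   %WHERE%   -> OPTIONAL { ?item wdt:P1082 ?P1082 . }
# %WIKIBASE_LABELS% and %GROUP_BY% receive empty strings for this type.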

def get_attributes(language):
    # pylint: disable=too-many-statements
    attributes = []

    def add_value(name):
        attributes.append(WDAttribute(name))

    def add_amount(name):
        attributes.append(WDAmountAttribute(name))

    def add_label(name):
        attributes.append(WDLabelAttribute(name))

    def add_url(name, url_id=None, url_path_prefix=None, **kwargs):
        attributes.append(WDURLAttribute(name, url_id, url_path_prefix, kwargs))

    def add_image(name, url_id=None, priority=1):
        attributes.append(WDImageAttribute(name, url_id, priority))

    def add_date(name):
        attributes.append(WDDateAttribute(name))

    # Dates
    for p in [
        'P571',  # inception date
        'P576',  # dissolution date
        'P580',  # start date
        'P582',  # end date
        'P569',  # date of birth
        'P570',  # date of death
        'P619',  # date of spacecraft launch
        'P620',  # date of spacecraft landing
    ]:
        add_date(p)

    for p in [
        'P27',  # country of citizenship
        'P495',  # country of origin
        'P17',  # country
        'P159',  # headquarters location
    ]:
        add_label(p)

    # Places
    for p in [
        'P36',  # capital
        'P35',  # head of state
        'P6',  # head of government
        'P122',  # basic form of government
        'P37',  # official language
    ]:
        add_label(p)

    add_value('P1082')  # population
    add_amount('P2046')  # area
    add_amount('P281')  # postal code
    add_label('P38')  # currency
    add_amount('P2048')  # height (building)

    # Media
    for p in [
        'P400',  # platform (videogames, computing)
        'P50',  # author
        'P170',  # creator
        'P57',  # director
        'P175',  # performer
        'P178',  # developer
        'P162',  # producer
        'P176',  # manufacturer
        'P58',  # screenwriter
        'P272',  # production company
        'P264',  # record label
        'P123',  # publisher
        'P449',  # original network
        'P750',  # distributed by
        'P86',  # composer
    ]:
        add_label(p)

    add_date('P577')  # publication date
    add_label('P136')  # genre (music, film, artistic...)
    add_label('P364')  # original language
    add_value('P212')  # ISBN-13
    add_value('P957')  # ISBN-10
    add_label('P275')  # copyright license
    add_label('P277')  # programming language
    add_value('P348')  # version
    add_label('P840')  # narrative location

    # Languages
    add_value('P1098')  # number of speakers
    add_label('P282')  # writing system
    add_label('P1018')  # language regulatory body
    add_value('P218')  # language code (ISO 639-1)

    # Other
    add_label('P169')  # ceo
    add_label('P112')  # founded by
    add_label('P1454')  # legal form (company, organization)
    add_label('P137')  # operator (service, facility, ...)
    add_label('P1029')  # crew members (tripulation)
    add_label('P225')  # taxon name
    add_value('P274')  # chemical formula
    add_label('P1346')  # winner (sports, contests, ...)
    add_value('P1120')  # number of deaths
    add_value('P498')  # currency code (ISO 4217)

    # URL
    add_url('P856', official=True)  # official website
    attributes.append(WDArticle(language))  # wikipedia (user language)
    if not language.startswith('en'):
        attributes.append(WDArticle('en'))  # wikipedia (english)

    add_url('P1324')  # source code repository
    add_url('P1581')  # blog
    add_url('P434', url_id='musicbrainz_artist')
    add_url('P435', url_id='musicbrainz_work')
    add_url('P436', url_id='musicbrainz_release_group')
    add_url('P966', url_id='musicbrainz_label')
    add_url('P345', url_id='imdb_id')
    add_url('P2397', url_id='youtube_channel')
    add_url('P1651', url_id='youtube_video')
    add_url('P2002', url_id='twitter_profile')
    add_url('P2013', url_id='facebook_profile')
    add_url('P2003', url_id='instagram_profile')

    # Fediverse
    add_url('P4033', url_path_prefix='/@')  # Mastodon user
    add_url('P11947', url_path_prefix='/c/')  # Lemmy community
    add_url('P12622', url_path_prefix='/c/')  # PeerTube channel

    # Map
    attributes.append(WDGeoAttribute('P625'))

    # Image
    add_image('P15', priority=1, url_id='wikimedia_image')  # route map
    add_image('P242', priority=2, url_id='wikimedia_image')  # locator map
    add_image('P154', priority=3, url_id='wikimedia_image')  # logo
    add_image('P18', priority=4, url_id='wikimedia_image')  # image
    add_image('P41', priority=5, url_id='wikimedia_image')  # flag
    add_image('P2716', priority=6, url_id='wikimedia_image')  # collage
    add_image('P2910', priority=7, url_id='wikimedia_image')  # icon

    return attributes

class WDAttribute:
    __slots__ = ('name',)

    def __init__(self, name):
        self.name = name

    def get_select(self):
        return '(group_concat(distinct ?{name};separator=", ") as ?{name}s)'.replace('{name}', self.name)

    def get_label(self, language):
        return get_label_for_entity(self.name, language)

    def get_where(self):
        return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name)

    def get_wikibase_label(self):
        return ""

    def get_group_by(self):
        return ""

    def get_str(self, result, language):  # pylint: disable=unused-argument
        return result.get(self.name + 's')

    def __repr__(self):
        return '<' + str(type(self).__name__) + ':' + self.name + '>'

class WDAmountAttribute(WDAttribute):
    def get_select(self):
        return '?{name} ?{name}Unit'.replace('{name}', self.name)

    def get_where(self):
        return """  OPTIONAL { ?item p:{name} ?{name}Node .
    ?{name}Node rdf:type wikibase:BestRank ; ps:{name} ?{name} .
    OPTIONAL { ?{name}Node psv:{name}/wikibase:quantityUnit ?{name}Unit. } }""".replace(
            '{name}', self.name
        )

    def get_group_by(self):
        return self.get_select()

    def get_str(self, result, language):
        value = result.get(self.name)
        unit = result.get(self.name + "Unit")
        if unit is not None:
            unit = unit.replace('http://www.wikidata.org/entity/', '')
            return value + " " + get_label_for_entity(unit, language)
        return value

class WDArticle(WDAttribute):
    __slots__ = 'language', 'kwargs'

    def __init__(self, language, kwargs=None):
        super().__init__('wikipedia')
        self.language = language
        self.kwargs = kwargs or {}

    def get_label(self, language):
        # language parameter is ignored
        return "Wikipedia ({language})".replace('{language}', self.language)

    def get_select(self):
        return "?article{language} ?articleName{language}".replace('{language}', self.language)

    def get_where(self):
        return """OPTIONAL { ?article{language} schema:about ?item ;
             schema:inLanguage "{language}" ;
             schema:isPartOf <https://{language}.wikipedia.org/> ;
             schema:name ?articleName{language} . }""".replace(
            '{language}', self.language
        )

    def get_group_by(self):
        return self.get_select()

    def get_str(self, result, language):
        key = 'article{language}'.replace('{language}', self.language)
        return result.get(key)

class WDLabelAttribute(WDAttribute):
    def get_select(self):
        return '(group_concat(distinct ?{name}Label;separator=", ") as ?{name}Labels)'.replace('{name}', self.name)

    def get_where(self):
        return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name)

    def get_wikibase_label(self):
        return "?{name} rdfs:label ?{name}Label .".replace('{name}', self.name)

    def get_str(self, result, language):
        return result.get(self.name + 'Labels')

class WDURLAttribute(WDAttribute):

    HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/'

    __slots__ = 'url_id', 'url_path_prefix', 'kwargs'

    def __init__(self, name, url_id=None, url_path_prefix=None, kwargs=None):
        """
        :param url_id: ID matching one key in ``external_urls.json`` for
            converting IDs to full URLs.

        :param url_path_prefix: Path prefix if the values are of format
            ``account@domain``.  If provided, values are rewritten to
            ``https://<domain><url_path_prefix><account>``.  For example::

                WDURLAttribute('P4033', url_path_prefix='/@')

            Adds Property `P4033 <https://www.wikidata.org/wiki/Property:P4033>`_
            to the wikidata query.  This field might return for example
            ``libreoffice@fosstodon.org`` and the URL built from this is then:

            - account: ``libreoffice``
            - domain: ``fosstodon.org``
            - result url: https://fosstodon.org/@libreoffice
        """
        super().__init__(name)
        self.url_id = url_id
        self.url_path_prefix = url_path_prefix
        self.kwargs = kwargs

    def get_str(self, result, language):
        value = result.get(self.name + 's')
        if not value:
            return None

        value = value.split(',')[0]
        if self.url_id:
            url_id = self.url_id
            if value.startswith(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE):
                value = value[len(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE) :]
                url_id = 'wikimedia_image'
            return get_external_url(url_id, value)

        if self.url_path_prefix:
            [account, domain] = value.split('@')
            return f"https://{domain}{self.url_path_prefix}{account}"

        return value
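
# Behaviour sketch for WDURLAttribute.get_str() above: with a ``url_id`` the
# first value is mapped through ``external_urls.json`` via get_external_url()
# (Wikimedia file-path values are re-routed to 'wikimedia_image'); with a
# ``url_path_prefix`` a value like 'libreoffice@fosstodon.org' becomes
# 'https://fosstodon.org/@libreoffice'; otherwise the raw value is the URL.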

class WDGeoAttribute(WDAttribute):
    def get_label(self, language):
        return "OpenStreetMap"

    def get_select(self):
        return "?{name}Lat ?{name}Long".replace('{name}', self.name)

    def get_where(self):
        return """OPTIONAL { ?item p:{name}/psv:{name} [
    wikibase:geoLatitude ?{name}Lat ;
    wikibase:geoLongitude ?{name}Long ] }""".replace(
            '{name}', self.name
        )

    def get_group_by(self):
        return self.get_select()

    def get_str(self, result, language):
        latitude = result.get(self.name + 'Lat')
        longitude = result.get(self.name + 'Long')
        if latitude and longitude:
            return latitude + ' ' + longitude
        return None

    def get_geo_url(self, result, osm_zoom=19):
        latitude = result.get(self.name + 'Lat')
        longitude = result.get(self.name + 'Long')
        if latitude and longitude:
            return get_earth_coordinates_url(latitude, longitude, osm_zoom)
        return None
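
# Sketch: for WDGeoAttribute('P625') the query binds ?P625Lat / ?P625Long, and
# get_geo_url() converts them into an OpenStreetMap URL through
# get_earth_coordinates_url(), with the zoom derived from the P2046 area in
# get_results() above.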

class WDImageAttribute(WDURLAttribute):

    __slots__ = ('priority',)

    def __init__(self, name, url_id=None, priority=100):
        super().__init__(name, url_id)
        self.priority = priority

class WDDateAttribute(WDAttribute):
    def get_select(self):
        return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name)

    def get_where(self):
        # To remove duplicates, add
        # FILTER NOT EXISTS { ?item p:{name}/psv:{name}/wikibase:timeValue ?{name}bis FILTER (?{name}bis < ?{name}) }
        # this filter is too slow, so the response function ignores duplicate results
        # (see the seen_entities variable)
        return """OPTIONAL { ?item p:{name}/psv:{name} [
    wikibase:timeValue ?{name} ;
    wikibase:timePrecision ?{name}timePrecision ;
    wikibase:timeTimezone ?{name}timeZone ;
    wikibase:timeCalendarModel ?{name}timeCalendar ] . }
    hint:Prior hint:rangeSafe true;""".replace(
            '{name}', self.name
        )

    def get_group_by(self):
        return self.get_select()

    def format_8(self, value, locale):  # pylint: disable=unused-argument
        # precision: less than a year
        return value

    def format_9(self, value, locale):
        year = int(value)
        # precision: year
        if year < 1584:
            if year < 0:
                return str(year - 1)
            return str(year)
        timestamp = isoparse(value)
        return format_date(timestamp, format='yyyy', locale=locale)

    def format_10(self, value, locale):
        # precision: month
        timestamp = isoparse(value)
        return format_date(timestamp, format='MMMM y', locale=locale)

    def format_11(self, value, locale):
        # precision: day
        timestamp = isoparse(value)
        return format_date(timestamp, format='full', locale=locale)

    def format_13(self, value, locale):
        timestamp = isoparse(value)
        # precision: minute
        return (
            get_datetime_format('medium', locale=locale)
            .replace("'", "")
            .replace('{0}', format_time(timestamp, 'full', tzinfo=None, locale=locale))
            .replace('{1}', format_date(timestamp, 'short', locale=locale))
        )

    def format_14(self, value, locale):
        # precision: second.
        return format_datetime(isoparse(value), format='full', locale=locale)

    DATE_FORMAT = {
        '0': ('format_8', 1000000000),
        '1': ('format_8', 100000000),
        '2': ('format_8', 10000000),
        '3': ('format_8', 1000000),
        '4': ('format_8', 100000),
        '5': ('format_8', 10000),
        '6': ('format_8', 1000),
        '7': ('format_8', 100),
        '8': ('format_8', 10),
        '9': ('format_9', 1),  # year
        '10': ('format_10', 1),  # month
        '11': ('format_11', 0),  # day
        '12': ('format_13', 0),  # hour (not supported by babel, display minute)
        '13': ('format_13', 0),  # minute
        '14': ('format_14', 0),  # second
    }
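
    # Reading DATE_FORMAT above: the key is Wikidata's timePrecision (9=year,
    # 10=month, 11=day, 13=minute, 14=second, smaller values are coarser); the
    # second tuple element is the threshold used by get_str() below -- when it
    # is >= 1, only the year part of the ISO value is kept before formatting.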

    def get_str(self, result, language):
        value = result.get(self.name)
        if value == '' or value is None:
            return None
        precision = result.get(self.name + 'timePrecision')
        date_format = WDDateAttribute.DATE_FORMAT.get(precision)
        if date_format is not None:
            format_method = getattr(self, date_format[0])
            precision = date_format[1]
            try:
                if precision >= 1:
                    t = value.split('-')
                    if value.startswith('-'):
                        value = '-' + t[1]
                    else:
                        value = t[0]
                return format_method(value, language)
            except Exception:  # pylint: disable=broad-except
                return value
        return value

def debug_explain_wikidata_query(query, method='GET'):
    if method == 'GET':
        http_response = get(SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query}), headers=get_headers())
    else:
        http_response = post(SPARQL_EXPLAIN_URL, data={'query': query}, headers=get_headers())
    http_response.raise_for_status()
    return http_response.content

def init(engine_settings=None):  # pylint: disable=unused-argument
    # WIKIDATA_PROPERTIES : add unit symbols
    for k, v in WIKIDATA_UNITS.items():
        WIKIDATA_PROPERTIES[k] = v['symbol']

    # WIKIDATA_PROPERTIES : add property labels
    wikidata_property_names = []
    for attribute in get_attributes('en'):
        if type(attribute) in (WDAttribute, WDAmountAttribute, WDURLAttribute, WDDateAttribute, WDLabelAttribute):
            if attribute.name not in WIKIDATA_PROPERTIES:
                wikidata_property_names.append("wd:" + attribute.name)
    query = QUERY_PROPERTY_NAMES.replace('%ATTRIBUTES%', " ".join(wikidata_property_names))
    jsonresponse = send_wikidata_query(query)
    for result in jsonresponse.get('results', {}).get('bindings', []):
        name = result['name']['value']
        lang = result['name']['xml:lang']
        entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
        WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
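
# After init() has run, WIKIDATA_PROPERTIES mixes unit symbols keyed by entity
# id with labels keyed by (entity id, language), e.g. (values hypothetical):
#   WIKIDATA_PROPERTIES['Q11573'] == 'm'                  # metre, from WIKIDATA_UNITS
#   WIKIDATA_PROPERTIES[('P1082', 'en')] == 'Population'  # from the SPARQL labels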

def fetch_traits(engine_traits: EngineTraits):
    """Uses languages evaluated from :py:obj:`wikipedia.fetch_wikimedia_traits
    <searx.engines.wikipedia.fetch_wikimedia_traits>` and removes

    - ``traits.custom['wiki_netloc']``: wikidata does not have the per-language
      net-locations of the wikipedias

    - ``traits.custom['WIKIPEDIA_LANGUAGES']``: the list of all Wikipedia
      languages is not used by the wikidata engine

    """
    fetch_wikimedia_traits(engine_traits)
    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []