update_wikidata_units.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. #!/usr/bin/env python
  2. import json
  3. import collections
  4. # set path
  5. from os.path import join
  6. from searx import searx_dir
  7. from searx.engines.wikidata import send_wikidata_query
  # SPARQL query sent to Wikidata by get_data().
  #
  # The response contains duplicate ?item rows, one for each different ?symbol.
  # "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
  # even if an ?item has several ?symbol values of the same rank
  # (get_data() keeps only the first ?symbol returned for each ?item).
  # See:
  # * https://www.wikidata.org/wiki/Help:Ranking
  # * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
  # * https://w.wiki/32BT
  # For an example, see the result for https://www.wikidata.org/wiki/Q11582:
  # there are multiple symbols with the same rank.
  #
  # NOTE: the constant is spelled "SARQL" (not "SPARQL"); the name is kept
  # as-is because get_data() refers to it.
  SARQL_REQUEST = """
  SELECT DISTINCT ?item ?symbol
  WHERE
  {
  ?item wdt:P31/wdt:P279 wd:Q47574 .
  ?item p:P5061 ?symbolP .
  ?symbolP ps:P5061 ?symbol ;
  wikibase:rank ?rank .
  FILTER(LANG(?symbol) = "en").
  }
  ORDER BY ?item DESC(?rank) ?symbol
  """
  def get_data():
      """Query Wikidata for unit symbols and map each entity id to one symbol.

      Sends SARQL_REQUEST through send_wikidata_query() and walks the
      returned bindings in response order.  The key is the ?item value with
      the 'http://www.wikidata.org/entity/' text removed; the value is the
      ?symbol value.

      Returns:
          collections.OrderedDict: entity id -> symbol.  When an entity id
          appears in several bindings, only the first symbol is kept.
      """
      symbols_by_entity = collections.OrderedDict()
      bindings = send_wikidata_query(SARQL_REQUEST)['results']['bindings']
      for binding in bindings:
          entity_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
          symbol = binding['symbol']['value']
          # setdefault() only stores the symbol when the entity id is not
          # present yet, so later duplicates are ignored.
          symbols_by_entity.setdefault(entity_id, symbol)
      return symbols_by_entity
  def get_wikidata_units_filename():
      """Return the path of "wikidata_units.json" in the "data" directory under searx_dir."""
      data_dir = join(searx_dir, "data")
      return join(data_dir, "wikidata_units.json")
  # Regenerate the units file: fetch the entity id -> symbol mapping with
  # get_data() and write it as indented JSON to get_wikidata_units_filename().
  #
  # The encoding is given explicitly because ensure_ascii=False makes
  # json.dump() write non-ASCII characters as-is; without it open() uses the
  # locale's preferred encoding, which may be unable to encode them.
  with open(get_wikidata_units_filename(), 'w', encoding='utf-8') as f:
      json.dump(get_data(), f, indent=4, ensure_ascii=False)