mediawiki.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
the `MediaWiki Action API`_.  For a `query action`_ all Wikimedia wikis have
endpoints that follow this pattern::

    https://{base_url}/w/api.php?action=query&list=search&format=json

.. note::

   In its current state, this engine is implemented to parse JSON results
   (`format=json`_) from a search query (`list=search`_).  If you need other
   ``action`` and ``list`` types, ask the SearXNG developers to extend the
   implementation according to your needs.

.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json

Configuration
=============

Request:

- :py:obj:`base_url`
- :py:obj:`search_type`
- :py:obj:`srenablerewrites`
- :py:obj:`srsort`
- :py:obj:`srprop`

Implementations
===============

"""
  26. from __future__ import annotations
  27. from typing import TYPE_CHECKING
  28. from datetime import datetime
  29. from urllib.parse import urlencode, quote
  30. from searx.utils import html_to_text
  31. from searx.enginelib.traits import EngineTraits
  32. if TYPE_CHECKING:
  33. import logging
  34. logger: logging.Logger
  35. traits: EngineTraits
  36. # about
  37. about = {
  38. "website": None,
  39. "wikidata_id": None,
  40. "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
  41. "use_official_api": True,
  42. "require_api_key": False,
  43. "results": 'JSON',
  44. }
  45. # engine dependent config
  46. categories = ['general']
  47. paging = True
  48. number_of_results = 5
  49. search_type: str = 'nearmatch'
  50. """Which type of search to perform. One of the following values: ``nearmatch``,
  51. ``text`` or ``title``.
  52. See ``srwhat`` argument in `list=search`_ documentation.
  53. """
  54. srenablerewrites: bool = True
  55. """Enable internal query rewriting (Type: boolean). Some search backends can
  56. rewrite the query into another which is thought to provide better results, for
  57. instance by correcting spelling errors.
  58. See ``srenablerewrites`` argument in `list=search`_ documentation.
  59. """
  60. srsort: str = 'relevance'
  61. """Set the sort order of returned results. One of the following values:
  62. ``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
  63. ``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
  64. ``none``, ``random``, ``relevance``, ``user_random``.
  65. See ``srenablerewrites`` argument in `list=search`_ documentation.
  66. """
  67. srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
  68. """Which properties to return.
  69. See ``srprop`` argument in `list=search`_ documentation.
  70. """
  71. base_url: str = 'https://{language}.wikipedia.org/'
  72. """Base URL of the Wikimedia wiki.
  73. ``{language}``:
  74. ISO 639-1 language code (en, de, fr ..) of the search language.
  75. """
  76. api_path: str = 'w/api.php'
  77. """The path the PHP api is listening on.
  78. The default path should work fine usually.
  79. """
  80. timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
  81. """The longhand version of MediaWiki time strings."""
  82. def request(query, params):
  83. # write search-language back to params, required in response
  84. if params['language'] == 'all':
  85. params['language'] = 'en'
  86. else:
  87. params['language'] = params['language'].split('-')[0]
  88. api_url = f"{base_url.rstrip('/')}/{api_path}?".format(language=params['language'])
  89. offset = (params['pageno'] - 1) * number_of_results
  90. args = {
  91. 'action': 'query',
  92. 'list': 'search',
  93. 'format': 'json',
  94. 'srsearch': query,
  95. 'sroffset': offset,
  96. 'srlimit': number_of_results,
  97. 'srwhat': search_type,
  98. 'srprop': srprop,
  99. 'srsort': srsort,
  100. }
  101. if srenablerewrites:
  102. args['srenablerewrites'] = '1'
  103. params['url'] = api_url + urlencode(args)
  104. return params
  105. # get response from search-request
  106. def response(resp):
  107. results = []
  108. search_results = resp.json()
  109. # return empty array if there are no results
  110. if not search_results.get('query', {}).get('search'):
  111. return []
  112. for result in search_results['query']['search']:
  113. if result.get('snippet', '').startswith('#REDIRECT'):
  114. continue
  115. title = result['title']
  116. sectiontitle = result.get('sectiontitle')
  117. content = html_to_text(result.get('snippet', ''))
  118. metadata = html_to_text(result.get('categorysnippet', ''))
  119. timestamp = result.get('timestamp')
  120. url = (
  121. base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
  122. )
  123. if sectiontitle:
  124. # in case of sectiontitle create a link to the section in the wiki page
  125. url += '#' + quote(sectiontitle.replace(' ', '_').encode())
  126. title += ' / ' + sectiontitle
  127. item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
  128. if timestamp:
  129. item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
  130. results.append(item)
  131. # return results
  132. return results