123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
# SPDX-License-Identifier: AGPL-3.0-or-later
"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
the `MediaWiki Action API`_.  For a `query action`_ all Wikimedia wikis have
endpoints that follow this pattern::

    https://{base_url}/w/api.php?action=query&list=search&format=json

.. note::

   In its current state, this engine is implemented to parse the JSON result
   (`format=json`_) of a search query (`list=search`_).  If you need other
   ``action`` and ``list`` types, ask the SearXNG developers to extend the
   implementation according to your needs.

.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json

Configuration
=============

Request:

- :py:obj:`base_url`
- :py:obj:`search_type`
- :py:obj:`srenablerewrites`
- :py:obj:`srsort`
- :py:obj:`srprop`

Implementations
===============

"""
- from __future__ import annotations
- from typing import TYPE_CHECKING
- from datetime import datetime
- from urllib.parse import urlencode, quote
- from searx.utils import html_to_text
- from searx.enginelib.traits import EngineTraits
- if TYPE_CHECKING:
- import logging
- logger: logging.Logger
- traits: EngineTraits
# about: static metadata describing this engine
about = {
    # generic engine: no single website / wikidata entry is set here
    'website': None,
    'wikidata_id': None,
    'official_api_documentation': 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
    'use_official_api': True,
    'require_api_key': False,
    'results': 'JSON',
}
# engine dependent config
categories = ['general']
paging = True
number_of_results = 5

search_type: str = 'nearmatch'
"""Which type of search to perform.  One of the following values: ``nearmatch``,
``text`` or ``title``.

See ``srwhat`` argument in `list=search`_ documentation.
"""

srenablerewrites: bool = True
"""Enable internal query rewriting (Type: boolean).  Some search backends can
rewrite the query into another which is thought to provide better results, for
instance by correcting spelling errors.

See ``srenablerewrites`` argument in `list=search`_ documentation.
"""

srsort: str = 'relevance'
"""Set the sort order of returned results.  One of the following values:
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
``none``, ``random``, ``relevance``, ``user_random``.

See ``srsort`` argument in `list=search`_ documentation.
"""

srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
"""Which properties to return.

See ``srprop`` argument in `list=search`_ documentation.
"""

base_url: str = 'https://{language}.wikipedia.org/'
"""Base URL of the Wikimedia wiki.

``{language}``:
  ISO 639-1 language code (en, de, fr ..) of the search language.
"""

api_path: str = 'w/api.php'
"""The path the PHP api is listening on.

The default path should work fine usually.
"""

timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
"""The longhand version of MediaWiki time strings."""
def request(query, params):
    """Build the ``list=search`` API request URL for *query*.

    The search language is normalised (``all`` becomes ``en``, anything after
    the first ``-`` is dropped) and written back to ``params['language']``,
    since :py:func:`response` reads it again to build the result links.  The
    assembled URL is stored in ``params['url']`` and *params* is returned.
    """
    lang = params['language']
    params['language'] = 'en' if lang == 'all' else lang.split('-')[0]

    # base_url may contain a ``{language}`` placeholder, so the endpoint is
    # assembled first and the language substituted afterwards
    endpoint = (base_url.rstrip('/') + '/' + api_path + '?').format(language=params['language'])

    args = {
        'action': 'query',
        'list': 'search',
        'format': 'json',
        'srsearch': query,
        # pageno is 1-based, sroffset is 0-based
        'sroffset': (params['pageno'] - 1) * number_of_results,
        'srlimit': number_of_results,
        'srwhat': search_type,
        'srprop': srprop,
        'srsort': srsort,
    }
    if srenablerewrites:
        args['srenablerewrites'] = '1'

    params['url'] = endpoint + urlencode(args)
    return params
- # get response from search-request
def response(resp):
    """Turn the JSON answer of a ``list=search`` query into result items.

    Each hit becomes a dict with ``url``, ``title``, ``content`` and
    ``metadata``; ``publishedDate`` is added when the hit has a timestamp.
    Hits whose snippet starts with ``#REDIRECT`` are skipped.  An empty list is
    returned when the answer contains no search hits.
    """
    hits = resp.json().get('query', {}).get('search')
    if not hits:
        return []

    # Normalise the trailing slash the same way request() does, so a base_url
    # configured without it does not yield "https://hostwiki/Title".
    wiki_url = base_url.format(language=resp.search_params['language']).rstrip('/') + '/wiki/'

    results = []
    for hit in hits:
        if hit.get('snippet', '').startswith('#REDIRECT'):
            continue

        title = hit['title']
        url = wiki_url + quote(title.replace(' ', '_').encode())

        sectiontitle = hit.get('sectiontitle')
        if sectiontitle:
            # in case of sectiontitle create a link to the section in the wiki page
            url += '#' + quote(sectiontitle.replace(' ', '_').encode())
            title += ' / ' + sectiontitle

        item = {
            'url': url,
            'title': title,
            'content': html_to_text(hit.get('snippet', '')),
            'metadata': html_to_text(hit.get('categorysnippet', '')),
        }

        timestamp = hit.get('timestamp')
        if timestamp:
            item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)

        results.append(item)

    return results
|