  1. """
  2. Wikipedia (Web)
  3. @website https://{language}.wikipedia.org
  4. @provide-api yes
  5. @using-api yes
  6. @results JSON
  7. @stable yes
  8. @parse url, infobox
  9. """
  10. from json import loads
  11. from lxml.html import fromstring
  12. from searx.url_utils import quote, urlencode
  13. # search-url
  14. base_url = u'https://{language}.wikipedia.org/'
  15. search_url = base_url + u'w/api.php?'\
  16. 'action=query'\
  17. '&format=json'\
  18. '&{query}'\
  19. '&prop=extracts|pageimages'\
  20. '&exintro'\
  21. '&explaintext'\
  22. '&pithumbsize=300'\
  23. '&redirects'
  24. supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
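
# Example (illustrative, not part of the original file): a request for "paris"
# on the English Wikipedia is sent to a URL of roughly this shape:
#   https://en.wikipedia.org/w/api.php?action=query&format=json
#       &titles=paris%7CParis&prop=extracts|pageimages&exintro&explaintext
#       &pithumbsize=300&redirects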


# set language in base_url
def url_lang(lang):
    lang = lang.split('-')[0]
    if lang == 'all' or lang not in supported_languages:
        language = 'en'
    else:
        language = lang

    return language
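
# Illustrative behaviour of url_lang(), assuming 'fr' is listed in
# supported_languages: url_lang('fr-FR') -> 'fr', url_lang('all') -> 'en',
# and any unknown language code also falls back to 'en'.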


# do search-request
def request(query, params):
    if query.islower():
        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')

    params['url'] = search_url.format(query=urlencode({'titles': query}),
                                      language=url_lang(params['language']))

    return params
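
# Illustrative example (not from the original file): for query b'paris' with
# params['language'] == 'en-US', the all-lower-case query is expanded to
# 'paris|Paris' so both capitalisations are looked up in one request, and
# params['url'] ends up pointing at en.wikipedia.org with titles=paris%7CParis.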


# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    first_paragraph = None

    failed_attempts = 0
    for paragraph in content.split('\n'):

        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)

        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break

        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return first_paragraph
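
# Illustrative example: a short note such as "Paris may refer to:" is rejected
# because it is too short (and more than three rejected lines abort the search),
# while a lead paragraph of 200+ characters, or a 150+ character paragraph that
# starts with the title when a thumbnail is present, is returned as the summary.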


# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.text)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break

    if int(article_id) < 0:
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    extract = page.get('extract')

    summary = extract_first_paragraph(extract, title, image)

    # link to wikipedia article
    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results
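
# Sketch of the API JSON consumed above (illustrative, heavily trimmed):
# {"query": {"pages": {
#     "-1":    {"title": "...", "missing": ""},
#     "22989": {"title": "Paris",
#               "extract": "Paris is the capital of France. ...",
#               "thumbnail": {"source": "https://upload.wikimedia.org/..."}}}}}
# Missing pages get negative ids, which is why only a positive id is accepted.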


# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return supported_languages
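
# Illustrative shape of one resulting entry (the article count is made up):
# supported_languages['en'] == {"name": "English", "english_name": "English",
#                               "articles": 6000000}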