stackoverflow.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. Stackoverflow (IT)
  4. """
  5. from urllib.parse import urlencode, urljoin, urlparse
  6. from lxml import html
  7. from searx.utils import extract_text
  8. from searx.exceptions import SearxEngineCaptchaException
  # Engine metadata: site, Wikidata id, API documentation link and result format.
  about = {
      "website": "https://stackoverflow.com/",
      "wikidata_id": "Q549037",
      "official_api_documentation": "https://api.stackexchange.com/docs",
      "use_official_api": False,
      "require_api_key": False,
      "results": "HTML",
  }

  # Engine-dependent configuration.
  categories = ["it"]
  paging = True

  # Base URL and the search URL template; the template keeps the ``{query}``
  # and ``{pageno}`` placeholders so they can be filled in per request.
  url = "https://stackoverflow.com/"
  search_url = url + "search?{query}&page={pageno}"

  # XPath expressions applied to the result page.
  results_xpath = '//div[contains(@class,"question-summary")]'
  # Union of two alternative locations for the title anchor.
  link_xpath = (
      './/div[@class="result-link"]//a'
      '|.//div[@class="summary"]//h3//a'
  )
  content_xpath = './/div[@class="excerpt"]'
  # build the search request
  def request(query, params):
      """Fill ``params['url']`` with the search URL for *query*.

      The query is URL-encoded as the ``q`` parameter and the page number is
      read from ``params['pageno']``. The same ``params`` dict is returned
      after being updated in place.
      """
      encoded_query = urlencode({'q': query})
      page_number = params['pageno']
      params['url'] = search_url.format(query=encoded_query, pageno=page_number)
      return params
  # get response from search-request
  def response(resp):
      """Parse a search result page into a list of result dicts.

      Args:
          resp: Response object; its ``url`` and ``text`` attributes are read.

      Returns:
          A list of dicts with the keys ``url``, ``title`` and ``content``,
          one for each result element that contains a link with an ``href``.

      Raises:
          SearxEngineCaptchaException: If the path of ``resp.url`` starts
              with ``/nocaptcha``.
      """
      resp_url = urlparse(resp.url)
      # NOTE(review): presumably the site redirects to /nocaptcha when it
      # wants a captcha solved -- confirm against current site behavior.
      if resp_url.path.startswith('/nocaptcha'):
          raise SearxEngineCaptchaException()

      results = []
      dom = html.fromstring(resp.text)

      # parse results
      for result in dom.xpath(results_xpath):
          links = result.xpath(link_xpath)
          if not links:
              # No title anchor in this element: skip it instead of raising
              # IndexError and losing every other result on the page.
              continue
          link = links[0]

          href = link.attrib.get('href')
          if not href:
              # urljoin(url, None) would return the bare site URL, which is
              # not a meaningful result link.
              continue

          # append result
          results.append({
              'url': urljoin(url, href),
              'title': extract_text(link),
              'content': extract_text(result.xpath(content_xpath)),
          })

      # return results
      return results