# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Słownik Języka Polskiego
Dictionary of the polish language from PWN (sjp.pwn)
"""
from urllib.parse import quote

from lxml.html import fromstring

from searx import logger
from searx.utils import extract_text
from searx.raise_for_httperror import raise_for_httperror
  10. logger = logger.getChild('sjp engine')
  11. # about
  12. about = {
  13. "website": 'https://sjp.pwn.pl',
  14. "wikidata_id": 'Q55117369',
  15. "official_api_documentation": None,
  16. "use_official_api": False,
  17. "require_api_key": False,
  18. "results": 'HTML',
  19. }
  20. categories = ['general']
  21. paging = False
  22. URL = 'https://sjp.pwn.pl'
  23. SEARCH_URL = URL + '/szukaj/{query}.html'
  24. word_xpath = '//div[@class="query"]'
  25. dict_xpath = ['//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]',
  26. '//div[@class="wyniki sjp-wyniki sjp-anchor"]',
  27. '//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]']
  28. def request(query, params):
  29. params['url'] = SEARCH_URL.format(query=query)
  30. logger.debug(f"query_url --> {params['url']}")
  31. return params
  32. def response(resp):
  33. results = []
  34. raise_for_httperror(resp)
  35. dom = fromstring(resp.text)
  36. word = extract_text(dom.xpath(word_xpath))
  37. definitions = []
  38. for dict_src in dict_xpath:
  39. for src in dom.xpath(dict_src):
  40. src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()
  41. src_defs = []
  42. for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
  43. if def_item.xpath('./div[@class="znacz"]'):
  44. sub_defs = []
  45. for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
  46. def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
  47. sub_defs.append(def_sub_text)
  48. src_defs.append((word, sub_defs))
  49. else:
  50. def_text = extract_text(def_item).strip()
  51. def_link = def_item.xpath('./span/a/@href')
  52. if 'doroszewski' in def_link[0]:
  53. def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
  54. src_defs.append((def_text, ''))
  55. definitions.append((src_text, src_defs))
  56. if not definitions:
  57. return results
  58. infobox = ''
  59. for src in definitions:
  60. infobox += f"<div><small>{src[0]}</small>"
  61. infobox += "<ul>"
  62. for (def_text, sub_def) in src[1]:
  63. infobox += f"<li>{def_text}</li>"
  64. if sub_def:
  65. infobox += "<ol>"
  66. for sub_def_text in sub_def:
  67. infobox += f"<li>{sub_def_text}</li>"
  68. infobox += "</ol>"
  69. infobox += "</ul></div>"
  70. results.append({
  71. 'infobox': word,
  72. 'content': infobox,
  73. })
  74. return results