ina.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. INA (Videos)
  4. """
  5. from html import unescape
  6. from urllib.parse import urlencode
  7. from lxml import html
  8. from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
  9. # about
  10. about = {
  11. "website": 'https://www.ina.fr/',
  12. "wikidata_id": 'Q1665109',
  13. "official_api_documentation": None,
  14. "use_official_api": False,
  15. "require_api_key": False,
  16. "results": 'HTML',
  17. "language": 'fr',
  18. }
  19. # engine dependent config
  20. categories = ['videos']
  21. paging = True
  22. page_size = 12
  23. # search-url
  24. base_url = 'https://www.ina.fr'
  25. search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'
  26. # specific xpath variables
  27. results_xpath = '//div[@id="searchHits"]/div'
  28. url_xpath = './/a/@href'
  29. title_xpath = './/div[contains(@class,"title-bloc-small")]'
  30. content_xpath = './/div[contains(@class,"sous-titre-fonction")]'
  31. thumbnail_xpath = './/img/@data-src'
  32. publishedDate_xpath = './/div[contains(@class,"dateAgenda")]'
  33. # do search-request
  34. def request(query, params):
  35. params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query}))
  36. return params
  37. # get response from search-request
  38. def response(resp):
  39. results = []
  40. # we get html in a JSON container...
  41. dom = html.fromstring(resp.text)
  42. # parse results
  43. for result in eval_xpath_list(dom, results_xpath):
  44. url_relative = eval_xpath_getindex(result, url_xpath, 0)
  45. url = base_url + url_relative
  46. title = unescape(extract_text(eval_xpath(result, title_xpath)))
  47. thumbnail = extract_text(eval_xpath(result, thumbnail_xpath))
  48. content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text(
  49. eval_xpath(result, content_xpath)
  50. )
  51. # append result
  52. results.append(
  53. {
  54. 'url': url,
  55. 'title': title,
  56. 'content': content,
  57. 'template': 'videos.html',
  58. 'thumbnail': thumbnail,
  59. }
  60. )
  61. # return results
  62. return results