  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. INA (Videos)
  4. """
  5. from json import loads
  6. from html import unescape
  7. from urllib.parse import urlencode
  8. from lxml import html
  9. from dateutil import parser
  10. from searx.utils import extract_text
  11. # about
  12. about = {
  13. "website": 'https://www.ina.fr/',
  14. "wikidata_id": 'Q1665109',
  15. "official_api_documentation": None,
  16. "use_official_api": False,
  17. "require_api_key": False,
  18. "results": 'HTML',
  19. }
  20. # engine dependent config
  21. categories = ['videos']
  22. paging = True
  23. page_size = 48
  24. # search-url
  25. base_url = 'https://www.ina.fr'
  26. search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'
  27. # specific xpath variables
  28. results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]'
  29. url_xpath = './/a/@href'
  30. title_xpath = './/h3[@class="h3--title media-heading"]'
  31. thumbnail_xpath = './/img/@src'
  32. publishedDate_xpath = './/span[@class="broadcast"]'
  33. content_xpath = './/p[@class="media-body__summary"]'
  34. # do search-request
  35. def request(query, params):
  36. params['url'] = search_url.format(ps=page_size,
  37. start=params['pageno'] * page_size,
  38. query=urlencode({'q': query}))
  39. return params
  40. # get response from search-request
  41. def response(resp):
  42. results = []
  43. # we get html in a JSON container...
  44. response = loads(resp.text)
  45. dom = html.fromstring(response)
  46. # parse results
  47. for result in dom.xpath(results_xpath):
  48. videoid = result.xpath(url_xpath)[0]
  49. url = base_url + videoid
  50. title = unescape(extract_text(result.xpath(title_xpath)))
  51. try:
  52. thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
  53. except:
  54. thumbnail = ''
  55. if thumbnail and thumbnail[0] == '/':
  56. thumbnail = base_url + thumbnail
  57. d = extract_text(result.xpath(publishedDate_xpath)[0])
  58. d = d.split('/')
  59. # force ISO date to avoid wrong parsing
  60. d = "%s-%s-%s" % (d[2], d[1], d[0])
  61. publishedDate = parser.parse(d)
  62. content = extract_text(result.xpath(content_xpath))
  63. # append result
  64. results.append({'url': url,
  65. 'title': title,
  66. 'content': content,
  67. 'template': 'videos.html',
  68. 'publishedDate': publishedDate,
  69. 'thumbnail': thumbnail})
  70. # return results
  71. return results