rtve.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. import base64
  2. import io
  3. import struct
  4. from .common import InfoExtractor
  5. from ..compat import compat_b64decode
  6. from ..utils import (
  7. ExtractorError,
  8. determine_ext,
  9. float_or_none,
  10. qualities,
  11. remove_end,
  12. remove_start,
  13. try_get,
  14. )
  15. class RTVEALaCartaIE(InfoExtractor):
  16. IE_NAME = 'rtve.es:alacarta'
  17. IE_DESC = 'RTVE a la carta'
  18. _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)'
  19. _TESTS = [{
  20. 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
  21. 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
  22. 'info_dict': {
  23. 'id': '2491869',
  24. 'ext': 'mp4',
  25. 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
  26. 'duration': 5024.566,
  27. 'series': 'Balonmano',
  28. },
  29. 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  30. }, {
  31. 'note': 'Live stream',
  32. 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/',
  33. 'info_dict': {
  34. 'id': '1694255',
  35. 'ext': 'mp4',
  36. 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  37. 'is_live': True,
  38. },
  39. 'params': {
  40. 'skip_download': 'live stream',
  41. },
  42. }, {
  43. 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/',
  44. 'md5': 'd850f3c8731ea53952ebab489cf81cbf',
  45. 'info_dict': {
  46. 'id': '4236788',
  47. 'ext': 'mp4',
  48. 'title': 'Servir y proteger - Capítulo 104',
  49. 'duration': 3222.0,
  50. },
  51. 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  52. }, {
  53. 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',
  54. 'only_matching': True,
  55. }, {
  56. 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/',
  57. 'only_matching': True,
  58. }]
  59. def _real_initialize(self):
  60. user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode('utf-8')).decode('utf-8')
  61. self._manager = self._download_json(
  62. 'http://www.rtve.es/odin/loki/' + user_agent_b64,
  63. None, 'Fetching manager info')['manager']
  64. @staticmethod
  65. def _decrypt_url(png):
  66. encrypted_data = io.BytesIO(compat_b64decode(png)[8:])
  67. while True:
  68. length = struct.unpack('!I', encrypted_data.read(4))[0]
  69. chunk_type = encrypted_data.read(4)
  70. if chunk_type == b'IEND':
  71. break
  72. data = encrypted_data.read(length)
  73. if chunk_type == b'tEXt':
  74. alphabet_data, text = data.split(b'\0')
  75. quality, url_data = text.split(b'%%')
  76. alphabet = []
  77. e = 0
  78. d = 0
  79. for l in alphabet_data.decode('iso-8859-1'):
  80. if d == 0:
  81. alphabet.append(l)
  82. d = e = (e + 1) % 4
  83. else:
  84. d -= 1
  85. url = ''
  86. f = 0
  87. e = 3
  88. b = 1
  89. for letter in url_data.decode('iso-8859-1'):
  90. if f == 0:
  91. l = int(letter) * 10
  92. f = 1
  93. else:
  94. if e == 0:
  95. l += int(letter)
  96. url += alphabet[l]
  97. e = (b + 3) % 4
  98. f = 0
  99. b += 1
  100. else:
  101. e -= 1
  102. yield quality.decode(), url
  103. encrypted_data.read(4) # CRC
  104. def _extract_png_formats(self, video_id):
  105. png = self._download_webpage(
  106. 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id),
  107. video_id, 'Downloading url information', query={'q': 'v2'})
  108. q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL'])
  109. formats = []
  110. for quality, video_url in self._decrypt_url(png):
  111. ext = determine_ext(video_url)
  112. if ext == 'm3u8':
  113. formats.extend(self._extract_m3u8_formats(
  114. video_url, video_id, 'mp4', 'm3u8_native',
  115. m3u8_id='hls', fatal=False))
  116. elif ext == 'mpd':
  117. formats.extend(self._extract_mpd_formats(
  118. video_url, video_id, 'dash', fatal=False))
  119. else:
  120. formats.append({
  121. 'format_id': quality,
  122. 'quality': q(quality),
  123. 'url': video_url,
  124. })
  125. return formats
  126. def _real_extract(self, url):
  127. video_id = self._match_id(url)
  128. info = self._download_json(
  129. 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
  130. video_id)['page']['items'][0]
  131. if info['state'] == 'DESPU':
  132. raise ExtractorError('The video is no longer available', expected=True)
  133. title = info['title'].strip()
  134. formats = self._extract_png_formats(video_id)
  135. subtitles = None
  136. sbt_file = info.get('sbtFile')
  137. if sbt_file:
  138. subtitles = self.extract_subtitles(video_id, sbt_file)
  139. is_live = info.get('live') is True
  140. return {
  141. 'id': video_id,
  142. 'title': title,
  143. 'formats': formats,
  144. 'thumbnail': info.get('image'),
  145. 'subtitles': subtitles,
  146. 'duration': float_or_none(info.get('duration'), 1000),
  147. 'is_live': is_live,
  148. 'series': info.get('programTitle'),
  149. }
  150. def _get_subtitles(self, video_id, sub_file):
  151. subs = self._download_json(
  152. sub_file + '.json', video_id,
  153. 'Downloading subtitles info')['page']['items']
  154. return dict(
  155. (s['lang'], [{'ext': 'vtt', 'url': s['src']}])
  156. for s in subs)
  157. class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE
  158. IE_NAME = 'rtve.es:audio'
  159. IE_DESC = 'RTVE audio'
  160. _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P<id>[0-9]+)'
  161. _TESTS = [{
  162. 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/',
  163. 'md5': 'ae06d27bff945c4e87a50f89f6ce48ce',
  164. 'info_dict': {
  165. 'id': '5889192',
  166. 'ext': 'mp3',
  167. 'title': 'Códigos informáticos',
  168. 'thumbnail': r're:https?://.+/1598856591583.jpg',
  169. 'duration': 349.440,
  170. 'series': 'A hombros de gigantes',
  171. },
  172. }, {
  173. 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/',
  174. 'md5': '072855ab89a9450e0ba314c717fa5ebc',
  175. 'info_dict': {
  176. 'id': '5791165',
  177. 'ext': 'mp3',
  178. 'title': 'Ignatius Farray',
  179. 'thumbnail': r're:https?://.+/1613243011863.jpg',
  180. 'duration': 3559.559,
  181. 'series': 'En Radio 3'
  182. },
  183. }, {
  184. 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/',
  185. 'md5': '0eadab248cc8dd193fa5765712e84d5c',
  186. 'info_dict': {
  187. 'id': '6082623',
  188. 'ext': 'mp3',
  189. 'title': 'Capítulo 26 y último: La muerte de Victor',
  190. 'thumbnail': r're:https?://.+/1632147445707.jpg',
  191. 'duration': 3174.086,
  192. 'series': 'Frankenstein o el moderno Prometeo'
  193. },
  194. }]
  195. def _extract_png_formats(self, audio_id):
  196. """
  197. This function retrieves media related png thumbnail which obfuscate
  198. valuable information about the media. This information is decrypted
  199. via base class _decrypt_url function providing media quality and
  200. media url
  201. """
  202. png = self._download_webpage(
  203. 'http://www.rtve.es/ztnr/movil/thumbnail/%s/audios/%s.png' %
  204. (self._manager, audio_id),
  205. audio_id, 'Downloading url information', query={'q': 'v2'})
  206. q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL'])
  207. formats = []
  208. for quality, audio_url in self._decrypt_url(png):
  209. ext = determine_ext(audio_url)
  210. if ext == 'm3u8':
  211. formats.extend(self._extract_m3u8_formats(
  212. audio_url, audio_id, 'mp4', 'm3u8_native',
  213. m3u8_id='hls', fatal=False))
  214. elif ext == 'mpd':
  215. formats.extend(self._extract_mpd_formats(
  216. audio_url, audio_id, 'dash', fatal=False))
  217. else:
  218. formats.append({
  219. 'format_id': quality,
  220. 'quality': q(quality),
  221. 'url': audio_url,
  222. })
  223. return formats
  224. def _real_extract(self, url):
  225. audio_id = self._match_id(url)
  226. info = self._download_json(
  227. 'https://www.rtve.es/api/audios/%s.json' % audio_id,
  228. audio_id)['page']['items'][0]
  229. return {
  230. 'id': audio_id,
  231. 'title': info['title'].strip(),
  232. 'thumbnail': info.get('thumbnail'),
  233. 'duration': float_or_none(info.get('duration'), 1000),
  234. 'series': try_get(info, lambda x: x['programInfo']['title']),
  235. 'formats': self._extract_png_formats(audio_id),
  236. }
  237. class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE
  238. IE_NAME = 'rtve.es:infantil'
  239. IE_DESC = 'RTVE infantil'
  240. _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/'
  241. _TESTS = [{
  242. 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/',
  243. 'md5': '5747454717aedf9f9fdf212d1bcfc48d',
  244. 'info_dict': {
  245. 'id': '3040283',
  246. 'ext': 'mp4',
  247. 'title': 'Maneras de vivir',
  248. 'thumbnail': r're:https?://.+/1426182947956\.JPG',
  249. 'duration': 357.958,
  250. },
  251. 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  252. }]
  253. class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE
  254. IE_NAME = 'rtve.es:live'
  255. IE_DESC = 'RTVE.es live streams'
  256. _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
  257. _TESTS = [{
  258. 'url': 'http://www.rtve.es/directo/la-1/',
  259. 'info_dict': {
  260. 'id': 'la-1',
  261. 'ext': 'mp4',
  262. 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  263. },
  264. 'params': {
  265. 'skip_download': 'live stream',
  266. }
  267. }]
  268. def _real_extract(self, url):
  269. mobj = self._match_valid_url(url)
  270. video_id = mobj.group('id')
  271. webpage = self._download_webpage(url, video_id)
  272. title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
  273. title = remove_start(title, 'Estoy viendo ')
  274. vidplayer_id = self._search_regex(
  275. (r'playerId=player([0-9]+)',
  276. r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)',
  277. r'data-id=["\'](\d+)'),
  278. webpage, 'internal video ID')
  279. return {
  280. 'id': video_id,
  281. 'title': title,
  282. 'formats': self._extract_png_formats(vidplayer_id),
  283. 'is_live': True,
  284. }
  285. class RTVETelevisionIE(InfoExtractor):
  286. IE_NAME = 'rtve.es:television'
  287. _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml'
  288. _TEST = {
  289. 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml',
  290. 'info_dict': {
  291. 'id': '3069778',
  292. 'ext': 'mp4',
  293. 'title': 'Documentos TV - La revolución del móvil',
  294. 'duration': 3496.948,
  295. },
  296. 'params': {
  297. 'skip_download': True,
  298. },
  299. }
  300. def _real_extract(self, url):
  301. page_id = self._match_id(url)
  302. webpage = self._download_webpage(url, page_id)
  303. alacarta_url = self._search_regex(
  304. r'data-location="alacarta_videos"[^<]+url&quot;:&quot;(http://www\.rtve\.es/alacarta.+?)&',
  305. webpage, 'alacarta url', default=None)
  306. if alacarta_url is None:
  307. raise ExtractorError(
  308. 'The webpage doesn\'t contain any video', expected=True)
  309. return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key())