hls.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. import binascii
  2. import io
  3. import re
  4. import urllib.parse
  5. from . import get_suitable_downloader
  6. from .external import FFmpegFD
  7. from .fragment import FragmentFD
  8. from .. import webvtt
  9. from ..dependencies import Cryptodome
  10. from ..utils import (
  11. bug_reports_message,
  12. parse_m3u8_attributes,
  13. remove_start,
  14. traverse_obj,
  15. update_url_query,
  16. urljoin,
  17. )
  18. class HlsFD(FragmentFD):
  19. """
  20. Download segments in a m3u8 manifest. External downloaders can take over
  21. the fragment downloads by supporting the 'm3u8_frag_urls' protocol and
  22. re-defining 'supports_manifest' function
  23. """
  24. FD_NAME = 'hlsnative'
  25. @staticmethod
  26. def _has_drm(manifest): # TODO: https://github.com/hypervideo/hypervideo/pull/5039
  27. return bool(re.search('|'.join((
  28. r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
  29. r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.apple\.streamingkeydelivery"', # Apple FairPlay
  30. r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.microsoft\.playready"', # Microsoft PlayReady
  31. r'#EXT-X-FAXS-CM:', # Adobe Flash Access
  32. )), manifest))
  33. @classmethod
  34. def can_download(cls, manifest, info_dict, allow_unplayable_formats=False):
  35. UNSUPPORTED_FEATURES = [
  36. # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
  37. # Live streams heuristic does not always work (e.g. geo restricted to Germany
  38. # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
  39. # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3]
  40. # This heuristic also is not correct since segments may not be appended as well.
  41. # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite
  42. # no segments will definitely be appended to the end of the playlist.
  43. # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of
  44. # # event media playlists [4]
  45. # r'#EXT-X-MAP:', # media initialization [5]
  46. # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
  47. # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
  48. # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
  49. # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
  50. # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5
  51. ]
  52. if not allow_unplayable_formats:
  53. UNSUPPORTED_FEATURES += [
  54. r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1], but not necessarily DRM
  55. ]
  56. def check_results():
  57. yield not info_dict.get('is_live')
  58. for feature in UNSUPPORTED_FEATURES:
  59. yield not re.search(feature, manifest)
  60. if not allow_unplayable_formats:
  61. yield not cls._has_drm(manifest)
  62. return all(check_results())
  def real_download(self, filename, info_dict):
      """Download the HLS media playlist at info_dict['url'] into filename.

      The manifest is fetched and checked with can_download(); when it cannot
      be handled natively the whole download is handed to FFmpegFD. Otherwise
      the playlist is parsed into a list of fragments which are downloaded
      either by an external fragment downloader (if a suitable one is found)
      or by download_and_append_fragments(). WebVTT subtitles ('ext' == 'vtt')
      are always downloaded natively and repacked fragment by fragment.

      @param filename   Output file name ('-' is passed on as to_stdout when
                        selecting an external downloader)
      @param info_dict  Format dict; 'url' and 'ext' are required
      @returns  False if the format is DRM protected or an initialization
                fragment follows media fragments; otherwise the result of
                whichever downloader performed the download
      """
      man_url = info_dict['url']
      self.to_screen(f'[{self.FD_NAME}] Downloading m3u8 manifest')

      urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
      # Relative fragment/key URLs are resolved against the URL reported by the response
      man_url = urlh.url
      s = urlh.read().decode('utf-8', 'ignore')

      can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None
      if can_download:
          has_ffmpeg = FFmpegFD.available()
          # AES-128 encrypted stream, but Cryptodome's AES is not available
          no_crypto = not Cryptodome.AES and '#EXT-X-KEY:METHOD=AES-128' in s
          if no_crypto and has_ffmpeg:
              can_download, message = False, 'The stream has AES-128 encryption and pycryptodome is not available'
          elif no_crypto:
              message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodome are available; '
                         'Decryption will be performed natively, but will be extremely slow')
          elif info_dict.get('extractor_key') == 'Generic' and re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)', s):
              install_ffmpeg = '' if has_ffmpeg else 'install ffmpeg and '
              message = ('Live HLS streams are not supported by the native downloader. If this is a livestream, '
                         f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command')
      if not can_download:
          if self._has_drm(s) and not self.params.get('allow_unplayable_formats'):
              if info_dict.get('has_drm') and self.params.get('test'):
                  self.to_screen(f'[{self.FD_NAME}] This format is DRM protected', skip_eol=True)
              else:
                  self.report_error(
                      'This format is DRM protected; Try selecting another format with --format or '
                      'add --check-formats to automatically fallback to the next best format', tb=False)
              return False
          message = message or 'Unsupported features have been detected'
          fd = FFmpegFD(self.ydl, self.params)
          self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}')
          return fd.real_download(filename, info_dict)
      elif message:
          self.report_warning(message)

      is_webvtt = info_dict['ext'] == 'vtt'
      if is_webvtt:
          real_downloader = None  # Packing the fragments is not currently supported for external downloader
      else:
          real_downloader = get_suitable_downloader(
              info_dict, self.params, None, protocol='m3u8_frag_urls', to_stdout=(filename == '-'))
      if real_downloader and not real_downloader.supports_manifest(s):
          real_downloader = None
      if real_downloader:
          self.to_screen(f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}')

      def is_ad_fragment_start(s):
          """Return whether the playlist line marks the start of ad fragments."""
          return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s
                  or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad'))

      def is_ad_fragment_end(s):
          """Return whether the playlist line marks the end of ad fragments."""
          return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s
                  or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment'))

      fragments = []

      # First pass: only count media and ad fragments for the download context
      media_frags = 0
      ad_frags = 0
      ad_frag_next = False
      for line in s.splitlines():
          line = line.strip()
          if not line:
              continue
          if line.startswith('#'):
              if is_ad_fragment_start(line):
                  ad_frag_next = True
              elif is_ad_fragment_end(line):
                  ad_frag_next = False
              continue
          if ad_frag_next:
              ad_frags += 1
              continue
          media_frags += 1

      ctx = {
          'filename': filename,
          'total_frags': media_frags,
          'ad_frags': ad_frags,
      }

      if real_downloader:
          self._prepare_external_frag_download(ctx)
      else:
          self._prepare_and_start_frag_download(ctx, info_dict)

      # State shared between fragments by the WebVTT packer below
      extra_state = ctx.setdefault('extra_state', {})

      format_index = info_dict.get('format_index')
      extra_query = None
      extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
      if extra_param_to_segment_url:
          extra_query = urllib.parse.parse_qs(extra_param_to_segment_url)
      media_sequence = 0
      decrypt_info = {'METHOD': 'NONE'}
      # A key/IV supplied through info_dict['hls_aes'] overrides the playlist's
      external_aes_key = traverse_obj(info_dict, ('hls_aes', 'key'))
      if external_aes_key:
          external_aes_key = binascii.unhexlify(remove_start(external_aes_key, '0x'))
          assert len(external_aes_key) in (16, 24, 32), 'Invalid length for HLS AES-128 key'
      external_aes_iv = traverse_obj(info_dict, ('hls_aes', 'iv'))
      if external_aes_iv:
          external_aes_iv = binascii.unhexlify(remove_start(external_aes_iv, '0x').zfill(32))
      byte_range = {}
      discontinuity_count = 0
      frag_index = 0
      ad_frag_next = False

      # Second pass: build the fragment list, tracking the key, byte range and
      # media sequence that are in effect for each fragment
      for line in s.splitlines():
          line = line.strip()
          if not line:
              continue

          if not line.startswith('#'):
              if format_index and discontinuity_count != format_index:
                  continue
              if ad_frag_next:
                  continue
              frag_index += 1
              # Already handled according to the context (e.g. a resumed download)
              # NOTE(review): fragments skipped here do not advance media_sequence,
              # which is stored per fragment below - confirm this is intended
              if frag_index <= ctx['fragment_index']:
                  continue
              frag_url = urljoin(man_url, line)
              if extra_query:
                  frag_url = update_url_query(frag_url, extra_query)

              fragments.append({
                  'frag_index': frag_index,
                  'url': frag_url,
                  'decrypt_info': decrypt_info,
                  'byte_range': byte_range,
                  'media_sequence': media_sequence,
              })
              media_sequence += 1

          elif line.startswith('#EXT-X-MAP'):
              if format_index and discontinuity_count != format_index:
                  continue
              if frag_index > 0:
                  self.report_error(
                      'Initialization fragment found after media fragments, unable to download')
                  return False
              frag_index += 1
              map_info = parse_m3u8_attributes(line[11:])
              frag_url = urljoin(man_url, map_info.get('URI'))
              if extra_query:
                  frag_url = update_url_query(frag_url, extra_query)

              if map_info.get('BYTERANGE'):
                  # Format is "<length>[@<offset>]"; without an offset the range
                  # continues from the end of the previous range
                  splitted_byte_range = map_info.get('BYTERANGE').split('@')
                  sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
                  byte_range = {
                      'start': sub_range_start,
                      'end': sub_range_start + int(splitted_byte_range[0]),
                  }

              fragments.append({
                  'frag_index': frag_index,
                  'url': frag_url,
                  'decrypt_info': decrypt_info,
                  'byte_range': byte_range,
                  'media_sequence': media_sequence,
              })
              media_sequence += 1

          elif line.startswith('#EXT-X-KEY'):
              decrypt_url = decrypt_info.get('URI')
              decrypt_info = parse_m3u8_attributes(line[11:])
              if decrypt_info['METHOD'] == 'AES-128':
                  if external_aes_iv:
                      decrypt_info['IV'] = external_aes_iv
                  elif 'IV' in decrypt_info:
                      # Drop the two-character prefix and left-pad to 16 bytes of hex
                      decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32))
                  if external_aes_key:
                      decrypt_info['KEY'] = external_aes_key
                  else:
                      decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI'])
                      if extra_query:
                          decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
                      if decrypt_url != decrypt_info['URI']:
                          # The key URL changed, so no key is known for it yet
                          decrypt_info['KEY'] = None

          elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
              media_sequence = int(line[22:])
          elif line.startswith('#EXT-X-BYTERANGE'):
              splitted_byte_range = line[17:].split('@')
              sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
              byte_range = {
                  'start': sub_range_start,
                  'end': sub_range_start + int(splitted_byte_range[0]),
              }
          elif is_ad_fragment_start(line):
              ad_frag_next = True
          elif is_ad_fragment_end(line):
              ad_frag_next = False
          elif line.startswith('#EXT-X-DISCONTINUITY'):
              discontinuity_count += 1

      # We only download the first fragment during the test
      if self.params.get('test', False):
          # Slicing keeps an empty list empty instead of producing [None]
          fragments = fragments[:1]

      if real_downloader:
          info_dict['fragments'] = fragments
          fd = real_downloader(self.ydl, self.params)
          # TODO: Make progress updates work without hooking twice
          # for ph in self._progress_hooks:
          #     fd.add_progress_hook(ph)
          return fd.real_download(filename, info_dict)

      if is_webvtt:
          def pack_fragment(frag_content, frag_index):
              """Re-time and de-duplicate the cues of one WebVTT fragment.

              Returns the encoded text that is ready to be written for this
              fragment; cues still inside the duplicate window are held back
              in extra_state and emitted later.
              """
              output = io.StringIO()
              adjust = 0
              overflow = False
              mpegts_last = None
              for block in webvtt.parse_fragment(frag_content):
                  if isinstance(block, webvtt.CueBlock):
                      extra_state['webvtt_mpegts_last'] = mpegts_last
                      if overflow:
                          extra_state['webvtt_mpegts_adjust'] += 1
                          overflow = False
                      block.start += adjust
                      block.end += adjust

                      dedup_window = extra_state.setdefault('webvtt_dedup_window', [])

                      ready = []

                      # Index-based walk because entries are deleted while iterating
                      i = 0
                      is_new = True
                      while i < len(dedup_window):
                          wcue = dedup_window[i]
                          wblock = webvtt.CueBlock.from_json(wcue)
                          i += 1
                          if wblock.hinges(block):
                              wcue['end'] = block.end
                              is_new = False
                              continue
                          if wblock == block:
                              is_new = False
                              continue
                          if wblock.end > block.start:
                              continue
                          ready.append(wblock)
                          i -= 1
                          del dedup_window[i]

                      if is_new:
                          dedup_window.append(block.as_json)
                      for block in ready:
                          block.write_into(output)

                      # we only emit cues once they fall out of the duplicate window
                      continue
                  elif isinstance(block, webvtt.Magic):
                      # take care of MPEG PES timestamp overflow
                      if block.mpegts is None:
                          block.mpegts = 0
                      extra_state.setdefault('webvtt_mpegts_adjust', 0)
                      block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33
                      if block.mpegts < extra_state.get('webvtt_mpegts_last', 0):
                          overflow = True
                          block.mpegts += 1 << 33
                      mpegts_last = block.mpegts

                      if frag_index == 1:
                          extra_state['webvtt_mpegts'] = block.mpegts or 0
                          extra_state['webvtt_local'] = block.local or 0
                          # XXX: block.local = block.mpegts = None ?
                      else:
                          if block.mpegts is not None and block.local is not None:
                              adjust = (
                                  (block.mpegts - extra_state.get('webvtt_mpegts', 0))
                                  - (block.local - extra_state.get('webvtt_local', 0))
                              )
                          continue
                  elif isinstance(block, webvtt.HeaderBlock):
                      if frag_index != 1:
                          # XXX: this should probably be silent as well
                          # or verify that all segments contain the same data
                          self.report_warning(bug_reports_message(
                              'Discarding a %s block found in the middle of the stream; '
                              'if the subtitles display incorrectly,'
                              % (type(block).__name__)))
                          continue
                  block.write_into(output)

              return output.getvalue().encode()

          def fin_fragments():
              """Return the encoded cues still held in the duplicate window."""
              dedup_window = extra_state.get('webvtt_dedup_window')
              if not dedup_window:
                  return b''

              output = io.StringIO()
              for cue in dedup_window:
                  webvtt.CueBlock.from_json(cue).write_into(output)

              return output.getvalue().encode()

          # Propagate the download status, as the non-WebVTT branch does
          return self.download_and_append_fragments(
              ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
      else:
          return self.download_and_append_fragments(ctx, fragments, info_dict)