hls.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. import binascii
  2. import io
  3. import re
  4. import urllib.parse
  5. from . import get_suitable_downloader
  6. from .external import FFmpegFD
  7. from .fragment import FragmentFD
  8. from .. import webvtt
  9. from ..dependencies import Cryptodome_AES
  10. from ..utils import bug_reports_message, parse_m3u8_attributes, update_url_query
  11. class HlsFD(FragmentFD):
  12. """
  13. Download segments in a m3u8 manifest. External downloaders can take over
  14. the fragment downloads by supporting the 'm3u8_frag_urls' protocol and
  15. re-defining 'supports_manifest' function
  16. """
  17. FD_NAME = 'hlsnative'
  18. @staticmethod
  19. def can_download(manifest, info_dict, allow_unplayable_formats=False):
  20. UNSUPPORTED_FEATURES = [
  21. # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
  22. # Live streams heuristic does not always work (e.g. geo restricted to Germany
  23. # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
  24. # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3]
  25. # This heuristic also is not correct since segments may not be appended as well.
  26. # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite
  27. # no segments will definitely be appended to the end of the playlist.
  28. # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of
  29. # # event media playlists [4]
  30. # r'#EXT-X-MAP:', # media initialization [5]
  31. # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
  32. # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
  33. # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
  34. # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
  35. # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5
  36. ]
  37. if not allow_unplayable_formats:
  38. UNSUPPORTED_FEATURES += [
  39. r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1]
  40. ]
  41. def check_results():
  42. yield not info_dict.get('is_live')
  43. for feature in UNSUPPORTED_FEATURES:
  44. yield not re.search(feature, manifest)
  45. return all(check_results())
  46. def real_download(self, filename, info_dict):
  47. man_url = info_dict['url']
  48. self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
  49. urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
  50. man_url = urlh.geturl()
  51. s = urlh.read().decode('utf-8', 'ignore')
  52. can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None
  53. if can_download:
  54. has_ffmpeg = FFmpegFD.available()
  55. no_crypto = not Cryptodome_AES and '#EXT-X-KEY:METHOD=AES-128' in s
  56. if no_crypto and has_ffmpeg:
  57. can_download, message = False, 'The stream has AES-128 encryption and pycryptodome is not available'
  58. elif no_crypto:
  59. message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodome are available; '
  60. 'Decryption will be performed natively, but will be extremely slow')
  61. elif info_dict.get('extractor_key') == 'Generic' and re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)', s):
  62. install_ffmpeg = '' if has_ffmpeg else 'install ffmpeg and '
  63. message = ('Live HLS streams are not supported by the native downloader. If this is a livestream, '
  64. f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command')
  65. if not can_download:
  66. has_drm = re.search('|'.join([
  67. r'#EXT-X-FAXS-CM:', # Adobe Flash Access
  68. r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
  69. ]), s)
  70. if has_drm and not self.params.get('allow_unplayable_formats'):
  71. self.report_error(
  72. 'This video is DRM protected; Try selecting another format with --format or '
  73. 'add --check-formats to automatically fallback to the next best format')
  74. return False
  75. message = message or 'Unsupported features have been detected'
  76. fd = FFmpegFD(self.ydl, self.params)
  77. self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}')
  78. return fd.real_download(filename, info_dict)
  79. elif message:
  80. self.report_warning(message)
  81. is_webvtt = info_dict['ext'] == 'vtt'
  82. if is_webvtt:
  83. real_downloader = None # Packing the fragments is not currently supported for external downloader
  84. else:
  85. real_downloader = get_suitable_downloader(
  86. info_dict, self.params, None, protocol='m3u8_frag_urls', to_stdout=(filename == '-'))
  87. if real_downloader and not real_downloader.supports_manifest(s):
  88. real_downloader = None
  89. if real_downloader:
  90. self.to_screen(f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}')
  91. def is_ad_fragment_start(s):
  92. return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s
  93. or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad'))
  94. def is_ad_fragment_end(s):
  95. return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s
  96. or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment'))
  97. fragments = []
  98. media_frags = 0
  99. ad_frags = 0
  100. ad_frag_next = False
  101. for line in s.splitlines():
  102. line = line.strip()
  103. if not line:
  104. continue
  105. if line.startswith('#'):
  106. if is_ad_fragment_start(line):
  107. ad_frag_next = True
  108. elif is_ad_fragment_end(line):
  109. ad_frag_next = False
  110. continue
  111. if ad_frag_next:
  112. ad_frags += 1
  113. continue
  114. media_frags += 1
  115. ctx = {
  116. 'filename': filename,
  117. 'total_frags': media_frags,
  118. 'ad_frags': ad_frags,
  119. }
  120. if real_downloader:
  121. self._prepare_external_frag_download(ctx)
  122. else:
  123. self._prepare_and_start_frag_download(ctx, info_dict)
  124. extra_state = ctx.setdefault('extra_state', {})
  125. format_index = info_dict.get('format_index')
  126. extra_query = None
  127. extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
  128. if extra_param_to_segment_url:
  129. extra_query = urllib.parse.parse_qs(extra_param_to_segment_url)
  130. i = 0
  131. media_sequence = 0
  132. decrypt_info = {'METHOD': 'NONE'}
  133. byte_range = {}
  134. discontinuity_count = 0
  135. frag_index = 0
  136. ad_frag_next = False
  137. for line in s.splitlines():
  138. line = line.strip()
  139. if line:
  140. if not line.startswith('#'):
  141. if format_index and discontinuity_count != format_index:
  142. continue
  143. if ad_frag_next:
  144. continue
  145. frag_index += 1
  146. if frag_index <= ctx['fragment_index']:
  147. continue
  148. frag_url = (
  149. line
  150. if re.match(r'^https?://', line)
  151. else urllib.parse.urljoin(man_url, line))
  152. if extra_query:
  153. frag_url = update_url_query(frag_url, extra_query)
  154. fragments.append({
  155. 'frag_index': frag_index,
  156. 'url': frag_url,
  157. 'decrypt_info': decrypt_info,
  158. 'byte_range': byte_range,
  159. 'media_sequence': media_sequence,
  160. })
  161. media_sequence += 1
  162. elif line.startswith('#EXT-X-MAP'):
  163. if format_index and discontinuity_count != format_index:
  164. continue
  165. if frag_index > 0:
  166. self.report_error(
  167. 'Initialization fragment found after media fragments, unable to download')
  168. return False
  169. frag_index += 1
  170. map_info = parse_m3u8_attributes(line[11:])
  171. frag_url = (
  172. map_info.get('URI')
  173. if re.match(r'^https?://', map_info.get('URI'))
  174. else urllib.parse.urljoin(man_url, map_info.get('URI')))
  175. if extra_query:
  176. frag_url = update_url_query(frag_url, extra_query)
  177. if map_info.get('BYTERANGE'):
  178. splitted_byte_range = map_info.get('BYTERANGE').split('@')
  179. sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
  180. byte_range = {
  181. 'start': sub_range_start,
  182. 'end': sub_range_start + int(splitted_byte_range[0]),
  183. }
  184. fragments.append({
  185. 'frag_index': frag_index,
  186. 'url': frag_url,
  187. 'decrypt_info': decrypt_info,
  188. 'byte_range': byte_range,
  189. 'media_sequence': media_sequence
  190. })
  191. media_sequence += 1
  192. elif line.startswith('#EXT-X-KEY'):
  193. decrypt_url = decrypt_info.get('URI')
  194. decrypt_info = parse_m3u8_attributes(line[11:])
  195. if decrypt_info['METHOD'] == 'AES-128':
  196. if 'IV' in decrypt_info:
  197. decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32))
  198. if not re.match(r'^https?://', decrypt_info['URI']):
  199. decrypt_info['URI'] = urllib.parse.urljoin(
  200. man_url, decrypt_info['URI'])
  201. if extra_query:
  202. decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
  203. if decrypt_url != decrypt_info['URI']:
  204. decrypt_info['KEY'] = None
  205. elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
  206. media_sequence = int(line[22:])
  207. elif line.startswith('#EXT-X-BYTERANGE'):
  208. splitted_byte_range = line[17:].split('@')
  209. sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
  210. byte_range = {
  211. 'start': sub_range_start,
  212. 'end': sub_range_start + int(splitted_byte_range[0]),
  213. }
  214. elif is_ad_fragment_start(line):
  215. ad_frag_next = True
  216. elif is_ad_fragment_end(line):
  217. ad_frag_next = False
  218. elif line.startswith('#EXT-X-DISCONTINUITY'):
  219. discontinuity_count += 1
  220. i += 1
  221. # We only download the first fragment during the test
  222. if self.params.get('test', False):
  223. fragments = [fragments[0] if fragments else None]
  224. if real_downloader:
  225. info_dict['fragments'] = fragments
  226. fd = real_downloader(self.ydl, self.params)
  227. # TODO: Make progress updates work without hooking twice
  228. # for ph in self._progress_hooks:
  229. # fd.add_progress_hook(ph)
  230. return fd.real_download(filename, info_dict)
  231. if is_webvtt:
  232. def pack_fragment(frag_content, frag_index):
  233. output = io.StringIO()
  234. adjust = 0
  235. overflow = False
  236. mpegts_last = None
  237. for block in webvtt.parse_fragment(frag_content):
  238. if isinstance(block, webvtt.CueBlock):
  239. extra_state['webvtt_mpegts_last'] = mpegts_last
  240. if overflow:
  241. extra_state['webvtt_mpegts_adjust'] += 1
  242. overflow = False
  243. block.start += adjust
  244. block.end += adjust
  245. dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
  246. ready = []
  247. i = 0
  248. is_new = True
  249. while i < len(dedup_window):
  250. wcue = dedup_window[i]
  251. wblock = webvtt.CueBlock.from_json(wcue)
  252. i += 1
  253. if wblock.hinges(block):
  254. wcue['end'] = block.end
  255. is_new = False
  256. continue
  257. if wblock == block:
  258. is_new = False
  259. continue
  260. if wblock.end > block.start:
  261. continue
  262. ready.append(wblock)
  263. i -= 1
  264. del dedup_window[i]
  265. if is_new:
  266. dedup_window.append(block.as_json)
  267. for block in ready:
  268. block.write_into(output)
  269. # we only emit cues once they fall out of the duplicate window
  270. continue
  271. elif isinstance(block, webvtt.Magic):
  272. # take care of MPEG PES timestamp overflow
  273. if block.mpegts is None:
  274. block.mpegts = 0
  275. extra_state.setdefault('webvtt_mpegts_adjust', 0)
  276. block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33
  277. if block.mpegts < extra_state.get('webvtt_mpegts_last', 0):
  278. overflow = True
  279. block.mpegts += 1 << 33
  280. mpegts_last = block.mpegts
  281. if frag_index == 1:
  282. extra_state['webvtt_mpegts'] = block.mpegts or 0
  283. extra_state['webvtt_local'] = block.local or 0
  284. # XXX: block.local = block.mpegts = None ?
  285. else:
  286. if block.mpegts is not None and block.local is not None:
  287. adjust = (
  288. (block.mpegts - extra_state.get('webvtt_mpegts', 0))
  289. - (block.local - extra_state.get('webvtt_local', 0))
  290. )
  291. continue
  292. elif isinstance(block, webvtt.HeaderBlock):
  293. if frag_index != 1:
  294. # XXX: this should probably be silent as well
  295. # or verify that all segments contain the same data
  296. self.report_warning(bug_reports_message(
  297. 'Discarding a %s block found in the middle of the stream; '
  298. 'if the subtitles display incorrectly,'
  299. % (type(block).__name__)))
  300. continue
  301. block.write_into(output)
  302. return output.getvalue().encode()
  303. def fin_fragments():
  304. dedup_window = extra_state.get('webvtt_dedup_window')
  305. if not dedup_window:
  306. return b''
  307. output = io.StringIO()
  308. for cue in dedup_window:
  309. webvtt.CueBlock.from_json(cue).write_into(output)
  310. return output.getvalue().encode()
  311. self.download_and_append_fragments(
  312. ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
  313. else:
  314. return self.download_and_append_fragments(ctx, fragments, info_dict)