mhtml.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. import io
  2. import quopri
  3. import re
  4. import uuid
  5. from .fragment import FragmentFD
  6. from ..compat import imghdr
  7. from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin
  8. from ..version import __version__ as YT_DLP_VERSION
  9. class MhtmlFD(FragmentFD):
  10. _STYLESHEET = """\
  11. html, body {
  12. margin: 0;
  13. padding: 0;
  14. height: 100vh;
  15. }
  16. html {
  17. overflow-y: scroll;
  18. scroll-snap-type: y mandatory;
  19. }
  20. body {
  21. scroll-snap-type: y mandatory;
  22. display: flex;
  23. flex-flow: column;
  24. }
  25. body > figure {
  26. max-width: 100vw;
  27. max-height: 100vh;
  28. scroll-snap-align: center;
  29. }
  30. body > figure > figcaption {
  31. text-align: center;
  32. height: 2.5em;
  33. }
  34. body > figure > img {
  35. display: block;
  36. margin: auto;
  37. max-width: 100%;
  38. max-height: calc(100vh - 5em);
  39. }
  40. """
  41. _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
  42. _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)
  43. @staticmethod
  44. def _escape_mime(s):
  45. return '=?utf-8?Q?' + (b''.join(
  46. bytes((b,)) if b >= 0x20 else b'=%02X' % b
  47. for b in quopri.encodestring(s.encode(), header=True)
  48. )).decode('us-ascii') + '?='
  49. def _gen_cid(self, i, fragment, frag_boundary):
  50. return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)
  51. def _gen_stub(self, *, fragments, frag_boundary, title):
  52. output = io.StringIO()
  53. output.write((
  54. '<!DOCTYPE html>'
  55. '<html>'
  56. '<head>'
  57. '' '<meta name="generator" content="yt-dlp {version}">'
  58. '' '<title>{title}</title>'
  59. '' '<style>{styles}</style>'
  60. '<body>'
  61. ).format(
  62. version=escapeHTML(YT_DLP_VERSION),
  63. styles=self._STYLESHEET,
  64. title=escapeHTML(title)
  65. ))
  66. t0 = 0
  67. for i, frag in enumerate(fragments):
  68. output.write('<figure>')
  69. try:
  70. t1 = t0 + frag['duration']
  71. output.write((
  72. '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
  73. ).format(
  74. num=i + 1,
  75. t0=srt_subtitles_timecode(t0),
  76. t1=srt_subtitles_timecode(t1),
  77. duration=formatSeconds(frag['duration'], msec=True)
  78. ))
  79. except (KeyError, ValueError, TypeError):
  80. t1 = None
  81. output.write((
  82. '<figcaption>Slide #{num}</figcaption>'
  83. ).format(num=i + 1))
  84. output.write('<img src="cid:{cid}">'.format(
  85. cid=self._gen_cid(i, frag, frag_boundary)))
  86. output.write('</figure>')
  87. t0 = t1
  88. return output.getvalue()
  89. def real_download(self, filename, info_dict):
  90. fragment_base_url = info_dict.get('fragment_base_url')
  91. fragments = info_dict['fragments'][:1] if self.params.get(
  92. 'test', False) else info_dict['fragments']
  93. title = info_dict.get('title', info_dict['format_id'])
  94. origin = info_dict.get('webpage_url', info_dict['url'])
  95. ctx = {
  96. 'filename': filename,
  97. 'total_frags': len(fragments),
  98. }
  99. self._prepare_and_start_frag_download(ctx, info_dict)
  100. extra_state = ctx.setdefault('extra_state', {
  101. 'header_written': False,
  102. 'mime_boundary': str(uuid.uuid4()).replace('-', ''),
  103. })
  104. frag_boundary = extra_state['mime_boundary']
  105. if not extra_state['header_written']:
  106. stub = self._gen_stub(
  107. fragments=fragments,
  108. frag_boundary=frag_boundary,
  109. title=title
  110. )
  111. ctx['dest_stream'].write((
  112. 'MIME-Version: 1.0\r\n'
  113. 'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
  114. 'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
  115. 'Subject: {title}\r\n'
  116. 'Content-type: multipart/related; '
  117. '' 'boundary="{boundary}"; '
  118. '' 'type="text/html"\r\n'
  119. 'X.yt-dlp.Origin: {origin}\r\n'
  120. '\r\n'
  121. '--{boundary}\r\n'
  122. 'Content-Type: text/html; charset=utf-8\r\n'
  123. 'Content-Length: {length}\r\n'
  124. '\r\n'
  125. '{stub}\r\n'
  126. ).format(
  127. origin=origin,
  128. boundary=frag_boundary,
  129. length=len(stub),
  130. title=self._escape_mime(title),
  131. stub=stub
  132. ).encode())
  133. extra_state['header_written'] = True
  134. for i, fragment in enumerate(fragments):
  135. if (i + 1) <= ctx['fragment_index']:
  136. continue
  137. fragment_url = fragment.get('url')
  138. if not fragment_url:
  139. assert fragment_base_url
  140. fragment_url = urljoin(fragment_base_url, fragment['path'])
  141. success = self._download_fragment(ctx, fragment_url, info_dict)
  142. if not success:
  143. continue
  144. frag_content = self._read_fragment(ctx)
  145. frag_header = io.BytesIO()
  146. frag_header.write(
  147. b'--%b\r\n' % frag_boundary.encode('us-ascii'))
  148. frag_header.write(
  149. b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
  150. frag_header.write(
  151. b'Content-type: %b\r\n' % f'image/{imghdr.what(h=frag_content) or "jpeg"}'.encode())
  152. frag_header.write(
  153. b'Content-length: %u\r\n' % len(frag_content))
  154. frag_header.write(
  155. b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
  156. frag_header.write(
  157. b'X.yt-dlp.Duration: %f\r\n' % fragment['duration'])
  158. frag_header.write(b'\r\n')
  159. self._append_fragment(
  160. ctx, frag_header.getvalue() + frag_content + b'\r\n')
  161. ctx['dest_stream'].write(
  162. b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
  163. return self._finish_frag_download(ctx, info_dict)