"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

The regular expressions are based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""
import io
import re

from .utils import int_or_none, timetuple_from_msec
  12. class _MatchParser:
  13. """
  14. An object that maintains the current parsing position and allows
  15. conveniently advancing it as syntax elements are successfully parsed.
  16. """
  17. def __init__(self, string):
  18. self._data = string
  19. self._pos = 0
  20. def match(self, r):
  21. if isinstance(r, re.Pattern):
  22. return r.match(self._data, self._pos)
  23. if isinstance(r, str):
  24. if self._data.startswith(r, self._pos):
  25. return len(r)
  26. return None
  27. raise ValueError(r)
  28. def advance(self, by):
  29. if by is None:
  30. amt = 0
  31. elif isinstance(by, re.Match):
  32. amt = len(by.group(0))
  33. elif isinstance(by, str):
  34. amt = len(by)
  35. elif isinstance(by, int):
  36. amt = by
  37. else:
  38. raise ValueError(by)
  39. self._pos += amt
  40. return by
  41. def consume(self, r):
  42. return self.advance(self.match(r))
  43. def child(self):
  44. return _MatchChildParser(self)
  45. class _MatchChildParser(_MatchParser):
  46. """
  47. A child parser state, which advances through the same data as
  48. its parent, but has an independent position. This is useful when
  49. advancing through syntax elements we might later want to backtrack
  50. from.
  51. """
  52. def __init__(self, parent):
  53. super().__init__(parent._data)
  54. self.__parent = parent
  55. self._pos = parent._pos
  56. def commit(self):
  57. """
  58. Advance the parent state to the current position of this child state.
  59. """
  60. self.__parent._pos = self._pos
  61. return self.__parent
  62. class ParseError(Exception):
  63. def __init__(self, parser):
  64. super().__init__("Parse error at position %u (near %r)" % (
  65. parser._pos, parser._data[parser._pos:parser._pos + 100]
  66. ))
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
#
# Groups: (1) hours, optional; (2) minutes; (3) seconds; (4) milliseconds,
# optional (the dot before them is required).
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# End of the input string.
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator, or a zero-width match at the end of the input.
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# One or more consecutive line terminators.
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# Any run (possibly empty) of spaces and tabs.
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
  81. def _parse_ts(ts):
  82. """
  83. Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
  84. into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
  85. """
  86. return 90 * sum(
  87. int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))
  88. def _format_ts(ts):
  89. """
  90. Convert an MPEG PES timestamp into a WebVTT timestamp.
  91. This will lose sub-millisecond precision.
  92. """
  93. return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))
  94. class Block:
  95. """
  96. An abstract WebVTT block.
  97. """
  98. def __init__(self, **kwargs):
  99. for key, val in kwargs.items():
  100. setattr(self, key, val)
  101. @classmethod
  102. def parse(cls, parser):
  103. m = parser.match(cls._REGEX)
  104. if not m:
  105. return None
  106. parser.advance(m)
  107. return cls(raw=m.group(0))
  108. def write_into(self, stream):
  109. stream.write(self.raw)
  110. class HeaderBlock(Block):
  111. """
  112. A WebVTT block that may only appear in the header part of the file,
  113. i.e. before any cue blocks.
  114. """
  115. pass
  116. class Magic(HeaderBlock):
  117. _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')
  118. # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
  119. # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
  120. # doesn’t specify the exact grammar nor where in the WebVTT
  121. # syntax it should be placed; the below has been devised based
  122. # on usage in the wild
  123. #
  124. # And strictly speaking, the presence of this extension violates
  125. # the W3C WebVTT spec. Oh well.
  126. _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
  127. _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
  128. _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
  129. _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
  130. # This was removed from the spec in the 2017 revision;
  131. # the last spec draft to describe this syntax element is
  132. # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
  133. # Nevertheless, YouTube keeps serving those
  134. _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
  135. @classmethod
  136. def __parse_tsmap(cls, parser):
  137. parser = parser.child()
  138. while True:
  139. m = parser.consume(cls._REGEX_TSMAP_LOCAL)
  140. if m:
  141. m = parser.consume(_REGEX_TS)
  142. if m is None:
  143. raise ParseError(parser)
  144. local = _parse_ts(m)
  145. if local is None:
  146. raise ParseError(parser)
  147. else:
  148. m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
  149. if m:
  150. mpegts = int_or_none(m.group(1))
  151. if mpegts is None:
  152. raise ParseError(parser)
  153. else:
  154. raise ParseError(parser)
  155. if parser.consume(cls._REGEX_TSMAP_SEP):
  156. continue
  157. if parser.consume(_REGEX_NL):
  158. break
  159. raise ParseError(parser)
  160. parser.commit()
  161. return local, mpegts
  162. @classmethod
  163. def parse(cls, parser):
  164. parser = parser.child()
  165. m = parser.consume(cls._REGEX)
  166. if not m:
  167. raise ParseError(parser)
  168. extra = m.group(1)
  169. local, mpegts, meta = None, None, ''
  170. while not parser.consume(_REGEX_NL):
  171. if parser.consume(cls._REGEX_TSMAP):
  172. local, mpegts = cls.__parse_tsmap(parser)
  173. continue
  174. m = parser.consume(cls._REGEX_META)
  175. if m:
  176. meta += m.group(0)
  177. continue
  178. raise ParseError(parser)
  179. parser.commit()
  180. return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
  181. def write_into(self, stream):
  182. stream.write('WEBVTT')
  183. if self.extra is not None:
  184. stream.write(self.extra)
  185. stream.write('\n')
  186. if self.local or self.mpegts:
  187. stream.write('X-TIMESTAMP-MAP=LOCAL:')
  188. stream.write(_format_ts(self.local if self.local is not None else 0))
  189. stream.write(',MPEGTS:')
  190. stream.write(str(self.mpegts if self.mpegts is not None else 0))
  191. stream.write('\n')
  192. if self.meta:
  193. stream.write(self.meta)
  194. stream.write('\n')
class StyleBlock(HeaderBlock):
    """
    A ``STYLE`` header block. It is matched as a whole by `_REGEX`; the
    contents are not interpreted.
    """
    # The "STYLE" line, then any number of non-empty lines that do not
    # contain "-->", then the terminating blank line.
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
  201. class RegionBlock(HeaderBlock):
  202. _REGEX = re.compile(r'''(?x)
  203. REGION[\ \t]*
  204. ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
  205. (?:\r\n|[\r\n])
  206. ''')
class CommentBlock(Block):
    """
    A ``NOTE`` comment block. It is matched as a whole by `_REGEX`; the
    contents are not interpreted.
    """
    # "NOTE" followed by a line terminator, space or tab, then any number of
    # non-empty lines that do not contain "-->", then the terminating blank
    # line.
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')
  213. class CueBlock(Block):
  214. """
  215. A cue block. The payload is not interpreted.
  216. """
  217. _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
  218. _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
  219. _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
  220. _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')
  221. @classmethod
  222. def parse(cls, parser):
  223. parser = parser.child()
  224. id = None
  225. m = parser.consume(cls._REGEX_ID)
  226. if m:
  227. id = m.group(1)
  228. m0 = parser.consume(_REGEX_TS)
  229. if not m0:
  230. return None
  231. if not parser.consume(cls._REGEX_ARROW):
  232. return None
  233. m1 = parser.consume(_REGEX_TS)
  234. if not m1:
  235. return None
  236. m2 = parser.consume(cls._REGEX_SETTINGS)
  237. parser.consume(_REGEX_OPTIONAL_WHITESPACE)
  238. if not parser.consume(_REGEX_NL):
  239. return None
  240. start = _parse_ts(m0)
  241. end = _parse_ts(m1)
  242. settings = m2.group(1) if m2 is not None else None
  243. text = io.StringIO()
  244. while True:
  245. m = parser.consume(cls._REGEX_PAYLOAD)
  246. if not m:
  247. break
  248. text.write(m.group(0))
  249. parser.commit()
  250. return cls(
  251. id=id,
  252. start=start, end=end, settings=settings,
  253. text=text.getvalue()
  254. )
  255. def write_into(self, stream):
  256. if self.id is not None:
  257. stream.write(self.id)
  258. stream.write('\n')
  259. stream.write(_format_ts(self.start))
  260. stream.write(' --> ')
  261. stream.write(_format_ts(self.end))
  262. if self.settings is not None:
  263. stream.write(' ')
  264. stream.write(self.settings)
  265. stream.write('\n')
  266. stream.write(self.text)
  267. stream.write('\n')
  268. @property
  269. def as_json(self):
  270. return {
  271. 'id': self.id,
  272. 'start': self.start,
  273. 'end': self.end,
  274. 'text': self.text,
  275. 'settings': self.settings,
  276. }
  277. def __eq__(self, other):
  278. return self.as_json == other.as_json
  279. @classmethod
  280. def from_json(cls, json):
  281. return cls(
  282. id=json['id'],
  283. start=json['start'],
  284. end=json['end'],
  285. text=json['text'],
  286. settings=json['settings']
  287. )
  288. def hinges(self, other):
  289. if self.text != other.text:
  290. return False
  291. if self.settings != other.settings:
  292. return False
  293. return self.start <= self.end == other.start <= other.end
  294. def parse_fragment(frag_content):
  295. """
  296. A generator that yields (partially) parsed WebVTT blocks when given
  297. a bytes object containing the raw contents of a WebVTT file.
  298. """
  299. parser = _MatchParser(frag_content.decode())
  300. yield Magic.parse(parser)
  301. while not parser.match(_REGEX_EOF):
  302. if parser.consume(_REGEX_BLANK):
  303. continue
  304. block = RegionBlock.parse(parser)
  305. if block:
  306. yield block
  307. continue
  308. block = StyleBlock.parse(parser)
  309. if block:
  310. yield block
  311. continue
  312. block = CommentBlock.parse(parser)
  313. if block:
  314. yield block # XXX: or skip
  315. continue
  316. break
  317. while not parser.match(_REGEX_EOF):
  318. if parser.consume(_REGEX_BLANK):
  319. continue
  320. block = CommentBlock.parse(parser)
  321. if block:
  322. yield block # XXX: or skip
  323. continue
  324. block = CueBlock.parse(parser)
  325. if block:
  326. yield block
  327. continue
  328. raise ParseError(parser)