nxml-rap.el 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode -*- lexical-binding:t -*-
  2. ;; Copyright (C) 2003-2004, 2007-2017 Free Software Foundation, Inc.
  3. ;; Author: James Clark
  4. ;; Keywords: wp, hypermedia, languages, XML
  5. ;; This file is part of GNU Emacs.
  6. ;; GNU Emacs is free software: you can redistribute it and/or modify
  7. ;; it under the terms of the GNU General Public License as published by
  8. ;; the Free Software Foundation, either version 3 of the License, or
  9. ;; (at your option) any later version.
  10. ;; GNU Emacs is distributed in the hope that it will be useful,
  11. ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ;; GNU General Public License for more details.
  14. ;; You should have received a copy of the GNU General Public License
  15. ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
  16. ;;; Commentary:
  17. ;; This uses xmltok.el to do XML parsing. The fundamental problem is
  18. ;; how to handle changes. We don't want to maintain a complete parse
  19. ;; tree. We also don't want to reparse from the start of the document
  20. ;; on every keystroke. However, it is not possible in general to
  21. ;; parse an XML document correctly starting at a random point in the
  22. ;; middle. The main problems are comments, CDATA sections and
  23. ;; processing instructions: these can all contain things that are
  24. ;; indistinguishable from elements. Literals in the prolog are also a
  25. ;; problem. Attribute value literals are not a problem because
  26. ;; attribute value literals cannot contain less-than signs.
  27. ;;
  28. ;; Our strategy is to keep track of just the problematic things.
  29. ;; Specifically, we keep track of all comments, CDATA sections and
  30. ;; processing instructions in the instance. We do this by marking all
  31. ;; except the first character of these with a non-nil nxml-inside text
  32. ;; property. The value of the nxml-inside property is comment,
  33. ;; cdata-section or processing-instruction. The first character does
  34. ;; not have the nxml-inside property so we can find the beginning of
  35. ;; the construct by looking for a change in a text property value
  36. ;; (Emacs provides primitives for this). We use text properties
  37. ;; rather than overlays, since the implementation of overlays doesn't
  38. ;; look like it scales to large numbers of overlays in a buffer.
  39. ;;
  40. ;; We don't in fact track all these constructs, but only track them in
  41. ;; some initial part of the instance.
  42. ;;
  43. ;; Thus to parse some random point in the file we first ensure that we
  44. ;; have scanned up to that point. Then we search backwards for a
  45. ;; <. Then we check whether the < has an nxml-inside property. If it
  46. ;; does we go backwards to first character that does not have an
  47. ;; nxml-inside property (this character must be a <). Then we start
  48. ;; parsing forward from the < we have found.
  49. ;;
  50. ;; The prolog has to be parsed specially, so we also keep track of the
  51. ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
  52. ;; every change to the prolog. This won't work well if people try to
  53. ;; edit huge internal subsets. Hopefully that will be rare.
  54. ;;
  55. ;; We keep track of the changes by adding to the buffer's
  56. ;; after-change-functions hook. Scanning is also done as a
  57. ;; prerequisite to fontification by adding to fontification-functions
  58. ;; (in the same way as jit-lock). This means that scanning for these
  59. ;; constructs had better be quick. Fortunately it is. Firstly, the
  60. ;; typical proportion of comments, CDATA sections and processing
  61. ;; instructions is small relative to other things. Secondly, to scan
  62. ;; we just search for the regexp <[!?].
  63. ;;; Code:
  64. (require 'xmltok)
  65. (require 'nxml-util)
  66. (require 'sgml-mode)
  (defvar-local nxml-prolog-end nil
    "Buffer position immediately following the end of the XML prolog.
This is an integer once `nxml-scan-prolog' has run in the buffer; its
initial value is nil.  The variable is automatically buffer-local.")
  (defsubst nxml-get-inside (pos)
    "Return element 8 of the `syntax-ppss' state computed at POS.
That element is the start position of the comment or string that
contains POS, or nil when POS is in neither.  Point is preserved."
    (let ((state (save-excursion (syntax-ppss pos))))
      (nth 8 state)))
  (defun nxml-inside-end (pos)
    "Return the end of the inside region containing POS.
Return nil if the character at POS is not inside.

The region start is taken from element 8 of the `syntax-ppss' state
at POS.  From there the end is located using `sgml-tag-syntax-table':
when element 3 of the state is non-nil, by moving with
`forward-comment'; otherwise with `scan-sexps', falling back to
`point-max' when that returns nil.  Point is preserved."
    (save-excursion
      (let* ((state (syntax-ppss pos))
             (region-start (nth 8 state)))
        (when region-start
          (goto-char region-start)
          (with-syntax-table sgml-tag-syntax-table
            (cond ((nth 3 state)
                   (forward-comment 1)
                   (point))
                  ;; A non-nil result of `scan-sexps' is the answer itself.
                  ((scan-sexps (point) 1))
                  (t (point-max))))))))
  (defun nxml-inside-start (pos)
    "Return the start of the inside region containing POS.
Return nil if the character at POS is not inside.
This is the same value that `nxml-get-inside' computes for POS."
    (nxml-get-inside pos))
  86. ;;; Change management
  ;; `nxml-scan-prolog' is only called from nxml-mode.el, which is
  ;; where this variable is defined.
  (defvar nxml-prolog-regions)
  (defun nxml-scan-prolog ()
    "Scan the prolog starting from `point-min'.
Store the value returned by `xmltok-forward-prolog' in
`nxml-prolog-regions', and the position where that call leaves point
in `nxml-prolog-end'.  Point is not restored.  `xmltok-dtd' and
`xmltok-errors' are let-bound around the scan.  Return the new value
of `nxml-prolog-end'."
    (goto-char (point-min))
    (let ((xmltok-dtd nil)
          (xmltok-errors nil))
      (setq nxml-prolog-regions (xmltok-forward-prolog)
            nxml-prolog-end (point))))
  95. ;;; Random access parsing
  (defun nxml-token-after ()
    "Return the position after the token containing the char after point.
Sets up the variables `xmltok-type', `xmltok-start',
`xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
`xmltok-namespace-attributes' in the same way as does
`xmltok-forward'.  The prolog will be treated as a single token with
type `prolog'.  Point is not moved."
    (let ((target (point)))
      (cond
       ;; Anywhere before the end of the prolog: the whole prolog counts
       ;; as one token starting at the beginning of the buffer.
       ((< target nxml-prolog-end)
        (setq xmltok-type 'prolog
              xmltok-start (point-min))
        (min nxml-prolog-end (point-max)))
       (t
        (nxml-ensure-scan-up-to-date)
        (save-excursion
          (cond
           ;; Inside a special construct: back up to its first character
           ;; and tokenize it in one step.
           ((nxml-get-inside target)
            (nxml-move-outside-backwards)
            (xmltok-forward))
           (t
            ;; Find a safe `<' at or before point to start from; failing
            ;; that, start right after the prolog (or after whatever
            ;; special construct the narrowed region begins in).
            (if (or (eq (char-after) ?<)
                    (search-backward "<"
                                     (max (point-min) nxml-prolog-end)
                                     t))
                (nxml-move-outside-backwards)
              (goto-char (if (<= (point-min) nxml-prolog-end)
                             nxml-prolog-end
                           (or (nxml-inside-end (point-min))
                               (point-min)))))
            ;; Tokenize forward until the token just read extends past
            ;; the original position (or no token is returned).
            (while (and (nxml-tokenize-forward)
                        (<= (point) target)))))
          (point))))))
  (defun nxml-token-before ()
    "Return the position after the token containing the char before point.
Sets variables like `nxml-token-after'.  When point is at
`point-min' there is no such character: `xmltok-start' is set to
point, `xmltok-type' to nil, and point itself is returned."
    (cond ((= (point-min) (point))
           (setq xmltok-start (point)
                 xmltok-type nil)
           (point))
          (t
           (save-excursion
             (goto-char (1- (point)))
             (nxml-token-after)))))
  (defun nxml-tokenize-forward ()
    "Call `xmltok-forward' and return the resulting `xmltok-type'.
`xmltok-errors' is let-bound to nil around the call, so changes made
to it during tokenizing are not visible to the caller."
    (let ((xmltok-errors nil))
      (xmltok-forward))
    xmltok-type)
  (defun nxml-move-tag-backwards (bound)
    "Move point backwards outside any “inside” regions or tags.
First leave any “inside” region containing point.  If point is then
not at a `<', search backwards for one, going no further back than
BOUND, and again leave any “inside” region the match falls in.
If no `<' is found, move point to BOUND.
As a precondition, point must be >= BOUND."
    (nxml-move-outside-backwards)
    (unless (equal (char-after) ?<)
      (cond ((search-backward "<" bound t)
             ;; The `<' just found may itself lie within a special
             ;; construct; step out of it and retry once if needed.
             (nxml-move-outside-backwards)
             (unless (equal (char-after) ?<)
               (search-backward "<" bound t)))
            (t
             (goto-char bound)))))
  (defun nxml-move-outside-backwards ()
    "Move point to first character of the containing special thing.
Leave point unmoved if it is not inside anything special.
Signal an error if the position moved to is itself still reported as
inside by `nxml-get-inside'."
    (let ((start (nxml-inside-start (point))))
      ;; `goto-char' returns the (non-nil) position, so the chain only
      ;; stops early when there is no region or the sanity check passes.
      (and start
           (goto-char start)
           (nxml-get-inside (point))
           (error "Char before inside-start at %s is still \"inside\"" (point)))))
  (defun nxml-ensure-scan-up-to-date ()
    "Ensure syntax properties have been applied up to point.
This just calls `syntax-propertize' with the current position."
    (let ((here (point)))
      (syntax-propertize here)))
  166. ;;; Element scanning
  (defun nxml-scan-element-forward (from &optional up)
    "Scan forward from FROM over a single balanced element.
Point must be between tokens.  Return the position of the end of
the tag that ends the element.  `xmltok-start' will contain the
position of the start of the tag.  If UP is non-nil, then scan
past end-tag of element containing point.  If no element is
found, return nil.  If a well-formedness error prevents scanning,
signal an `nxml-scan-error'.  Point is not moved."
    ;; PENDING is nil (nothing open), t (UP was requested and we are
    ;; looking for the end-tag of the enclosing element), or a list
    ;; (QNAME START QNAME START ... . nil-or-t) of start-tags seen so far.
    (let ((pending (and up t))
          end-pos)
      (save-excursion
        (goto-char from)
        (while
            (pcase (nxml-tokenize-forward)
              ;; No more tokens.
              ((pred null)
               (when (consp pending)
                 (nxml-scan-error (cadr pending)
                                  "Start-tag has no end-tag"))
               nil)
              ('start-tag
               (setq pending
                     `(,(xmltok-start-tag-qname) ,xmltok-start . ,pending))
               t)
              ('end-tag
               (cond ((not pending) nil)
                     ((not (consp pending))
                      (setq end-pos (point))
                      nil)
                     ((not (string= (car pending)
                                    (xmltok-end-tag-qname)))
                      (nxml-scan-error (+ 2 xmltok-start)
                                       "Mismatched end-tag; expected `%s'"
                                       (car pending)))
                     ;; Pop the matched start-tag; keep going if
                     ;; something is still open.
                     ((setq pending (cddr pending)) t)
                     (t (setq end-pos (point)) nil)))
              ((or 'empty-element 'partial-empty-element)
               (if pending
                   t
                 (setq end-pos (point))
                 nil))
              ('partial-end-tag
               ;; Like `end-tag', but the name cannot be checked.
               (cond ((not pending) nil)
                     ((not (consp pending))
                      (setq end-pos (point))
                      nil)
                     ((setq pending (cddr pending)) t)
                     (t (setq end-pos (point)) nil)))
              ('partial-start-tag
               (nxml-scan-error xmltok-start
                                "Missing `>'"))
              (_ t))))
      end-pos))
  (defun nxml-scan-element-backward (from &optional up bound)
    "Scan backward from FROM over a single balanced element.
Point must be between tokens.  Return the position of the end of
the tag that starts the element.  `xmltok-start' will contain the
position of the start of the tag.  If UP is non-nil, then scan
past start-tag of element containing point.  If BOUND is non-nil,
then don't scan back past BOUND.  If no element is found, return
nil.  If a well-formedness error prevents scanning, signal an
`nxml-scan-error'.  Point is not moved."
    ;; PENDING is nil, t (UP requested), or a list
    ;; (QNAME START QNAME START ... . nil-or-t) of end-tags seen so far;
    ;; QNAME is nil for a partial end-tag whose name is unknown.
    (let ((pending (and up t))
          tag-end result)
      (save-excursion
        (goto-char from)
        (while
            (if (or (< (point) nxml-prolog-end)
                    (not (search-backward "<"
                                          (max (or bound 0)
                                               nxml-prolog-end)
                                          t)))
                ;; Ran out of text to search.
                (progn
                  (when (and (consp pending) (not bound))
                    (nxml-scan-error (cadr pending)
                                     "End-tag has no start-tag"))
                  nil)
              ;; Tokenize the construct starting at the `<' just found,
              ;; remembering where it ends without moving point.
              (nxml-move-outside-backwards)
              (save-excursion
                (nxml-tokenize-forward)
                (setq tag-end (point)))
              (pcase xmltok-type
                ('end-tag
                 (setq pending
                       `(,(xmltok-end-tag-qname) ,xmltok-start . ,pending))
                 t)
                ('start-tag
                 (cond ((not pending) nil)
                       ((not (consp pending))
                        (setq result tag-end)
                        nil)
                       ((and (car pending)
                             (not (string= (car pending)
                                           (xmltok-start-tag-qname))))
                        (nxml-scan-error (1+ xmltok-start)
                                         "Mismatched start-tag; expected `%s'"
                                         (car pending)))
                       ((setq pending (cddr pending)) t)
                       (t (setq result tag-end) nil)))
                ((or 'empty-element 'partial-empty-element)
                 (if pending
                     t
                   (setq result tag-end)
                   nil))
                ('partial-end-tag
                 (setq pending
                       `(nil ,xmltok-start . ,pending))
                 t)
                ('partial-start-tag
                 ;; if we have only a partial-start-tag
                 ;; then it's unlikely that there's a matching
                 ;; end-tag, so it's probably not helpful
                 ;; to treat it as a complete start-tag
                 (nxml-scan-error xmltok-start
                                  "Missing `>'"))
                (_ t)))))
      result))
  (define-error 'nxml-scan-error
    "Scan over element that is not well-formed" 'nxml-error)

  (defun nxml-scan-error (&rest args)
    "Signal the error `nxml-scan-error' with ARGS as its data.
Callers in this file pass a buffer position followed by a format
string and its arguments."
    (signal 'nxml-scan-error args))
  286. (provide 'nxml-rap)
  287. ;;; nxml-rap.el ends here