nxml-rap.el 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode
  2. ;; Copyright (C) 2003-2004, 2007-2012 Free Software Foundation, Inc.
  3. ;; Author: James Clark
  4. ;; Keywords: XML
  5. ;; This file is part of GNU Emacs.
  6. ;; GNU Emacs is free software: you can redistribute it and/or modify
  7. ;; it under the terms of the GNU General Public License as published by
  8. ;; the Free Software Foundation, either version 3 of the License, or
  9. ;; (at your option) any later version.
  10. ;; GNU Emacs is distributed in the hope that it will be useful,
  11. ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ;; GNU General Public License for more details.
  14. ;; You should have received a copy of the GNU General Public License
  15. ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
  16. ;;; Commentary:
  17. ;; This uses xmltok.el to do XML parsing. The fundamental problem is
  18. ;; how to handle changes. We don't want to maintain a complete parse
  19. ;; tree. We also don't want to reparse from the start of the document
  20. ;; on every keystroke. However, it is not possible in general to
  21. ;; parse an XML document correctly starting at a random point in the
  22. ;; middle. The main problems are comments, CDATA sections and
  23. ;; processing instructions: these can all contain things that are
  24. ;; indistinguishable from elements. Literals in the prolog are also a
  25. ;; problem. Attribute value literals are not a problem because
  26. ;; attribute value literals cannot contain less-than signs.
  27. ;;
  28. ;; Our strategy is to keep track of just the problematic things.
  29. ;; Specifically, we keep track of all comments, CDATA sections and
  30. ;; processing instructions in the instance. We do this by marking all
  31. ;; except the first character of these with a non-nil nxml-inside text
  32. ;; property. The value of the nxml-inside property is comment,
  33. ;; cdata-section or processing-instruction. The first character does
  34. ;; not have the nxml-inside property so we can find the beginning of
  35. ;; the construct by looking for a change in a text property value
  36. ;; (Emacs provides primitives for this). We use text properties
  37. ;; rather than overlays, since the implementation of overlays doesn't
  38. ;; look like it scales to large numbers of overlays in a buffer.
  39. ;;
  40. ;; We don't in fact track all these constructs, but only track them in
  41. ;; some initial part of the instance. The variable `nxml-scan-end'
  42. ;; contains the limit of where we have scanned up to for them.
  43. ;;
  44. ;; Thus to parse some random point in the file we first ensure that we
  45. ;; have scanned up to that point. Then we search backwards for a
  46. ;; <. Then we check whether the < has an nxml-inside property. If it
;; does we go backwards to the first character that does not have an
;; nxml-inside property (this character must be a <).  Then we start
  49. ;; parsing forward from the < we have found.
  50. ;;
  51. ;; The prolog has to be parsed specially, so we also keep track of the
  52. ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
  53. ;; every change to the prolog. This won't work well if people try to
  54. ;; edit huge internal subsets. Hopefully that will be rare.
  55. ;;
  56. ;; We keep track of the changes by adding to the buffer's
  57. ;; after-change-functions hook. Scanning is also done as a
  58. ;; prerequisite to fontification by adding to fontification-functions
  59. ;; (in the same way as jit-lock). This means that scanning for these
  60. ;; constructs had better be quick. Fortunately it is. Firstly, the
  61. ;; typical proportion of comments, CDATA sections and processing
  62. ;; instructions is small relative to other things. Secondly, to scan
  63. ;; we just search for the regexp <[!?].
  64. ;;
  65. ;; One problem is unclosed comments, processing instructions and CDATA
  66. ;; sections. Suppose, for example, we encounter a <!-- but there's no
  67. ;; matching -->. This is not an unexpected situation if the user is
  68. ;; creating a comment. It is not helpful to treat the whole of the
  69. ;; file starting from the <!-- onwards as a single unclosed comment
  70. ;; token. Instead we treat just the <!-- as a piece of not well-formed
  71. ;; markup and continue. The problem is that if at some later stage a
  72. ;; --> gets added to the buffer after the unclosed <!--, we will need
  73. ;; to reparse the buffer starting from the <!--. We need to keep
  74. ;; track of these reparse dependencies; they are called dependent
  75. ;; regions in the code.
  76. ;;; Code:
  77. (require 'xmltok)
  78. (require 'nxml-util)
  79. (defvar nxml-prolog-end nil
  80. "Integer giving position following end of the prolog.")
  81. (make-variable-buffer-local 'nxml-prolog-end)
  82. (defvar nxml-scan-end nil
  83. "Marker giving position up to which we have scanned.
  84. nxml-scan-end must be >= nxml-prolog-end. Furthermore, nxml-scan-end
  85. must not be an inside position in the following sense. A position is
  86. inside if the following character is a part of, but not the first
  87. character of, a CDATA section, comment or processing instruction.
  88. Furthermore all positions >= nxml-prolog-end and < nxml-scan-end that
  89. are inside positions must have a non-nil `nxml-inside' property whose
  90. value is a symbol specifying what it is inside. Any characters with a
  91. non-nil `fontified' property must have position < nxml-scan-end and
  92. the correct face. Dependent regions must also be established for any
  93. unclosed constructs starting before nxml-scan-end.
  94. There must be no `nxml-inside' properties after nxml-scan-end.")
  95. (make-variable-buffer-local 'nxml-scan-end)
  96. (defsubst nxml-get-inside (pos)
  97. (get-text-property pos 'nxml-inside))
  98. (defsubst nxml-clear-inside (start end)
  99. (nxml-debug-clear-inside start end)
  100. (remove-text-properties start end '(nxml-inside nil)))
  101. (defsubst nxml-set-inside (start end type)
  102. (nxml-debug-set-inside start end)
  103. (put-text-property start end 'nxml-inside type))
  104. (defun nxml-inside-end (pos)
  105. "Return the end of the inside region containing POS.
  106. Return nil if the character at POS is not inside."
  107. (if (nxml-get-inside pos)
  108. (or (next-single-property-change pos 'nxml-inside)
  109. (point-max))
  110. nil))
  111. (defun nxml-inside-start (pos)
  112. "Return the start of the inside region containing POS.
  113. Return nil if the character at POS is not inside."
  114. (if (nxml-get-inside pos)
  115. (or (previous-single-property-change (1+ pos) 'nxml-inside)
  116. (point-min))
  117. nil))
  118. ;;; Change management
  119. (defun nxml-scan-after-change (start end)
  120. "Restore `nxml-scan-end' invariants after a change.
  121. The change happened between START and END.
  122. Return position after which lexical state is unchanged.
  123. END must be > `nxml-prolog-end'. START must be outside
  124. any 'inside' regions and at the beginning of a token."
  125. (if (>= start nxml-scan-end)
  126. nxml-scan-end
  127. (let ((inside-remove-start start)
  128. xmltok-errors
  129. xmltok-dependent-regions)
  130. (while (or (when (xmltok-forward-special (min end nxml-scan-end))
  131. (when (memq xmltok-type
  132. '(comment
  133. cdata-section
  134. processing-instruction))
  135. (nxml-clear-inside inside-remove-start
  136. (1+ xmltok-start))
  137. (nxml-set-inside (1+ xmltok-start)
  138. (point)
  139. xmltok-type)
  140. (setq inside-remove-start (point)))
  141. (if (< (point) (min end nxml-scan-end))
  142. t
  143. (setq end (point))
  144. nil))
  145. ;; The end of the change was inside but is now outside.
  146. ;; Imagine something really weird like
  147. ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> -->
  148. ;; and suppose we deleted "<![CDATA[f"
  149. (let ((inside-end (nxml-inside-end end)))
  150. (when inside-end
  151. (setq end inside-end)
  152. t))))
  153. (nxml-clear-inside inside-remove-start end)
  154. (nxml-clear-dependent-regions start end)
  155. (nxml-mark-parse-dependent-regions))
  156. (when (> end nxml-scan-end)
  157. (set-marker nxml-scan-end end))
  158. end))
  159. ;; n-s-p only called from nxml-mode.el, where this variable is defined.
  160. (defvar nxml-prolog-regions)
  161. (defun nxml-scan-prolog ()
  162. (goto-char (point-min))
  163. (let (xmltok-dtd
  164. xmltok-errors
  165. xmltok-dependent-regions)
  166. (setq nxml-prolog-regions (xmltok-forward-prolog))
  167. (setq nxml-prolog-end (point))
  168. (nxml-clear-inside (point-min) nxml-prolog-end)
  169. (nxml-clear-dependent-regions (point-min) nxml-prolog-end)
  170. (nxml-mark-parse-dependent-regions))
  171. (when (< nxml-scan-end nxml-prolog-end)
  172. (set-marker nxml-scan-end nxml-prolog-end)))
  173. ;;; Dependent regions
  174. (defun nxml-adjust-start-for-dependent-regions (start end pre-change-length)
  175. (let ((overlays (overlays-in (1- start) start))
  176. (adjusted-start start))
  177. (while overlays
  178. (let* ((overlay (car overlays))
  179. (ostart (overlay-start overlay)))
  180. (when (and (eq (overlay-get overlay 'category) 'nxml-dependent)
  181. (< ostart adjusted-start))
  182. (let ((funargs (overlay-get overlay 'nxml-funargs)))
  183. (when (apply (car funargs)
  184. (append (list start
  185. end
  186. pre-change-length
  187. ostart
  188. (overlay-end overlay))
  189. (cdr funargs)))
  190. (setq adjusted-start ostart)))))
  191. (setq overlays (cdr overlays)))
  192. adjusted-start))
  193. (defun nxml-mark-parse-dependent-regions ()
  194. (while xmltok-dependent-regions
  195. (apply 'nxml-mark-parse-dependent-region
  196. (car xmltok-dependent-regions))
  197. (setq xmltok-dependent-regions
  198. (cdr xmltok-dependent-regions))))
  199. (defun nxml-mark-parse-dependent-region (fun start end &rest args)
  200. (let ((overlay (make-overlay start end nil t t)))
  201. (overlay-put overlay 'category 'nxml-dependent)
  202. (overlay-put overlay 'nxml-funargs (cons fun args))))
  203. (put 'nxml-dependent 'evaporate t)
  204. (defun nxml-clear-dependent-regions (start end)
  205. (let ((overlays (overlays-in start end)))
  206. (while overlays
  207. (let* ((overlay (car overlays))
  208. (category (overlay-get overlay 'category)))
  209. (when (and (eq category 'nxml-dependent)
  210. (<= start (overlay-start overlay)))
  211. (delete-overlay overlay)))
  212. (setq overlays (cdr overlays)))))
  213. ;;; Random access parsing
  214. (defun nxml-token-after ()
  215. "Return the position after the token containing the char after point.
  216. Sets up the variables `xmltok-type', `xmltok-start',
  217. `xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
  218. `xmltok-namespace-attributes' in the same was as does
  219. `xmltok-forward'. The prolog will be treated as a single token with
  220. type `prolog'."
  221. (let ((pos (point)))
  222. (if (< pos nxml-prolog-end)
  223. (progn
  224. (setq xmltok-type 'prolog
  225. xmltok-start (point-min))
  226. (min nxml-prolog-end (point-max)))
  227. (nxml-ensure-scan-up-to-date)
  228. (if (nxml-get-inside pos)
  229. (save-excursion
  230. (nxml-move-outside-backwards)
  231. (xmltok-forward)
  232. (point))
  233. (save-excursion
  234. (if (or (eq (char-after) ?<)
  235. (search-backward "<"
  236. (max (point-min) nxml-prolog-end)
  237. t))
  238. (nxml-move-outside-backwards)
  239. (goto-char (if (<= (point-min) nxml-prolog-end)
  240. nxml-prolog-end
  241. (or (nxml-inside-end (point-min))
  242. (point-min)))))
  243. (while (and (nxml-tokenize-forward)
  244. (<= (point) pos)))
  245. (point))))))
  246. (defun nxml-token-before ()
  247. "Return the position after the token containing the char before point.
  248. Sets variables like `nxml-token-after'."
  249. (if (/= (point-min) (point))
  250. (save-excursion
  251. (goto-char (1- (point)))
  252. (nxml-token-after))
  253. (setq xmltok-start (point))
  254. (setq xmltok-type nil)
  255. (point)))
  256. (defun nxml-tokenize-forward ()
  257. (let (xmltok-dependent-regions
  258. xmltok-errors)
  259. (when (and (xmltok-forward)
  260. (> (point) nxml-scan-end))
  261. (cond ((memq xmltok-type '(comment
  262. cdata-section
  263. processing-instruction))
  264. (nxml-with-unmodifying-text-property-changes
  265. (nxml-set-inside (1+ xmltok-start) (point) xmltok-type)))
  266. (xmltok-dependent-regions
  267. (nxml-mark-parse-dependent-regions)))
  268. (set-marker nxml-scan-end (point)))
  269. xmltok-type))
  270. (defun nxml-move-tag-backwards (bound)
  271. "Move point backwards outside any 'inside' regions or tags.
  272. Point will not move past `nxml-prolog-end'.
  273. Point will either be at BOUND or a '<' character starting a tag
  274. outside any 'inside' regions. Ignores dependent regions.
  275. As a precondition, point must be >= BOUND."
  276. (nxml-move-outside-backwards)
  277. (when (not (equal (char-after) ?<))
  278. (if (search-backward "<" bound t)
  279. (progn
  280. (nxml-move-outside-backwards)
  281. (when (not (equal (char-after) ?<))
  282. (search-backward "<" bound t)))
  283. (goto-char bound))))
  284. (defun nxml-move-outside-backwards ()
  285. "Move point to first character of the containing special thing.
  286. Leave point unmoved if it is not inside anything special."
  287. (let ((start (nxml-inside-start (point))))
  288. (when start
  289. (goto-char (1- start))
  290. (when (nxml-get-inside (point))
  291. (error "Char before inside-start at %s had nxml-inside property %s"
  292. (point)
  293. (nxml-get-inside (point)))))))
  294. (defun nxml-ensure-scan-up-to-date ()
  295. (let ((pos (point)))
  296. (when (< nxml-scan-end pos)
  297. (save-excursion
  298. (goto-char nxml-scan-end)
  299. (let (xmltok-errors
  300. xmltok-dependent-regions)
  301. (while (when (xmltok-forward-special pos)
  302. (when (memq xmltok-type
  303. '(comment
  304. processing-instruction
  305. cdata-section))
  306. (nxml-with-unmodifying-text-property-changes
  307. (nxml-set-inside (1+ xmltok-start)
  308. (point)
  309. xmltok-type)))
  310. (if (< (point) pos)
  311. t
  312. (setq pos (point))
  313. nil)))
  314. (nxml-clear-dependent-regions nxml-scan-end pos)
  315. (nxml-mark-parse-dependent-regions)
  316. (set-marker nxml-scan-end pos))))))
  317. ;;; Element scanning
  318. (defun nxml-scan-element-forward (from &optional up)
  319. "Scan forward from FROM over a single balanced element.
  320. Point must be between tokens. Return the position of the end of
  321. the tag that ends the element. `xmltok-start' will contain the
  322. position of the start of the tag. If UP is non-nil, then scan
  323. past end-tag of element containing point. If no element is
  324. found, return nil. If a well-formedness error prevents scanning,
  325. signal an `nxml-scan-error'. Point is not moved."
  326. (let ((open-tags (and up t))
  327. found)
  328. (save-excursion
  329. (goto-char from)
  330. (while (cond ((not (nxml-tokenize-forward))
  331. (when (consp open-tags)
  332. (nxml-scan-error (cadr open-tags)
  333. "Start-tag has no end-tag"))
  334. nil)
  335. ((eq xmltok-type 'start-tag)
  336. (setq open-tags
  337. (cons (xmltok-start-tag-qname)
  338. (cons xmltok-start
  339. open-tags)))
  340. t)
  341. ((eq xmltok-type 'end-tag)
  342. (cond ((not open-tags) nil)
  343. ((not (consp open-tags)) (setq found (point)) nil)
  344. ((not (string= (car open-tags)
  345. (xmltok-end-tag-qname)))
  346. (nxml-scan-error (+ 2 xmltok-start)
  347. "Mismatched end-tag; \
  348. expected `%s'"
  349. (car open-tags)))
  350. ((setq open-tags (cddr open-tags)) t)
  351. (t (setq found (point)) nil)))
  352. ((memq xmltok-type '(empty-element
  353. partial-empty-element))
  354. (if open-tags
  355. t
  356. (setq found (point))
  357. nil))
  358. ((eq xmltok-type 'partial-end-tag)
  359. (cond ((not open-tags) nil)
  360. ((not (consp open-tags)) (setq found (point)) nil)
  361. ((setq open-tags (cddr open-tags)) t)
  362. (t (setq found (point)) nil)))
  363. ((eq xmltok-type 'partial-start-tag)
  364. (nxml-scan-error xmltok-start
  365. "Missing `>'"))
  366. (t t))))
  367. found))
  368. (defun nxml-scan-element-backward (from &optional up bound)
  369. "Scan backward from FROM over a single balanced element.
  370. Point must be between tokens. Return the position of the end of
  371. the tag that starts the element. `xmltok-start' will contain the
  372. position of the start of the tag. If UP is non-nil, then scan
  373. past start-tag of element containing point. If BOUND is non-nil,
  374. then don't scan back past BOUND. If no element is found, return
  375. nil. If a well-formedness error prevents scanning, signal an
  376. `nxml-scan-error'. Point is not moved."
  377. (let ((open-tags (and up t))
  378. token-end found)
  379. (save-excursion
  380. (goto-char from)
  381. (while (cond ((or (< (point) nxml-prolog-end)
  382. (not (search-backward "<"
  383. (max (or bound 0)
  384. nxml-prolog-end)
  385. t)))
  386. (when (and (consp open-tags) (not bound))
  387. (nxml-scan-error (cadr open-tags)
  388. "End-tag has no start-tag"))
  389. nil)
  390. ((progn
  391. (nxml-move-outside-backwards)
  392. (save-excursion
  393. (nxml-tokenize-forward)
  394. (setq token-end (point)))
  395. (eq xmltok-type 'end-tag))
  396. (setq open-tags
  397. (cons (xmltok-end-tag-qname)
  398. (cons xmltok-start open-tags)))
  399. t)
  400. ((eq xmltok-type 'start-tag)
  401. (cond ((not open-tags) nil)
  402. ((not (consp open-tags))
  403. (setq found token-end)
  404. nil)
  405. ((and (car open-tags)
  406. (not (string= (car open-tags)
  407. (xmltok-start-tag-qname))))
  408. (nxml-scan-error (1+ xmltok-start)
  409. "Mismatched start-tag; \
  410. expected `%s'"
  411. (car open-tags)))
  412. ((setq open-tags (cddr open-tags)) t)
  413. (t (setq found token-end) nil)))
  414. ((memq xmltok-type '(empty-element
  415. partial-empty-element))
  416. (if open-tags
  417. t
  418. (setq found token-end)
  419. nil))
  420. ((eq xmltok-type 'partial-end-tag)
  421. (setq open-tags
  422. (cons nil (cons xmltok-start open-tags)))
  423. t)
  424. ((eq xmltok-type 'partial-start-tag)
  425. ;; if we have only a partial-start-tag
  426. ;; then it's unlikely that there's a matching
  427. ;; end-tag, so it's probably not helpful
  428. ;; to treat it as a complete start-tag
  429. (nxml-scan-error xmltok-start
  430. "Missing `>'"))
  431. (t t))))
  432. found))
  433. (defun nxml-scan-error (&rest args)
  434. (signal 'nxml-scan-error args))
  435. (put 'nxml-scan-error
  436. 'error-conditions
  437. '(error nxml-error nxml-scan-error))
  438. (put 'nxml-scan-error
  439. 'error-message
  440. "Scan over element that is not well-formed")
  441. (provide 'nxml-rap)
  442. ;;; nxml-rap.el ends here