xmltok.el 62 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924
  1. ;;; xmltok.el --- XML tokenization
  2. ;; Copyright (C) 2003, 2007-2012 Free Software Foundation, Inc.
  3. ;; Author: James Clark
  4. ;; Keywords: XML
  5. ;; This file is part of GNU Emacs.
  6. ;; GNU Emacs is free software: you can redistribute it and/or modify
  7. ;; it under the terms of the GNU General Public License as published by
  8. ;; the Free Software Foundation, either version 3 of the License, or
  9. ;; (at your option) any later version.
  10. ;; GNU Emacs is distributed in the hope that it will be useful,
  11. ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ;; GNU General Public License for more details.
  14. ;; You should have received a copy of the GNU General Public License
  15. ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
  16. ;;; Commentary:
  17. ;; This implements an XML 1.0 parser. It also implements the XML
  18. ;; Namespaces Recommendation. It is designed to be conforming, but it
  19. ;; works a bit differently from a normal XML parser. An XML document
  20. ;; consists of the prolog and an instance. The prolog is parsed as a
  21. ;; single unit using `xmltok-forward-prolog'. The instance is
  22. ;; considered as a sequence of tokens, where a token is something like
  23. ;; a start-tag, a comment, a chunk of data or a CDATA section. The
  24. ;; tokenization of the instance is stateless: the tokenization of one
  25. ;; part of the instance does not depend on tokenization of the
  26. ;; preceding part of the instance. This allows the instance to be
  27. ;; parsed incrementally. The main entry point is `xmltok-forward':
  28. ;; this can be called at any point in the instance provided it is
  29. ;; between tokens. The other entry point is `xmltok-forward-special'
  30. ;; which skips over tokens other comments, processing instructions or
  31. ;; CDATA sections (i.e. the constructs in an instance that can contain
  32. ;; less than signs that don't start a token).
  33. ;;
  34. ;; This is a non-validating XML 1.0 processor. It does not resolve
  35. ;; parameter entities (including the external DTD subset) and it does
  36. ;; not resolve external general entities.
  37. ;;
  38. ;; It is non-conformant by design in the following respects.
  39. ;;
  40. ;; 1. It expects the client to detect aspects of well-formedness that
  41. ;; are not internal to a single token, specifically checking that
  42. ;; end-tags match start-tags and that the instance contains exactly
  43. ;; one element.
  44. ;;
  45. ;; 2. It expects the client to detect duplicate attributes. Detection
  46. ;; of duplicate attributes after expansion of namespace prefixes
  47. ;; requires the namespace processing state. Detection of duplicate
  48. ;; attributes before expansion of namespace prefixes does not, but is
  49. ;; redundant given that the client will do detection of duplicate
  50. ;; attributes after expansion of namespace prefixes.
  51. ;;
  52. ;; 3. It allows the client to recover from well-formedness errors.
  53. ;; This is essential for use in applications where the document is
  54. ;; being parsed during the editing process.
  55. ;;
  56. ;; 4. It does not support documents that do not conform to the lexical
  57. ;; requirements of the XML Namespaces Recommendation (e.g. a document
  58. ;; with a colon in an entity name).
  59. ;;
  60. ;; There are also a number of things that have not yet been
  61. ;; implemented that make it non-conformant.
  62. ;;
  63. ;; 1. It does not implement default attributes. ATTLIST declarations
  64. ;; are parsed, but no checking is done on the content of attribute
  65. ;; value literals specifying default attribute values, and default
  66. ;; attribute values are not reported to the client.
  67. ;;
  68. ;; 2. It does not implement internal entities containing elements. If
  69. ;; an internal entity is referenced and parsing its replacement text
  70. ;; yields one or more tags, then it will skip the reference and
  71. ;; report this to the client.
  72. ;;
  73. ;; 3. It does not check the syntax of public identifiers in the DTD.
  74. ;;
  75. ;; 4. It allows some non-ASCII characters in certain situations where
  76. ;; it should not. For example, it only enforces XML 1.0's
  77. ;; restrictions on name characters strictly for ASCII characters. The
  78. ;; problem here is XML's character model is based squarely on Unicode,
  79. ;; whereas Emacs's is not (as of version 21). It is not clear what
  80. ;; the right thing to do is.
  81. ;;; Code:
  82. (defvar xmltok-type nil)
  83. (defvar xmltok-start nil)
  84. (defvar xmltok-name-colon nil)
  85. (defvar xmltok-name-end nil)
  86. (defvar xmltok-replacement nil
  87. "String containing replacement for a character or entity reference.")
  88. (defvar xmltok-attributes nil
  89. "List containing attributes of last scanned element.
  90. Each member of the list is a vector representing an attribute, which
  91. can be accessed using the functions `xmltok-attribute-name-start',
  92. `xmltok-attribute-name-colon', `xmltok-attribute-name-end',
  93. `xmltok-attribute-value-start', `xmltok-attribute-value-end',
  94. `xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.")
  95. (defvar xmltok-namespace-attributes nil
  96. "List containing namespace declarations of last scanned element.
  97. List has same format as `xmltok-attributes'.")
  98. (defvar xmltok-dtd nil
  99. "Information about the DTD used by `xmltok-forward'.
  100. `xmltok-forward-prolog' sets this up.
  101. It consists of an alist of general entity names vs definitions. The
  102. first member of the alist is t if references to entities not in the
  103. alist are well-formed \(e.g. because there's an external subset that
  104. wasn't parsed).
  105. Each general entity name is a string. The definition is either nil,
  106. a symbol, a string, a cons cell. If the definition is nil, then it
  107. means that it's an internal entity but the result of parsing it is
  108. unknown. If it is a symbol, then the symbol is either `unparsed',
  109. meaning the entity is an unparsed entity, `external', meaning the
  110. entity is or references an external entity, `element', meaning the
  111. entity includes one or more elements, or `not-well-formed', meaning
  112. the replacement text is not well-formed. If the definition is a
  113. string, then the replacement text of the entity is that string; this
  114. happens only during the parsing of the prolog. If the definition is
  115. a cons cell \(ER . AR), then ER specifies the string that results
  116. from referencing the entity in element content and AR is either nil,
  117. meaning the replacement text included a <, or a string which is the
  118. normalized attribute value.")
  119. (defvar xmltok-dependent-regions nil
  120. "List of descriptors of regions that a parsed token depends on.
  121. A token depends on a region if the region occurs after the token and a
  122. change in the region may require the token to be reparsed. This only
  123. happens with markup that is not well-formed. For example, if a <?
  124. occurs without a matching ?>, then the <? is returned as a
  125. not-well-formed token. However, this token is dependent on region
  126. from the end of the token to the end of the buffer: if this ever
  127. contains ?> then the buffer must be reparsed from the <?.
  128. A region descriptor is a list (FUN START END ARG ...), where FUN is a
  129. function to be called when the region changes, START and END are
  130. integers giving the start and end of the region, and ARG... are
  131. additional arguments to be passed to FUN. FUN will be called with 5
  132. arguments followed by the additional arguments if any: the position of
  133. the start of the changed area in the region, the position of the end
  134. of the changed area in the region, the length of the changed area
  135. before the change, the position of the start of the region, the
  136. position of the end of the region. FUN must return non-nil if the
  137. region needs reparsing. FUN will be called in a `save-excursion'
  138. with match-data saved.
  139. `xmltok-forward', `xmltok-forward-special' and `xmltok-forward-prolog'
  140. may add entries to the beginning of this list, but will not clear it.
  141. `xmltok-forward' and `xmltok-forward-special' will only add entries
  142. when returning tokens of type not-well-formed.")
  143. (defvar xmltok-errors nil
  144. "List of errors detected by `xmltok-forward' and `xmltok-forward-prolog'.
  145. When `xmltok-forward' and `xmltok-forward-prolog' detect a
  146. well-formedness error, they will add an entry to the beginning of this
  147. list. Each entry is a vector [MESSAGE START END], where MESSAGE is a
  148. string giving the error message and START and END are integers
  149. indicating the position of the error.")
  150. (defmacro xmltok-save (&rest body)
  151. `(let (xmltok-type
  152. xmltok-start
  153. xmltok-name-colon
  154. xmltok-name-end
  155. xmltok-replacement
  156. xmltok-attributes
  157. xmltok-namespace-attributes
  158. xmltok-dependent-regions
  159. xmltok-errors)
  160. ,@body))
  161. (put 'xmltok-save 'lisp-indent-function 0)
  162. (def-edebug-spec xmltok-save t)
  163. (defsubst xmltok-attribute-name-start (att)
  164. (aref att 0))
  165. (defsubst xmltok-attribute-name-colon (att)
  166. (aref att 1))
  167. (defsubst xmltok-attribute-name-end (att)
  168. (aref att 2))
  169. (defsubst xmltok-attribute-value-start (att)
  170. (aref att 3))
  171. (defsubst xmltok-attribute-value-end (att)
  172. (aref att 4))
  173. (defsubst xmltok-attribute-raw-normalized-value (att)
  174. "Return an object representing the normalized value of ATT.
  175. This can be t indicating that the normalized value is the same as
  176. the buffer substring from the start to the end of the value, or nil
  177. indicating that the value is not well-formed or a string."
  178. (aref att 5))
  179. (defsubst xmltok-attribute-refs (att)
  180. "Return a list of the entity and character references in ATT.
  181. Each member is a vector [TYPE START END] where TYPE is either char-ref
  182. or entity-ref and START and END are integers giving the start and end of
  183. the reference. Nested entity references are not included in the list."
  184. (aref att 6))
  185. (defun xmltok-attribute-prefix (att)
  186. (let ((colon (xmltok-attribute-name-colon att)))
  187. (and colon
  188. (buffer-substring-no-properties (xmltok-attribute-name-start att)
  189. colon))))
  190. (defun xmltok-attribute-local-name (att)
  191. (let ((colon (xmltok-attribute-name-colon att)))
  192. (buffer-substring-no-properties (if colon
  193. (1+ colon)
  194. (xmltok-attribute-name-start att))
  195. (xmltok-attribute-name-end att))))
  196. (defun xmltok-attribute-value (att)
  197. (let ((rnv (xmltok-attribute-raw-normalized-value att)))
  198. (and rnv
  199. (if (stringp rnv)
  200. rnv
  201. (buffer-substring-no-properties (xmltok-attribute-value-start att)
  202. (xmltok-attribute-value-end att))))))
  203. (defun xmltok-start-tag-prefix ()
  204. (and xmltok-name-colon
  205. (buffer-substring-no-properties (1+ xmltok-start)
  206. xmltok-name-colon)))
  207. (defun xmltok-start-tag-local-name ()
  208. (buffer-substring-no-properties (1+ (or xmltok-name-colon
  209. xmltok-start))
  210. xmltok-name-end))
  211. (defun xmltok-end-tag-prefix ()
  212. (and xmltok-name-colon
  213. (buffer-substring-no-properties (+ 2 xmltok-start)
  214. xmltok-name-colon)))
  215. (defun xmltok-end-tag-local-name ()
  216. (buffer-substring-no-properties (if xmltok-name-colon
  217. (1+ xmltok-name-colon)
  218. (+ 2 xmltok-start))
  219. xmltok-name-end))
  220. (defun xmltok-start-tag-qname ()
  221. (buffer-substring-no-properties (+ xmltok-start 1) xmltok-name-end))
  222. (defun xmltok-end-tag-qname ()
  223. (buffer-substring-no-properties (+ xmltok-start 2) xmltok-name-end))
  224. (defsubst xmltok-make-attribute (name-begin
  225. name-colon
  226. name-end
  227. &optional
  228. value-begin
  229. value-end
  230. raw-normalized-value)
  231. "Make an attribute.
  232. RAW-NORMALIZED-VALUE is nil if the value is not well-formed,
  233. t if the normalized value is the string between VALUE-BEGIN
  234. and VALUE-END, otherwise a STRING giving the value."
  235. (vector name-begin
  236. name-colon
  237. name-end
  238. value-begin
  239. value-end
  240. raw-normalized-value
  241. nil))
  242. (defsubst xmltok-error-message (err)
  243. (aref err 0))
  244. (defsubst xmltok-error-start (err)
  245. (aref err 1))
  246. (defsubst xmltok-error-end (err)
  247. (aref err 2))
  248. (defsubst xmltok-make-error (message start end)
  249. (vector message start end))
  250. (defun xmltok-add-error (message &optional start end)
  251. (setq xmltok-errors
  252. (cons (xmltok-make-error message
  253. (or start xmltok-start)
  254. (or end (point)))
  255. xmltok-errors)))
  256. (defun xmltok-add-dependent (fun &optional start end &rest args)
  257. (setq xmltok-dependent-regions
  258. (cons (cons fun
  259. (cons (or start xmltok-start)
  260. (cons (or end (point-max))
  261. args)))
  262. xmltok-dependent-regions)))
  263. (defun xmltok-forward ()
  264. (setq xmltok-start (point))
  265. (let* ((case-fold-search nil)
  266. (space-count (skip-chars-forward " \t\r\n"))
  267. (ch (char-after)))
  268. (cond ((eq ch ?\<)
  269. (cond ((> space-count 0)
  270. (setq xmltok-type 'space))
  271. (t
  272. (forward-char 1)
  273. (xmltok-scan-after-lt))))
  274. ((eq ch ?\&)
  275. (cond ((> space-count 0)
  276. (setq xmltok-type 'space))
  277. (t
  278. (forward-char 1)
  279. (xmltok-scan-after-amp 'xmltok-handle-entity))))
  280. ((re-search-forward "[<&]\\|\\(]]>\\)" nil t)
  281. (cond ((not (match-beginning 1))
  282. (goto-char (match-beginning 0))
  283. ;; must have got a non-space char
  284. (setq xmltok-type 'data))
  285. ((= (match-beginning 1) xmltok-start)
  286. (xmltok-add-error "Found `]]>' not closing a CDATA section")
  287. (setq xmltok-type 'not-well-formed))
  288. (t
  289. (goto-char (match-beginning 0))
  290. (setq xmltok-type
  291. (if (= (point) (+ xmltok-start space-count))
  292. 'space
  293. 'data)))))
  294. ((eq ch nil)
  295. (setq xmltok-type
  296. (if (> space-count 0)
  297. 'space
  298. nil)))
  299. (t
  300. (goto-char (point-max))
  301. (setq xmltok-type 'data)))))
  302. (defun xmltok-forward-special (bound)
  303. "Scan forward past the first special token starting at or after point.
  304. Return nil if there is no special token that starts before BOUND.
  305. CDATA sections, processing instructions and comments (and indeed
  306. anything starting with < following by ? or !) count as special.
  307. Return the type of the token."
  308. (when (re-search-forward "<[?!]" (1+ bound) t)
  309. (setq xmltok-start (match-beginning 0))
  310. (goto-char (1+ xmltok-start))
  311. (let ((case-fold-search nil))
  312. (xmltok-scan-after-lt))))
  313. (eval-when-compile
  314. ;; A symbolic regexp is represented by a list whose CAR is the string
  315. ;; containing the regexp and whose cdr is a list of symbolic names
  316. ;; for the groups in the string.
  317. ;; Construct a symbolic regexp from a regexp.
  318. (defun xmltok-r (str)
  319. (cons str nil))
  320. ;; Concatenate zero of more regexps and symbolic regexps.
  321. (defun xmltok+ (&rest args)
  322. (let (strs names)
  323. (while args
  324. (let ((arg (car args)))
  325. (if (stringp arg)
  326. (setq strs (cons arg strs))
  327. (setq strs (cons (car arg) strs))
  328. (setq names (cons (cdr arg) names)))
  329. (setq args (cdr args))))
  330. (cons (apply 'concat (nreverse strs))
  331. (apply 'append (nreverse names))))))
  332. (eval-when-compile
  333. ;; Make a symbolic group named NAME from the regexp R.
  334. ;; R may be a symbolic regexp or an ordinary regexp.
  335. (defmacro xmltok-g (name &rest r)
  336. (let ((sym (make-symbol "r")))
  337. `(let ((,sym (xmltok+ ,@r)))
  338. (if (stringp ,sym)
  339. (cons (concat "\\(" ,sym "\\)") (cons ',name nil))
  340. (cons (concat "\\(" (car ,sym) "\\)") (cons ',name (cdr ,sym)))))))
  341. (defun xmltok-p (&rest r) (xmltok+ "\\(?:"
  342. (apply 'xmltok+ r)
  343. "\\)"))
  344. ;; Get the group index of ELEM in a LIST of symbols.
  345. (defun xmltok-get-index (elem list)
  346. (or elem
  347. (error "Missing group name"))
  348. (let ((found nil)
  349. (i 1))
  350. (while list
  351. (cond ((eq elem (car list))
  352. (setq found i)
  353. (setq list nil))
  354. (t
  355. (setq i (1+ i))
  356. (setq list (cdr list)))))
  357. (or found
  358. (error "Bad group name %s" elem))))
  359. ;; Define a macro SYM using a symbolic regexp R.
  360. ;; SYM can be called in three ways:
  361. ;; (SYM regexp)
  362. ;; expands to the regexp in R
  363. ;; (SYM start G)
  364. ;; expands to
  365. ;; (match-beginning N)
  366. ;; where N is the group index of G in R.
  367. ;; (SYM end G)
  368. ;; expands to
  369. ;; (match-end N)
  370. ;; where N is the group index of G in R.
  371. (defmacro xmltok-defregexp (sym r)
  372. `(defalias ',sym
  373. (let ((r ,r))
  374. `(macro lambda (action &optional group-name)
  375. (cond ((eq action 'regexp)
  376. ,(car r))
  377. ((or (eq action 'start) (eq action 'beginning))
  378. (list 'match-beginning (xmltok-get-index group-name
  379. ',(cdr r))))
  380. ((eq action 'end)
  381. (list 'match-end (xmltok-get-index group-name
  382. ',(cdr r))))
  383. ((eq action 'string)
  384. (list 'match-string
  385. (xmltok-get-index group-name ',(cdr r))))
  386. ((eq action 'string-no-properties)
  387. (list 'match-string-no-properties
  388. (xmltok-get-index group-name ',(cdr r))))
  389. (t (error "Invalid action: %s" action))))))))
  390. (eval-when-compile
  391. (let* ((or "\\|")
  392. (open "\\(?:")
  393. (gopen "\\(")
  394. (close "\\)")
  395. (name-start-char "[_[:alpha:]]")
  396. (name-continue-not-start-char "[-.[:digit:]]")
  397. (name-continue-char "[-._[:alnum:]]")
  398. (* "*")
  399. (+ "+")
  400. (opt "?")
  401. (question "\\?")
  402. (s "[ \r\t\n]")
  403. (s+ (concat s +))
  404. (s* (concat s *))
  405. (ncname (concat name-start-char name-continue-char *))
  406. (entity-ref
  407. (xmltok+ (xmltok-g entity-name ncname)
  408. (xmltok-g entity-ref-close ";") opt))
  409. (decimal-ref
  410. (xmltok+ (xmltok-g decimal "[0-9]" +)
  411. (xmltok-g decimal-ref-close ";") opt))
  412. (hex-ref
  413. (xmltok+ "x" open
  414. (xmltok-g hex "[0-9a-fA-F]" +)
  415. (xmltok-g hex-ref-close ";") opt
  416. close opt))
  417. (char-ref
  418. (xmltok+ (xmltok-g number-sign "#")
  419. open decimal-ref or hex-ref close opt))
  420. (start-tag-close
  421. (xmltok+ open (xmltok-g start-tag-close s* ">")
  422. or open (xmltok-g empty-tag-slash s* "/")
  423. (xmltok-g empty-tag-close ">") opt close
  424. or (xmltok-g start-tag-s s+)
  425. close))
  426. (start-tag
  427. (xmltok+ (xmltok-g start-tag-name
  428. ncname (xmltok-g start-tag-colon ":" ncname) opt)
  429. start-tag-close opt))
  430. (end-tag
  431. (xmltok+ (xmltok-g end-tag-slash "/")
  432. open (xmltok-g end-tag-name
  433. ncname
  434. (xmltok-g end-tag-colon ":" ncname) opt)
  435. (xmltok-g end-tag-close s* ">") opt
  436. close opt))
  437. (comment
  438. (xmltok+ (xmltok-g markup-declaration "!")
  439. (xmltok-g comment-first-dash "-"
  440. (xmltok-g comment-open "-") opt) opt))
  441. (cdata-section
  442. (xmltok+ "!"
  443. (xmltok-g marked-section-open "\\[")
  444. open "C"
  445. open "D"
  446. open "A"
  447. open "T"
  448. open "A"
  449. (xmltok-g cdata-section-open "\\[" ) opt
  450. close opt ; A
  451. close opt ; T
  452. close opt ; A
  453. close opt ; D
  454. close opt)) ; C
  455. (processing-instruction
  456. (xmltok-g processing-instruction-question question)))
  457. (xmltok-defregexp xmltok-ncname (xmltok+ open ncname close))
  458. (xmltok-defregexp xmltok-after-amp
  459. (xmltok+ entity-ref or char-ref))
  460. (xmltok-defregexp xmltok-after-lt
  461. (xmltok+ start-tag
  462. or end-tag
  463. ;; cdata-section must come before comment
  464. ;; because we treat <! as a comment
  465. ;; and Emacs doesn't do fully greedy matching
  466. ;; by default
  467. or cdata-section
  468. or comment
  469. or processing-instruction))
  470. (xmltok-defregexp
  471. xmltok-attribute
  472. (let* ((lit1
  473. (xmltok+ "'"
  474. "[^<'&\r\n\t]*"
  475. (xmltok-g complex1 "[&\r\n\t][^<']*") opt
  476. "'"))
  477. (lit2 (cons (replace-regexp-in-string "'" "\"" (car lit1))
  478. '(complex2)))
  479. (literal (xmltok-g literal lit1 or lit2))
  480. (name (xmltok+ open (xmltok-g xmlns "xmlns") or ncname close
  481. (xmltok-g colon ":" ncname) opt)))
  482. (xmltok+ (xmltok-g name name)
  483. s* "="
  484. ;; If the literal isn't followed by what it should be,
  485. ;; then the closing delimiter is probably really the
  486. ;; opening delimiter of another literal, so don't
  487. ;; absorb the literal in this case.
  488. open s* literal start-tag-close close opt)))
  489. (xmltok-defregexp
  490. xmltok-xml-declaration
  491. (let* ((literal-content "[-._:a-zA-Z0-9]+")
  492. (literal
  493. (concat open "\"" literal-content "\""
  494. or "'" literal-content "'" close))
  495. (version-att
  496. (xmltok+ open
  497. s+ (xmltok-g version-name "version")
  498. s* "="
  499. s* (xmltok-g version-value literal)
  500. close opt))
  501. (encoding-att
  502. (xmltok+ open
  503. s+ (xmltok-g encoding-name "encoding")
  504. s* "="
  505. s* (xmltok-g encoding-value literal)
  506. close opt))
  507. (yes-no
  508. (concat open "yes" or "no" close))
  509. (standalone-att
  510. (xmltok+ open
  511. s+ (xmltok-g standalone-name "standalone")
  512. s* "="
  513. s* (xmltok-g standalone-value
  514. "\"" yes-no "\"" or "'" yes-no "'")
  515. close opt)))
  516. (xmltok+ "<" question "xml"
  517. version-att
  518. encoding-att
  519. standalone-att
  520. s* question ">")))
  521. (xmltok-defregexp
  522. xmltok-prolog
  523. (let* ((single-char (xmltok-g single-char "[[|,(\"'>]"))
  524. (internal-subset-close (xmltok-g internal-subset-close
  525. "][ \t\r\n]*>"))
  526. (starts-with-close-paren
  527. (xmltok-g close-paren
  528. ")"
  529. (xmltok-p
  530. (xmltok-g close-paren-occur "[+?]")
  531. or
  532. (xmltok-g close-paren-star "\\*"))
  533. opt))
  534. (starts-with-percent
  535. (xmltok-g percent
  536. "%" (xmltok-g param-entity-ref
  537. ncname
  538. (xmltok-g param-entity-ref-close
  539. ";") opt) opt))
  540. (starts-with-nmtoken-not-name
  541. (xmltok-g nmtoken
  542. (xmltok-p name-continue-not-start-char or ":")
  543. (xmltok-p name-continue-char or ":") *))
  544. (nmtoken-after-colon
  545. (xmltok+
  546. (xmltok-p name-continue-not-start-char or ":")
  547. (xmltok-p name-continue-char or ":") *
  548. or
  549. name-start-char
  550. name-continue-char *
  551. ":"
  552. (xmltok-p name-continue-char or ":") *))
  553. (after-ncname
  554. (xmltok+ (xmltok-g ncname-nmtoken
  555. ":" (xmltok-p nmtoken-after-colon))
  556. or (xmltok-p (xmltok-g colon ":" ncname)
  557. (xmltok-g colon-name-occur "[?+*]") opt)
  558. or (xmltok-g ncname-occur "[?+*]")
  559. or (xmltok-g ncname-colon ":")))
  560. (starts-with-name
  561. (xmltok-g name ncname (xmltok-p after-ncname) opt))
  562. (starts-with-hash
  563. (xmltok-g pound
  564. "#" (xmltok-g hash-name ncname)))
  565. (markup-declaration
  566. (xmltok-g markup-declaration
  567. "!" (xmltok-p (xmltok-g comment-first-dash "-"
  568. (xmltok-g comment-open "-") opt)
  569. or (xmltok-g named-markup-declaration
  570. ncname)) opt))
  571. (after-lt
  572. (xmltok+ markup-declaration
  573. or (xmltok-g processing-instruction-question
  574. question)
  575. or (xmltok-g instance-start
  576. ncname)))
  577. (starts-with-lt (xmltok-g less-than "<" (xmltok-p after-lt) opt)))
  578. (xmltok+ starts-with-lt
  579. or single-char
  580. or starts-with-close-paren
  581. or starts-with-percent
  582. or starts-with-name
  583. or starts-with-nmtoken-not-name
  584. or starts-with-hash
  585. or internal-subset-close)))))
  586. (defconst xmltok-ncname-regexp (xmltok-ncname regexp))
  587. (defun xmltok-scan-after-lt ()
  588. (cond ((not (looking-at (xmltok-after-lt regexp)))
  589. (xmltok-add-error "`<' that is not markup must be entered as `&lt;'")
  590. (setq xmltok-type 'not-well-formed))
  591. (t
  592. (goto-char (match-end 0))
  593. (cond ((xmltok-after-lt start start-tag-close)
  594. (setq xmltok-name-end
  595. (xmltok-after-lt end start-tag-name))
  596. (setq xmltok-name-colon
  597. (xmltok-after-lt start start-tag-colon))
  598. (setq xmltok-attributes nil)
  599. (setq xmltok-namespace-attributes nil)
  600. (setq xmltok-type 'start-tag))
  601. ((xmltok-after-lt start end-tag-close)
  602. (setq xmltok-name-end
  603. (xmltok-after-lt end end-tag-name))
  604. (setq xmltok-name-colon
  605. (xmltok-after-lt start end-tag-colon))
  606. (setq xmltok-type 'end-tag))
  607. ((xmltok-after-lt start start-tag-s)
  608. (setq xmltok-name-end
  609. (xmltok-after-lt end start-tag-name))
  610. (setq xmltok-name-colon
  611. (xmltok-after-lt start start-tag-colon))
  612. (setq xmltok-namespace-attributes nil)
  613. (setq xmltok-attributes nil)
  614. (xmltok-scan-attributes)
  615. xmltok-type)
  616. ((xmltok-after-lt start empty-tag-close)
  617. (setq xmltok-name-end
  618. (xmltok-after-lt end start-tag-name))
  619. (setq xmltok-name-colon
  620. (xmltok-after-lt start start-tag-colon))
  621. (setq xmltok-attributes nil)
  622. (setq xmltok-namespace-attributes nil)
  623. (setq xmltok-type 'empty-element))
  624. ((xmltok-after-lt start cdata-section-open)
  625. (setq xmltok-type
  626. (if (search-forward "]]>" nil t)
  627. 'cdata-section
  628. (xmltok-add-error "No closing ]]>")
  629. (xmltok-add-dependent 'xmltok-unclosed-reparse-p
  630. nil
  631. nil
  632. "]]>")
  633. 'not-well-formed)))
  634. ((xmltok-after-lt start processing-instruction-question)
  635. (xmltok-scan-after-processing-instruction-open))
  636. ((xmltok-after-lt start comment-open)
  637. (xmltok-scan-after-comment-open))
  638. ((xmltok-after-lt start empty-tag-slash)
  639. (setq xmltok-name-end
  640. (xmltok-after-lt end start-tag-name))
  641. (setq xmltok-name-colon
  642. (xmltok-after-lt start start-tag-colon))
  643. (setq xmltok-attributes nil)
  644. (setq xmltok-namespace-attributes nil)
  645. (xmltok-add-error "Expected `/>'" (1- (point)))
  646. (setq xmltok-type 'partial-empty-element))
  647. ((xmltok-after-lt start start-tag-name)
  648. (xmltok-add-error "Missing `>'"
  649. nil
  650. (1+ xmltok-start))
  651. (setq xmltok-name-end
  652. (xmltok-after-lt end start-tag-name))
  653. (setq xmltok-name-colon
  654. (xmltok-after-lt start start-tag-colon))
  655. (setq xmltok-namespace-attributes nil)
  656. (setq xmltok-attributes nil)
  657. (setq xmltok-type 'partial-start-tag))
  658. ((xmltok-after-lt start end-tag-name)
  659. (setq xmltok-name-end (xmltok-after-lt end end-tag-name))
  660. (setq xmltok-name-colon
  661. (xmltok-after-lt start end-tag-colon))
  662. (cond ((and (not xmltok-name-colon)
  663. (eq (char-after) ?:))
  664. (goto-char (1+ (point)))
  665. (xmltok-add-error "Expected name following `:'"
  666. (1- (point))))
  667. (t
  668. (xmltok-add-error "Missing `>'"
  669. nil
  670. (1+ xmltok-start))))
  671. (setq xmltok-type 'partial-end-tag))
  672. ((xmltok-after-lt start end-tag-slash)
  673. (xmltok-add-error "Expected name following `</'")
  674. (setq xmltok-name-end nil)
  675. (setq xmltok-name-colon nil)
  676. (setq xmltok-type 'partial-end-tag))
  677. ((xmltok-after-lt start marked-section-open)
  678. (xmltok-add-error "Expected `CDATA[' after `<!['"
  679. xmltok-start
  680. (+ 3 xmltok-start))
  681. (setq xmltok-type 'not-well-formed))
  682. ((xmltok-after-lt start comment-first-dash)
  683. (xmltok-add-error "Expected `-' after `<!-'"
  684. xmltok-start
  685. (+ 3 xmltok-start))
  686. (setq xmltok-type 'not-well-formed))
  687. ((xmltok-after-lt start markup-declaration)
  688. (xmltok-add-error "Expected `[CDATA[' or `--' after `<!'"
  689. xmltok-start
  690. (+ 2 xmltok-start))
  691. (setq xmltok-type 'not-well-formed))
  692. (t
  693. (xmltok-add-error "Not well-formed")
  694. (setq xmltok-type 'not-well-formed))))))
  695. ;; XXX This should be unified with
  696. ;; xmltok-scan-prolog-after-processing-instruction-open
  697. ;; XXX maybe should include rest of line (up to any <,>) in unclosed PI
  698. (defun xmltok-scan-after-processing-instruction-open ()
  699. (cond ((not (search-forward "?>" nil t))
  700. (xmltok-add-error "No closing ?>"
  701. xmltok-start
  702. (+ xmltok-start 2))
  703. (xmltok-add-dependent 'xmltok-unclosed-reparse-p
  704. nil
  705. nil
  706. "?>")
  707. (setq xmltok-type 'not-well-formed))
  708. (t
  709. (cond ((not (save-excursion
  710. (goto-char (+ 2 xmltok-start))
  711. (and (looking-at (xmltok-ncname regexp))
  712. (setq xmltok-name-end (match-end 0)))))
  713. (setq xmltok-name-end (+ xmltok-start 2))
  714. (xmltok-add-error "<? not followed by name"
  715. (+ xmltok-start 2)
  716. (+ xmltok-start 3)))
  717. ((not (or (memq (char-after xmltok-name-end)
  718. '(?\n ?\t ?\r ? ))
  719. (= xmltok-name-end (- (point) 2))))
  720. (xmltok-add-error "Target not followed by whitespace"
  721. xmltok-name-end
  722. (1+ xmltok-name-end)))
  723. ((and (= xmltok-name-end (+ xmltok-start 5))
  724. (save-excursion
  725. (goto-char (+ xmltok-start 2))
  726. (let ((case-fold-search t))
  727. (looking-at "xml"))))
  728. (xmltok-add-error "Processing instruction target is xml"
  729. (+ xmltok-start 2)
  730. (+ xmltok-start 5))))
  731. (setq xmltok-type 'processing-instruction))))
  732. (defun xmltok-scan-after-comment-open ()
  733. (setq xmltok-type
  734. (cond ((not (search-forward "--" nil t))
  735. (xmltok-add-error "No closing -->")
  736. (xmltok-add-dependent 'xmltok-unclosed-reparse-p
  737. nil
  738. nil
  739. ;; not --> because
  740. ;; -- is not allowed
  741. ;; in comments in XML
  742. "--")
  743. 'not-well-formed)
  744. ((eq (char-after) ?>)
  745. (goto-char (1+ (point)))
  746. 'comment)
  747. (t
  748. (xmltok-add-dependent
  749. 'xmltok-semi-closed-reparse-p
  750. nil
  751. (point)
  752. "--"
  753. 2)
  754. ;; just include the <!-- in the token
  755. (goto-char (+ xmltok-start 4))
  756. ;; Need do this after the goto-char because
  757. ;; marked error should just apply to <!--
  758. (xmltok-add-error "First following `--' not followed by `>'")
  759. 'not-well-formed))))
  760. (defun xmltok-scan-attributes ()
  761. (let ((recovering nil)
  762. (atts-needing-normalization nil))
  763. (while (cond ((or (looking-at (xmltok-attribute regexp))
  764. ;; use non-greedy group
  765. (when (looking-at (concat "[^<>\n]+?"
  766. (xmltok-attribute regexp)))
  767. (unless recovering
  768. (xmltok-add-error "Malformed attribute"
  769. (point)
  770. (save-excursion
  771. (goto-char (xmltok-attribute start
  772. name))
  773. (skip-chars-backward "\r\n\t ")
  774. (point))))
  775. t))
  776. (setq recovering nil)
  777. (goto-char (match-end 0))
  778. (let ((att (xmltok-add-attribute)))
  779. (when att
  780. (setq atts-needing-normalization
  781. (cons att atts-needing-normalization))))
  782. (cond ((xmltok-attribute start start-tag-s) t)
  783. ((xmltok-attribute start start-tag-close)
  784. (setq xmltok-type 'start-tag)
  785. nil)
  786. ((xmltok-attribute start empty-tag-close)
  787. (setq xmltok-type 'empty-element)
  788. nil)
  789. ((xmltok-attribute start empty-tag-slash)
  790. (setq xmltok-type 'partial-empty-element)
  791. (xmltok-add-error "Expected `/>'"
  792. (1- (point)))
  793. nil)
  794. ((looking-at "[ \t\r\n]*[\"']")
  795. (goto-char (match-end 0))
  796. (xmltok-add-error "Missing closing delimiter"
  797. (1- (point)))
  798. (setq recovering t)
  799. t)
  800. ((looking-at "[ \t]*\\([^ \t\r\n\"'=<>/]+\\)[ \t\r\n/>]")
  801. (goto-char (match-end 1))
  802. (xmltok-add-error "Attribute value not quoted"
  803. (match-beginning 1))
  804. (setq recovering t)
  805. t)
  806. (t
  807. (xmltok-add-error "Missing attribute value"
  808. (1- (point)))
  809. (setq recovering t)
  810. t)))
  811. ((looking-at "[^<>\n]*/>")
  812. (let ((start (point)))
  813. (goto-char (match-end 0))
  814. (unless recovering
  815. (xmltok-add-error "Malformed empty-element"
  816. start
  817. (- (point) 2))))
  818. (setq xmltok-type 'empty-element)
  819. nil)
  820. ((looking-at "[^<>\n]*>")
  821. (let ((start (point)))
  822. (goto-char (match-end 0))
  823. (unless recovering
  824. (xmltok-add-error "Malformed start-tag"
  825. start
  826. (1- (point)))))
  827. (setq xmltok-type 'start-tag)
  828. nil)
  829. (t
  830. (when recovering
  831. (skip-chars-forward "^<>\n"))
  832. (xmltok-add-error "Missing `>'"
  833. xmltok-start
  834. (1+ xmltok-start))
  835. (setq xmltok-type 'partial-start-tag)
  836. nil)))
  837. (while atts-needing-normalization
  838. (xmltok-normalize-attribute (car atts-needing-normalization))
  839. (setq atts-needing-normalization (cdr atts-needing-normalization))))
  840. (setq xmltok-attributes
  841. (nreverse xmltok-attributes))
  842. (setq xmltok-namespace-attributes
  843. (nreverse xmltok-namespace-attributes)))
  844. (defun xmltok-add-attribute ()
  845. "Return the attribute if it needs normalizing, otherwise nil."
  846. (let* ((needs-normalizing nil)
  847. (att
  848. (if (xmltok-attribute start literal)
  849. (progn
  850. (setq needs-normalizing
  851. (or (xmltok-attribute start complex1)
  852. (xmltok-attribute start complex2)))
  853. (xmltok-make-attribute (xmltok-attribute start name)
  854. (xmltok-attribute start colon)
  855. (xmltok-attribute end name)
  856. (1+ (xmltok-attribute start literal))
  857. (1- (xmltok-attribute end literal))
  858. (not needs-normalizing)))
  859. (xmltok-make-attribute (xmltok-attribute start name)
  860. (xmltok-attribute start colon)
  861. (xmltok-attribute end name)))))
  862. (if (xmltok-attribute start xmlns)
  863. (setq xmltok-namespace-attributes
  864. (cons att xmltok-namespace-attributes))
  865. (setq xmltok-attributes
  866. (cons att xmltok-attributes)))
  867. (and needs-normalizing
  868. att)))
  869. (defun xmltok-normalize-attribute (att)
  870. (let ((end (xmltok-attribute-value-end att))
  871. (well-formed t)
  872. (value-parts nil)
  873. (refs nil))
  874. (save-excursion
  875. (goto-char (xmltok-attribute-value-start att))
  876. (while (progn
  877. (let ((n (skip-chars-forward "^\r\t\n&" end)))
  878. (when (> n 0)
  879. (setq value-parts
  880. (cons (buffer-substring-no-properties (- (point) n)
  881. (point))
  882. value-parts))))
  883. (when (< (point) end)
  884. (goto-char (1+ (point)))
  885. (cond ((eq (char-before) ?\&)
  886. (let ((xmltok-start (1- (point)))
  887. xmltok-type xmltok-replacement)
  888. (xmltok-scan-after-amp
  889. (lambda (start end)
  890. (xmltok-handle-entity start end t)))
  891. (cond ((or (eq xmltok-type 'char-ref)
  892. (eq xmltok-type 'entity-ref))
  893. (setq refs
  894. (cons (vector xmltok-type
  895. xmltok-start
  896. (point))
  897. refs))
  898. (if xmltok-replacement
  899. (setq value-parts
  900. (cons xmltok-replacement
  901. value-parts))
  902. (setq well-formed nil)))
  903. (t (setq well-formed nil)))))
  904. (t (setq value-parts
  905. (cons " " value-parts)))))
  906. (< (point) end))))
  907. (when well-formed
  908. (aset att 5 (apply 'concat (nreverse value-parts))))
  909. (aset att 6 (nreverse refs))))
  910. (defun xmltok-scan-after-amp (entity-handler)
  911. (cond ((not (looking-at (xmltok-after-amp regexp)))
  912. (xmltok-add-error "`&' that is not markup must be entered as `&amp;'")
  913. (setq xmltok-type 'not-well-formed))
  914. (t
  915. (goto-char (match-end 0))
  916. (cond ((xmltok-after-amp start entity-ref-close)
  917. (funcall entity-handler
  918. (xmltok-after-amp start entity-name)
  919. (xmltok-after-amp end entity-name))
  920. (setq xmltok-type 'entity-ref))
  921. ((xmltok-after-amp start decimal-ref-close)
  922. (xmltok-scan-char-ref (xmltok-after-amp start decimal)
  923. (xmltok-after-amp end decimal)
  924. 10))
  925. ((xmltok-after-amp start hex-ref-close)
  926. (xmltok-scan-char-ref (xmltok-after-amp start hex)
  927. (xmltok-after-amp end hex)
  928. 16))
  929. ((xmltok-after-amp start number-sign)
  930. (xmltok-add-error "Missing character number")
  931. (setq xmltok-type 'not-well-formed))
  932. (t
  933. (xmltok-add-error "Missing closing `;'")
  934. (setq xmltok-type 'not-well-formed))))))
  935. (defconst xmltok-entity-error-messages
  936. '((unparsed . "Referenced entity is unparsed")
  937. (not-well-formed . "Referenced entity is not well-formed")
  938. (external nil . "Referenced entity is external")
  939. (element nil . "Referenced entity contains <")))
  940. (defun xmltok-handle-entity (start end &optional attributep)
  941. (let* ((name (buffer-substring-no-properties start end))
  942. (name-def (assoc name xmltok-dtd))
  943. (def (cdr name-def)))
  944. (cond ((setq xmltok-replacement (and (consp def)
  945. (if attributep
  946. (cdr def)
  947. (car def)))))
  948. ((null name-def)
  949. (unless (eq (car xmltok-dtd) t)
  950. (xmltok-add-error "Referenced entity has not been defined"
  951. start
  952. end)))
  953. ((and attributep (consp def))
  954. (xmltok-add-error "Referenced entity contains <"
  955. start
  956. end))
  957. (t
  958. (let ((err (cdr (assq def xmltok-entity-error-messages))))
  959. (when (consp err)
  960. (setq err (if attributep (cdr err) (car err))))
  961. (when err
  962. (xmltok-add-error err start end)))))))
  963. (defun xmltok-scan-char-ref (start end base)
  964. (setq xmltok-replacement
  965. (let ((n (string-to-number (buffer-substring-no-properties start end)
  966. base)))
  967. (cond ((and (integerp n) (xmltok-valid-char-p n))
  968. (setq n (xmltok-unicode-to-char n))
  969. (and n (string n)))
  970. (t
  971. (xmltok-add-error "Invalid character code" start end)
  972. nil))))
  973. (setq xmltok-type 'char-ref))
  974. (defun xmltok-char-number (start end)
  975. (let* ((base (if (eq (char-after (+ start 2)) ?x)
  976. 16
  977. 10))
  978. (n (string-to-number
  979. (buffer-substring-no-properties (+ start (if (= base 16) 3 2))
  980. (1- end))
  981. base)))
  982. (and (integerp n)
  983. (xmltok-valid-char-p n)
  984. n)))
  985. (defun xmltok-unclosed-reparse-p (change-start
  986. change-end
  987. pre-change-length
  988. start
  989. end
  990. delimiter)
  991. (let ((len-1 (1- (length delimiter))))
  992. (goto-char (max start (- change-start len-1)))
  993. (search-forward delimiter (min end (+ change-end len-1)) t)))
  994. ;; Handles a <!-- with the next -- not followed by >
  995. (defun xmltok-semi-closed-reparse-p (change-start
  996. change-end
  997. pre-change-length
  998. start
  999. end
  1000. delimiter
  1001. delimiter-length)
  1002. (or (<= (- end delimiter-length) change-end)
  1003. (xmltok-unclosed-reparse-p change-start
  1004. change-end
  1005. pre-change-length
  1006. start
  1007. end
  1008. delimiter)))
  1009. (defun xmltok-valid-char-p (n)
  1010. "Return non-nil if N is the Unicode code of a valid XML character."
  1011. (cond ((< n #x20) (memq n '(#xA #xD #x9)))
  1012. ((< n #xD800) t)
  1013. ((< n #xE000) nil)
  1014. ((< n #xFFFE) t)
  1015. (t (and (> n #xFFFF)
  1016. (< n #x110000)))))
  1017. (defun xmltok-unicode-to-char (n)
  1018. "Return the character corresponding to Unicode scalar value N.
  1019. Return nil if unsupported in Emacs."
  1020. (decode-char 'ucs n))
  1021. ;;; Prolog parsing
  1022. (defvar xmltok-contains-doctype nil)
  1023. (defvar xmltok-doctype-external-subset-flag nil)
  1024. (defvar xmltok-internal-subset-start nil)
  1025. (defvar xmltok-had-param-entity-ref nil)
  1026. (defvar xmltok-prolog-regions nil)
  1027. (defvar xmltok-standalone nil
  1028. "Non-nil if there was an XML declaration specifying standalone=\"yes\".")
  1029. (defvar xmltok-markup-declaration-doctype-flag nil)
  1030. (defconst xmltok-predefined-entity-alist
  1031. '(("lt" "<" . "<")
  1032. ("gt" ">" . ">")
  1033. ("amp" "&" . "&")
  1034. ("apos" "'" . "'")
  1035. ("quot" "\"" . "\"")))
  1036. (defun xmltok-forward-prolog ()
  1037. "Move forward to the end of the XML prolog.
  1038. Returns a list of vectors [TYPE START END] where TYPE is a symbol and
  1039. START and END are integers giving the start and end of the region of
  1040. that type. TYPE can be one of xml-declaration,
  1041. xml-declaration-attribute-name, xml-declaration-attribute-value,
  1042. comment, processing-instruction-left, processing-instruction-right,
  1043. markup-declaration-open, markup-declaration-close,
  1044. internal-subset-open, internal-subset-close, hash-name, keyword,
  1045. literal, encoding-name.
  1046. Adds to `xmltok-errors' and `xmltok-dependent-regions' as appropriate."
  1047. (let ((case-fold-search nil)
  1048. xmltok-start
  1049. xmltok-type
  1050. xmltok-prolog-regions
  1051. xmltok-contains-doctype
  1052. xmltok-internal-subset-start
  1053. xmltok-had-param-entity-ref
  1054. xmltok-standalone
  1055. xmltok-doctype-external-subset-flag
  1056. xmltok-markup-declaration-doctype-flag)
  1057. (setq xmltok-dtd xmltok-predefined-entity-alist)
  1058. (xmltok-scan-xml-declaration)
  1059. (xmltok-next-prolog-token)
  1060. (while (condition-case err
  1061. (when (xmltok-parse-prolog-item)
  1062. (xmltok-next-prolog-token))
  1063. (xmltok-markup-declaration-parse-error
  1064. (xmltok-skip-markup-declaration))))
  1065. (when xmltok-internal-subset-start
  1066. (xmltok-add-error "No closing ]"
  1067. (1- xmltok-internal-subset-start)
  1068. xmltok-internal-subset-start))
  1069. (xmltok-parse-entities)
  1070. ;; XXX prune dependent-regions for those entirely in prolog
  1071. (nreverse xmltok-prolog-regions)))
  1072. (defconst xmltok-bad-xml-decl-regexp
  1073. "[ \t\r\n]*<\\?xml\\(?:[ \t\r\n]\\|\\?>\\)")
  1074. ;;;###autoload
  1075. (defun xmltok-get-declared-encoding-position (&optional limit)
  1076. "Return the position of the encoding in the XML declaration at point.
  1077. If there is a well-formed XML declaration starting at point and it
  1078. contains an encoding declaration, then return (START . END)
  1079. where START and END are the positions of the start and the end
  1080. of the encoding name; if there is no encoding declaration return
  1081. the position where and encoding declaration could be inserted.
  1082. If there is XML that is not well-formed that looks like an XML
  1083. declaration, return nil. Otherwise, return t.
  1084. If LIMIT is non-nil, then do not consider characters beyond LIMIT."
  1085. (cond ((let ((case-fold-search nil))
  1086. (and (looking-at (xmltok-xml-declaration regexp))
  1087. (or (not limit) (<= (match-end 0) limit))))
  1088. (let ((end (xmltok-xml-declaration end encoding-value)))
  1089. (if end
  1090. (cons (1+ (xmltok-xml-declaration start encoding-value))
  1091. (1- end))
  1092. (or (xmltok-xml-declaration end version-value)
  1093. (+ (point) 5)))))
  1094. ((not (let ((case-fold-search t))
  1095. (looking-at xmltok-bad-xml-decl-regexp))))))
  1096. (defun xmltok-scan-xml-declaration ()
  1097. (when (looking-at (xmltok-xml-declaration regexp))
  1098. (xmltok-add-prolog-region 'xml-declaration (point) (match-end 0))
  1099. (goto-char (match-end 0))
  1100. (when (xmltok-xml-declaration start version-name)
  1101. (xmltok-add-prolog-region 'xml-declaration-attribute-name
  1102. (xmltok-xml-declaration start version-name)
  1103. (xmltok-xml-declaration end version-name))
  1104. (let ((start (xmltok-xml-declaration start version-value))
  1105. (end (xmltok-xml-declaration end version-value)))
  1106. (xmltok-add-prolog-region 'xml-declaration-attribute-value
  1107. start
  1108. end)))
  1109. ;; XXX need to check encoding name
  1110. ;; Should start with letter, not contain colon
  1111. (when (xmltok-xml-declaration start encoding-name)
  1112. (xmltok-add-prolog-region 'xml-declaration-attribute-name
  1113. (xmltok-xml-declaration start encoding-name)
  1114. (xmltok-xml-declaration end encoding-name))
  1115. (let ((start (xmltok-xml-declaration start encoding-value))
  1116. (end (xmltok-xml-declaration end encoding-value)))
  1117. (xmltok-add-prolog-region 'encoding-name
  1118. (1+ start)
  1119. (1- end))
  1120. (xmltok-add-prolog-region 'xml-declaration-attribute-value
  1121. start
  1122. end)))
  1123. (when (xmltok-xml-declaration start standalone-name)
  1124. (xmltok-add-prolog-region 'xml-declaration-attribute-name
  1125. (xmltok-xml-declaration start standalone-name)
  1126. (xmltok-xml-declaration end standalone-name))
  1127. (let ((start (xmltok-xml-declaration start standalone-value))
  1128. (end (xmltok-xml-declaration end standalone-value)))
  1129. (xmltok-add-prolog-region 'xml-declaration-attribute-value
  1130. start
  1131. end)
  1132. (setq xmltok-standalone
  1133. (string= (buffer-substring-no-properties (1+ start) (1- end))
  1134. "yes"))))
  1135. t))
  1136. (defconst xmltok-markup-declaration-alist
  1137. '(("ELEMENT" . xmltok-parse-element-declaration)
  1138. ("ATTLIST" . xmltok-parse-attlist-declaration)
  1139. ("ENTITY" . xmltok-parse-entity-declaration)
  1140. ("NOTATION" . xmltok-parse-notation-declaration)))
  1141. (defun xmltok-parse-prolog-item ()
  1142. (cond ((eq xmltok-type 'comment)
  1143. (xmltok-add-prolog-region 'comment
  1144. xmltok-start
  1145. (point))
  1146. t)
  1147. ((eq xmltok-type 'processing-instruction))
  1148. ((eq xmltok-type 'named-markup-declaration)
  1149. (setq xmltok-markup-declaration-doctype-flag nil)
  1150. (xmltok-add-prolog-region 'markup-declaration-open
  1151. xmltok-start
  1152. (point))
  1153. (let* ((name (buffer-substring-no-properties
  1154. (+ xmltok-start 2)
  1155. (point)))
  1156. (fun (cdr (assoc name xmltok-markup-declaration-alist))))
  1157. (cond (fun
  1158. (unless xmltok-internal-subset-start
  1159. (xmltok-add-error
  1160. "Declaration allowed only in internal subset"))
  1161. (funcall fun))
  1162. ((string= name "DOCTYPE")
  1163. (xmltok-parse-doctype))
  1164. (t
  1165. (xmltok-add-error "Unknown markup declaration"
  1166. (+ xmltok-start 2))
  1167. (xmltok-next-prolog-token)
  1168. (xmltok-markup-declaration-parse-error))))
  1169. t)
  1170. ((or (eq xmltok-type 'end-prolog)
  1171. (not xmltok-type))
  1172. nil)
  1173. ((eq xmltok-type 'internal-subset-close)
  1174. (xmltok-add-prolog-region 'internal-subset-close
  1175. xmltok-start
  1176. (1+ xmltok-start))
  1177. (xmltok-add-prolog-region 'markup-declaration-close
  1178. (1- (point))
  1179. (point))
  1180. (if xmltok-internal-subset-start
  1181. (setq xmltok-internal-subset-start nil)
  1182. (xmltok-add-error "]> outside internal subset"))
  1183. t)
  1184. ((eq xmltok-type 'param-entity-ref)
  1185. (if xmltok-internal-subset-start
  1186. (setq xmltok-had-param-entity-ref t)
  1187. (xmltok-add-error "Parameter entity reference outside document type declaration"))
  1188. t)
  1189. ;; If we don't do this, we can get thousands of errors when
  1190. ;; a plain text file is parsed.
  1191. ((not xmltok-internal-subset-start)
  1192. (when (let ((err (car xmltok-errors)))
  1193. (or (not err)
  1194. (<= (xmltok-error-end err) xmltok-start)))
  1195. (goto-char xmltok-start))
  1196. nil)
  1197. ((eq xmltok-type 'not-well-formed) t)
  1198. (t
  1199. (xmltok-add-error "Token allowed only inside markup declaration")
  1200. t)))
  1201. (defun xmltok-parse-doctype ()
  1202. (setq xmltok-markup-declaration-doctype-flag t)
  1203. (xmltok-next-prolog-token)
  1204. (when xmltok-internal-subset-start
  1205. (xmltok-add-error "DOCTYPE declaration not allowed in internal subset")
  1206. (xmltok-markup-declaration-parse-error))
  1207. (when xmltok-contains-doctype
  1208. (xmltok-add-error "Duplicate DOCTYPE declaration")
  1209. (xmltok-markup-declaration-parse-error))
  1210. (setq xmltok-contains-doctype t)
  1211. (xmltok-require-token 'name 'prefixed-name)
  1212. (xmltok-require-next-token "SYSTEM" "PUBLIC" ?\[ ?>)
  1213. (cond ((eq xmltok-type ?\[)
  1214. (setq xmltok-internal-subset-start (point)))
  1215. ((eq xmltok-type ?>))
  1216. (t
  1217. (setq xmltok-doctype-external-subset-flag t)
  1218. (xmltok-parse-external-id)
  1219. (xmltok-require-token ?\[ ?>)
  1220. (when (eq xmltok-type ?\[)
  1221. (setq xmltok-internal-subset-start (point))))))
  1222. (defun xmltok-parse-attlist-declaration ()
  1223. (xmltok-require-next-token 'prefixed-name 'name)
  1224. (while (progn
  1225. (xmltok-require-next-token ?> 'name 'prefixed-name)
  1226. (if (eq xmltok-type ?>)
  1227. nil
  1228. (xmltok-require-next-token ?\(
  1229. "CDATA"
  1230. "ID"
  1231. "IDREF"
  1232. "IDREFS"
  1233. "ENTITY"
  1234. "ENTITIES"
  1235. "NMTOKEN"
  1236. "NMTOKENS"
  1237. "NOTATION")
  1238. (cond ((eq xmltok-type ?\()
  1239. (xmltok-parse-nmtoken-group))
  1240. ((string= (xmltok-current-token-string)
  1241. "NOTATION")
  1242. (xmltok-require-next-token ?\()
  1243. (xmltok-parse-nmtoken-group)))
  1244. (xmltok-require-next-token "#IMPLIED"
  1245. "#REQUIRED"
  1246. "#FIXED"
  1247. 'literal)
  1248. (when (string= (xmltok-current-token-string) "#FIXED")
  1249. (xmltok-require-next-token 'literal))
  1250. t))))
  1251. (defun xmltok-parse-nmtoken-group ()
  1252. (while (progn
  1253. (xmltok-require-next-token 'nmtoken 'prefixed-name 'name)
  1254. (xmltok-require-next-token ?| ?\))
  1255. (eq xmltok-type ?|))))
  1256. (defun xmltok-parse-element-declaration ()
  1257. (xmltok-require-next-token 'name 'prefixed-name)
  1258. (xmltok-require-next-token "EMPTY" "ANY" ?\()
  1259. (when (eq xmltok-type ?\()
  1260. (xmltok-require-next-token "#PCDATA"
  1261. 'name
  1262. 'prefixed-name
  1263. 'name-occur
  1264. ?\()
  1265. (cond ((eq xmltok-type 'hash-name)
  1266. (xmltok-require-next-token ?| ?\) 'close-paren-star)
  1267. (while (eq xmltok-type ?|)
  1268. (xmltok-require-next-token 'name 'prefixed-name)
  1269. (xmltok-require-next-token 'close-paren-star ?|)))
  1270. (t (xmltok-parse-model-group))))
  1271. (xmltok-require-next-token ?>))
  1272. (defun xmltok-parse-model-group ()
  1273. (xmltok-parse-model-group-member)
  1274. (xmltok-require-next-token ?|
  1275. ?,
  1276. ?\)
  1277. 'close-paren-star
  1278. 'close-paren-occur)
  1279. (when (memq xmltok-type '(?, ?|))
  1280. (let ((connector xmltok-type))
  1281. (while (progn
  1282. (xmltok-next-prolog-token)
  1283. (xmltok-parse-model-group-member)
  1284. (xmltok-require-next-token connector
  1285. ?\)
  1286. 'close-paren-star
  1287. 'close-paren-occur)
  1288. (eq xmltok-type connector))))))
  1289. (defun xmltok-parse-model-group-member ()
  1290. (xmltok-require-token 'name
  1291. 'prefixed-name
  1292. 'name-occur
  1293. ?\()
  1294. (when (eq xmltok-type ?\()
  1295. (xmltok-next-prolog-token)
  1296. (xmltok-parse-model-group)))
  1297. (defun xmltok-parse-entity-declaration ()
  1298. (let (paramp name)
  1299. (xmltok-require-next-token 'name ?%)
  1300. (when (eq xmltok-type ?%)
  1301. (setq paramp t)
  1302. (xmltok-require-next-token 'name))
  1303. (setq name (xmltok-current-token-string))
  1304. (xmltok-require-next-token 'literal "SYSTEM" "PUBLIC")
  1305. (cond ((eq xmltok-type 'literal)
  1306. (let ((replacement (xmltok-parse-entity-value)))
  1307. (unless paramp
  1308. (xmltok-define-entity name replacement)))
  1309. (xmltok-require-next-token ?>))
  1310. (t
  1311. (xmltok-parse-external-id)
  1312. (if paramp
  1313. (xmltok-require-token ?>)
  1314. (xmltok-require-token ?> "NDATA")
  1315. (if (eq xmltok-type ?>)
  1316. (xmltok-define-entity name 'external)
  1317. (xmltok-require-next-token 'name)
  1318. (xmltok-require-next-token ?>)
  1319. (xmltok-define-entity name 'unparsed)))))))
  1320. (defun xmltok-define-entity (name value)
  1321. (when (and (or (not xmltok-had-param-entity-ref)
  1322. xmltok-standalone)
  1323. (not (assoc name xmltok-dtd)))
  1324. (setq xmltok-dtd
  1325. (cons (cons name value) xmltok-dtd))))
  1326. (defun xmltok-parse-entity-value ()
  1327. (let ((lim (1- (point)))
  1328. (well-formed t)
  1329. value-parts
  1330. start)
  1331. (save-excursion
  1332. (goto-char (1+ xmltok-start))
  1333. (setq start (point))
  1334. (while (progn
  1335. (skip-chars-forward "^%&" lim)
  1336. (when (< (point) lim)
  1337. (goto-char (1+ (point)))
  1338. (cond ((eq (char-before) ?%)
  1339. (xmltok-add-error "Parameter entity references are not allowed in the internal subset"
  1340. (1- (point))
  1341. (point))
  1342. (setq well-formed nil))
  1343. (t
  1344. (let ((xmltok-start (1- (point)))
  1345. xmltok-type xmltok-replacement)
  1346. (xmltok-scan-after-amp (lambda (start end)))
  1347. (cond ((eq xmltok-type 'char-ref)
  1348. (setq value-parts
  1349. (cons (buffer-substring-no-properties
  1350. start
  1351. xmltok-start)
  1352. value-parts))
  1353. (setq value-parts
  1354. (cons xmltok-replacement
  1355. value-parts))
  1356. (setq start (point)))
  1357. ((eq xmltok-type 'not-well-formed)
  1358. (setq well-formed nil))))))
  1359. t))))
  1360. (if (not well-formed)
  1361. nil
  1362. (apply 'concat
  1363. (nreverse (cons (buffer-substring-no-properties start lim)
  1364. value-parts))))))
  1365. (defun xmltok-parse-notation-declaration ()
  1366. (xmltok-require-next-token 'name)
  1367. (xmltok-require-next-token "SYSTEM" "PUBLIC")
  1368. (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
  1369. (xmltok-require-next-token 'literal)
  1370. (cond (publicp
  1371. (xmltok-require-next-token 'literal ?>)
  1372. (unless (eq xmltok-type ?>)
  1373. (xmltok-require-next-token ?>)))
  1374. (t (xmltok-require-next-token ?>)))))
  1375. (defun xmltok-parse-external-id ()
  1376. (xmltok-require-token "SYSTEM" "PUBLIC")
  1377. (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
  1378. (xmltok-require-next-token 'literal)
  1379. (when publicp
  1380. (xmltok-require-next-token 'literal)))
  1381. (xmltok-next-prolog-token))
  1382. (defun xmltok-require-next-token (&rest types)
  1383. (xmltok-next-prolog-token)
  1384. (apply 'xmltok-require-token types))
  1385. (defun xmltok-require-token (&rest types)
  1386. ;; XXX Generate a more helpful error message
  1387. (while (and (not (let ((type (car types)))
  1388. (if (stringp (car types))
  1389. (string= (xmltok-current-token-string) type)
  1390. (eq type xmltok-type))))
  1391. (setq types (cdr types))))
  1392. (unless types
  1393. (when (and xmltok-type
  1394. (not (eq xmltok-type 'not-well-formed)))
  1395. (xmltok-add-error "Unexpected token"))
  1396. (xmltok-markup-declaration-parse-error))
  1397. (let ((region-type (xmltok-prolog-region-type (car types))))
  1398. (when region-type
  1399. (xmltok-add-prolog-region region-type
  1400. xmltok-start
  1401. (point)))))
  1402. (defun xmltok-current-token-string ()
  1403. (buffer-substring-no-properties xmltok-start (point)))
  1404. (put 'xmltok-markup-declaration-parse-error
  1405. 'error-conditions
  1406. '(error xmltok-markup-declaration-parse-error))
  1407. (put 'xmltok-markup-declaration-parse-error
  1408. 'error-message
  1409. "Syntax error in markup declaration")
  1410. (defun xmltok-markup-declaration-parse-error ()
  1411. (signal 'xmltok-markup-declaration-parse-error nil))
  1412. (defun xmltok-skip-markup-declaration ()
  1413. (while (cond ((eq xmltok-type ?>)
  1414. (xmltok-next-prolog-token)
  1415. nil)
  1416. ((and xmltok-markup-declaration-doctype-flag
  1417. (eq xmltok-type ?\[))
  1418. (setq xmltok-internal-subset-start (point))
  1419. (xmltok-next-prolog-token)
  1420. nil)
  1421. ((memq xmltok-type '(nil
  1422. end-prolog
  1423. named-markup-declaration
  1424. comment
  1425. processing-instruction))
  1426. nil)
  1427. ((and xmltok-internal-subset-start
  1428. (eq xmltok-type 'internal-subset-close))
  1429. nil)
  1430. (t (xmltok-next-prolog-token) t)))
  1431. xmltok-type)
  1432. (defun xmltok-prolog-region-type (required)
  1433. (cond ((cdr (assq xmltok-type
  1434. '((literal . literal)
  1435. (?> . markup-declaration-close)
  1436. (?\[ . internal-subset-open)
  1437. (hash-name . hash-name)))))
  1438. ((and (stringp required) (eq xmltok-type 'name))
  1439. 'keyword)))
  1440. ;; Return new token type.
  1441. (defun xmltok-next-prolog-token ()
  1442. (skip-chars-forward " \t\r\n")
  1443. (setq xmltok-start (point))
  1444. (cond ((not (and (looking-at (xmltok-prolog regexp))
  1445. (goto-char (match-end 0))))
  1446. (let ((ch (char-after)))
  1447. (cond (ch
  1448. (goto-char (1+ (point)))
  1449. (xmltok-add-error "Illegal char in prolog")
  1450. (setq xmltok-type 'not-well-formed))
  1451. (t (setq xmltok-type nil)))))
  1452. ((or (xmltok-prolog start ncname-occur)
  1453. (xmltok-prolog start colon-name-occur))
  1454. (setq xmltok-name-end (1- (point)))
  1455. (setq xmltok-name-colon (xmltok-prolog start colon))
  1456. (setq xmltok-type 'name-occur))
  1457. ((xmltok-prolog start colon)
  1458. (setq xmltok-name-end (point))
  1459. (setq xmltok-name-colon (xmltok-prolog start colon))
  1460. (unless (looking-at "[ \t\r\n>),|[%]")
  1461. (xmltok-add-error "Missing space after name"))
  1462. (setq xmltok-type 'prefixed-name))
  1463. ((or (xmltok-prolog start ncname-nmtoken)
  1464. (xmltok-prolog start ncname-colon))
  1465. (unless (looking-at "[ \t\r\n>),|[%]")
  1466. (xmltok-add-error "Missing space after name token"))
  1467. (setq xmltok-type 'nmtoken))
  1468. ((xmltok-prolog start name)
  1469. (setq xmltok-name-end (point))
  1470. (setq xmltok-name-colon nil)
  1471. (unless (looking-at "[ \t\r\n>),|[%]")
  1472. (xmltok-add-error "Missing space after name"))
  1473. (setq xmltok-type 'name))
  1474. ((xmltok-prolog start hash-name)
  1475. (setq xmltok-name-end (point))
  1476. (unless (looking-at "[ \t\r\n>)|%]")
  1477. (xmltok-add-error "Missing space after name"))
  1478. (setq xmltok-type 'hash-name))
  1479. ((xmltok-prolog start processing-instruction-question)
  1480. (xmltok-scan-prolog-after-processing-instruction-open))
  1481. ((xmltok-prolog start comment-open)
  1482. ;; XXX if not-well-formed, ignore some stuff
  1483. (xmltok-scan-after-comment-open))
  1484. ((xmltok-prolog start named-markup-declaration)
  1485. (setq xmltok-type 'named-markup-declaration))
  1486. ((xmltok-prolog start instance-start)
  1487. (goto-char xmltok-start)
  1488. (setq xmltok-type 'end-prolog))
  1489. ((xmltok-prolog start close-paren-star)
  1490. (setq xmltok-type 'close-paren-star))
  1491. ((xmltok-prolog start close-paren-occur)
  1492. (setq xmltok-type 'close-paren-occur))
  1493. ((xmltok-prolog start close-paren)
  1494. (unless (looking-at "[ \t\r\n>,|)]")
  1495. (xmltok-add-error "Missing space after )"))
  1496. (setq xmltok-type ?\)))
  1497. ((xmltok-prolog start single-char)
  1498. (let ((ch (char-before)))
  1499. (cond ((memq ch '(?\" ?\'))
  1500. (xmltok-scan-prolog-literal))
  1501. (t (setq xmltok-type ch)))))
  1502. ((xmltok-prolog start percent)
  1503. (cond ((xmltok-prolog start param-entity-ref-close)
  1504. (setq xmltok-name-end (1- (point)))
  1505. (setq xmltok-type 'param-entity-ref))
  1506. ((xmltok-prolog start param-entity-ref)
  1507. (xmltok-add-error "Missing ;")
  1508. (setq xmltok-name-end (point))
  1509. (setq xmltok-type 'param-entity-ref))
  1510. ((looking-at "[ \t\r\n%]")
  1511. (setq xmltok-type ?%))
  1512. (t
  1513. (xmltok-add-error "Expected name after %")
  1514. (setq xmltok-type 'not-well-formed))))
  1515. ((xmltok-prolog start nmtoken)
  1516. (unless (looking-at "[ \t\r\n>),|[%]")
  1517. (xmltok-add-error "Missing space after name token"))
  1518. (setq xmltok-type 'nmtoken))
  1519. ((xmltok-prolog start internal-subset-close)
  1520. (setq xmltok-type 'internal-subset-close))
  1521. ((xmltok-prolog start pound)
  1522. (xmltok-add-error "Expected name after #")
  1523. (setq xmltok-type 'not-well-formed))
  1524. ((xmltok-prolog start markup-declaration)
  1525. (xmltok-add-error "Expected name or -- after <!")
  1526. (setq xmltok-type 'not-well-formed))
  1527. ((xmltok-prolog start comment-first-dash)
  1528. (xmltok-add-error "Expected <!--")
  1529. (setq xmltok-type 'not-well-formed))
  1530. ((xmltok-prolog start less-than)
  1531. (xmltok-add-error "Incomplete markup")
  1532. (setq xmltok-type 'not-well-formed))
  1533. (t (error "Unhandled token in prolog %s"
  1534. (match-string-no-properties 0)))))
  1535. (defun xmltok-scan-prolog-literal ()
  1536. (let* ((delim (string (char-before)))
  1537. (safe-end (save-excursion
  1538. (skip-chars-forward (concat "^<>[]" delim))
  1539. (point)))
  1540. (end (save-excursion
  1541. (goto-char safe-end)
  1542. (search-forward delim nil t))))
  1543. (or (cond ((not end)
  1544. (xmltok-add-dependent 'xmltok-unclosed-reparse-p
  1545. nil
  1546. nil
  1547. delim)
  1548. nil)
  1549. ((save-excursion
  1550. (goto-char end)
  1551. (looking-at "[ \t\r\n>%[]"))
  1552. (goto-char end)
  1553. (setq xmltok-type 'literal))
  1554. ((eq (1+ safe-end) end)
  1555. (goto-char end)
  1556. (xmltok-add-error (format "Missing space after %s" delim)
  1557. safe-end)
  1558. (setq xmltok-type 'literal))
  1559. (t
  1560. (xmltok-add-dependent 'xmltok-semi-closed-reparse-p
  1561. xmltok-start
  1562. (1+ end)
  1563. delim
  1564. 1)
  1565. nil))
  1566. (progn
  1567. (xmltok-add-error (format "Missing closing %s" delim))
  1568. (goto-char safe-end)
  1569. (skip-chars-backward " \t\r\n")
  1570. (setq xmltok-type 'not-well-formed)))))
  1571. (defun xmltok-scan-prolog-after-processing-instruction-open ()
  1572. (cond ((not (search-forward "?>" nil t))
  1573. (xmltok-add-error "No closing ?>"
  1574. xmltok-start
  1575. (+ xmltok-start 2))
  1576. (xmltok-add-dependent 'xmltok-unclosed-reparse-p
  1577. nil
  1578. nil
  1579. "?>")
  1580. (setq xmltok-type 'not-well-formed))
  1581. (t
  1582. (let* ((end (point))
  1583. (target
  1584. (save-excursion
  1585. (goto-char (+ xmltok-start 2))
  1586. (and (looking-at (xmltok-ncname regexp))
  1587. (or (memq (char-after (match-end 0))
  1588. '(?\n ?\t ?\r ? ))
  1589. (= (match-end 0) (- end 2)))
  1590. (match-string-no-properties 0)))))
  1591. (cond ((not target)
  1592. (xmltok-add-error "\
  1593. Processing instruction does not start with a name"
  1594. (+ xmltok-start 2)
  1595. (+ xmltok-start 3)))
  1596. ((not (and (= (length target) 3)
  1597. (let ((case-fold-search t))
  1598. (string-match "xml" target)))))
  1599. ((= xmltok-start 1)
  1600. (xmltok-add-error "Invalid XML declaration"
  1601. xmltok-start
  1602. (point)))
  1603. ((save-excursion
  1604. (goto-char xmltok-start)
  1605. (looking-at (xmltok-xml-declaration regexp)))
  1606. (xmltok-add-error "XML declaration not at beginning of file"
  1607. xmltok-start
  1608. (point)))
  1609. (t
  1610. (xmltok-add-error "Processing instruction has target of xml"
  1611. (+ xmltok-start 2)
  1612. (+ xmltok-start 5))))
  1613. (xmltok-add-prolog-region 'processing-instruction-left
  1614. xmltok-start
  1615. (+ xmltok-start
  1616. 2
  1617. (if target
  1618. (length target)
  1619. 0)))
  1620. (xmltok-add-prolog-region 'processing-instruction-right
  1621. (if target
  1622. (save-excursion
  1623. (goto-char (+ xmltok-start
  1624. (length target)
  1625. 2))
  1626. (skip-chars-forward " \t\r\n")
  1627. (point))
  1628. (+ xmltok-start 2))
  1629. (point)))
  1630. (setq xmltok-type 'processing-instruction))))
  1631. (defun xmltok-parse-entities ()
  1632. (let ((todo xmltok-dtd))
  1633. (when (and (or xmltok-had-param-entity-ref
  1634. xmltok-doctype-external-subset-flag)
  1635. (not xmltok-standalone))
  1636. (setq xmltok-dtd (cons t xmltok-dtd)))
  1637. (while todo
  1638. (xmltok-parse-entity (car todo))
  1639. (setq todo (cdr todo)))))
  1640. (defun xmltok-parse-entity (name-def)
  1641. (let ((def (cdr name-def))
  1642. ;; in case its value is buffer local
  1643. (xmltok-dtd xmltok-dtd)
  1644. buf)
  1645. (when (stringp def)
  1646. (if (string-match "\\`[^&<\t\r\n]*\\'" def)
  1647. (setcdr name-def (cons def def))
  1648. (setcdr name-def 'not-well-formed) ; avoid infinite expansion loops
  1649. (setq buf (get-buffer-create
  1650. (format " *Entity %s*" (car name-def))))
  1651. (with-current-buffer buf
  1652. (erase-buffer)
  1653. (insert def)
  1654. (goto-char (point-min))
  1655. (setcdr name-def
  1656. (xmltok-parse-entity-replacement)))
  1657. (kill-buffer buf)))))
  1658. (defun xmltok-parse-entity-replacement ()
  1659. (let ((def (cons "" "")))
  1660. (while (let* ((start (point))
  1661. (found (re-search-forward "[<&\t\r\n]\\|]]>" nil t))
  1662. (ch (and found (char-before)))
  1663. (str (buffer-substring-no-properties
  1664. start
  1665. (if found
  1666. (match-beginning 0)
  1667. (point-max)))))
  1668. (setq def
  1669. (xmltok-append-entity-def def
  1670. (cons str str)))
  1671. (cond ((not found) nil)
  1672. ((eq ch ?>)
  1673. (setq def 'not-well-formed)
  1674. nil)
  1675. ((eq ch ?<)
  1676. (xmltok-save
  1677. (setq xmltok-start (1- (point)))
  1678. (xmltok-scan-after-lt)
  1679. (setq def
  1680. (xmltok-append-entity-def
  1681. def
  1682. (cond ((memq xmltok-type
  1683. '(start-tag
  1684. end-tag
  1685. empty-element))
  1686. 'element)
  1687. ((memq xmltok-type
  1688. '(comment
  1689. processing-instruction))
  1690. (cons "" nil))
  1691. ((eq xmltok-type
  1692. 'cdata-section)
  1693. (cons (buffer-substring-no-properties
  1694. (+ xmltok-start 9)
  1695. (- (point) 3))
  1696. nil))
  1697. (t 'not-well-formed)))))
  1698. t)
  1699. ((eq ch ?&)
  1700. (let ((xmltok-start (1- (point)))
  1701. xmltok-type
  1702. xmltok-replacement
  1703. xmltok-errors)
  1704. (xmltok-scan-after-amp 'xmltok-handle-nested-entity)
  1705. (cond ((eq xmltok-type 'entity-ref)
  1706. (setq def
  1707. (xmltok-append-entity-def
  1708. def
  1709. xmltok-replacement)))
  1710. ((eq xmltok-type 'char-ref)
  1711. (setq def
  1712. (xmltok-append-entity-def
  1713. def
  1714. (if xmltok-replacement
  1715. (cons xmltok-replacement
  1716. xmltok-replacement)
  1717. (and xmltok-errors 'not-well-formed)))))
  1718. (t
  1719. (setq def 'not-well-formed))))
  1720. t)
  1721. (t
  1722. (setq def
  1723. (xmltok-append-entity-def
  1724. def
  1725. (cons (match-string-no-properties 0)
  1726. " ")))
  1727. t))))
  1728. def))
  1729. (defun xmltok-handle-nested-entity (start end)
  1730. (let* ((name-def (assoc (buffer-substring-no-properties start end)
  1731. xmltok-dtd))
  1732. (def (cdr name-def)))
  1733. (when (stringp def)
  1734. (xmltok-parse-entity name-def)
  1735. (setq def (cdr name-def)))
  1736. (setq xmltok-replacement
  1737. (cond ((null name-def)
  1738. (if (eq (car xmltok-dtd) t)
  1739. nil
  1740. 'not-well-formed))
  1741. ((eq def 'unparsed) 'not-well-formed)
  1742. (t def)))))
  1743. (defun xmltok-append-entity-def (d1 d2)
  1744. (cond ((consp d1)
  1745. (if (consp d2)
  1746. (cons (concat (car d1) (car d2))
  1747. (and (cdr d1)
  1748. (cdr d2)
  1749. (concat (cdr d1) (cdr d2))))
  1750. d2))
  1751. ((consp d2) d1)
  1752. (t
  1753. (let ((defs '(not-well-formed external element)))
  1754. (while (not (or (eq (car defs) d1)
  1755. (eq (car defs) d2)))
  1756. (setq defs (cdr defs)))
  1757. (car defs)))))
  1758. (defun xmltok-add-prolog-region (type start end)
  1759. (setq xmltok-prolog-regions
  1760. (cons (vector type start end)
  1761. xmltok-prolog-regions)))
  1762. (defun xmltok-merge-attributes ()
  1763. "Return a list merging `xmltok-attributes' and `xmltok-namespace-attributes'.
  1764. The members of the merged list are in order of occurrence in the
  1765. document. The list may share list structure with `xmltok-attributes'
  1766. and `xmltok-namespace-attributes'."
  1767. (cond ((not xmltok-namespace-attributes)
  1768. xmltok-attributes)
  1769. ((not xmltok-attributes)
  1770. xmltok-namespace-attributes)
  1771. (t
  1772. (let ((atts1 xmltok-attributes)
  1773. (atts2 xmltok-namespace-attributes)
  1774. merged)
  1775. (while (and atts1 atts2)
  1776. (cond ((< (xmltok-attribute-name-start (car atts1))
  1777. (xmltok-attribute-name-start (car atts2)))
  1778. (setq merged (cons (car atts1) merged))
  1779. (setq atts1 (cdr atts1)))
  1780. (t
  1781. (setq merged (cons (car atts2) merged))
  1782. (setq atts2 (cdr atts2)))))
  1783. (setq merged (nreverse merged))
  1784. (cond (atts1 (setq merged (nconc merged atts1)))
  1785. (atts2 (setq merged (nconc merged atts2))))
  1786. merged))))
  1787. ;;; Testing
  1788. (defun xmltok-forward-test ()
  1789. (interactive)
  1790. (if (xmltok-forward)
  1791. (message "Scanned %s" xmltok-type)
  1792. (message "Scanned nothing")))
  1793. (defun xmltok-next-prolog-token-test ()
  1794. (interactive)
  1795. (if (xmltok-next-prolog-token)
  1796. (message "Scanned %s"
  1797. (if (integerp xmltok-type)
  1798. (string xmltok-type)
  1799. xmltok-type))
  1800. (message "Scanned end of file")))
  1801. (provide 'xmltok)
  1802. ;;; xmltok.el ends here