simple.scm 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. ;;;; (sxml simple) -- a simple interface to the SSAX parser
  2. ;;;;
  3. ;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
  4. ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
  5. ;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm.
  6. ;;;;
  7. ;;;; This library is free software; you can redistribute it and/or
  8. ;;;; modify it under the terms of the GNU Lesser General Public
  9. ;;;; License as published by the Free Software Foundation; either
  10. ;;;; version 3 of the License, or (at your option) any later version.
  11. ;;;;
  12. ;;;; This library is distributed in the hope that it will be useful,
  13. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;;;; Lesser General Public License for more details.
  16. ;;;;
  17. ;;;; You should have received a copy of the GNU Lesser General Public
  18. ;;;; License along with this library; if not, write to the Free Software
  19. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;;;;
  21. ;;; Commentary:
  22. ;;
  23. ;;A simple interface to XML parsing and serialization.
  24. ;;
  25. ;;; Code:
  26. (define-module (sxml simple)
  27. #:use-module (sxml ssax input-parse)
  28. #:use-module (sxml ssax)
  29. #:use-module (sxml transform)
  30. #:use-module (ice-9 match)
  31. #:export (xml->sxml sxml->xml sxml->string))
  32. ;; Helpers from upstream/SSAX.scm.
  33. ;;
  34. ; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
  35. ; given the list of fragments (some of which are text strings)
  36. ; reverse the list and concatenate adjacent text strings.
  37. ; We can prove from the general case below that if LIST-OF-FRAGS
  38. ; has zero or one element, the result of the procedure is equal?
  39. ; to its argument. This fact justifies the shortcut evaluation below.
  40. (define (ssax:reverse-collect-str fragments)
  41. (cond
  42. ((null? fragments) '()) ; a shortcut
  43. ((null? (cdr fragments)) fragments) ; see the comment above
  44. (else
  45. (let loop ((fragments fragments) (result '()) (strs '()))
  46. (cond
  47. ((null? fragments)
  48. (if (null? strs) result
  49. (cons (string-concatenate/shared strs) result)))
  50. ((string? (car fragments))
  51. (loop (cdr fragments) result (cons (car fragments) strs)))
  52. (else
  53. (loop (cdr fragments)
  54. (cons
  55. (car fragments)
  56. (if (null? strs) result
  57. (cons (string-concatenate/shared strs) result)))
  58. '())))))))
  59. (define (read-internal-doctype-as-string port)
  60. (string-concatenate/shared
  61. (let loop ()
  62. (let ((fragment
  63. (next-token '() '(#\]) "reading internal DOCTYPE" port)))
  64. (if (eqv? #\> (peek-next-char port))
  65. (begin
  66. (read-char port)
  67. (cons fragment '()))
  68. (cons* fragment "]" (loop)))))))
  69. ;; Ideas for the future for this interface:
  70. ;;
  71. ;; * Allow doctypes to provide parsed entities
  72. ;;
  73. ;; * Allow validation (the ELEMENTS value from the DOCTYPE handler
  74. ;; below)
  75. ;;
  76. ;; * Parse internal DTDs
  77. ;;
  78. ;; * Parse external DTDs
  79. ;;
  80. (define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key
  81. (namespaces '())
  82. (declare-namespaces? #t)
  83. (trim-whitespace? #f)
  84. (entities '())
  85. (default-entity-handler #f)
  86. (doctype-handler #f))
  87. "Use SSAX to parse an XML document into SXML. Takes one optional
  88. argument, @var{string-or-port}, which defaults to the current input
  89. port."
  90. ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix
  91. ;; that the user wants on elements of a given namespace in the
  92. ;; resulting SXML, regardless of the abbreviated namespaces defined in
  93. ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true,
  94. ;; these namespaces are treated as if they were declared in the DTD.
  95. ;; ENTITIES: alist of SYMBOL -> STRING.
  96. ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)).
  97. ;; A DOC-PREFIX of #f indicates that it comes from the user.
  98. ;; Otherwise, prefixes are symbols.
  99. (define (munge-namespaces namespaces)
  100. (map (lambda (el)
  101. (match el
  102. ((prefix . uri-string)
  103. (cons* (and declare-namespaces? prefix)
  104. prefix
  105. (ssax:uri-string->symbol uri-string)))))
  106. namespaces))
  107. (define (user-namespaces)
  108. (munge-namespaces namespaces))
  109. (define (user-entities)
  110. (if (and default-entity-handler
  111. (not (assq '*DEFAULT* entities)))
  112. (acons '*DEFAULT* default-entity-handler entities)
  113. entities))
  114. (define (name->sxml name)
  115. (match name
  116. ((prefix . local-part)
  117. (symbol-append prefix (string->symbol ":") local-part))
  118. (_ name)))
  119. (define (doctype-continuation seed)
  120. (lambda* (#:key (entities '()) (namespaces '()))
  121. (values #f
  122. (append entities (user-entities))
  123. (append (munge-namespaces namespaces) (user-namespaces))
  124. seed)))
  125. ;; The SEED in this parser is the SXML: initialized to '() at each new
  126. ;; level by the fdown handlers; built in reverse by the fhere parsers;
  127. ;; and reverse-collected by the fup handlers.
  128. (define parser
  129. (ssax:make-parser
  130. NEW-LEVEL-SEED ; fdown
  131. (lambda (elem-gi attributes namespaces expected-content seed)
  132. '())
  133. FINISH-ELEMENT ; fup
  134. (lambda (elem-gi attributes namespaces parent-seed seed)
  135. (let ((seed (if trim-whitespace?
  136. (ssax:reverse-collect-str-drop-ws seed)
  137. (ssax:reverse-collect-str seed)))
  138. (attrs (attlist-fold
  139. (lambda (attr accum)
  140. (cons (list (name->sxml (car attr)) (cdr attr))
  141. accum))
  142. '() attributes)))
  143. (acons (name->sxml elem-gi)
  144. (if (null? attrs)
  145. seed
  146. (cons (cons '@ attrs) seed))
  147. parent-seed)))
  148. CHAR-DATA-HANDLER ; fhere
  149. (lambda (string1 string2 seed)
  150. (if (string-null? string2)
  151. (cons string1 seed)
  152. (cons* string2 string1 seed)))
  153. DOCTYPE
  154. ;; -> ELEMS ENTITIES NAMESPACES SEED
  155. ;;
  156. ;; ELEMS is for validation and currently unused.
  157. ;;
  158. ;; ENTITIES is an alist of parsed entities (symbol -> string).
  159. ;;
  160. ;; NAMESPACES is as above.
  161. ;;
  162. ;; SEED builds up the content.
  163. (lambda (port docname systemid internal-subset? seed)
  164. (call-with-values
  165. (lambda ()
  166. (cond
  167. (doctype-handler
  168. (doctype-handler docname systemid
  169. (and internal-subset?
  170. (read-internal-doctype-as-string port))))
  171. (else
  172. (when internal-subset?
  173. (ssax:skip-internal-dtd port))
  174. (values))))
  175. (doctype-continuation seed)))
  176. UNDECL-ROOT
  177. ;; This is like the DOCTYPE handler, but for documents that do not
  178. ;; have a <!DOCTYPE!> entry.
  179. (lambda (elem-gi seed)
  180. (call-with-values
  181. (lambda ()
  182. (if doctype-handler
  183. (doctype-handler #f #f #f)
  184. (values)))
  185. (doctype-continuation seed)))
  186. PI
  187. ((*DEFAULT*
  188. . (lambda (port pi-tag seed)
  189. (cons
  190. (list '*PI* pi-tag (ssax:read-pi-body-as-string port))
  191. seed))))))
  192. (let* ((port (if (string? string-or-port)
  193. (open-input-string string-or-port)
  194. string-or-port))
  195. (elements (reverse (parser port '()))))
  196. `(*TOP* ,@elements)))
  197. (define check-name
  198. (let ((*good-cache* (make-hash-table)))
  199. (lambda (name)
  200. (if (not (hashq-ref *good-cache* name))
  201. (let* ((str (symbol->string name))
  202. (i (string-index str #\:))
  203. (head (or (and i (substring str 0 i)) str))
  204. (tail (and i (substring str (1+ i)))))
  205. (and i (string-index (substring str (1+ i)) #\:)
  206. (error "Invalid QName: more than one colon" name))
  207. (for-each
  208. (lambda (s)
  209. (and s
  210. (or (char-alphabetic? (string-ref s 0))
  211. (eq? (string-ref s 0) #\_)
  212. (error "Invalid name starting character" s name))
  213. (string-for-each
  214. (lambda (c)
  215. (or (char-alphabetic? c) (string-index "0123456789.-_" c)
  216. (error "Invalid name character" c s name)))
  217. s)))
  218. (list head tail))
  219. (hashq-set! *good-cache* name #t))))))
  220. ;; The following two functions serialize tags and attributes. They are
  221. ;; being used in the node handlers for the post-order function, see
  222. ;; below.
  223. (define (attribute-value->xml value port)
  224. (cond
  225. ((pair? value)
  226. (attribute-value->xml (car value) port)
  227. (attribute-value->xml (cdr value) port))
  228. ((null? value)
  229. *unspecified*)
  230. ((string? value)
  231. (string->escaped-xml value port))
  232. ((procedure? value)
  233. (with-output-to-port port value))
  234. (else
  235. (string->escaped-xml
  236. (call-with-output-string (lambda (port) (display value port)))
  237. port))))
  238. (define (attribute->xml attr value port)
  239. (check-name attr)
  240. (display attr port)
  241. (display "=\"" port)
  242. (attribute-value->xml value port)
  243. (display #\" port))
  244. (define (element->xml tag attrs body port)
  245. (check-name tag)
  246. (display #\< port)
  247. (display tag port)
  248. (if attrs
  249. (let lp ((attrs attrs))
  250. (if (pair? attrs)
  251. (let ((attr (car attrs)))
  252. (display #\space port)
  253. (if (pair? attr)
  254. (attribute->xml (car attr) (cdr attr) port)
  255. (error "bad attribute" tag attr))
  256. (lp (cdr attrs)))
  257. (if (not (null? attrs))
  258. (error "bad attributes" tag attrs)))))
  259. (if (pair? body)
  260. (begin
  261. (display #\> port)
  262. (let lp ((body body))
  263. (cond
  264. ((pair? body)
  265. (sxml->xml (car body) port)
  266. (lp (cdr body)))
  267. ((null? body)
  268. (display "</" port)
  269. (display tag port)
  270. (display ">" port))
  271. (else
  272. (error "bad element body" tag body)))))
  273. (display " />" port)))
  274. ;; FIXME: ensure name is valid
  275. (define (entity->xml name port)
  276. (display #\& port)
  277. (display name port)
  278. (display #\; port))
  279. ;; FIXME: ensure tag and str are valid
  280. (define (pi->xml tag str port)
  281. (display "<?" port)
  282. (display tag port)
  283. (display #\space port)
  284. (display str port)
  285. (display "?>" port))
  286. (define* (sxml->xml tree #:optional (port (current-output-port)))
  287. "Serialize the sxml tree @var{tree} as XML. The output will be written
  288. to the current output port, unless the optional argument @var{port} is
  289. present."
  290. (cond
  291. ((pair? tree)
  292. (if (symbol? (car tree))
  293. ;; An element.
  294. (let ((tag (car tree)))
  295. (case tag
  296. ((*TOP*)
  297. (sxml->xml (cdr tree) port))
  298. ((*ENTITY*)
  299. (if (and (list? (cdr tree)) (= (length (cdr tree)) 1))
  300. (entity->xml (cadr tree) port)
  301. (error "bad *ENTITY* args" (cdr tree))))
  302. ((*PI*)
  303. (if (and (list? (cdr tree)) (= (length (cdr tree)) 2))
  304. (pi->xml (cadr tree) (caddr tree) port)
  305. (error "bad *PI* args" (cdr tree))))
  306. (else
  307. (let* ((elems (cdr tree))
  308. (attrs (and (pair? elems) (pair? (car elems))
  309. (eq? '@ (caar elems))
  310. (cdar elems))))
  311. (element->xml tag attrs (if attrs (cdr elems) elems) port)))))
  312. ;; A nodelist.
  313. (for-each (lambda (x) (sxml->xml x port)) tree)))
  314. ((string? tree)
  315. (string->escaped-xml tree port))
  316. ((null? tree) *unspecified*)
  317. ((not tree) *unspecified*)
  318. ((eqv? tree #t) *unspecified*)
  319. ((procedure? tree)
  320. (with-output-to-port port tree))
  321. (else
  322. (string->escaped-xml
  323. (call-with-output-string (lambda (port) (display tree port)))
  324. port))))
  325. (define (sxml->string sxml)
  326. "Detag an sxml tree @var{sxml} into a string. Does not perform any
  327. formatting."
  328. (string-concatenate-reverse
  329. (foldts
  330. (lambda (seed tree) ; fdown
  331. '())
  332. (lambda (seed kid-seed tree) ; fup
  333. (append! kid-seed seed))
  334. (lambda (seed tree) ; fhere
  335. (if (string? tree) (cons tree seed) seed))
  336. '()
  337. sxml)))
  338. (define (make-char-quotator char-encoding)
  339. (let ((bad-chars (list->char-set (map car char-encoding))))
  340. ;; Check to see if str contains one of the characters in charset,
  341. ;; from the position i onward. If so, return that character's index.
  342. ;; otherwise, return #f
  343. (define (index-cset str i charset)
  344. (string-index str charset i))
  345. ;; The body of the function
  346. (lambda (str port)
  347. (let ((bad-pos (index-cset str 0 bad-chars)))
  348. (if (not bad-pos)
  349. (display str port) ; str had all good chars
  350. (let loop ((from 0) (to bad-pos))
  351. (cond
  352. ((>= from (string-length str)) *unspecified*)
  353. ((not to)
  354. (display (substring str from (string-length str)) port))
  355. (else
  356. (let ((quoted-char
  357. (cdr (assv (string-ref str to) char-encoding)))
  358. (new-to
  359. (index-cset str (+ 1 to) bad-chars)))
  360. (if (< from to)
  361. (display (substring str from to) port))
  362. (display quoted-char port)
  363. (loop (1+ to) new-to))))))))))
  364. ;; Given a string, check to make sure it does not contain characters
  365. ;; such as '<' or '&' that require encoding. Return either the original
  366. ;; string, or a list of string fragments with special characters
  367. ;; replaced by appropriate character entities.
  368. (define string->escaped-xml
  369. (make-char-quotator
  370. '((#\< . "&lt;") (#\> . "&gt;") (#\& . "&amp;") (#\" . "&quot;"))))
  371. ;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac
  372. ;;; simple.scm ends here