simple.scm 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. ;;;; (sxml simple) -- a simple interface to the SSAX parser
  2. ;;;;
  3. ;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
  4. ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
  5. ;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm.
  6. ;;;;
  7. ;;;; This library is free software; you can redistribute it and/or
  8. ;;;; modify it under the terms of the GNU Lesser General Public
  9. ;;;; License as published by the Free Software Foundation; either
  10. ;;;; version 3 of the License, or (at your option) any later version.
  11. ;;;;
  12. ;;;; This library is distributed in the hope that it will be useful,
  13. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;;;; Lesser General Public License for more details.
  16. ;;;;
  17. ;;;; You should have received a copy of the GNU Lesser General Public
  18. ;;;; License along with this library; if not, write to the Free Software
  19. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;;;;
  21. ;;; Commentary:
  22. ;;
  23. ;;A simple interface to XML parsing and serialization.
  24. ;;
  25. ;;; Code:
  26. (define-module (sxml simple)
  27. #:use-module (sxml ssax input-parse)
  28. #:use-module (sxml ssax)
  29. #:use-module (sxml transform)
  30. #:use-module (ice-9 match)
  31. #:use-module (srfi srfi-13)
  32. #:export (xml->sxml sxml->xml sxml->string))
  33. ;; Helpers from upstream/SSAX.scm.
  34. ;;
  35. ; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
  36. ; given the list of fragments (some of which are text strings)
  37. ; reverse the list and concatenate adjacent text strings.
  38. ; We can prove from the general case below that if LIST-OF-FRAGS
  39. ; has zero or one element, the result of the procedure is equal?
  40. ; to its argument. This fact justifies the shortcut evaluation below.
  41. (define (ssax:reverse-collect-str fragments)
  42. (cond
  43. ((null? fragments) '()) ; a shortcut
  44. ((null? (cdr fragments)) fragments) ; see the comment above
  45. (else
  46. (let loop ((fragments fragments) (result '()) (strs '()))
  47. (cond
  48. ((null? fragments)
  49. (if (null? strs) result
  50. (cons (string-concatenate/shared strs) result)))
  51. ((string? (car fragments))
  52. (loop (cdr fragments) result (cons (car fragments) strs)))
  53. (else
  54. (loop (cdr fragments)
  55. (cons
  56. (car fragments)
  57. (if (null? strs) result
  58. (cons (string-concatenate/shared strs) result)))
  59. '())))))))
  60. (define (read-internal-doctype-as-string port)
  61. (string-concatenate/shared
  62. (let loop ()
  63. (let ((fragment
  64. (next-token '() '(#\]) "reading internal DOCTYPE" port)))
  65. (if (eqv? #\> (peek-next-char port))
  66. (begin
  67. (read-char port)
  68. (cons fragment '()))
  69. (cons* fragment "]" (loop)))))))
  70. ;; Ideas for the future for this interface:
  71. ;;
  72. ;; * Allow doctypes to provide parsed entities
  73. ;;
  74. ;; * Allow validation (the ELEMENTS value from the DOCTYPE handler
  75. ;; below)
  76. ;;
  77. ;; * Parse internal DTDs
  78. ;;
  79. ;; * Parse external DTDs
  80. ;;
  81. (define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key
  82. (namespaces '())
  83. (declare-namespaces? #t)
  84. (trim-whitespace? #f)
  85. (entities '())
  86. (default-entity-handler #f)
  87. (doctype-handler #f))
  88. "Use SSAX to parse an XML document into SXML. Takes one optional
  89. argument, @var{string-or-port}, which defaults to the current input
  90. port."
  91. ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix
  92. ;; that the user wants on elements of a given namespace in the
  93. ;; resulting SXML, regardless of the abbreviated namespaces defined in
  94. ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true,
  95. ;; these namespaces are treated as if they were declared in the DTD.
  96. ;; ENTITIES: alist of SYMBOL -> STRING.
  97. ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)).
  98. ;; A DOC-PREFIX of #f indicates that it comes from the user.
  99. ;; Otherwise, prefixes are symbols.
  100. (define (munge-namespaces namespaces)
  101. (map (lambda (el)
  102. (match el
  103. ((prefix . uri-string)
  104. (cons* (and declare-namespaces? prefix)
  105. prefix
  106. (ssax:uri-string->symbol uri-string)))))
  107. namespaces))
  108. (define (user-namespaces)
  109. (munge-namespaces namespaces))
  110. (define (user-entities)
  111. (if (and default-entity-handler
  112. (not (assq '*DEFAULT* entities)))
  113. (acons '*DEFAULT* default-entity-handler entities)
  114. entities))
  115. (define (name->sxml name)
  116. (match name
  117. ((prefix . local-part)
  118. (symbol-append prefix (string->symbol ":") local-part))
  119. (_ name)))
  120. (define (doctype-continuation seed)
  121. (lambda* (#:key (entities '()) (namespaces '()))
  122. (values #f
  123. (append entities (user-entities))
  124. (append (munge-namespaces namespaces) (user-namespaces))
  125. seed)))
  126. ;; The SEED in this parser is the SXML: initialized to '() at each new
  127. ;; level by the fdown handlers; built in reverse by the fhere parsers;
  128. ;; and reverse-collected by the fup handlers.
  129. (define parser
  130. (ssax:make-parser
  131. NEW-LEVEL-SEED ; fdown
  132. (lambda (elem-gi attributes namespaces expected-content seed)
  133. '())
  134. FINISH-ELEMENT ; fup
  135. (lambda (elem-gi attributes namespaces parent-seed seed)
  136. (let ((seed (if trim-whitespace?
  137. (ssax:reverse-collect-str-drop-ws seed)
  138. (ssax:reverse-collect-str seed)))
  139. (attrs (attlist-fold
  140. (lambda (attr accum)
  141. (cons (list (name->sxml (car attr)) (cdr attr))
  142. accum))
  143. '() attributes)))
  144. (acons (name->sxml elem-gi)
  145. (if (null? attrs)
  146. seed
  147. (cons (cons '@ attrs) seed))
  148. parent-seed)))
  149. CHAR-DATA-HANDLER ; fhere
  150. (lambda (string1 string2 seed)
  151. (if (string-null? string2)
  152. (cons string1 seed)
  153. (cons* string2 string1 seed)))
  154. DOCTYPE
  155. ;; -> ELEMS ENTITIES NAMESPACES SEED
  156. ;;
  157. ;; ELEMS is for validation and currently unused.
  158. ;;
  159. ;; ENTITIES is an alist of parsed entities (symbol -> string).
  160. ;;
  161. ;; NAMESPACES is as above.
  162. ;;
  163. ;; SEED builds up the content.
  164. (lambda (port docname systemid internal-subset? seed)
  165. (call-with-values
  166. (lambda ()
  167. (cond
  168. (doctype-handler
  169. (doctype-handler docname systemid
  170. (and internal-subset?
  171. (read-internal-doctype-as-string port))))
  172. (else
  173. (when internal-subset?
  174. (ssax:skip-internal-dtd port))
  175. (values))))
  176. (doctype-continuation seed)))
  177. UNDECL-ROOT
  178. ;; This is like the DOCTYPE handler, but for documents that do not
  179. ;; have a <!DOCTYPE!> entry.
  180. (lambda (elem-gi seed)
  181. (call-with-values
  182. (lambda ()
  183. (if doctype-handler
  184. (doctype-handler #f #f #f)
  185. (values)))
  186. (doctype-continuation seed)))
  187. PI
  188. ((*DEFAULT*
  189. . (lambda (port pi-tag seed)
  190. (cons
  191. (list '*PI* pi-tag (ssax:read-pi-body-as-string port))
  192. seed))))))
  193. (let* ((port (if (string? string-or-port)
  194. (open-input-string string-or-port)
  195. string-or-port))
  196. (elements (reverse (parser port '()))))
  197. `(*TOP* ,@elements)))
  198. (define check-name
  199. (let ((*good-cache* (make-hash-table)))
  200. (lambda (name)
  201. (if (not (hashq-ref *good-cache* name))
  202. (let* ((str (symbol->string name))
  203. (i (string-index str #\:))
  204. (head (or (and i (substring str 0 i)) str))
  205. (tail (and i (substring str (1+ i)))))
  206. (and i (string-index (substring str (1+ i)) #\:)
  207. (error "Invalid QName: more than one colon" name))
  208. (for-each
  209. (lambda (s)
  210. (and s
  211. (or (char-alphabetic? (string-ref s 0))
  212. (eq? (string-ref s 0) #\_)
  213. (error "Invalid name starting character" s name))
  214. (string-for-each
  215. (lambda (c)
  216. (or (char-alphabetic? c) (string-index "0123456789.-_" c)
  217. (error "Invalid name character" c s name)))
  218. s)))
  219. (list head tail))
  220. (hashq-set! *good-cache* name #t))))))
  221. ;; The following two functions serialize tags and attributes. They are
  222. ;; being used in the node handlers for the post-order function, see
  223. ;; below.
  224. (define (attribute-value->xml value port)
  225. (cond
  226. ((pair? value)
  227. (attribute-value->xml (car value) port)
  228. (attribute-value->xml (cdr value) port))
  229. ((null? value)
  230. *unspecified*)
  231. ((string? value)
  232. (string->escaped-xml value port))
  233. ((procedure? value)
  234. (with-output-to-port port value))
  235. (else
  236. (string->escaped-xml
  237. (call-with-output-string (lambda (port) (display value port)))
  238. port))))
  239. (define (attribute->xml attr value port)
  240. (check-name attr)
  241. (display attr port)
  242. (display "=\"" port)
  243. (attribute-value->xml value port)
  244. (display #\" port))
  245. (define (element->xml tag attrs body port)
  246. (check-name tag)
  247. (display #\< port)
  248. (display tag port)
  249. (if attrs
  250. (let lp ((attrs attrs))
  251. (if (pair? attrs)
  252. (let ((attr (car attrs)))
  253. (display #\space port)
  254. (if (pair? attr)
  255. (attribute->xml (car attr) (cdr attr) port)
  256. (error "bad attribute" tag attr))
  257. (lp (cdr attrs)))
  258. (if (not (null? attrs))
  259. (error "bad attributes" tag attrs)))))
  260. (if (pair? body)
  261. (begin
  262. (display #\> port)
  263. (let lp ((body body))
  264. (cond
  265. ((pair? body)
  266. (sxml->xml (car body) port)
  267. (lp (cdr body)))
  268. ((null? body)
  269. (display "</" port)
  270. (display tag port)
  271. (display ">" port))
  272. (else
  273. (error "bad element body" tag body)))))
  274. (display " />" port)))
  275. ;; FIXME: ensure name is valid
  276. (define (entity->xml name port)
  277. (display #\& port)
  278. (display name port)
  279. (display #\; port))
  280. ;; FIXME: ensure tag and str are valid
  281. (define (pi->xml tag str port)
  282. (display "<?" port)
  283. (display tag port)
  284. (display #\space port)
  285. (display str port)
  286. (display "?>" port))
  287. (define* (sxml->xml tree #:optional (port (current-output-port)))
  288. "Serialize the sxml tree @var{tree} as XML. The output will be written
  289. to the current output port, unless the optional argument @var{port} is
  290. present."
  291. (cond
  292. ((pair? tree)
  293. (if (symbol? (car tree))
  294. ;; An element.
  295. (let ((tag (car tree)))
  296. (case tag
  297. ((*TOP*)
  298. (sxml->xml (cdr tree) port))
  299. ((*ENTITY*)
  300. (if (and (list? (cdr tree)) (= (length (cdr tree)) 1))
  301. (entity->xml (cadr tree) port)
  302. (error "bad *ENTITY* args" (cdr tree))))
  303. ((*PI*)
  304. (if (and (list? (cdr tree)) (= (length (cdr tree)) 2))
  305. (pi->xml (cadr tree) (caddr tree) port)
  306. (error "bad *PI* args" (cdr tree))))
  307. (else
  308. (let* ((elems (cdr tree))
  309. (attrs (and (pair? elems) (pair? (car elems))
  310. (eq? '@ (caar elems))
  311. (cdar elems))))
  312. (element->xml tag attrs (if attrs (cdr elems) elems) port)))))
  313. ;; A nodelist.
  314. (for-each (lambda (x) (sxml->xml x port)) tree)))
  315. ((string? tree)
  316. (string->escaped-xml tree port))
  317. ((null? tree) *unspecified*)
  318. ((not tree) *unspecified*)
  319. ((eqv? tree #t) *unspecified*)
  320. ((procedure? tree)
  321. (with-output-to-port port tree))
  322. (else
  323. (string->escaped-xml
  324. (call-with-output-string (lambda (port) (display tree port)))
  325. port))))
  326. (define (sxml->string sxml)
  327. "Detag an sxml tree @var{sxml} into a string. Does not perform any
  328. formatting."
  329. (string-concatenate-reverse
  330. (foldts
  331. (lambda (seed tree) ; fdown
  332. '())
  333. (lambda (seed kid-seed tree) ; fup
  334. (append! kid-seed seed))
  335. (lambda (seed tree) ; fhere
  336. (if (string? tree) (cons tree seed) seed))
  337. '()
  338. sxml)))
  339. (define (make-char-quotator char-encoding)
  340. (let ((bad-chars (list->char-set (map car char-encoding))))
  341. ;; Check to see if str contains one of the characters in charset,
  342. ;; from the position i onward. If so, return that character's index.
  343. ;; otherwise, return #f
  344. (define (index-cset str i charset)
  345. (string-index str charset i))
  346. ;; The body of the function
  347. (lambda (str port)
  348. (let ((bad-pos (index-cset str 0 bad-chars)))
  349. (if (not bad-pos)
  350. (display str port) ; str had all good chars
  351. (let loop ((from 0) (to bad-pos))
  352. (cond
  353. ((>= from (string-length str)) *unspecified*)
  354. ((not to)
  355. (display (substring str from (string-length str)) port))
  356. (else
  357. (let ((quoted-char
  358. (cdr (assv (string-ref str to) char-encoding)))
  359. (new-to
  360. (index-cset str (+ 1 to) bad-chars)))
  361. (if (< from to)
  362. (display (substring str from to) port))
  363. (display quoted-char port)
  364. (loop (1+ to) new-to))))))))))
  365. ;; Given a string, check to make sure it does not contain characters
  366. ;; such as '<' or '&' that require encoding. Return either the original
  367. ;; string, or a list of string fragments with special characters
  368. ;; replaced by appropriate character entities.
  369. (define string->escaped-xml
  370. (make-char-quotator
  371. '((#\< . "&lt;") (#\> . "&gt;") (#\& . "&amp;") (#\" . "&quot;"))))
  372. ;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac
  373. ;;; simple.scm ends here