123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408 |
- ;;;; (sxml simple) -- a simple interface to the SSAX parser
- ;;;;
- ;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
- ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
- ;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm.
- ;;;;
- ;;;; This library is free software; you can redistribute it and/or
- ;;;; modify it under the terms of the GNU Lesser General Public
- ;;;; License as published by the Free Software Foundation; either
- ;;;; version 3 of the License, or (at your option) any later version.
- ;;;;
- ;;;; This library is distributed in the hope that it will be useful,
- ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;;;; Lesser General Public License for more details.
- ;;;;
- ;;;; You should have received a copy of the GNU Lesser General Public
- ;;;; License along with this library; if not, write to the Free Software
- ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;;;;
- ;;; Commentary:
- ;;
- ;;A simple interface to XML parsing and serialization.
- ;;
- ;;; Code:
- (define-module (sxml simple)
- #:use-module (sxml ssax input-parse)
- #:use-module (sxml ssax)
- #:use-module (sxml transform)
- #:use-module (ice-9 match)
- #:export (xml->sxml sxml->xml sxml->string))
- ;; Helpers from upstream/SSAX.scm.
- ;;
- ; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
- ; given the list of fragments (some of which are text strings)
- ; reverse the list and concatenate adjacent text strings.
- ; We can prove from the general case below that if LIST-OF-FRAGS
- ; has zero or one element, the result of the procedure is equal?
- ; to its argument. This fact justifies the shortcut evaluation below.
- (define (ssax:reverse-collect-str fragments)
- (cond
- ((null? fragments) '()) ; a shortcut
- ((null? (cdr fragments)) fragments) ; see the comment above
- (else
- (let loop ((fragments fragments) (result '()) (strs '()))
- (cond
- ((null? fragments)
- (if (null? strs) result
- (cons (string-concatenate/shared strs) result)))
- ((string? (car fragments))
- (loop (cdr fragments) result (cons (car fragments) strs)))
- (else
- (loop (cdr fragments)
- (cons
- (car fragments)
- (if (null? strs) result
- (cons (string-concatenate/shared strs) result)))
- '())))))))
- (define (read-internal-doctype-as-string port)
- (string-concatenate/shared
- (let loop ()
- (let ((fragment
- (next-token '() '(#\]) "reading internal DOCTYPE" port)))
- (if (eqv? #\> (peek-next-char port))
- (begin
- (read-char port)
- (cons fragment '()))
- (cons* fragment "]" (loop)))))))
- ;; Ideas for the future for this interface:
- ;;
- ;; * Allow doctypes to provide parsed entities
- ;;
- ;; * Allow validation (the ELEMENTS value from the DOCTYPE handler
- ;; below)
- ;;
- ;; * Parse internal DTDs
- ;;
- ;; * Parse external DTDs
- ;;
- (define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key
- (namespaces '())
- (declare-namespaces? #t)
- (trim-whitespace? #f)
- (entities '())
- (default-entity-handler #f)
- (doctype-handler #f))
- "Use SSAX to parse an XML document into SXML. Takes one optional
- argument, @var{string-or-port}, which defaults to the current input
- port."
- ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix
- ;; that the user wants on elements of a given namespace in the
- ;; resulting SXML, regardless of the abbreviated namespaces defined in
- ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true,
- ;; these namespaces are treated as if they were declared in the DTD.
- ;; ENTITIES: alist of SYMBOL -> STRING.
- ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)).
- ;; A DOC-PREFIX of #f indicates that it comes from the user.
- ;; Otherwise, prefixes are symbols.
- (define (munge-namespaces namespaces)
- (map (lambda (el)
- (match el
- ((prefix . uri-string)
- (cons* (and declare-namespaces? prefix)
- prefix
- (ssax:uri-string->symbol uri-string)))))
- namespaces))
- (define (user-namespaces)
- (munge-namespaces namespaces))
- (define (user-entities)
- (if (and default-entity-handler
- (not (assq '*DEFAULT* entities)))
- (acons '*DEFAULT* default-entity-handler entities)
- entities))
- (define (name->sxml name)
- (match name
- ((prefix . local-part)
- (symbol-append prefix (string->symbol ":") local-part))
- (_ name)))
- (define (doctype-continuation seed)
- (lambda* (#:key (entities '()) (namespaces '()))
- (values #f
- (append entities (user-entities))
- (append (munge-namespaces namespaces) (user-namespaces))
- seed)))
- ;; The SEED in this parser is the SXML: initialized to '() at each new
- ;; level by the fdown handlers; built in reverse by the fhere parsers;
- ;; and reverse-collected by the fup handlers.
- (define parser
- (ssax:make-parser
- NEW-LEVEL-SEED ; fdown
- (lambda (elem-gi attributes namespaces expected-content seed)
- '())
-
- FINISH-ELEMENT ; fup
- (lambda (elem-gi attributes namespaces parent-seed seed)
- (let ((seed (if trim-whitespace?
- (ssax:reverse-collect-str-drop-ws seed)
- (ssax:reverse-collect-str seed)))
- (attrs (attlist-fold
- (lambda (attr accum)
- (cons (list (name->sxml (car attr)) (cdr attr))
- accum))
- '() attributes)))
- (acons (name->sxml elem-gi)
- (if (null? attrs)
- seed
- (cons (cons '@ attrs) seed))
- parent-seed)))
- CHAR-DATA-HANDLER ; fhere
- (lambda (string1 string2 seed)
- (if (string-null? string2)
- (cons string1 seed)
- (cons* string2 string1 seed)))
- DOCTYPE
- ;; -> ELEMS ENTITIES NAMESPACES SEED
- ;;
- ;; ELEMS is for validation and currently unused.
- ;;
- ;; ENTITIES is an alist of parsed entities (symbol -> string).
- ;;
- ;; NAMESPACES is as above.
- ;;
- ;; SEED builds up the content.
- (lambda (port docname systemid internal-subset? seed)
- (call-with-values
- (lambda ()
- (cond
- (doctype-handler
- (doctype-handler docname systemid
- (and internal-subset?
- (read-internal-doctype-as-string port))))
- (else
- (when internal-subset?
- (ssax:skip-internal-dtd port))
- (values))))
- (doctype-continuation seed)))
- UNDECL-ROOT
- ;; This is like the DOCTYPE handler, but for documents that do not
- ;; have a <!DOCTYPE!> entry.
- (lambda (elem-gi seed)
- (call-with-values
- (lambda ()
- (if doctype-handler
- (doctype-handler #f #f #f)
- (values)))
- (doctype-continuation seed)))
- PI
- ((*DEFAULT*
- . (lambda (port pi-tag seed)
- (cons
- (list '*PI* pi-tag (ssax:read-pi-body-as-string port))
- seed))))))
- (let* ((port (if (string? string-or-port)
- (open-input-string string-or-port)
- string-or-port))
- (elements (reverse (parser port '()))))
- `(*TOP* ,@elements)))
- (define check-name
- (let ((*good-cache* (make-hash-table)))
- (lambda (name)
- (if (not (hashq-ref *good-cache* name))
- (let* ((str (symbol->string name))
- (i (string-index str #\:))
- (head (or (and i (substring str 0 i)) str))
- (tail (and i (substring str (1+ i)))))
- (and i (string-index (substring str (1+ i)) #\:)
- (error "Invalid QName: more than one colon" name))
- (for-each
- (lambda (s)
- (and s
- (or (char-alphabetic? (string-ref s 0))
- (eq? (string-ref s 0) #\_)
- (error "Invalid name starting character" s name))
- (string-for-each
- (lambda (c)
- (or (char-alphabetic? c) (string-index "0123456789.-_" c)
- (error "Invalid name character" c s name)))
- s)))
- (list head tail))
- (hashq-set! *good-cache* name #t))))))
- ;; The following two functions serialize tags and attributes. They are
- ;; being used in the node handlers for the post-order function, see
- ;; below.
- (define (attribute-value->xml value port)
- (cond
- ((pair? value)
- (attribute-value->xml (car value) port)
- (attribute-value->xml (cdr value) port))
- ((null? value)
- *unspecified*)
- ((string? value)
- (string->escaped-xml value port))
- ((procedure? value)
- (with-output-to-port port value))
- (else
- (string->escaped-xml
- (call-with-output-string (lambda (port) (display value port)))
- port))))
- (define (attribute->xml attr value port)
- (check-name attr)
- (display attr port)
- (display "=\"" port)
- (attribute-value->xml value port)
- (display #\" port))
- (define (element->xml tag attrs body port)
- (check-name tag)
- (display #\< port)
- (display tag port)
- (if attrs
- (let lp ((attrs attrs))
- (if (pair? attrs)
- (let ((attr (car attrs)))
- (display #\space port)
- (if (pair? attr)
- (attribute->xml (car attr) (cdr attr) port)
- (error "bad attribute" tag attr))
- (lp (cdr attrs)))
- (if (not (null? attrs))
- (error "bad attributes" tag attrs)))))
- (if (pair? body)
- (begin
- (display #\> port)
- (let lp ((body body))
- (cond
- ((pair? body)
- (sxml->xml (car body) port)
- (lp (cdr body)))
- ((null? body)
- (display "</" port)
- (display tag port)
- (display ">" port))
- (else
- (error "bad element body" tag body)))))
- (display " />" port)))
- ;; FIXME: ensure name is valid
- (define (entity->xml name port)
- (display #\& port)
- (display name port)
- (display #\; port))
- ;; FIXME: ensure tag and str are valid
- (define (pi->xml tag str port)
- (display "<?" port)
- (display tag port)
- (display #\space port)
- (display str port)
- (display "?>" port))
- (define* (sxml->xml tree #:optional (port (current-output-port)))
- "Serialize the sxml tree @var{tree} as XML. The output will be written
- to the current output port, unless the optional argument @var{port} is
- present."
- (cond
- ((pair? tree)
- (if (symbol? (car tree))
- ;; An element.
- (let ((tag (car tree)))
- (case tag
- ((*TOP*)
- (sxml->xml (cdr tree) port))
- ((*ENTITY*)
- (if (and (list? (cdr tree)) (= (length (cdr tree)) 1))
- (entity->xml (cadr tree) port)
- (error "bad *ENTITY* args" (cdr tree))))
- ((*PI*)
- (if (and (list? (cdr tree)) (= (length (cdr tree)) 2))
- (pi->xml (cadr tree) (caddr tree) port)
- (error "bad *PI* args" (cdr tree))))
- (else
- (let* ((elems (cdr tree))
- (attrs (and (pair? elems) (pair? (car elems))
- (eq? '@ (caar elems))
- (cdar elems))))
- (element->xml tag attrs (if attrs (cdr elems) elems) port)))))
- ;; A nodelist.
- (for-each (lambda (x) (sxml->xml x port)) tree)))
- ((string? tree)
- (string->escaped-xml tree port))
- ((null? tree) *unspecified*)
- ((not tree) *unspecified*)
- ((eqv? tree #t) *unspecified*)
- ((procedure? tree)
- (with-output-to-port port tree))
- (else
- (string->escaped-xml
- (call-with-output-string (lambda (port) (display tree port)))
- port))))
- (define (sxml->string sxml)
- "Detag an sxml tree @var{sxml} into a string. Does not perform any
- formatting."
- (string-concatenate-reverse
- (foldts
- (lambda (seed tree) ; fdown
- '())
- (lambda (seed kid-seed tree) ; fup
- (append! kid-seed seed))
- (lambda (seed tree) ; fhere
- (if (string? tree) (cons tree seed) seed))
- '()
- sxml)))
- (define (make-char-quotator char-encoding)
- (let ((bad-chars (list->char-set (map car char-encoding))))
-
- ;; Check to see if str contains one of the characters in charset,
- ;; from the position i onward. If so, return that character's index.
- ;; otherwise, return #f
- (define (index-cset str i charset)
- (string-index str charset i))
-
- ;; The body of the function
- (lambda (str port)
- (let ((bad-pos (index-cset str 0 bad-chars)))
- (if (not bad-pos)
- (display str port) ; str had all good chars
- (let loop ((from 0) (to bad-pos))
- (cond
- ((>= from (string-length str)) *unspecified*)
- ((not to)
- (display (substring str from (string-length str)) port))
- (else
- (let ((quoted-char
- (cdr (assv (string-ref str to) char-encoding)))
- (new-to
- (index-cset str (+ 1 to) bad-chars)))
- (if (< from to)
- (display (substring str from to) port))
- (display quoted-char port)
- (loop (1+ to) new-to))))))))))
- ;; Given a string, check to make sure it does not contain characters
- ;; such as '<' or '&' that require encoding. Return either the original
- ;; string, or a list of string fragments with special characters
- ;; replaced by appropriate character entities.
- (define string->escaped-xml
- (make-char-quotator
- '((#\< . "<") (#\> . ">") (#\& . "&") (#\" . """))))
- ;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac
- ;;; simple.scm ends here
|