ssax.scm 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. ;;;; (sxml ssax) -- the SSAX parser
  2. ;;;;
  3. ;;;; Copyright (C) 2009, 2010,2012,2013 Free Software Foundation, Inc.
  4. ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
  5. ;;;; Written 2001,2002,2003,2004 by Oleg Kiselyov <oleg at pobox dot com> as SSAX.scm.
  6. ;;;;
  7. ;;;; This library is free software; you can redistribute it and/or
  8. ;;;; modify it under the terms of the GNU Lesser General Public
  9. ;;;; License as published by the Free Software Foundation; either
  10. ;;;; version 3 of the License, or (at your option) any later version.
  11. ;;;;
  12. ;;;; This library is distributed in the hope that it will be useful,
  13. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;;;; Lesser General Public License for more details.
  16. ;;;;
  17. ;;;; You should have received a copy of the GNU Lesser General Public
  18. ;;;; License along with this library; if not, write to the Free Software
  19. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;;;;
  21. ;;; Commentary:
  22. ;;
  23. ;@subheading Functional XML parsing framework
  24. ;@subsubheading SAX/DOM and SXML parsers with support for XML Namespaces and validation
  25. ;
  26. ; This is a package of low-to-high level lexing and parsing procedures
  27. ; that can be combined to yield a SAX, a DOM, a validating parser, or
  28. ; a parser intended for a particular document type. The procedures in
  29. ; the package can be used separately to tokenize or parse various
  30. ; pieces of XML documents. The package supports XML Namespaces,
  31. ; internal and external parsed entities, user-controlled handling of
  32. ; whitespace, and validation. This module therefore is intended to be
  33. ; a framework, a set of "Lego blocks" you can use to build a parser
  34. ; following any discipline and performing validation to any degree. As
  35. ; an example of the parser construction, this file includes a
  36. ; semi-validating SXML parser.
  37. ; The present XML framework has a "sequential" feel of SAX yet a
  38. ; "functional style" of DOM. Like a SAX parser, the framework scans the
  39. ; document only once and permits incremental processing. An application
  40. ; that handles document elements in order can run as efficiently as
  41. ; possible. @emph{Unlike} a SAX parser, the framework does not require
  42. ; an application register stateful callbacks and surrender control to
  43. ; the parser. Rather, it is the application that can drive the framework
  44. ; -- calling its functions to get the current lexical or syntax element.
  45. ; These functions do not maintain or mutate any state save the input
  46. ; port. Therefore, the framework permits parsing of XML in a pure
  47. ; functional style, with the input port being a monad (or a linear,
  48. ; read-once parameter).
  49. ; Besides the @var{port}, there is another monad -- @var{seed}. Most of
  50. ; the middle- and high-level parsers are single-threaded through the
  51. ; @var{seed}. The functions of this framework do not process or affect
  52. ; the @var{seed} in any way: they simply pass it around as an instance
  53. ; of an opaque datatype. User functions, on the other hand, can use the
  54. ; seed to maintain user's state, to accumulate parsing results, etc. A
  55. ; user can freely mix his own functions with those of the framework. On
  56. ; the other hand, the user may wish to instantiate a high-level parser:
  57. ; @code{SSAX:make-elem-parser} or @code{SSAX:make-parser}. In the latter
  58. ; case, the user must provide functions of specific signatures, which
  59. ; are called at predictable moments during the parsing: to handle
  60. ; character data, element data, or processing instructions (PI). The
  61. ; functions are always given the @var{seed}, among other parameters, and
  62. ; must return the new @var{seed}.
  63. ; From a functional point of view, XML parsing is a combined
  64. ; pre-post-order traversal of a "tree" that is the XML document
  65. ; itself. This down-and-up traversal tells the user about an element
  66. ; when its start tag is encountered. The user is notified about the
  67. ; element once more, after all element's children have been
  68. ; handled. The process of XML parsing therefore is a fold over the
  69. ; raw XML document. Unlike a fold over trees defined in [1], the
  70. ; parser is necessarily single-threaded -- obviously as elements
  71. ; in a text XML document are laid down sequentially. The parser
  72. ; therefore is a tree fold that has been transformed to accept an
  73. ; accumulating parameter [1,2].
  74. ; Formally, the denotational semantics of the parser can be expressed
  75. ; as
  76. ;@smallexample
  77. ; parser:: (Start-tag -> Seed -> Seed) ->
  78. ; (Start-tag -> Seed -> Seed -> Seed) ->
  79. ; (Char-Data -> Seed -> Seed) ->
  80. ; XML-text-fragment -> Seed -> Seed
  81. ; parser fdown fup fchar "<elem attrs> content </elem>" seed
  82. ; = fup "<elem attrs>" seed
  83. ; (parser fdown fup fchar "content" (fdown "<elem attrs>" seed))
  84. ;
  85. ; parser fdown fup fchar "char-data content" seed
  86. ; = parser fdown fup fchar "content" (fchar "char-data" seed)
  87. ;
  88. ; parser fdown fup fchar "elem-content content" seed
  89. ; = parser fdown fup fchar "content" (
  90. ; parser fdown fup fchar "elem-content" seed)
  91. ;@end smallexample
  92. ; Compare the last two equations with the left fold
  93. ;@smallexample
  94. ; fold-left kons elem:list seed = fold-left kons list (kons elem seed)
  95. ;@end smallexample
  96. ; The real parser created by @code{SSAX:make-parser} is slightly more
  97. ; complicated, to account for processing instructions, entity
  98. ; references, namespaces, processing of document type declaration, etc.
  99. ; The XML standard document referred to in this module is
  100. ; @uref{http://www.w3.org/TR/1998/REC-xml-19980210.html}
  101. ;
  102. ; The present file also defines a procedure that parses the text of an
  103. ; XML document or of a separate element into SXML, an S-expression-based
  104. ; model of an XML Information Set. SXML is also an Abstract Syntax Tree
  105. ; of an XML document. SXML is similar but not identical to DOM; SXML is
  106. ; particularly suitable for Scheme-based XML/HTML authoring, SXPath
  107. ; queries, and tree transformations. See SXML.html for more details.
  108. ; SXML is a term implementation of evaluation of the XML document [3].
  109. ; The other implementation is context-passing.
  110. ; The present frameworks fully supports the XML Namespaces Recommendation:
  111. ; @uref{http://www.w3.org/TR/REC-xml-names/}
  112. ; Other links:
  113. ;@table @asis
  114. ;@item [1]
  115. ; Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold,"
  116. ; Proc. ICFP'98, 1998, pp. 273-279.
  117. ;@item [2]
  118. ; Richard S. Bird, The promotion and accumulation strategies in
  119. ; transformational programming, ACM Trans. Progr. Lang. Systems,
  120. ; 6(4):487-504, October 1984.
  121. ;@item [3]
  122. ; Ralf Hinze, "Deriving Backtracking Monad Transformers,"
  123. ; Functional Pearl. Proc ICFP'00, pp. 186-197.
  124. ;@end table
  125. ;;
  126. ;;; Code:
  127. (define-module (sxml ssax)
  128. #:use-module (sxml ssax input-parse)
  129. #:use-module (srfi srfi-1)
  130. #:use-module (srfi srfi-13)
  131. #:export (current-ssax-error-port
  132. with-ssax-error-to-port
  133. xml-token? xml-token-kind xml-token-head
  134. make-empty-attlist attlist-add
  135. attlist-null?
  136. attlist-remove-top
  137. attlist->alist attlist-fold
  138. define-parsed-entity!
  139. reset-parsed-entity-definitions!
  140. ssax:uri-string->symbol
  141. ssax:skip-internal-dtd
  142. ssax:read-pi-body-as-string
  143. ssax:reverse-collect-str-drop-ws
  144. ssax:read-markup-token
  145. ssax:read-cdata-body
  146. ssax:read-char-ref
  147. ssax:read-attributes
  148. ssax:complete-start-tag
  149. ssax:read-external-id
  150. ssax:read-char-data
  151. ssax:xml->sxml
  152. ssax:make-parser
  153. ssax:make-pi-parser
  154. ssax:make-elem-parser))
  155. (define (parser-error port message . rest)
  156. (apply throw 'parser-error port message rest))
  157. (define ascii->char integer->char)
  158. (define char->ascii char->integer)
  159. (define current-ssax-error-port
  160. (make-parameter (current-error-port)))
  161. (define *current-ssax-error-port*
  162. (parameter-fluid current-ssax-error-port))
  163. (define (with-ssax-error-to-port port thunk)
  164. (parameterize ((current-ssax-error-port port))
  165. (thunk)))
  166. (define (ssax:warn port . args)
  167. (with-output-to-port (current-ssax-error-port)
  168. (lambda ()
  169. (display ";;; SSAX warning: ")
  170. (for-each display args)
  171. (newline))))
  172. (define (ucscode->string codepoint)
  173. (string (integer->char codepoint)))
  174. (define char-newline #\newline)
  175. (define char-return #\return)
  176. (define char-tab #\tab)
  177. (define nl "\n")
  178. ;; This isn't a great API, but a more proper fix will involve hacking
  179. ;; SSAX.
  180. (define (reset-parsed-entity-definitions!)
  181. "Restore the set of parsed entity definitions to its initial state."
  182. (set! ssax:predefined-parsed-entities
  183. '((amp . "&")
  184. (lt . "<")
  185. (gt . ">")
  186. (apos . "'")
  187. (quot . "\""))))
  188. (define (define-parsed-entity! entity str)
  189. "Define a new parsed entity. @var{entity} should be a symbol.
  190. Instances of &@var{entity}; in XML text will be replaced with the
  191. string @var{str}, which will then be parsed."
  192. (set! ssax:predefined-parsed-entities
  193. (acons entity str ssax:predefined-parsed-entities)))
  194. ;; Execute a sequence of forms and return the result of the _first_ one.
  195. ;; Like PROG1 in Lisp. Typically used to evaluate one or more forms with
  196. ;; side effects and return a value that must be computed before some or
  197. ;; all of the side effects happen.
  198. (define-syntax begin0
  199. (syntax-rules ()
  200. ((begin0 form form1 ... )
  201. (let ((val form)) form1 ... val))))
  202. ; Like let* but allowing for multiple-value bindings
  203. (define-syntax let*-values
  204. (syntax-rules ()
  205. ((let*-values () . bodies) (begin . bodies))
  206. ((let*-values (((var) initializer) . rest) . bodies)
  207. (let ((var initializer)) ; a single var optimization
  208. (let*-values rest . bodies)))
  209. ((let*-values ((vars initializer) . rest) . bodies)
  210. (call-with-values (lambda () initializer) ; the most generic case
  211. (lambda vars (let*-values rest . bodies))))))
  212. ;; needed for some dumb reason
  213. (define inc 1+)
  214. (define dec 1-)
  215. (define-syntax include-from-path/filtered
  216. (lambda (x)
  217. (define (read-filtered accept-list file)
  218. (with-input-from-file (%search-load-path file)
  219. (lambda ()
  220. (let loop ((sexp (read)) (out '()))
  221. (cond
  222. ((eof-object? sexp) (reverse out))
  223. ((and (pair? sexp) (memq (car sexp) accept-list))
  224. (loop (read) (cons sexp out)))
  225. (else
  226. (loop (read) out)))))))
  227. (syntax-case x ()
  228. ((_ accept-list file)
  229. (with-syntax (((exp ...) (datum->syntax
  230. x
  231. (read-filtered
  232. (syntax->datum #'accept-list)
  233. (syntax->datum #'file)))))
  234. #'(begin exp ...))))))
  235. (include-from-path "sxml/upstream/assert.scm")
  236. (include-from-path/filtered
  237. (define define-syntax ssax:define-labeled-arg-macro)
  238. "sxml/upstream/SSAX.scm")