xpath.scm 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. ;;;; (sxml xpath) -- SXPath
  2. ;;;;
  3. ;;;; Copyright (C) 2009 Free Software Foundation, Inc.
  4. ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
  5. ;;;; Written 2001 by Oleg Kiselyov <oleg at pobox dot com> SXPath.scm.
  6. ;;;;
  7. ;;;; This library is free software; you can redistribute it and/or
  8. ;;;; modify it under the terms of the GNU Lesser General Public
  9. ;;;; License as published by the Free Software Foundation; either
  10. ;;;; version 3 of the License, or (at your option) any later version.
  11. ;;;;
  12. ;;;; This library is distributed in the hope that it will be useful,
  13. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;;;; Lesser General Public License for more details.
  16. ;;;;
  17. ;;;; You should have received a copy of the GNU Lesser General Public
  18. ;;;; License along with this library; if not, write to the Free Software
  19. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;;;;
  21. ;;; Commentary:
  22. ;;
  23. ;;@heading SXPath: SXML Query Language
  24. ;;
  25. ;; SXPath is a query language for SXML, an instance of XML Information
  26. ;; set (Infoset) in the form of s-expressions. See @code{(sxml ssax)}
  27. ;; for the definition of SXML and more details. SXPath is also a
  28. ;; translation into Scheme of an XML Path Language,
  29. ;; @uref{http://www.w3.org/TR/xpath,XPath}. XPath and SXPath describe
  30. ;; means of selecting a set of Infoset's items or their properties.
  31. ;;
  32. ;; To facilitate queries, XPath maps the XML Infoset into an explicit
  33. ;; tree, and introduces important notions of a location path and a
  34. ;; current, context node. A location path denotes a selection of a set of
  35. ;; nodes relative to a context node. Any XPath tree has a distinguished,
  36. ;; root node -- which serves as the context node for absolute location
  37. ;; paths. Location path is recursively defined as a location step joined
  38. ;; with a location path. A location step is a simple query of the
  39. ;; database relative to a context node. A step may include expressions
  40. ;; that further filter the selected set. Each node in the resulting set
  41. ;; is used as a context node for the adjoining location path. The result
  42. ;; of the step is a union of the sets returned by the latter location
  43. ;; paths.
  44. ;;
  45. ;; The SXML representation of the XML Infoset (see SSAX.scm) is rather
  46. ;; suitable for querying as it is. Bowing to the XPath specification,
  47. ;; we will refer to SXML information items as 'Nodes':
  48. ;;@example
  49. ;; <Node> ::= <Element> | <attributes-coll> | <attrib>
  50. ;; | "text string" | <PI>
  51. ;;@end example
  52. ;; This production can also be described as
  53. ;;@example
  54. ;; <Node> ::= (name . <Nodeset>) | "text string"
  55. ;;@end example
  56. ;; An (ordered) set of nodes is just a list of the constituent nodes:
  57. ;;@example
  58. ;; <Nodeset> ::= (<Node> ...)
  59. ;;@end example
  60. ;; Nodesets, and Nodes other than text strings are both lists. A
  61. ;; <Nodeset> however is either an empty list, or a list whose head is not
  62. ;; a symbol. A symbol at the head of a node is either an XML name (in
  63. ;; which case it's a tag of an XML element), or an administrative name
  64. ;; such as '@@'. This uniform list representation makes processing rather
  65. ;; simple and elegant, while avoiding confusion. The multi-branch tree
  66. ;; structure formed by the mutually-recursive datatypes <Node> and
  67. ;; <Nodeset> lends itself well to processing by functional languages.
  68. ;;
  69. ;; A location path is in fact a composite query over an XPath tree or
  70. ;; its branch. A singe step is a combination of a projection, selection
  71. ;; or a transitive closure. Multiple steps are combined via join and
  72. ;; union operations. This insight allows us to @emph{elegantly}
  73. ;; implement XPath as a sequence of projection and filtering primitives
  74. ;; -- converters -- joined by @dfn{combinators}. Each converter takes a
  75. ;; node and returns a nodeset which is the result of the corresponding
  76. ;; query relative to that node. A converter can also be called on a set
  77. ;; of nodes. In that case it returns a union of the corresponding
  78. ;; queries over each node in the set. The union is easily implemented as
  79. ;; a list append operation as all nodes in a SXML tree are considered
  80. ;; distinct, by XPath conventions. We also preserve the order of the
  81. ;; members in the union. Query combinators are high-order functions:
  82. ;; they take converter(s) (which is a Node|Nodeset -> Nodeset function)
  83. ;; and compose or otherwise combine them. We will be concerned with only
  84. ;; relative location paths [XPath]: an absolute location path is a
  85. ;; relative path applied to the root node.
  86. ;;
  87. ;; Similarly to XPath, SXPath defines full and abbreviated notations
  88. ;; for location paths. In both cases, the abbreviated notation can be
  89. ;; mechanically expanded into the full form by simple rewriting
  90. ;; rules. In case of SXPath the corresponding rules are given as
  91. ;; comments to a sxpath function, below. The regression test suite at
  92. ;; the end of this file shows a representative sample of SXPaths in
  93. ;; both notations, juxtaposed with the corresponding XPath
  94. ;; expressions. Most of the samples are borrowed literally from the
  95. ;; XPath specification, while the others are adjusted for our running
  96. ;; example, tree1.
  97. ;;
  98. ;;; Code:
  99. (define-module (sxml xpath)
  100. #:use-module (ice-9 pretty-print)
  101. #:export (nodeset? node-typeof? node-eq? node-equal? node-pos
  102. filter take-until take-after map-union node-reverse
  103. node-trace select-kids node-self node-join node-reduce
  104. node-or node-closure node-parent
  105. sxpath))
  106. ;; Upstream version:
  107. ; $Id: SXPath.scm,v 3.5 2001/01/12 23:20:35 oleg Exp oleg $
  108. (define (nodeset? x)
  109. (or (and (pair? x) (not (symbol? (car x)))) (null? x)))
  110. ;-------------------------
  111. ; Basic converters and applicators
  112. ; A converter is a function
  113. ; type Converter = Node|Nodeset -> Nodeset
  114. ; A converter can also play a role of a predicate: in that case, if a
  115. ; converter, applied to a node or a nodeset, yields a non-empty
  116. ; nodeset, the converter-predicate is deemed satisfied. Throughout
  117. ; this file a nil nodeset is equivalent to #f in denoting a failure.
  118. ; The following function implements a 'Node test' as defined in
  119. ; Sec. 2.3 of XPath document. A node test is one of the components of a
  120. ; location step. It is also a converter-predicate in SXPath.
  121. ;
  122. ; The function node-typeof? takes a type criterion and returns a function,
  123. ; which, when applied to a node, will tell if the node satisfies
  124. ; the test.
  125. ; node-typeof? :: Crit -> Node -> Boolean
  126. ;
  127. ; The criterion 'crit' is a symbol, one of the following:
  128. ; id - tests if the Node has the right name (id)
  129. ; @ - tests if the Node is an <attributes-coll>
  130. ; * - tests if the Node is an <Element>
  131. ; *text* - tests if the Node is a text node
  132. ; *PI* - tests if the Node is a PI node
  133. ; *any* - #t for any type of Node
  134. (define (node-typeof? crit)
  135. (lambda (node)
  136. (case crit
  137. ((*) (and (pair? node) (not (memq (car node) '(@ *PI*)))))
  138. ((*any*) #t)
  139. ((*text*) (string? node))
  140. (else
  141. (and (pair? node) (eq? crit (car node))))
  142. )))
  143. ; Curried equivalence converter-predicates
  144. (define (node-eq? other)
  145. (lambda (node)
  146. (eq? other node)))
  147. (define (node-equal? other)
  148. (lambda (node)
  149. (equal? other node)))
  150. ; node-pos:: N -> Nodeset -> Nodeset, or
  151. ; node-pos:: N -> Converter
  152. ; Select the N'th element of a Nodeset and return as a singular Nodeset;
  153. ; Return an empty nodeset if the Nth element does not exist.
  154. ; ((node-pos 1) Nodeset) selects the node at the head of the Nodeset,
  155. ; if exists; ((node-pos 2) Nodeset) selects the Node after that, if
  156. ; exists.
  157. ; N can also be a negative number: in that case the node is picked from
  158. ; the tail of the list.
  159. ; ((node-pos -1) Nodeset) selects the last node of a non-empty nodeset;
  160. ; ((node-pos -2) Nodeset) selects the last but one node, if exists.
  161. (define (node-pos n)
  162. (lambda (nodeset)
  163. (cond
  164. ((not (nodeset? nodeset)) '())
  165. ((null? nodeset) nodeset)
  166. ((eqv? n 1) (list (car nodeset)))
  167. ((negative? n) ((node-pos (+ n 1 (length nodeset))) nodeset))
  168. (else
  169. (or (positive? n) (error "yikes!"))
  170. ((node-pos (1- n)) (cdr nodeset))))))
  171. ; filter:: Converter -> Converter
  172. ; A filter applicator, which introduces a filtering context. The argument
  173. ; converter is considered a predicate, with either #f or nil result meaning
  174. ; failure.
  175. (define (filter pred?)
  176. (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
  177. (let loop ((lst (if (nodeset? lst) lst (list lst))) (res '()))
  178. (if (null? lst)
  179. (reverse res)
  180. (let ((pred-result (pred? (car lst))))
  181. (loop (cdr lst)
  182. (if (and pred-result (not (null? pred-result)))
  183. (cons (car lst) res)
  184. res)))))))
  185. ; take-until:: Converter -> Converter, or
  186. ; take-until:: Pred -> Node|Nodeset -> Nodeset
  187. ; Given a converter-predicate and a nodeset, apply the predicate to
  188. ; each element of the nodeset, until the predicate yields anything but #f or
  189. ; nil. Return the elements of the input nodeset that have been processed
  190. ; till that moment (that is, which fail the predicate).
  191. ; take-until is a variation of the filter above: take-until passes
  192. ; elements of an ordered input set till (but not including) the first
  193. ; element that satisfies the predicate.
  194. ; The nodeset returned by ((take-until (not pred)) nset) is a subset --
  195. ; to be more precise, a prefix -- of the nodeset returned by
  196. ; ((filter pred) nset)
  197. (define (take-until pred?)
  198. (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
  199. (let loop ((lst (if (nodeset? lst) lst (list lst))))
  200. (if (null? lst) lst
  201. (let ((pred-result (pred? (car lst))))
  202. (if (and pred-result (not (null? pred-result)))
  203. '()
  204. (cons (car lst) (loop (cdr lst)))))
  205. ))))
  206. ; take-after:: Converter -> Converter, or
  207. ; take-after:: Pred -> Node|Nodeset -> Nodeset
  208. ; Given a converter-predicate and a nodeset, apply the predicate to
  209. ; each element of the nodeset, until the predicate yields anything but #f or
  210. ; nil. Return the elements of the input nodeset that have not been processed:
  211. ; that is, return the elements of the input nodeset that follow the first
  212. ; element that satisfied the predicate.
  213. ; take-after along with take-until partition an input nodeset into three
  214. ; parts: the first element that satisfies a predicate, all preceding
  215. ; elements and all following elements.
  216. (define (take-after pred?)
  217. (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset)
  218. (let loop ((lst (if (nodeset? lst) lst (list lst))))
  219. (if (null? lst) lst
  220. (let ((pred-result (pred? (car lst))))
  221. (if (and pred-result (not (null? pred-result)))
  222. (cdr lst)
  223. (loop (cdr lst))))
  224. ))))
  225. ; Apply proc to each element of lst and return the list of results.
  226. ; if proc returns a nodeset, splice it into the result
  227. ;
  228. ; From another point of view, map-union is a function Converter->Converter,
  229. ; which places an argument-converter in a joining context.
  230. (define (map-union proc lst)
  231. (if (null? lst) lst
  232. (let ((proc-res (proc (car lst))))
  233. ((if (nodeset? proc-res) append cons)
  234. proc-res (map-union proc (cdr lst))))))
  235. ; node-reverse :: Converter, or
  236. ; node-reverse:: Node|Nodeset -> Nodeset
  237. ; Reverses the order of nodes in the nodeset
  238. ; This basic converter is needed to implement a reverse document order
  239. ; (see the XPath Recommendation).
  240. (define node-reverse
  241. (lambda (node-or-nodeset)
  242. (if (not (nodeset? node-or-nodeset)) (list node-or-nodeset)
  243. (reverse node-or-nodeset))))
  244. ; node-trace:: String -> Converter
  245. ; (node-trace title) is an identity converter. In addition it prints out
  246. ; a node or nodeset it is applied to, prefixed with the 'title'.
  247. ; This converter is very useful for debugging.
  248. (define (node-trace title)
  249. (lambda (node-or-nodeset)
  250. (display "\n-->")
  251. (display title)
  252. (display " :")
  253. (pretty-print node-or-nodeset)
  254. node-or-nodeset))
  255. ;-------------------------
  256. ; Converter combinators
  257. ;
  258. ; Combinators are higher-order functions that transmogrify a converter
  259. ; or glue a sequence of converters into a single, non-trivial
  260. ; converter. The goal is to arrive at converters that correspond to
  261. ; XPath location paths.
  262. ;
  263. ; From a different point of view, a combinator is a fixed, named
  264. ; _pattern_ of applying converters. Given below is a complete set of
  265. ; such patterns that together implement XPath location path
  266. ; specification. As it turns out, all these combinators can be built
  267. ; from a small number of basic blocks: regular functional composition,
  268. ; map-union and filter applicators, and the nodeset union.
  269. ; select-kids:: Pred -> Node -> Nodeset
  270. ; Given a Node, return an (ordered) subset its children that satisfy
  271. ; the Pred (a converter, actually)
  272. ; select-kids:: Pred -> Nodeset -> Nodeset
  273. ; The same as above, but select among children of all the nodes in
  274. ; the Nodeset
  275. ;
  276. ; More succinctly, the signature of this function is
  277. ; select-kids:: Converter -> Converter
  278. (define (select-kids test-pred?)
  279. (lambda (node) ; node or node-set
  280. (cond
  281. ((null? node) node)
  282. ((not (pair? node)) '()) ; No children
  283. ((symbol? (car node))
  284. ((filter test-pred?) (cdr node))) ; it's a single node
  285. (else (map-union (select-kids test-pred?) node)))))
  286. ; node-self:: Pred -> Node -> Nodeset, or
  287. ; node-self:: Converter -> Converter
  288. ; Similar to select-kids but apply to the Node itself rather
  289. ; than to its children. The resulting Nodeset will contain either one
  290. ; component, or will be empty (if the Node failed the Pred).
  291. (define node-self filter)
  292. ; node-join:: [LocPath] -> Node|Nodeset -> Nodeset, or
  293. ; node-join:: [Converter] -> Converter
  294. ; join the sequence of location steps or paths as described
  295. ; in the title comments above.
  296. (define (node-join . selectors)
  297. (lambda (nodeset) ; Nodeset or node
  298. (let loop ((nodeset nodeset) (selectors selectors))
  299. (if (null? selectors) nodeset
  300. (loop
  301. (if (nodeset? nodeset)
  302. (map-union (car selectors) nodeset)
  303. ((car selectors) nodeset))
  304. (cdr selectors))))))
  305. ; node-reduce:: [LocPath] -> Node|Nodeset -> Nodeset, or
  306. ; node-reduce:: [Converter] -> Converter
  307. ; A regular functional composition of converters.
  308. ; From a different point of view,
  309. ; ((apply node-reduce converters) nodeset)
  310. ; is equivalent to
  311. ; (foldl apply nodeset converters)
  312. ; i.e., folding, or reducing, a list of converters with the nodeset
  313. ; as a seed.
  314. (define (node-reduce . converters)
  315. (lambda (nodeset) ; Nodeset or node
  316. (let loop ((nodeset nodeset) (converters converters))
  317. (if (null? converters) nodeset
  318. (loop ((car converters) nodeset) (cdr converters))))))
  319. ; node-or:: [Converter] -> Converter
  320. ; This combinator applies all converters to a given node and
  321. ; produces the union of their results.
  322. ; This combinator corresponds to a union, '|' operation for XPath
  323. ; location paths.
  324. ; (define (node-or . converters)
  325. ; (lambda (node-or-nodeset)
  326. ; (if (null? converters) node-or-nodeset
  327. ; (append
  328. ; ((car converters) node-or-nodeset)
  329. ; ((apply node-or (cdr converters)) node-or-nodeset)))))
  330. ; More optimal implementation follows
  331. (define (node-or . converters)
  332. (lambda (node-or-nodeset)
  333. (let loop ((result '()) (converters converters))
  334. (if (null? converters) result
  335. (loop (append result (or ((car converters) node-or-nodeset) '()))
  336. (cdr converters))))))
  337. ; node-closure:: Converter -> Converter
  338. ; Select all _descendants_ of a node that satisfy a converter-predicate.
  339. ; This combinator is similar to select-kids but applies to
  340. ; grand... children as well.
  341. ; This combinator implements the "descendant::" XPath axis
  342. ; Conceptually, this combinator can be expressed as
  343. ; (define (node-closure f)
  344. ; (node-or
  345. ; (select-kids f)
  346. ; (node-reduce (select-kids (node-typeof? '*)) (node-closure f))))
  347. ; This definition, as written, looks somewhat like a fixpoint, and it
  348. ; will run forever. It is obvious however that sooner or later
  349. ; (select-kids (node-typeof? '*)) will return an empty nodeset. At
  350. ; this point further iterations will no longer affect the result and
  351. ; can be stopped.
  352. (define (node-closure test-pred?)
  353. (lambda (node) ; Nodeset or node
  354. (let loop ((parent node) (result '()))
  355. (if (null? parent) result
  356. (loop ((select-kids (node-typeof? '*)) parent)
  357. (append result
  358. ((select-kids test-pred?) parent)))
  359. ))))
  360. ; node-parent:: RootNode -> Converter
  361. ; (node-parent rootnode) yields a converter that returns a parent of a
  362. ; node it is applied to. If applied to a nodeset, it returns the list
  363. ; of parents of nodes in the nodeset. The rootnode does not have
  364. ; to be the root node of the whole SXML tree -- it may be a root node
  365. ; of a branch of interest.
  366. ; Given the notation of Philip Wadler's paper on semantics of XSLT,
  367. ; parent(x) = { y | y=subnode*(root), x=subnode(y) }
  368. ; Therefore, node-parent is not the fundamental converter: it can be
  369. ; expressed through the existing ones. Yet node-parent is a rather
  370. ; convenient converter. It corresponds to a parent:: axis of SXPath.
  371. ; Note that the parent:: axis can be used with an attribute node as well!
  372. (define (node-parent rootnode)
  373. (lambda (node) ; Nodeset or node
  374. (if (nodeset? node) (map-union (node-parent rootnode) node)
  375. (let ((pred
  376. (node-or
  377. (node-reduce
  378. (node-self (node-typeof? '*))
  379. (select-kids (node-eq? node)))
  380. (node-join
  381. (select-kids (node-typeof? '@))
  382. (select-kids (node-eq? node))))))
  383. ((node-or
  384. (node-self pred)
  385. (node-closure pred))
  386. rootnode)))))
  387. ;-------------------------
  388. ; Evaluate an abbreviated SXPath
  389. ; sxpath:: AbbrPath -> Converter, or
  390. ; sxpath:: AbbrPath -> Node|Nodeset -> Nodeset
  391. ; AbbrPath is a list. It is translated to the full SXPath according
  392. ; to the following rewriting rules
  393. ; (sxpath '()) -> (node-join)
  394. ; (sxpath '(path-component ...)) ->
  395. ; (node-join (sxpath1 path-component) (sxpath '(...)))
  396. ; (sxpath1 '//) -> (node-or
  397. ; (node-self (node-typeof? '*any*))
  398. ; (node-closure (node-typeof? '*any*)))
  399. ; (sxpath1 '(equal? x)) -> (select-kids (node-equal? x))
  400. ; (sxpath1 '(eq? x)) -> (select-kids (node-eq? x))
  401. ; (sxpath1 ?symbol) -> (select-kids (node-typeof? ?symbol)
  402. ; (sxpath1 procedure) -> procedure
  403. ; (sxpath1 '(?symbol ...)) -> (sxpath1 '((?symbol) ...))
  404. ; (sxpath1 '(path reducer ...)) ->
  405. ; (node-reduce (sxpath path) (sxpathr reducer) ...)
  406. ; (sxpathr number) -> (node-pos number)
  407. ; (sxpathr path-filter) -> (filter (sxpath path-filter))
  408. (define (sxpath path)
  409. (lambda (nodeset)
  410. (let loop ((nodeset nodeset) (path path))
  411. (cond
  412. ((null? path) nodeset)
  413. ((nodeset? nodeset)
  414. (map-union (sxpath path) nodeset))
  415. ((procedure? (car path))
  416. (loop ((car path) nodeset) (cdr path)))
  417. ((eq? '// (car path))
  418. (loop
  419. ((if (nodeset? nodeset) append cons) nodeset
  420. ((node-closure (node-typeof? '*any*)) nodeset))
  421. (cdr path)))
  422. ((symbol? (car path))
  423. (loop ((select-kids (node-typeof? (car path))) nodeset)
  424. (cdr path)))
  425. ((and (pair? (car path)) (eq? 'equal? (caar path)))
  426. (loop ((select-kids (apply node-equal? (cdar path))) nodeset)
  427. (cdr path)))
  428. ((and (pair? (car path)) (eq? 'eq? (caar path)))
  429. (loop ((select-kids (apply node-eq? (cdar path))) nodeset)
  430. (cdr path)))
  431. ((pair? (car path))
  432. (let reducer ((nodeset
  433. (if (symbol? (caar path))
  434. ((select-kids (node-typeof? (caar path))) nodeset)
  435. (loop nodeset (caar path))))
  436. (reducing-path (cdar path)))
  437. (cond
  438. ((null? reducing-path) (loop nodeset (cdr path)))
  439. ((number? (car reducing-path))
  440. (reducer ((node-pos (car reducing-path)) nodeset)
  441. (cdr reducing-path)))
  442. (else
  443. (reducer ((filter (sxpath (car reducing-path))) nodeset)
  444. (cdr reducing-path))))))
  445. (else
  446. (error "Invalid path step: " (car path)))))))
  447. ;;; arch-tag: c4e57abf-6b61-4612-a6aa-d1536d440774
  448. ;;; xpath.scm ends here