regex.scm 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. ;;;; Copyright (C) 1997, 1999, 2001, 2004, 2005, 2006, 2008 Free Software Foundation, Inc.
  2. ;;;;
  3. ;;;; This library is free software; you can redistribute it and/or
  4. ;;;; modify it under the terms of the GNU Lesser General Public
  5. ;;;; License as published by the Free Software Foundation; either
  6. ;;;; version 2.1 of the License, or (at your option) any later version.
  7. ;;;;
  8. ;;;; This library is distributed in the hope that it will be useful,
  9. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. ;;;; Lesser General Public License for more details.
  12. ;;;;
  13. ;;;; You should have received a copy of the GNU Lesser General Public
  14. ;;;; License along with this library; if not, write to the Free Software
  15. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  16. ;;;;
  17. ;;; Commentary:
  18. ;; These procedures are exported:
  19. ;; (match:count match)
  20. ;; (match:string match)
  21. ;; (match:prefix match)
  22. ;; (match:suffix match)
  23. ;; (regexp-match? match)
  24. ;; (regexp-quote string)
  25. ;; (match:start match . submatch-num)
  26. ;; (match:end match . submatch-num)
  27. ;; (match:substring match . submatch-num)
  28. ;; (string-match pattern str . start)
  29. ;; (regexp-substitute port match . items)
  30. ;; (fold-matches regexp string init proc . flags)
  31. ;; (list-matches regexp string . flags)
  32. ;; (regexp-substitute/global port regexp string . items)
  33. ;;; Code:
  34. ;;;; POSIX regex support functions.
  35. (define-module (ice-9 regex)
  36. :export (match:count match:string match:prefix match:suffix
  37. regexp-match? regexp-quote match:start match:end match:substring
  38. string-match regexp-substitute fold-matches list-matches
  39. regexp-substitute/global))
  40. ;; References:
  41. ;;
  42. ;; POSIX spec:
  43. ;; http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
  44. ;;; FIXME:
  45. ;;; It is not clear what should happen if a `match' function
  46. ;;; is passed a `match number' which is out of bounds for the
  47. ;;; regexp match: return #f, or throw an error? These routines
  48. ;;; throw an out-of-range error.
  49. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  50. ;;;; These procedures are not defined in SCSH, but I found them useful.
  51. (define (match:count match)
  52. (- (vector-length match) 1))
  53. (define (match:string match)
  54. (vector-ref match 0))
  55. (define (match:prefix match)
  56. (substring (match:string match) 0 (match:start match 0)))
  57. (define (match:suffix match)
  58. (substring (match:string match) (match:end match 0)))
  59. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  60. ;;;; SCSH compatibility routines.
  61. (define (regexp-match? match)
  62. (and (vector? match)
  63. (string? (vector-ref match 0))
  64. (let loop ((i 1))
  65. (cond ((>= i (vector-length match)) #t)
  66. ((and (pair? (vector-ref match i))
  67. (integer? (car (vector-ref match i)))
  68. (integer? (cdr (vector-ref match i))))
  69. (loop (+ 1 i)))
  70. (else #f)))))
  71. ;; * . \ ^ $ and [ are special in both regexp/basic and regexp/extended and
  72. ;; can be backslash escaped.
  73. ;;
  74. ;; ( ) + ? { } and | are special in regexp/extended so must be quoted. But
  75. ;; that can't be done with a backslash since in regexp/basic where they're
  76. ;; not special, adding a backslash makes them become special. Character
  77. ;; class forms [(] etc are used instead.
  78. ;;
  79. ;; ) is not special when not preceded by a (, and * and ? are not special at
  80. ;; the start of a string, but we quote all of these always, so the result
  81. ;; can be concatenated or merged into some larger regexp.
  82. ;;
  83. ;; ] is not special outside a [ ] character class, so doesn't need to be
  84. ;; quoted.
  85. ;;
  86. (define (regexp-quote string)
  87. (call-with-output-string
  88. (lambda (p)
  89. (string-for-each (lambda (c)
  90. (case c
  91. ((#\* #\. #\\ #\^ #\$ #\[)
  92. (write-char #\\ p)
  93. (write-char c p))
  94. ((#\( #\) #\+ #\? #\{ #\} #\|)
  95. (write-char #\[ p)
  96. (write-char c p)
  97. (write-char #\] p))
  98. (else
  99. (write-char c p))))
  100. string))))
  101. (define (match:start match . args)
  102. (let* ((matchnum (if (pair? args)
  103. (+ 1 (car args))
  104. 1))
  105. (start (car (vector-ref match matchnum))))
  106. (if (= start -1) #f start)))
  107. (define (match:end match . args)
  108. (let* ((matchnum (if (pair? args)
  109. (+ 1 (car args))
  110. 1))
  111. (end (cdr (vector-ref match matchnum))))
  112. (if (= end -1) #f end)))
  113. (define (match:substring match . args)
  114. (let* ((matchnum (if (pair? args)
  115. (car args)
  116. 0))
  117. (start (match:start match matchnum))
  118. (end (match:end match matchnum)))
  119. (and start end (substring (match:string match) start end))))
  120. (define (string-match pattern str . args)
  121. (let ((rx (make-regexp pattern))
  122. (start (if (pair? args) (car args) 0)))
  123. (regexp-exec rx str start)))
  124. (define (regexp-substitute port match . items)
  125. ;; If `port' is #f, send output to a string.
  126. (if (not port)
  127. (call-with-output-string
  128. (lambda (p)
  129. (apply regexp-substitute p match items)))
  130. ;; Otherwise, process each substitution argument in `items'.
  131. (for-each (lambda (obj)
  132. (cond ((string? obj) (display obj port))
  133. ((integer? obj) (display (match:substring match obj) port))
  134. ((eq? 'pre obj) (display (match:prefix match) port))
  135. ((eq? 'post obj) (display (match:suffix match) port))
  136. (else (error 'wrong-type-arg obj))))
  137. items)))
  138. ;;; If we call fold-matches, below, with a regexp that can match the
  139. ;;; empty string, it's not obvious what "all the matches" means. How
  140. ;;; many empty strings are there in the string "a"? Our answer:
  141. ;;;
  142. ;;; This function applies PROC to every non-overlapping, maximal
  143. ;;; match of REGEXP in STRING.
  144. ;;;
  145. ;;; "non-overlapping": There are two non-overlapping matches of "" in
  146. ;;; "a" --- one before the `a', and one after. There are three
  147. ;;; non-overlapping matches of "q|x*" in "aqb": the empty strings
  148. ;;; before `a' and after `b', and `q'. The two empty strings before
  149. ;;; and after `q' don't count, because they overlap with the match of
  150. ;;; "q".
  151. ;;;
  152. ;;; "maximal": There are three distinct maximal matches of "x*" in
  153. ;;; "axxxb": one before the `a', one covering `xxx', and one after the
  154. ;;; `b'. Around or within `xxx', only the match covering all three
  155. ;;; x's counts, because the rest are not maximal.
  156. (define (fold-matches regexp string init proc . flags)
  157. (let ((regexp (if (regexp? regexp) regexp (make-regexp regexp)))
  158. (flags (if (null? flags) 0 (car flags))))
  159. (let loop ((start 0)
  160. (value init)
  161. (abuts #f)) ; True if start abuts a previous match.
  162. (let ((m (if (> start (string-length string)) #f
  163. (regexp-exec regexp string start flags))))
  164. (cond
  165. ((not m) value)
  166. ((and (= (match:start m) (match:end m)) abuts)
  167. ;; We matched an empty string, but that would overlap the
  168. ;; match immediately before. Try again at a position
  169. ;; further to the right.
  170. (loop (+ start 1) value #f))
  171. (else
  172. (loop (match:end m) (proc m value) #t)))))))
  173. (define (list-matches regexp string . flags)
  174. (reverse! (apply fold-matches regexp string '() cons flags)))
  175. (define (regexp-substitute/global port regexp string . items)
  176. ;; If `port' is #f, send output to a string.
  177. (if (not port)
  178. (call-with-output-string
  179. (lambda (p)
  180. (apply regexp-substitute/global p regexp string items)))
  181. ;; Walk the set of non-overlapping, maximal matches.
  182. (let next-match ((matches (list-matches regexp string))
  183. (start 0))
  184. (if (null? matches)
  185. (display (substring string start) port)
  186. (let ((m (car matches)))
  187. ;; Process all of the items for this match. Don't use
  188. ;; for-each, because we need to make sure 'post at the
  189. ;; end of the item list is a tail call.
  190. (let next-item ((items items))
  191. (define (do-item item)
  192. (cond
  193. ((string? item) (display item port))
  194. ((integer? item) (display (match:substring m item) port))
  195. ((procedure? item) (display (item m) port))
  196. ((eq? item 'pre)
  197. (display
  198. (substring string start (match:start m))
  199. port))
  200. ((eq? item 'post)
  201. (next-match (cdr matches) (match:end m)))
  202. (else (error 'wrong-type-arg item))))
  203. (if (pair? items)
  204. (if (null? (cdr items))
  205. (do-item (car items)) ; This is a tail call.
  206. (begin
  207. (do-item (car items)) ; This is not.
  208. (next-item (cdr items)))))))))))