regex.scm 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. ;;;; Copyright (C) 1997, 1999, 2001, 2004, 2005, 2006, 2008, 2010 Free Software Foundation, Inc.
  2. ;;;;
  3. ;;;; This library is free software; you can redistribute it and/or
  4. ;;;; modify it under the terms of the GNU Lesser General Public
  5. ;;;; License as published by the Free Software Foundation; either
  6. ;;;; version 3 of the License, or (at your option) any later version.
  7. ;;;;
  8. ;;;; This library is distributed in the hope that it will be useful,
  9. ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. ;;;; Lesser General Public License for more details.
  12. ;;;;
  13. ;;;; You should have received a copy of the GNU Lesser General Public
  14. ;;;; License along with this library; if not, write to the Free Software
  15. ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  16. ;;;;
  17. ;;; Commentary:
  18. ;; These procedures are exported:
  19. ;; (match:count match)
  20. ;; (match:string match)
  21. ;; (match:prefix match)
  22. ;; (match:suffix match)
  23. ;; (regexp-match? match)
  24. ;; (regexp-quote string)
  25. ;; (match:start match . submatch-num)
  26. ;; (match:end match . submatch-num)
  27. ;; (match:substring match . submatch-num)
  28. ;; (string-match pattern str . start)
  29. ;; (regexp-substitute port match . items)
  30. ;; (fold-matches regexp string init proc . flags)
  31. ;; (list-matches regexp string . flags)
  32. ;; (regexp-substitute/global port regexp string . items)
  33. ;;; Code:
  34. ;;;; POSIX regex support functions.
  35. (define-module (ice-9 regex)
  36. #:export (match:count match:string match:prefix match:suffix
  37. regexp-match? regexp-quote match:start match:end match:substring
  38. string-match regexp-substitute fold-matches list-matches
  39. regexp-substitute/global))
  40. ;; References:
  41. ;;
  42. ;; POSIX spec:
  43. ;; http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
  44. ;;; FIXME:
  45. ;;; It is not clear what should happen if a `match' function
  46. ;;; is passed a `match number' which is out of bounds for the
  47. ;;; regexp match: return #f, or throw an error? These routines
  48. ;;; throw an out-of-range error.
  49. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  50. ;;;; These procedures are not defined in SCSH, but I found them useful.
  51. (define (match:count match)
  52. (- (vector-length match) 1))
  53. (define (match:string match)
  54. (vector-ref match 0))
  55. (define (match:prefix match)
  56. (substring (match:string match) 0 (match:start match 0)))
  57. (define (match:suffix match)
  58. (substring (match:string match) (match:end match 0)))
  59. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  60. ;;;; SCSH compatibility routines.
  61. (define (regexp-match? match)
  62. (and (vector? match)
  63. (string? (vector-ref match 0))
  64. (let loop ((i 1))
  65. (cond ((>= i (vector-length match)) #t)
  66. ((and (pair? (vector-ref match i))
  67. (integer? (car (vector-ref match i)))
  68. (integer? (cdr (vector-ref match i))))
  69. (loop (+ 1 i)))
  70. (else #f)))))
  71. ;; * . \ ^ $ and [ are special in both regexp/basic and regexp/extended and
  72. ;; can be backslash escaped.
  73. ;;
  74. ;; ( ) + ? { } and | are special in regexp/extended so must be quoted. But
  75. ;; that can't be done with a backslash since in regexp/basic where they're
  76. ;; not special, adding a backslash makes them become special. Character
  77. ;; class forms [(] etc are used instead.
  78. ;;
  79. ;; ) is not special when not preceded by a (, and * and ? are not special at
  80. ;; the start of a string, but we quote all of these always, so the result
  81. ;; can be concatenated or merged into some larger regexp.
  82. ;;
  83. ;; ] is not special outside a [ ] character class, so doesn't need to be
  84. ;; quoted.
  85. ;;
  86. (define (regexp-quote string)
  87. (call-with-output-string
  88. (lambda (p)
  89. (string-for-each (lambda (c)
  90. (case c
  91. ((#\* #\. #\\ #\^ #\$ #\[)
  92. (write-char #\\ p)
  93. (write-char c p))
  94. ((#\( #\) #\+ #\? #\{ #\} #\|)
  95. (write-char #\[ p)
  96. (write-char c p)
  97. (write-char #\] p))
  98. (else
  99. (write-char c p))))
  100. string))))
  101. (define* (match:start match #:optional (n 0))
  102. (let ((start (car (vector-ref match (1+ n)))))
  103. (if (= start -1) #f start)))
  104. (define* (match:end match #:optional (n 0))
  105. (let* ((end (cdr (vector-ref match (1+ n)))))
  106. (if (= end -1) #f end)))
  107. (define* (match:substring match #:optional (n 0))
  108. (let* ((start (match:start match n))
  109. (end (match:end match n)))
  110. (and start end (substring (match:string match) start end))))
  111. (define (string-match pattern str . args)
  112. (let ((rx (make-regexp pattern))
  113. (start (if (pair? args) (car args) 0)))
  114. (regexp-exec rx str start)))
  115. (define (regexp-substitute port match . items)
  116. ;; If `port' is #f, send output to a string.
  117. (if (not port)
  118. (call-with-output-string
  119. (lambda (p)
  120. (apply regexp-substitute p match items)))
  121. ;; Otherwise, process each substitution argument in `items'.
  122. (for-each (lambda (obj)
  123. (cond ((string? obj) (display obj port))
  124. ((integer? obj) (display (match:substring match obj) port))
  125. ((eq? 'pre obj) (display (match:prefix match) port))
  126. ((eq? 'post obj) (display (match:suffix match) port))
  127. (else (error 'wrong-type-arg obj))))
  128. items)))
  129. ;;; If we call fold-matches, below, with a regexp that can match the
  130. ;;; empty string, it's not obvious what "all the matches" means. How
  131. ;;; many empty strings are there in the string "a"? Our answer:
  132. ;;;
  133. ;;; This function applies PROC to every non-overlapping, maximal
  134. ;;; match of REGEXP in STRING.
  135. ;;;
  136. ;;; "non-overlapping": There are two non-overlapping matches of "" in
  137. ;;; "a" --- one before the `a', and one after. There are three
  138. ;;; non-overlapping matches of "q|x*" in "aqb": the empty strings
  139. ;;; before `a' and after `b', and `q'. The two empty strings before
  140. ;;; and after `q' don't count, because they overlap with the match of
  141. ;;; "q".
  142. ;;;
  143. ;;; "maximal": There are three distinct maximal matches of "x*" in
  144. ;;; "axxxb": one before the `a', one covering `xxx', and one after the
  145. ;;; `b'. Around or within `xxx', only the match covering all three
  146. ;;; x's counts, because the rest are not maximal.
  147. (define* (fold-matches regexp string init proc #:optional (flags 0))
  148. (let ((regexp (if (regexp? regexp) regexp (make-regexp regexp))))
  149. (let loop ((start 0)
  150. (value init)
  151. (abuts #f)) ; True if start abuts a previous match.
  152. (define bol (if (zero? start) 0 regexp/notbol))
  153. (let ((m (if (> start (string-length string)) #f
  154. (regexp-exec regexp string start (logior flags bol)))))
  155. (cond
  156. ((not m) value)
  157. ((and (= (match:start m) (match:end m)) abuts)
  158. ;; We matched an empty string, but that would overlap the
  159. ;; match immediately before. Try again at a position
  160. ;; further to the right.
  161. (loop (+ start 1) value #f))
  162. (else
  163. (loop (match:end m) (proc m value) #t)))))))
  164. (define* (list-matches regexp string #:optional (flags 0))
  165. (reverse! (fold-matches regexp string '() cons flags)))
  166. (define (regexp-substitute/global port regexp string . items)
  167. ;; If `port' is #f, send output to a string.
  168. (if (not port)
  169. (call-with-output-string
  170. (lambda (p)
  171. (apply regexp-substitute/global p regexp string items)))
  172. ;; Walk the set of non-overlapping, maximal matches.
  173. (let next-match ((matches (list-matches regexp string))
  174. (start 0))
  175. (if (null? matches)
  176. (display (substring string start) port)
  177. (let ((m (car matches)))
  178. ;; Process all of the items for this match. Don't use
  179. ;; for-each, because we need to make sure 'post at the
  180. ;; end of the item list is a tail call.
  181. (let next-item ((items items))
  182. (define (do-item item)
  183. (cond
  184. ((string? item) (display item port))
  185. ((integer? item) (display (match:substring m item) port))
  186. ((procedure? item) (display (item m) port))
  187. ((eq? item 'pre)
  188. (display
  189. (substring string start (match:start m))
  190. port))
  191. ((eq? item 'post)
  192. (next-match (cdr matches) (match:end m)))
  193. (else (error 'wrong-type-arg item))))
  194. (if (pair? items)
  195. (if (null? (cdr items))
  196. (do-item (car items)) ; This is a tail call.
  197. (begin
  198. (do-item (car items)) ; This is not.
  199. (next-item (cdr items)))))))))))