123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230 |
- ;;;; Copyright (C) 1997, 1999, 2001, 2004, 2005, 2006, 2008, 2010 Free Software Foundation, Inc.
- ;;;;
- ;;;; This library is free software; you can redistribute it and/or
- ;;;; modify it under the terms of the GNU Lesser General Public
- ;;;; License as published by the Free Software Foundation; either
- ;;;; version 3 of the License, or (at your option) any later version.
- ;;;;
- ;;;; This library is distributed in the hope that it will be useful,
- ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;;;; Lesser General Public License for more details.
- ;;;;
- ;;;; You should have received a copy of the GNU Lesser General Public
- ;;;; License along with this library; if not, write to the Free Software
- ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;;;;
- ;;; Commentary:
- ;; These procedures are exported:
- ;; (match:count match)
- ;; (match:string match)
- ;; (match:prefix match)
- ;; (match:suffix match)
- ;; (regexp-match? match)
- ;; (regexp-quote string)
- ;; (match:start match . submatch-num)
- ;; (match:end match . submatch-num)
- ;; (match:substring match . submatch-num)
- ;; (string-match pattern str . start)
- ;; (regexp-substitute port match . items)
- ;; (fold-matches regexp string init proc . flags)
- ;; (list-matches regexp string . flags)
- ;; (regexp-substitute/global port regexp string . items)
- ;;; Code:
- ;;;; POSIX regex support functions.
- (define-module (ice-9 regex)
- #:export (match:count match:string match:prefix match:suffix
- regexp-match? regexp-quote match:start match:end match:substring
- string-match regexp-substitute fold-matches list-matches
- regexp-substitute/global))
- ;; References:
- ;;
- ;; POSIX spec:
- ;; http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
- ;;; FIXME:
- ;;; It is not clear what should happen if a `match' function
- ;;; is passed a `match number' which is out of bounds for the
- ;;; regexp match: return #f, or throw an error? These routines
- ;;; throw an out-of-range error.
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;;;; These procedures are not defined in SCSH, but I found them useful.
- (define (match:count match)
- (- (vector-length match) 1))
- (define (match:string match)
- (vector-ref match 0))
- (define (match:prefix match)
- (substring (match:string match) 0 (match:start match 0)))
- (define (match:suffix match)
- (substring (match:string match) (match:end match 0)))
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;;;; SCSH compatibility routines.
- (define (regexp-match? match)
- (and (vector? match)
- (string? (vector-ref match 0))
- (let loop ((i 1))
- (cond ((>= i (vector-length match)) #t)
- ((and (pair? (vector-ref match i))
- (integer? (car (vector-ref match i)))
- (integer? (cdr (vector-ref match i))))
- (loop (+ 1 i)))
- (else #f)))))
- ;; * . \ ^ $ and [ are special in both regexp/basic and regexp/extended and
- ;; can be backslash escaped.
- ;;
- ;; ( ) + ? { } and | are special in regexp/extended so must be quoted. But
- ;; that can't be done with a backslash since in regexp/basic where they're
- ;; not special, adding a backslash makes them become special. Character
- ;; class forms [(] etc are used instead.
- ;;
- ;; ) is not special when not preceded by a (, and * and ? are not special at
- ;; the start of a string, but we quote all of these always, so the result
- ;; can be concatenated or merged into some larger regexp.
- ;;
- ;; ] is not special outside a [ ] character class, so doesn't need to be
- ;; quoted.
- ;;
- (define (regexp-quote string)
- (call-with-output-string
- (lambda (p)
- (string-for-each (lambda (c)
- (case c
- ((#\* #\. #\\ #\^ #\$ #\[)
- (write-char #\\ p)
- (write-char c p))
- ((#\( #\) #\+ #\? #\{ #\} #\|)
- (write-char #\[ p)
- (write-char c p)
- (write-char #\] p))
- (else
- (write-char c p))))
- string))))
- (define* (match:start match #:optional (n 0))
- (let ((start (car (vector-ref match (1+ n)))))
- (if (= start -1) #f start)))
- (define* (match:end match #:optional (n 0))
- (let* ((end (cdr (vector-ref match (1+ n)))))
- (if (= end -1) #f end)))
- (define* (match:substring match #:optional (n 0))
- (let* ((start (match:start match n))
- (end (match:end match n)))
- (and start end (substring (match:string match) start end))))
- (define (string-match pattern str . args)
- (let ((rx (make-regexp pattern))
- (start (if (pair? args) (car args) 0)))
- (regexp-exec rx str start)))
- (define (regexp-substitute port match . items)
- ;; If `port' is #f, send output to a string.
- (if (not port)
- (call-with-output-string
- (lambda (p)
- (apply regexp-substitute p match items)))
- ;; Otherwise, process each substitution argument in `items'.
- (for-each (lambda (obj)
- (cond ((string? obj) (display obj port))
- ((integer? obj) (display (match:substring match obj) port))
- ((eq? 'pre obj) (display (match:prefix match) port))
- ((eq? 'post obj) (display (match:suffix match) port))
- (else (error 'wrong-type-arg obj))))
- items)))
- ;;; If we call fold-matches, below, with a regexp that can match the
- ;;; empty string, it's not obvious what "all the matches" means. How
- ;;; many empty strings are there in the string "a"? Our answer:
- ;;;
- ;;; This function applies PROC to every non-overlapping, maximal
- ;;; match of REGEXP in STRING.
- ;;;
- ;;; "non-overlapping": There are two non-overlapping matches of "" in
- ;;; "a" --- one before the `a', and one after. There are three
- ;;; non-overlapping matches of "q|x*" in "aqb": the empty strings
- ;;; before `a' and after `b', and `q'. The two empty strings before
- ;;; and after `q' don't count, because they overlap with the match of
- ;;; "q".
- ;;;
- ;;; "maximal": There are three distinct maximal matches of "x*" in
- ;;; "axxxb": one before the `a', one covering `xxx', and one after the
- ;;; `b'. Around or within `xxx', only the match covering all three
- ;;; x's counts, because the rest are not maximal.
- (define* (fold-matches regexp string init proc #:optional (flags 0))
- (let ((regexp (if (regexp? regexp) regexp (make-regexp regexp))))
- (let loop ((start 0)
- (value init)
- (abuts #f)) ; True if start abuts a previous match.
- (define bol (if (zero? start) 0 regexp/notbol))
- (let ((m (if (> start (string-length string)) #f
- (regexp-exec regexp string start (logior flags bol)))))
- (cond
- ((not m) value)
- ((and (= (match:start m) (match:end m)) abuts)
- ;; We matched an empty string, but that would overlap the
- ;; match immediately before. Try again at a position
- ;; further to the right.
- (loop (+ start 1) value #f))
- (else
- (loop (match:end m) (proc m value) #t)))))))
- (define* (list-matches regexp string #:optional (flags 0))
- (reverse! (fold-matches regexp string '() cons flags)))
- (define (regexp-substitute/global port regexp string . items)
- ;; If `port' is #f, send output to a string.
- (if (not port)
- (call-with-output-string
- (lambda (p)
- (apply regexp-substitute/global p regexp string items)))
- ;; Walk the set of non-overlapping, maximal matches.
- (let next-match ((matches (list-matches regexp string))
- (start 0))
- (if (null? matches)
- (display (substring string start) port)
- (let ((m (car matches)))
- ;; Process all of the items for this match. Don't use
- ;; for-each, because we need to make sure 'post at the
- ;; end of the item list is a tail call.
- (let next-item ((items items))
- (define (do-item item)
- (cond
- ((string? item) (display item port))
- ((integer? item) (display (match:substring m item) port))
- ((procedure? item) (display (item m) port))
- ((eq? item 'pre)
- (display
- (substring string start (match:start m))
- port))
- ((eq? item 'post)
- (next-match (cdr matches) (match:end m)))
- (else (error 'wrong-type-arg item))))
- (if (pair? items)
- (if (null? (cdr items))
- (do-item (car items)) ; This is a tail call.
- (begin
- (do-item (car items)) ; This is not.
- (next-item (cdr items)))))))))))
|