regexp.scm 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. ; Copyright (c) 1993-2007 by Richard Kelsey and Jonathan Rees. See file COPYING.
  2. ; Exports:
  3. ; make-regexp
  4. ; regexp?
  5. ; regexp-match
  6. ; regexp-match?
  7. ; regexp-match-start
  8. ; regexp-match-end
  9. ; regexp-option
  10. ; The compiled version of the expression is produced when needed.
  11. (define-record-type regexp :regexp
  12. (really-make-regexp pattern pattern-byte-string
  13. compiled
  14. extended? ignore-case? submatches? newline?)
  15. regexp?
  16. (pattern regexp-pattern) ; immutable string
  17. (pattern-byte-string regexp-pattern-byte-string)
  18. (compiled real-regexp-compiled set-regexp-compiled!) ; #f or a c-record
  19. (extended? regexp-extended?) ; four flags
  20. (ignore-case? regexp-ignore-case?)
  21. (submatches? regexp-submatches?)
  22. (newline? regexp-newline?))
  23. ; Drop the compiled version when resuming. We may be resuming on a different
  24. ; architecture, or version of the library, or whatever.
  25. (define-record-resumer :regexp
  26. (lambda (regexp)
  27. (set-regexp-compiled! regexp #f)))
  28. ; There are four options when making a regular expression.
  29. (define-enumerated-type regexp-option :regexp-option
  30. regexp-option?
  31. regexp-options
  32. regexp-option-name
  33. regexp-option-index
  34. (extended ignore-case submatches newline))
  35. ; Loop down finding which options are present and checking for duplicates.
  36. ; This is not specific to regular expressions.
  37. ;
  38. ; It would be nice if this could handle values as well, as in
  39. ; (make-regexp "sldkjf" (regexp-option size 10))
  40. (define (decode-boolean-options options all-options predicate indexer)
  41. (let ((map (make-vector (vector-length all-options) #f)))
  42. (let loop ((options options))
  43. (if (null? options)
  44. (vector->list map)
  45. (let ((option (car options)))
  46. (if (predicate option)
  47. (let ((index (indexer option)))
  48. (if (vector-ref map index)
  49. 'duplicates
  50. (begin
  51. (vector-set! map index #t)
  52. (loop (cdr options)))))
  53. 'bad-value))))))
  54. ; The only thing we do here is to decode the options and make sure that the
  55. ; pattern is immutable, as it won't be used until later.
  56. (define (make-regexp pattern . options)
  57. (let ((options (decode-boolean-options options
  58. regexp-options
  59. regexp-option?
  60. regexp-option-index)))
  61. (if (and (string? pattern)
  62. (pair? options))
  63. (let* ((pattern (string->immutable-string pattern))
  64. (pattern-byte-string (string->byte-vector pattern))
  65. (regexp (apply really-make-regexp pattern pattern-byte-string #f options)))
  66. (add-finalizer! regexp free-compiled-regexp)
  67. regexp)
  68. (apply call-error "invalid argument(s)"
  69. make-regexp
  70. pattern
  71. options))))
  72. ; Free up the C-heap storage used for the compiled regexp.
  73. (define (free-compiled-regexp regexp)
  74. (let ((compiled (real-regexp-compiled regexp)))
  75. (if compiled
  76. (call-imported-binding posix-free-regexp compiled))))
  77. ; We compile the pattern if that hasn't already been done, raising an error
  78. ; if anything goes wrong.
  79. (define (regexp-compiled regexp)
  80. (or (real-regexp-compiled regexp)
  81. (let ((compiled (call-imported-binding posix-compile-regexp
  82. (regexp-pattern-byte-string regexp)
  83. (regexp-extended? regexp)
  84. (regexp-ignore-case? regexp)
  85. (regexp-submatches? regexp)
  86. (regexp-newline? regexp))))
  87. (if (not (integer? compiled))
  88. (begin
  89. (set-regexp-compiled! regexp compiled)
  90. compiled)
  91. (let ((message (call-imported-binding posix-regexp-error-message
  92. (regexp-pattern-byte-string regexp)
  93. (regexp-extended? regexp)
  94. (regexp-ignore-case? regexp)
  95. (regexp-submatches? regexp)
  96. (regexp-newline? regexp))))
  97. (error (if message
  98. (string-append "Posix regexp: " message)
  99. "inconsistent results from Posix regexp compiler")
  100. regexp))))))
  101. ; Call the pattern matcher. We return #F if the match fails. On a successful
  102. ; match, we either return #T or a list of match records, depending on the value
  103. ; of SUBMATCHES?.
  104. (define (regexp-match regexp string start submatches? starts-line? ends-line?)
  105. (cond
  106. ((not (and (regexp? regexp)
  107. (string? string)))
  108. (call-error "invalid argument"
  109. regexp-match
  110. regexp string start starts-line? ends-line?))
  111. ((and submatches?
  112. (not (regexp-submatches? regexp)))
  113. (call-error "regexp not compiled for submatches"
  114. regexp-match
  115. regexp string start starts-line? ends-line?))
  116. (else
  117. (call-imported-binding posix-regexp-match
  118. (regexp-compiled regexp)
  119. (string->byte-vector string)
  120. start
  121. submatches?
  122. starts-line?
  123. ends-line?))))
  124. ; we can't do any better with POSIX, Mike thinks
  125. (define (string->byte-vector s)
  126. (os-string->byte-vector
  127. (call-with-os-string-text-codec
  128. latin-1-codec
  129. (lambda ()
  130. (string->os-string s)))))
  131. ; These are made by the C code. The SUBMATCHES field is not used by us,
  132. ; but is used by the functional interface.
  133. (define-record-type match :match
  134. (make-match start end submatches)
  135. match?
  136. (start match-start)
  137. (end match-end)
  138. (submatches match-submatches))
  139. (define-record-discloser :match
  140. (lambda (rem)
  141. (list 'match
  142. (match-start rem)
  143. (match-end rem))))
  144. (define-exported-binding "posix-regexp-match-type" :match)
  145. ; The various C functions we use.
  146. (import-definition posix-compile-regexp)
  147. (import-definition posix-regexp-match)
  148. (import-definition posix-regexp-error-message)
  149. (import-definition posix-free-regexp)