regexp.scm 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. ; Authors: Richard Kelsey, Jonathan Rees, Mike Sperber, Marcus Crestani,
  3. ; Roderic Morris
  4. ; Exports:
  5. ; make-regexp
  6. ; regexp?
  7. ; regexp-match
  8. ; regexp-match?
  9. ; regexp-match-start
  10. ; regexp-match-end
  11. ; regexp-option
  12. (import-dynamic-externals "=scheme48external/posix")
  13. ; The compiled version of the expression is produced when needed.
  14. (define-record-type regexp :regexp
  15. (really-make-regexp pattern pattern-byte-string
  16. compiled
  17. extended? ignore-case? submatches? newline?)
  18. regexp?
  19. (pattern regexp-pattern) ; immutable string
  20. (pattern-byte-string regexp-pattern-byte-string)
  21. (compiled real-regexp-compiled set-regexp-compiled!) ; #f or a c-record
  22. (extended? regexp-extended?) ; four flags
  23. (ignore-case? regexp-ignore-case?)
  24. (submatches? regexp-submatches?)
  25. (newline? regexp-newline?))
  26. ; Drop the compiled version when resuming. We may be resuming on a different
  27. ; architecture, or version of the library, or whatever.
  28. (define-record-resumer :regexp
  29. (lambda (regexp)
  30. (set-regexp-compiled! regexp #f)))
  31. ; There are four options when making a regular expression.
  32. (define-enumerated-type regexp-option :regexp-option
  33. regexp-option?
  34. regexp-options
  35. regexp-option-name
  36. regexp-option-index
  37. (extended ignore-case submatches newline))
  38. ; Loop down finding which options are present and checking for duplicates.
  39. ; This is not specific to regular expressions.
  40. ;
  41. ; It would be nice if this could handle values as well, as in
  42. ; (make-regexp "sldkjf" (regexp-option size 10))
  43. (define (decode-boolean-options options all-options predicate indexer)
  44. (let ((map (make-vector (vector-length all-options) #f)))
  45. (let loop ((options options))
  46. (if (null? options)
  47. (vector->list map)
  48. (let ((option (car options)))
  49. (if (predicate option)
  50. (let ((index (indexer option)))
  51. (if (vector-ref map index)
  52. 'duplicates
  53. (begin
  54. (vector-set! map index #t)
  55. (loop (cdr options)))))
  56. 'bad-value))))))
  57. ; The only thing we do here is to decode the options and make sure that the
  58. ; pattern is immutable, as it won't be used until later.
  59. (define (make-regexp pattern . options)
  60. (let ((options (decode-boolean-options options
  61. regexp-options
  62. regexp-option?
  63. regexp-option-index)))
  64. (if (and (string? pattern)
  65. (pair? options))
  66. (let* ((pattern (string->immutable-string pattern))
  67. (pattern-byte-string (string->byte-vector pattern))
  68. (regexp (apply really-make-regexp pattern pattern-byte-string #f options)))
  69. (add-finalizer! regexp free-compiled-regexp)
  70. regexp)
  71. (apply assertion-violation 'make-regexp
  72. "invalid argument(s)"
  73. pattern
  74. options))))
  75. ; Free up the C-heap storage used for the compiled regexp.
  76. (define (free-compiled-regexp regexp)
  77. (let ((compiled (real-regexp-compiled regexp)))
  78. (if compiled
  79. (call-imported-binding-2 posix-free-regexp compiled))))
  80. ; We compile the pattern if that hasn't already been done, raising an error
  81. ; if anything goes wrong.
  82. (define (regexp-compiled regexp)
  83. (or (real-regexp-compiled regexp)
  84. (let ((compiled (call-imported-binding-2 posix-compile-regexp
  85. (regexp-pattern-byte-string regexp)
  86. (regexp-extended? regexp)
  87. (regexp-ignore-case? regexp)
  88. (regexp-submatches? regexp)
  89. (regexp-newline? regexp))))
  90. (if (not (integer? compiled))
  91. (begin
  92. (set-regexp-compiled! regexp compiled)
  93. compiled)
  94. (let ((message (call-imported-binding-2 posix-regexp-error-message
  95. (regexp-pattern-byte-string regexp)
  96. (regexp-extended? regexp)
  97. (regexp-ignore-case? regexp)
  98. (regexp-submatches? regexp)
  99. (regexp-newline? regexp))))
  100. (error 'regexp.compiled
  101. (if message
  102. (string-append "Posix regexp: " (os-string->string
  103. (byte-vector->os-string message)))
  104. "inconsistent results from Posix regexp compiler")
  105. regexp))))))
  106. ; Call the pattern matcher. We return #F if the match fails. On a successful
  107. ; match, we either return #T or a list of match records, depending on the value
  108. ; of SUBMATCHES?.
  109. (define (regexp-match regexp string start submatches? starts-line? ends-line?)
  110. (cond
  111. ((not (and (regexp? regexp)
  112. (string? string)))
  113. (assertion-violation 'regexp-match
  114. "invalid argument"
  115. regexp string start starts-line? ends-line?))
  116. ((and submatches?
  117. (not (regexp-submatches? regexp)))
  118. (assertion-violation 'regexp-match
  119. "regexp not compiled for submatches"
  120. regexp string start starts-line? ends-line?))
  121. (else
  122. (call-imported-binding-2 posix-regexp-match
  123. (regexp-compiled regexp)
  124. (string->byte-vector string)
  125. start
  126. submatches?
  127. starts-line?
  128. ends-line?))))
  129. ; we can't do any better with POSIX, Mike thinks
  130. (define (string->byte-vector s)
  131. (os-string->byte-vector
  132. (call-with-os-string-text-codec
  133. latin-1-codec
  134. (lambda ()
  135. (string->os-string s)))))
  136. ; These are made by the C code. The SUBMATCHES field is not used by us,
  137. ; but is used by the functional interface.
  138. (define-record-type match :match
  139. (make-match start end submatches)
  140. match?
  141. (start match-start)
  142. (end match-end)
  143. (submatches match-submatches))
  144. (define-record-discloser :match
  145. (lambda (rem)
  146. (list 'match
  147. (match-start rem)
  148. (match-end rem))))
  149. (define-exported-binding "posix-regexp-match-type" :match)
  150. ; The various C functions we use.
  151. (import-definition posix-compile-regexp)
  152. (import-definition posix-regexp-match)
  153. (import-definition posix-regexp-error-message)
  154. (import-definition posix-free-regexp)